6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2018, Joyent, Inc.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
110 * ncec_refcnt).
111 */
112
113 static void nce_cleanup_list(ncec_t *ncec);
114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116 ncec_t *);
117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119 uint16_t ncec_flags, nce_t **newnce);
120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121 uint16_t ncec_flags, nce_t **newnce);
122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124 const in6_addr_t *target, int flag);
125 static void ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129 uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136 uint16_t, uint16_t, nce_t **);
137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138 uint16_t, uint16_t, nce_t **);
139 static int nce_add_v6_postprocess(nce_t *);
140 static int nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149
150 #ifdef DEBUG
151 static void ncec_trace_cleanup(const ncec_t *);
988
989 mutex_exit(&ndp->ndp_g_lock);
990
991 if (free_nce_list != NULL) {
992 nce_cleanup_list(free_nce_list);
993 }
994 }
995
996 /*
997 * Walk everything.
998 * Note that ill can be NULL hence can't derive the ipst from it.
999 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
	/*
	 * Visit both the IPv4 (ARP) and IPv6 (ND) caches of 'ipst',
	 * invoking 'cbf' with 'arg1' on each entry.  'ill' may be NULL
	 * (walk all ills), which is why the ip_stack_t must be passed
	 * explicitly instead of being derived from the ill.
	 */
	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}
1006
1007 /*
1008 * For each interface an entry is added for the unspecified multicast group.
1009 * Here that mapping is used to form the multicast cache entry for a particular
1010 * multicast destination.
1011 */
1012 static int
1013 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014 uint16_t flags, nce_t **newnce)
1015 {
1016 uchar_t *hw_addr;
1017 int err = 0;
1018 ip_stack_t *ipst = ill->ill_ipst;
1019 nce_t *nce;
1020
1021 ASSERT(ill != NULL);
1022 ASSERT(ill->ill_isv6);
1023 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1024
1025 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1026 nce = nce_lookup_addr(ill, dst);
1027 if (nce != NULL) {
1033 * For IRE_IF_RESOLVER a hardware mapping can be
1034 * generated.
1035 */
1036 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1037 if (hw_addr == NULL) {
1038 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1039 return (ENOMEM);
1040 }
1041 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1042 } else {
1043 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044 hw_addr = NULL;
1045 }
1046 ASSERT((flags & NCE_F_MCAST) != 0);
1047 ASSERT((flags & NCE_F_NONUD) != 0);
1048 /* nce_state will be computed by nce_add_common() */
1049 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050 ND_UNCHANGED, &nce);
1051 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052 if (err == 0)
1053 err = nce_add_v6_postprocess(nce);
1054 if (hw_addr != NULL)
1055 kmem_free(hw_addr, ill->ill_nd_lla_len);
1056 if (err != 0) {
1057 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058 return (err);
1059 }
1060 done:
1061 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062 if (newnce != NULL)
1063 *newnce = nce;
1064 else
1065 nce_refrele(nce);
1066 return (0);
1067 }
1068
1069 /*
1070 * Return the link layer address, and any flags of a ncec.
1071 */
1072 int
1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
3083 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3084 ill->ill_sap_length);
3085 }
3086 mutex_exit(&ncec->ncec_lock);
3087 return (nce);
3088 }
3089
3090 /*
3091 * we make nce_fp_mp to have an M_DATA prepend.
3092 * The caller ensures there is hold on ncec for this function.
3093 * Note that since ill_fastpath_probe() copies the mblk there is
3094 * no need to hold the nce or ncec beyond this function.
3095 *
3096 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098 * and will be returned back by this function, so that no extra nce_refrele
3099 * is required for the caller. The calls from nce_add_common() use this
3100 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101 * nce_refrele of the returned nce (when it is non-null).
3102 */
3103 nce_t *
3104 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 {
3106 nce_t *nce;
3107 ill_t *ill = ncec->ncec_ill;
3108
3109 ASSERT(ill != NULL);
3110
3111 if (IS_IPMP(ill) && trigger_fp_req) {
3112 trigger_fp_req = B_FALSE;
3113 ipmp_ncec_refresh_nce(ncec);
3114 }
3115
3116 /*
3117 * If the caller already has the nce corresponding to the ill, use
3118 * that one. Otherwise we have to lookup/add the nce. Calls from
3119 * nce_add_common() fall in the former category, and have just done
3120 * the nce lookup/add that can be reused.
3121 */
3122 if (ncec_nce == NULL)
3123 nce = nce_fastpath_create(ill, ncec);
3141 int res;
3142 ill_t *ill = nce->nce_ill;
3143 ncec_t *ncec = nce->nce_common;
3144
3145 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3146 /*
3147 * EAGAIN is an indication of a transient error
3148 * i.e. allocation failure etc. leave the ncec in the list it
3149 * will be updated when another probe happens for another ire
3150 * if not it will be taken out of the list when the ire is
3151 * deleted.
3152 */
3153 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 nce_fastpath_list_delete(ill, ncec, NULL);
3155 }
3156
3157 /*
3158 * Add ncec to the nce fastpath list on ill.
3159 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce = NULL;

	/* Caller must already hold the ill_lock. */
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	/*
	 * Atomically ensure that the ill is not CONDEMNED and is not going
	 * down, before adding the NCE.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return (NULL);
	mutex_enter(&ncec->ncec_lock);
	/*
	 * if ncec has not been deleted and
	 * is not already in the list add it.
	 */
	if (!NCE_ISCONDEMNED(ncec)) {
		nce = nce_lookup(ill, &ncec->ncec_addr);
		if (nce != NULL)
			goto done;
		nce = nce_add(ill, ncec);
	}
done:
	/*
	 * Returns NULL when the ill is condemned, the ncec is condemned,
	 * or nce_add() failed to allocate (KM_NOSLEEP).
	 */
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}
3187
3188 nce_t *
3189 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 {
3191 nce_t *nce;
3192
3193 mutex_enter(&ill->ill_lock);
3194 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3195 mutex_exit(&ill->ill_lock);
3196 return (nce);
3197 }
3198
3199
3200 /*
3201 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203 * entry after all locks have been dropped.
3204 */
3205 void
3206 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3207 {
3208 nce_t *nce;
3209
3210 ASSERT(ill != NULL);
3211
3212 /* delete any nces referencing the ncec from underlying ills */
3213 if (IS_IPMP(ill))
3214 ipmp_ncec_delete_nce(ncec);
3215
3226 mutex_exit(&ill->ill_lock);
3227 if (nce != NULL) {
3228 if (dead == NULL)
3229 nce_refrele(nce);
3230 else
3231 list_insert_tail(dead, nce);
3232 }
3233 }
3234
3235 /*
3236 * when the fastpath response does not fit in the datab
3237 * associated with the existing nce_fp_mp, we delete and
3238 * add the nce to retrigger fastpath based on the information
3239 * in the ncec_t.
3240 */
static nce_t *
nce_delete_then_add(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;
	nce_t *newnce = NULL;

	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
	    (void *)nce, ill->ill_name));
	mutex_enter(&ill->ill_lock);
	mutex_enter(&nce->nce_common->ncec_lock);
	/* Remove the stale nce; the shared ncec_t survives this. */
	nce_delete(nce);
	/*
	 * Make sure that ncec is not condemned before adding. We hold the
	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
	 * ipmp_ncec_delete_nce()
	 */
	if (!NCE_ISCONDEMNED(nce->nce_common))
		newnce = nce_add(ill, nce->nce_common);
	mutex_exit(&nce->nce_common->ncec_lock);
	mutex_exit(&ill->ill_lock);
	/* Drop the caller-supplied hold on the old nce. */
	nce_refrele(nce);
	return (newnce); /* could be null if nomem */
}
3264
/*
 * Walk-state for matching a fastpath ack message against an nce's
 * DL_UNITDATA_REQ (see nce_fastpath_match_dlur()).
 */
typedef struct nce_fp_match_s {
	nce_t *nce_fp_match_res;	/* presumably the matched nce; set by walker */
	mblk_t *nce_fp_match_ack_mp;	/* fastpath ack mblk compared to nce_dlur_mp */
} nce_fp_match_t;
3269
3270 /* ARGSUSED */
3271 static int
3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3273 {
3274 nce_fp_match_t *nce_fp_marg = arg;
3275 ncec_t *ncec = nce->nce_common;
3276 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3277 uchar_t *mp_rptr, *ud_mp_rptr;
3278 mblk_t *ud_mp = nce->nce_dlur_mp;
3279 ptrdiff_t cmplen;
3280
3955 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3956 if (hw_addr == NULL) {
3957 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3958 return (ENOMEM);
3959 }
3960 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3961 } else {
3962 /*
3963 * IRE_IF_NORESOLVER type simply copies the resolution
3964 * cookie passed in. So no hw_addr is needed.
3965 */
3966 hw_addr = NULL;
3967 }
3968 ASSERT(flags & NCE_F_MCAST);
3969 ASSERT(flags & NCE_F_NONUD);
3970 /* nce_state will be computed by nce_add_common() */
3971 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972 ND_UNCHANGED, &nce);
3973 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974 if (err == 0)
3975 err = nce_add_v4_postprocess(nce);
3976 if (hw_addr != NULL)
3977 kmem_free(hw_addr, ill->ill_phys_addr_length);
3978 if (err != 0) {
3979 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980 return (err);
3981 }
3982 done:
3983 if (newnce != NULL)
3984 *newnce = nce;
3985 else
3986 nce_refrele(nce);
3987 return (0);
3988 }
3989
3990 /*
3991 * This is used when scanning for "old" (least recently broadcast) NCEs. We
3992 * don't want to have to walk the list for every single one, so we gather up
3993 * batches at a time.
3994 */
3995 #define NCE_RESCHED_LIST_LEN 8
4379 *
4380 * When the caller passes in an nce_state of ND_UNCHANGED,
4381 * nce_add_common() will determine the state of the created nce based
4382 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4383 * be created with state set to the passed in nce_state.
4384 */
4385 static int
4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4387 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4388 {
4389 static ncec_t nce_nil;
4390 uchar_t *template = NULL;
4391 int err;
4392 ncec_t *ncec;
4393 ncec_t **ncep;
4394 ip_stack_t *ipst = ill->ill_ipst;
4395 uint16_t state;
4396 boolean_t fastprobe = B_FALSE;
4397 struct ndp_g_s *ndp;
4398 nce_t *nce = NULL;
4399 mblk_t *dlur_mp = NULL;
4400
4401 if (ill->ill_isv6)
4402 ndp = ill->ill_ipst->ips_ndp6;
4403 else
4404 ndp = ill->ill_ipst->ips_ndp4;
4405
4406 *retnce = NULL;
4407
4408 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4409
4410 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4411 ip0dbg(("nce_add_common: no addr\n"));
4412 return (EINVAL);
4413 }
4414 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4415 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4416 return (EINVAL);
4417 }
4418
4669 }
4670 /*
4671 * Acquire the ncec_lock even before adding the ncec to the list
4672 * so that it cannot get deleted after the ncec is added, but
4673 * before we add the nce.
4674 */
4675 mutex_enter(&ncec->ncec_lock);
4676 if ((ncec->ncec_next = *ncep) != NULL)
4677 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4678 *ncep = ncec;
4679 ncec->ncec_ptpn = ncep;
4680
4681 /* Bump up the number of ncec's referencing this ill */
4682 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683 (char *), "ncec", (void *), ncec);
4684 ill->ill_ncec_cnt++;
4685 /*
4686 * Since we hold the ncec_lock at this time, the ncec cannot be
4687 * condemned, and we can safely add the nce.
4688 */
4689 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4690 mutex_exit(&ncec->ncec_lock);
4691 mutex_exit(&ill->ill_lock);
4692
4693 /* caller must trigger fastpath on *retnce */
4694 return (0);
4695
4696 err_ret:
4697 if (ncec != NULL)
4698 kmem_cache_free(ncec_cache, ncec);
4699 if (nce != NULL)
4700 kmem_cache_free(nce_cache, nce);
4701 freemsg(dlur_mp);
4702 if (template != NULL)
4703 kmem_free(template, ill->ill_phys_addr_length);
4704 return (err);
4705 }
4706
4707 /*
4708 * take a ref on the nce
4709 */
4710 void
4711 nce_refhold(nce_t *nce)
4757 ill->ill_nce_cnt--;
4758 nce->nce_ill = NULL;
4759 /*
4760 * If the number of ncec's associated with this ill have dropped
4761 * to zero, check whether we need to restart any operation that
4762 * is waiting for this to happen.
4763 */
4764 if (ILL_DOWN_OK(ill)) {
4765 /* ipif_ill_refrele_tail drops the ill_lock */
4766 ipif_ill_refrele_tail(ill);
4767 } else {
4768 mutex_exit(&ill->ill_lock);
4769 }
4770
4771 mutex_destroy(&nce->nce_lock);
4772 kmem_cache_free(nce_cache, nce);
4773 }
4774
4775 /*
4776 * Add an nce to the ill_nce list.
4777 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
	/*
	 * Initialize the caller-allocated nce and head-insert it on the
	 * ill's fastpath list.  Callers hold ill_lock and ncec_lock --
	 * see the ASSERTs in nce_add() -- so the ncec cannot be condemned
	 * while we link it.  The statement order (DTRACE probe before the
	 * counter bump) mirrors the matching decrement path.
	 */
	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1;		/* for the thread */
	ncec->ncec_refcnt++;		/* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list. */
	nce->nce_refcnt++;		/* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}
4799
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce;
	mblk_t *dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* KM_NOSLEEP: we hold locks, so we may not block for memory. */
	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		/*
		 * Pre-build the DL_UNITDATA_REQ needed to transmit via this
		 * link-layer address (or the NORESOLVER cookie).
		 */
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			/* Undo the cache allocation; report failure as NULL. */
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	return (nce_add_impl(ill, ncec, nce, dlur_mp));
}
4824
/*
 * Remove the nce from the ill_nce fastpath list.
 */
void
nce_delete(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	/* Caller must hold the ill_lock and a reference on 'nce'. */
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	/* Mark condemned under nce_lock so only one thread unlinks it. */
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}
4856
4857 nce_t *
4858 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4859 {
4860 nce_t *nce = NULL;
4861
4862 ASSERT(ill != NULL);
4863 ASSERT(MUTEX_HELD(&ill->ill_lock));
4864
4865 for (nce = list_head(&ill->ill_nce); nce != NULL;
|
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright (c) 2019, Joyent, Inc.
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
110 * ncec_refcnt).
111 */
112
113 static void nce_cleanup_list(ncec_t *ncec);
114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116 ncec_t *);
117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119 uint16_t ncec_flags, nce_t **newnce);
120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121 uint16_t ncec_flags, nce_t **newnce);
122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124 const in6_addr_t *target, int flag);
125 static void ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129 uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136 uint16_t, uint16_t, nce_t **);
137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138 uint16_t, uint16_t, nce_t **);
139 static int nce_add_v6_postprocess(nce_t *);
140 static int nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149
150 #ifdef DEBUG
151 static void ncec_trace_cleanup(const ncec_t *);
988
989 mutex_exit(&ndp->ndp_g_lock);
990
991 if (free_nce_list != NULL) {
992 nce_cleanup_list(free_nce_list);
993 }
994 }
995
996 /*
997 * Walk everything.
998 * Note that ill can be NULL hence can't derive the ipst from it.
999 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
	/*
	 * Visit both the IPv4 (ARP) and IPv6 (ND) caches of 'ipst',
	 * invoking 'cbf' with 'arg1' on each entry.  'ill' may be NULL
	 * (walk all ills), which is why the ip_stack_t must be passed
	 * explicitly instead of being derived from the ill.
	 */
	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}
1006
1007 /*
1008 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
1009 * NCEs, and the number to reclaim if we hit the limit. Used by
1010 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
1011 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
1012 */
1013
1014 /* Maximum number of multicast NCEs on an ill. */
1015 uint_t ip_max_ill_mcast_nces = 16384;
1016 /*
1017 * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and
1018 * return an error. Non-zero means delete so many, and if the number is >=
1019 * the max above, that means delete them all.
1020 */
1021 uint_t ip_ill_mcast_reclaim = 256;
1022
1023 /*
1024 * Encapsulate multicast ill capping in a function, for easier DTrace
1025 * detections. Return a list of refheld NCEs to destroy-via-refrele. That
1026 * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027 *
1028 * NOTE: This function must be called while holding the ill_lock AND
1029 * JUST PRIOR to making the insertion into the ill_nce list.
1030 *
1031 * We can't release the ones we delete ourselves because the ill_lock is held
1032 * by the caller. They are, instead, passed back in a list_t for deletion
1033 * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034 *
1035 * While this covers nce_t, ncec_t gets done even further down the road. See
1036 * nce_graveyard_free() for why.
1037 */
static boolean_t
nce_too_many_mcast(ill_t *ill, list_t *graveyard)
{
	uint_t reclaim_count, max_count, reclaimed = 0;
	boolean_t too_many;
	nce_t *nce, *deadman;

	/*
	 * Returns B_TRUE only when the cap is exceeded AND nothing could be
	 * reclaimed (reclaim disabled, or no reclaimable mcast NCEs found);
	 * the caller should then refuse the insertion.  Reclaimed entries
	 * are handed back refheld on 'graveyard' for nce_graveyard_free().
	 */
	ASSERT(graveyard != NULL);
	ASSERT(list_is_empty(graveyard));
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	/*
	 * NOTE: Some grinning weirdo may have lowered the global max beyond
	 * what this ill currently has. The behavior in this case will be
	 * trim-back just by the reclaim amount for any new ones.
	 */
	max_count = ip_max_ill_mcast_nces;
	reclaim_count = min(ip_ill_mcast_reclaim, max_count);

	/* All good? */
	if (ill->ill_mcast_nces < max_count)
		return (B_FALSE);	/* Yes, all good. */

	if (reclaim_count == 0)
		return (B_TRUE);	/* Don't bother - we're stuck. */

	/* We need to reclaim now. Exploit our held ill_lock. */

	/*
	 * Start at the tail and work backwards, new nces are head-inserted,
	 * so we'll be reaping the oldest entries.
	 */
	nce = list_tail(&ill->ill_nce);
	while (reclaimed < reclaim_count) {
		/* Skip ahead to a multicast NCE. */
		while (nce != NULL &&
		    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
			nce = list_prev(&ill->ill_nce, nce);
		}
		if (nce == NULL)
			break;

		/*
		 * NOTE: For now, we just delete the first one(s) we find.
		 * This is not optimal, and may require some inspection of nce
		 * & its ncec to be better.
		 */
		deadman = nce;
		/* Advance before deleting: list_prev on a removed node is invalid. */
		nce = list_prev(&ill->ill_nce, nce);

		/* nce_delete() requires caller holds... */
		nce_refhold(deadman);
		nce_delete(deadman);	/* Bumps down ill_mcast_nces. */

		/* Link the dead ones singly, still refheld... */
		list_insert_tail(graveyard, deadman);
		reclaimed++;
	}

	if (reclaimed != reclaim_count) {
		/* We didn't have enough to reach reclaim_count. Why?!? */
		DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
		    uint_t, reclaimed, uint_t, reclaim_count);

		/* In case for some REALLY weird reason we found none! */
		too_many = (reclaimed == 0);
	} else {
		too_many = B_FALSE;
	}

	return (too_many);
}
1110
/*
 * ncec_walk() callback: delete a multicast ncec on 'arg' (the ill) whose
 * only remaining references are the ndp_g_hash's and the walker's -- i.e.
 * a stray left behind after its nce_t was reaped by nce_graveyard_free().
 */
static void
ncec_mcast_reap_one(ncec_t *ncec, void *arg)
{
	boolean_t reapit;
	ill_t *ill = (ill_t *)arg;

	/* Obvious no-lock-needed checks... */
	if (ncec == NULL || ncec->ncec_ill != ill ||
	    (ncec->ncec_flags & NCE_F_MCAST) == 0)
		return;

	mutex_enter(&ncec->ncec_lock);
	/*
	 * It's refheld by the walk infrastructure. It has one reference for
	 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
	 * We want ones without an nce_t, so 2 is the magic number. If it's
	 * LESS than 2, we have much bigger problems anyway.
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	reapit = (ncec->ncec_refcnt == 2);
	mutex_exit(&ncec->ncec_lock);

	if (reapit) {
		IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
		ncec_delete(ncec);
	}
}
1138
1139 /*
1140 * Attempt to reap stray multicast ncec_t structures left in the wake of
1141 * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142 * outside any netstack-global locks being held - ndp_g_lock in this case. We
1143 * have a reference hold on the ill, which will prevent any unplumbing races.
1144 */
static void
ncec_mcast_reap(void *arg)
{
	ill_t *ill = (ill_t *)arg;

	IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
	ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
	mutex_enter(&ill->ill_lock);
	/*
	 * Clear the in-progress flag before dropping our ill reference, so
	 * a subsequent nce_graveyard_free() may schedule another reap.
	 */
	ill->ill_mcast_ncec_cleanup = B_FALSE;
	/*
	 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
	 * below for why.
	 */
	ill->ill_refcnt--;
	if (ill->ill_refcnt == 0)
		ipif_ill_refrele_tail(ill); /* Drops ill_lock. */
	else
		mutex_exit(&ill->ill_lock);
}
1164
1165 /*
1166 * Free a list (including handling an empty list or NULL list) of
1167 * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168 * call. Separate because the caller must have dropped ndp_g_lock first.
1169 *
1170 * This also schedules a taskq task to unlink underlying NCECs from the
1171 * ndp_g_hash, which are protected by ndp_g_lock.
1172 */
static void
nce_graveyard_free(list_t *graveyard)
{
	nce_t *deadman, *current;
	ill_t *ill;
	boolean_t doit;

	/* Tolerate a NULL or empty graveyard; destroy the list either way. */
	if (graveyard == NULL)
		return;

	current = list_head(graveyard);
	if (current == NULL) {
		list_destroy(graveyard);
		return;
	}

	/* All graveyard entries share one ill (ASSERT3P below). */
	ill = current->nce_ill;
	/*
	 * Normally one should ill_refhold(ill) here. There's no _notr()
	 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
	 * the ONLY case that'll break the mh_trace that IP debugging uses for
	 * reference counts (i.e. they assume same thread releases as
	 * holds). Instead, we inline ill_refhold() here. We must do the same
	 * in the release done by the ncec_mcast_reap() above.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refcnt++;
	mutex_exit(&ill->ill_lock);

	while (current != NULL) {
		ASSERT3P(ill, ==, current->nce_ill);
		deadman = current;
		current = list_next(graveyard, deadman);
		list_remove(graveyard, deadman);
		ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
		    0);
		/* Drop the hold taken in nce_too_many_mcast(). */
		nce_refrele(deadman);
	}
	list_destroy(graveyard);

	/* Schedule at most one ncec reap per ill at a time. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_ncec_cleanup)
		doit = B_FALSE;
	else {
		ill->ill_mcast_ncec_cleanup = B_TRUE;
		doit = B_TRUE;
	}
	mutex_exit(&ill->ill_lock);
	if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
	    ill, TQ_NOSLEEP) == NULL) {
		/*
		 * Either someone else owns the cleanup, or dispatch failed;
		 * in both cases we must drop the ill reference ourselves.
		 */
		mutex_enter(&ill->ill_lock);
		if (doit) {
			IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
			ill->ill_mcast_ncec_cleanup = B_FALSE;
		}
		/* There's no _notr() for ill_refrele(), so inline it here. */
		ill->ill_refcnt--;
		if (ill->ill_refcnt == 0)
			ipif_ill_refrele_tail(ill); /* Drops ill_lock */
		else
			mutex_exit(&ill->ill_lock);
	}
}
1236
1237 /*
1238 * For each interface an entry is added for the unspecified multicast group.
1239 * Here that mapping is used to form the multicast cache entry for a particular
1240 * multicast destination.
1241 */
1242 static int
1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244 uint16_t flags, nce_t **newnce)
1245 {
1246 uchar_t *hw_addr;
1247 int err = 0;
1248 ip_stack_t *ipst = ill->ill_ipst;
1249 nce_t *nce;
1250
1251 ASSERT(ill != NULL);
1252 ASSERT(ill->ill_isv6);
1253 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254
1255 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256 nce = nce_lookup_addr(ill, dst);
1257 if (nce != NULL) {
1263 * For IRE_IF_RESOLVER a hardware mapping can be
1264 * generated.
1265 */
1266 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267 if (hw_addr == NULL) {
1268 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269 return (ENOMEM);
1270 }
1271 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272 } else {
1273 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274 hw_addr = NULL;
1275 }
1276 ASSERT((flags & NCE_F_MCAST) != 0);
1277 ASSERT((flags & NCE_F_NONUD) != 0);
1278 /* nce_state will be computed by nce_add_common() */
1279 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280 ND_UNCHANGED, &nce);
1281 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282 if (err == 0)
1283 err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284 if (hw_addr != NULL)
1285 kmem_free(hw_addr, ill->ill_nd_lla_len);
1286 if (err != 0) {
1287 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288 return (err);
1289 }
1290 done:
1291 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292 if (newnce != NULL)
1293 *newnce = nce;
1294 else
1295 nce_refrele(nce);
1296 return (0);
1297 }
1298
1299 /*
1300 * Return the link layer address, and any flags of a ncec.
1301 */
1302 int
1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
3313 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3314 ill->ill_sap_length);
3315 }
3316 mutex_exit(&ncec->ncec_lock);
3317 return (nce);
3318 }
3319
3320 /*
3321 * we make nce_fp_mp to have an M_DATA prepend.
3322 * The caller ensures there is hold on ncec for this function.
3323 * Note that since ill_fastpath_probe() copies the mblk there is
3324 * no need to hold the nce or ncec beyond this function.
3325 *
3326 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3327 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3328 * and will be returned back by this function, so that no extra nce_refrele
3329 * is required for the caller. The calls from nce_add_common() use this
3330 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3331 * nce_refrele of the returned nce (when it is non-null).
3332 */
3333 static nce_t *
3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3335 {
3336 nce_t *nce;
3337 ill_t *ill = ncec->ncec_ill;
3338
3339 ASSERT(ill != NULL);
3340
3341 if (IS_IPMP(ill) && trigger_fp_req) {
3342 trigger_fp_req = B_FALSE;
3343 ipmp_ncec_refresh_nce(ncec);
3344 }
3345
3346 /*
3347 * If the caller already has the nce corresponding to the ill, use
3348 * that one. Otherwise we have to lookup/add the nce. Calls from
3349 * nce_add_common() fall in the former category, and have just done
3350 * the nce lookup/add that can be reused.
3351 */
3352 if (ncec_nce == NULL)
3353 nce = nce_fastpath_create(ill, ncec);
3371 int res;
3372 ill_t *ill = nce->nce_ill;
3373 ncec_t *ncec = nce->nce_common;
3374
3375 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3376 /*
3377 * EAGAIN is an indication of a transient error
3378 * i.e. allocation failure etc. leave the ncec in the list it
3379 * will be updated when another probe happens for another ire
3380 * if not it will be taken out of the list when the ire is
3381 * deleted.
3382 */
3383 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3384 nce_fastpath_list_delete(ill, ncec, NULL);
3385 }
3386
3387 /*
3388 * Add ncec to the nce fastpath list on ill.
3389 */
3390 static nce_t *
3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3392 {
3393 nce_t *nce = NULL;
3394
3395 ASSERT(MUTEX_HELD(&ill->ill_lock));
3396 /*
3397 * Atomically ensure that the ill is not CONDEMNED and is not going
3398 * down, before adding the NCE.
3399 */
3400 if (ill->ill_state_flags & ILL_CONDEMNED)
3401 return (NULL);
3402 mutex_enter(&ncec->ncec_lock);
3403 /*
3404 * if ncec has not been deleted and
3405 * is not already in the list add it.
3406 */
3407 if (!NCE_ISCONDEMNED(ncec)) {
3408 nce = nce_lookup(ill, &ncec->ncec_addr);
3409 if (nce != NULL)
3410 goto done;
3411 nce = nce_add(ill, ncec, graveyard);
3412 }
3413 done:
3414 mutex_exit(&ncec->ncec_lock);
3415 return (nce);
3416 }
3417
3418 static nce_t *
3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3420 {
3421 nce_t *nce;
3422 list_t graveyard;
3423
3424 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3425 mutex_enter(&ill->ill_lock);
3426 nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3427 mutex_exit(&ill->ill_lock);
3428 nce_graveyard_free(&graveyard);
3429 return (nce);
3430 }
3431
3432
3433 /*
3434 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3435 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3436 * entry after all locks have been dropped.
3437 */
3438 void
3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3440 {
3441 nce_t *nce;
3442
3443 ASSERT(ill != NULL);
3444
3445 /* delete any nces referencing the ncec from underlying ills */
3446 if (IS_IPMP(ill))
3447 ipmp_ncec_delete_nce(ncec);
3448
3459 mutex_exit(&ill->ill_lock);
3460 if (nce != NULL) {
3461 if (dead == NULL)
3462 nce_refrele(nce);
3463 else
3464 list_insert_tail(dead, nce);
3465 }
3466 }
3467
3468 /*
3469 * when the fastpath response does not fit in the datab
3470 * associated with the existing nce_fp_mp, we delete and
3471 * add the nce to retrigger fastpath based on the information
3472 * in the ncec_t.
3473 */
3474 static nce_t *
3475 nce_delete_then_add(nce_t *nce)
3476 {
3477 ill_t *ill = nce->nce_ill;
3478 nce_t *newnce = NULL;
3479 list_t graveyard;
3480
3481 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3482 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3483 (void *)nce, ill->ill_name));
3484 mutex_enter(&ill->ill_lock);
3485 mutex_enter(&nce->nce_common->ncec_lock);
3486 nce_delete(nce);
3487 /*
3488 * Make sure that ncec is not condemned before adding. We hold the
3489 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3490 * ipmp_ncec_delete_nce()
3491 */
3492 if (!NCE_ISCONDEMNED(nce->nce_common))
3493 newnce = nce_add(ill, nce->nce_common, &graveyard);
3494 mutex_exit(&nce->nce_common->ncec_lock);
3495 mutex_exit(&ill->ill_lock);
3496 nce_graveyard_free(&graveyard);
3497 nce_refrele(nce);
3498 return (newnce); /* could be null if nomem */
3499 }
3500
/*
 * Walker argument for nce_fastpath_match_dlur(): carries the fastpath
 * ack mblk being matched against each nce's nce_dlur_mp, and receives
 * the matching nce (if any) as the result.
 */
typedef struct nce_fp_match_s {
	nce_t	*nce_fp_match_res;	/* result: matching nce, or NULL */
	mblk_t	*nce_fp_match_ack_mp;	/* fastpath ack mp to match against */
} nce_fp_match_t;
3505
3506 /* ARGSUSED */
3507 static int
3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3509 {
3510 nce_fp_match_t *nce_fp_marg = arg;
3511 ncec_t *ncec = nce->nce_common;
3512 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3513 uchar_t *mp_rptr, *ud_mp_rptr;
3514 mblk_t *ud_mp = nce->nce_dlur_mp;
3515 ptrdiff_t cmplen;
3516
4191 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4192 if (hw_addr == NULL) {
4193 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4194 return (ENOMEM);
4195 }
4196 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4197 } else {
4198 /*
4199 * IRE_IF_NORESOLVER type simply copies the resolution
4200 * cookie passed in. So no hw_addr is needed.
4201 */
4202 hw_addr = NULL;
4203 }
4204 ASSERT(flags & NCE_F_MCAST);
4205 ASSERT(flags & NCE_F_NONUD);
4206 /* nce_state will be computed by nce_add_common() */
4207 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4208 ND_UNCHANGED, &nce);
4209 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4210 if (err == 0)
4211 err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4212 if (hw_addr != NULL)
4213 kmem_free(hw_addr, ill->ill_phys_addr_length);
4214 if (err != 0) {
4215 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4216 return (err);
4217 }
4218 done:
4219 if (newnce != NULL)
4220 *newnce = nce;
4221 else
4222 nce_refrele(nce);
4223 return (0);
4224 }
4225
4226 /*
4227 * This is used when scanning for "old" (least recently broadcast) NCEs. We
4228 * don't want to have to walk the list for every single one, so we gather up
4229 * batches at a time.
4230 */
4231 #define NCE_RESCHED_LIST_LEN 8
4615 *
4616 * When the caller passes in an nce_state of ND_UNCHANGED,
4617 * nce_add_common() will determine the state of the created nce based
4618 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4619 * be created with state set to the passed in nce_state.
4620 */
4621 static int
4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4623 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4624 {
4625 static ncec_t nce_nil;
4626 uchar_t *template = NULL;
4627 int err;
4628 ncec_t *ncec;
4629 ncec_t **ncep;
4630 ip_stack_t *ipst = ill->ill_ipst;
4631 uint16_t state;
4632 boolean_t fastprobe = B_FALSE;
4633 struct ndp_g_s *ndp;
4634 nce_t *nce = NULL;
4635 list_t graveyard;
4636 mblk_t *dlur_mp = NULL;
4637
4638 if (ill->ill_isv6)
4639 ndp = ill->ill_ipst->ips_ndp6;
4640 else
4641 ndp = ill->ill_ipst->ips_ndp4;
4642
4643 *retnce = NULL;
4644
4645 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4646
4647 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4648 ip0dbg(("nce_add_common: no addr\n"));
4649 return (EINVAL);
4650 }
4651 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4652 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4653 return (EINVAL);
4654 }
4655
4906 }
4907 /*
4908 * Acquire the ncec_lock even before adding the ncec to the list
4909 * so that it cannot get deleted after the ncec is added, but
4910 * before we add the nce.
4911 */
4912 mutex_enter(&ncec->ncec_lock);
4913 if ((ncec->ncec_next = *ncep) != NULL)
4914 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4915 *ncep = ncec;
4916 ncec->ncec_ptpn = ncep;
4917
4918 /* Bump up the number of ncec's referencing this ill */
4919 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4920 (char *), "ncec", (void *), ncec);
4921 ill->ill_ncec_cnt++;
4922 /*
4923 * Since we hold the ncec_lock at this time, the ncec cannot be
4924 * condemned, and we can safely add the nce.
4925 */
4926 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4927 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4928 mutex_exit(&ncec->ncec_lock);
4929 mutex_exit(&ill->ill_lock);
4930 nce_graveyard_free(&graveyard);
4931
4932 /* caller must trigger fastpath on *retnce */
4933 return (0);
4934
4935 err_ret:
4936 if (ncec != NULL)
4937 kmem_cache_free(ncec_cache, ncec);
4938 if (nce != NULL)
4939 kmem_cache_free(nce_cache, nce);
4940 freemsg(dlur_mp);
4941 if (template != NULL)
4942 kmem_free(template, ill->ill_phys_addr_length);
4943 return (err);
4944 }
4945
4946 /*
4947 * take a ref on the nce
4948 */
4949 void
4950 nce_refhold(nce_t *nce)
4996 ill->ill_nce_cnt--;
4997 nce->nce_ill = NULL;
4998 /*
4999 * If the number of ncec's associated with this ill have dropped
5000 * to zero, check whether we need to restart any operation that
5001 * is waiting for this to happen.
5002 */
5003 if (ILL_DOWN_OK(ill)) {
5004 /* ipif_ill_refrele_tail drops the ill_lock */
5005 ipif_ill_refrele_tail(ill);
5006 } else {
5007 mutex_exit(&ill->ill_lock);
5008 }
5009
5010 mutex_destroy(&nce->nce_lock);
5011 kmem_cache_free(nce_cache, nce);
5012 }
5013
5014 /*
5015 * Add an nce to the ill_nce list.
5016 *
5017 * Adding multicast NCEs is subject to a per-ill limit. This function returns
5018 * NULL if that's the case, and it may reap a number of multicast nces.
5019 * Callers (and upstack) must be able to cope with NULL returns.
5020 */
5021 static nce_t *
5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5023 list_t *graveyard)
5024 {
5025 ASSERT(MUTEX_HELD(&ill->ill_lock));
5026
5027 if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5028 if (nce_too_many_mcast(ill, graveyard)) {
5029 kmem_cache_free(nce_cache, nce);
5030 return (NULL);
5031 }
5032 ill->ill_mcast_nces++;
5033 }
5034
5035 bzero(nce, sizeof (*nce));
5036 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5037 nce->nce_common = ncec;
5038 nce->nce_addr = ncec->ncec_addr;
5039 nce->nce_ill = ill;
5040 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5041 (char *), "nce", (void *), nce);
5042 ill->ill_nce_cnt++;
5043
5044 nce->nce_refcnt = 1; /* for the thread */
5045 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5046 nce->nce_dlur_mp = dlur_mp;
5047
5048 /* add nce to the ill's fastpath list. */
5049 nce->nce_refcnt++; /* for the list */
5050 list_insert_head(&ill->ill_nce, nce);
5051 return (nce);
5052 }
5053
5054 static nce_t *
5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5056 {
5057 nce_t *nce;
5058 mblk_t *dlur_mp = NULL;
5059
5060 ASSERT(MUTEX_HELD(&ill->ill_lock));
5061 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5062
5063 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5064 if (nce == NULL)
5065 return (NULL);
5066 if (ncec->ncec_lladdr != NULL ||
5067 ill->ill_net_type == IRE_IF_NORESOLVER) {
5068 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5069 ill->ill_phys_addr_length, ill->ill_sap,
5070 ill->ill_sap_length);
5071 if (dlur_mp == NULL) {
5072 kmem_cache_free(nce_cache, nce);
5073 return (NULL);
5074 }
5075 }
5076 /*
5077 * If nce_add_impl() returns NULL due to on multicast limiting, caller
5078 * will (correctly) assume ENOMEM.
5079 */
5080 return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5081 }
5082
5083 /*
5084 * remove the nce from the ill_faspath list
5085 */
5086 void
5087 nce_delete(nce_t *nce)
5088 {
5089 ill_t *ill = nce->nce_ill;
5090
5091 ASSERT(MUTEX_HELD(&ill->ill_lock));
5092
5093 mutex_enter(&nce->nce_lock);
5094 if (nce->nce_is_condemned) {
5095 /*
5096 * some other thread has removed this nce from the ill_nce list
5097 */
5098 mutex_exit(&nce->nce_lock);
5099 return;
5100 }
5101 nce->nce_is_condemned = B_TRUE;
5102 mutex_exit(&nce->nce_lock);
5103
5104 /* Update the count of multicast NCEs. */
5105 if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5106 ill->ill_mcast_nces--;
5107
5108 list_remove(&ill->ill_nce, nce);
5109 /*
5110 * even though we are holding the ill_lock, it is ok to
5111 * call nce_refrele here because we know that we should have
5112 * at least 2 refs on the nce: one for the thread, and one
5113 * for the list. The refrele below will release the one for
5114 * the list.
5115 */
5116 nce_refrele(nce);
5117 }
5118
5119 nce_t *
5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5121 {
5122 nce_t *nce = NULL;
5123
5124 ASSERT(ill != NULL);
5125 ASSERT(MUTEX_HELD(&ill->ill_lock));
5126
5127 for (nce = list_head(&ill->ill_nce); nce != NULL;
|