Print this page
10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

*** 21,31 **** /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ /* ! * Copyright (c) 2018, Joyent, Inc. */ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> --- 21,31 ---- /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ /* ! * Copyright (c) 2019, Joyent, Inc. */ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h>
*** 125,136 **** static void ncec_refhold_locked(ncec_t *); static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, uint16_t, uint16_t, nce_t **); ! static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); ! static nce_t *nce_add(ill_t *, ncec_t *); static void nce_inactive(nce_t *); extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, uint16_t, uint16_t, nce_t **); --- 125,136 ---- static void ncec_refhold_locked(ncec_t *); static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, uint16_t, uint16_t, nce_t **); ! static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *); ! static nce_t *nce_add(ill_t *, ncec_t *, list_t *); static void nce_inactive(nce_t *); extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, uint16_t, uint16_t, nce_t **);
*** 1003,1012 **** --- 1003,1242 ---- ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE); ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE); } /* + * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast + * NCEs, and the number to reclaim if we hit the limit. Used by + * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until + * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this. + */ + + /* Maximum number of multicast NCEs on an ill. */ + uint_t ip_max_ill_mcast_nces = 16384; + /* + * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and + * return an error. Non-zero means delete so many, and if the number is >= + * the max above, that means delete them all. + */ + uint_t ip_ill_mcast_reclaim = 256; + + /* + * Encapsulate multicast ill capping in a function, for easier DTrace + * detections. Return a list of refheld NCEs to destroy-via-refrele. That + * list can be NULL, but can only be non-NULL if we successfully reclaimed. + * + * NOTE: This function must be called while holding the ill_lock AND + * JUST PRIOR to making the insertion into the ill_nce list. + * + * We can't release the ones we delete ourselves because the ill_lock is held + * by the caller. They are, instead, passed back in a list_t for deletion + * outside of the ill_lock hold. nce_graveyard_free() actually frees them. + * + * While this covers nce_t, ncec_t gets done even further down the road. See + * nce_graveyard_free() for why. + */ + static boolean_t + nce_too_many_mcast(ill_t *ill, list_t *graveyard) + { + uint_t reclaim_count, max_count, reclaimed = 0; + boolean_t too_many; + nce_t *nce, *deadman; + + ASSERT(graveyard != NULL); + ASSERT(list_is_empty(graveyard)); + ASSERT(MUTEX_HELD(&ill->ill_lock)); + + /* + * NOTE: Some grinning weirdo may have lowered the global max beyond + * what this ill currently has. 
The behavior in this case will be + * trim-back just by the reclaim amount for any new ones. + */ + max_count = ip_max_ill_mcast_nces; + reclaim_count = min(ip_ill_mcast_reclaim, max_count); + + /* All good? */ + if (ill->ill_mcast_nces < max_count) + return (B_FALSE); /* Yes, all good. */ + + if (reclaim_count == 0) + return (B_TRUE); /* Don't bother - we're stuck. */ + + /* We need to reclaim now. Exploit our held ill_lock. */ + + /* + * Start at the tail and work backwards, new nces are head-inserted, + * so we'll be reaping the oldest entries. + */ + nce = list_tail(&ill->ill_nce); + while (reclaimed < reclaim_count) { + /* Skip ahead to a multicast NCE. */ + while (nce != NULL && + (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) { + nce = list_prev(&ill->ill_nce, nce); + } + if (nce == NULL) + break; + + /* + * NOTE: For now, we just delete the first one(s) we find. + * This is not optimal, and may require some inspection of nce + * & its ncec to be better. + */ + deadman = nce; + nce = list_prev(&ill->ill_nce, nce); + + /* nce_delete() requires caller holds... */ + nce_refhold(deadman); + nce_delete(deadman); /* Bumps down ill_mcast_nces. */ + + /* Link the dead ones singly, still refheld... */ + list_insert_tail(graveyard, deadman); + reclaimed++; + } + + if (reclaimed != reclaim_count) { + /* We didn't have enough to reach reclaim_count. Why?!? */ + DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill, + uint_t, reclaimed, uint_t, reclaim_count); + + /* In case for some REALLY weird reason we found none! */ + too_many = (reclaimed == 0); + } else { + too_many = B_FALSE; + } + + return (too_many); + } + + static void + ncec_mcast_reap_one(ncec_t *ncec, void *arg) + { + boolean_t reapit; + ill_t *ill = (ill_t *)arg; + + /* Obvious no-lock-needed checks... */ + if (ncec == NULL || ncec->ncec_ill != ill || + (ncec->ncec_flags & NCE_F_MCAST) == 0) + return; + + mutex_enter(&ncec->ncec_lock); + /* + * It's refheld by the walk infrastructure. 
It has one reference for + * being in the ndp_g_hash, and if an nce_t exists, that's one more. + * We want ones without an nce_t, so 2 is the magic number. If it's + * LESS than 2, we have much bigger problems anyway. + */ + ASSERT(ncec->ncec_refcnt >= 2); + reapit = (ncec->ncec_refcnt == 2); + mutex_exit(&ncec->ncec_lock); + + if (reapit) { + IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted); + ncec_delete(ncec); + } + } + + /* + * Attempt to reap stray multicast ncec_t structures left in the wake of + * nce_graveyard_free(). This is a taskq servicing routine, as it's well + * outside any netstack-global locks being held - ndp_g_lock in this case. We + * have a reference hold on the ill, which will prevent any unplumbing races. + */ + static void + ncec_mcast_reap(void *arg) + { + ill_t *ill = (ill_t *)arg; + + IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls); + ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst); + mutex_enter(&ill->ill_lock); + ill->ill_mcast_ncec_cleanup = B_FALSE; + /* + * Inline a _notr() version of ill_refrele. See nce_graveyard_free() + * below for why. + */ + ill->ill_refcnt--; + if (ill->ill_refcnt == 0) + ipif_ill_refrele_tail(ill); /* Drops ill_lock. */ + else + mutex_exit(&ill->ill_lock); + } + + /* + * Free a list (including handling an empty list or NULL list) of + * reference-held NCEs that were reaped from a nce_too_many_mcast() + * call. Separate because the caller must have dropped ndp_g_lock first. + * + * This also schedules a taskq task to unlink underlying NCECs from the + * ndp_g_hash, which are protected by ndp_g_lock. + */ + static void + nce_graveyard_free(list_t *graveyard) + { + nce_t *deadman, *current; + ill_t *ill; + boolean_t doit; + + if (graveyard == NULL) + return; + + current = list_head(graveyard); + if (current == NULL) { + list_destroy(graveyard); + return; + } + + ill = current->nce_ill; + /* + * Normally one should ill_refhold(ill) here. 
There's no _notr() + * variant like there is for ire_t, dce_t, or even ncec_t, but this is + * the ONLY case that'll break the mh_trace that IP debugging uses for + * reference counts (i.e. they assume same thread releases as + * holds). Instead, we inline ill_refhold() here. We must do the same + * in the release done by the ncec_mcast_reap() above. + */ + mutex_enter(&ill->ill_lock); + ill->ill_refcnt++; + mutex_exit(&ill->ill_lock); + + while (current != NULL) { + ASSERT3P(ill, ==, current->nce_ill); + deadman = current; + current = list_next(graveyard, deadman); + list_remove(graveyard, deadman); + ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=, + 0); + nce_refrele(deadman); + } + list_destroy(graveyard); + + mutex_enter(&ill->ill_lock); + if (ill->ill_mcast_ncec_cleanup) + doit = B_FALSE; + else { + ill->ill_mcast_ncec_cleanup = B_TRUE; + doit = B_TRUE; + } + mutex_exit(&ill->ill_lock); + if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap, + ill, TQ_NOSLEEP) == NULL) { + mutex_enter(&ill->ill_lock); + if (doit) { + IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail); + ill->ill_mcast_ncec_cleanup = B_FALSE; + } + /* There's no _notr() for ill_refrele(), so inline it here. */ + ill->ill_refcnt--; + if (ill->ill_refcnt == 0) + ipif_ill_refrele_tail(ill); /* Drops ill_lock */ + else + mutex_exit(&ill->ill_lock); + } + } + + /* * For each interface an entry is added for the unspecified multicast group. * Here that mapping is used to form the multicast cache entry for a particular * multicast destination. */ static int
*** 1048,1058 **** /* nce_state will be computed by nce_add_common() */ err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (err == 0) ! err = nce_add_v6_postprocess(nce); if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_nd_lla_len); if (err != 0) { ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); return (err); --- 1278,1288 ---- /* nce_state will be computed by nce_add_common() */ err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (err == 0) ! err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM; if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_nd_lla_len); if (err != 0) { ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); return (err);
*** 3098,3108 **** * and will be returned back by this function, so that no extra nce_refrele * is required for the caller. The calls from nce_add_common() use this * method. All other callers (that pass in NULL ncec_nce) will have to do a * nce_refrele of the returned nce (when it is non-null). */ ! nce_t * nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) { nce_t *nce; ill_t *ill = ncec->ncec_ill; --- 3328,3338 ---- * and will be returned back by this function, so that no extra nce_refrele * is required for the caller. The calls from nce_add_common() use this * method. All other callers (that pass in NULL ncec_nce) will have to do a * nce_refrele of the returned nce (when it is non-null). */ ! static nce_t * nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) { nce_t *nce; ill_t *ill = ncec->ncec_ill;
*** 3156,3166 **** /* * Add ncec to the nce fastpath list on ill. */ static nce_t * ! nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) { nce_t *nce = NULL; ASSERT(MUTEX_HELD(&ill->ill_lock)); /* --- 3386,3396 ---- /* * Add ncec to the nce fastpath list on ill. */ static nce_t * ! nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard) { nce_t *nce = NULL; ASSERT(MUTEX_HELD(&ill->ill_lock)); /*
*** 3176,3200 **** */ if (!NCE_ISCONDEMNED(ncec)) { nce = nce_lookup(ill, &ncec->ncec_addr); if (nce != NULL) goto done; ! nce = nce_add(ill, ncec); } done: mutex_exit(&ncec->ncec_lock); return (nce); } ! nce_t * nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) { nce_t *nce; mutex_enter(&ill->ill_lock); ! nce = nce_ill_lookup_then_add_locked(ill, ncec); mutex_exit(&ill->ill_lock); return (nce); } /* --- 3406,3433 ---- */ if (!NCE_ISCONDEMNED(ncec)) { nce = nce_lookup(ill, &ncec->ncec_addr); if (nce != NULL) goto done; ! nce = nce_add(ill, ncec, graveyard); } done: mutex_exit(&ncec->ncec_lock); return (nce); } ! static nce_t * nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) { nce_t *nce; + list_t graveyard; + list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); mutex_enter(&ill->ill_lock); ! nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard); mutex_exit(&ill->ill_lock); + nce_graveyard_free(&graveyard); return (nce); } /*
*** 3241,3251 **** --- 3474,3486 ---- static nce_t * nce_delete_then_add(nce_t *nce) { ill_t *ill = nce->nce_ill; nce_t *newnce = NULL; + list_t graveyard; + list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); ip0dbg(("nce_delete_then_add nce %p ill %s\n", (void *)nce, ill->ill_name)); mutex_enter(&ill->ill_lock); mutex_enter(&nce->nce_common->ncec_lock); nce_delete(nce);
*** 3253,3265 **** * Make sure that ncec is not condemned before adding. We hold the * ill_lock and ncec_lock to synchronize with ncec_delete() and * ipmp_ncec_delete_nce() */ if (!NCE_ISCONDEMNED(nce->nce_common)) ! newnce = nce_add(ill, nce->nce_common); mutex_exit(&nce->nce_common->ncec_lock); mutex_exit(&ill->ill_lock); nce_refrele(nce); return (newnce); /* could be null if nomem */ } typedef struct nce_fp_match_s { --- 3488,3501 ---- * Make sure that ncec is not condemned before adding. We hold the * ill_lock and ncec_lock to synchronize with ncec_delete() and * ipmp_ncec_delete_nce() */ if (!NCE_ISCONDEMNED(nce->nce_common)) ! newnce = nce_add(ill, nce->nce_common, &graveyard); mutex_exit(&nce->nce_common->ncec_lock); mutex_exit(&ill->ill_lock); + nce_graveyard_free(&graveyard); nce_refrele(nce); return (newnce); /* could be null if nomem */ } typedef struct nce_fp_match_s {
*** 3970,3980 **** /* nce_state will be computed by nce_add_common() */ err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp4->ndp_g_lock); if (err == 0) ! err = nce_add_v4_postprocess(nce); if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_phys_addr_length); if (err != 0) { ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); return (err); --- 4206,4216 ---- /* nce_state will be computed by nce_add_common() */ err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp4->ndp_g_lock); if (err == 0) ! err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM; if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_phys_addr_length); if (err != 0) { ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); return (err);
*** 4394,4403 **** --- 4630,4640 ---- ip_stack_t *ipst = ill->ill_ipst; uint16_t state; boolean_t fastprobe = B_FALSE; struct ndp_g_s *ndp; nce_t *nce = NULL; + list_t graveyard; mblk_t *dlur_mp = NULL; if (ill->ill_isv6) ndp = ill->ill_ipst->ips_ndp6; else
*** 4684,4696 **** ill->ill_ncec_cnt++; /* * Since we hold the ncec_lock at this time, the ncec cannot be * condemned, and we can safely add the nce. */ ! *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); mutex_exit(&ncec->ncec_lock); mutex_exit(&ill->ill_lock); /* caller must trigger fastpath on *retnce */ return (0); err_ret: --- 4921,4935 ---- ill->ill_ncec_cnt++; /* * Since we hold the ncec_lock at this time, the ncec cannot be * condemned, and we can safely add the nce. */ ! list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); ! *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard); mutex_exit(&ncec->ncec_lock); mutex_exit(&ill->ill_lock); + nce_graveyard_free(&graveyard); /* caller must trigger fastpath on *retnce */ return (0); err_ret:
*** 4772,4785 **** kmem_cache_free(nce_cache, nce); } /* * Add an nce to the ill_nce list. */ static nce_t * ! nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) { bzero(nce, sizeof (*nce)); mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); nce->nce_common = ncec; nce->nce_addr = ncec->ncec_addr; nce->nce_ill = ill; --- 5011,5039 ---- kmem_cache_free(nce_cache, nce); } /* * Add an nce to the ill_nce list. + * + * Adding multicast NCEs is subject to a per-ill limit. This function returns + * NULL if that's the case, and it may reap a number of multicast nces. + * Callers (and upstack) must be able to cope with NULL returns. */ static nce_t * ! nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp, ! list_t *graveyard) { + ASSERT(MUTEX_HELD(&ill->ill_lock)); + + if ((ncec->ncec_flags & NCE_F_MCAST) != 0) { + if (nce_too_many_mcast(ill, graveyard)) { + kmem_cache_free(nce_cache, nce); + return (NULL); + } + ill->ill_mcast_nces++; + } + bzero(nce, sizeof (*nce)); mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); nce->nce_common = ncec; nce->nce_addr = ncec->ncec_addr; nce->nce_ill = ill;
*** 4796,4806 **** list_insert_head(&ill->ill_nce, nce); return (nce); } static nce_t * ! nce_add(ill_t *ill, ncec_t *ncec) { nce_t *nce; mblk_t *dlur_mp = NULL; ASSERT(MUTEX_HELD(&ill->ill_lock)); --- 5050,5060 ---- list_insert_head(&ill->ill_nce, nce); return (nce); } static nce_t * ! nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard) { nce_t *nce; mblk_t *dlur_mp = NULL; ASSERT(MUTEX_HELD(&ill->ill_lock));
*** 4817,4827 **** if (dlur_mp == NULL) { kmem_cache_free(nce_cache, nce); return (NULL); } } ! return (nce_add_impl(ill, ncec, nce, dlur_mp)); } /* * remove the nce from the ill_faspath list */ --- 5071,5085 ---- if (dlur_mp == NULL) { kmem_cache_free(nce_cache, nce); return (NULL); } } ! /* ! * If nce_add_impl() returns NULL due to multicast limiting, caller ! * will (correctly) assume ENOMEM. ! */ ! return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard)); } /* * remove the nce from the ill_faspath list */
*** 4841,4850 **** --- 5099,5112 ---- return; } nce->nce_is_condemned = B_TRUE; mutex_exit(&nce->nce_lock); + /* Update the count of multicast NCEs. */ + if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST) + ill->ill_mcast_nces--; + list_remove(&ill->ill_nce, nce); /* * even though we are holding the ill_lock, it is ok to * call nce_refrele here because we know that we should have * at least 2 refs on the nce: one for the thread, and one