10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
*** 21,31 ****
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
! * Copyright (c) 2018, Joyent, Inc.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
--- 21,31 ----
/*
* Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
! * Copyright (c) 2019, Joyent, Inc.
*/
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
*** 125,136 ****
static void ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
uint16_t, uint16_t, nce_t **);
! static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
! static nce_t *nce_add(ill_t *, ncec_t *);
static void nce_inactive(nce_t *);
extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
uint16_t, uint16_t, nce_t **);
--- 125,136 ----
static void ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
uint16_t, uint16_t, nce_t **);
! static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
! static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
static void nce_inactive(nce_t *);
extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
uint16_t, uint16_t, nce_t **);
*** 1003,1012 ****
--- 1003,1242 ----
ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}
/*
+ * Cheesy globals (i.e. shared by all netstacks) for both a limit on per-ill
+ * multicast NCEs, and the number to reclaim if we hit the limit. Used by
+ * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
+ * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
+ */
+
+ /* Maximum number of multicast NCEs on an ill. */
+ uint_t ip_max_ill_mcast_nces = 16384;
+ /*
+ * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and
+ * return an error. Non-zero means delete so many, and if the number is >=
+ * the max above, that means delete them all.
+ */
+ uint_t ip_ill_mcast_reclaim = 256;
+
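As an illustrative sketch (editorial, not part of the patch), the three regimes these two tunables create once an ill reaches the cap:

	if (ip_ill_mcast_reclaim == 0) {
		/* Don't reclaim; the new multicast NCE fails with ENOMEM. */
	} else if (ip_ill_mcast_reclaim >= ip_max_ill_mcast_nces) {
		/* Reap every multicast nce_t on the ill. */
	} else {
		/* Reap the ip_ill_mcast_reclaim oldest multicast entries. */
	}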
+ /*
+ * Encapsulate multicast ill capping in a function, for easier DTrace
+ * detection. Fill the caller-supplied list with refheld NCEs to destroy
+ * via nce_refrele(). That list may be left empty, and can only be
+ * non-empty if we successfully reclaimed.
+ *
+ * NOTE: This function must be called while holding the ill_lock AND
+ * JUST PRIOR to making the insertion into the ill_nce list.
+ *
+ * We can't release the ones we delete ourselves because the ill_lock is held
+ * by the caller. They are, instead, passed back in a list_t for deletion
+ * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
+ *
+ * While this covers nce_t, the ncec_t cleanup happens even further down the
+ * road. See nce_graveyard_free() for why.
+ */
+ static boolean_t
+ nce_too_many_mcast(ill_t *ill, list_t *graveyard)
+ {
+ uint_t reclaim_count, max_count, reclaimed = 0;
+ boolean_t too_many;
+ nce_t *nce, *deadman;
+
+ ASSERT(graveyard != NULL);
+ ASSERT(list_is_empty(graveyard));
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
+
+ /*
+ * NOTE: Some grinning weirdo may have lowered the global max beyond
+ * what this ill currently has. In that case, each new insertion simply
+ * trims back by the reclaim amount until we are under the max again.
+ */
+ max_count = ip_max_ill_mcast_nces;
+ reclaim_count = min(ip_ill_mcast_reclaim, max_count);
+
+ /* All good? */
+ if (ill->ill_mcast_nces < max_count)
+ return (B_FALSE); /* Yes, all good. */
+
+ if (reclaim_count == 0)
+ return (B_TRUE); /* Don't bother - we're stuck. */
+
+ /* We need to reclaim now. Exploit our held ill_lock. */
+
+ /*
+ * Start at the tail and work backwards; new nces are head-inserted,
+ * so we'll be reaping the oldest entries first.
+ */
+ nce = list_tail(&ill->ill_nce);
+ while (reclaimed < reclaim_count) {
+ /* Skip ahead to a multicast NCE. */
+ while (nce != NULL &&
+ (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
+ nce = list_prev(&ill->ill_nce, nce);
+ }
+ if (nce == NULL)
+ break;
+
+ /*
+ * NOTE: For now, we just delete the first one(s) we find.
+ * This is not optimal; doing better may require inspecting the nce
+ * and its ncec.
+ */
+ deadman = nce;
+ nce = list_prev(&ill->ill_nce, nce);
+
+ /* nce_delete() requires that the caller hold a reference. */
+ nce_refhold(deadman);
+ nce_delete(deadman); /* Bumps down ill_mcast_nces. */
+
+ /* Chain the dead ones on the graveyard list, still refheld... */
+ list_insert_tail(graveyard, deadman);
+ reclaimed++;
+ }
+
+ if (reclaimed != reclaim_count) {
+ /* We didn't have enough to reach reclaim_count. Why?!? */
+ DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
+ uint_t, reclaimed, uint_t, reclaim_count);
+
+ /* In case for some REALLY weird reason we found none! */
+ too_many = (reclaimed == 0);
+ } else {
+ too_many = B_FALSE;
+ }
+
+ return (too_many);
+ }
+
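For orientation, a sketch (editorial, not part of the patch) of the calling discipline the NOTE above implies; the real caller is nce_add_impl(), further below, and `nce` here stands for the entry about to be inserted:

	list_t graveyard;

	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	if (nce_too_many_mcast(ill, &graveyard)) {
		/* Over the cap and nothing reclaimable: give up (ENOMEM). */
	} else {
		list_insert_head(&ill->ill_nce, nce);
	}
	mutex_exit(&ill->ill_lock);
	nce_graveyard_free(&graveyard);	/* Only after dropping ill_lock. */

The DTRACE_PROBE3 above should surface as SDT probe ill-mcast-nce-reclaim-mismatch (double underscores become dashes), giving a cheap way to spot reclaim shortfalls.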
+ static void
+ ncec_mcast_reap_one(ncec_t *ncec, void *arg)
+ {
+ boolean_t reapit;
+ ill_t *ill = (ill_t *)arg;
+
+ /* Obvious no-lock-needed checks... */
+ if (ncec == NULL || ncec->ncec_ill != ill ||
+ (ncec->ncec_flags & NCE_F_MCAST) == 0)
+ return;
+
+ mutex_enter(&ncec->ncec_lock);
+ /*
+ * It's refheld by the walk infrastructure. It has one reference for
+ * being in the ndp_g_hash, and if an nce_t exists, that's one more.
+ * We want ones without an nce_t, so 2 is the magic number. If it's
+ * LESS than 2, we have much bigger problems anyway.
+ */
+ ASSERT(ncec->ncec_refcnt >= 2);
+ reapit = (ncec->ncec_refcnt == 2);
+ mutex_exit(&ncec->ncec_lock);
+
+ if (reapit) {
+ IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
+ ncec_delete(ncec);
+ }
+ }
+
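Condensing the reference arithmetic above into a ledger (editorial sketch, not part of the patch):

	/*
	 * Refs on a multicast ncec_t at this point in the walk:
	 *	ndp_g_hash membership		-> 1
	 *	ncec_walk() hold		-> 1
	 *	live nce_t (if present)		-> 1 more
	 *
	 * So ncec_refcnt == 2 means the nce_t is already gone, i.e. a stray
	 * orphaned by nce_graveyard_free(), and safe to ncec_delete().
	 */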
+ /*
+ * Attempt to reap stray multicast ncec_t structures left in the wake of
+ * nce_graveyard_free(). This is a taskq service routine, so it runs with
+ * no netstack-global locks held (ndp_g_lock in this case). We hold a
+ * reference on the ill, which prevents any unplumbing races.
+ */
+ static void
+ ncec_mcast_reap(void *arg)
+ {
+ ill_t *ill = (ill_t *)arg;
+
+ IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
+ ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
+ mutex_enter(&ill->ill_lock);
+ ill->ill_mcast_ncec_cleanup = B_FALSE;
+ /*
+ * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
+ * below for why.
+ */
+ ill->ill_refcnt--;
+ if (ill->ill_refcnt == 0)
+ ipif_ill_refrele_tail(ill); /* Drops ill_lock. */
+ else
+ mutex_exit(&ill->ill_lock);
+ }
+
+ /*
+ * Free a list (including an empty or NULL list) of reference-held NCEs
+ * that were reaped by a nce_too_many_mcast() call. This is a separate
+ * function because the caller must have dropped ndp_g_lock first.
+ *
+ * This also schedules a taskq task to unlink the underlying NCECs from
+ * the ndp_g_hash, which is protected by ndp_g_lock.
+ */
+ static void
+ nce_graveyard_free(list_t *graveyard)
+ {
+ nce_t *deadman, *current;
+ ill_t *ill;
+ boolean_t doit;
+
+ if (graveyard == NULL)
+ return;
+
+ current = list_head(graveyard);
+ if (current == NULL) {
+ list_destroy(graveyard);
+ return;
+ }
+
+ ill = current->nce_ill;
+ /*
+ * Normally one would ill_refhold(ill) here. There's no _notr()
+ * variant like there is for ire_t, dce_t, or even ncec_t, and this is
+ * the ONE case that would break the mh_trace that IP debugging uses
+ * for reference counts (it assumes the same thread releases as
+ * holds). Instead, we inline an untraced ill_refhold() here, and must
+ * do the same for the release in ncec_mcast_reap() above.
+ */
+ mutex_enter(&ill->ill_lock);
+ ill->ill_refcnt++;
+ mutex_exit(&ill->ill_lock);
+
+ while (current != NULL) {
+ ASSERT3P(ill, ==, current->nce_ill);
+ deadman = current;
+ current = list_next(graveyard, deadman);
+ list_remove(graveyard, deadman);
+ ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
+ 0);
+ nce_refrele(deadman);
+ }
+ list_destroy(graveyard);
+
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_mcast_ncec_cleanup)
+ doit = B_FALSE;
+ else {
+ ill->ill_mcast_ncec_cleanup = B_TRUE;
+ doit = B_TRUE;
+ }
+ mutex_exit(&ill->ill_lock);
+ if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
+ ill, TQ_NOSLEEP) == TASKQID_INVALID) {
+ mutex_enter(&ill->ill_lock);
+ if (doit) {
+ IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
+ ill->ill_mcast_ncec_cleanup = B_FALSE;
+ }
+ /* There's no _notr() for ill_refrele(), so inline it here. */
+ ill->ill_refcnt--;
+ if (ill->ill_refcnt == 0)
+ ipif_ill_refrele_tail(ill); /* Drops ill_lock */
+ else
+ mutex_exit(&ill->ill_lock);
+ }
+ }
+
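Condensed (editorial sketch, not part of the patch): the untraced hold here pairs with the untraced release in ncec_mcast_reap() across two threads, which is exactly the pairing the traced ill_refhold()/ill_refrele() machinery cannot express:

	/* Thread A: nce_graveyard_free() */
	mutex_enter(&ill->ill_lock);
	ill->ill_refcnt++;		/* hold, no mh_trace record */
	mutex_exit(&ill->ill_lock);
	(void) taskq_dispatch(system_taskq, ncec_mcast_reap, ill, TQ_NOSLEEP);
	/* (dispatch-failure rollback elided; see above) */

	/* Thread B (taskq): ncec_mcast_reap() */
	mutex_enter(&ill->ill_lock);
	if (--ill->ill_refcnt == 0)
		ipif_ill_refrele_tail(ill);	/* Drops ill_lock. */
	else
		mutex_exit(&ill->ill_lock);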
+ /*
* For each interface an entry is added for the unspecified multicast group.
* Here that mapping is used to form the multicast cache entry for a particular
* multicast destination.
*/
static int
*** 1048,1058 ****
/* nce_state will be computed by nce_add_common() */
err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
ND_UNCHANGED, &nce);
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
if (err == 0)
! err = nce_add_v6_postprocess(nce);
if (hw_addr != NULL)
kmem_free(hw_addr, ill->ill_nd_lla_len);
if (err != 0) {
ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
return (err);
--- 1278,1288 ----
/* nce_state will be computed by nce_add_common() */
err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
ND_UNCHANGED, &nce);
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
if (err == 0)
! err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
if (hw_addr != NULL)
kmem_free(hw_addr, ill->ill_nd_lla_len);
if (err != 0) {
ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
return (err);
*** 3098,3108 ****
* and will be returned back by this function, so that no extra nce_refrele
* is required for the caller. The calls from nce_add_common() use this
* method. All other callers (that pass in NULL ncec_nce) will have to do a
* nce_refrele of the returned nce (when it is non-null).
*/
! nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
nce_t *nce;
ill_t *ill = ncec->ncec_ill;
--- 3328,3338 ----
* and will be returned back by this function, so that no extra nce_refrele
* is required for the caller. The calls from nce_add_common() use this
* method. All other callers (that pass in NULL ncec_nce) will have to do a
* nce_refrele of the returned nce (when it is non-null).
*/
! static nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
nce_t *nce;
ill_t *ill = ncec->ncec_ill;
*** 3156,3166 ****
/*
* Add ncec to the nce fastpath list on ill.
*/
static nce_t *
! nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
{
nce_t *nce = NULL;
ASSERT(MUTEX_HELD(&ill->ill_lock));
/*
--- 3386,3396 ----
/*
* Add ncec to the nce fastpath list on ill.
*/
static nce_t *
! nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
nce_t *nce = NULL;
ASSERT(MUTEX_HELD(&ill->ill_lock));
/*
*** 3176,3200 ****
*/
if (!NCE_ISCONDEMNED(ncec)) {
nce = nce_lookup(ill, &ncec->ncec_addr);
if (nce != NULL)
goto done;
! nce = nce_add(ill, ncec);
}
done:
mutex_exit(&ncec->ncec_lock);
return (nce);
}
! nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
nce_t *nce;
mutex_enter(&ill->ill_lock);
! nce = nce_ill_lookup_then_add_locked(ill, ncec);
mutex_exit(&ill->ill_lock);
return (nce);
}
/*
--- 3406,3433 ----
*/
if (!NCE_ISCONDEMNED(ncec)) {
nce = nce_lookup(ill, &ncec->ncec_addr);
if (nce != NULL)
goto done;
! nce = nce_add(ill, ncec, graveyard);
}
done:
mutex_exit(&ncec->ncec_lock);
return (nce);
}
! static nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
nce_t *nce;
+ list_t graveyard;
+ list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
mutex_enter(&ill->ill_lock);
! nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
mutex_exit(&ill->ill_lock);
+ nce_graveyard_free(&graveyard);
return (nce);
}
/*
*** 3241,3251 ****
--- 3474,3486 ----
static nce_t *
nce_delete_then_add(nce_t *nce)
{
ill_t *ill = nce->nce_ill;
nce_t *newnce = NULL;
+ list_t graveyard;
+ list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
ip0dbg(("nce_delete_then_add nce %p ill %s\n",
(void *)nce, ill->ill_name));
mutex_enter(&ill->ill_lock);
mutex_enter(&nce->nce_common->ncec_lock);
nce_delete(nce);
*** 3253,3265 ****
* Make sure that ncec is not condemned before adding. We hold the
* ill_lock and ncec_lock to synchronize with ncec_delete() and
* ipmp_ncec_delete_nce()
*/
if (!NCE_ISCONDEMNED(nce->nce_common))
! newnce = nce_add(ill, nce->nce_common);
mutex_exit(&nce->nce_common->ncec_lock);
mutex_exit(&ill->ill_lock);
nce_refrele(nce);
return (newnce); /* could be null if nomem */
}
typedef struct nce_fp_match_s {
--- 3488,3501 ----
* Make sure that ncec is not condemned before adding. We hold the
* ill_lock and ncec_lock to synchronize with ncec_delete() and
* ipmp_ncec_delete_nce()
*/
if (!NCE_ISCONDEMNED(nce->nce_common))
! newnce = nce_add(ill, nce->nce_common, &graveyard);
mutex_exit(&nce->nce_common->ncec_lock);
mutex_exit(&ill->ill_lock);
+ nce_graveyard_free(&graveyard);
nce_refrele(nce);
return (newnce); /* could be null if nomem */
}
typedef struct nce_fp_match_s {
*** 3970,3980 ****
/* nce_state will be computed by nce_add_common() */
err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
ND_UNCHANGED, &nce);
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
if (err == 0)
! err = nce_add_v4_postprocess(nce);
if (hw_addr != NULL)
kmem_free(hw_addr, ill->ill_phys_addr_length);
if (err != 0) {
ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
return (err);
--- 4206,4216 ----
/* nce_state will be computed by nce_add_common() */
err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
ND_UNCHANGED, &nce);
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
if (err == 0)
! err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
if (hw_addr != NULL)
kmem_free(hw_addr, ill->ill_phys_addr_length);
if (err != 0) {
ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
return (err);
*** 4394,4403 ****
--- 4630,4640 ----
ip_stack_t *ipst = ill->ill_ipst;
uint16_t state;
boolean_t fastprobe = B_FALSE;
struct ndp_g_s *ndp;
nce_t *nce = NULL;
+ list_t graveyard;
mblk_t *dlur_mp = NULL;
if (ill->ill_isv6)
ndp = ill->ill_ipst->ips_ndp6;
else
*** 4684,4696 ****
ill->ill_ncec_cnt++;
/*
* Since we hold the ncec_lock at this time, the ncec cannot be
* condemned, and we can safely add the nce.
*/
! *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
mutex_exit(&ncec->ncec_lock);
mutex_exit(&ill->ill_lock);
/* caller must trigger fastpath on *retnce */
return (0);
err_ret:
--- 4921,4935 ----
ill->ill_ncec_cnt++;
/*
* Since we hold the ncec_lock at this time, the ncec cannot be
* condemned, and we can safely add the nce.
*/
! list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
! *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
mutex_exit(&ncec->ncec_lock);
mutex_exit(&ill->ill_lock);
+ nce_graveyard_free(&graveyard);
/* caller must trigger fastpath on *retnce */
return (0);
err_ret:
*** 4772,4785 ****
kmem_cache_free(nce_cache, nce);
}
/*
* Add an nce to the ill_nce list.
*/
static nce_t *
! nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
bzero(nce, sizeof (*nce));
mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
nce->nce_common = ncec;
nce->nce_addr = ncec->ncec_addr;
nce->nce_ill = ill;
--- 5011,5039 ----
kmem_cache_free(nce_cache, nce);
}
/*
* Add an nce to the ill_nce list.
+ *
+ * Adding multicast NCEs is subject to a per-ill limit. At the limit, this
+ * function may reap a number of older multicast nces; it returns NULL if
+ * nothing could be reclaimed. Callers (and upstack) must be able to cope
+ * with NULL returns.
*/
static nce_t *
! nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
! list_t *graveyard)
{
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
+
+ if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
+ if (nce_too_many_mcast(ill, graveyard)) {
+ kmem_cache_free(nce_cache, nce);
+ return (NULL);
+ }
+ ill->ill_mcast_nces++;
+ }
+
bzero(nce, sizeof (*nce));
mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
nce->nce_common = ncec;
nce->nce_addr = ncec->ncec_addr;
nce->nce_ill = ill;
*** 4796,4806 ****
list_insert_head(&ill->ill_nce, nce);
return (nce);
}
static nce_t *
! nce_add(ill_t *ill, ncec_t *ncec)
{
nce_t *nce;
mblk_t *dlur_mp = NULL;
ASSERT(MUTEX_HELD(&ill->ill_lock));
--- 5050,5060 ----
list_insert_head(&ill->ill_nce, nce);
return (nce);
}
static nce_t *
! nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
nce_t *nce;
mblk_t *dlur_mp = NULL;
ASSERT(MUTEX_HELD(&ill->ill_lock));
*** 4817,4827 ****
if (dlur_mp == NULL) {
kmem_cache_free(nce_cache, nce);
return (NULL);
}
}
! return (nce_add_impl(ill, ncec, nce, dlur_mp));
}
/*
* remove the nce from the ill's fastpath list
*/
--- 5071,5085 ----
if (dlur_mp == NULL) {
kmem_cache_free(nce_cache, nce);
return (NULL);
}
}
! /*
! * If nce_add_impl() returns NULL due to multicast limiting, the caller
! * will (correctly) assume ENOMEM.
! */
! return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
}
/*
* remove the nce from the ill's fastpath list
*/
*** 4841,4850 ****
--- 5099,5112 ----
return;
}
nce->nce_is_condemned = B_TRUE;
mutex_exit(&nce->nce_lock);
+ /* Update the count of multicast NCEs. */
+ if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
+ ill->ill_mcast_nces--;
+
list_remove(&ill->ill_nce, nce);
/*
* even though we are holding the ill_lock, it is ok to
* call nce_refrele here because we know that we should have
* at least 2 refs on the nce: one for the thread, and one