Print this page
10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -21,11 +21,11 @@
 /*
  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2019, Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/stream.h>
 #include <sys/stropts.h>

@@ -125,12 +125,12 @@
 static void     ncec_refhold_locked(ncec_t *);
 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
     uint16_t, uint16_t, nce_t **);
-static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
-static nce_t *nce_add(ill_t *, ncec_t *);
+static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
+static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
 static void nce_inactive(nce_t *);
 extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
     uint16_t, uint16_t, nce_t **);

@@ -1003,10 +1003,240 @@
         ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
         ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
 }
 
 /*
+ * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
+ * NCEs, and the number to reclaim if we hit the limit.  Used by
+ * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
+ * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
+ */
+
+/* Maximum number of multicast NCEs on an ill. */
+uint_t ip_max_ill_mcast_nces = 16384;
+/*
+ * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
+ * return an error.  Non-zero means delete so many, and if the number is >=
+ * the max above, that means delete them all.
+ */
+uint_t ip_ill_mcast_reclaim = 256;
+
+/*
+ * Encapsulate multicast ill capping in a function, for easier DTrace
+ * detections.  Fill in a list of refheld NCEs to destroy-via-refrele.  That
+ * list can be empty, but can only be non-empty if we successfully reclaimed.
+ *
+ * NOTE:  This function must be called while holding the ill_lock AND
+ * JUST PRIOR to making the insertion into the ill_nce list.
+ *
+ * We can't release the ones we delete ourselves because the ill_lock is held
+ * by the caller. They are, instead, passed back in a list_t for deletion
+ * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
+ *
+ * While this covers nce_t, ncec_t gets done even further down the road.  See
+ * nce_graveyard_free() for why.
+ */
+static boolean_t
+nce_too_many_mcast(ill_t *ill, list_t *graveyard)
+{
+        uint_t reclaim_count, max_count, reclaimed = 0;
+        boolean_t too_many;
+        nce_t *nce, *deadman;
+
+        ASSERT(graveyard != NULL);
+        ASSERT(list_is_empty(graveyard));
+        ASSERT(MUTEX_HELD(&ill->ill_lock));
+
+        /*
+         * NOTE: Some grinning weirdo may have lowered the global max below
+         * what this ill currently has.  In that case, each new insertion
+         * only trims the list back by the reclaim amount.
+         */
+        max_count = ip_max_ill_mcast_nces;
+        reclaim_count = min(ip_ill_mcast_reclaim, max_count);
+
+        /* All good? */
+        if (ill->ill_mcast_nces < max_count)
+                return (B_FALSE);       /* Yes, all good. */
+
+        if (reclaim_count == 0)
+                return (B_TRUE);        /* Don't bother - we're stuck. */
+
+        /* We need to reclaim now.  Exploit our held ill_lock. */
+
+        /*
+         * Start at the tail and work backwards, new nces are head-inserted,
+         * so we'll be reaping the oldest entries.
+         */
+        nce = list_tail(&ill->ill_nce);
+        while (reclaimed < reclaim_count) {
+                /* Skip ahead to a multicast NCE. */
+                while (nce != NULL &&
+                    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
+                        nce = list_prev(&ill->ill_nce, nce);
+                }
+                if (nce == NULL)
+                        break;
+
+                /*
+                 * NOTE: For now, we just delete the first one(s) we find.
+                 * This is not optimal, and may require some inspection of nce
+                 * & its ncec to be better.
+                 */
+                deadman = nce;
+                nce = list_prev(&ill->ill_nce, nce);
+
+                /* nce_delete() requires caller holds... */
+                nce_refhold(deadman);
+                nce_delete(deadman);    /* Bumps down ill_mcast_nces. */
+
+                /* Queue the dead ones on the graveyard, still refheld... */
+                list_insert_tail(graveyard, deadman);
+                reclaimed++;
+        }
+
+        if (reclaimed != reclaim_count) {
+                /* We didn't have enough to reach reclaim_count. Why?!? */
+                DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
+                    uint_t, reclaimed, uint_t, reclaim_count);
+
+                /* In case for some REALLY weird reason we found none! */
+                too_many = (reclaimed == 0);
+        } else {
+                too_many = B_FALSE;
+        }
+
+        return (too_many);
+}
+
+static void
+ncec_mcast_reap_one(ncec_t *ncec, void *arg)
+{
+        boolean_t reapit;
+        ill_t *ill = (ill_t *)arg;
+
+        /* Obvious no-lock-needed checks... */
+        if (ncec == NULL || ncec->ncec_ill != ill ||
+            (ncec->ncec_flags & NCE_F_MCAST) == 0)
+                return;
+
+        mutex_enter(&ncec->ncec_lock);
+        /*
+         * It's refheld by the walk infrastructure. It has one reference for
+         * being in the ndp_g_hash, and if an nce_t exists, that's one more.
+         * We want ones without an nce_t, so 2 is the magic number.  If it's
+         * LESS than 2, we have much bigger problems anyway.
+         */
+        ASSERT(ncec->ncec_refcnt >= 2);
+        reapit = (ncec->ncec_refcnt == 2);
+        mutex_exit(&ncec->ncec_lock);
+
+        if (reapit) {
+                IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
+                ncec_delete(ncec);
+        }
+}
+
+/*
+ * Attempt to reap stray multicast ncec_t structures left in the wake of
+ * nce_graveyard_free(). This is a taskq servicing routine, as it's well
+ * outside any netstack-global locks being held - ndp_g_lock in this case.  We
+ * have a reference hold on the ill, which will prevent any unplumbing races.
+ */
+static void
+ncec_mcast_reap(void *arg)
+{
+        ill_t *ill = (ill_t *)arg;
+
+        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
+        ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
+        mutex_enter(&ill->ill_lock);
+        ill->ill_mcast_ncec_cleanup = B_FALSE;
+        /*
+         * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
+         * below for why.
+         */
+        ill->ill_refcnt--;
+        if (ill->ill_refcnt == 0)
+                ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
+        else
+                mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Free a list (including handling an empty list or NULL list) of
+ * reference-held NCEs that were reaped from a nce_too_many_mcast()
+ * call. Separate because the caller must have dropped ndp_g_lock first.
+ *
+ * This also schedules a taskq task to unlink underlying NCECs from the
+ * ndp_g_hash, which are protected by ndp_g_lock.
+ */
+static void
+nce_graveyard_free(list_t *graveyard)
+{
+        nce_t *deadman, *current;
+        ill_t *ill;
+        boolean_t doit;
+
+        if (graveyard == NULL)
+                return;
+
+        current = list_head(graveyard);
+        if (current == NULL) {
+                list_destroy(graveyard);
+                return;
+        }
+
+        ill = current->nce_ill;
+        /*
+         * Normally one should ill_refhold(ill) here.  There's no _notr()
+         * variant like there is for ire_t, dce_t, or even ncec_t, but this is
+         * the ONLY case that'll break the mh_trace that IP debugging uses for
+         * reference counts (i.e. they assume same thread releases as
+         * holds). Instead, we inline ill_refhold() here.  We must do the same
+         * in the release done by the ncec_mcast_reap() above.
+         */
+        mutex_enter(&ill->ill_lock);
+        ill->ill_refcnt++;
+        mutex_exit(&ill->ill_lock);
+
+        while (current != NULL) {
+                ASSERT3P(ill, ==, current->nce_ill);
+                deadman = current;
+                current = list_next(graveyard, deadman);
+                list_remove(graveyard, deadman);
+                ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
+                    0);
+                nce_refrele(deadman);
+        }
+        list_destroy(graveyard);
+
+        mutex_enter(&ill->ill_lock);
+        if (ill->ill_mcast_ncec_cleanup)
+                doit = B_FALSE;
+        else {
+                ill->ill_mcast_ncec_cleanup = B_TRUE;
+                doit = B_TRUE;
+        }
+        mutex_exit(&ill->ill_lock);
+        if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
+            ill, TQ_NOSLEEP) == NULL) {
+                mutex_enter(&ill->ill_lock);
+                if (doit) {
+                        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
+                        ill->ill_mcast_ncec_cleanup = B_FALSE;
+                }
+                /* There's no _notr() for ill_refrele(), so inline it here. */
+                ill->ill_refcnt--;
+                if (ill->ill_refcnt == 0)
+                        ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
+                else
+                        mutex_exit(&ill->ill_lock);
+        }
+}
+
+/*
  * For each interface an entry is added for the unspecified multicast group.
  * Here that mapping is used to form the multicast cache entry for a particular
  * multicast destination.
  */
 static int

@@ -1048,11 +1278,11 @@
         /* nce_state will be computed by nce_add_common() */
         err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
             ND_UNCHANGED, &nce);
         mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
         if (err == 0)
-                err = nce_add_v6_postprocess(nce);
+                err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
         if (hw_addr != NULL)
                 kmem_free(hw_addr, ill->ill_nd_lla_len);
         if (err != 0) {
                 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
                 return (err);

@@ -3098,11 +3328,11 @@
  * and will be returned back by this function, so that no extra nce_refrele
  * is required for the caller. The calls from nce_add_common() use this
  * method. All other callers (that pass in NULL ncec_nce) will have to do a
  * nce_refrele of the returned nce (when it is non-null).
  */
-nce_t *
+static nce_t *
 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
 {
         nce_t *nce;
         ill_t *ill = ncec->ncec_ill;
 

@@ -3156,11 +3386,11 @@
 
 /*
  * Add ncec to the nce fastpath list on ill.
  */
 static nce_t *
-nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
+nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
 {
         nce_t *nce = NULL;
 
         ASSERT(MUTEX_HELD(&ill->ill_lock));
         /*

@@ -3176,25 +3406,28 @@
          */
         if (!NCE_ISCONDEMNED(ncec)) {
                 nce = nce_lookup(ill, &ncec->ncec_addr);
                 if (nce != NULL)
                         goto done;
-                nce = nce_add(ill, ncec);
+                nce = nce_add(ill, ncec, graveyard);
         }
 done:
         mutex_exit(&ncec->ncec_lock);
         return (nce);
 }
 
-nce_t *
+static nce_t *
 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
 {
         nce_t *nce;
+        list_t graveyard;
 
+        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
         mutex_enter(&ill->ill_lock);
-        nce = nce_ill_lookup_then_add_locked(ill, ncec);
+        nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
         mutex_exit(&ill->ill_lock);
+        nce_graveyard_free(&graveyard);
         return (nce);
 }
 
 
 /*

@@ -3241,11 +3474,13 @@
 static nce_t *
 nce_delete_then_add(nce_t *nce)
 {
         ill_t           *ill = nce->nce_ill;
         nce_t           *newnce = NULL;
+        list_t          graveyard;
 
+        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
         ip0dbg(("nce_delete_then_add nce %p ill %s\n",
             (void *)nce, ill->ill_name));
         mutex_enter(&ill->ill_lock);
         mutex_enter(&nce->nce_common->ncec_lock);
         nce_delete(nce);

@@ -3253,13 +3488,14 @@
          * Make sure that ncec is not condemned before adding. We hold the
          * ill_lock and ncec_lock to synchronize with ncec_delete() and
          * ipmp_ncec_delete_nce()
          */
         if (!NCE_ISCONDEMNED(nce->nce_common))
-                newnce = nce_add(ill, nce->nce_common);
+                newnce = nce_add(ill, nce->nce_common, &graveyard);
         mutex_exit(&nce->nce_common->ncec_lock);
         mutex_exit(&ill->ill_lock);
+        nce_graveyard_free(&graveyard);
         nce_refrele(nce);
         return (newnce); /* could be null if nomem */
 }
 
 typedef struct nce_fp_match_s {

@@ -3970,11 +4206,11 @@
         /* nce_state will be computed by nce_add_common() */
         err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
             ND_UNCHANGED, &nce);
         mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
         if (err == 0)
-                err = nce_add_v4_postprocess(nce);
+                err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
         if (hw_addr != NULL)
                 kmem_free(hw_addr, ill->ill_phys_addr_length);
         if (err != 0) {
                 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
                 return (err);

@@ -4394,10 +4630,11 @@
         ip_stack_t              *ipst = ill->ill_ipst;
         uint16_t                state;
         boolean_t               fastprobe = B_FALSE;
         struct ndp_g_s          *ndp;
         nce_t                   *nce = NULL;
+        list_t                  graveyard;
         mblk_t                  *dlur_mp = NULL;
 
         if (ill->ill_isv6)
                 ndp = ill->ill_ipst->ips_ndp6;
         else

@@ -4684,13 +4921,15 @@
         ill->ill_ncec_cnt++;
         /*
          * Since we hold the ncec_lock at this time, the ncec cannot be
          * condemned, and we can safely add the nce.
          */
-        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
+        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
+        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
         mutex_exit(&ncec->ncec_lock);
         mutex_exit(&ill->ill_lock);
+        nce_graveyard_free(&graveyard);
 
         /* caller must trigger fastpath on *retnce */
         return (0);
 
 err_ret:

@@ -4772,14 +5011,29 @@
         kmem_cache_free(nce_cache, nce);
 }
 
 /*
  * Add an nce to the ill_nce list.
+ *
+ * Adding multicast NCEs is subject to a per-ill limit. This function returns
+ * NULL if that limit is hit and no NCEs could be reclaimed; it may otherwise
+ * reap some multicast NCEs. Callers (and upstack) must cope with NULL returns.
  */
 static nce_t *
-nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
+nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
+    list_t *graveyard)
 {
+        ASSERT(MUTEX_HELD(&ill->ill_lock));
+
+        if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
+                if (nce_too_many_mcast(ill, graveyard)) {
+                        kmem_cache_free(nce_cache, nce);
+                        return (NULL);
+                }
+                ill->ill_mcast_nces++;
+        }
+
         bzero(nce, sizeof (*nce));
         mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
         nce->nce_common = ncec;
         nce->nce_addr = ncec->ncec_addr;
         nce->nce_ill = ill;

@@ -4796,11 +5050,11 @@
         list_insert_head(&ill->ill_nce, nce);
         return (nce);
 }
 
 static nce_t *
-nce_add(ill_t *ill, ncec_t *ncec)
+nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
 {
         nce_t   *nce;
         mblk_t  *dlur_mp = NULL;
 
         ASSERT(MUTEX_HELD(&ill->ill_lock));

@@ -4817,11 +5071,15 @@
                 if (dlur_mp == NULL) {
                         kmem_cache_free(nce_cache, nce);
                         return (NULL);
                 }
         }
-        return (nce_add_impl(ill, ncec, nce, dlur_mp));
+        /*
+         * If nce_add_impl() returns NULL due to multicast limiting, caller
+         * will (correctly) assume ENOMEM.
+         */
+        return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
 }
 
 /*
  * remove the nce from the ill_faspath list
  */

@@ -4841,10 +5099,14 @@
                 return;
         }
         nce->nce_is_condemned = B_TRUE;
         mutex_exit(&nce->nce_lock);
 
+        /* Update the count of multicast NCEs. */
+        if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
+                ill->ill_mcast_nces--;
+
         list_remove(&ill->ill_nce, nce);
         /*
          * even though we are holding the ill_lock, it is ok to
          * call nce_refrele here because we know that we should have
          * at least 2 refs on the nce: one for the thread, and one