OS-XXXX netstack_find_by_stackid() drops-and-reacquires
OS-5423 deadlock between netstack teardown and kstat read

@@ -35,10 +35,11 @@
 #include <sys/debug.h>
 #include <sys/sdt.h>
 #include <sys/mutex.h>
 #include <sys/bitmap.h>
 #include <sys/atomic.h>
+#include <sys/sunddi.h>
 #include <sys/kobj.h>
 #include <sys/disp.h>
 #include <vm/seg_kmem.h>
 #include <sys/zone.h>
 #include <sys/netstack.h>

@@ -120,16 +121,23 @@
 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
     kmutex_t *);
 
+static void netstack_hold_locked(netstack_t *);
+static void netstack_reap_work(netstack_t *, boolean_t);
+ksema_t netstack_reap_limiter;
+
 void
 netstack_init(void)
 {
         mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);
 
+        /* XXX KEBE SAYS hard-coded constant needs to be fixed. */
+        sema_init(&netstack_reap_limiter, 1024, NULL, SEMA_DRIVER, NULL);
+
         netstack_initialized = 1;
 
         /*
          * We want to be informed each time a zone is created or
          * destroyed in the kernel, so we can maintain the

@@ -1028,12 +1036,12 @@
         mutex_enter(&netstack_g_lock);
         for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
                 mutex_enter(&ns->netstack_lock);
                 if (ns->netstack_stackid == stackid &&
                     !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
+                        netstack_hold_locked(ns);
                         mutex_exit(&ns->netstack_lock);
-                        netstack_hold(ns);
                         mutex_exit(&netstack_g_lock);
                         return (ns);
                 }
                 mutex_exit(&ns->netstack_lock);
         }

@@ -1059,34 +1067,33 @@
         mutex_exit(&netstack_g_lock);
 
         return (rval);
 }
 
-void
-netstack_rele(netstack_t *ns)
+
+static void
+netstack_reap(void *arg)
 {
+        /* Indicate we took a semaphore to get here. */
+        netstack_reap_work((netstack_t *)arg, B_TRUE);
+}
+
+static void
+netstack_reap_intr(void *arg)
+{
+        /* Indicate we did NOT TAKE a semaphore to get here. */
+        netstack_reap_work((netstack_t *)arg, B_FALSE);
+}
+
+static void
+netstack_reap_work(netstack_t *ns, boolean_t semaphore_signal)
+{
         netstack_t **nsp;
         boolean_t found;
-        int refcnt, numzones;
         int i;
 
-        mutex_enter(&ns->netstack_lock);
-        ASSERT(ns->netstack_refcnt > 0);
-        ns->netstack_refcnt--;
         /*
-         * As we drop the lock additional netstack_rele()s can come in
-         * and decrement the refcnt to zero and free the netstack_t.
-         * Store pointers in local variables and if we were not the last
-         * then don't reference the netstack_t after that.
-         */
-        refcnt = ns->netstack_refcnt;
-        numzones = ns->netstack_numzones;
-        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
-        mutex_exit(&ns->netstack_lock);
-
-        if (refcnt == 0 && numzones == 0) {
-                /*
                  * Time to call the destroy functions and free up
                  * the structure
                  */
                 netstack_stack_inactive(ns);
 

@@ -1121,23 +1128,93 @@
                         cv_destroy(&nms->nms_cv);
                 }
                 mutex_destroy(&ns->netstack_lock);
                 cv_destroy(&ns->netstack_cv);
                 kmem_free(ns, sizeof (*ns));
-        }
+        /* Allow another reap to be scheduled. */
+        if (semaphore_signal)
+                sema_v(&netstack_reap_limiter);
 }
 
 void
-netstack_hold(netstack_t *ns)
+netstack_rele(netstack_t *ns)
 {
+        int refcnt, numzones;
+
         mutex_enter(&ns->netstack_lock);
-        ns->netstack_refcnt++;
         ASSERT(ns->netstack_refcnt > 0);
+        ns->netstack_refcnt--;
+        /*
+         * As we drop the lock additional netstack_rele()s can come in
+         * and decrement the refcnt to zero and free the netstack_t.
+         * Store pointers in local variables and if we were not the last
+         * then don't reference the netstack_t after that.
+         */
+        refcnt = ns->netstack_refcnt;
+        numzones = ns->netstack_numzones;
+        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
         mutex_exit(&ns->netstack_lock);
+
+        if (refcnt == 0 && numzones == 0) {
+                boolean_t is_not_intr = !servicing_interrupt();
+
+                /*
+                 * Because there are possibilities of kstats being held by
+                 * callers, which would then be immediately freed, but held up
+                 * due to kstat's odd reference model recording the thread, we
+                 * choose to schedule the actual deletion of this netstack as
+                 * a deferred task on the system taskq.  This way, any
+                 * store-the-thread-pointer semantics won't trip over
+                 * themselves.
+                 *
+                 * On the off chance this is called in interrupt context, we
+                 * cannot use the semaphore to enforce rate-limiting.
+                 */
+                if (is_not_intr && sema_tryp(&netstack_reap_limiter) == 0) {
+                        /*
+                         * XXX KEBE SAYS indicate we're slamming against
+                         * a limit.
+                         */
+                        hrtime_t measurement = gethrtime();
+
+                        sema_p(&netstack_reap_limiter);
+                        /* Capture delay in ns. */
+                        DTRACE_PROBE1(netstack__reap__rate__limited,
+                            hrtime_t *, gethrtime() - measurement);
+                }
+
+                if (taskq_dispatch(system_taskq,
+                    is_not_intr ? netstack_reap : netstack_reap_intr, ns,
+                    TQ_NOSLEEP) == NULL) {
+                        /*
+                         * Well shoot, why can't we taskq_dispatch?
+                         * Take our chances with a direct call.
+                         */
+                        DTRACE_PROBE1(netstack__reap__taskq__fail,
+                            netstack_t *, ns);
+                        netstack_reap_work(ns, is_not_intr);
+                }
+        }
+}
+
+static void
+netstack_hold_locked(netstack_t *ns)
+{
+        ASSERT(MUTEX_HELD(&ns->netstack_lock));
+        ns->netstack_refcnt++;
+        ASSERT(ns->netstack_refcnt > 0);
         DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
 }
 
+void
+netstack_hold(netstack_t *ns)
+{
+        mutex_enter(&ns->netstack_lock);
+        netstack_hold_locked(ns);
+        mutex_exit(&ns->netstack_lock);
+}
+
 /*
  * To support kstat_create_netstack() using kstat_zone_add we need
  * to track both
  *  - all zoneids that use the global/shared stack
  *  - all kstats that have been added for the shared stack