Print this page
8900 deadlock between netstack teardown and kstat read
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Ryan Zezeski <rpz@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -20,11 +20,11 @@
  */
 
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright (c) 2016, Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/param.h>
 #include <sys/sysmacros.h>
 #include <sys/vm.h>

@@ -35,10 +35,11 @@
 #include <sys/debug.h>
 #include <sys/sdt.h>
 #include <sys/mutex.h>
 #include <sys/bitmap.h>
 #include <sys/atomic.h>
+#include <sys/sunddi.h>
 #include <sys/kobj.h>
 #include <sys/disp.h>
 #include <vm/seg_kmem.h>
 #include <sys/zone.h>
 #include <sys/netstack.h>

@@ -120,16 +121,26 @@
 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
     kmutex_t *);
 
+static ksema_t netstack_reap_limiter;
+/*
+ * Hard-coded constant, but since this is not tunable in real-time, it seems
+ * making it an /etc/system tunable is better than nothing.
+ */
+uint_t netstack_outstanding_reaps = 1024;
+
 void
 netstack_init(void)
 {
         mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);
 
+        sema_init(&netstack_reap_limiter, netstack_outstanding_reaps, NULL,
+            SEMA_DRIVER, NULL);
+
         netstack_initialized = 1;
 
         /*
          * We want to be informed each time a zone is created or
          * destroyed in the kernel, so we can maintain the

@@ -1059,34 +1070,19 @@
         mutex_exit(&netstack_g_lock);
 
         return (rval);
 }
 
-void
-netstack_rele(netstack_t *ns)
+
+static void
+netstack_reap(void *arg)
 {
-        netstack_t **nsp;
+        netstack_t **nsp, *ns = (netstack_t *)arg;
         boolean_t found;
-        int refcnt, numzones;
         int i;
 
-        mutex_enter(&ns->netstack_lock);
-        ASSERT(ns->netstack_refcnt > 0);
-        ns->netstack_refcnt--;
         /*
-         * As we drop the lock additional netstack_rele()s can come in
-         * and decrement the refcnt to zero and free the netstack_t.
-         * Store pointers in local variables and if we were not the last
-         * then don't reference the netstack_t after that.
-         */
-        refcnt = ns->netstack_refcnt;
-        numzones = ns->netstack_numzones;
-        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
-        mutex_exit(&ns->netstack_lock);
-
-        if (refcnt == 0 && numzones == 0) {
-                /*
                  * Time to call the destroy functions and free up
                  * the structure
                  */
                 netstack_stack_inactive(ns);
 

@@ -1121,11 +1117,61 @@
                         cv_destroy(&nms->nms_cv);
                 }
                 mutex_destroy(&ns->netstack_lock);
                 cv_destroy(&ns->netstack_cv);
                 kmem_free(ns, sizeof (*ns));
+        /* Allow another reap to be scheduled. */
+        sema_v(&netstack_reap_limiter);
+}
+
+void
+netstack_rele(netstack_t *ns)
+{
+        int refcnt, numzones;
+
+        mutex_enter(&ns->netstack_lock);
+        ASSERT(ns->netstack_refcnt > 0);
+        ns->netstack_refcnt--;
+        /*
+         * As we drop the lock additional netstack_rele()s can come in
+         * and decrement the refcnt to zero and free the netstack_t.
+         * Store pointers in local variables and if we were not the last
+         * then don't reference the netstack_t after that.
+         */
+        refcnt = ns->netstack_refcnt;
+        numzones = ns->netstack_numzones;
+        DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
+        mutex_exit(&ns->netstack_lock);
+
+        if (refcnt == 0 && numzones == 0) {
+                /*
+                 * Because there are possibilities of re-entrancy in various
+                 * netstack structures by callers, which might cause a lock up
+                 * due to odd reference models, or other factors, we choose to
+                 * schedule the actual deletion of this netstack as a deferred
+                 * task on the system taskq.  This way, any such reference
+                 * models won't trip over themselves.
+                 *
+                 * Assume we aren't in a high-priority interrupt context, so
+                 * we can use KM_SLEEP and semaphores.
+                 */
+                if (sema_tryp(&netstack_reap_limiter) == 0) {
+                        /*
+                         * Indicate we're slamming against a limit.
+                         */
+                        hrtime_t measurement = gethrtime();
+
+                        sema_p(&netstack_reap_limiter);
+                        /* Capture delay in ns. */
+                        DTRACE_PROBE1(netstack__reap__rate__limited,
+                            hrtime_t, gethrtime() - measurement);
         }
+
+                /* TQ_SLEEP should prevent taskq_dispatch() from failing. */
+                (void) taskq_dispatch(system_taskq, netstack_reap, ns,
+                    TQ_SLEEP);
+        }
 }
 
 void
 netstack_hold(netstack_t *ns)
 {