Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

@@ -19,10 +19,11 @@
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>

@@ -400,12 +401,13 @@
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-        dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+        dnode_t *dn;
 
+        dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
         ASSERT(!POINTER_IS_VALID(dn->dn_objset));
         dn->dn_moved = 0;
 
         /*
          * Defer setting dn_objset until the dnode is ready to be a candidate

@@ -438,17 +440,35 @@
         dmu_zfetch_init(&dn->dn_zfetch, dn);
 
         ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
         mutex_enter(&os->os_lock);
+        if (dnh->dnh_dnode != NULL) {
+                /* Lost the allocation race. */
+                mutex_exit(&os->os_lock);
+                kmem_cache_free(dnode_cache, dn);
+                return (dnh->dnh_dnode);
+        }
+
+        /*
+         * Exclude special dnodes from os_dnodes so an empty os_dnodes
+         * signifies that the special dnodes have no references from
+         * their children (the entries in os_dnodes).  This allows
+         * dnode_destroy() to easily determine if the last child has
+         * been removed and then complete eviction of the objset.
+         */
+        if (!DMU_OBJECT_IS_SPECIAL(object))
         list_insert_head(&os->os_dnodes, dn);
         membar_producer();
+
         /*
-         * Everything else must be valid before assigning dn_objset makes the
-         * dnode eligible for dnode_move().
+         * Everything else must be valid before assigning dn_objset
+         * makes the dnode eligible for dnode_move().
          */
         dn->dn_objset = os;
+
+        dnh->dnh_dnode = dn;
         mutex_exit(&os->os_lock);
 
         arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
         return (dn);
 }

@@ -458,16 +478,22 @@
  */
 static void
 dnode_destroy(dnode_t *dn)
 {
         objset_t *os = dn->dn_objset;
+        boolean_t complete_os_eviction = B_FALSE;
 
         ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
         mutex_enter(&os->os_lock);
         POINTER_INVALIDATE(&dn->dn_objset);
+        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
         list_remove(&os->os_dnodes, dn);
+                complete_os_eviction =
+                    list_is_empty(&os->os_dnodes) &&
+                    list_link_active(&os->os_evicting_node);
+        }
         mutex_exit(&os->os_lock);
 
         /* the dnode can no longer move, so we can release the handle */
         zrl_remove(&dn->dn_handle->dnh_zrlock);
 

@@ -498,10 +524,13 @@
         dn->dn_unlisted_l0_blkid = 0;
 
         dmu_zfetch_rele(&dn->dn_zfetch);
         kmem_cache_free(dnode_cache, dn);
         arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+        if (complete_os_eviction)
+                dmu_objset_evict_done(os);
 }
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)

@@ -964,37 +993,36 @@
          * has a hold on this dnode while we are trying to evict this
          * dnode.
          */
         while (refcount_count(&dn->dn_holds) > 0)
                 delay(1);
+        ASSERT(dn->dn_dbuf == NULL ||
+            dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
         zrl_add(&dnh->dnh_zrlock);
         dnode_destroy(dn); /* implicit zrl_remove() */
         zrl_destroy(&dnh->dnh_zrlock);
         dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-        dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-        dnh->dnh_dnode = dn;
+        dnode_t *dn;
+
+        dn = dnode_create(os, dnp, NULL, object, dnh);
         zrl_init(&dnh->dnh_zrlock);
         DNODE_VERIFY(dn);
-        return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_pageout(void *dbu)
 {
-        dnode_children_t *children_dnodes = arg;
+        dnode_children_t *children_dnodes = dbu;
         int i;
-        int epb = db->db_size >> DNODE_SHIFT;
 
-        ASSERT(epb == children_dnodes->dnc_count);
-
-        for (i = 0; i < epb; i++) {
+        for (i = 0; i < children_dnodes->dnc_count; i++) {
                 dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                 dnode_t *dn;
 
                 /*
                  * The dnode handle lock guards against the dnode moving to

@@ -1020,11 +1048,11 @@
                 dnode_destroy(dn); /* implicit zrl_remove() */
                 zrl_destroy(&dnh->dnh_zrlock);
                 dnh->dnh_dnode = NULL;
         }
         kmem_free(children_dnodes, sizeof (dnode_children_t) +
-            epb * sizeof (dnode_handle_t));
+            children_dnodes->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
  * errors:
  * EINVAL - invalid object number.

@@ -1104,20 +1132,21 @@
         ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
         children_dnodes = dmu_buf_get_user(&db->db);
         if (children_dnodes == NULL) {
                 int i;
                 dnode_children_t *winner;
-                children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
+                children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
                     epb * sizeof (dnode_handle_t), KM_SLEEP);
                 children_dnodes->dnc_count = epb;
                 dnh = &children_dnodes->dnc_children[0];
                 for (i = 0; i < epb; i++) {
                         zrl_init(&dnh[i].dnh_zrlock);
-                        dnh[i].dnh_dnode = NULL;
                 }
-                if (winner = dmu_buf_set_user(&db->db, children_dnodes,
-                    dnode_buf_pageout)) {
+                dmu_buf_init_user(&children_dnodes->dnc_dbu,
+                    dnode_buf_pageout, NULL);
+                winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+                if (winner != NULL) {
 
                         for (i = 0; i < epb; i++) {
                                 zrl_destroy(&dnh[i].dnh_zrlock);
                         }
 

@@ -1128,22 +1157,16 @@
         }
         ASSERT(children_dnodes->dnc_count == epb);
 
         dnh = &children_dnodes->dnc_children[idx];
         zrl_add(&dnh->dnh_zrlock);
-        if ((dn = dnh->dnh_dnode) == NULL) {
+        dn = dnh->dnh_dnode;
+        if (dn == NULL) {
                 dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-                dnode_t *winner;
 
                 dn = dnode_create(os, phys, db, object, dnh);
-                winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-                if (winner != NULL) {
-                        zrl_add(&dnh->dnh_zrlock);
-                        dnode_destroy(dn); /* implicit zrl_remove() */
-                        dn = winner;
                 }
-        }
 
         mutex_enter(&dn->dn_mtx);
         type = dn->dn_type;
         if (dn->dn_free_txg ||
             ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||

@@ -1152,14 +1175,14 @@
                 mutex_exit(&dn->dn_mtx);
                 zrl_remove(&dnh->dnh_zrlock);
                 dbuf_rele(db, FTAG);
                 return (type == DMU_OT_NONE ? ENOENT : EEXIST);
         }
-        mutex_exit(&dn->dn_mtx);
-
         if (refcount_add(&dn->dn_holds, tag) == 1)
                 dbuf_add_ref(db, dnh);
+        mutex_exit(&dn->dn_mtx);
+
         /* Now we can rely on the hold to prevent the dnode from moving. */
         zrl_remove(&dnh->dnh_zrlock);
 
         DNODE_VERIFY(dn);
         ASSERT3P(dn->dn_dbuf, ==, db);