Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

@@ -21,10 +21,11 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>

@@ -353,11 +354,11 @@
                 if (err == 0) {
                         err = dsl_prop_register(ds,
                             zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
                             secondary_cache_changed_cb, os);
                 }
-                if (!dsl_dataset_is_snapshot(ds)) {
+                if (!ds->ds_is_snapshot) {
                         if (err == 0) {
                                 err = dsl_prop_register(ds,
                                     zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                                     checksum_changed_cb, os);
                         }

@@ -415,11 +416,11 @@
                 os->os_sync = ZFS_SYNC_STANDARD;
                 os->os_primary_cache = ZFS_CACHE_ALL;
                 os->os_secondary_cache = ZFS_CACHE_ALL;
         }
 
-        if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+        if (ds == NULL || !ds->ds_is_snapshot)
                 os->os_zil_header = os->os_phys->os_zil_header;
         os->os_zil = zil_alloc(os, &os->os_zil_header);
 
         for (i = 0; i < TXG_SIZE; i++) {
                 list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),

@@ -434,20 +435,17 @@
 
         mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
         mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-        DMU_META_DNODE(os) = dnode_special_open(os,
-            &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
-            &os->os_meta_dnode);
+        dnode_special_open(os, &os->os_phys->os_meta_dnode,
+            DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
         if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-                DMU_USERUSED_DNODE(os) = dnode_special_open(os,
-                    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
-                    &os->os_userused_dnode);
-                DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
-                    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
-                    &os->os_groupused_dnode);
+                dnode_special_open(os, &os->os_phys->os_userused_dnode,
+                    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+                dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+                    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
         }
 
         *osp = os;
         return (0);
 }

@@ -531,11 +529,11 @@
         if (err != 0) {
                 dsl_dataset_disown(ds, tag);
         } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
                 dsl_dataset_disown(ds, tag);
                 return (SET_ERROR(EINVAL));
-        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+        } else if (!readonly && ds->ds_is_snapshot) {
                 dsl_dataset_disown(ds, tag);
                 return (SET_ERROR(EROFS));
         }
         return (err);
 }

@@ -587,55 +585,67 @@
 }
 
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
+        dnode_t dn_marker;
         dnode_t *dn;
 
         mutex_enter(&os->os_lock);
-
-        /* process the mdn last, since the other dnodes have holds on it */
-        list_remove(&os->os_dnodes, DMU_META_DNODE(os));
-        list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
-
+        dn = list_head(&os->os_dnodes);
+        while (dn != NULL) {
         /*
-         * Find the first dnode with holds.  We have to do this dance
-         * because dnode_add_ref() only works if you already have a
-         * hold.  If there are no holds then it has no dbufs so OK to
-         * skip.
+                 * Skip dnodes without holds.  We have to do this dance
+                 * because dnode_add_ref() only works if there is already a
+                 * hold.  If the dnode has no holds, then it has no dbufs.
          */
-        for (dn = list_head(&os->os_dnodes);
-            dn && !dnode_add_ref(dn, FTAG);
-            dn = list_next(&os->os_dnodes, dn))
-                continue;
-
-        while (dn) {
-                dnode_t *next_dn = dn;
-
-                do {
-                        next_dn = list_next(&os->os_dnodes, next_dn);
-                } while (next_dn && !dnode_add_ref(next_dn, FTAG));
-
+                if (dnode_add_ref(dn, FTAG)) {
+                        list_insert_after(&os->os_dnodes, dn, &dn_marker);
                 mutex_exit(&os->os_lock);
+
                 dnode_evict_dbufs(dn);
                 dnode_rele(dn, FTAG);
+
                 mutex_enter(&os->os_lock);
-                dn = next_dn;
+                        dn = list_next(&os->os_dnodes, &dn_marker);
+                        list_remove(&os->os_dnodes, &dn_marker);
+                } else {
+                        dn = list_next(&os->os_dnodes, dn);
         }
+        }
         mutex_exit(&os->os_lock);
+
+        if (DMU_USERUSED_DNODE(os) != NULL) {
+                dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+                dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+        }
+        dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction.  Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ *       dnode_buf_pageout()), it is possible for the meta dnode for the
+ *       objset to have no holds even though os->os_dnodes is not empty.
+ */
 void
 dmu_objset_evict(objset_t *os)
 {
         dsl_dataset_t *ds = os->os_dsl_dataset;
 
         for (int t = 0; t < TXG_SIZE; t++)
                 ASSERT(!dmu_objset_is_dirty(os, t));
 
         if (ds) {
-                if (!dsl_dataset_is_snapshot(ds)) {
+                if (!ds->ds_is_snapshot) {
                         VERIFY0(dsl_prop_unregister(ds,
                             zfs_prop_to_name(ZFS_PROP_CHECKSUM),
                             checksum_changed_cb, os));
                         VERIFY0(dsl_prop_unregister(ds,
                             zfs_prop_to_name(ZFS_PROP_COMPRESSION),

@@ -668,21 +678,35 @@
         }
 
         if (os->os_sa)
                 sa_tear_down(os);
 
+        os->os_evicting = B_TRUE;
         dmu_objset_evict_dbufs(os);
 
+        mutex_enter(&os->os_lock);
+        spa_evicting_os_register(os->os_spa, os);
+        if (list_is_empty(&os->os_dnodes)) {
+                mutex_exit(&os->os_lock);
+                dmu_objset_evict_done(os);
+        } else {
+                mutex_exit(&os->os_lock);
+        }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
         dnode_special_close(&os->os_meta_dnode);
         if (DMU_USERUSED_DNODE(os)) {
                 dnode_special_close(&os->os_userused_dnode);
                 dnode_special_close(&os->os_groupused_dnode);
         }
         zil_free(os->os_zil);
 
-        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
-
         VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 
         /*
          * This is a barrier to prevent the objset from going away in
          * dnode_move() until we can safely ensure that the objset is still in

@@ -693,10 +717,11 @@
         rw_exit(&os_lock);
 
         mutex_destroy(&os->os_lock);
         mutex_destroy(&os->os_obj_lock);
         mutex_destroy(&os->os_user_ptr_lock);
+        spa_evicting_os_deregister(os->os_spa, os);
         kmem_free(os, sizeof (objset_t));
 }
 
 timestruc_t
 dmu_objset_snap_cmtime(objset_t *os)

@@ -901,11 +926,11 @@
                 dsl_dataset_rele(origin, FTAG);
                 return (SET_ERROR(EXDEV));
         }
 
         /* You can only clone snapshots, not the head datasets. */
-        if (!dsl_dataset_is_snapshot(origin)) {
+        if (!origin->ds_is_snapshot) {
                 dsl_dataset_rele(origin, FTAG);
                 return (SET_ERROR(EINVAL));
         }
         dsl_dataset_rele(origin, FTAG);
 

@@ -1465,11 +1490,11 @@
 
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
         if (os->os_dsl_dataset != NULL)
-                return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+                return (os->os_dsl_dataset->ds_is_snapshot);
         else
                 return (B_FALSE);
 }
 
 int