Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

@@ -22,10 +22,11 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>

@@ -52,14 +53,20 @@
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
+#ifndef __lint
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+    dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
+#endif /* ! __lint */
+
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_cache;
+static taskq_t *dbu_evict_taskq;
 
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {

@@ -213,21 +220,76 @@
         atomic_dec_64(&dbuf_hash_count);
 }
 
 static arc_evict_func_t dbuf_do_evict;
 
+typedef enum {
+        DBVU_EVICTING,
+        DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
 static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+        int64_t holds;
+
+        if (db->db_user == NULL)
+                return;
+
+        /* Only data blocks support the attachment of user data. */
+        ASSERT(db->db_level == 0);
+
+        /* Clients must resolve a dbuf before attaching user data. */
+        ASSERT(db->db.db_data != NULL);
+        ASSERT3U(db->db_state, ==, DB_CACHED);
+
+        holds = refcount_count(&db->db_holds);
+        if (verify_type == DBVU_EVICTING) {
+                /*
+                 * Immediate eviction occurs when holds == dirtycnt.
+                 * For normal eviction buffers, holds is zero on
+                 * eviction, except when dbuf_fix_old_data() calls
+                 * dbuf_clear_data().  However, the hold count can grow
+                 * during eviction even though db_mtx is held (see
+                 * dmu_bonus_hold() for an example), so we can only
+                 * test the generic invariant that holds >= dirtycnt.
+                 */
+                ASSERT3U(holds, >=, db->db_dirtycnt);
+        } else {
+                if (db->db_immediate_evict == TRUE)
+                        ASSERT3U(holds, >=, db->db_dirtycnt);
+                else
+                        ASSERT3U(holds, >, 0);
+        }
+#endif
+}
+
+static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
+        dmu_buf_user_t *dbu = db->db_user;
+
         ASSERT(MUTEX_HELD(&db->db_mtx));
 
-        if (db->db_level != 0 || db->db_evict_func == NULL)
+        if (dbu == NULL)
                 return;
 
-        db->db_evict_func(&db->db, db->db_user_ptr);
-        db->db_user_ptr = NULL;
-        db->db_evict_func = NULL;
+        dbuf_verify_user(db, DBVU_EVICTING);
+        db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+        if (dbu->dbu_clear_on_evict_dbufp != NULL)
+                *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+        /*
+         * Invoke the callback from a taskq to avoid lock order reversals
+         * and limit stack depth.
+         */
+        taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
+            &dbu->dbu_tqent);
 }
 
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {

@@ -284,10 +346,16 @@
             sizeof (dmu_buf_impl_t),
             0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
         for (i = 0; i < DBUF_MUTEXES; i++)
                 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+        /*
+         * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+         * configuration is not required.
+         */
+        dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
 }
 
 void
 dbuf_fini(void)
 {

@@ -296,10 +364,11 @@
 
         for (i = 0; i < DBUF_MUTEXES; i++)
                 mutex_destroy(&h->hash_mutexes[i]);
         kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
         kmem_cache_destroy(dbuf_cache);
+        taskq_destroy(dbu_evict_taskq);
 }
 
 /*
  * Other stuff.
  */

@@ -413,25 +482,31 @@
         DB_DNODE_EXIT(db);
 }
 #endif
 
 static void
+dbuf_clear_data(dmu_buf_impl_t *db)
+{
+        ASSERT(MUTEX_HELD(&db->db_mtx));
+        dbuf_evict_user(db);
+        db->db_buf = NULL;
+        db->db.db_data = NULL;
+        if (db->db_state != DB_NOFILL)
+                db->db_state = DB_UNCACHED;
+}
+
+static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
         ASSERT(MUTEX_HELD(&db->db_mtx));
+        ASSERT(buf != NULL);
+
         db->db_buf = buf;
-        if (buf != NULL) {
                 ASSERT(buf->b_data != NULL);
                 db->db.db_data = buf->b_data;
                 if (!arc_released(buf))
                         arc_set_callback(buf, dbuf_do_evict, db);
-        } else {
-                dbuf_evict_user(db);
-                db->db.db_data = NULL;
-                if (db->db_state != DB_NOFILL)
-                        db->db_state = DB_UNCACHED;
-        }
 }
 
 /*
  * Loan out an arc_buf for read.  Return the loaned arc_buf.
  */

@@ -449,11 +524,11 @@
                 abuf = arc_loan_buf(spa, blksz);
                 bcopy(db->db.db_data, abuf->b_data, blksz);
         } else {
                 abuf = db->db_buf;
                 arc_loan_inuse_buf(abuf, db);
-                dbuf_set_data(db, NULL);
+                dbuf_clear_data(db);
                 mutex_exit(&db->db_mtx);
         }
         return (abuf);
 }
 

@@ -685,11 +760,11 @@
                 ASSERT(db->db_buf == NULL);
                 ASSERT(db->db.db_data == NULL);
                 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
                 db->db_state = DB_FILL;
         } else if (db->db_state == DB_NOFILL) {
-                dbuf_set_data(db, NULL);
+                dbuf_clear_data(db);
         } else {
                 ASSERT3U(db->db_state, ==, DB_CACHED);
         }
         mutex_exit(&db->db_mtx);
 }

@@ -741,11 +816,11 @@
                 spa_t *spa = db->db_objset->os_spa;
 
                 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
                 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
         } else {
-                dbuf_set_data(db, NULL);
+                dbuf_clear_data(db);
         }
 }
 
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)

@@ -792,11 +867,12 @@
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
-        dmu_buf_impl_t *db, *db_next, db_search;
+        dmu_buf_impl_t db_search;
+        dmu_buf_impl_t *db, *db_next;
         uint64_t txg = tx->tx_txg;
         avl_index_t where;
 
         if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
                 end_blkid = dn->dn_maxblkid;

@@ -1368,11 +1444,11 @@
 
         if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
                 arc_buf_t *buf = db->db_buf;
 
                 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
-                dbuf_set_data(db, NULL);
+                dbuf_clear_data(db);
                 VERIFY(arc_buf_remove_ref(buf, db));
                 dbuf_evict(db);
                 return (B_TRUE);
         }
 

@@ -1708,12 +1784,11 @@
         db->db_dirtycnt = 0;
         db->db_dnode_handle = dn->dn_handle;
         db->db_parent = parent;
         db->db_blkptr = blkptr;
 
-        db->db_user_ptr = NULL;
-        db->db_evict_func = NULL;
+        db->db_user = NULL;
         db->db_immediate_evict = 0;
         db->db_freed_in_flight = 0;
 
         if (blkid == DMU_BONUS_BLKID) {
                 ASSERT3P(parent, ==, dn->dn_dbuf);

@@ -2112,11 +2187,11 @@
                 } else if (arc_released(db->db_buf)) {
                         arc_buf_t *buf = db->db_buf;
                         /*
                          * This dbuf has anonymous data associated with it.
                          */
-                        dbuf_set_data(db, NULL);
+                        dbuf_clear_data(db);
                         VERIFY(arc_buf_remove_ref(buf, db));
                         dbuf_evict(db);
                 } else {
                         VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 

@@ -2145,11 +2220,12 @@
                                         dbuf_clear(db);
                                         arc_freed(spa, &bp);
                                 } else {
                                         dbuf_clear(db);
                                 }
-                        } else if (arc_buf_eviction_needed(db->db_buf)) {
+                        } else if (db->db_objset->os_evicting ||
+                            arc_buf_eviction_needed(db->db_buf)) {
                                 dbuf_clear(db);
                         } else {
                                 mutex_exit(&db->db_mtx);
                         }
                 }

@@ -2164,57 +2240,63 @@
 {
         return (refcount_count(&db->db_holds));
 }
 
 void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+    dmu_buf_user_t *new_user)
 {
-        return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
+        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+        mutex_enter(&db->db_mtx);
+        dbuf_verify_user(db, DBVU_NOT_EVICTING);
+        if (db->db_user == old_user)
+                db->db_user = new_user;
+        else
+                old_user = db->db_user;
+        dbuf_verify_user(db, DBVU_NOT_EVICTING);
+        mutex_exit(&db->db_mtx);
+
+        return (old_user);
 }
 
 void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
-        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
-        db->db_immediate_evict = TRUE;
-        return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
+        return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-        ASSERT(db->db_level == 0);
 
-        ASSERT((user_ptr == NULL) == (evict_func == NULL));
+        db->db_immediate_evict = TRUE;
+        return (dmu_buf_set_user(db_fake, user));
+}
 
-        mutex_enter(&db->db_mtx);
-
-        if (db->db_user_ptr == old_user_ptr) {
-                db->db_user_ptr = user_ptr;
-                db->db_evict_func = evict_func;
-        } else {
-                old_user_ptr = db->db_user_ptr;
-        }
-
-        mutex_exit(&db->db_mtx);
-        return (old_user_ptr);
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+        return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-        ASSERT(!refcount_is_zero(&db->db_holds));
 
-        return (db->db_user_ptr);
+        dbuf_verify_user(db, DBVU_NOT_EVICTING);
+        return (db->db_user);
 }
 
+void
+dmu_buf_user_evict_wait()
+{
+        taskq_wait(dbu_evict_taskq);
+}
+
 boolean_t
 dmu_buf_freeable(dmu_buf_t *dbuf)
 {
         boolean_t res = B_FALSE;
         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;