Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
        
*** 19,28 ****
--- 19,29 ----
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
   */
  
  #include <sys/zfs_context.h>
  #include <sys/dbuf.h>
  #include <sys/dnode.h>
*** 400,411 ****
  
  static dnode_t *
  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
      uint64_t object, dnode_handle_t *dnh)
  {
!         dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
  
          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
          dn->dn_moved = 0;
  
          /*
           * Defer setting dn_objset until the dnode is ready to be a candidate
--- 401,413 ----
  
  static dnode_t *
  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
      uint64_t object, dnode_handle_t *dnh)
  {
!         dnode_t *dn;
  
+         dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
          dn->dn_moved = 0;
  
          /*
           * Defer setting dn_objset until the dnode is ready to be a candidate
*** 438,454 ****
          dmu_zfetch_init(&dn->dn_zfetch, dn);
  
          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
  
          mutex_enter(&os->os_lock);
          list_insert_head(&os->os_dnodes, dn);
          membar_producer();
          /*
!          * Everything else must be valid before assigning dn_objset makes the
!          * dnode eligible for dnode_move().
           */
          dn->dn_objset = os;
          mutex_exit(&os->os_lock);
  
          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
          return (dn);
  }
--- 440,474 ----
          dmu_zfetch_init(&dn->dn_zfetch, dn);
  
          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
  
          mutex_enter(&os->os_lock);
+         if (dnh->dnh_dnode != NULL) {
+                 /* Lost the allocation race. */
+                 mutex_exit(&os->os_lock);
+                 kmem_cache_free(dnode_cache, dn);
+                 return (dnh->dnh_dnode);
+         }
+ 
+         /*
+          * Exclude special dnodes from os_dnodes so an empty os_dnodes
+          * signifies that the special dnodes have no references from
+          * their children (the entries in os_dnodes).  This allows
+          * dnode_destroy() to easily determine if the last child has
+          * been removed and then complete eviction of the objset.
+          */
+         if (!DMU_OBJECT_IS_SPECIAL(object))
                  list_insert_head(&os->os_dnodes, dn);
          membar_producer();
+ 
          /*
!          * Everything else must be valid before assigning dn_objset
!          * makes the dnode eligible for dnode_move().
           */
          dn->dn_objset = os;
+ 
+         dnh->dnh_dnode = dn;
          mutex_exit(&os->os_lock);
  
          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
          return (dn);
  }
*** 458,473 ****
--- 478,499 ----
   */
  static void
  dnode_destroy(dnode_t *dn)
  {
          objset_t *os = dn->dn_objset;
+         boolean_t complete_os_eviction = B_FALSE;
  
          ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
  
          mutex_enter(&os->os_lock);
          POINTER_INVALIDATE(&dn->dn_objset);
+         if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
                  list_remove(&os->os_dnodes, dn);
+                 complete_os_eviction =
+                     list_is_empty(&os->os_dnodes) &&
+                     list_link_active(&os->os_evicting_node);
+         }
          mutex_exit(&os->os_lock);
  
          /* the dnode can no longer move, so we can release the handle */
          zrl_remove(&dn->dn_handle->dnh_zrlock);
  
*** 498,507 ****
--- 524,536 ----
          dn->dn_unlisted_l0_blkid = 0;
  
          dmu_zfetch_rele(&dn->dn_zfetch);
          kmem_cache_free(dnode_cache, dn);
          arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+ 
+         if (complete_os_eviction)
+                 dmu_objset_evict_done(os);
  }
  
  void
  dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
*** 964,1000 ****
           * has a hold on this dnode while we are trying to evict this
           * dnode.
           */
          while (refcount_count(&dn->dn_holds) > 0)
                  delay(1);
          zrl_add(&dnh->dnh_zrlock);
          dnode_destroy(dn); /* implicit zrl_remove() */
          zrl_destroy(&dnh->dnh_zrlock);
          dnh->dnh_dnode = NULL;
  }
  
! dnode_t *
  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
      dnode_handle_t *dnh)
  {
!         dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
!         dnh->dnh_dnode = dn;
          zrl_init(&dnh->dnh_zrlock);
          DNODE_VERIFY(dn);
-         return (dn);
  }
  
  static void
! dnode_buf_pageout(dmu_buf_t *db, void *arg)
  {
!         dnode_children_t *children_dnodes = arg;
          int i;
-         int epb = db->db_size >> DNODE_SHIFT;
  
!         ASSERT(epb == children_dnodes->dnc_count);
! 
!         for (i = 0; i < epb; i++) {
                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                  dnode_t *dn;
  
                  /*
                   * The dnode handle lock guards against the dnode moving to
--- 993,1028 ----
           * has a hold on this dnode while we are trying to evict this
           * dnode.
           */
          while (refcount_count(&dn->dn_holds) > 0)
                  delay(1);
+         ASSERT(dn->dn_dbuf == NULL ||
+             dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
          zrl_add(&dnh->dnh_zrlock);
          dnode_destroy(dn); /* implicit zrl_remove() */
          zrl_destroy(&dnh->dnh_zrlock);
          dnh->dnh_dnode = NULL;
  }
  
! void
  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
      dnode_handle_t *dnh)
  {
!         dnode_t *dn;
! 
!         dn = dnode_create(os, dnp, NULL, object, dnh);
          zrl_init(&dnh->dnh_zrlock);
          DNODE_VERIFY(dn);
  }
  
  static void
! dnode_buf_pageout(void *dbu)
  {
!         dnode_children_t *children_dnodes = dbu;
          int i;
  
!         for (i = 0; i < children_dnodes->dnc_count; i++) {
                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                  dnode_t *dn;
  
                  /*
                   * The dnode handle lock guards against the dnode moving to
*** 1020,1030 ****
                  dnode_destroy(dn); /* implicit zrl_remove() */
                  zrl_destroy(&dnh->dnh_zrlock);
                  dnh->dnh_dnode = NULL;
          }
          kmem_free(children_dnodes, sizeof (dnode_children_t) +
!             epb * sizeof (dnode_handle_t));
  }
  
  /*
   * errors:
   * EINVAL - invalid object number.
--- 1048,1058 ----
                  dnode_destroy(dn); /* implicit zrl_remove() */
                  zrl_destroy(&dnh->dnh_zrlock);
                  dnh->dnh_dnode = NULL;
          }
          kmem_free(children_dnodes, sizeof (dnode_children_t) +
!             children_dnodes->dnc_count * sizeof (dnode_handle_t));
  }
  
  /*
   * errors:
   * EINVAL - invalid object number.
*** 1104,1123 ****
          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
          children_dnodes = dmu_buf_get_user(&db->db);
          if (children_dnodes == NULL) {
                  int i;
                  dnode_children_t *winner;
!                 children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
                      epb * sizeof (dnode_handle_t), KM_SLEEP);
                  children_dnodes->dnc_count = epb;
                  dnh = &children_dnodes->dnc_children[0];
                  for (i = 0; i < epb; i++) {
                          zrl_init(&dnh[i].dnh_zrlock);
-                         dnh[i].dnh_dnode = NULL;
                  }
!                 if (winner = dmu_buf_set_user(&db->db, children_dnodes,
!                     dnode_buf_pageout)) {
  
                          for (i = 0; i < epb; i++) {
                                  zrl_destroy(&dnh[i].dnh_zrlock);
                          }
  
--- 1132,1152 ----
          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
          children_dnodes = dmu_buf_get_user(&db->db);
          if (children_dnodes == NULL) {
                  int i;
                  dnode_children_t *winner;
!                 children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
                      epb * sizeof (dnode_handle_t), KM_SLEEP);
                  children_dnodes->dnc_count = epb;
                  dnh = &children_dnodes->dnc_children[0];
                  for (i = 0; i < epb; i++) {
                          zrl_init(&dnh[i].dnh_zrlock);
                  }
!                 dmu_buf_init_user(&children_dnodes->dnc_dbu,
!                     dnode_buf_pageout, NULL);
!                 winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
!                 if (winner != NULL) {
  
                          for (i = 0; i < epb; i++) {
                                  zrl_destroy(&dnh[i].dnh_zrlock);
                          }
  
*** 1128,1149 ****
          }
          ASSERT(children_dnodes->dnc_count == epb);
  
          dnh = &children_dnodes->dnc_children[idx];
          zrl_add(&dnh->dnh_zrlock);
!         if ((dn = dnh->dnh_dnode) == NULL) {
                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-                 dnode_t *winner;
  
                  dn = dnode_create(os, phys, db, object, dnh);
-                 winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-                 if (winner != NULL) {
-                         zrl_add(&dnh->dnh_zrlock);
-                         dnode_destroy(dn); /* implicit zrl_remove() */
-                         dn = winner;
                  }
-         }
  
          mutex_enter(&dn->dn_mtx);
          type = dn->dn_type;
          if (dn->dn_free_txg ||
              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
--- 1157,1172 ----
          }
          ASSERT(children_dnodes->dnc_count == epb);
  
          dnh = &children_dnodes->dnc_children[idx];
          zrl_add(&dnh->dnh_zrlock);
!         dn = dnh->dnh_dnode;
!         if (dn == NULL) {
                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
  
                  dn = dnode_create(os, phys, db, object, dnh);
          }
  
          mutex_enter(&dn->dn_mtx);
          type = dn->dn_type;
          if (dn->dn_free_txg ||
              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
*** 1152,1165 ****
                  mutex_exit(&dn->dn_mtx);
                  zrl_remove(&dnh->dnh_zrlock);
                  dbuf_rele(db, FTAG);
                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
          }
-         mutex_exit(&dn->dn_mtx);
- 
          if (refcount_add(&dn->dn_holds, tag) == 1)
                  dbuf_add_ref(db, dnh);
          /* Now we can rely on the hold to prevent the dnode from moving. */
          zrl_remove(&dnh->dnh_zrlock);
  
          DNODE_VERIFY(dn);
          ASSERT3P(dn->dn_dbuf, ==, db);
--- 1175,1188 ----
                  mutex_exit(&dn->dn_mtx);
                  zrl_remove(&dnh->dnh_zrlock);
                  dbuf_rele(db, FTAG);
                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
          }
          if (refcount_add(&dn->dn_holds, tag) == 1)
                  dbuf_add_ref(db, dnh);
+         mutex_exit(&dn->dn_mtx);
+ 
          /* Now we can rely on the hold to prevent the dnode from moving. */
          zrl_remove(&dnh->dnh_zrlock);
  
          DNODE_VERIFY(dn);
          ASSERT3P(dn->dn_dbuf, ==, db);