illumos webrev diff (old revision vs. new revision of zfs dnode_sync.c)
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.

  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dbuf.h>
  29 #include <sys/dnode.h>
  30 #include <sys/dmu.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dsl_dataset.h>
  34 #include <sys/spa.h>
  35 #include <sys/range_tree.h>
  36 #include <sys/zfeature.h>
  37 
  38 static void
  39 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
  40 {
  41         dmu_buf_impl_t *db;
  42         int txgoff = tx->tx_txg & TXG_MASK;
  43         int nblkptr = dn->dn_phys->dn_nblkptr;
  44         int old_toplvl = dn->dn_phys->dn_nlevels - 1;


 379         dmu_tx_t *dsfra_tx;
 380 } dnode_sync_free_range_arg_t;
 381 
/*
 * Range-tree walk callback: free one block range of the dnode as part
 * of syncing it out.  arg is a dnode_sync_free_range_arg_t carrying the
 * dnode and the transaction (dsfra_tx) to charge the frees to.
 *
 * dn_mtx is dropped around dnode_sync_free_range_impl() and re-acquired
 * before returning.  NOTE(review): this implies the caller iterates the
 * range tree with dn_mtx held and that the impl must not be called with
 * it held — confirm the locking contract at the range_tree walk site.
 */
 382 static void
 383 dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
 384 {
 385         dnode_sync_free_range_arg_t *dsfra = arg;
 386         dnode_t *dn = dsfra->dsfra_dnode;
 387 
 388         mutex_exit(&dn->dn_mtx);
 389         dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
 390         mutex_enter(&dn->dn_mtx);
 391 }
 392 
 393 /*
 394  * Try to kick all the dnode's dbufs out of the cache...
 395  */
/*
 * OLD revision: evict as many of the dnode's dbufs as possible.
 *
 * Iterates the dn_dbufs AVL tree under dn_dbufs_mtx, clearing every
 * dbuf with a zero hold count.  A dbuf found in DB_EVICTING (being
 * torn down by another thread) cannot be cleared here, so the whole
 * tree is re-scanned in another pass; dn_dbufs_mtx is dropped and a
 * tick is slept between passes to let that other thread finish.
 * Finally the bonus buffer, if unheld, is evicted under a write hold
 * on dn_struct_rwlock.
 */
 396 void
 397 dnode_evict_dbufs(dnode_t *dn)
 398 {
 399         int progress;
 400         int pass = 0;
 401 
 402         do {
 403                 dmu_buf_impl_t *db, *db_next;
 404                 int evicting = FALSE;
 405 
 406                 progress = FALSE;
 407                 mutex_enter(&dn->dn_dbufs_mtx);
                /*
                 * The successor is captured before touching db; if
                 * dbuf_clear() removes db from the tree, db_next is
                 * still valid only as long as dn_dbufs_mtx stays held
                 * for the whole pass.
                 */
 408                 for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 409                         db_next = AVL_NEXT(&dn->dn_dbufs, db);
 410 #ifdef  DEBUG
 411                         DB_DNODE_ENTER(db);
 412                         ASSERT3P(DB_DNODE(db), ==, dn);
 413                         DB_DNODE_EXIT(db);
 414 #endif  /* DEBUG */
 415 
 416                         mutex_enter(&db->db_mtx);
                        /*
                         * Another thread is mid-eviction on this dbuf;
                         * we can't clear it ourselves, but its eventual
                         * disappearance counts as progress.
                         */
 417                         if (db->db_state == DB_EVICTING) {
 418                                 progress = TRUE;
 419                                 evicting = TRUE;
 420                                 mutex_exit(&db->db_mtx);
 421                         } else if (refcount_is_zero(&db->db_holds)) {
 422                                 progress = TRUE;
 423                                 dbuf_clear(db); /* exits db_mtx for us */




 424                         } else {
 425                                 mutex_exit(&db->db_mtx);

 426                         }
 427 
 428                 }
 429                 /*
 430                  * NB: we need to drop dn_dbufs_mtx between passes so
 431                  * that any DB_EVICTING dbufs can make progress.
 432                  * Ideally, we would have some cv we could wait on, but
 433                  * since we don't, just wait a bit to give the other
 434                  * thread a chance to run.
 435                  */
 436                 mutex_exit(&dn->dn_dbufs_mtx);
 437                 if (evicting)
 438                         delay(1);
 439                 pass++;
                /* Unbounded in principle; trip a debug assert if we spin. */
 440                 ASSERT(pass < 100); /* sanity check */
 441         } while (progress);
 442 
        /* Evict the bonus buffer too, if nothing holds it. */
 443         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 444         if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 445                 mutex_enter(&dn->dn_bonus->db_mtx);
 446                 dbuf_evict(dn->dn_bonus);
 447                 dn->dn_bonus = NULL;
 448         }
 449         rw_exit(&dn->dn_struct_rwlock);
 450 }
 451 
 452 static void
 453 dnode_undirty_dbufs(list_t *list)
 454 {
 455         dbuf_dirty_record_t *dr;
 456 
 457         while (dr = list_head(list)) {
 458                 dmu_buf_impl_t *db = dr->dr_dbuf;
 459                 uint64_t txg = dr->dr_txg;
 460 
 461                 if (db->db_level != 0)


 480         }
 481 }
 482 
 483 static void
 484 dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 485 {
 486         int txgoff = tx->tx_txg & TXG_MASK;
 487 
 488         ASSERT(dmu_tx_is_syncing(tx));
 489 
 490         /*
 491          * Our contents should have been freed in dnode_sync() by the
 492          * free range record inserted by the caller of dnode_free().
 493          */
 494         ASSERT0(DN_USED_BYTES(dn->dn_phys));
 495         ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 496 
 497         dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 498         dnode_evict_dbufs(dn);
 499         ASSERT(avl_is_empty(&dn->dn_dbufs));
 500         ASSERT3P(dn->dn_bonus, ==, NULL);
 501 
 502         /*
 503          * XXX - It would be nice to assert this, but we may still
 504          * have residual holds from async evictions from the arc...
 505          *
 506          * zfs_obj_to_path() also depends on this being
 507          * commented out.
 508          *
 509          * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
 510          */
 511 
 512         /* Undirty next bits */
 513         dn->dn_next_nlevels[txgoff] = 0;
 514         dn->dn_next_indblkshift[txgoff] = 0;
 515         dn->dn_next_blksz[txgoff] = 0;
 516 
 517         /* ASSERT(blkptrs are zero); */
 518         ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 519         ASSERT(dn->dn_type != DMU_OT_NONE);
 520 




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26  */
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/dbuf.h>
  30 #include <sys/dnode.h>
  31 #include <sys/dmu.h>
  32 #include <sys/dmu_tx.h>
  33 #include <sys/dmu_objset.h>
  34 #include <sys/dsl_dataset.h>
  35 #include <sys/spa.h>
  36 #include <sys/range_tree.h>
  37 #include <sys/zfeature.h>
  38 
  39 static void
  40 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
  41 {
  42         dmu_buf_impl_t *db;
  43         int txgoff = tx->tx_txg & TXG_MASK;
  44         int nblkptr = dn->dn_phys->dn_nblkptr;
  45         int old_toplvl = dn->dn_phys->dn_nlevels - 1;


 380         dmu_tx_t *dsfra_tx;
 381 } dnode_sync_free_range_arg_t;
 382 
/*
 * Range-tree walk callback: free one block range of the dnode as part
 * of syncing it out.  arg is a dnode_sync_free_range_arg_t carrying the
 * dnode and the transaction (dsfra_tx) to charge the frees to.
 *
 * dn_mtx is dropped around dnode_sync_free_range_impl() and re-acquired
 * before returning.  NOTE(review): this implies the caller iterates the
 * range tree with dn_mtx held and that the impl must not be called with
 * it held — confirm the locking contract at the range_tree walk site.
 */
 383 static void
 384 dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
 385 {
 386         dnode_sync_free_range_arg_t *dsfra = arg;
 387         dnode_t *dn = dsfra->dsfra_dnode;
 388 
 389         mutex_exit(&dn->dn_mtx);
 390         dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
 391         mutex_enter(&dn->dn_mtx);
 392 }
 393 
 394 /*
 395  * Try to kick all the dnode's dbufs out of the cache...
 396  */
/*
 * NEW revision (fix for 5056): evict the dnode's unheld dbufs in a
 * single pass over the dn_dbufs AVL tree.
 *
 * Instead of the old multi-pass retry/delay loop, a stack-allocated
 * marker node (db_state == DB_SEARCH) is inserted immediately before
 * each dbuf we clear.  dbuf_clear() exits db_mtx, and NOTE(review):
 * presumably may also drop and re-enter dn_dbufs_mtx — which would
 * invalidate any successor pointer captured beforehand; the marker
 * keeps our place so the successor is looked up afterwards.  Confirm
 * dbuf_clear()'s locking in dbuf.c.  Dbufs that are DB_EVICTING or
 * still held are simply skipped.  Finally the bonus buffer, if unheld,
 * is evicted under a write hold on dn_struct_rwlock.
 */
 397 void
 398 dnode_evict_dbufs(dnode_t *dn)
 399 {
 400         dmu_buf_impl_t db_marker;



 401         dmu_buf_impl_t *db, *db_next;

 402 

 403         mutex_enter(&dn->dn_dbufs_mtx);
 404         for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 405 
 406 #ifdef  DEBUG
 407                 DB_DNODE_ENTER(db);
 408                 ASSERT3P(DB_DNODE(db), ==, dn);
 409                 DB_DNODE_EXIT(db);
 410 #endif  /* DEBUG */
 411 
 412                 mutex_enter(&db->db_mtx);
 413                 if (db->db_state != DB_EVICTING &&
 414                     refcount_is_zero(&db->db_holds)) {
                        /*
                         * Place the marker at db's position (same
                         * level/blkid keys, DB_SEARCH state) so we can
                         * find our spot again after dbuf_clear().
                         */
 415                         db_marker.db_level = db->db_level;
 416                         db_marker.db_blkid = db->db_blkid;
 417                         db_marker.db_state = DB_SEARCH;
 418                         avl_insert_here(&dn->dn_dbufs, &db_marker, db,
 419                             AVL_BEFORE);
 420 
 421                         dbuf_clear(db);
 422 
                        /* Resume iteration from the marker, then drop it. */
 423                         db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
 424                         avl_remove(&dn->dn_dbufs, &db_marker);
 425                 } else {
                        /* Held or already being evicted: leave it alone. */
 426                         mutex_exit(&db->db_mtx);
 427                         db_next = AVL_NEXT(&dn->dn_dbufs, db);
 428                 }

 429         }







 430         mutex_exit(&dn->dn_dbufs_mtx);





 431 
        /* Evict the bonus buffer too, if nothing holds it. */
 432         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 433         if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 434                 mutex_enter(&dn->dn_bonus->db_mtx);
 435                 dbuf_evict(dn->dn_bonus);
 436                 dn->dn_bonus = NULL;
 437         }
 438         rw_exit(&dn->dn_struct_rwlock);
 439 }
 440 
 441 static void
 442 dnode_undirty_dbufs(list_t *list)
 443 {
 444         dbuf_dirty_record_t *dr;
 445 
 446         while (dr = list_head(list)) {
 447                 dmu_buf_impl_t *db = dr->dr_dbuf;
 448                 uint64_t txg = dr->dr_txg;
 449 
 450                 if (db->db_level != 0)


 469         }
 470 }
 471 
 472 static void
 473 dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 474 {
 475         int txgoff = tx->tx_txg & TXG_MASK;
 476 
 477         ASSERT(dmu_tx_is_syncing(tx));
 478 
 479         /*
 480          * Our contents should have been freed in dnode_sync() by the
 481          * free range record inserted by the caller of dnode_free().
 482          */
 483         ASSERT0(DN_USED_BYTES(dn->dn_phys));
 484         ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 485 
 486         dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 487         dnode_evict_dbufs(dn);
 488         ASSERT(avl_is_empty(&dn->dn_dbufs));

 489 
 490         /*
 491          * XXX - It would be nice to assert this, but we may still
 492          * have residual holds from async evictions from the arc...
 493          *
 494          * zfs_obj_to_path() also depends on this being
 495          * commented out.
 496          *
 497          * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
 498          */
 499 
 500         /* Undirty next bits */
 501         dn->dn_next_nlevels[txgoff] = 0;
 502         dn->dn_next_indblkshift[txgoff] = 0;
 503         dn->dn_next_blksz[txgoff] = 0;
 504 
 505         /* ASSERT(blkptrs are zero); */
 506         ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 507         ASSERT(dn->dn_type != DMU_OT_NONE);
 508