Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dnode_sync.c
          +++ new/usr/src/uts/common/fs/zfs/dnode_sync.c
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       25 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/dbuf.h>
  29   30  #include <sys/dnode.h>
  30   31  #include <sys/dmu.h>
  31   32  #include <sys/dmu_tx.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dataset.h>
  34   35  #include <sys/spa.h>
↓ open down ↓ 354 lines elided ↑ open up ↑
 389  390          dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
 390  391          mutex_enter(&dn->dn_mtx);
 391  392  }
 392  393  
 393  394  /*
 394  395   * Try to kick all the dnode's dbufs out of the cache...
 395  396   */
 396  397  void
 397  398  dnode_evict_dbufs(dnode_t *dn)
 398  399  {
 399      -        int progress;
 400      -        int pass = 0;
      400 +        dmu_buf_impl_t db_marker;
      401 +        dmu_buf_impl_t *db, *db_next;
 401  402  
 402      -        do {
 403      -                dmu_buf_impl_t *db, *db_next;
 404      -                int evicting = FALSE;
      403 +        mutex_enter(&dn->dn_dbufs_mtx);
      404 +        for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 405  405  
 406      -                progress = FALSE;
 407      -                mutex_enter(&dn->dn_dbufs_mtx);
 408      -                for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
 409      -                        db_next = AVL_NEXT(&dn->dn_dbufs, db);
 410  406  #ifdef  DEBUG
 411      -                        DB_DNODE_ENTER(db);
 412      -                        ASSERT3P(DB_DNODE(db), ==, dn);
 413      -                        DB_DNODE_EXIT(db);
      407 +                DB_DNODE_ENTER(db);
      408 +                ASSERT3P(DB_DNODE(db), ==, dn);
      409 +                DB_DNODE_EXIT(db);
 414  410  #endif  /* DEBUG */
 415  411  
 416      -                        mutex_enter(&db->db_mtx);
 417      -                        if (db->db_state == DB_EVICTING) {
 418      -                                progress = TRUE;
 419      -                                evicting = TRUE;
 420      -                                mutex_exit(&db->db_mtx);
 421      -                        } else if (refcount_is_zero(&db->db_holds)) {
 422      -                                progress = TRUE;
 423      -                                dbuf_clear(db); /* exits db_mtx for us */
 424      -                        } else {
 425      -                                mutex_exit(&db->db_mtx);
 426      -                        }
      412 +                mutex_enter(&db->db_mtx);
      413 +                if (db->db_state != DB_EVICTING &&
      414 +                    refcount_is_zero(&db->db_holds)) {
      415 +                        db_marker.db_level = db->db_level;
      416 +                        db_marker.db_blkid = db->db_blkid;
      417 +                        db_marker.db_state = DB_SEARCH;
      418 +                        avl_insert_here(&dn->dn_dbufs, &db_marker, db,
      419 +                            AVL_BEFORE);
 427  420  
      421 +                        dbuf_clear(db);
      422 +
      423 +                        db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
      424 +                        avl_remove(&dn->dn_dbufs, &db_marker);
      425 +                } else {
      426 +                        mutex_exit(&db->db_mtx);
      427 +                        db_next = AVL_NEXT(&dn->dn_dbufs, db);
 428  428                  }
 429      -                /*
 430      -                 * NB: we need to drop dn_dbufs_mtx between passes so
 431      -                 * that any DB_EVICTING dbufs can make progress.
 432      -                 * Ideally, we would have some cv we could wait on, but
 433      -                 * since we don't, just wait a bit to give the other
 434      -                 * thread a chance to run.
 435      -                 */
 436      -                mutex_exit(&dn->dn_dbufs_mtx);
 437      -                if (evicting)
 438      -                        delay(1);
 439      -                pass++;
 440      -                ASSERT(pass < 100); /* sanity check */
 441      -        } while (progress);
      429 +        }
      430 +        mutex_exit(&dn->dn_dbufs_mtx);
 442  431  
 443  432          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 444  433          if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 445  434                  mutex_enter(&dn->dn_bonus->db_mtx);
 446  435                  dbuf_evict(dn->dn_bonus);
 447  436                  dn->dn_bonus = NULL;
 448  437          }
 449  438          rw_exit(&dn->dn_struct_rwlock);
 450  439  }
 451  440  
↓ open down ↓ 38 lines elided ↑ open up ↑
 490  479          /*
 491  480           * Our contents should have been freed in dnode_sync() by the
 492  481           * free range record inserted by the caller of dnode_free().
 493  482           */
 494  483          ASSERT0(DN_USED_BYTES(dn->dn_phys));
 495  484          ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 496  485  
 497  486          dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 498  487          dnode_evict_dbufs(dn);
 499  488          ASSERT(avl_is_empty(&dn->dn_dbufs));
 500      -        ASSERT3P(dn->dn_bonus, ==, NULL);
 501  489  
 502  490          /*
 503  491           * XXX - It would be nice to assert this, but we may still
 504  492           * have residual holds from async evictions from the arc...
 505  493           *
 506  494           * zfs_obj_to_path() also depends on this being
 507  495           * commented out.
 508  496           *
 509  497           * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
 510  498           */
↓ open down ↓ 217 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX