Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
*** 21,30 ****
--- 21,31 ----
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/cred.h>
*** 353,363 ****
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
! if (!dsl_dataset_is_snapshot(ds)) {
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os);
}
--- 354,364 ----
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
! if (!ds->ds_is_snapshot) {
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os);
}
*** 415,425 ****
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
! if (ds == NULL || !dsl_dataset_is_snapshot(ds))
os->os_zil_header = os->os_phys->os_zil_header;
os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
--- 416,426 ----
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
! if (ds == NULL || !ds->ds_is_snapshot)
os->os_zil_header = os->os_phys->os_zil_header;
os->os_zil = zil_alloc(os, &os->os_zil_header);
for (i = 0; i < TXG_SIZE; i++) {
list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
*** 434,453 ****
mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
! DMU_META_DNODE(os) = dnode_special_open(os,
! &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
! &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
! DMU_USERUSED_DNODE(os) = dnode_special_open(os,
! &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
! &os->os_userused_dnode);
! DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
! &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
! &os->os_groupused_dnode);
}
*osp = os;
return (0);
}
--- 435,451 ----
mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
! dnode_special_open(os, &os->os_phys->os_meta_dnode,
! DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
! dnode_special_open(os, &os->os_phys->os_userused_dnode,
! DMU_USERUSED_OBJECT, &os->os_userused_dnode);
! dnode_special_open(os, &os->os_phys->os_groupused_dnode,
! DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
}
*osp = os;
return (0);
}
*** 531,541 ****
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
! } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}
--- 529,539 ----
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
! } else if (!readonly && ds->ds_is_snapshot) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}
*** 587,641 ****
}
/*
 * Pre-change side of this hunk.
 *
 * Walk os->os_dnodes and call dnode_evict_dbufs() on every dnode for which
 * dnode_add_ref() succeeds.  The meta dnode is moved to the tail of the
 * list first so that it is visited last.  os_lock is held while the list
 * is traversed and dropped around each dnode_evict_dbufs()/dnode_rele()
 * pair.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
dnode_t *dn;
mutex_enter(&os->os_lock);
!
! /* process the mdn last, since the other dnodes have holds on it */
! list_remove(&os->os_dnodes, DMU_META_DNODE(os));
! list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
!
/*
! * Find the first dnode with holds. We have to do this dance
! * because dnode_add_ref() only works if you already have a
! * hold. If there are no holds then it has no dbufs so OK to
! * skip.
*/
! for (dn = list_head(&os->os_dnodes);
! dn && !dnode_add_ref(dn, FTAG);
! dn = list_next(&os->os_dnodes, dn))
! continue;
!
! while (dn) {
! dnode_t *next_dn = dn;
!
/*
 * Before os_lock is dropped, advance to the next dnode on which
 * dnode_add_ref() succeeds, so that the loop continues from a dnode
 * this function already holds a reference on.
 */
! do {
! next_dn = list_next(&os->os_dnodes, next_dn);
! } while (next_dn && !dnode_add_ref(next_dn, FTAG));
!
/* Evict with os_lock dropped, then release the FTAG hold on dn. */
mutex_exit(&os->os_lock);
dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&os->os_lock);
! dn = next_dn;
}
mutex_exit(&os->os_lock);
}
void
dmu_objset_evict(objset_t *os)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
! if (!dsl_dataset_is_snapshot(ds)) {
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
--- 585,651 ----
}
/*
 * Post-change side of this hunk.
 *
 * Walk os->os_dnodes and call dnode_evict_dbufs() on every dnode for which
 * dnode_add_ref() succeeds, then evict the dbufs of the objset's special
 * dnodes: group-used and user-used (when the user-used dnode is non-NULL),
 * and finally the meta dnode.
 *
 * os_lock is held while the list is traversed and dropped around each
 * dnode_evict_dbufs()/dnode_rele() pair.  A local marker dnode is linked
 * in directly after the current dnode before the lock is dropped, and the
 * walk resumes from the marker's successor once the lock is re-taken.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
/*
 * NOTE(review): dn_marker is a whole dnode_t on the stack and is never
 * initialized here; presumably only its list linkage is touched by the
 * list routines -- confirm, and confirm the stack footprint is acceptable.
 */
+ dnode_t dn_marker;
dnode_t *dn;
mutex_enter(&os->os_lock);
! dn = list_head(&os->os_dnodes);
! while (dn != NULL) {
/*
! * Skip dnodes without holds. We have to do this dance
! * because dnode_add_ref() only works if there is already a
! * hold. If the dnode has no holds, then it has no dbufs.
*/
! if (dnode_add_ref(dn, FTAG)) {
/* Remember our list position before os_lock is dropped. */
! list_insert_after(&os->os_dnodes, dn, &dn_marker);
mutex_exit(&os->os_lock);
+
dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
+
mutex_enter(&os->os_lock);
/* Continue from whatever now follows the marker, then unlink it. */
! dn = list_next(&os->os_dnodes, &dn_marker);
! list_remove(&os->os_dnodes, &dn_marker);
! } else {
! dn = list_next(&os->os_dnodes, dn);
}
+ }
mutex_exit(&os->os_lock);
+
/*
 * The special dnodes are handled after os_lock has been dropped.
 * NOTE(review): only the user-used dnode is NULL-checked before the
 * group-used dnode is passed to dnode_evict_dbufs(); this presumably
 * relies on the two being opened together -- confirm.
 */
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+ }
+ dnode_evict_dbufs(DMU_META_DNODE(os));
}
+ /*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
void
dmu_objset_evict(objset_t *os)
{
dsl_dataset_t *ds = os->os_dsl_dataset;
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
! if (!ds->ds_is_snapshot) {
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
*** 668,688 ****
}
if (os->os_sa)
sa_tear_down(os);
dmu_objset_evict_dbufs(os);
dnode_special_close(&os->os_meta_dnode);
if (DMU_USERUSED_DNODE(os)) {
dnode_special_close(&os->os_userused_dnode);
dnode_special_close(&os->os_groupused_dnode);
}
zil_free(os->os_zil);
- ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
-
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
/*
* This is a barrier to prevent the objset from going away in
* dnode_move() until we can safely ensure that the objset is still in
--- 678,712 ----
}
if (os->os_sa)
sa_tear_down(os);
+ os->os_evicting = B_TRUE;
dmu_objset_evict_dbufs(os);
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+ }
+
+ void
+ dmu_objset_evict_done(objset_t *os)
+ {
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
dnode_special_close(&os->os_meta_dnode);
if (DMU_USERUSED_DNODE(os)) {
dnode_special_close(&os->os_userused_dnode);
dnode_special_close(&os->os_groupused_dnode);
}
zil_free(os->os_zil);
VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
/*
* This is a barrier to prevent the objset from going away in
* dnode_move() until we can safely ensure that the objset is still in
*** 693,702 ****
--- 717,727 ----
rw_exit(&os_lock);
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
+ spa_evicting_os_deregister(os->os_spa, os);
kmem_free(os, sizeof (objset_t));
}
timestruc_t
dmu_objset_snap_cmtime(objset_t *os)
*** 901,911 ****
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EXDEV));
}
/* You can only clone snapshots, not the head datasets. */
! if (!dsl_dataset_is_snapshot(origin)) {
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EINVAL));
}
dsl_dataset_rele(origin, FTAG);
--- 926,936 ----
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EXDEV));
}
/* You can only clone snapshots, not the head datasets. */
! if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EINVAL));
}
dsl_dataset_rele(origin, FTAG);
*** 1465,1475 ****
/*
 * Pre-change side of this hunk.
 *
 * Returns the result of dsl_dataset_is_snapshot() for the objset's
 * os_dsl_dataset, or B_FALSE when os_dsl_dataset is NULL.
 */
int
dmu_objset_is_snapshot(objset_t *os)
{
if (os->os_dsl_dataset != NULL)
! return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
else
return (B_FALSE);
}
int
--- 1490,1500 ----
/*
 * Post-change side of this hunk.
 *
 * Returns the ds_is_snapshot field of the objset's os_dsl_dataset, or
 * B_FALSE when os_dsl_dataset is NULL.
 */
int
dmu_objset_is_snapshot(objset_t *os)
{
if (os->os_dsl_dataset != NULL)
! return (os->os_dsl_dataset->ds_is_snapshot);
else
return (B_FALSE);
}
int