Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

*** 21,30 **** --- 21,31 ---- /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include <sys/cred.h>
*** 353,363 **** if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } ! if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } --- 354,364 ---- if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } ! if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); }
*** 415,425 **** os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } ! if (ds == NULL || !dsl_dataset_is_snapshot(ds)) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), --- 416,426 ---- os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } ! if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
*** 434,453 **** mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); ! DMU_META_DNODE(os) = dnode_special_open(os, ! &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, ! &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { ! DMU_USERUSED_DNODE(os) = dnode_special_open(os, ! &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, ! &os->os_userused_dnode); ! DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, ! &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, ! &os->os_groupused_dnode); } *osp = os; return (0); } --- 435,451 ---- mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); ! dnode_special_open(os, &os->os_phys->os_meta_dnode, ! DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { ! dnode_special_open(os, &os->os_phys->os_userused_dnode, ! DMU_USERUSED_OBJECT, &os->os_userused_dnode); ! dnode_special_open(os, &os->os_phys->os_groupused_dnode, ! DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; return (0); }
*** 531,541 **** if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); ! } else if (!readonly && dsl_dataset_is_snapshot(ds)) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); } --- 529,539 ---- if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); ! } else if (!readonly && ds->ds_is_snapshot) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); }
*** 587,641 **** } void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; mutex_enter(&os->os_lock); ! ! /* process the mdn last, since the other dnodes have holds on it */ ! list_remove(&os->os_dnodes, DMU_META_DNODE(os)); ! list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); ! /* ! * Find the first dnode with holds. We have to do this dance ! * because dnode_add_ref() only works if you already have a ! * hold. If there are no holds then it has no dbufs so OK to ! * skip. */ ! for (dn = list_head(&os->os_dnodes); ! dn && !dnode_add_ref(dn, FTAG); ! dn = list_next(&os->os_dnodes, dn)) ! continue; ! ! while (dn) { ! dnode_t *next_dn = dn; ! ! do { ! next_dn = list_next(&os->os_dnodes, next_dn); ! } while (next_dn && !dnode_add_ref(next_dn, FTAG)); ! mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); ! dn = next_dn; } mutex_exit(&os->os_lock); } void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { ! if (!dsl_dataset_is_snapshot(ds)) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), --- 585,651 ---- } void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t dn_marker; dnode_t *dn; mutex_enter(&os->os_lock); ! dn = list_head(&os->os_dnodes); ! while (dn != NULL) { /* ! * Skip dnodes without holds. We have to do this dance ! * because dnode_add_ref() only works if there is already a ! * hold. If the dnode has no holds, then it has no dbufs. */ ! if (dnode_add_ref(dn, FTAG)) { ! list_insert_after(&os->os_dnodes, dn, &dn_marker); mutex_exit(&os->os_lock); + dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); + mutex_enter(&os->os_lock); ! dn = list_next(&os->os_dnodes, &dn_marker); ! list_remove(&os->os_dnodes, &dn_marker); ! } else { ! 
dn = list_next(&os->os_dnodes, dn); } + } mutex_exit(&os->os_lock); + + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + } + dnode_evict_dbufs(DMU_META_DNODE(os)); } + /* + * Objset eviction processing is split into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { ! if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION),
*** 668,688 **** } if (os->os_sa) sa_tear_down(os); dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in --- 678,712 ---- } if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } + } + + void + dmu_objset_evict_done(objset_t *os) + { + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in
*** 693,702 **** --- 717,727 ---- rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os)
*** 901,911 **** dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EXDEV)); } /* You can only clone snapshots, not the head datasets. */ ! if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); --- 926,936 ---- dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EXDEV)); } /* You can only clone snapshots, not the head datasets. */ ! if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG);
*** 1465,1475 **** int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) ! return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } int --- 1490,1500 ---- int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) ! return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } int