Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

*** 19,28 **** --- 19,29 ---- * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include <sys/zfs_context.h> #include <sys/dbuf.h> #include <sys/dnode.h>
*** 400,411 **** static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { ! dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate --- 401,413 ---- static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { ! dnode_t *dn; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate
*** 438,454 **** dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); list_insert_head(&os->os_dnodes, dn); membar_producer(); /* ! * Everything else must be valid before assigning dn_objset makes the ! * dnode eligible for dnode_move(). */ dn->dn_objset = os; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } --- 440,474 ---- dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) list_insert_head(&os->os_dnodes, dn); membar_producer(); + /* ! * Everything else must be valid before assigning dn_objset ! * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); }
*** 458,473 **** --- 478,499 ---- */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ zrl_remove(&dn->dn_handle->dnh_zrlock);
*** 498,507 **** --- 524,536 ---- dn->dn_unlisted_l0_blkid = 0; dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
*** 964,1000 **** * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } ! dnode_t * dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { ! dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); ! dnh->dnh_dnode = dn; zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void ! dnode_buf_pageout(dmu_buf_t *db, void *arg) { ! dnode_children_t *children_dnodes = arg; int i; - int epb = db->db_size >> DNODE_SHIFT; ! ASSERT(epb == children_dnodes->dnc_count); ! ! for (i = 0; i < epb; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to --- 993,1028 ---- * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } ! void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { ! dnode_t *dn; ! ! dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); } static void ! dnode_buf_pageout(void *dbu) { ! dnode_children_t *children_dnodes = dbu; int i; ! for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to
*** 1020,1030 **** dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + ! epb * sizeof (dnode_handle_t)); } /* * errors: * EINVAL - invalid object number. --- 1048,1058 ---- dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + ! children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* * errors: * EINVAL - invalid object number.
*** 1104,1123 **** ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; ! children_dnodes = kmem_alloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - dnh[i].dnh_dnode = NULL; } ! if (winner = dmu_buf_set_user(&db->db, children_dnodes, ! dnode_buf_pageout)) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); } --- 1132,1152 ---- ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; ! children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); } ! dmu_buf_init_user(&children_dnodes->dnc_dbu, ! dnode_buf_pageout, NULL); ! winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); ! if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); }
*** 1128,1149 **** } ASSERT(children_dnodes->dnc_count == epb); dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); ! if ((dn = dnh->dnh_dnode) == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); - winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); - if (winner != NULL) { - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - dn = winner; } - } mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || --- 1157,1172 ---- } ASSERT(children_dnodes->dnc_count == epb); dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); ! dn = dnh->dnh_dnode; ! if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; dn = dnode_create(os, phys, db, object, dnh); } mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
*** 1152,1165 **** mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); --- 1175,1188 ---- mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db);