5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

old dnode.c (before this change):

   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/dbuf.h>
  28 #include <sys/dnode.h>
  29 #include <sys/dmu.h>
  30 #include <sys/dmu_impl.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dsl_dir.h>
  34 #include <sys/dsl_dataset.h>
  35 #include <sys/spa.h>
  36 #include <sys/zio.h>
  37 #include <sys/dmu_zfetch.h>
  38 #include <sys/range_tree.h>
  39 
  40 static kmem_cache_t *dnode_cache;
  41 /*
  42  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  43  * turned on when DEBUG is also defined.


 385         dn->dn_have_spill = B_FALSE;
 386 }
 387 
 388 static void
 389 dnode_setdblksz(dnode_t *dn, int size)
 390 {
 391         ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 392         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 393         ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 394         ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 395             1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 396         dn->dn_datablksz = size;
 397         dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 398         dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 399 }
 400 
 401 static dnode_t *
 402 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 403     uint64_t object, dnode_handle_t *dnh)
 404 {
 405         dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 406 
 407         ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 408         dn->dn_moved = 0;
 409 
 410         /*
 411          * Defer setting dn_objset until the dnode is ready to be a candidate
 412          * for the dnode_move() callback.
 413          */
 414         dn->dn_object = object;
 415         dn->dn_dbuf = db;
 416         dn->dn_handle = dnh;
 417         dn->dn_phys = dnp;
 418 
 419         if (dnp->dn_datablkszsec) {
 420                 dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 421         } else {
 422                 dn->dn_datablksz = 0;
 423                 dn->dn_datablkszsec = 0;
 424                 dn->dn_datablkshift = 0;
 425         }
 426         dn->dn_indblkshift = dnp->dn_indblkshift;
 427         dn->dn_nlevels = dnp->dn_nlevels;
 428         dn->dn_type = dnp->dn_type;
 429         dn->dn_nblkptr = dnp->dn_nblkptr;
 430         dn->dn_checksum = dnp->dn_checksum;
 431         dn->dn_compress = dnp->dn_compress;
 432         dn->dn_bonustype = dnp->dn_bonustype;
 433         dn->dn_bonuslen = dnp->dn_bonuslen;
 434         dn->dn_maxblkid = dnp->dn_maxblkid;
 435         dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 436         dn->dn_id_flags = 0;
 437 
 438         dmu_zfetch_init(&dn->dn_zfetch, dn);
 439 
 440         ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 441 
 442         mutex_enter(&os->os_lock);
 443         list_insert_head(&os->os_dnodes, dn);
 444         membar_producer();
 445         /*
 446          * Everything else must be valid before assigning dn_objset makes the
 447          * dnode eligible for dnode_move().
 448          */
 449         dn->dn_objset = os;
 450         mutex_exit(&os->os_lock);
 451 
 452         arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 453         return (dn);
 454 }
 455 
 456 /*
 457  * Caller must be holding the dnode handle, which is released upon return.
 458  */
 459 static void
 460 dnode_destroy(dnode_t *dn)
 461 {
 462         objset_t *os = dn->dn_objset;
 463 
 464         ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 465 
 466         mutex_enter(&os->os_lock);
 467         POINTER_INVALIDATE(&dn->dn_objset);
 468         list_remove(&os->os_dnodes, dn);
 469         mutex_exit(&os->os_lock);
 470 
 471         /* the dnode can no longer move, so we can release the handle */
 472         zrl_remove(&dn->dn_handle->dnh_zrlock);
 473 
 474         dn->dn_allocated_txg = 0;
 475         dn->dn_free_txg = 0;
 476         dn->dn_assigned_txg = 0;
 477 
 478         dn->dn_dirtyctx = 0;
 479         if (dn->dn_dirtyctx_firstset != NULL) {
 480                 kmem_free(dn->dn_dirtyctx_firstset, 1);
 481                 dn->dn_dirtyctx_firstset = NULL;
 482         }
 483         if (dn->dn_bonus != NULL) {
 484                 mutex_enter(&dn->dn_bonus->db_mtx);
 485                 dbuf_evict(dn->dn_bonus);
 486                 dn->dn_bonus = NULL;
 487         }
 488         dn->dn_zio = NULL;
 489 
 490         dn->dn_have_spill = B_FALSE;
 491         dn->dn_oldused = 0;
 492         dn->dn_oldflags = 0;
 493         dn->dn_olduid = 0;
 494         dn->dn_oldgid = 0;
 495         dn->dn_newuid = 0;
 496         dn->dn_newgid = 0;
 497         dn->dn_id_flags = 0;
 498         dn->dn_unlisted_l0_blkid = 0;
 499 
 500         dmu_zfetch_rele(&dn->dn_zfetch);
 501         kmem_cache_free(dnode_cache, dn);
 502         arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 503 }
 504 
 505 void
 506 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 507     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 508 {
 509         int i;
 510 
 511         ASSERT3U(blocksize, <=,
 512             spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 513         if (blocksize == 0)
 514                 blocksize = 1 << zfs_default_bs;
 515         else
 516                 blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 517 
 518         if (ibs == 0)
 519                 ibs = zfs_default_ibs;
 520 
 521         ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 522 


 949         zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 950         mutex_exit(&os->os_lock);
 951 
 952         return (KMEM_CBRC_YES);
 953 }
 954 #endif  /* _KERNEL */
 955 
 956 void
 957 dnode_special_close(dnode_handle_t *dnh)
 958 {
 959         dnode_t *dn = dnh->dnh_dnode;
 960 
 961         /*
 962          * Wait for final references to the dnode to clear.  This can
 963          * only happen if the arc is asynchronously evicting state that
 964          * has a hold on this dnode while we are trying to evict this
 965          * dnode.
 966          */
 967         while (refcount_count(&dn->dn_holds) > 0)
 968                 delay(1);
 969         zrl_add(&dnh->dnh_zrlock);
 970         dnode_destroy(dn); /* implicit zrl_remove() */
 971         zrl_destroy(&dnh->dnh_zrlock);
 972         dnh->dnh_dnode = NULL;
 973 }
 974 
 975 dnode_t *
 976 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 977     dnode_handle_t *dnh)
 978 {
 979         dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
 980         dnh->dnh_dnode = dn;
 981         zrl_init(&dnh->dnh_zrlock);
 982         DNODE_VERIFY(dn);
 983         return (dn);
 984 }
 985 
 986 static void
 987 dnode_buf_pageout(dmu_buf_t *db, void *arg)
 988 {
 989         dnode_children_t *children_dnodes = arg;
 990         int i;
 991         int epb = db->db_size >> DNODE_SHIFT;
 992 
 993         ASSERT(epb == children_dnodes->dnc_count);
 994 
 995         for (i = 0; i < epb; i++) {
 996                 dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 997                 dnode_t *dn;
 998 
 999                 /*
1000                  * The dnode handle lock guards against the dnode moving to
1001                  * another valid address, so there is no need here to guard
1002                  * against changes to or from NULL.
1003                  */
1004                 if (dnh->dnh_dnode == NULL) {
1005                         zrl_destroy(&dnh->dnh_zrlock);
1006                         continue;
1007                 }
1008 
1009                 zrl_add(&dnh->dnh_zrlock);
1010                 dn = dnh->dnh_dnode;
1011                 /*
1012                  * If there are holds on this dnode, then there should
1013                  * be holds on the dnode's containing dbuf as well; thus
1014                  * it wouldn't be eligible for eviction and this function
1015                  * would not have been called.
1016                  */
1017                 ASSERT(refcount_is_zero(&dn->dn_holds));
1018                 ASSERT(refcount_is_zero(&dn->dn_tx_holds));
1019 
1020                 dnode_destroy(dn); /* implicit zrl_remove() */
1021                 zrl_destroy(&dnh->dnh_zrlock);
1022                 dnh->dnh_dnode = NULL;
1023         }
1024         kmem_free(children_dnodes, sizeof (dnode_children_t) +
1025             epb * sizeof (dnode_handle_t));
1026 }
1027 
1028 /*
1029  * errors:
1030  * EINVAL - invalid object number.
1031  * EIO - i/o error.
1032  * succeeds even for free dnodes.
1033  */
1034 int
1035 dnode_hold_impl(objset_t *os, uint64_t object, int flag,
1036     void *tag, dnode_t **dnp)
1037 {
1038         int epb, idx, err;
1039         int drop_struct_lock = FALSE;
1040         int type;
1041         uint64_t blk;
1042         dnode_t *mdn, *dn;
1043         dmu_buf_impl_t *db;
1044         dnode_children_t *children_dnodes;
1045         dnode_handle_t *dnh;


1089         if (drop_struct_lock)
1090                 rw_exit(&mdn->dn_struct_rwlock);
1091         if (db == NULL)
1092                 return (SET_ERROR(EIO));
1093         err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1094         if (err) {
1095                 dbuf_rele(db, FTAG);
1096                 return (err);
1097         }
1098 
1099         ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1100         epb = db->db.db_size >> DNODE_SHIFT;
1101 
1102         idx = object & (epb-1);
1103 
1104         ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1105         children_dnodes = dmu_buf_get_user(&db->db);
1106         if (children_dnodes == NULL) {
1107                 int i;
1108                 dnode_children_t *winner;
1109                 children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
1110                     epb * sizeof (dnode_handle_t), KM_SLEEP);
1111                 children_dnodes->dnc_count = epb;
1112                 dnh = &children_dnodes->dnc_children[0];
1113                 for (i = 0; i < epb; i++) {
1114                         zrl_init(&dnh[i].dnh_zrlock);
1115                         dnh[i].dnh_dnode = NULL;
1116                 }
1117                 if (winner = dmu_buf_set_user(&db->db, children_dnodes,
1118                     dnode_buf_pageout)) {
1119 
1120                         for (i = 0; i < epb; i++) {
1121                                 zrl_destroy(&dnh[i].dnh_zrlock);
1122                         }
1123 
1124                         kmem_free(children_dnodes, sizeof (dnode_children_t) +
1125                             epb * sizeof (dnode_handle_t));
1126                         children_dnodes = winner;
1127                 }
1128         }
1129         ASSERT(children_dnodes->dnc_count == epb);
1130 
1131         dnh = &children_dnodes->dnc_children[idx];
1132         zrl_add(&dnh->dnh_zrlock);
1133         if ((dn = dnh->dnh_dnode) == NULL) {
1134                 dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1135                 dnode_t *winner;
1136 
1137                 dn = dnode_create(os, phys, db, object, dnh);
1138                 winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
1139                 if (winner != NULL) {
1140                         zrl_add(&dnh->dnh_zrlock);
1141                         dnode_destroy(dn); /* implicit zrl_remove() */
1142                         dn = winner;
1143                 }
1144         }
1145 
1146         mutex_enter(&dn->dn_mtx);
1147         type = dn->dn_type;
1148         if (dn->dn_free_txg ||
1149             ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1150             ((flag & DNODE_MUST_BE_FREE) &&
1151             (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1152                 mutex_exit(&dn->dn_mtx);
1153                 zrl_remove(&dnh->dnh_zrlock);
1154                 dbuf_rele(db, FTAG);
1155                 return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1156         }
1157         mutex_exit(&dn->dn_mtx);
1158 
1159         if (refcount_add(&dn->dn_holds, tag) == 1)
1160                 dbuf_add_ref(db, dnh);
1161         /* Now we can rely on the hold to prevent the dnode from moving. */
1162         zrl_remove(&dnh->dnh_zrlock);
1163 
1164         DNODE_VERIFY(dn);
1165         ASSERT3P(dn->dn_dbuf, ==, db);
1166         ASSERT3U(dn->dn_object, ==, object);
1167         dbuf_rele(db, FTAG);
1168 
1169         *dnp = dn;
1170         return (0);
1171 }
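
In the old code above, a newly created dnode is installed into its handle with a lock-free compare-and-swap: atomic_cas_ptr() returns the previous value of dnh_dnode, so a non-NULL return means another thread won the race, and the loser destroys its dnode and adopts the winner's. A minimal sketch of that install-or-adopt pattern, with hypothetical names (not ZFS code):

#include <sys/atomic.h>

typedef struct thing thing_t;
extern void thing_destroy(thing_t *);   /* hypothetical cleanup routine */

/*
 * Install 'mine' at *slot unless another thread already did; the CAS
 * returns the prior contents of *slot, so non-NULL means we lost.
 */
static thing_t *
install_or_adopt(thing_t **slot, thing_t *mine)
{
        thing_t *winner = atomic_cas_ptr(slot, NULL, mine);

        if (winner != NULL) {
                thing_destroy(mine);    /* lost the race; discard our copy */
                return (winner);        /* adopt the one already installed */
        }
        return (mine);
}
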
1172 
1173 /*
1174  * Return held dnode if the object is allocated, NULL if not.
1175  */
1176 int
1177 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1178 {
1179         return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
1180 }

new dnode.c (after this change):

   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dbuf.h>
  29 #include <sys/dnode.h>
  30 #include <sys/dmu.h>
  31 #include <sys/dmu_impl.h>
  32 #include <sys/dmu_tx.h>
  33 #include <sys/dmu_objset.h>
  34 #include <sys/dsl_dir.h>
  35 #include <sys/dsl_dataset.h>
  36 #include <sys/spa.h>
  37 #include <sys/zio.h>
  38 #include <sys/dmu_zfetch.h>
  39 #include <sys/range_tree.h>
  40 
  41 static kmem_cache_t *dnode_cache;
  42 /*
  43  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  44  * turned on when DEBUG is also defined.


 386         dn->dn_have_spill = B_FALSE;
 387 }
 388 
 389 static void
 390 dnode_setdblksz(dnode_t *dn, int size)
 391 {
 392         ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 393         ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 394         ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 395         ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 396             1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 397         dn->dn_datablksz = size;
 398         dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 399         dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 400 }
 401 
 402 static dnode_t *
 403 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 404     uint64_t object, dnode_handle_t *dnh)
 405 {
 406         dnode_t *dn;
 407 
 408         dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 409         ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 410         dn->dn_moved = 0;
 411 
 412         /*
 413          * Defer setting dn_objset until the dnode is ready to be a candidate
 414          * for the dnode_move() callback.
 415          */
 416         dn->dn_object = object;
 417         dn->dn_dbuf = db;
 418         dn->dn_handle = dnh;
 419         dn->dn_phys = dnp;
 420 
 421         if (dnp->dn_datablkszsec) {
 422                 dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 423         } else {
 424                 dn->dn_datablksz = 0;
 425                 dn->dn_datablkszsec = 0;
 426                 dn->dn_datablkshift = 0;
 427         }
 428         dn->dn_indblkshift = dnp->dn_indblkshift;
 429         dn->dn_nlevels = dnp->dn_nlevels;
 430         dn->dn_type = dnp->dn_type;
 431         dn->dn_nblkptr = dnp->dn_nblkptr;
 432         dn->dn_checksum = dnp->dn_checksum;
 433         dn->dn_compress = dnp->dn_compress;
 434         dn->dn_bonustype = dnp->dn_bonustype;
 435         dn->dn_bonuslen = dnp->dn_bonuslen;
 436         dn->dn_maxblkid = dnp->dn_maxblkid;
 437         dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 438         dn->dn_id_flags = 0;
 439 
 440         dmu_zfetch_init(&dn->dn_zfetch, dn);
 441 
 442         ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 443 
 444         mutex_enter(&os->os_lock);
 445         if (dnh->dnh_dnode != NULL) {
 446                 /* Lost the allocation race. */
 447                 mutex_exit(&os->os_lock);
 448                 kmem_cache_free(dnode_cache, dn);
 449                 return (dnh->dnh_dnode);
 450         }
 451 
 452         /*
 453          * Exclude special dnodes from os_dnodes so an empty os_dnodes
 454          * signifies that the special dnodes have no references from
 455          * their children (the entries in os_dnodes).  This allows
 456          * dnode_destroy() to easily determine if the last child has
 457          * been removed and then complete eviction of the objset.
 458          */
 459         if (!DMU_OBJECT_IS_SPECIAL(object))
 460                 list_insert_head(&os->os_dnodes, dn);
 461         membar_producer();
 462 
 463         /*
 464          * Everything else must be valid before assigning dn_objset
 465          * makes the dnode eligible for dnode_move().
 466          */
 467         dn->dn_objset = os;
 468 
 469         dnh->dnh_dnode = dn;
 470         mutex_exit(&os->os_lock);
 471 
 472         arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 473         return (dn);
 474 }
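
The comments in dnode_create() describe a publication protocol: every field of the dnode is initialized first, membar_producer() orders those stores, and only then is dn_objset assigned, because a valid dn_objset is what makes the dnode eligible for dnode_move(). A minimal sketch of that producer-side ordering, using hypothetical names rather than ZFS code:

#include <sys/atomic.h>

typedef struct owner owner_t;

typedef struct widget {
        int     w_a;
        int     w_b;
        owner_t *w_owner;       /* invalid/NULL until fully constructed */
} widget_t;

/*
 * A scanner that walks widgets treats one without a valid owner as
 * "not yet eligible", so the owner pointer must be the last store and
 * membar_producer() must order the field stores ahead of it.
 */
static void
widget_publish(widget_t *w, owner_t *o, int a, int b)
{
        w->w_a = a;
        w->w_b = b;
        membar_producer();      /* field stores visible before the publish */
        w->w_owner = o;         /* widget is now eligible for the scanner */
}
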
 475 
 476 /*
 477  * Caller must be holding the dnode handle, which is released upon return.
 478  */
 479 static void
 480 dnode_destroy(dnode_t *dn)
 481 {
 482         objset_t *os = dn->dn_objset;
 483         boolean_t complete_os_eviction = B_FALSE;
 484 
 485         ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 486 
 487         mutex_enter(&os->os_lock);
 488         POINTER_INVALIDATE(&dn->dn_objset);
 489         if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 490                 list_remove(&os->os_dnodes, dn);
 491                 complete_os_eviction =
 492                     list_is_empty(&os->os_dnodes) &&
 493                     list_link_active(&os->os_evicting_node);
 494         }
 495         mutex_exit(&os->os_lock);
 496 
 497         /* the dnode can no longer move, so we can release the handle */
 498         zrl_remove(&dn->dn_handle->dnh_zrlock);
 499 
 500         dn->dn_allocated_txg = 0;
 501         dn->dn_free_txg = 0;
 502         dn->dn_assigned_txg = 0;
 503 
 504         dn->dn_dirtyctx = 0;
 505         if (dn->dn_dirtyctx_firstset != NULL) {
 506                 kmem_free(dn->dn_dirtyctx_firstset, 1);
 507                 dn->dn_dirtyctx_firstset = NULL;
 508         }
 509         if (dn->dn_bonus != NULL) {
 510                 mutex_enter(&dn->dn_bonus->db_mtx);
 511                 dbuf_evict(dn->dn_bonus);
 512                 dn->dn_bonus = NULL;
 513         }
 514         dn->dn_zio = NULL;
 515 
 516         dn->dn_have_spill = B_FALSE;
 517         dn->dn_oldused = 0;
 518         dn->dn_oldflags = 0;
 519         dn->dn_olduid = 0;
 520         dn->dn_oldgid = 0;
 521         dn->dn_newuid = 0;
 522         dn->dn_newgid = 0;
 523         dn->dn_id_flags = 0;
 524         dn->dn_unlisted_l0_blkid = 0;
 525 
 526         dmu_zfetch_rele(&dn->dn_zfetch);
 527         kmem_cache_free(dnode_cache, dn);
 528         arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
 529 
 530         if (complete_os_eviction)
 531                 dmu_objset_evict_done(os);
 532 }
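
dnode_destroy() now carries the objset-eviction bookkeeping: under os_lock it notes whether it just removed the last entry from os_dnodes while the objset is queued for eviction, and only after dropping the lock does it call dmu_objset_evict_done(). A minimal sketch of that last-member-out completion shape, with hypothetical names (not ZFS code):

#include <sys/list.h>
#include <sys/mutex.h>
#include <sys/kmem.h>

typedef struct member {
        list_node_t     m_node;
        /* ... per-member state ... */
} member_t;

typedef struct container {
        kmutex_t        c_lock;
        list_t          c_members;
        boolean_t       c_evicting;     /* teardown already requested */
} container_t;

extern void container_evict_done(container_t *);       /* hypothetical hook */

/*
 * The thread removing the last member decides, under the lock, whether
 * it must finish the container's teardown, and runs the completion
 * hook only after dropping the lock so the hook cannot deadlock on it.
 */
static void
member_remove(container_t *c, member_t *m)
{
        boolean_t last;

        mutex_enter(&c->c_lock);
        list_remove(&c->c_members, m);
        last = c->c_evicting && list_is_empty(&c->c_members);
        mutex_exit(&c->c_lock);

        kmem_free(m, sizeof (*m));
        if (last)
                container_evict_done(c);
}
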
 533 
 534 void
 535 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 536     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 537 {
 538         int i;
 539 
 540         ASSERT3U(blocksize, <=,
 541             spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 542         if (blocksize == 0)
 543                 blocksize = 1 << zfs_default_bs;
 544         else
 545                 blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 546 
 547         if (ibs == 0)
 548                 ibs = zfs_default_ibs;
 549 
 550         ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 551 


 978         zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
 979         mutex_exit(&os->os_lock);
 980 
 981         return (KMEM_CBRC_YES);
 982 }
 983 #endif  /* _KERNEL */
 984 
 985 void
 986 dnode_special_close(dnode_handle_t *dnh)
 987 {
 988         dnode_t *dn = dnh->dnh_dnode;
 989 
 990         /*
 991          * Wait for final references to the dnode to clear.  This can
 992          * only happen if the arc is asynchronously evicting state that
 993          * has a hold on this dnode while we are trying to evict this
 994          * dnode.
 995          */
 996         while (refcount_count(&dn->dn_holds) > 0)
 997                 delay(1);
 998         ASSERT(dn->dn_dbuf == NULL ||
 999             dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
1000         zrl_add(&dnh->dnh_zrlock);
1001         dnode_destroy(dn); /* implicit zrl_remove() */
1002         zrl_destroy(&dnh->dnh_zrlock);
1003         dnh->dnh_dnode = NULL;
1004 }
1005 
1006 void
1007 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
1008     dnode_handle_t *dnh)
1009 {
1010         dnode_t *dn;
1011 
1012         dn = dnode_create(os, dnp, NULL, object, dnh);
1013         zrl_init(&dnh->dnh_zrlock);
1014         DNODE_VERIFY(dn);
1015 }
1016 
1017 static void
1018 dnode_buf_pageout(void *dbu)
1019 {
1020         dnode_children_t *children_dnodes = dbu;
1021         int i;
1022 
1023         for (i = 0; i < children_dnodes->dnc_count; i++) {
1024                 dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
1025                 dnode_t *dn;
1026 
1027                 /*
1028                  * The dnode handle lock guards against the dnode moving to
1029                  * another valid address, so there is no need here to guard
1030                  * against changes to or from NULL.
1031                  */
1032                 if (dnh->dnh_dnode == NULL) {
1033                         zrl_destroy(&dnh->dnh_zrlock);
1034                         continue;
1035                 }
1036 
1037                 zrl_add(&dnh->dnh_zrlock);
1038                 dn = dnh->dnh_dnode;
1039                 /*
1040                  * If there are holds on this dnode, then there should
1041                  * be holds on the dnode's containing dbuf as well; thus
1042                  * it wouldn't be eligible for eviction and this function
1043                  * would not have been called.
1044                  */
1045                 ASSERT(refcount_is_zero(&dn->dn_holds));
1046                 ASSERT(refcount_is_zero(&dn->dn_tx_holds));
1047 
1048                 dnode_destroy(dn); /* implicit zrl_remove() */
1049                 zrl_destroy(&dnh->dnh_zrlock);
1050                 dnh->dnh_dnode = NULL;
1051         }
1052         kmem_free(children_dnodes, sizeof (dnode_children_t) +
1053             children_dnodes->dnc_count * sizeof (dnode_handle_t));
1054 }
1055 
1056 /*
1057  * errors:
1058  * EINVAL - invalid object number.
1059  * EIO - i/o error.
1060  * succeeds even for free dnodes.
1061  */
1062 int
1063 dnode_hold_impl(objset_t *os, uint64_t object, int flag,
1064     void *tag, dnode_t **dnp)
1065 {
1066         int epb, idx, err;
1067         int drop_struct_lock = FALSE;
1068         int type;
1069         uint64_t blk;
1070         dnode_t *mdn, *dn;
1071         dmu_buf_impl_t *db;
1072         dnode_children_t *children_dnodes;
1073         dnode_handle_t *dnh;


1117         if (drop_struct_lock)
1118                 rw_exit(&mdn->dn_struct_rwlock);
1119         if (db == NULL)
1120                 return (SET_ERROR(EIO));
1121         err = dbuf_read(db, NULL, DB_RF_CANFAIL);
1122         if (err) {
1123                 dbuf_rele(db, FTAG);
1124                 return (err);
1125         }
1126 
1127         ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1128         epb = db->db.db_size >> DNODE_SHIFT;
1129 
1130         idx = object & (epb-1);
1131 
1132         ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1133         children_dnodes = dmu_buf_get_user(&db->db);
1134         if (children_dnodes == NULL) {
1135                 int i;
1136                 dnode_children_t *winner;
1137                 children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
1138                     epb * sizeof (dnode_handle_t), KM_SLEEP);
1139                 children_dnodes->dnc_count = epb;
1140                 dnh = &children_dnodes->dnc_children[0];
1141                 for (i = 0; i < epb; i++) {
1142                         zrl_init(&dnh[i].dnh_zrlock);
1143                 }
1144                 dmu_buf_init_user(&children_dnodes->dnc_dbu,
1145                     dnode_buf_pageout, NULL);
1146                 winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
1147                 if (winner != NULL) {
1148 
1149                         for (i = 0; i < epb; i++) {
1150                                 zrl_destroy(&dnh[i].dnh_zrlock);
1151                         }
1152 
1153                         kmem_free(children_dnodes, sizeof (dnode_children_t) +
1154                             epb * sizeof (dnode_handle_t));
1155                         children_dnodes = winner;
1156                 }
1157         }
1158         ASSERT(children_dnodes->dnc_count == epb);
1159 
1160         dnh = &children_dnodes->dnc_children[idx];
1161         zrl_add(&dnh->dnh_zrlock);
1162         dn = dnh->dnh_dnode;
1163         if (dn == NULL) {
1164                 dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1165 
1166                 dn = dnode_create(os, phys, db, object, dnh);
1167         }
1168 
1169         mutex_enter(&dn->dn_mtx);
1170         type = dn->dn_type;
1171         if (dn->dn_free_txg ||
1172             ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1173             ((flag & DNODE_MUST_BE_FREE) &&
1174             (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1175                 mutex_exit(&dn->dn_mtx);
1176                 zrl_remove(&dnh->dnh_zrlock);
1177                 dbuf_rele(db, FTAG);
1178                 return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1179         }
1180         if (refcount_add(&dn->dn_holds, tag) == 1)
1181                 dbuf_add_ref(db, dnh);
1182         mutex_exit(&dn->dn_mtx);
1183 
1184         /* Now we can rely on the hold to prevent the dnode from moving. */
1185         zrl_remove(&dnh->dnh_zrlock);
1186 
1187         DNODE_VERIFY(dn);
1188         ASSERT3P(dn->dn_dbuf, ==, db);
1189         ASSERT3U(dn->dn_object, ==, object);
1190         dbuf_rele(db, FTAG);
1191 
1192         *dnp = dn;
1193         return (0);
1194 }
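
The children_dnodes setup above shows the dbuf-user pattern used by this change: the user structure embeds a dmu_buf_user_t as its first member, initializes it with an eviction callback via dmu_buf_init_user(), and attaches it with dmu_buf_set_user(); a non-NULL return means another thread attached a user first, so the loser frees its copy and adopts the winner. A hedged sketch of the same pattern with hypothetical names, relying on the headers already included at the top of this file:

typedef struct my_user {
        dmu_buf_user_t  mu_dbu;         /* first member, as with dnc_dbu above */
        uint64_t        mu_count;
} my_user_t;

static void
my_user_evict(void *dbu)
{
        my_user_t *mu = dbu;

        kmem_free(mu, sizeof (*mu));
}

/*
 * Attach a user to db, or adopt the one that is already attached if we
 * lose the race; mirrors the children_dnodes handling above.
 */
static my_user_t *
my_user_attach(dmu_buf_t *db)
{
        my_user_t *mu = kmem_zalloc(sizeof (*mu), KM_SLEEP);
        my_user_t *winner;

        dmu_buf_init_user(&mu->mu_dbu, my_user_evict, NULL);
        winner = dmu_buf_set_user(db, &mu->mu_dbu);
        if (winner != NULL) {
                kmem_free(mu, sizeof (*mu));
                mu = winner;
        }
        return (mu);
}
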
1195 
1196 /*
1197  * Return held dnode if the object is allocated, NULL if not.
1198  */
1199 int
1200 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
1201 {
1202         return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
1203 }
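
For callers, dnode_hold() pairs with its counterpart release routine, dnode_rele(); the hold taken here is what keeps the dnode from being destroyed or moved while it is in use. A minimal usage sketch, assuming an objset os and object number obj are already in hand:

        dnode_t *dn;
        int err;

        err = dnode_hold(os, obj, FTAG, &dn);
        if (err != 0)
                return (err);
        /* ... operate on the held dnode ... */
        dnode_rele(dn, FTAG);           /* drop the hold taken above */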