Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>


   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  * Copyright (c) 2013, Joyent, Inc. All rights reserved.

  27  */
  28 
  29 #include <sys/zfs_context.h>
  30 #include <sys/dmu.h>
  31 #include <sys/dmu_send.h>
  32 #include <sys/dmu_impl.h>
  33 #include <sys/dbuf.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/dsl_dataset.h>
  36 #include <sys/dsl_dir.h>
  37 #include <sys/dmu_tx.h>
  38 #include <sys/spa.h>
  39 #include <sys/zio.h>
  40 #include <sys/dmu_zfetch.h>
  41 #include <sys/sa.h>
  42 #include <sys/sa_impl.h>
  43 #include <sys/zfeature.h>
  44 #include <sys/blkptr.h>
  45 #include <sys/range_tree.h>
  46 
  47 /*
  48  * Number of times that zfs_free_range() took the slow path while doing
  49  * a zfs receive.  A nonzero value indicates a potential performance problem.
  50  */
  51 uint64_t zfs_free_range_recv_miss;
  52 
  53 static void dbuf_destroy(dmu_buf_impl_t *db);
  54 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
  55 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
  56 





  57 /*
  58  * Global data structures and functions for the dbuf cache.
  59  */
  60 static kmem_cache_t *dbuf_cache;

  61 
  62 /* ARGSUSED */
  63 static int
  64 dbuf_cons(void *vdb, void *unused, int kmflag)
  65 {
  66         dmu_buf_impl_t *db = vdb;
  67         bzero(db, sizeof (dmu_buf_impl_t));
  68 
  69         mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
  70         cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
  71         refcount_create(&db->db_holds);
  72 
  73         return (0);
  74 }
  75 
  76 /* ARGSUSED */
  77 static void
  78 dbuf_dest(void *vdb, void *unused)
  79 {
  80         dmu_buf_impl_t *db = vdb;


 198          * DBUF_HASH_MUTEX > db_mtx.
 199          */
 200         ASSERT(refcount_is_zero(&db->db_holds));
 201         ASSERT(db->db_state == DB_EVICTING);
 202         ASSERT(!MUTEX_HELD(&db->db_mtx));
 203 
 204         mutex_enter(DBUF_HASH_MUTEX(h, idx));
 205         dbp = &h->hash_table[idx];
 206         while ((dbf = *dbp) != db) {
 207                 dbp = &dbf->db_hash_next;
 208                 ASSERT(dbf != NULL);
 209         }
 210         *dbp = db->db_hash_next;
 211         db->db_hash_next = NULL;
 212         mutex_exit(DBUF_HASH_MUTEX(h, idx));
 213         atomic_dec_64(&dbuf_hash_count);
 214 }
 215 
 216 static arc_evict_func_t dbuf_do_evict;
 217 





 218 static void





































 219 dbuf_evict_user(dmu_buf_impl_t *db)
 220 {


 221         ASSERT(MUTEX_HELD(&db->db_mtx));
 222 
 223         if (db->db_level != 0 || db->db_evict_func == NULL)
 224                 return;
 225 
 226         db->db_evict_func(&db->db, db->db_user_ptr);
 227         db->db_user_ptr = NULL;
 228         db->db_evict_func = NULL;











 229 }
 230 
 231 boolean_t
 232 dbuf_is_metadata(dmu_buf_impl_t *db)
 233 {
 234         if (db->db_level > 0) {
 235                 return (B_TRUE);
 236         } else {
 237                 boolean_t is_metadata;
 238 
 239                 DB_DNODE_ENTER(db);
 240                 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 241                 DB_DNODE_EXIT(db);
 242 
 243                 return (is_metadata);
 244         }
 245 }
 246 
 247 void
 248 dbuf_evict(dmu_buf_impl_t *db)


 269          */
 270         while (hsize * 4096 < physmem * PAGESIZE)
 271                 hsize <<= 1;
 272 
 273 retry:
 274         h->hash_table_mask = hsize - 1;
 275         h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 276         if (h->hash_table == NULL) {
 277                 /* XXX - we should really return an error instead of assert */
 278                 ASSERT(hsize > (1ULL << 10));
 279                 hsize >>= 1;
 280                 goto retry;
 281         }
 282 
 283         dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 284             sizeof (dmu_buf_impl_t),
 285             0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 286 
 287         for (i = 0; i < DBUF_MUTEXES; i++)
 288                 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);






 289 }
 290 
 291 void
 292 dbuf_fini(void)
 293 {
 294         dbuf_hash_table_t *h = &dbuf_hash_table;
 295         int i;
 296 
 297         for (i = 0; i < DBUF_MUTEXES; i++)
 298                 mutex_destroy(&h->hash_mutexes[i]);
 299         kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 300         kmem_cache_destroy(dbuf_cache);

 301 }
 302 
 303 /*
 304  * Other stuff.
 305  */
 306 
 307 #ifdef ZFS_DEBUG
 308 static void
 309 dbuf_verify(dmu_buf_impl_t *db)
 310 {
 311         dnode_t *dn;
 312         dbuf_dirty_record_t *dr;
 313 
 314         ASSERT(MUTEX_HELD(&db->db_mtx));
 315 
 316         if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 317                 return;
 318 
 319         ASSERT(db->db_objset != NULL);
 320         DB_DNODE_ENTER(db);


 398             db->db_state != DB_FILL && !dn->dn_free_txg) {
 399                 /*
 400                  * If the blkptr isn't set but they have nonzero data,
 401                  * it had better be dirty, otherwise we'll lose that
 402                  * data when we evict this buffer.
 403                  */
 404                 if (db->db_dirtycnt == 0) {
 405                         uint64_t *buf = db->db.db_data;
 406                         int i;
 407 
 408                         for (i = 0; i < db->db.db_size >> 3; i++) {
 409                                 ASSERT(buf[i] == 0);
 410                         }
 411                 }
 412         }
 413         DB_DNODE_EXIT(db);
 414 }
 415 #endif
 416 
 417 static void











 418 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 419 {
 420         ASSERT(MUTEX_HELD(&db->db_mtx));


 421         db->db_buf = buf;
 422         if (buf != NULL) {
 423                 ASSERT(buf->b_data != NULL);
 424                 db->db.db_data = buf->b_data;
 425                 if (!arc_released(buf))
 426                         arc_set_callback(buf, dbuf_do_evict, db);
 427         } else {
 428                 dbuf_evict_user(db);
 429                 db->db.db_data = NULL;
 430                 if (db->db_state != DB_NOFILL)
 431                         db->db_state = DB_UNCACHED;
 432         }
 433 }
 434 
 435 /*
 436  * Loan out an arc_buf for read.  Return the loaned arc_buf.
 437  */
 438 arc_buf_t *
 439 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 440 {
 441         arc_buf_t *abuf;
 442 
 443         mutex_enter(&db->db_mtx);
 444         if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 445                 int blksz = db->db.db_size;
 446                 spa_t *spa = db->db_objset->os_spa;
 447 
 448                 mutex_exit(&db->db_mtx);
 449                 abuf = arc_loan_buf(spa, blksz);
 450                 bcopy(db->db.db_data, abuf->b_data, blksz);
 451         } else {
 452                 abuf = db->db_buf;
 453                 arc_loan_inuse_buf(abuf, db);
 454                 dbuf_set_data(db, NULL);
 455                 mutex_exit(&db->db_mtx);
 456         }
 457         return (abuf);
 458 }
 459 
 460 uint64_t
 461 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 462 {
 463         if (dn->dn_datablkshift) {
 464                 return (offset >> dn->dn_datablkshift);
 465         } else {
 466                 ASSERT3U(offset, <, dn->dn_datablksz);
 467                 return (0);
 468         }
 469 }
 470 
 471 static void
 472 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 473 {
 474         dmu_buf_impl_t *db = vdb;


 670         return (err);
 671 }
 672 
 673 static void
 674 dbuf_noread(dmu_buf_impl_t *db)
 675 {
 676         ASSERT(!refcount_is_zero(&db->db_holds));
 677         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 678         mutex_enter(&db->db_mtx);
 679         while (db->db_state == DB_READ || db->db_state == DB_FILL)
 680                 cv_wait(&db->db_changed, &db->db_mtx);
 681         if (db->db_state == DB_UNCACHED) {
 682                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 683                 spa_t *spa = db->db_objset->os_spa;
 684 
 685                 ASSERT(db->db_buf == NULL);
 686                 ASSERT(db->db.db_data == NULL);
 687                 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 688                 db->db_state = DB_FILL;
 689         } else if (db->db_state == DB_NOFILL) {
 690                 dbuf_set_data(db, NULL);
 691         } else {
 692                 ASSERT3U(db->db_state, ==, DB_CACHED);
 693         }
 694         mutex_exit(&db->db_mtx);
 695 }
 696 
 697 /*
 698  * This is our just-in-time copy function.  It makes a copy of
 699  * buffers, that have been modified in a previous transaction
 700  * group, before we modify them in the current active group.
 701  *
 702  * This function is used in two places: when we are dirtying a
 703  * buffer for the first time in a txg, and when we are freeing
 704  * a range in a dnode that includes this buffer.
 705  *
 706  * Note that when we are called from dbuf_free_range() we do
 707  * not put a hold on the buffer, we just traverse the active
 708  * dbuf list for the dnode.
 709  */
 710 static void


 726          * If the last dirty record for this dbuf has not yet synced
 727          * and it's referencing the dbuf data, either:
 728          *      reset the reference to point to a new copy,
 729          * or (if there are no active holders)
 730          *      just null out the current db_data pointer.
 731          */
 732         ASSERT(dr->dr_txg >= txg - 2);
 733         if (db->db_blkid == DMU_BONUS_BLKID) {
 734                 /* Note that the data bufs here are zio_bufs */
 735                 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 736                 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 737                 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 738         } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 739                 int size = db->db.db_size;
 740                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 741                 spa_t *spa = db->db_objset->os_spa;
 742 
 743                 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 744                 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 745         } else {
 746                 dbuf_set_data(db, NULL);
 747         }
 748 }
 749 
 750 void
 751 dbuf_unoverride(dbuf_dirty_record_t *dr)
 752 {
 753         dmu_buf_impl_t *db = dr->dr_dbuf;
 754         blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 755         uint64_t txg = dr->dr_txg;
 756 
 757         ASSERT(MUTEX_HELD(&db->db_mtx));
 758         ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 759         ASSERT(db->db_level == 0);
 760 
 761         if (db->db_blkid == DMU_BONUS_BLKID ||
 762             dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 763                 return;
 764 
 765         ASSERT(db->db_data_pending != dr);
 766 


 777          * modifying the buffer, so they will immediately do
 778          * another (redundant) arc_release().  Therefore, leave
 779          * the buf thawed to save the effort of freezing &
 780          * immediately re-thawing it.
 781          */
 782         arc_release(dr->dt.dl.dr_data, db);
 783 }
 784 
 785 /*
 786  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 787  * data blocks in the free range, so that any future readers will find
 788  * empty blocks.
 789  *
 790  * This is a no-op if the dataset is in the middle of an incremental
 791  * receive; see comment below for details.
 792  */
 793 void
 794 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
 795     dmu_tx_t *tx)
 796 {
 797         dmu_buf_impl_t *db, *db_next, db_search;

 798         uint64_t txg = tx->tx_txg;
 799         avl_index_t where;
 800 
 801         if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
 802                 end_blkid = dn->dn_maxblkid;
 803         dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 804 
 805         db_search.db_level = 0;
 806         db_search.db_blkid = start_blkid;
 807         db_search.db_state = DB_SEARCH;
 808 
 809         mutex_enter(&dn->dn_dbufs_mtx);
 810         if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 811                 /* There can't be any dbufs in this range; no need to search. */
 812 #ifdef DEBUG
 813                 db = avl_find(&dn->dn_dbufs, &db_search, &where);
 814                 ASSERT3P(db, ==, NULL);
 815                 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 816                 ASSERT(db == NULL || db->db_level > 0);
 817 #endif


1353                 ASSERT(db->db_buf != NULL);
1354                 ASSERT(dr->dt.dl.dr_data != NULL);
1355                 if (dr->dt.dl.dr_data != db->db_buf)
1356                         VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1357         }
1358 
1359         if (db->db_level != 0) {
1360                 mutex_destroy(&dr->dt.di.dr_mtx);
1361                 list_destroy(&dr->dt.di.dr_children);
1362         }
1363 
1364         kmem_free(dr, sizeof (dbuf_dirty_record_t));
1365 
1366         ASSERT(db->db_dirtycnt > 0);
1367         db->db_dirtycnt -= 1;
1368 
1369         if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1370                 arc_buf_t *buf = db->db_buf;
1371 
1372                 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1373                 dbuf_set_data(db, NULL);
1374                 VERIFY(arc_buf_remove_ref(buf, db));
1375                 dbuf_evict(db);
1376                 return (B_TRUE);
1377         }
1378 
1379         return (B_FALSE);
1380 }
1381 
1382 void
1383 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1384 {
1385         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1386         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1387 
1388         ASSERT(tx->tx_txg != 0);
1389         ASSERT(!refcount_is_zero(&db->db_holds));
1390 
1391         DB_DNODE_ENTER(db);
1392         if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1393                 rf |= DB_RF_HAVESTRUCT;


1693     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1694 {
1695         objset_t *os = dn->dn_objset;
1696         dmu_buf_impl_t *db, *odb;
1697 
1698         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1699         ASSERT(dn->dn_type != DMU_OT_NONE);
1700 
1701         db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1702 
1703         db->db_objset = os;
1704         db->db.db_object = dn->dn_object;
1705         db->db_level = level;
1706         db->db_blkid = blkid;
1707         db->db_last_dirty = NULL;
1708         db->db_dirtycnt = 0;
1709         db->db_dnode_handle = dn->dn_handle;
1710         db->db_parent = parent;
1711         db->db_blkptr = blkptr;
1712 
1713         db->db_user_ptr = NULL;
1714         db->db_evict_func = NULL;
1715         db->db_immediate_evict = 0;
1716         db->db_freed_in_flight = 0;
1717 
1718         if (blkid == DMU_BONUS_BLKID) {
1719                 ASSERT3P(parent, ==, dn->dn_dbuf);
1720                 db->db.db_size = DN_MAX_BONUSLEN -
1721                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1722                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1723                 db->db.db_offset = DMU_BONUS_BLKID;
1724                 db->db_state = DB_UNCACHED;
1725                 /* the bonus dbuf is not placed in the hash table */
1726                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1727                 return (db);
1728         } else if (blkid == DMU_SPILL_BLKID) {
1729                 db->db.db_size = (blkptr != NULL) ?
1730                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1731                 db->db.db_offset = 0;
1732         } else {
1733                 int blocksize =
1734                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;


2097                         DB_DNODE_EXIT(db);
2098                         /*
2099                          * The bonus buffer's dnode hold is no longer discounted
2100                          * in dnode_move(). The dnode cannot move until after
2101                          * the dnode_rele().
2102                          */
2103                         dnode_rele(DB_DNODE(db), db);
2104                 } else if (db->db_buf == NULL) {
2105                         /*
2106                          * This is a special case: we never associated this
2107                          * dbuf with any data allocated from the ARC.
2108                          */
2109                         ASSERT(db->db_state == DB_UNCACHED ||
2110                             db->db_state == DB_NOFILL);
2111                         dbuf_evict(db);
2112                 } else if (arc_released(db->db_buf)) {
2113                         arc_buf_t *buf = db->db_buf;
2114                         /*
2115                          * This dbuf has anonymous data associated with it.
2116                          */
2117                         dbuf_set_data(db, NULL);
2118                         VERIFY(arc_buf_remove_ref(buf, db));
2119                         dbuf_evict(db);
2120                 } else {
2121                         VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2122 
2123                         /*
2124                          * A dbuf will be eligible for eviction if either the
2125                          * 'primarycache' property is set or a duplicate
2126                          * copy of this buffer is already cached in the arc.
2127                          *
2128                          * In the case of the 'primarycache' a buffer
2129                          * is considered for eviction if it matches the
2130                          * criteria set in the property.
2131                          *
2132                          * To decide if our buffer is considered a
2133                          * duplicate, we must call into the arc to determine
2134                          * if multiple buffers are referencing the same
2135                          * block on-disk. If so, then we simply evict
2136                          * ourselves.
2137                          */
2138                         if (!DBUF_IS_CACHEABLE(db)) {
2139                                 if (db->db_blkptr != NULL &&
2140                                     !BP_IS_HOLE(db->db_blkptr) &&
2141                                     !BP_IS_EMBEDDED(db->db_blkptr)) {
2142                                         spa_t *spa =
2143                                             dmu_objset_spa(db->db_objset);
2144                                         blkptr_t bp = *db->db_blkptr;
2145                                         dbuf_clear(db);
2146                                         arc_freed(spa, &bp);
2147                                 } else {
2148                                         dbuf_clear(db);
2149                                 }
2150                         } else if (arc_buf_eviction_needed(db->db_buf)) {

2151                                 dbuf_clear(db);
2152                         } else {
2153                                 mutex_exit(&db->db_mtx);
2154                         }
2155                 }
2156         } else {
2157                 mutex_exit(&db->db_mtx);
2158         }
2159 }
2160 
2161 #pragma weak dmu_buf_refcount = dbuf_refcount
2162 uint64_t
2163 dbuf_refcount(dmu_buf_impl_t *db)
2164 {
2165         return (refcount_count(&db->db_holds));
2166 }
2167 
2168 void *
2169 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr,
2170     dmu_buf_evict_func_t *evict_func)
2171 {
2172         return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));











2173 }
2174 
2175 void *
2176 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
2177     dmu_buf_evict_func_t *evict_func)
2178 {
2179         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2180 
2181         db->db_immediate_evict = TRUE;
2182         return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func));
2183 }
2184 
2185 void *
2186 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2187     dmu_buf_evict_func_t *evict_func)
2188 {
2189         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2190         ASSERT(db->db_level == 0);
2191 
2192         ASSERT((user_ptr == NULL) == (evict_func == NULL));


2193 
2194         mutex_enter(&db->db_mtx);
2195 
2196         if (db->db_user_ptr == old_user_ptr) {
2197                 db->db_user_ptr = user_ptr;
2198                 db->db_evict_func = evict_func;
2199         } else {
2200                 old_user_ptr = db->db_user_ptr;
2201         }
2202 
2203         mutex_exit(&db->db_mtx);
2204         return (old_user_ptr);
2205 }
2206 
2207 void *
2208 dmu_buf_get_user(dmu_buf_t *db_fake)
2209 {
2210         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2211         ASSERT(!refcount_is_zero(&db->db_holds));
2212 
2213         return (db->db_user_ptr);

2214 }
2215 






2216 boolean_t
2217 dmu_buf_freeable(dmu_buf_t *dbuf)
2218 {
2219         boolean_t res = B_FALSE;
2220         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2221 
2222         if (db->db_blkptr)
2223                 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2224                     db->db_blkptr, db->db_blkptr->blk_birth);
2225 
2226         return (res);
2227 }
2228 
2229 blkptr_t *
2230 dmu_buf_get_blkptr(dmu_buf_t *db)
2231 {
2232         dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2233         return (dbi->db_blkptr);
2234 }
2235 




   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  26  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  28  */
  29 
  30 #include <sys/zfs_context.h>
  31 #include <sys/dmu.h>
  32 #include <sys/dmu_send.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dbuf.h>
  35 #include <sys/dmu_objset.h>
  36 #include <sys/dsl_dataset.h>
  37 #include <sys/dsl_dir.h>
  38 #include <sys/dmu_tx.h>
  39 #include <sys/spa.h>
  40 #include <sys/zio.h>
  41 #include <sys/dmu_zfetch.h>
  42 #include <sys/sa.h>
  43 #include <sys/sa_impl.h>
  44 #include <sys/zfeature.h>
  45 #include <sys/blkptr.h>
  46 #include <sys/range_tree.h>
  47 
  48 /*
  49  * Number of times that zfs_free_range() took the slow path while doing
  50  * a zfs receive.  A nonzero value indicates a potential performance problem.
  51  */
  52 uint64_t zfs_free_range_recv_miss;
  53 
  54 static void dbuf_destroy(dmu_buf_impl_t *db);
  55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
  56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
  57 
  58 #ifndef __lint
  59 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
  60     dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
  61 #endif /* ! __lint */
  62 
  63 /*
  64  * Global data structures and functions for the dbuf cache.
  65  */
  66 static kmem_cache_t *dbuf_cache;
  67 static taskq_t *dbu_evict_taskq;
  68 
  69 /* ARGSUSED */
  70 static int
  71 dbuf_cons(void *vdb, void *unused, int kmflag)
  72 {
  73         dmu_buf_impl_t *db = vdb;
  74         bzero(db, sizeof (dmu_buf_impl_t));
  75 
  76         mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
  77         cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
  78         refcount_create(&db->db_holds);
  79 
  80         return (0);
  81 }
  82 
  83 /* ARGSUSED */
  84 static void
  85 dbuf_dest(void *vdb, void *unused)
  86 {
  87         dmu_buf_impl_t *db = vdb;


 205          * DBUF_HASH_MUTEX > db_mtx.
 206          */
 207         ASSERT(refcount_is_zero(&db->db_holds));
 208         ASSERT(db->db_state == DB_EVICTING);
 209         ASSERT(!MUTEX_HELD(&db->db_mtx));
 210 
 211         mutex_enter(DBUF_HASH_MUTEX(h, idx));
 212         dbp = &h->hash_table[idx];
 213         while ((dbf = *dbp) != db) {
 214                 dbp = &dbf->db_hash_next;
 215                 ASSERT(dbf != NULL);
 216         }
 217         *dbp = db->db_hash_next;
 218         db->db_hash_next = NULL;
 219         mutex_exit(DBUF_HASH_MUTEX(h, idx));
 220         atomic_dec_64(&dbuf_hash_count);
 221 }
 222 
 223 static arc_evict_func_t dbuf_do_evict;
 224 
 225 typedef enum {
 226         DBVU_EVICTING,
 227         DBVU_NOT_EVICTING
 228 } dbvu_verify_type_t;
 229 
 230 static void
 231 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 232 {
 233 #ifdef ZFS_DEBUG
 234         int64_t holds;
 235 
 236         if (db->db_user == NULL)
 237                 return;
 238 
 239         /* Only data blocks support the attachment of user data. */
 240         ASSERT(db->db_level == 0);
 241 
 242         /* Clients must resolve a dbuf before attaching user data. */
 243         ASSERT(db->db.db_data != NULL);
 244         ASSERT3U(db->db_state, ==, DB_CACHED);
 245 
 246         holds = refcount_count(&db->db_holds);
 247         if (verify_type == DBVU_EVICTING) {
 248                 /*
 249                  * Immediate eviction occurs when holds == dirtycnt.
 250                  * For normal eviction buffers, holds is zero on
 251                  * eviction, except when dbuf_fix_old_data() calls
 252                  * dbuf_clear_data().  However, the hold count can grow
 253                  * during eviction even though db_mtx is held (see
 254                  * dmu_bonus_hold() for an example), so we can only
 255                  * test the generic invariant that holds >= dirtycnt.
 256                  */
 257                 ASSERT3U(holds, >=, db->db_dirtycnt);
 258         } else {
 259                 if (db->db_immediate_evict == TRUE)
 260                         ASSERT3U(holds, >=, db->db_dirtycnt);
 261                 else
 262                         ASSERT3U(holds, >, 0);
 263         }
 264 #endif
 265 }
 266 
 267 static void
 268 dbuf_evict_user(dmu_buf_impl_t *db)
 269 {
 270         dmu_buf_user_t *dbu = db->db_user;
 271 
 272         ASSERT(MUTEX_HELD(&db->db_mtx));
 273 
 274         if (dbu == NULL)
 275                 return;
 276 
 277         dbuf_verify_user(db, DBVU_EVICTING);
 278         db->db_user = NULL;
 279 
 280 #ifdef ZFS_DEBUG
 281         if (dbu->dbu_clear_on_evict_dbufp != NULL)
 282                 *dbu->dbu_clear_on_evict_dbufp = NULL;
 283 #endif
 284 
 285         /*
 286          * Invoke the callback from a taskq to avoid lock order reversals
 287          * and limit stack depth.
 288          */
 289         taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
 290             &dbu->dbu_tqent);
 291 }
 292 
 293 boolean_t
 294 dbuf_is_metadata(dmu_buf_impl_t *db)
 295 {
 296         if (db->db_level > 0) {
 297                 return (B_TRUE);
 298         } else {
 299                 boolean_t is_metadata;
 300 
 301                 DB_DNODE_ENTER(db);
 302                 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 303                 DB_DNODE_EXIT(db);
 304 
 305                 return (is_metadata);
 306         }
 307 }
 308 
 309 void
 310 dbuf_evict(dmu_buf_impl_t *db)


 331          */
 332         while (hsize * 4096 < physmem * PAGESIZE)
 333                 hsize <<= 1;
 334 
 335 retry:
 336         h->hash_table_mask = hsize - 1;
 337         h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 338         if (h->hash_table == NULL) {
 339                 /* XXX - we should really return an error instead of assert */
 340                 ASSERT(hsize > (1ULL << 10));
 341                 hsize >>= 1;
 342                 goto retry;
 343         }
 344 
 345         dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 346             sizeof (dmu_buf_impl_t),
 347             0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 348 
 349         for (i = 0; i < DBUF_MUTEXES; i++)
 350                 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 351 
 352         /*
 353          * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 354          * configuration is not required.
 355          */
 356         dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
 357 }
 358 
 359 void
 360 dbuf_fini(void)
 361 {
 362         dbuf_hash_table_t *h = &dbuf_hash_table;
 363         int i;
 364 
 365         for (i = 0; i < DBUF_MUTEXES; i++)
 366                 mutex_destroy(&h->hash_mutexes[i]);
 367         kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 368         kmem_cache_destroy(dbuf_cache);
 369         taskq_destroy(dbu_evict_taskq);
 370 }
 371 
 372 /*
 373  * Other stuff.
 374  */
 375 
 376 #ifdef ZFS_DEBUG
 377 static void
 378 dbuf_verify(dmu_buf_impl_t *db)
 379 {
 380         dnode_t *dn;
 381         dbuf_dirty_record_t *dr;
 382 
 383         ASSERT(MUTEX_HELD(&db->db_mtx));
 384 
 385         if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 386                 return;
 387 
 388         ASSERT(db->db_objset != NULL);
 389         DB_DNODE_ENTER(db);


 467             db->db_state != DB_FILL && !dn->dn_free_txg) {
 468                 /*
 469                  * If the blkptr isn't set but they have nonzero data,
 470                  * it had better be dirty, otherwise we'll lose that
 471                  * data when we evict this buffer.
 472                  */
 473                 if (db->db_dirtycnt == 0) {
 474                         uint64_t *buf = db->db.db_data;
 475                         int i;
 476 
 477                         for (i = 0; i < db->db.db_size >> 3; i++) {
 478                                 ASSERT(buf[i] == 0);
 479                         }
 480                 }
 481         }
 482         DB_DNODE_EXIT(db);
 483 }
 484 #endif
 485 
 486 static void
 487 dbuf_clear_data(dmu_buf_impl_t *db)
 488 {
 489         ASSERT(MUTEX_HELD(&db->db_mtx));
 490         dbuf_evict_user(db);
 491         db->db_buf = NULL;
 492         db->db.db_data = NULL;
 493         if (db->db_state != DB_NOFILL)
 494                 db->db_state = DB_UNCACHED;
 495 }
 496 
 497 static void
 498 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 499 {
 500         ASSERT(MUTEX_HELD(&db->db_mtx));
 501         ASSERT(buf != NULL);
 502 
 503         db->db_buf = buf;

 504         ASSERT(buf->b_data != NULL);
 505         db->db.db_data = buf->b_data;
 506         if (!arc_released(buf))
 507                 arc_set_callback(buf, dbuf_do_evict, db);






 508 }
 509 
 510 /*
 511  * Loan out an arc_buf for read.  Return the loaned arc_buf.
 512  */
 513 arc_buf_t *
 514 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 515 {
 516         arc_buf_t *abuf;
 517 
 518         mutex_enter(&db->db_mtx);
 519         if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 520                 int blksz = db->db.db_size;
 521                 spa_t *spa = db->db_objset->os_spa;
 522 
 523                 mutex_exit(&db->db_mtx);
 524                 abuf = arc_loan_buf(spa, blksz);
 525                 bcopy(db->db.db_data, abuf->b_data, blksz);
 526         } else {
 527                 abuf = db->db_buf;
 528                 arc_loan_inuse_buf(abuf, db);
 529                 dbuf_clear_data(db);
 530                 mutex_exit(&db->db_mtx);
 531         }
 532         return (abuf);
 533 }
 534 
 535 uint64_t
 536 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 537 {
 538         if (dn->dn_datablkshift) {
 539                 return (offset >> dn->dn_datablkshift);
 540         } else {
 541                 ASSERT3U(offset, <, dn->dn_datablksz);
 542                 return (0);
 543         }
 544 }
 545 
 546 static void
 547 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 548 {
 549         dmu_buf_impl_t *db = vdb;


 745         return (err);
 746 }
 747 
 748 static void
 749 dbuf_noread(dmu_buf_impl_t *db)
 750 {
 751         ASSERT(!refcount_is_zero(&db->db_holds));
 752         ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 753         mutex_enter(&db->db_mtx);
 754         while (db->db_state == DB_READ || db->db_state == DB_FILL)
 755                 cv_wait(&db->db_changed, &db->db_mtx);
 756         if (db->db_state == DB_UNCACHED) {
 757                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 758                 spa_t *spa = db->db_objset->os_spa;
 759 
 760                 ASSERT(db->db_buf == NULL);
 761                 ASSERT(db->db.db_data == NULL);
 762                 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 763                 db->db_state = DB_FILL;
 764         } else if (db->db_state == DB_NOFILL) {
 765                 dbuf_clear_data(db);
 766         } else {
 767                 ASSERT3U(db->db_state, ==, DB_CACHED);
 768         }
 769         mutex_exit(&db->db_mtx);
 770 }
 771 
 772 /*
 773  * This is our just-in-time copy function.  It makes a copy of
 774  * buffers, that have been modified in a previous transaction
 775  * group, before we modify them in the current active group.
 776  *
 777  * This function is used in two places: when we are dirtying a
 778  * buffer for the first time in a txg, and when we are freeing
 779  * a range in a dnode that includes this buffer.
 780  *
 781  * Note that when we are called from dbuf_free_range() we do
 782  * not put a hold on the buffer, we just traverse the active
 783  * dbuf list for the dnode.
 784  */
 785 static void


 801          * If the last dirty record for this dbuf has not yet synced
 802          * and its referencing the dbuf data, either:
 803          *      reset the reference to point to a new copy,
 804          * or (if there a no active holders)
 805          *      just null out the current db_data pointer.
 806          */
 807         ASSERT(dr->dr_txg >= txg - 2);
 808         if (db->db_blkid == DMU_BONUS_BLKID) {
 809                 /* Note that the data bufs here are zio_bufs */
 810                 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 811                 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 812                 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 813         } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 814                 int size = db->db.db_size;
 815                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 816                 spa_t *spa = db->db_objset->os_spa;
 817 
 818                 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 819                 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 820         } else {
 821                 dbuf_clear_data(db);
 822         }
 823 }
 824 
 825 void
 826 dbuf_unoverride(dbuf_dirty_record_t *dr)
 827 {
 828         dmu_buf_impl_t *db = dr->dr_dbuf;
 829         blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 830         uint64_t txg = dr->dr_txg;
 831 
 832         ASSERT(MUTEX_HELD(&db->db_mtx));
 833         ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 834         ASSERT(db->db_level == 0);
 835 
 836         if (db->db_blkid == DMU_BONUS_BLKID ||
 837             dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 838                 return;
 839 
 840         ASSERT(db->db_data_pending != dr);
 841 


 852          * modifying the buffer, so they will immediately do
 853          * another (redundant) arc_release().  Therefore, leave
 854          * the buf thawed to save the effort of freezing &
 855          * immediately re-thawing it.
 856          */
 857         arc_release(dr->dt.dl.dr_data, db);
 858 }
 859 
 860 /*
 861  * Evict (if its unreferenced) or clear (if its referenced) any level-0
 862  * data blocks in the free range, so that any future readers will find
 863  * empty blocks.
 864  *
 865  * This is a no-op if the dataset is in the middle of an incremental
 866  * receive; see comment below for details.
 867  */
 868 void
 869 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
 870     dmu_tx_t *tx)
 871 {
 872         dmu_buf_impl_t db_search;
 873         dmu_buf_impl_t *db, *db_next;
 874         uint64_t txg = tx->tx_txg;
 875         avl_index_t where;
 876 
 877         if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
 878                 end_blkid = dn->dn_maxblkid;
 879         dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 880 
 881         db_search.db_level = 0;
 882         db_search.db_blkid = start_blkid;
 883         db_search.db_state = DB_SEARCH;
 884 
 885         mutex_enter(&dn->dn_dbufs_mtx);
 886         if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 887                 /* There can't be any dbufs in this range; no need to search. */
 888 #ifdef DEBUG
 889                 db = avl_find(&dn->dn_dbufs, &db_search, &where);
 890                 ASSERT3P(db, ==, NULL);
 891                 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 892                 ASSERT(db == NULL || db->db_level > 0);
 893 #endif


1429                 ASSERT(db->db_buf != NULL);
1430                 ASSERT(dr->dt.dl.dr_data != NULL);
1431                 if (dr->dt.dl.dr_data != db->db_buf)
1432                         VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1433         }
1434 
1435         if (db->db_level != 0) {
1436                 mutex_destroy(&dr->dt.di.dr_mtx);
1437                 list_destroy(&dr->dt.di.dr_children);
1438         }
1439 
1440         kmem_free(dr, sizeof (dbuf_dirty_record_t));
1441 
1442         ASSERT(db->db_dirtycnt > 0);
1443         db->db_dirtycnt -= 1;
1444 
1445         if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1446                 arc_buf_t *buf = db->db_buf;
1447 
1448                 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1449                 dbuf_clear_data(db);
1450                 VERIFY(arc_buf_remove_ref(buf, db));
1451                 dbuf_evict(db);
1452                 return (B_TRUE);
1453         }
1454 
1455         return (B_FALSE);
1456 }
1457 
1458 void
1459 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1460 {
1461         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1462         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1463 
1464         ASSERT(tx->tx_txg != 0);
1465         ASSERT(!refcount_is_zero(&db->db_holds));
1466 
1467         DB_DNODE_ENTER(db);
1468         if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1469                 rf |= DB_RF_HAVESTRUCT;


1769     dmu_buf_impl_t *parent, blkptr_t *blkptr)
1770 {
1771         objset_t *os = dn->dn_objset;
1772         dmu_buf_impl_t *db, *odb;
1773 
1774         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1775         ASSERT(dn->dn_type != DMU_OT_NONE);
1776 
1777         db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1778 
1779         db->db_objset = os;
1780         db->db.db_object = dn->dn_object;
1781         db->db_level = level;
1782         db->db_blkid = blkid;
1783         db->db_last_dirty = NULL;
1784         db->db_dirtycnt = 0;
1785         db->db_dnode_handle = dn->dn_handle;
1786         db->db_parent = parent;
1787         db->db_blkptr = blkptr;
1788 
1789         db->db_user = NULL;

1790         db->db_immediate_evict = 0;
1791         db->db_freed_in_flight = 0;
1792 
1793         if (blkid == DMU_BONUS_BLKID) {
1794                 ASSERT3P(parent, ==, dn->dn_dbuf);
1795                 db->db.db_size = DN_MAX_BONUSLEN -
1796                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1797                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1798                 db->db.db_offset = DMU_BONUS_BLKID;
1799                 db->db_state = DB_UNCACHED;
1800                 /* the bonus dbuf is not placed in the hash table */
1801                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1802                 return (db);
1803         } else if (blkid == DMU_SPILL_BLKID) {
1804                 db->db.db_size = (blkptr != NULL) ?
1805                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1806                 db->db.db_offset = 0;
1807         } else {
1808                 int blocksize =
1809                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;


2172                         DB_DNODE_EXIT(db);
2173                         /*
2174                          * The bonus buffer's dnode hold is no longer discounted
2175                          * in dnode_move(). The dnode cannot move until after
2176                          * the dnode_rele().
2177                          */
2178                         dnode_rele(DB_DNODE(db), db);
2179                 } else if (db->db_buf == NULL) {
2180                         /*
2181                          * This is a special case: we never associated this
2182                          * dbuf with any data allocated from the ARC.
2183                          */
2184                         ASSERT(db->db_state == DB_UNCACHED ||
2185                             db->db_state == DB_NOFILL);
2186                         dbuf_evict(db);
2187                 } else if (arc_released(db->db_buf)) {
2188                         arc_buf_t *buf = db->db_buf;
2189                         /*
2190                          * This dbuf has anonymous data associated with it.
2191                          */
2192                         dbuf_clear_data(db);
2193                         VERIFY(arc_buf_remove_ref(buf, db));
2194                         dbuf_evict(db);
2195                 } else {
2196                         VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2197 
2198                         /*
2199                          * A dbuf will be eligible for eviction if either the
2200                          * 'primarycache' property is set or a duplicate
2201                          * copy of this buffer is already cached in the arc.
2202                          *
2203                          * In the case of the 'primarycache' a buffer
2204                          * is considered for eviction if it matches the
2205                          * criteria set in the property.
2206                          *
2207                          * To decide if our buffer is considered a
2208                          * duplicate, we must call into the arc to determine
2209                          * if multiple buffers are referencing the same
2210                          * block on-disk. If so, then we simply evict
2211                          * ourselves.
2212                          */
2213                         if (!DBUF_IS_CACHEABLE(db)) {
2214                                 if (db->db_blkptr != NULL &&
2215                                     !BP_IS_HOLE(db->db_blkptr) &&
2216                                     !BP_IS_EMBEDDED(db->db_blkptr)) {
2217                                         spa_t *spa =
2218                                             dmu_objset_spa(db->db_objset);
2219                                         blkptr_t bp = *db->db_blkptr;
2220                                         dbuf_clear(db);
2221                                         arc_freed(spa, &bp);
2222                                 } else {
2223                                         dbuf_clear(db);
2224                                 }
2225                         } else if (db->db_objset->os_evicting ||
2226                             arc_buf_eviction_needed(db->db_buf)) {
2227                                 dbuf_clear(db);
2228                         } else {
2229                                 mutex_exit(&db->db_mtx);
2230                         }
2231                 }
2232         } else {
2233                 mutex_exit(&db->db_mtx);
2234         }
2235 }
2236 
2237 #pragma weak dmu_buf_refcount = dbuf_refcount
2238 uint64_t
2239 dbuf_refcount(dmu_buf_impl_t *db)
2240 {
2241         return (refcount_count(&db->db_holds));
2242 }
2243 
2244 void *
2245 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2246     dmu_buf_user_t *new_user)
2247 {
2248         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2249 
2250         mutex_enter(&db->db_mtx);
2251         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2252         if (db->db_user == old_user)
2253                 db->db_user = new_user;
2254         else
2255                 old_user = db->db_user;
2256         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2257         mutex_exit(&db->db_mtx);
2258 
2259         return (old_user);
2260 }
2261 
2262 void *
2263 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)

2264 {
2265         return (dmu_buf_replace_user(db_fake, NULL, user));



2266 }
2267 
2268 void *
2269 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)

2270 {
2271         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

2272 
2273         db->db_immediate_evict = TRUE;
2274         return (dmu_buf_set_user(db_fake, user));
2275 }
2276 
2277 void *
2278 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2279 {
2280         return (dmu_buf_replace_user(db_fake, user, NULL));







2281 }
2282 
2283 void *
2284 dmu_buf_get_user(dmu_buf_t *db_fake)
2285 {
2286         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

2287 
2288         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2289         return (db->db_user);
2290 }
2291 
2292 void
2293 dmu_buf_user_evict_wait()
2294 {
2295         taskq_wait(dbu_evict_taskq);
2296 }
2297 
2298 boolean_t
2299 dmu_buf_freeable(dmu_buf_t *dbuf)
2300 {
2301         boolean_t res = B_FALSE;
2302         dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2303 
2304         if (db->db_blkptr)
2305                 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2306                     db->db_blkptr, db->db_blkptr->blk_birth);
2307 
2308         return (res);
2309 }
2310 
2311 blkptr_t *
2312 dmu_buf_get_blkptr(dmu_buf_t *db)
2313 {
2314         dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2315         return (dbi->db_blkptr);
2316 }
2317