Print this page
6288 dmu_buf_will_dirty could be faster
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Robert Mustacchi <rm@joyent.com>
6267 dn_bonus evicted too early
Reviewed by: Richard Yao <ryao@gentoo.org>
Reviewed by: Xin LI <delphij@freebsd.org>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>


 255         /* Only data blocks support the attachment of user data. */
 256         ASSERT(db->db_level == 0);
 257 
 258         /* Clients must resolve a dbuf before attaching user data. */
 259         ASSERT(db->db.db_data != NULL);
 260         ASSERT3U(db->db_state, ==, DB_CACHED);
 261 
 262         holds = refcount_count(&db->db_holds);
 263         if (verify_type == DBVU_EVICTING) {
 264                 /*
 265                  * Immediate eviction occurs when holds == dirtycnt.
 266                  * For normal eviction buffers, holds is zero on
 267                  * eviction, except when dbuf_fix_old_data() calls
 268                  * dbuf_clear_data().  However, the hold count can grow
 269                  * during eviction even though db_mtx is held (see
 270                  * dmu_bonus_hold() for an example), so we can only
 271                  * test the generic invariant that holds >= dirtycnt.
 272                  */
 273                 ASSERT3U(holds, >=, db->db_dirtycnt);
 274         } else {
 275                 if (db->db_immediate_evict == TRUE)
 276                         ASSERT3U(holds, >=, db->db_dirtycnt);
 277                 else
 278                         ASSERT3U(holds, >, 0);
 279         }
 280 #endif
 281 }
 282 
 283 static void
 284 dbuf_evict_user(dmu_buf_impl_t *db)
 285 {
 286         dmu_buf_user_t *dbu = db->db_user;
 287 
 288         ASSERT(MUTEX_HELD(&db->db_mtx));
 289 
 290         if (dbu == NULL)
 291                 return;
 292 
 293         dbuf_verify_user(db, DBVU_EVICTING);
 294         db->db_user = NULL;
 295 


1080         }
1081         mutex_exit(&db->db_mtx);
1082 
1083         dnode_willuse_space(dn, size-osize, tx);
1084         DB_DNODE_EXIT(db);
1085 }
1086 
     /*
      * Release the ARC buffer backing this dbuf so its on-disk block
      * pointer may be rewritten.  Only legal in syncing context (asserted),
      * with the objset's phys buf released or the dataset already synced.
      */
1087 void
1088 dbuf_release_bp(dmu_buf_impl_t *db)
1089 {
1090         objset_t *os = db->db_objset;
1091 
1092         ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1093         ASSERT(arc_released(os->os_phys_buf) ||
1094             list_link_active(&os->os_dsl_dataset->ds_synced_link));
1095         ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1096 
1097         (void) arc_release(db->db_buf, db);  /* NOTE(review): presumably detaches db_buf from its on-disk identity -- confirm in arc.c */
1098 }
1099 


























1100 dbuf_dirty_record_t *
1101 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1102 {
1103         dnode_t *dn;
1104         objset_t *os;
1105         dbuf_dirty_record_t **drp, *dr;
1106         int drop_struct_lock = FALSE;
1107         boolean_t do_free_accounting = B_FALSE;
1108         int txgoff = tx->tx_txg & TXG_MASK;
1109 
1110         ASSERT(tx->tx_txg != 0);
1111         ASSERT(!refcount_is_zero(&db->db_holds));
1112         DMU_TX_DIRTY_BUF(tx, db);
1113 
1114         DB_DNODE_ENTER(db);
1115         dn = DB_DNODE(db);
1116         /*
1117          * Shouldn't dirty a regular buffer in syncing context.  Private
1118          * objects may be dirtied in syncing context, but only if they
1119          * were already pre-dirtied in open context.


1152                     (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1153                 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1154                 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1155         }
1156         mutex_exit(&dn->dn_mtx);
1157 
1158         if (db->db_blkid == DMU_SPILL_BLKID)
1159                 dn->dn_have_spill = B_TRUE;
1160 
1161         /*
1162          * If this buffer is already dirty, we're done.
1163          */
1164         drp = &db->db_last_dirty;
1165         ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1166             db->db.db_object == DMU_META_DNODE_OBJECT);
1167         while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1168                 drp = &dr->dr_next;
1169         if (dr && dr->dr_txg == tx->tx_txg) {
1170                 DB_DNODE_EXIT(db);
1171 
1172                 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1173                         /*
1174                          * If this buffer has already been written out,
1175                          * we now need to reset its state.
1176                          */
1177                         dbuf_unoverride(dr);
1178                         if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1179                             db->db_state != DB_NOFILL)
1180                                 arc_buf_thaw(db->db_buf);
1181                 }
1182                 mutex_exit(&db->db_mtx);
1183                 return (dr);
1184         }
1185 
1186         /*
1187          * Only valid if not already dirty.
1188          */
1189         ASSERT(dn->dn_object == 0 ||
1190             dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1191             (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1192 
1193         ASSERT3U(dn->dn_nlevels, >, db->db_level);
1194         ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1195             dn->dn_phys->dn_nlevels > db->db_level ||
1196             dn->dn_next_nlevels[txgoff] > db->db_level ||
1197             dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1198             dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1199 
1200         /*
1201          * We should only be dirtying in syncing context if it's the


1471 
1472                 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1473                 dbuf_clear_data(db);
1474                 VERIFY(arc_buf_remove_ref(buf, db));
1475                 dbuf_evict(db);
1476                 return (B_TRUE);
1477         }
1478 
1479         return (B_FALSE);
1480 }
1481 
     /*
      * Public entry point: read in the given dbuf (if necessary) and mark
      * it dirty in tx's transaction group.  tx must be assigned (txg != 0)
      * and the caller must already hold the dbuf.
      */
1482 void
1483 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1484 {
1485         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1486         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1487 
1488         ASSERT(tx->tx_txg != 0);
1489         ASSERT(!refcount_is_zero(&db->db_holds));
1490 
























1491         DB_DNODE_ENTER(db);
1492         if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1493                 rf |= DB_RF_HAVESTRUCT;  /* caller already holds dn_struct_rwlock as writer */
1494         DB_DNODE_EXIT(db);
1495         (void) dbuf_read(db, NULL, rf);
1496         (void) dbuf_dirty(db, tx);
1497 }
1498 
     /*
      * Put the dbuf in DB_NOFILL state before beginning the fill: the
      * caller intends to overwrite the contents, so no read-in should be
      * needed.  Delegates to dmu_buf_will_fill().
      */
1499 void
1500 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1501 {
1502         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1503 
1504         db->db_state = DB_NOFILL;
1505 
1506         dmu_buf_will_fill(db_fake, tx);
1507 }
1508 
1509 void
1510 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)


1794 {
1795         objset_t *os = dn->dn_objset;
1796         dmu_buf_impl_t *db, *odb;
1797 
1798         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1799         ASSERT(dn->dn_type != DMU_OT_NONE);
1800 
1801         db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1802 
1803         db->db_objset = os;
1804         db->db.db_object = dn->dn_object;
1805         db->db_level = level;
1806         db->db_blkid = blkid;
1807         db->db_last_dirty = NULL;
1808         db->db_dirtycnt = 0;
1809         db->db_dnode_handle = dn->dn_handle;
1810         db->db_parent = parent;
1811         db->db_blkptr = blkptr;
1812 
1813         db->db_user = NULL;
1814         db->db_immediate_evict = 0;
1815         db->db_freed_in_flight = 0;

1816 
1817         if (blkid == DMU_BONUS_BLKID) {
1818                 ASSERT3P(parent, ==, dn->dn_dbuf);
1819                 db->db.db_size = DN_MAX_BONUSLEN -
1820                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1821                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1822                 db->db.db_offset = DMU_BONUS_BLKID;
1823                 db->db_state = DB_UNCACHED;
1824                 /* the bonus dbuf is not placed in the hash table */
1825                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1826                 return (db);
1827         } else if (blkid == DMU_SPILL_BLKID) {
1828                 db->db.db_size = (blkptr != NULL) ?
1829                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1830                 db->db.db_offset = 0;
1831         } else {
1832                 int blocksize =
1833                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1834                 db->db.db_size = blocksize;
1835                 db->db.db_offset = db->db_blkid * blocksize;


2187 
2188         ASSERT(MUTEX_HELD(&db->db_mtx));
2189         DBUF_VERIFY(db);
2190 
2191         /*
2192          * Remove the reference to the dbuf before removing its hold on the
2193          * dnode so we can guarantee in dnode_move() that a referenced bonus
2194          * buffer has a corresponding dnode hold.
2195          */
2196         holds = refcount_remove(&db->db_holds, tag);
2197         ASSERT(holds >= 0);
2198 
2199         /*
2200          * We can't freeze indirects if there is a possibility that they
2201          * may be modified in the current syncing context.
2202          */
2203         if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2204                 arc_buf_freeze(db->db_buf);
2205 
2206         if (holds == db->db_dirtycnt &&
2207             db->db_level == 0 && db->db_immediate_evict)
2208                 dbuf_evict_user(db);
2209 
2210         if (holds == 0) {
2211                 if (db->db_blkid == DMU_BONUS_BLKID) {
2212                         dnode_t *dn;

2213 
2214                         /*
2215                          * If the dnode moves here, we cannot cross this
2216                          * barrier until the move completes.
2217                          */
2218                         DB_DNODE_ENTER(db);
2219 
2220                         dn = DB_DNODE(db);
2221                         atomic_dec_32(&dn->dn_dbufs_count);
2222 
2223                         /*
2224                          * Decrementing the dbuf count means that the bonus
2225                          * buffer's dnode hold is no longer discounted in
2226                          * dnode_move(). The dnode cannot move until after
2227                          * the dnode_rele_and_unlock() below.
2228                          */
2229                         DB_DNODE_EXIT(db);
2230 
2231                         /*
2232                          * Do not reference db after its lock is dropped.
2233                          * Another thread may evict it.
2234                          */
2235                         mutex_exit(&db->db_mtx);
2236 
2237                         /*
2238                          * If the dnode has been freed, evict the bonus
2239                          * buffer immediately.  The data in the bonus
2240                          * buffer is no longer relevant and this prevents
2241                          * a stale bonus buffer from being associated
2242                          * with this dnode_t should the dnode_t be reused
2243                          * prior to being destroyed.
2244                          */
2245                         mutex_enter(&dn->dn_mtx);
2246                         if (dn->dn_type == DMU_OT_NONE ||
2247                             dn->dn_free_txg != 0) {
2248                                 /*
2249                                  * Drop dn_mtx.  It is a leaf lock and
2250                                  * cannot be held when dnode_evict_bonus()
2251                                  * acquires other locks in order to
2252                                  * perform the eviction.
2253                                  *
2254                                  * Freed dnodes cannot be reused until the
2255                                  * last hold is released.  Since this bonus
2256                                  * buffer has a hold, the dnode will remain
2257                                  * in the free state, even without dn_mtx
2258                                  * held, until the dnode_rele_and_unlock()
2259                                  * below.
2260                                  */
2261                                 mutex_exit(&dn->dn_mtx);
2262                                 dnode_evict_bonus(dn);
2263                                 mutex_enter(&dn->dn_mtx);
2264                         }
2265                         dnode_rele_and_unlock(dn, db);
2266                 } else if (db->db_buf == NULL) {
2267                         /*
2268                          * This is a special case: we never associated this
2269                          * dbuf with any data allocated from the ARC.
2270                          */
2271                         ASSERT(db->db_state == DB_UNCACHED ||
2272                             db->db_state == DB_NOFILL);
2273                         dbuf_evict(db);
2274                 } else if (arc_released(db->db_buf)) {
2275                         arc_buf_t *buf = db->db_buf;
2276                         /*
2277                          * This dbuf has anonymous data associated with it.
2278                          */
2279                         dbuf_clear_data(db);
2280                         VERIFY(arc_buf_remove_ref(buf, db));
2281                         dbuf_evict(db);
2282                 } else {
2283                         VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2284 
2285                         /*


2292                          * criteria set in the property.
2293                          *
2294                          * To decide if our buffer is considered a
2295                          * duplicate, we must call into the arc to determine
2296                          * if multiple buffers are referencing the same
2297                          * block on-disk. If so, then we simply evict
2298                          * ourselves.
2299                          */
2300                         if (!DBUF_IS_CACHEABLE(db)) {
2301                                 if (db->db_blkptr != NULL &&
2302                                     !BP_IS_HOLE(db->db_blkptr) &&
2303                                     !BP_IS_EMBEDDED(db->db_blkptr)) {
2304                                         spa_t *spa =
2305                                             dmu_objset_spa(db->db_objset);
2306                                         blkptr_t bp = *db->db_blkptr;
2307                                         dbuf_clear(db);
2308                                         arc_freed(spa, &bp);
2309                                 } else {
2310                                         dbuf_clear(db);
2311                                 }
2312                         } else if (db->db_objset->os_evicting ||
2313                             arc_buf_eviction_needed(db->db_buf)) {
2314                                 dbuf_clear(db);
2315                         } else {
2316                                 mutex_exit(&db->db_mtx);
2317                         }
2318                 }
2319         } else {
2320                 mutex_exit(&db->db_mtx);
2321         }
2322 }
2323 
     /* Return the current number of holds on this dbuf. */
2324 #pragma weak dmu_buf_refcount = dbuf_refcount
2325 uint64_t
2326 dbuf_refcount(dmu_buf_impl_t *db)
2327 {
2328         return (refcount_count(&db->db_holds));
2329 }
2330 
2331 void *
2332 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,


2340                 db->db_user = new_user;
2341         else
2342                 old_user = db->db_user;
2343         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2344         mutex_exit(&db->db_mtx);
2345 
2346         return (old_user);
2347 }
2348 
     /*
      * Install 'user' as this dbuf's user data; delegates to
      * dmu_buf_replace_user() with an expected old user of NULL.
      */
2349 void *
2350 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2351 {
2352         return (dmu_buf_replace_user(db_fake, NULL, user));
2353 }
2354 
     /*
      * Like dmu_buf_set_user(), but also flag the dbuf for immediate
      * eviction of its user data when the last hold permitting it drops.
      */
2355 void *
2356 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2357 {
2358         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2359 
2360         db->db_immediate_evict = TRUE;
2361         return (dmu_buf_set_user(db_fake, user));
2362 }
2363 
     /*
      * Detach 'user' from this dbuf; delegates to dmu_buf_replace_user()
      * with a NULL replacement.
      */
2364 void *
2365 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2366 {
2367         return (dmu_buf_replace_user(db_fake, user, NULL));
2368 }
2369 
     /* Return this dbuf's current user data (may be NULL), after a debug check. */
2370 void *
2371 dmu_buf_get_user(dmu_buf_t *db_fake)
2372 {
2373         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2374 
2375         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2376         return (db->db_user);
2377 }
2378 
2379 void
2380 dmu_buf_user_evict_wait()




 255         /* Only data blocks support the attachment of user data. */
 256         ASSERT(db->db_level == 0);
 257 
 258         /* Clients must resolve a dbuf before attaching user data. */
 259         ASSERT(db->db.db_data != NULL);
 260         ASSERT3U(db->db_state, ==, DB_CACHED);
 261 
 262         holds = refcount_count(&db->db_holds);
 263         if (verify_type == DBVU_EVICTING) {
 264                 /*
 265                  * Immediate eviction occurs when holds == dirtycnt.
 266                  * For normal eviction buffers, holds is zero on
 267                  * eviction, except when dbuf_fix_old_data() calls
 268                  * dbuf_clear_data().  However, the hold count can grow
 269                  * during eviction even though db_mtx is held (see
 270                  * dmu_bonus_hold() for an example), so we can only
 271                  * test the generic invariant that holds >= dirtycnt.
 272                  */
 273                 ASSERT3U(holds, >=, db->db_dirtycnt);
 274         } else {
 275                 if (db->db_user_immediate_evict == TRUE)
 276                         ASSERT3U(holds, >=, db->db_dirtycnt);
 277                 else
 278                         ASSERT3U(holds, >, 0);
 279         }
 280 #endif
 281 }
 282 
 283 static void
 284 dbuf_evict_user(dmu_buf_impl_t *db)
 285 {
 286         dmu_buf_user_t *dbu = db->db_user;
 287 
 288         ASSERT(MUTEX_HELD(&db->db_mtx));
 289 
 290         if (dbu == NULL)
 291                 return;
 292 
 293         dbuf_verify_user(db, DBVU_EVICTING);
 294         db->db_user = NULL;
 295 


1080         }
1081         mutex_exit(&db->db_mtx);
1082 
1083         dnode_willuse_space(dn, size-osize, tx);
1084         DB_DNODE_EXIT(db);
1085 }
1086 
     /*
      * Release the ARC buffer backing this dbuf so its on-disk block
      * pointer may be rewritten.  Only legal in syncing context (asserted),
      * with the objset's phys buf released or the dataset already synced.
      */
1087 void
1088 dbuf_release_bp(dmu_buf_impl_t *db)
1089 {
1090         objset_t *os = db->db_objset;
1091 
1092         ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1093         ASSERT(arc_released(os->os_phys_buf) ||
1094             list_link_active(&os->os_dsl_dataset->ds_synced_link));
1095         ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1096 
1097         (void) arc_release(db->db_buf, db);  /* NOTE(review): presumably detaches db_buf from its on-disk identity -- confirm in arc.c */
1098 }
1099 
1100 /*
1101  * We already have a dirty record for this TXG, and we are being
1102  * dirtied again.
1103  */
1104 static void
1105 dbuf_redirty(dbuf_dirty_record_t *dr)
1106 {
1107         dmu_buf_impl_t *db = dr->dr_dbuf;
1108 
1109         ASSERT(MUTEX_HELD(&db->db_mtx));  /* caller holds the dbuf lock */
1110 
1111         if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {  /* leaf, non-bonus blocks only */
1112                 /*
1113                  * If this buffer has already been written out,
1114                  * we now need to reset its state.
1115                  */
1116                 dbuf_unoverride(dr);
1117                 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1118                     db->db_state != DB_NOFILL) {
1119                         /* Already released on initial dirty, so just thaw. */
1120                         ASSERT(arc_released(db->db_buf));
1121                         arc_buf_thaw(db->db_buf);
1122                 }
1123         }
1124 }
1125 
1126 dbuf_dirty_record_t *
1127 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1128 {
1129         dnode_t *dn;
1130         objset_t *os;
1131         dbuf_dirty_record_t **drp, *dr;
1132         int drop_struct_lock = FALSE;
1133         boolean_t do_free_accounting = B_FALSE;
1134         int txgoff = tx->tx_txg & TXG_MASK;
1135 
1136         ASSERT(tx->tx_txg != 0);
1137         ASSERT(!refcount_is_zero(&db->db_holds));
1138         DMU_TX_DIRTY_BUF(tx, db);
1139 
1140         DB_DNODE_ENTER(db);
1141         dn = DB_DNODE(db);
1142         /*
1143          * Shouldn't dirty a regular buffer in syncing context.  Private
1144          * objects may be dirtied in syncing context, but only if they
1145          * were already pre-dirtied in open context.


1178                     (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1179                 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1180                 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1181         }
1182         mutex_exit(&dn->dn_mtx);
1183 
1184         if (db->db_blkid == DMU_SPILL_BLKID)
1185                 dn->dn_have_spill = B_TRUE;
1186 
1187         /*
1188          * If this buffer is already dirty, we're done.
1189          */
1190         drp = &db->db_last_dirty;
1191         ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1192             db->db.db_object == DMU_META_DNODE_OBJECT);
1193         while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1194                 drp = &dr->dr_next;
1195         if (dr && dr->dr_txg == tx->tx_txg) {
1196                 DB_DNODE_EXIT(db);
1197 
1198                 dbuf_redirty(dr);









1199                 mutex_exit(&db->db_mtx);
1200                 return (dr);
1201         }
1202 
1203         /*
1204          * Only valid if not already dirty.
1205          */
1206         ASSERT(dn->dn_object == 0 ||
1207             dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1208             (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1209 
1210         ASSERT3U(dn->dn_nlevels, >, db->db_level);
1211         ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1212             dn->dn_phys->dn_nlevels > db->db_level ||
1213             dn->dn_next_nlevels[txgoff] > db->db_level ||
1214             dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1215             dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1216 
1217         /*
1218          * We should only be dirtying in syncing context if it's the


1488 
1489                 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1490                 dbuf_clear_data(db);
1491                 VERIFY(arc_buf_remove_ref(buf, db));
1492                 dbuf_evict(db);
1493                 return (B_TRUE);
1494         }
1495 
1496         return (B_FALSE);
1497 }
1498 
     /*
      * Public entry point: read in the given dbuf (if necessary) and mark
      * it dirty in tx's transaction group.  tx must be assigned (txg != 0)
      * and the caller must already hold the dbuf.  An early exit avoids
      * the expensive read/dirty path when the dbuf is already dirty.
      */
1499 void
1500 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1501 {
1502         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1503         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1504 
1505         ASSERT(tx->tx_txg != 0);
1506         ASSERT(!refcount_is_zero(&db->db_holds));
1507 
1508         /*
1509          * Quick check for dirtiness.  For already dirty blocks, this
1510          * reduces runtime of this function by >90%, and overall performance
1511          * by 50% for some workloads (e.g. file deletion with indirect blocks
1512          * cached).
1513          */
1514         mutex_enter(&db->db_mtx);
1515         dbuf_dirty_record_t *dr;
1516         for (dr = db->db_last_dirty;
1517             dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1518                 /*
1519                  * It's possible that it is already dirty but not cached,
1520                  * because there are some calls to dbuf_dirty() that don't
1521                  * go through dmu_buf_will_dirty().
1522                  */
1523                 if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1524                         /* This dbuf is already dirty and cached. */
1525                         dbuf_redirty(dr);
1526                         mutex_exit(&db->db_mtx);
1527                         return;
1528                 }
1529         }
1530         mutex_exit(&db->db_mtx);
1531 
1532         DB_DNODE_ENTER(db);
1533         if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1534                 rf |= DB_RF_HAVESTRUCT;  /* caller already holds dn_struct_rwlock as writer */
1535         DB_DNODE_EXIT(db);
1536         (void) dbuf_read(db, NULL, rf);
1537         (void) dbuf_dirty(db, tx);
1538 }
1539 
     /*
      * Put the dbuf in DB_NOFILL state before beginning the fill: the
      * caller intends to overwrite the contents, so no read-in should be
      * needed.  Delegates to dmu_buf_will_fill().
      */
1540 void
1541 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1542 {
1543         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1544 
1545         db->db_state = DB_NOFILL;
1546 
1547         dmu_buf_will_fill(db_fake, tx);
1548 }
1549 
1550 void
1551 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)


1835 {
1836         objset_t *os = dn->dn_objset;
1837         dmu_buf_impl_t *db, *odb;
1838 
1839         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1840         ASSERT(dn->dn_type != DMU_OT_NONE);
1841 
1842         db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1843 
1844         db->db_objset = os;
1845         db->db.db_object = dn->dn_object;
1846         db->db_level = level;
1847         db->db_blkid = blkid;
1848         db->db_last_dirty = NULL;
1849         db->db_dirtycnt = 0;
1850         db->db_dnode_handle = dn->dn_handle;
1851         db->db_parent = parent;
1852         db->db_blkptr = blkptr;
1853 
1854         db->db_user = NULL;
1855         db->db_user_immediate_evict = FALSE;
1856         db->db_freed_in_flight = FALSE;
1857         db->db_pending_evict = FALSE;
1858 
1859         if (blkid == DMU_BONUS_BLKID) {
1860                 ASSERT3P(parent, ==, dn->dn_dbuf);
1861                 db->db.db_size = DN_MAX_BONUSLEN -
1862                     (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1863                 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1864                 db->db.db_offset = DMU_BONUS_BLKID;
1865                 db->db_state = DB_UNCACHED;
1866                 /* the bonus dbuf is not placed in the hash table */
1867                 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1868                 return (db);
1869         } else if (blkid == DMU_SPILL_BLKID) {
1870                 db->db.db_size = (blkptr != NULL) ?
1871                     BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1872                 db->db.db_offset = 0;
1873         } else {
1874                 int blocksize =
1875                     db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1876                 db->db.db_size = blocksize;
1877                 db->db.db_offset = db->db_blkid * blocksize;


2229 
2230         ASSERT(MUTEX_HELD(&db->db_mtx));
2231         DBUF_VERIFY(db);
2232 
2233         /*
2234          * Remove the reference to the dbuf before removing its hold on the
2235          * dnode so we can guarantee in dnode_move() that a referenced bonus
2236          * buffer has a corresponding dnode hold.
2237          */
2238         holds = refcount_remove(&db->db_holds, tag);
2239         ASSERT(holds >= 0);
2240 
2241         /*
2242          * We can't freeze indirects if there is a possibility that they
2243          * may be modified in the current syncing context.
2244          */
2245         if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2246                 arc_buf_freeze(db->db_buf);
2247 
2248         if (holds == db->db_dirtycnt &&
2249             db->db_level == 0 && db->db_user_immediate_evict)
2250                 dbuf_evict_user(db);
2251 
2252         if (holds == 0) {
2253                 if (db->db_blkid == DMU_BONUS_BLKID) {
2254                         dnode_t *dn;
2255                         boolean_t evict_dbuf = db->db_pending_evict;
2256 
2257                         /*
2258                          * If the dnode moves here, we cannot cross this
2259                          * barrier until the move completes.
2260                          */
2261                         DB_DNODE_ENTER(db);
2262 
2263                         dn = DB_DNODE(db);
2264                         atomic_dec_32(&dn->dn_dbufs_count);
2265 
2266                         /*
2267                          * Decrementing the dbuf count means that the bonus
2268                          * buffer's dnode hold is no longer discounted in
2269                          * dnode_move(). The dnode cannot move until after
2270                          * the dnode_rele() below.
2271                          */
2272                         DB_DNODE_EXIT(db);
2273 
2274                         /*
2275                          * Do not reference db after its lock is dropped.
2276                          * Another thread may evict it.
2277                          */
2278                         mutex_exit(&db->db_mtx);
2279 
2280                         if (evict_dbuf)
























2281                                 dnode_evict_bonus(dn);
2282 
2283                         dnode_rele(dn, db);

2284                 } else if (db->db_buf == NULL) {
2285                         /*
2286                          * This is a special case: we never associated this
2287                          * dbuf with any data allocated from the ARC.
2288                          */
2289                         ASSERT(db->db_state == DB_UNCACHED ||
2290                             db->db_state == DB_NOFILL);
2291                         dbuf_evict(db);
2292                 } else if (arc_released(db->db_buf)) {
2293                         arc_buf_t *buf = db->db_buf;
2294                         /*
2295                          * This dbuf has anonymous data associated with it.
2296                          */
2297                         dbuf_clear_data(db);
2298                         VERIFY(arc_buf_remove_ref(buf, db));
2299                         dbuf_evict(db);
2300                 } else {
2301                         VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2302 
2303                         /*


2310                          * criteria set in the property.
2311                          *
2312                          * To decide if our buffer is considered a
2313                          * duplicate, we must call into the arc to determine
2314                          * if multiple buffers are referencing the same
2315                          * block on-disk. If so, then we simply evict
2316                          * ourselves.
2317                          */
2318                         if (!DBUF_IS_CACHEABLE(db)) {
2319                                 if (db->db_blkptr != NULL &&
2320                                     !BP_IS_HOLE(db->db_blkptr) &&
2321                                     !BP_IS_EMBEDDED(db->db_blkptr)) {
2322                                         spa_t *spa =
2323                                             dmu_objset_spa(db->db_objset);
2324                                         blkptr_t bp = *db->db_blkptr;
2325                                         dbuf_clear(db);
2326                                         arc_freed(spa, &bp);
2327                                 } else {
2328                                         dbuf_clear(db);
2329                                 }
2330                         } else if (db->db_pending_evict ||
2331                             arc_buf_eviction_needed(db->db_buf)) {
2332                                 dbuf_clear(db);
2333                         } else {
2334                                 mutex_exit(&db->db_mtx);
2335                         }
2336                 }
2337         } else {
2338                 mutex_exit(&db->db_mtx);
2339         }
2340 }
2341 
     /* Return the current number of holds on this dbuf. */
2342 #pragma weak dmu_buf_refcount = dbuf_refcount
2343 uint64_t
2344 dbuf_refcount(dmu_buf_impl_t *db)
2345 {
2346         return (refcount_count(&db->db_holds));
2347 }
2348 
2349 void *
2350 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,


2358                 db->db_user = new_user;
2359         else
2360                 old_user = db->db_user;
2361         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2362         mutex_exit(&db->db_mtx);
2363 
2364         return (old_user);
2365 }
2366 
     /*
      * Install 'user' as this dbuf's user data; delegates to
      * dmu_buf_replace_user() with an expected old user of NULL.
      */
2367 void *
2368 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2369 {
2370         return (dmu_buf_replace_user(db_fake, NULL, user));
2371 }
2372 
     /*
      * Like dmu_buf_set_user(), but also flag the dbuf for immediate
      * eviction of its user data when the last hold permitting it drops.
      */
2373 void *
2374 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2375 {
2376         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2377 
2378         db->db_user_immediate_evict = TRUE;
2379         return (dmu_buf_set_user(db_fake, user));
2380 }
2381 
     /*
      * Detach 'user' from this dbuf; delegates to dmu_buf_replace_user()
      * with a NULL replacement.
      */
2382 void *
2383 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2384 {
2385         return (dmu_buf_replace_user(db_fake, user, NULL));
2386 }
2387 
     /* Return this dbuf's current user data (may be NULL), after a debug check. */
2388 void *
2389 dmu_buf_get_user(dmu_buf_t *db_fake)
2390 {
2391         dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2392 
2393         dbuf_verify_user(db, DBVU_NOT_EVICTING);
2394         return (db->db_user);
2395 }
2396 
2397 void
2398 dmu_buf_user_evict_wait()