3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_send.h>
34 #include <sys/dmu_impl.h>
35 #include <sys/dbuf.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/dmu_tx.h>
40 #include <sys/spa.h>
41 #include <sys/zio.h>
42 #include <sys/dmu_zfetch.h>
43 #include <sys/sa.h>
44 #include <sys/sa_impl.h>
45 #include <sys/zfeature.h>
46 #include <sys/blkptr.h>
47 #include <sys/range_tree.h>
48 #include <sys/callb.h>
49 #include <sys/abd.h>
50 #include <sys/vdev.h>
51 #include <sys/cityhash.h>
52
53 uint_t zfs_dbuf_evict_key;
54
55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
57
58 #ifndef __lint
59 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
60 dmu_buf_evict_func_t *evict_func_sync,
61 dmu_buf_evict_func_t *evict_func_async,
62 dmu_buf_t **clear_on_evict_dbufp);
63 #endif /* ! __lint */
64
65 /*
66 * Global data structures and functions for the dbuf cache.
67 */
68 static kmem_cache_t *dbuf_kmem_cache;
69 static taskq_t *dbu_evict_taskq;
70
71 static kthread_t *dbuf_cache_evict_thread;
72 static kmutex_t dbuf_evict_lock;
73 static kcondvar_t dbuf_evict_cv;
74 static boolean_t dbuf_evict_thread_exit;
75
76 /*
77 * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
78 * are not currently held but have been recently released. These dbufs
79 * are not eligible for arc eviction until they are aged out of the cache.
80 * Dbufs are added to the dbuf cache once the last hold is released. If a
81 * dbuf is later accessed and still exists in the dbuf cache, then it will
82 * be removed from the cache and later re-added to the head of the cache.
83 * Dbufs that are aged out of the cache will be immediately destroyed and
84 * become eligible for arc eviction.
85 */
86 static multilist_t *dbuf_cache;
87 static refcount_t dbuf_cache_size;
88 uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
89
90 /* Cap the size of the dbuf cache to a log2 fraction of the ARC size. */
91 int dbuf_cache_max_shift = 5;
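/*
 * Illustrative sizing example: dbuf_init() below caps dbuf_cache_max_bytes
 * at arc_max_bytes() >> dbuf_cache_max_shift. With a 1 GB ARC the cap is
 * 1 GB >> 5 = 32 MB, which overrides the 100 MB default above; with an
 * 8 GB ARC the shifted value is 256 MB, so the 100 MB default stands.
 */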
92
93 /*
94 * The dbuf cache uses a three-stage eviction policy:
95 * - A low water marker designates when the dbuf eviction thread
96 * should stop evicting from the dbuf cache.
97 * - When we reach the maximum size (aka mid water mark), we
98 * signal the eviction thread to run.
99 * - The high water mark indicates when the eviction thread
100 * is unable to keep up with the incoming load and eviction must
101 * happen in the context of the calling thread.
102 *
103 * The dbuf cache:
104 * (max size)
105 * low water mid water hi water
106 * +----------------------------------------+----------+----------+
107 * | | | |
108 * | | | |
109 * | | | |
110 * | | | |
111 * +----------------------------------------+----------+----------+
112 * stop signal evict
113 * evicting eviction directly
114 * thread
147 multilist_link_init(&db->db_cache_link);
148 refcount_create(&db->db_holds);
149
150 return (0);
151 }
152
153 /* ARGSUSED */
154 static void
155 dbuf_dest(void *vdb, void *unused)
156 {
157 dmu_buf_impl_t *db = vdb;
158 mutex_destroy(&db->db_mtx);
159 cv_destroy(&db->db_changed);
160 ASSERT(!multilist_link_active(&db->db_cache_link));
161 refcount_destroy(&db->db_holds);
162 }
163
164 /*
165 * dbuf hash table routines
166 */
167 static dbuf_hash_table_t dbuf_hash_table;
168
169 static uint64_t dbuf_hash_count;
170
171 /*
172 * We use Cityhash for this. It's fast, and has good hash properties without
173 * requiring any large static buffers.
174 */
175 static uint64_t
176 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
177 {
178 return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
179 }
180
181 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
182 ((dbuf)->db.db_object == (obj) && \
183 (dbuf)->db_objset == (os) && \
184 (dbuf)->db_level == (level) && \
185 (dbuf)->db_blkid == (blkid))
186
187 dmu_buf_impl_t *
188 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
189 {
190 dbuf_hash_table_t *h = &dbuf_hash_table;
191 uint64_t hv = dbuf_hash(os, obj, level, blkid);
192 uint64_t idx = hv & h->hash_table_mask;
193 dmu_buf_impl_t *db;
194
195 mutex_enter(DBUF_HASH_MUTEX(h, idx));
196 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
197 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
198 mutex_enter(&db->db_mtx);
376 dbu, 0, &dbu->dbu_tqent);
377 }
378 }
379
380 boolean_t
381 dbuf_is_metadata(dmu_buf_impl_t *db)
382 {
383 if (db->db_level > 0) {
384 return (B_TRUE);
385 } else {
386 boolean_t is_metadata;
387
388 DB_DNODE_ENTER(db);
389 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
390 DB_DNODE_EXIT(db);
391
392 return (is_metadata);
393 }
394 }
395
396 /*
397 * This function *must* return indices evenly distributed between all
398 * sublists of the multilist. This is needed due to how the dbuf eviction
399 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
400 * distributed between all sublists and uses this assumption when
401 * deciding which sublist to evict from and how much to evict from it.
402 */
403 unsigned int
404 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
405 {
406 dmu_buf_impl_t *db = obj;
407
408 /*
409 * The assumption here is that the hash value for a given
410 * dmu_buf_impl_t will remain constant throughout its lifetime
411 * (i.e. its objset, object, level and blkid fields don't change).
412 * Thus, we don't need to store the dbuf's sublist index
413 * on insertion, as this index can be recalculated on removal.
414 *
415 * Also, the low order bits of the hash value are thought to be
416 * distributed evenly. Otherwise, in the case that the multilist
417 * has a power of two number of sublists, each sublist's usage
418 * would not be evenly distributed.
419 */
420 return (dbuf_hash(db->db_objset, db->db.db_object,
421 db->db_level, db->db_blkid) %
422 multilist_get_num_sublists(ml));
423 }
424
425 static inline boolean_t
426 dbuf_cache_above_hiwater(void)
427 {
428 uint64_t dbuf_cache_hiwater_bytes =
429 (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
430
431 return (refcount_count(&dbuf_cache_size) >
432 dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
433 }
434
435 static inline boolean_t
436 dbuf_cache_above_lowater(void)
437 {
438 uint64_t dbuf_cache_lowater_bytes =
439 (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
440
441 return (refcount_count(&dbuf_cache_size) >
442 dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
443 }
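/*
 * Worked example for the two thresholds above, assuming (for illustration
 * only) that dbuf_cache_hiwater_pct and dbuf_cache_lowater_pct are both 10,
 * values not shown in this excerpt: with dbuf_cache_max_bytes = 100 MB,
 * dbuf_evict_notify() wakes the eviction thread once the cache exceeds
 * 100 MB, callers begin evicting directly above 110 MB (high water), and
 * the eviction thread stops once the cache drops below 90 MB (low water).
 */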
444
445 /*
446 * Evict the oldest eligible dbuf from the dbuf cache.
447 */
448 static void
449 dbuf_evict_one(void)
450 {
451 int idx = multilist_get_random_index(dbuf_cache);
452 multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
453
454 ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
455
456 /*
457 * Set the thread's tsd to indicate that it's processing evictions.
458 * Once a thread stops evicting from the dbuf cache it will
459 * reset its tsd to NULL.
460 */
461 ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
462 (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
463
464 dmu_buf_impl_t *db = multilist_sublist_tail(mls);
465 while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
466 db = multilist_sublist_prev(mls, db);
467 }
468
469 DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
470 multilist_sublist_t *, mls);
471
472 if (db != NULL) {
473 multilist_sublist_remove(mls, db);
474 multilist_sublist_unlock(mls);
475 (void) refcount_remove_many(&dbuf_cache_size,
476 db->db.db_size, db);
477 dbuf_destroy(db);
478 } else {
479 multilist_sublist_unlock(mls);
480 }
481 (void) tsd_set(zfs_dbuf_evict_key, NULL);
482 }
483
484 /*
485 * The dbuf evict thread is responsible for aging out dbufs from the
486 * cache. Once the cache has reached its maximum size, dbufs are removed
487 * and destroyed. The eviction thread will continue running until the size
488 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
489 * out of the cache it is destroyed and becomes eligible for arc eviction.
490 */
491 /* ARGSUSED */
492 static void
493 dbuf_evict_thread(void *unused)
494 {
495 callb_cpr_t cpr;
496
509 /*
510 * Keep evicting as long as we're above the low water mark
511 * for the cache. We do this without holding the locks to
512 * minimize lock contention.
513 */
514 while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
515 dbuf_evict_one();
516 }
517
518 mutex_enter(&dbuf_evict_lock);
519 }
520
521 dbuf_evict_thread_exit = B_FALSE;
522 cv_broadcast(&dbuf_evict_cv);
523 CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
524 thread_exit();
525 }
526
527 /*
528 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
529 * If the dbuf cache is at its high water mark, then evict a dbuf from the
530 * dbuf cache using the caller's context.
531 */
532 static void
533 dbuf_evict_notify(void)
534 {
535
536 /*
537 * We use thread specific data to track when a thread has
538 * started processing evictions. This allows us to avoid deeply
539 * nested stacks that would have a call flow similar to this:
540 *
541 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
542 * ^ |
543 * | |
544 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
545 *
546 * The dbuf_eviction_thread will always have its tsd set until
547 * that thread exits. All other threads will only set their tsd
548 * if they are participating in the eviction process. This only
549 * happens if the eviction thread is unable to process evictions
550 * fast enough. To keep the dbuf cache size in check, other threads
551 * can evict from the dbuf cache directly. Those threads will set
552 * their tsd values so that we ensure that they only evict one dbuf
553 * from the dbuf cache.
554 */
555 if (tsd_get(zfs_dbuf_evict_key) != NULL)
556 return;
557
558 /*
559 * We check if we should evict without holding the dbuf_evict_lock,
560 * because it's OK to occasionally make the wrong decision here,
561 * and grabbing the lock results in massive lock contention.
562 */
563 if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
564 if (dbuf_cache_above_hiwater())
565 dbuf_evict_one();
566 cv_signal(&dbuf_evict_cv);
567 }
568 }
569
570 void
571 dbuf_init(void)
572 {
573 uint64_t hsize = 1ULL << 16;
574 dbuf_hash_table_t *h = &dbuf_hash_table;
575 int i;
576
577 /*
578 * The hash table is big enough to fill all of physical memory
579 * with an average 4K block size. The table will take up
580 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
581 */
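/*
 * Worked example: on a system with 8 GB of physical memory, the loop below
 * grows hsize until hsize * 4096 >= 8 GB, i.e. to 2^21 buckets, so the
 * table of 8-byte pointers occupies 16 MB -- consistent with the 2MB/GB
 * figure above.
 */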
582 while (hsize * 4096 < physmem * PAGESIZE)
583 hsize <<= 1;
584
585 retry:
586 h->hash_table_mask = hsize - 1;
587 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
588 if (h->hash_table == NULL) {
589 /* XXX - we should really return an error instead of assert */
590 ASSERT(hsize > (1ULL << 10));
591 hsize >>= 1;
592 goto retry;
593 }
594
595 dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
596 sizeof (dmu_buf_impl_t),
597 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
598
599 for (i = 0; i < DBUF_MUTEXES; i++)
600 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
601
602 /*
603 * Set up the parameters for the dbuf cache. We cap the size of the
604 * dbuf cache to 1/32nd (default) of the size of the ARC.
605 */
606 dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
607 arc_max_bytes() >> dbuf_cache_max_shift);
608
609 /*
610 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
611 * configuration is not required.
612 */
613 dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
614
615 dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
616 offsetof(dmu_buf_impl_t, db_cache_link),
617 dbuf_cache_multilist_index_func);
618 refcount_create(&dbuf_cache_size);
619
620 tsd_create(&zfs_dbuf_evict_key, NULL);
621 dbuf_evict_thread_exit = B_FALSE;
622 mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
623 cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
624 dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
625 NULL, 0, &p0, TS_RUN, minclsyspri);
626 }
627
628 void
629 dbuf_fini(void)
630 {
631 dbuf_hash_table_t *h = &dbuf_hash_table;
632 int i;
633
634 for (i = 0; i < DBUF_MUTEXES; i++)
635 mutex_destroy(&h->hash_mutexes[i]);
636 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
637 kmem_cache_destroy(dbuf_kmem_cache);
638 taskq_destroy(dbu_evict_taskq);
639
640 mutex_enter(&dbuf_evict_lock);
641 dbuf_evict_thread_exit = B_TRUE;
642 while (dbuf_evict_thread_exit) {
643 cv_signal(&dbuf_evict_cv);
644 cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
645 }
646 mutex_exit(&dbuf_evict_lock);
647 tsd_destroy(&zfs_dbuf_evict_key);
648
649 mutex_destroy(&dbuf_evict_lock);
650 cv_destroy(&dbuf_evict_cv);
651
652 refcount_destroy(&dbuf_cache_size);
653 multilist_destroy(dbuf_cache);
654 }
655
656 /*
657 * Other stuff.
658 */
659
660 #ifdef ZFS_DEBUG
661 static void
662 dbuf_verify(dmu_buf_impl_t *db)
663 {
664 dnode_t *dn;
665 dbuf_dirty_record_t *dr;
666
667 ASSERT(MUTEX_HELD(&db->db_mtx));
668
669 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
670 return;
671
672 ASSERT(db->db_objset != NULL);
673 DB_DNODE_ENTER(db);
1397 }
1398
1399 void
1400 dbuf_release_bp(dmu_buf_impl_t *db)
1401 {
1402 objset_t *os = db->db_objset;
1403
1404 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1405 ASSERT(arc_released(os->os_phys_buf) ||
1406 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1407 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1408
1409 (void) arc_release(db->db_buf, db);
1410 }
1411
1412 /*
1413 * We already have a dirty record for this TXG, and we are being
1414 * dirtied again.
1415 */
1416 static void
1417 dbuf_redirty(dbuf_dirty_record_t *dr)
1418 {
1419 dmu_buf_impl_t *db = dr->dr_dbuf;
1420
1421 ASSERT(MUTEX_HELD(&db->db_mtx));
1422
1423 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1424 /*
1425 * If this buffer has already been written out,
1426 * we now need to reset its state.
1427 */
1428 dbuf_unoverride(dr);
1429 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1430 db->db_state != DB_NOFILL) {
1431 /* Already released on initial dirty, so just thaw. */
1432 ASSERT(arc_released(db->db_buf));
1433 arc_buf_thaw(db->db_buf);
1434 }
1435 }
1436 }
1437
1438 dbuf_dirty_record_t *
1439 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1440 {
1441 dnode_t *dn;
1442 objset_t *os;
1443 dbuf_dirty_record_t **drp, *dr;
1444 int drop_struct_lock = FALSE;
1445 int txgoff = tx->tx_txg & TXG_MASK;
1446
1447 ASSERT(tx->tx_txg != 0);
1448 ASSERT(!refcount_is_zero(&db->db_holds));
1449 DMU_TX_DIRTY_BUF(tx, db);
1450
1451 DB_DNODE_ENTER(db);
1452 dn = DB_DNODE(db);
1453 /*
1454 * Shouldn't dirty a regular buffer in syncing context. Private
1455 * objects may be dirtied in syncing context, but only if they
1456 * were already pre-dirtied in open context.
1457 */
1458 #ifdef DEBUG
1459 if (dn->dn_objset->os_dsl_dataset != NULL) {
1506 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1507 FTAG);
1508 }
1509 }
1510 mutex_exit(&dn->dn_mtx);
1511
1512 if (db->db_blkid == DMU_SPILL_BLKID)
1513 dn->dn_have_spill = B_TRUE;
1514
1515 /*
1516 * If this buffer is already dirty, we're done.
1517 */
1518 drp = &db->db_last_dirty;
1519 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1520 db->db.db_object == DMU_META_DNODE_OBJECT);
1521 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1522 drp = &dr->dr_next;
1523 if (dr && dr->dr_txg == tx->tx_txg) {
1524 DB_DNODE_EXIT(db);
1525
1526 dbuf_redirty(dr);
1527 mutex_exit(&db->db_mtx);
1528 return (dr);
1529 }
1530
1531 /*
1532 * Only valid if not already dirty.
1533 */
1534 ASSERT(dn->dn_object == 0 ||
1535 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1536 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1537
1538 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1539
1540 /*
1541 * We should only be dirtying in syncing context if it's the
1542 * mos or we're initializing the os or it's a special object.
1543 * However, we are allowed to dirty in syncing context provided
1544 * we already dirtied it in open context. Hence we must make
1545 * this assertion only if we're not already dirty.
1546 */
1586 * then).
1587 */
1588 arc_release(db->db_buf, db);
1589 dbuf_fix_old_data(db, tx->tx_txg);
1590 data_old = db->db_buf;
1591 }
1592 ASSERT(data_old != NULL);
1593 }
1594 dr->dt.dl.dr_data = data_old;
1595 } else {
1596 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1597 list_create(&dr->dt.di.dr_children,
1598 sizeof (dbuf_dirty_record_t),
1599 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1600 }
1601 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1602 dr->dr_accounted = db->db.db_size;
1603 dr->dr_dbuf = db;
1604 dr->dr_txg = tx->tx_txg;
1605 dr->dr_next = *drp;
1606 *drp = dr;
1607
1608 /*
1609 * We could have been freed_in_flight between the dbuf_noread
1610 * and dbuf_dirty. We win, as though the dbuf_noread() had
1611 * happened after the free.
1612 */
1613 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1614 db->db_blkid != DMU_SPILL_BLKID) {
1615 mutex_enter(&dn->dn_mtx);
1616 if (dn->dn_free_ranges[txgoff] != NULL) {
1617 range_tree_clear(dn->dn_free_ranges[txgoff],
1618 db->db_blkid, 1);
1619 }
1620 mutex_exit(&dn->dn_mtx);
1621 db->db_freed_in_flight = FALSE;
1622 }
1623
1624 /*
1625 * This buffer is now part of this txg
1626 */
1627 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1628 db->db_dirtycnt += 1;
1629 ASSERT3U(db->db_dirtycnt, <=, 3);
1630
1631 mutex_exit(&db->db_mtx);
1632
1633 if (db->db_blkid == DMU_BONUS_BLKID ||
1634 db->db_blkid == DMU_SPILL_BLKID) {
1635 mutex_enter(&dn->dn_mtx);
1636 ASSERT(!list_link_active(&dr->dr_dirty_node));
1637 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1638 mutex_exit(&dn->dn_mtx);
1639 dnode_setdirty(dn, tx);
1640 DB_DNODE_EXIT(db);
1641 return (dr);
1642 }
1643
1644 /*
1645 * The dn_struct_rwlock prevents db_blkptr from changing
1646 * due to a write from syncing context completing
1647 * while we are running, so we want to acquire it before
1648 * looking at db_blkptr.
1649 */
1650 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1651 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1652 drop_struct_lock = TRUE;
1653 }
1654
1655 /*
1656 * We need to hold the dn_struct_rwlock to make this assertion,
1657 * because it protects dn_phys / dn_next_nlevels from changing.
1658 */
1659 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1660 dn->dn_phys->dn_nlevels > db->db_level ||
1661 dn->dn_next_nlevels[txgoff] > db->db_level ||
1662 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1663 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1664
1665 /*
1666 * If we are overwriting a dedup BP, then unless it is snapshotted,
1667 * when we get to syncing context we will need to decrement its
1668 * refcount in the DDT. Prefetch the relevant DDT block so that
1669 * syncing context won't have to wait for the i/o.
1670 */
1671 ddt_prefetch(os->os_spa, db->db_blkptr);
1672
1673 if (db->db_level == 0) {
1674 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1675 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1676 }
1677
1678 if (db->db_level+1 < dn->dn_nlevels) {
1679 dmu_buf_impl_t *parent = db->db_parent;
1680 dbuf_dirty_record_t *di;
1681 int parent_held = FALSE;
1682
1683 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1684 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1685
1686 parent = dbuf_hold_level(dn, db->db_level+1,
1687 db->db_blkid >> epbs, FTAG);
1688 ASSERT(parent != NULL);
1689 parent_held = TRUE;
1690 }
1691 if (drop_struct_lock)
1692 rw_exit(&dn->dn_struct_rwlock);
1693 ASSERT3U(db->db_level+1, ==, parent->db_level);
1694 di = dbuf_dirty(parent, tx);
1695 if (parent_held)
1696 dbuf_rele(parent, FTAG);
1697
1698 mutex_enter(&db->db_mtx);
1699 /*
1700 * Since we've dropped the mutex, it's possible that
1701 * dbuf_undirty() might have changed this out from under us.
1702 */
1703 if (db->db_last_dirty == dr ||
1704 dn->dn_object == DMU_META_DNODE_OBJECT) {
1705 mutex_enter(&di->dt.di.dr_mtx);
1706 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1707 ASSERT(!list_link_active(&dr->dr_dirty_node));
1708 list_insert_tail(&di->dt.di.dr_children, dr);
1709 mutex_exit(&di->dt.di.dr_mtx);
1710 dr->dr_parent = di;
1711 }
1712 mutex_exit(&db->db_mtx);
1713 } else {
1714 ASSERT(db->db_level+1 == dn->dn_nlevels);
1715 ASSERT(db->db_blkid < dn->dn_nblkptr);
1716 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1717 mutex_enter(&dn->dn_mtx);
1718 ASSERT(!list_link_active(&dr->dr_dirty_node));
1719 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1720 mutex_exit(&dn->dn_mtx);
1721 if (drop_struct_lock)
1722 rw_exit(&dn->dn_struct_rwlock);
1723 }
1724
1725 dnode_setdirty(dn, tx);
1726 DB_DNODE_EXIT(db);
1727 return (dr);
1728 }
1729
1730 /*
1731 * Undirty a buffer in the transaction group referenced by the given
1732 * transaction. Return whether this evicted the dbuf.
1733 */
1734 static boolean_t
1735 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1736 {
1737 dnode_t *dn;
1738 uint64_t txg = tx->tx_txg;
1739 dbuf_dirty_record_t *dr, **drp;
1740
1741 ASSERT(txg != 0);
1742
1743 /*
1744 * Due to our use of dn_nlevels below, this can only be called
1745 * in open context, unless we are operating on the MOS.
1746 * From syncing context, dn_nlevels may be different from the
1747 * dn_nlevels used when dbuf was dirtied.
1748 */
1749 ASSERT(db->db_objset ==
1805 }
1806
1807 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1808
1809 ASSERT(db->db_dirtycnt > 0);
1810 db->db_dirtycnt -= 1;
1811
1812 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1813 ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
1814 dbuf_destroy(db);
1815 return (B_TRUE);
1816 }
1817
1818 return (B_FALSE);
1819 }
1820
1821 void
1822 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1823 {
1824 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1825 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1826
1827 ASSERT(tx->tx_txg != 0);
1828 ASSERT(!refcount_is_zero(&db->db_holds));
1829
1830 /*
1831 * Quick check for dirtiness. For already dirty blocks, this
1832 * reduces the runtime of this function by >90% and improves overall
1833 * performance by 50% for some workloads (e.g. file deletion with
1834 * indirect blocks cached).
1835 */
1836 mutex_enter(&db->db_mtx);
1837 dbuf_dirty_record_t *dr;
1838 for (dr = db->db_last_dirty;
1839 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
1840 /*
1841 * It's possible that it is already dirty but not cached,
1842 * because there are some calls to dbuf_dirty() that don't
1843 * go through dmu_buf_will_dirty().
1844 */
1845 if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
1846 /* This dbuf is already dirty and cached. */
1847 dbuf_redirty(dr);
1848 mutex_exit(&db->db_mtx);
1849 return;
1850 }
1851 }
1852 mutex_exit(&db->db_mtx);
1853
1854 DB_DNODE_ENTER(db);
1855 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1856 rf |= DB_RF_HAVESTRUCT;
1857 DB_DNODE_EXIT(db);
1858 (void) dbuf_read(db, NULL, rf);
1859 (void) dbuf_dirty(db, tx);
1860 }
1861
1862 void
1863 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1864 {
1865 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1866
1867 db->db_state = DB_NOFILL;
1868
1869 dmu_buf_will_fill(db_fake, tx);
1870 }
1871
1872 void
1873 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1874 {
1875 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1876
1877 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1878 ASSERT(tx->tx_txg != 0);
1879 ASSERT(db->db_level == 0);
1880 ASSERT(!refcount_is_zero(&db->db_holds));
1881
2016 dmu_buf_impl_t *dndb;
2017
2018 ASSERT(MUTEX_HELD(&db->db_mtx));
2019 ASSERT(refcount_is_zero(&db->db_holds));
2020
2021 if (db->db_buf != NULL) {
2022 arc_buf_destroy(db->db_buf, db);
2023 db->db_buf = NULL;
2024 }
2025
2026 if (db->db_blkid == DMU_BONUS_BLKID) {
2027 ASSERT(db->db.db_data != NULL);
2028 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
2029 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2030 db->db_state = DB_UNCACHED;
2031 }
2032
2033 dbuf_clear_data(db);
2034
2035 if (multilist_link_active(&db->db_cache_link)) {
2036 multilist_remove(dbuf_cache, db);
2037 (void) refcount_remove_many(&dbuf_cache_size,
2038 db->db.db_size, db);
2039 }
2040
2041 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2042 ASSERT(db->db_data_pending == NULL);
2043
2044 db->db_state = DB_EVICTING;
2045 db->db_blkptr = NULL;
2046
2047 /*
2048 * Now that db_state is DB_EVICTING, nobody else can find this via
2049 * the hash table. We can now drop db_mtx, which allows us to
2050 * acquire the dn_dbufs_mtx.
2051 */
2052 mutex_exit(&db->db_mtx);
2053
2054 DB_DNODE_ENTER(db);
2055 dn = DB_DNODE(db);
2056 dndb = dn->dn_dbuf;
2057 if (db->db_blkid != DMU_BONUS_BLKID) {
2058 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2072 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2073 * release any lock.
2074 */
2075 dnode_rele(dn, db);
2076 db->db_dnode_handle = NULL;
2077
2078 dbuf_hash_remove(db);
2079 } else {
2080 DB_DNODE_EXIT(db);
2081 }
2082
2083 ASSERT(refcount_is_zero(&db->db_holds));
2084
2085 db->db_parent = NULL;
2086
2087 ASSERT(db->db_buf == NULL);
2088 ASSERT(db->db.db_data == NULL);
2089 ASSERT(db->db_hash_next == NULL);
2090 ASSERT(db->db_blkptr == NULL);
2091 ASSERT(db->db_data_pending == NULL);
2092 ASSERT(!multilist_link_active(&db->db_cache_link));
2093
2094 kmem_cache_free(dbuf_kmem_cache, db);
2095 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2096
2097 /*
2098 * If this dbuf is referenced from an indirect dbuf,
2099 * decrement the ref count on the indirect dbuf.
2100 */
2101 if (parent && parent != dndb)
2102 dbuf_rele(parent, db);
2103 }
2104
2105 /*
2106 * Note: While bpp will always be updated if the function returns success,
2107 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2108 * this happens when the dnode is the meta-dnode, or a userused or groupused
2109 * object.
2110 */
2111 static int
2210 db->db_level = level;
2211 db->db_blkid = blkid;
2212 db->db_last_dirty = NULL;
2213 db->db_dirtycnt = 0;
2214 db->db_dnode_handle = dn->dn_handle;
2215 db->db_parent = parent;
2216 db->db_blkptr = blkptr;
2217
2218 db->db_user = NULL;
2219 db->db_user_immediate_evict = FALSE;
2220 db->db_freed_in_flight = FALSE;
2221 db->db_pending_evict = FALSE;
2222
2223 if (blkid == DMU_BONUS_BLKID) {
2224 ASSERT3P(parent, ==, dn->dn_dbuf);
2225 db->db.db_size = DN_MAX_BONUSLEN -
2226 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
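/*
 * Sizing note (illustrative; assumes the traditional 320-byte
 * DN_MAX_BONUSLEN and 128-byte blkptr_t): a dnode with dn_nblkptr == 3
 * leaves 320 - 2 * 128 = 64 bytes of bonus space, while dn_nblkptr == 1
 * leaves the full 320 bytes.
 */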
2227 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
2228 db->db.db_offset = DMU_BONUS_BLKID;
2229 db->db_state = DB_UNCACHED;
2230 /* the bonus dbuf is not placed in the hash table */
2231 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2232 return (db);
2233 } else if (blkid == DMU_SPILL_BLKID) {
2234 db->db.db_size = (blkptr != NULL) ?
2235 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
2236 db->db.db_offset = 0;
2237 } else {
2238 int blocksize =
2239 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
2240 db->db.db_size = blocksize;
2241 db->db.db_offset = db->db_blkid * blocksize;
2242 }
2243
2244 /*
2245 * Hold the dn_dbufs_mtx while we get the new dbuf
2246 * in the hash table *and* added to the dbufs list.
2247 * This prevents a possible deadlock with someone
2248 * trying to look up this dbuf before it's added to the
2249 * dn_dbufs list.
2250 */
2251 mutex_enter(&dn->dn_dbufs_mtx);
2252 db->db_state = DB_EVICTING;
2253 if ((odb = dbuf_hash_insert(db)) != NULL) {
2254 /* someone else inserted it first */
2255 kmem_cache_free(dbuf_kmem_cache, db);
2256 mutex_exit(&dn->dn_dbufs_mtx);
2257 return (odb);
2258 }
2259 avl_add(&dn->dn_dbufs, db);
2260
2261 db->db_state = DB_UNCACHED;
2262 mutex_exit(&dn->dn_dbufs_mtx);
2263 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2264
2265 if (parent && parent != dn->dn_dbuf)
2266 dbuf_add_ref(parent, db);
2267
2268 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2269 refcount_count(&dn->dn_holds) > 0);
2270 (void) refcount_add(&dn->dn_holds, db);
2271 atomic_inc_32(&dn->dn_dbufs_count);
2272
2273 dprintf_dbuf(db, "db=%p\n", db);
2274
2275 return (db);
2276 }
2277
2278 typedef struct dbuf_prefetch_arg {
2279 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2280 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2281 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2548 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2549 if (fail_sparse) {
2550 if (err == 0 && bp && BP_IS_HOLE(bp))
2551 err = SET_ERROR(ENOENT);
2552 if (err) {
2553 if (parent)
2554 dbuf_rele(parent, NULL);
2555 return (err);
2556 }
2557 }
2558 if (err && err != ENOENT)
2559 return (err);
2560 db = dbuf_create(dn, level, blkid, parent, bp);
2561 }
2562
2563 if (fail_uncached && db->db_state != DB_CACHED) {
2564 mutex_exit(&db->db_mtx);
2565 return (SET_ERROR(ENOENT));
2566 }
2567
2568 if (db->db_buf != NULL)
2569 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2570
2571 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2572
2573 /*
2574 * If this buffer is currently syncing out, and we are
2575 * still referencing it from db_data, we need to make a copy
2576 * of it in case we decide we want to dirty it again in this txg.
2577 */
2578 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2579 dn->dn_object != DMU_META_DNODE_OBJECT &&
2580 db->db_state == DB_CACHED && db->db_data_pending) {
2581 dbuf_dirty_record_t *dr = db->db_data_pending;
2582
2583 if (dr->dt.dl.dr_data == db->db_buf) {
2584 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2585
2586 dbuf_set_data(db,
2587 arc_alloc_buf(dn->dn_objset->os_spa, db, type,
2588 db->db.db_size));
2589 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2590 db->db.db_size);
2591 }
2592 }
2593
2594 if (multilist_link_active(&db->db_cache_link)) {
2595 ASSERT(refcount_is_zero(&db->db_holds));
2596 multilist_remove(dbuf_cache, db);
2597 (void) refcount_remove_many(&dbuf_cache_size,
2598 db->db.db_size, db);
2599 }
2600 (void) refcount_add(&db->db_holds, tag);
2601 DBUF_VERIFY(db);
2602 mutex_exit(&db->db_mtx);
2603
2604 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2605 if (parent)
2606 dbuf_rele(parent, NULL);
2607
2608 ASSERT3P(DB_DNODE(db), ==, dn);
2609 ASSERT3U(db->db_blkid, ==, blkid);
2610 ASSERT3U(db->db_level, ==, level);
2611 *dbp = db;
2612
2613 return (0);
2614 }
2615
2616 dmu_buf_impl_t *
2617 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2618 {
2795 * This dbuf has anonymous data associated with it.
2796 */
2797 dbuf_destroy(db);
2798 } else {
2799 boolean_t do_arc_evict = B_FALSE;
2800 blkptr_t bp;
2801 spa_t *spa = dmu_objset_spa(db->db_objset);
2802
2803 if (!DBUF_IS_CACHEABLE(db) &&
2804 db->db_blkptr != NULL &&
2805 !BP_IS_HOLE(db->db_blkptr) &&
2806 !BP_IS_EMBEDDED(db->db_blkptr)) {
2807 do_arc_evict = B_TRUE;
2808 bp = *db->db_blkptr;
2809 }
2810
2811 if (!DBUF_IS_CACHEABLE(db) ||
2812 db->db_pending_evict) {
2813 dbuf_destroy(db);
2814 } else if (!multilist_link_active(&db->db_cache_link)) {
2815 multilist_insert(dbuf_cache, db);
2816 (void) refcount_add_many(&dbuf_cache_size,
2817 db->db.db_size, db);
2818 mutex_exit(&db->db_mtx);
2819
2820 dbuf_evict_notify();
2821 }
2822
2823 if (do_arc_evict)
2824 arc_freed(spa, &bp);
2825 }
2826 } else {
2827 mutex_exit(&db->db_mtx);
2828 }
2829
2830 }
2831
2832 #pragma weak dmu_buf_refcount = dbuf_refcount
2833 uint64_t
2834 dbuf_refcount(dmu_buf_impl_t *db)
2835 {
2836 return (refcount_count(&db->db_holds));
2837 }
2838
2839 void *
2840 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2841 dmu_buf_user_t *new_user)
2983 /* Read the block if it hasn't been read yet. */
2984 if (db->db_buf == NULL) {
2985 mutex_exit(&db->db_mtx);
2986 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2987 mutex_enter(&db->db_mtx);
2988 }
2989 ASSERT3U(db->db_state, ==, DB_CACHED);
2990 ASSERT(db->db_buf != NULL);
2991
2992 DB_DNODE_ENTER(db);
2993 dn = DB_DNODE(db);
2994 /* Indirect block size must match what the dnode thinks it is. */
2995 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2996 dbuf_check_blkptr(dn, db);
2997 DB_DNODE_EXIT(db);
2998
2999 /* Provide the pending dirty record to child dbufs */
3000 db->db_data_pending = dr;
3001
3002 mutex_exit(&db->db_mtx);
3003
3004 dbuf_write(dr, db->db_buf, tx);
3005
3006 zio = dr->dr_zio;
3007 mutex_enter(&dr->dt.di.dr_mtx);
3008 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
3009 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3010 mutex_exit(&dr->dt.di.dr_mtx);
3011 zio_nowait(zio);
3012 }
3013
3014 static void
3015 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3016 {
3017 arc_buf_t **datap = &dr->dt.dl.dr_data;
3018 dmu_buf_impl_t *db = dr->dr_dbuf;
3019 dnode_t *dn;
3020 objset_t *os;
3021 uint64_t txg = tx->tx_txg;
3022
3023 ASSERT(dmu_tx_is_syncing(tx));
3455 static void
3456 dbuf_write_override_done(zio_t *zio)
3457 {
3458 dbuf_dirty_record_t *dr = zio->io_private;
3459 dmu_buf_impl_t *db = dr->dr_dbuf;
3460 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3461
3462 mutex_enter(&db->db_mtx);
3463 if (!BP_EQUAL(zio->io_bp, obp)) {
3464 if (!BP_IS_HOLE(obp))
3465 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3466 arc_release(dr->dt.dl.dr_data, db);
3467 }
3468 mutex_exit(&db->db_mtx);
3469 dbuf_write_done(zio, NULL, db);
3470
3471 if (zio->io_abd != NULL)
3472 abd_put(zio->io_abd);
3473 }
3474
3475 typedef struct dbuf_remap_impl_callback_arg {
3476 objset_t *drica_os;
3477 uint64_t drica_blk_birth;
3478 dmu_tx_t *drica_tx;
3479 } dbuf_remap_impl_callback_arg_t;
3480
3481 static void
3482 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
3483 void *arg)
3484 {
3485 dbuf_remap_impl_callback_arg_t *drica = arg;
3486 objset_t *os = drica->drica_os;
3487 spa_t *spa = dmu_objset_spa(os);
3488 dmu_tx_t *tx = drica->drica_tx;
3489
3490 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
3491
3492 if (os == spa_meta_objset(spa)) {
3493 spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
3494 } else {
3495 dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
3496 size, drica->drica_blk_birth, tx);
3497 }
3498 }
3499
3500 static void
3501 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
3502 {
3503 blkptr_t bp_copy = *bp;
3504 spa_t *spa = dmu_objset_spa(dn->dn_objset);
3505 dbuf_remap_impl_callback_arg_t drica;
3506
3507 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
3508
3509 drica.drica_os = dn->dn_objset;
3510 drica.drica_blk_birth = bp->blk_birth;
3511 drica.drica_tx = tx;
3512 if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
3513 &drica)) {
3514 /*
3515 * The struct_rwlock prevents dbuf_read_impl() from
3516 * dereferencing the BP while we are changing it. To
3517 * avoid lock contention, only grab it when we are actually
3518 * changing the BP.
3519 */
3520 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
3521 *bp = bp_copy;
3522 rw_exit(&dn->dn_struct_rwlock);
3523 }
3524 }
3525
3526 /*
3527 * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
3528 * to remap a copy of every bp in the dbuf.
3529 */
3530 boolean_t
3531 dbuf_can_remap(const dmu_buf_impl_t *db)
3532 {
3533 spa_t *spa = dmu_objset_spa(db->db_objset);
3534 blkptr_t *bp = db->db.db_data;
3535 boolean_t ret = B_FALSE;
3536
3537 ASSERT3U(db->db_level, >, 0);
3538 ASSERT3S(db->db_state, ==, DB_CACHED);
3539
3540 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
3541
3542 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3543 for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
3544 blkptr_t bp_copy = bp[i];
3545 if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
3546 ret = B_TRUE;
3547 break;
3548 }
3549 }
3550 spa_config_exit(spa, SCL_VDEV, FTAG);
3551
3552 return (ret);
3553 }
3554
3555 boolean_t
3556 dnode_needs_remap(const dnode_t *dn)
3557 {
3558 spa_t *spa = dmu_objset_spa(dn->dn_objset);
3559 boolean_t ret = B_FALSE;
3560
3561 if (dn->dn_phys->dn_nlevels == 0) {
3562 return (B_FALSE);
3563 }
3564
3565 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
3566
3567 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3568 for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
3569 blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
3570 if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
3571 ret = B_TRUE;
3572 break;
3573 }
3574 }
3575 spa_config_exit(spa, SCL_VDEV, FTAG);
3576
3577 return (ret);
3578 }
3579
3580 /*
3581 * Remap any existing BP's to concrete vdevs, if possible.
3582 */
3583 static void
3584 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
3585 {
3586 spa_t *spa = dmu_objset_spa(db->db_objset);
3587 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
3588
3589 if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
3590 return;
3591
3592 if (db->db_level > 0) {
3593 blkptr_t *bp = db->db.db_data;
3594 for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
3595 dbuf_remap_impl(dn, &bp[i], tx);
3596 }
3597 } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
3598 dnode_phys_t *dnp = db->db.db_data;
3599 ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
3600 DMU_OT_DNODE);
3601 for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
3602 for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
3603 dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
3604 }
3605 }
3606 }
3607 }
3608
3609
3610 /* Issue I/O to commit a dirty buffer to disk. */
3611 static void
3612 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3613 {
3614 dmu_buf_impl_t *db = dr->dr_dbuf;
3615 dnode_t *dn;
3616 objset_t *os;
3617 dmu_buf_impl_t *parent = db->db_parent;
3618 uint64_t txg = tx->tx_txg;
3619 zbookmark_phys_t zb;
3620 zio_prop_t zp;
3621 zio_t *zio;
3622 int wp_flag = 0;
3623
3624 ASSERT(dmu_tx_is_syncing(tx));
3625
3626 DB_DNODE_ENTER(db);
3627 dn = DB_DNODE(db);
3628 os = dn->dn_objset;
3629
3630 if (db->db_state != DB_NOFILL) {
3631 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3632 /*
3633 * Private object buffers are released here rather
3634 * than in dbuf_dirty() since they are only modified
3635 * in the syncing context and we don't want the
3636 * overhead of making multiple copies of the data.
3637 */
3638 if (BP_IS_HOLE(db->db_blkptr)) {
3639 arc_buf_thaw(data);
3640 } else {
3641 dbuf_release_bp(db);
3642 }
3643 dbuf_remap(dn, db, tx);
3644 }
3645 }
3646
3647 if (parent != dn->dn_dbuf) {
3648 /* Our parent is an indirect block. */
3649 /* We have a dirty parent that has been scheduled for write. */
3650 ASSERT(parent && parent->db_data_pending);
3651 /* Our parent's buffer is one level closer to the dnode. */
3652 ASSERT(db->db_level == parent->db_level-1);
3653 /*
3654 * We're about to modify our parent's db_data by modifying
3655 * our block pointer, so the parent must be released.
3656 */
3657 ASSERT(arc_released(parent->db_buf));
3658 zio = parent->db_data_pending->dr_zio;
3659 } else {
3660 /* Our parent is the dnode itself. */
3661 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3662 db->db_blkid != DMU_SPILL_BLKID) ||
3663 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3664 if (db->db_blkid != DMU_SPILL_BLKID)
3665 ASSERT3P(db->db_blkptr, ==,
3666 &dn->dn_phys->dn_blkptr[db->db_blkid]);
3667 zio = dn->dn_zio;
3668 }
3669
3670 ASSERT(db->db_level == 0 || data == db->db_buf);
3671 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3672 ASSERT(zio);
3673
3674 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3675 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3676 db->db.db_object, db->db_level, db->db_blkid);
3677
3678 if (db->db_blkid == DMU_SPILL_BLKID)
3679 wp_flag = WP_SPILL;
3680 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3681
3682 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3683 DB_DNODE_EXIT(db);
3684
3685 /*
3686 * We copy the blkptr now (rather than when we instantiate the dirty
3687 * record), because its value can change between open context and
3688 * syncing context. We do not need to hold dn_struct_rwlock to read
3689 * db_blkptr because we are in syncing context.
3690 */
3691 dr->dr_bp_copy = *db->db_blkptr;
3692
3693 if (db->db_level == 0 &&
3694 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3695 /*
3696 * The BP for this block has been provided by open context
3697 * (by dmu_sync() or dmu_buf_write_embedded()).
3698 */
3699 abd_t *contents = (data != NULL) ?
3700 abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
3701
3702 dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
3703 contents, db->db.db_size, db->db.db_size, &zp,
3704 dbuf_write_override_ready, NULL, NULL,
3705 dbuf_write_override_done,
3706 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3707 mutex_enter(&db->db_mtx);
3708 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3709 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3710 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3711 mutex_exit(&db->db_mtx);
3712 } else if (db->db_state == DB_NOFILL) {
3713 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3714 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3715 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3716 &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
3717 dbuf_write_nofill_ready, NULL, NULL,
3718 dbuf_write_nofill_done, db,
3719 ZIO_PRIORITY_ASYNC_WRITE,
3720 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
3721 } else {
3722 ASSERT(arc_released(data));
3723
3724 /*
3725 * For indirect blocks, we want to set up the children
3726 * ready callback so that we can properly handle an indirect
3727 * block that only contains holes.
3728 */
3729 arc_done_func_t *children_ready_cb = NULL;
3730 if (db->db_level != 0)
3731 children_ready_cb = dbuf_write_children_ready;
3732
3733 dr->dr_zio = arc_write(zio, os->os_spa, txg,
3734 &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
3735 &zp, dbuf_write_ready, children_ready_cb,
3736 dbuf_write_physdone, dbuf_write_done, db,
3737 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
3738 }
3739 }
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_send.h>
34 #include <sys/dmu_impl.h>
35 #include <sys/dbuf.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/dmu_tx.h>
40 #include <sys/spa.h>
41 #include <sys/spa_impl.h>
42 #include <sys/zio.h>
43 #include <sys/dmu_zfetch.h>
44 #include <sys/sa.h>
45 #include <sys/sa_impl.h>
46 #include <sys/zfeature.h>
47 #include <sys/blkptr.h>
48 #include <sys/range_tree.h>
49 #include <sys/callb.h>
50 #include <sys/abd.h>
51
52 uint_t zfs_dbuf_evict_key;
53
54 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
55 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
56
57 #ifndef __lint
58 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
59 dmu_buf_evict_func_t *evict_func_sync,
60 dmu_buf_evict_func_t *evict_func_async,
61 dmu_buf_t **clear_on_evict_dbufp);
62 #endif /* ! __lint */
63
64 /*
65 * Global data structures and functions for the dbuf cache.
66 */
67 static kmem_cache_t *dbuf_kmem_cache;
68 static taskq_t *dbu_evict_taskq;
69
70 static kthread_t *dbuf_cache_evict_thread;
71 static kmutex_t dbuf_evict_lock;
72 static kcondvar_t dbuf_evict_cv;
73 static boolean_t dbuf_evict_thread_exit;
74
75 /*
76 * There are two dbuf caches; each dbuf can only be in one of them at a time.
77 *
78 * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
79 * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
80 * that represent the metadata that describes filesystems/snapshots/
81 * bookmarks/properties/etc. We only evict from this cache when we export a
82 * pool, to short-circuit as much I/O as possible for all administrative
83 * commands that need the metadata. There is no eviction policy for this
84 * cache, because we try to only include types in it which would occupy a
85 * very small amount of space per object but create a large impact on the
86 * performance of these commands. Instead, after it reaches a maximum size
87 * (which should only happen on very small memory systems with a very large
88 * number of filesystem objects), we stop taking new dbufs into the
89 * metadata cache, instead putting them in the normal dbuf cache.
90 *
91 * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
92 * are not currently held but have been recently released. These dbufs
93 * are not eligible for arc eviction until they are aged out of the cache.
94 * Dbufs that are aged out of the cache will be immediately destroyed and
95 * become eligible for arc eviction.
96 *
97 * Dbufs are added to these caches once the last hold is released. If a dbuf is
98 * later accessed and still exists in the dbuf cache, then it will be removed
99 * from the cache and later re-added to the head of the cache.
100 *
101 * If a given dbuf meets the requirements for the metadata cache, it will go
102 * there, otherwise it will be considered for the generic LRU dbuf cache. The
103 * caches and the refcounts tracking their sizes are stored in an array indexed
104 * by those caches' matching enum values (from dbuf_cached_state_t).
105 */
106 typedef struct dbuf_cache {
107 multilist_t *cache;
108 refcount_t size;
109 } dbuf_cache_t;
110 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
111
112 /* Size limits for the caches */
113 uint64_t dbuf_cache_max_bytes = 0;
114 uint64_t dbuf_metadata_cache_max_bytes = 0;
115 /* Set the default sizes of the caches to a log2 fraction of the ARC size */
116 int dbuf_cache_shift = 5;
117 int dbuf_metadata_cache_shift = 6;
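/*
 * Illustrative arithmetic (assuming dbuf_init(), not shown in full in this
 * excerpt, derives the byte limits from these shifts when the *_max_bytes
 * tunables are left at 0): with a 4 GB ARC, the LRU dbuf cache would be
 * capped at 4 GB >> 5 = 128 MB and the metadata cache at 4 GB >> 6 = 64 MB.
 */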
118
119 /*
120 * For diagnostic purposes, this is incremented whenever we can't add
121 * something to the metadata cache because it's full, and instead put
122 * the data in the regular dbuf cache.
123 */
124 uint64_t dbuf_metadata_cache_overflow;
125
126 /*
127 * The LRU dbuf cache uses a three-stage eviction policy:
128 * - A low water marker designates when the dbuf eviction thread
129 * should stop evicting from the dbuf cache.
130 * - When we reach the maximum size (aka mid water mark), we
131 * signal the eviction thread to run.
132 * - The high water mark indicates when the eviction thread
133 * is unable to keep up with the incoming load and eviction must
134 * happen in the context of the calling thread.
135 *
136 * The dbuf cache:
137 * (max size)
138 * low water mid water hi water
139 * +----------------------------------------+----------+----------+
140 * | | | |
141 * | | | |
142 * | | | |
143 * | | | |
144 * +----------------------------------------+----------+----------+
145 * stop signal evict
146 * evicting eviction directly
147 * thread
180 multilist_link_init(&db->db_cache_link);
181 refcount_create(&db->db_holds);
182
183 return (0);
184 }
185
186 /* ARGSUSED */
187 static void
188 dbuf_dest(void *vdb, void *unused)
189 {
190 dmu_buf_impl_t *db = vdb;
191 mutex_destroy(&db->db_mtx);
192 cv_destroy(&db->db_changed);
193 ASSERT(!multilist_link_active(&db->db_cache_link));
194 refcount_destroy(&db->db_holds);
195 }
196
197 /*
198 * dbuf hash table routines
199 */
200 #pragma align 64(dbuf_hash_table)
201 static dbuf_hash_table_t dbuf_hash_table;
202
203 static uint64_t dbuf_hash_count;
204
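/*
 * Hash an (objset, object, level, blkid) tuple into the dbuf hash table:
 * the low-order bytes of each field are folded through the ZFS CRC-64
 * table and the higher-order bits are XORed in at the end.
 */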
205 static uint64_t
206 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
207 {
208 uintptr_t osv = (uintptr_t)os;
209 uint64_t crc = -1ULL;
210
211 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
212 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
213 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
214 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
215 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
216 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
217 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
218
219 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
220
221 return (crc);
222 }
223
224 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
225 ((dbuf)->db.db_object == (obj) && \
226 (dbuf)->db_objset == (os) && \
227 (dbuf)->db_level == (level) && \
228 (dbuf)->db_blkid == (blkid))
229
230 dmu_buf_impl_t *
231 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
232 {
233 dbuf_hash_table_t *h = &dbuf_hash_table;
234 uint64_t hv = dbuf_hash(os, obj, level, blkid);
235 uint64_t idx = hv & h->hash_table_mask;
236 dmu_buf_impl_t *db;
237
238 mutex_enter(DBUF_HASH_MUTEX(h, idx));
239 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
240 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
241 mutex_enter(&db->db_mtx);
419 dbu, 0, &dbu->dbu_tqent);
420 }
421 }
422
423 boolean_t
424 dbuf_is_metadata(dmu_buf_impl_t *db)
425 {
426 if (db->db_level > 0) {
427 return (B_TRUE);
428 } else {
429 boolean_t is_metadata;
430
431 DB_DNODE_ENTER(db);
432 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
433 DB_DNODE_EXIT(db);
434
435 return (is_metadata);
436 }
437 }
438
439 boolean_t
440 dbuf_is_ddt(dmu_buf_impl_t *db)
441 {
442 boolean_t is_ddt;
443
444 DB_DNODE_ENTER(db);
445 is_ddt = (DB_DNODE(db)->dn_type == DMU_OT_DDT_ZAP) ||
446 (DB_DNODE(db)->dn_type == DMU_OT_DDT_STATS);
447 DB_DNODE_EXIT(db);
448
449 return (is_ddt);
450 }
451
452 /*
453 * This returns whether this dbuf should be stored in the metadata cache, which
454 * is based on whether it's from one of the dnode types that store data related
455 * to traversing dataset hierarchies.
456 */
457 static boolean_t
458 dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
459 {
460 DB_DNODE_ENTER(db);
461 dmu_object_type_t type = DB_DNODE(db)->dn_type;
462 DB_DNODE_EXIT(db);
463
464 /* Check if this dbuf is one of the types we care about */
465 if (DMU_OT_IS_METADATA_CACHED(type)) {
466 /* If we hit this, then we set something up wrong in dmu_ot */
467 ASSERT(DMU_OT_IS_METADATA(type));
468
469 /*
470 * Sanity check for small-memory systems: don't allocate too
471 * much memory for this purpose.
472 */
473 if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
474 dbuf_metadata_cache_max_bytes) {
475 dbuf_metadata_cache_overflow++;
476 DTRACE_PROBE1(dbuf__metadata__cache__overflow,
477 dmu_buf_impl_t *, db);
478 return (B_FALSE);
479 }
480
481 return (B_TRUE);
482 }
483
484 return (B_FALSE);
485 }
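/*
 * Observability sketch (illustrative, not part of the original source):
 * the dbuf_metadata_cache_overflow counter and the SDT probe fired above
 * can show whether the metadata cache limit is too small for a workload,
 * e.g. with a DTrace one-liner such as:
 *
 *	dtrace -n 'sdt:::dbuf-metadata-cache-overflow { @ = count(); }'
 *
 * A steadily growing count indicates that dbufs which would otherwise be
 * metadata-cached are spilling into the regular LRU dbuf cache.
 */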
486
487 /*
488 * This function *must* return indices evenly distributed between all
489 * sublists of the multilist. This is needed due to how the dbuf eviction
490 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
491 * distributed between all sublists and uses this assumption when
492 * deciding which sublist to evict from and how much to evict from it.
493 */
494 unsigned int
495 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
496 {
497 dmu_buf_impl_t *db = obj;
498
499 /*
500 * The assumption here is that the hash value for a given
501 * dmu_buf_impl_t will remain constant throughout its lifetime
502 * (i.e. its objset, object, level and blkid fields don't change).
503 * Thus, we don't need to store the dbuf's sublist index
504 * on insertion, as this index can be recalculated on removal.
505 *
506 * Also, the low order bits of the hash value are thought to be
507 * distributed evenly. Otherwise, in the case that the multilist
508 * has a power of two number of sublists, each sublist's usage
509 * would not be evenly distributed.
510 */
511 return (dbuf_hash(db->db_objset, db->db.db_object,
512 db->db_level, db->db_blkid) %
513 multilist_get_num_sublists(ml));
514 }
515
516 static inline boolean_t
517 dbuf_cache_above_hiwater(void)
518 {
519 uint64_t dbuf_cache_hiwater_bytes =
520 (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
521
522 return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
523 dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
524 }
525
526 static inline boolean_t
527 dbuf_cache_above_lowater(void)
528 {
529 uint64_t dbuf_cache_lowater_bytes =
530 (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
531
532 return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
533 dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
534 }
535
536 /*
537 * Evict the oldest eligible dbuf from the dbuf cache.
538 */
539 static void
540 dbuf_evict_one(void)
541 {
542 int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
543 multilist_sublist_t *mls = multilist_sublist_lock(
544 dbuf_caches[DB_DBUF_CACHE].cache, idx);
545
546 ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
547
548 /*
549 * Set the thread's tsd to indicate that it's processing evictions.
550 * Once a thread stops evicting from the dbuf cache it will
551 * reset its tsd to NULL.
552 */
553 ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
554 (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
555
556 dmu_buf_impl_t *db = multilist_sublist_tail(mls);
557 while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
558 db = multilist_sublist_prev(mls, db);
559 }
560
561 DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
562 multilist_sublist_t *, mls);
563
564 if (db != NULL) {
565 multilist_sublist_remove(mls, db);
566 multilist_sublist_unlock(mls);
567 (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
568 db->db.db_size, db);
569 ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
570 db->db_caching_status = DB_NO_CACHE;
571 dbuf_destroy(db);
572 } else {
573 multilist_sublist_unlock(mls);
574 }
575 (void) tsd_set(zfs_dbuf_evict_key, NULL);
576 }
577
578 /*
579 * The dbuf evict thread is responsible for aging out dbufs from the
580 * cache. Once the cache has reached its maximum size, dbufs are removed
581 * and destroyed. The eviction thread will continue running until the size
582 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
583 * out of the cache it is destroyed and becomes eligible for arc eviction.
584 */
585 /* ARGSUSED */
586 static void
587 dbuf_evict_thread(void *unused)
588 {
589 callb_cpr_t cpr;
590
603 /*
604 * Keep evicting as long as we're above the low water mark
605 * for the cache. We do this without holding the locks to
606 * minimize lock contention.
607 */
608 while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
609 dbuf_evict_one();
610 }
611
612 mutex_enter(&dbuf_evict_lock);
613 }
614
615 dbuf_evict_thread_exit = B_FALSE;
616 cv_broadcast(&dbuf_evict_cv);
617 CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
618 thread_exit();
619 }
620
621 /*
622 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
623 *
624	 * Direct eviction (dbuf_evict_one()) is not called here because
625	 * dbuf_evict_one() does not care which dbuf it selects, so the
626	 * following scenario is possible and would cause a deadlock panic:
627	 *
628	 * Thread A is evicting dbufs that are related to dnodeA
629	 * dnode_evict_dbufs(dnodeA) enters dn_dbufs_mtx and after that walks
630	 * its own AVL of dbufs and calls dbuf_destroy():
631	 * dbuf_destroy() ->...-> dbuf_evict_notify() -> dbuf_evict_one() ->
632	 * -> select a dbuf from cache -> dbuf_destroy() ->
633	 * -> mutex_enter(dn_dbufs_mtx of dnodeB)
634	 *
635	 * Thread B is evicting dbufs that are related to dnodeB
636	 * dnode_evict_dbufs(dnodeB) enters dn_dbufs_mtx and after that walks
637	 * its own AVL of dbufs and calls dbuf_destroy():
638	 * dbuf_destroy() ->...-> dbuf_evict_notify() -> dbuf_evict_one() ->
639	 * -> select a dbuf from cache -> dbuf_destroy() ->
640	 * -> mutex_enter(dn_dbufs_mtx of dnodeA)
641 */
642 static void
643 dbuf_evict_notify(void)
644 {
646 /*
647 * We use thread specific data to track when a thread has
648 * started processing evictions. This allows us to avoid deeply
649 * nested stacks that would have a call flow similar to this:
650 *
651 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
652 * ^ |
653 * | |
654 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
655 *
656	 * The dbuf eviction thread will always have its tsd set until
657	 * that thread exits. All other threads will only set their tsd
658	 * if they are participating in the eviction process. This only
659	 * happens if the eviction thread is unable to process evictions
660	 * fast enough. To keep the dbuf cache size in check, other threads
661	 * can evict from the dbuf cache directly. Those threads set their
662	 * tsd values so that they evict at most one dbuf at a time from
663	 * the dbuf cache.
664 */
665 if (tsd_get(zfs_dbuf_evict_key) != NULL)
666 return;
667
668 /*
669 * We check if we should evict without holding the dbuf_evict_lock,
670 * because it's OK to occasionally make the wrong decision here,
671 * and grabbing the lock results in massive lock contention.
672 */
673 if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
674 dbuf_cache_max_bytes) {
675 if (dbuf_cache_above_hiwater())
676 dbuf_evict_one();
677 cv_signal(&dbuf_evict_cv);
678 }
679 }
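/*
 * Simplified sketch of the tsd-based recursion guard described above
 * (illustrative only; the real checks live in dbuf_evict_notify() and
 * dbuf_evict_one()):
 *
 *	if (tsd_get(zfs_dbuf_evict_key) != NULL)
 *		return;				(this thread is already evicting)
 *	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
 *	dbuf_evict_one();			(may recurse into dbuf_evict_notify(),
 *						 which now returns early)
 *	(void) tsd_set(zfs_dbuf_evict_key, NULL);
 */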
680
681 void
682 dbuf_init(void)
683 {
684 uint64_t hsize = 1ULL << 16;
685 dbuf_hash_table_t *h = &dbuf_hash_table;
686 int i;
687
688 /*
689 * The hash table is big enough to fill all of physical memory
690 * with an average 4K block size. The table will take up
691 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
692 */
693 while (hsize * 4096 < physmem * PAGESIZE)
694 hsize <<= 1;
695
696 retry:
697 h->hash_table_mask = hsize - 1;
698 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
699 if (h->hash_table == NULL) {
700 /* XXX - we should really return an error instead of assert */
701 ASSERT(hsize > (1ULL << 10));
702 hsize >>= 1;
703 goto retry;
704 }
705
706 dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
707 sizeof (dmu_buf_impl_t),
708 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
709
710 for (i = 0; i < DBUF_MUTEXES; i++)
711 mutex_init(DBUF_HASH_MUTEX(h, i), NULL, MUTEX_DEFAULT, NULL);
712
714 /*
715	 * Set up the parameters for the dbuf caches. We set the sizes of the
716 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
717 * of the size of the ARC, respectively.
718 */
719 if (dbuf_cache_max_bytes == 0 ||
720 dbuf_cache_max_bytes >= arc_max_bytes()) {
721 dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
722 }
723 if (dbuf_metadata_cache_max_bytes == 0 ||
724 dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
725 dbuf_metadata_cache_max_bytes =
726 arc_max_bytes() >> dbuf_metadata_cache_shift;
727 }
728
729 /*
730	 * The combined size of both caches should be less than
731	 * the size of the ARC; otherwise, reset both to their
732	 * default values.
733	 *
734	 * Dividing by 2 is simple overflow protection.
735 */
736 if (((dbuf_cache_max_bytes / 2) +
737 (dbuf_metadata_cache_max_bytes / 2)) >= (arc_max_bytes() / 2)) {
738 dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
739 dbuf_metadata_cache_max_bytes =
740 arc_max_bytes() >> dbuf_metadata_cache_shift;
741 }
742
744 /*
745 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
746 * configuration is not required.
747 */
748 dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
749
750 for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
751 dbuf_caches[dcs].cache =
752 multilist_create(sizeof (dmu_buf_impl_t),
753 offsetof(dmu_buf_impl_t, db_cache_link),
754 dbuf_cache_multilist_index_func);
755 refcount_create(&dbuf_caches[dcs].size);
756 }
757
758 tsd_create(&zfs_dbuf_evict_key, NULL);
759 dbuf_evict_thread_exit = B_FALSE;
760 mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
761 cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
762 dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
763 NULL, 0, &p0, TS_RUN, minclsyspri);
764 }
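/*
 * Worked example of the sizing above (illustrative; assumes
 * dbuf_cache_shift == 5, i.e. 1/32nd of the ARC as described in the comment):
 * with arc_max_bytes() == 4 GB, dbuf_cache_max_bytes defaults to
 * 4 GB >> 5 == 128 MB.  dbuf_metadata_cache_max_bytes is derived the same
 * way from arc_max_bytes() >> dbuf_metadata_cache_shift.
 */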
765
766 void
767 dbuf_fini(void)
768 {
769 dbuf_hash_table_t *h = &dbuf_hash_table;
770 int i;
771
772 for (i = 0; i < DBUF_MUTEXES; i++)
773 mutex_destroy(DBUF_HASH_MUTEX(h, i));
774 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
775 kmem_cache_destroy(dbuf_kmem_cache);
776 taskq_destroy(dbu_evict_taskq);
777
778 mutex_enter(&dbuf_evict_lock);
779 dbuf_evict_thread_exit = B_TRUE;
780 while (dbuf_evict_thread_exit) {
781 cv_signal(&dbuf_evict_cv);
782 cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
783 }
784 mutex_exit(&dbuf_evict_lock);
785 tsd_destroy(&zfs_dbuf_evict_key);
786
787 mutex_destroy(&dbuf_evict_lock);
788 cv_destroy(&dbuf_evict_cv);
789
790 for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
791 refcount_destroy(&dbuf_caches[dcs].size);
792 multilist_destroy(dbuf_caches[dcs].cache);
793 }
794 }
795
796 /*
797 * Other stuff.
798 */
799
800 #ifdef ZFS_DEBUG
801 static void
802 dbuf_verify(dmu_buf_impl_t *db)
803 {
804 dnode_t *dn;
805 dbuf_dirty_record_t *dr;
806
807 ASSERT(MUTEX_HELD(&db->db_mtx));
808
809 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
810 return;
811
812 ASSERT(db->db_objset != NULL);
813 DB_DNODE_ENTER(db);
1537 }
1538
1539 void
1540 dbuf_release_bp(dmu_buf_impl_t *db)
1541 {
1542 objset_t *os = db->db_objset;
1543
1544 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1545 ASSERT(arc_released(os->os_phys_buf) ||
1546 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1547 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1548
1549 (void) arc_release(db->db_buf, db);
1550 }
1551
1552 /*
1553 * We already have a dirty record for this TXG, and we are being
1554 * dirtied again.
1555 */
1556 static void
1557 dbuf_redirty(dbuf_dirty_record_t *dr, boolean_t usesc)
1558 {
1559 dmu_buf_impl_t *db = dr->dr_dbuf;
1560
1561 ASSERT(MUTEX_HELD(&db->db_mtx));
1562
1563 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1564 /*
1565 * If this buffer has already been written out,
1566 * we now need to reset its state.
1567 */
1568 dbuf_unoverride(dr);
1569 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1570 db->db_state != DB_NOFILL) {
1571 /* Already released on initial dirty, so just thaw. */
1572 ASSERT(arc_released(db->db_buf));
1573 arc_buf_thaw(db->db_buf);
1574 }
1575 }
1576 /*
1577	 * The special class usage of this dirty dbuf may have changed,
1578	 * so update the dirty record.
1579 */
1580 dr->dr_usesc = usesc;
1581 }
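/*
 * Illustrative scenario (sketch): dirtying the same dbuf twice within one
 * open-context transaction takes the dbuf_redirty() path on the second call,
 * since a dirty record for this txg already exists:
 *
 *	dmu_buf_will_dirty(&db->db, tx);	(first call: dbuf_dirty_sc())
 *	dmu_buf_will_dirty(&db->db, tx);	(second call: dbuf_redirty())
 */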
1582
1583 dbuf_dirty_record_t *
1584 dbuf_dirty_sc(dmu_buf_impl_t *db, dmu_tx_t *tx, boolean_t usesc)
1585 {
1586 dnode_t *dn;
1587 objset_t *os;
1588 dbuf_dirty_record_t **drp, *dr;
1589 int drop_struct_lock = FALSE;
1590 int txgoff = tx->tx_txg & TXG_MASK;
1591
1592 ASSERT(tx->tx_txg != 0);
1593 ASSERT(!refcount_is_zero(&db->db_holds));
1594 DMU_TX_DIRTY_BUF(tx, db);
1595
1596 DB_DNODE_ENTER(db);
1597 dn = DB_DNODE(db);
1598 /*
1599 * Shouldn't dirty a regular buffer in syncing context. Private
1600 * objects may be dirtied in syncing context, but only if they
1601 * were already pre-dirtied in open context.
1602 */
1603 #ifdef DEBUG
1604 if (dn->dn_objset->os_dsl_dataset != NULL) {
1651 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1652 FTAG);
1653 }
1654 }
1655 mutex_exit(&dn->dn_mtx);
1656
1657 if (db->db_blkid == DMU_SPILL_BLKID)
1658 dn->dn_have_spill = B_TRUE;
1659
1660 /*
1661 * If this buffer is already dirty, we're done.
1662 */
1663 drp = &db->db_last_dirty;
1664 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1665 db->db.db_object == DMU_META_DNODE_OBJECT);
1666 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1667 drp = &dr->dr_next;
1668 if (dr && dr->dr_txg == tx->tx_txg) {
1669 DB_DNODE_EXIT(db);
1670
1671 dbuf_redirty(dr, usesc);
1672 mutex_exit(&db->db_mtx);
1673 return (dr);
1674 }
1675
1676 /*
1677 * Only valid if not already dirty.
1678 */
1679 ASSERT(dn->dn_object == 0 ||
1680 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1681 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1682
1683 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1684
1685 /*
1686 * We should only be dirtying in syncing context if it's the
1687 * mos or we're initializing the os or it's a special object.
1688 * However, we are allowed to dirty in syncing context provided
1689 * we already dirtied it in open context. Hence we must make
1690 * this assertion only if we're not already dirty.
1691 */
1731 * then).
1732 */
1733 arc_release(db->db_buf, db);
1734 dbuf_fix_old_data(db, tx->tx_txg);
1735 data_old = db->db_buf;
1736 }
1737 ASSERT(data_old != NULL);
1738 }
1739 dr->dt.dl.dr_data = data_old;
1740 } else {
1741 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1742 list_create(&dr->dt.di.dr_children,
1743 sizeof (dbuf_dirty_record_t),
1744 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1745 }
1746 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1747 dr->dr_accounted = db->db.db_size;
1748 dr->dr_dbuf = db;
1749 dr->dr_txg = tx->tx_txg;
1750 dr->dr_next = *drp;
1751 dr->dr_usesc = usesc;
1752 *drp = dr;
1753
1754 /*
1755 * We could have been freed_in_flight between the dbuf_noread
1756 * and dbuf_dirty. We win, as though the dbuf_noread() had
1757 * happened after the free.
1758 */
1759 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1760 db->db_blkid != DMU_SPILL_BLKID) {
1761 mutex_enter(&dn->dn_mtx);
1762 if (dn->dn_free_ranges[txgoff] != NULL) {
1763 range_tree_clear(dn->dn_free_ranges[txgoff],
1764 db->db_blkid, 1);
1765 }
1766 mutex_exit(&dn->dn_mtx);
1767 db->db_freed_in_flight = FALSE;
1768 }
1769
1770 /*
1771 * This buffer is now part of this txg
1772 */
1773 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1774 db->db_dirtycnt += 1;
1775 ASSERT3U(db->db_dirtycnt, <=, 3);
1776
1777 mutex_exit(&db->db_mtx);
1778
1779 if (db->db_blkid == DMU_BONUS_BLKID ||
1780 db->db_blkid == DMU_SPILL_BLKID) {
1781 mutex_enter(&dn->dn_mtx);
1782 ASSERT(!list_link_active(&dr->dr_dirty_node));
1783 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1784 mutex_exit(&dn->dn_mtx);
1785 dnode_setdirty_sc(dn, tx, usesc);
1786 DB_DNODE_EXIT(db);
1787 return (dr);
1788 }
1789
1790 /*
1791 * The dn_struct_rwlock prevents db_blkptr from changing
1792 * due to a write from syncing context completing
1793 * while we are running, so we want to acquire it before
1794 * looking at db_blkptr.
1795 */
1796 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1797 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1798 drop_struct_lock = TRUE;
1799 }
1800
1801 /*
1802 * We need to hold the dn_struct_rwlock to make this assertion,
1803 * because it protects dn_phys / dn_next_nlevels from changing.
1804 */
1805 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1806 dn->dn_phys->dn_nlevels > db->db_level ||
1807 dn->dn_next_nlevels[txgoff] > db->db_level ||
1808 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1809 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1810
1811 /*
1812 * If we are overwriting a dedup BP, then unless it is snapshotted,
1813 * when we get to syncing context we will need to decrement its
1814 * refcount in the DDT. Prefetch the relevant DDT block so that
1815 * syncing context won't have to wait for the i/o.
1816 */
1817 ddt_prefetch(os->os_spa, db->db_blkptr);
1818
1819 if (db->db_level == 0) {
1820 dnode_new_blkid(dn, db->db_blkid, tx, usesc, drop_struct_lock);
1821 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1822 }
1823
1824 if (db->db_level+1 < dn->dn_nlevels) {
1825 dmu_buf_impl_t *parent = db->db_parent;
1826 dbuf_dirty_record_t *di;
1827 int parent_held = FALSE;
1828
1829 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1830 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1831
1832 parent = dbuf_hold_level(dn, db->db_level+1,
1833 db->db_blkid >> epbs, FTAG);
1834 ASSERT(parent != NULL);
1835 parent_held = TRUE;
1836 }
1837 if (drop_struct_lock)
1838 rw_exit(&dn->dn_struct_rwlock);
1839 ASSERT3U(db->db_level+1, ==, parent->db_level);
1840 di = dbuf_dirty_sc(parent, tx, usesc);
1841 if (parent_held)
1842 dbuf_rele(parent, FTAG);
1843
1844 mutex_enter(&db->db_mtx);
1845 /*
1846 * Since we've dropped the mutex, it's possible that
1847 * dbuf_undirty() might have changed this out from under us.
1848 */
1849 if (db->db_last_dirty == dr ||
1850 dn->dn_object == DMU_META_DNODE_OBJECT) {
1851 mutex_enter(&di->dt.di.dr_mtx);
1852 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1853 ASSERT(!list_link_active(&dr->dr_dirty_node));
1854 list_insert_tail(&di->dt.di.dr_children, dr);
1855 mutex_exit(&di->dt.di.dr_mtx);
1856 dr->dr_parent = di;
1857 }
1858
1859 /*
1860	 * The special class usage of this dirty dbuf may have changed,
1861	 * so update the dirty record.
1862 */
1863 dr->dr_usesc = usesc;
1864 mutex_exit(&db->db_mtx);
1865 } else {
1866 ASSERT(db->db_level+1 == dn->dn_nlevels);
1867 ASSERT(db->db_blkid < dn->dn_nblkptr);
1868 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1869 mutex_enter(&dn->dn_mtx);
1870 ASSERT(!list_link_active(&dr->dr_dirty_node));
1871 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1872 mutex_exit(&dn->dn_mtx);
1873 if (drop_struct_lock)
1874 rw_exit(&dn->dn_struct_rwlock);
1875 }
1876
1877 dnode_setdirty_sc(dn, tx, usesc);
1878 DB_DNODE_EXIT(db);
1879 return (dr);
1880 }
1881
1882 dbuf_dirty_record_t *
1883 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1884 {
1885 spa_t *spa;
1886
1887 ASSERT(db->db_objset != NULL);
1888 spa = db->db_objset->os_spa;
1889
1890 return (dbuf_dirty_sc(db, tx, spa->spa_usesc));
1891 }
1892
1893 /*
1894 * Undirty a buffer in the transaction group referenced by the given
1895 * transaction. Return whether this evicted the dbuf.
1896 */
1897 static boolean_t
1898 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1899 {
1900 dnode_t *dn;
1901 uint64_t txg = tx->tx_txg;
1902 dbuf_dirty_record_t *dr, **drp;
1903
1904 ASSERT(txg != 0);
1905
1906 /*
1907 * Due to our use of dn_nlevels below, this can only be called
1908 * in open context, unless we are operating on the MOS.
1909 * From syncing context, dn_nlevels may be different from the
1910 * dn_nlevels used when dbuf was dirtied.
1911 */
1912 ASSERT(db->db_objset ==
1968 }
1969
1970 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1971
1972 ASSERT(db->db_dirtycnt > 0);
1973 db->db_dirtycnt -= 1;
1974
1975 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1976 ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
1977 dbuf_destroy(db);
1978 return (B_TRUE);
1979 }
1980
1981 return (B_FALSE);
1982 }
1983
1984 void
1985 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
1986 {
1987 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1988 spa_t *spa = db->db_objset->os_spa;
1989 dmu_buf_will_dirty_sc(db_fake, tx, spa->spa_usesc);
1990 }
1991
1992 void
1993 dmu_buf_will_dirty_sc(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t usesc)
1994 {
1995 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1996 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1997
1998 ASSERT(tx->tx_txg != 0);
1999 ASSERT(!refcount_is_zero(&db->db_holds));
2000
2001 /*
2002	 * Quick check for dirtiness. For already dirty blocks, this
2003 * reduces runtime of this function by >90%, and overall performance
2004 * by 50% for some workloads (e.g. file deletion with indirect blocks
2005 * cached).
2006 */
2007 mutex_enter(&db->db_mtx);
2008 dbuf_dirty_record_t *dr;
2009 for (dr = db->db_last_dirty;
2010 dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
2011 /*
2012 * It's possible that it is already dirty but not cached,
2013 * because there are some calls to dbuf_dirty() that don't
2014 * go through dmu_buf_will_dirty().
2015 */
2016 if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
2017 /* This dbuf is already dirty and cached. */
2018 dbuf_redirty(dr, usesc);
2019 mutex_exit(&db->db_mtx);
2020 return;
2021 }
2022 }
2023 mutex_exit(&db->db_mtx);
2024
2025 DB_DNODE_ENTER(db);
2026 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2027 rf |= DB_RF_HAVESTRUCT;
2028 DB_DNODE_EXIT(db);
2029 (void) dbuf_read(db, NULL, rf);
2030 (void) dbuf_dirty_sc(db, tx, usesc);
2031 }
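/*
 * Typical open-context usage (illustrative sketch; 'os', 'object', 'offset',
 * 'size' and 'db' stand in for a caller's objset, object number, byte range
 * and held buffer): the buffer is dirtied inside an assigned transaction
 * before its data is modified:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		dmu_buf_will_dirty(db, tx);
 *		... modify db->db_data ...
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */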
2032
2034 void
2035 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2036 {
2037 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2038
2039 db->db_state = DB_NOFILL;
2040
2041 dmu_buf_will_fill(db_fake, tx);
2042 }
2043
2044 void
2045 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2046 {
2047 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2048
2049 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2050 ASSERT(tx->tx_txg != 0);
2051 ASSERT(db->db_level == 0);
2052 ASSERT(!refcount_is_zero(&db->db_holds));
2053
2188 dmu_buf_impl_t *dndb;
2189
2190 ASSERT(MUTEX_HELD(&db->db_mtx));
2191 ASSERT(refcount_is_zero(&db->db_holds));
2192
2193 if (db->db_buf != NULL) {
2194 arc_buf_destroy(db->db_buf, db);
2195 db->db_buf = NULL;
2196 }
2197
2198 if (db->db_blkid == DMU_BONUS_BLKID) {
2199 ASSERT(db->db.db_data != NULL);
2200 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
2201 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2202 db->db_state = DB_UNCACHED;
2203 }
2204
2205 dbuf_clear_data(db);
2206
2207 if (multilist_link_active(&db->db_cache_link)) {
2208 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2209 db->db_caching_status == DB_DBUF_METADATA_CACHE);
2210
2211 multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
2212 (void) refcount_remove_many(
2213 &dbuf_caches[db->db_caching_status].size,
2214 db->db.db_size, db);
2215
2216 db->db_caching_status = DB_NO_CACHE;
2217 }
2218
2219 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2220 ASSERT(db->db_data_pending == NULL);
2221
2222 db->db_state = DB_EVICTING;
2223 db->db_blkptr = NULL;
2224
2225 /*
2226 * Now that db_state is DB_EVICTING, nobody else can find this via
2227 * the hash table. We can now drop db_mtx, which allows us to
2228 * acquire the dn_dbufs_mtx.
2229 */
2230 mutex_exit(&db->db_mtx);
2231
2232 DB_DNODE_ENTER(db);
2233 dn = DB_DNODE(db);
2234 dndb = dn->dn_dbuf;
2235 if (db->db_blkid != DMU_BONUS_BLKID) {
2236 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2250 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2251 * release any lock.
2252 */
2253 dnode_rele(dn, db);
2254 db->db_dnode_handle = NULL;
2255
2256 dbuf_hash_remove(db);
2257 } else {
2258 DB_DNODE_EXIT(db);
2259 }
2260
2261 ASSERT(refcount_is_zero(&db->db_holds));
2262
2263 db->db_parent = NULL;
2264
2265 ASSERT(db->db_buf == NULL);
2266 ASSERT(db->db.db_data == NULL);
2267 ASSERT(db->db_hash_next == NULL);
2268 ASSERT(db->db_blkptr == NULL);
2269 ASSERT(db->db_data_pending == NULL);
2270 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
2271 ASSERT(!multilist_link_active(&db->db_cache_link));
2272
2273 kmem_cache_free(dbuf_kmem_cache, db);
2274 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2275
2276 /*
2277 * If this dbuf is referenced from an indirect dbuf,
2278 * decrement the ref count on the indirect dbuf.
2279 */
2280 if (parent && parent != dndb)
2281 dbuf_rele(parent, db);
2282 }
2283
2284 /*
2285 * Note: While bpp will always be updated if the function returns success,
2286 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2287 * this happens when the dnode is the meta-dnode, or a userused or groupused
2288 * object.
2289 */
2290 static int
2389 db->db_level = level;
2390 db->db_blkid = blkid;
2391 db->db_last_dirty = NULL;
2392 db->db_dirtycnt = 0;
2393 db->db_dnode_handle = dn->dn_handle;
2394 db->db_parent = parent;
2395 db->db_blkptr = blkptr;
2396
2397 db->db_user = NULL;
2398 db->db_user_immediate_evict = FALSE;
2399 db->db_freed_in_flight = FALSE;
2400 db->db_pending_evict = FALSE;
2401
2402 if (blkid == DMU_BONUS_BLKID) {
2403 ASSERT3P(parent, ==, dn->dn_dbuf);
2404 db->db.db_size = DN_MAX_BONUSLEN -
2405 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
2406 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
2407 db->db.db_offset = DMU_BONUS_BLKID;
2408 db->db_state = DB_UNCACHED;
2409 db->db_caching_status = DB_NO_CACHE;
2410 /* the bonus dbuf is not placed in the hash table */
2411 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2412 return (db);
2413 } else if (blkid == DMU_SPILL_BLKID) {
2414 db->db.db_size = (blkptr != NULL) ?
2415 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
2416 db->db.db_offset = 0;
2417 } else {
2418 int blocksize =
2419 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
2420 db->db.db_size = blocksize;
2421 db->db.db_offset = db->db_blkid * blocksize;
2422 }
2423
2424 /*
2425	 * Hold the dn_dbufs_mtx while we insert the new dbuf into the
2426	 * hash table *and* add it to the dn_dbufs list.
2427	 * This prevents a possible deadlock with someone
2428	 * trying to look up this dbuf before it's added to the
2429 * dn_dbufs list.
2430 */
2431 mutex_enter(&dn->dn_dbufs_mtx);
2432 db->db_state = DB_EVICTING;
2433 if ((odb = dbuf_hash_insert(db)) != NULL) {
2434 /* someone else inserted it first */
2435 kmem_cache_free(dbuf_kmem_cache, db);
2436 mutex_exit(&dn->dn_dbufs_mtx);
2437 return (odb);
2438 }
2439 avl_add(&dn->dn_dbufs, db);
2440
2441 db->db_state = DB_UNCACHED;
2442 db->db_caching_status = DB_NO_CACHE;
2443 mutex_exit(&dn->dn_dbufs_mtx);
2444 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2445
2446 if (parent && parent != dn->dn_dbuf)
2447 dbuf_add_ref(parent, db);
2448
2449 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2450 refcount_count(&dn->dn_holds) > 0);
2451 (void) refcount_add(&dn->dn_holds, db);
2452 atomic_inc_32(&dn->dn_dbufs_count);
2453
2454 dprintf_dbuf(db, "db=%p\n", db);
2455
2456 return (db);
2457 }
2458
2459 typedef struct dbuf_prefetch_arg {
2460 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2461 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2462 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2729 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2730 if (fail_sparse) {
2731 if (err == 0 && bp && BP_IS_HOLE(bp))
2732 err = SET_ERROR(ENOENT);
2733 if (err) {
2734 if (parent)
2735 dbuf_rele(parent, NULL);
2736 return (err);
2737 }
2738 }
2739 if (err && err != ENOENT)
2740 return (err);
2741 db = dbuf_create(dn, level, blkid, parent, bp);
2742 }
2743
2744 if (fail_uncached && db->db_state != DB_CACHED) {
2745 mutex_exit(&db->db_mtx);
2746 return (SET_ERROR(ENOENT));
2747 }
2748
2749 if (db->db_buf != NULL) {
2750 arc_buf_access(db->db_buf);
2751 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2752 }
2753
2754 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2755
2756 /*
2757	 * If this buffer is currently syncing out, and we are
2758 * still referencing it from db_data, we need to make a copy
2759 * of it in case we decide we want to dirty it again in this txg.
2760 */
2761 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2762 dn->dn_object != DMU_META_DNODE_OBJECT &&
2763 db->db_state == DB_CACHED && db->db_data_pending) {
2764 dbuf_dirty_record_t *dr = db->db_data_pending;
2765
2766 if (dr->dt.dl.dr_data == db->db_buf) {
2767 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2768
2769 dbuf_set_data(db,
2770 arc_alloc_buf(dn->dn_objset->os_spa, db, type,
2771 db->db.db_size));
2772 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2773 db->db.db_size);
2774 }
2775 }
2776
2777 if (multilist_link_active(&db->db_cache_link)) {
2778 ASSERT(refcount_is_zero(&db->db_holds));
2779 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2780 db->db_caching_status == DB_DBUF_METADATA_CACHE);
2781
2782 multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
2783 (void) refcount_remove_many(
2784 &dbuf_caches[db->db_caching_status].size,
2785 db->db.db_size, db);
2786
2787 db->db_caching_status = DB_NO_CACHE;
2788 }
2789 (void) refcount_add(&db->db_holds, tag);
2790 DBUF_VERIFY(db);
2791 mutex_exit(&db->db_mtx);
2792
2793 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2794 if (parent)
2795 dbuf_rele(parent, NULL);
2796
2797 ASSERT3P(DB_DNODE(db), ==, dn);
2798 ASSERT3U(db->db_blkid, ==, blkid);
2799 ASSERT3U(db->db_level, ==, level);
2800 *dbp = db;
2801
2802 return (0);
2803 }
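/*
 * Illustrative hold/release pairing (sketch): a dbuf returned by dbuf_hold()
 * or dbuf_hold_impl() must eventually be released with dbuf_rele() using the
 * same tag:
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		... use db ...
 *		dbuf_rele(db, FTAG);
 *	}
 */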
2804
2805 dmu_buf_impl_t *
2806 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2807 {
2984 * This dbuf has anonymous data associated with it.
2985 */
2986 dbuf_destroy(db);
2987 } else {
2988 boolean_t do_arc_evict = B_FALSE;
2989 blkptr_t bp;
2990 spa_t *spa = dmu_objset_spa(db->db_objset);
2991
2992 if (!DBUF_IS_CACHEABLE(db) &&
2993 db->db_blkptr != NULL &&
2994 !BP_IS_HOLE(db->db_blkptr) &&
2995 !BP_IS_EMBEDDED(db->db_blkptr)) {
2996 do_arc_evict = B_TRUE;
2997 bp = *db->db_blkptr;
2998 }
2999
3000 if (!DBUF_IS_CACHEABLE(db) ||
3001 db->db_pending_evict) {
3002 dbuf_destroy(db);
3003 } else if (!multilist_link_active(&db->db_cache_link)) {
3004 ASSERT3U(db->db_caching_status, ==,
3005 DB_NO_CACHE);
3006
3007 dbuf_cached_state_t dcs =
3008 dbuf_include_in_metadata_cache(db) ?
3009 DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
3010 db->db_caching_status = dcs;
3011
3012 multilist_insert(dbuf_caches[dcs].cache, db);
3013 (void) refcount_add_many(&dbuf_caches[dcs].size,
3014 db->db.db_size, db);
3015 mutex_exit(&db->db_mtx);
3016
3017 if (db->db_caching_status == DB_DBUF_CACHE) {
3018 dbuf_evict_notify();
3019 }
3020 }
3021
3022 if (do_arc_evict)
3023 arc_freed(spa, &bp);
3024 }
3025 } else {
3026 mutex_exit(&db->db_mtx);
3027 }
3028
3029 }
3030
3031 #pragma weak dmu_buf_refcount = dbuf_refcount
3032 uint64_t
3033 dbuf_refcount(dmu_buf_impl_t *db)
3034 {
3035 return (refcount_count(&db->db_holds));
3036 }
3037
3038 void *
3039 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
3040 dmu_buf_user_t *new_user)
3182 /* Read the block if it hasn't been read yet. */
3183 if (db->db_buf == NULL) {
3184 mutex_exit(&db->db_mtx);
3185 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
3186 mutex_enter(&db->db_mtx);
3187 }
3188 ASSERT3U(db->db_state, ==, DB_CACHED);
3189 ASSERT(db->db_buf != NULL);
3190
3191 DB_DNODE_ENTER(db);
3192 dn = DB_DNODE(db);
3193 /* Indirect block size must match what the dnode thinks it is. */
3194 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3195 dbuf_check_blkptr(dn, db);
3196 DB_DNODE_EXIT(db);
3197
3198 /* Provide the pending dirty record to child dbufs */
3199 db->db_data_pending = dr;
3200
3201 mutex_exit(&db->db_mtx);
3202 dbuf_write(dr, db->db_buf, tx);
3203
3204 zio = dr->dr_zio;
3205 mutex_enter(&dr->dt.di.dr_mtx);
3206 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
3207 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3208 mutex_exit(&dr->dt.di.dr_mtx);
3209 zio_nowait(zio);
3210 }
3211
3212 static void
3213 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3214 {
3215 arc_buf_t **datap = &dr->dt.dl.dr_data;
3216 dmu_buf_impl_t *db = dr->dr_dbuf;
3217 dnode_t *dn;
3218 objset_t *os;
3219 uint64_t txg = tx->tx_txg;
3220
3221 ASSERT(dmu_tx_is_syncing(tx));
3653 static void
3654 dbuf_write_override_done(zio_t *zio)
3655 {
3656 dbuf_dirty_record_t *dr = zio->io_private;
3657 dmu_buf_impl_t *db = dr->dr_dbuf;
3658 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
3659
3660 mutex_enter(&db->db_mtx);
3661 if (!BP_EQUAL(zio->io_bp, obp)) {
3662 if (!BP_IS_HOLE(obp))
3663 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
3664 arc_release(dr->dt.dl.dr_data, db);
3665 }
3666 mutex_exit(&db->db_mtx);
3667 dbuf_write_done(zio, NULL, db);
3668
3669 if (zio->io_abd != NULL)
3670 abd_put(zio->io_abd);
3671 }
3672
3673 /* Issue I/O to commit a dirty buffer to disk. */
3674 static void
3675 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
3676 {
3677 dmu_buf_impl_t *db = dr->dr_dbuf;
3678 dnode_t *dn;
3679 objset_t *os;
3680 dmu_buf_impl_t *parent = db->db_parent;
3681 uint64_t txg = tx->tx_txg;
3682 zbookmark_phys_t zb;
3683 zio_prop_t zp;
3684 zio_t *zio;
3685 int wp_flag = 0;
3686 zio_smartcomp_info_t sc;
3687
3688 ASSERT(dmu_tx_is_syncing(tx));
3689
3690 DB_DNODE_ENTER(db);
3691 dn = DB_DNODE(db);
3692 os = dn->dn_objset;
3693
3694 dnode_setup_zio_smartcomp(db, &sc);
3695
3696 if (db->db_state != DB_NOFILL) {
3697 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
3698 /*
3699 * Private object buffers are released here rather
3700 * than in dbuf_dirty() since they are only modified
3701 * in the syncing context and we don't want the
3702 * overhead of making multiple copies of the data.
3703 */
3704 if (BP_IS_HOLE(db->db_blkptr)) {
3705 arc_buf_thaw(data);
3706 } else {
3707 dbuf_release_bp(db);
3708 }
3709 }
3710 }
3711
3712 if (parent != dn->dn_dbuf) {
3713 /* Our parent is an indirect block. */
3714 /* We have a dirty parent that has been scheduled for write. */
3715 ASSERT(parent && parent->db_data_pending);
3716 /* Our parent's buffer is one level closer to the dnode. */
3717 ASSERT(db->db_level == parent->db_level-1);
3718 /*
3719 * We're about to modify our parent's db_data by modifying
3720 * our block pointer, so the parent must be released.
3721 */
3722 ASSERT(arc_released(parent->db_buf));
3723 zio = parent->db_data_pending->dr_zio;
3724 } else {
3725 /* Our parent is the dnode itself. */
3726 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
3727 db->db_blkid != DMU_SPILL_BLKID) ||
3728 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
3729 if (db->db_blkid != DMU_SPILL_BLKID)
3730 ASSERT3P(db->db_blkptr, ==,
3731 &dn->dn_phys->dn_blkptr[db->db_blkid]);
3732 zio = dn->dn_zio;
3733 }
3734
3735 ASSERT(db->db_level == 0 || data == db->db_buf);
3736 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
3737 ASSERT(zio);
3738
3739 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
3740 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
3741 db->db.db_object, db->db_level, db->db_blkid);
3742
3743 if (db->db_blkid == DMU_SPILL_BLKID)
3744 wp_flag = WP_SPILL;
3745 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
3746 WP_SET_SPECIALCLASS(wp_flag, dr->dr_usesc);
3747
3748 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
3749 DB_DNODE_EXIT(db);
3750
3751 /*
3752 * We copy the blkptr now (rather than when we instantiate the dirty
3753 * record), because its value can change between open context and
3754 * syncing context. We do not need to hold dn_struct_rwlock to read
3755 * db_blkptr because we are in syncing context.
3756 */
3757 dr->dr_bp_copy = *db->db_blkptr;
3758
3759 if (db->db_level == 0 &&
3760 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
3761 /*
3762 * The BP for this block has been provided by open context
3763 * (by dmu_sync() or dmu_buf_write_embedded()).
3764 */
3765 abd_t *contents = (data != NULL) ?
3766 abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
3767
3768 dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
3769 contents, db->db.db_size, db->db.db_size, &zp,
3770 dbuf_write_override_ready, NULL, NULL,
3771 dbuf_write_override_done,
3772 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb,
3773 &sc);
3774 mutex_enter(&db->db_mtx);
3775 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
3776 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
3777 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
3778 mutex_exit(&db->db_mtx);
3779 } else if (db->db_state == DB_NOFILL) {
3780 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
3781 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
3782 dr->dr_zio = zio_write(zio, os->os_spa, txg,
3783 &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
3784 dbuf_write_nofill_ready, NULL, NULL,
3785 dbuf_write_nofill_done, db,
3786 ZIO_PRIORITY_ASYNC_WRITE,
3787 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb, &sc);
3788 } else {
3789 ASSERT(arc_released(data));
3790
3791 /*
3792	 * For indirect blocks, we want to set up the children
3793 * ready callback so that we can properly handle an indirect
3794 * block that only contains holes.
3795 */
3796 arc_done_func_t *children_ready_cb = NULL;
3797 if (db->db_level != 0)
3798 children_ready_cb = dbuf_write_children_ready;
3799
3800 dr->dr_zio = arc_write(zio, os->os_spa, txg,
3801 &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
3802 &zp, dbuf_write_ready, children_ready_cb,
3803 dbuf_write_physdone, dbuf_write_done, db,
3804 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb, &sc);
3805 }
3806 }
|