3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/dbuf.h>
28 #include <sys/dnode.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dir.h>
34 #include <sys/dsl_dataset.h>
35 #include <sys/spa.h>
36 #include <sys/zio.h>
37 #include <sys/dmu_zfetch.h>
38
39 static int free_range_compar(const void *node1, const void *node2);
40
41 static kmem_cache_t *dnode_cache;
42 /*
43 * Define DNODE_STATS to turn on statistic gathering. By default, it is only
44 * turned on when DEBUG is also defined.
45 */
46 #ifdef DEBUG
47 #define DNODE_STATS
48 #endif /* DEBUG */
49
50 #ifdef DNODE_STATS
51 #define DNODE_STAT_ADD(stat) ((stat)++)
52 #else
53 #define DNODE_STAT_ADD(stat) /* nothing */
54 #endif /* DNODE_STATS */
55
56 static dnode_phys_t dnode_phys_zero;
57
58 int zfs_default_bs = SPA_MINBLOCKSHIFT;
59 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
60
73 cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
74
75 /*
76 * Every dbuf has a reference, and dropping a tracked reference is
77 * O(number of references), so don't track dn_holds.
78 */
79 refcount_create_untracked(&dn->dn_holds);
80 refcount_create(&dn->dn_tx_holds);
81 list_link_init(&dn->dn_link);
82
83 bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
84 bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
85 bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
86 bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
87 bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
88 bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
89 bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
90
91 for (i = 0; i < TXG_SIZE; i++) {
92 list_link_init(&dn->dn_dirty_link[i]);
93 avl_create(&dn->dn_ranges[i], free_range_compar,
94 sizeof (free_range_t),
95 offsetof(struct free_range, fr_node));
96 list_create(&dn->dn_dirty_records[i],
97 sizeof (dbuf_dirty_record_t),
98 offsetof(dbuf_dirty_record_t, dr_dirty_node));
99 }
100
101 dn->dn_allocated_txg = 0;
102 dn->dn_free_txg = 0;
103 dn->dn_assigned_txg = 0;
104 dn->dn_dirtyctx = 0;
105 dn->dn_dirtyctx_firstset = NULL;
106 dn->dn_bonus = NULL;
107 dn->dn_have_spill = B_FALSE;
108 dn->dn_zio = NULL;
109 dn->dn_oldused = 0;
110 dn->dn_oldflags = 0;
111 dn->dn_olduid = 0;
112 dn->dn_oldgid = 0;
113 dn->dn_newuid = 0;
114 dn->dn_newgid = 0;
115 dn->dn_id_flags = 0;
123 return (0);
124 }
125
126 /* ARGSUSED */
127 static void
128 dnode_dest(void *arg, void *unused)
129 {
130 int i;
131 dnode_t *dn = arg;
132
133 rw_destroy(&dn->dn_struct_rwlock);
134 mutex_destroy(&dn->dn_mtx);
135 mutex_destroy(&dn->dn_dbufs_mtx);
136 cv_destroy(&dn->dn_notxholds);
137 refcount_destroy(&dn->dn_holds);
138 refcount_destroy(&dn->dn_tx_holds);
139 ASSERT(!list_link_active(&dn->dn_link));
140
141 for (i = 0; i < TXG_SIZE; i++) {
142 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
143 avl_destroy(&dn->dn_ranges[i]);
144 list_destroy(&dn->dn_dirty_records[i]);
145 ASSERT0(dn->dn_next_nblkptr[i]);
146 ASSERT0(dn->dn_next_nlevels[i]);
147 ASSERT0(dn->dn_next_indblkshift[i]);
148 ASSERT0(dn->dn_next_bonustype[i]);
149 ASSERT0(dn->dn_rm_spillblk[i]);
150 ASSERT0(dn->dn_next_bonuslen[i]);
151 ASSERT0(dn->dn_next_blksz[i]);
152 }
153
154 ASSERT0(dn->dn_allocated_txg);
155 ASSERT0(dn->dn_free_txg);
156 ASSERT0(dn->dn_assigned_txg);
157 ASSERT0(dn->dn_dirtyctx);
158 ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
159 ASSERT3P(dn->dn_bonus, ==, NULL);
160 ASSERT(!dn->dn_have_spill);
161 ASSERT3P(dn->dn_zio, ==, NULL);
162 ASSERT0(dn->dn_oldused);
163 ASSERT0(dn->dn_oldflags);
296 byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
297
298 }
299
300 void
301 dnode_buf_byteswap(void *vbuf, size_t size)
302 {
303 dnode_phys_t *buf = vbuf;
304 int i;
305
306 ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
307 ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
308
309 size >>= DNODE_SHIFT;
310 for (i = 0; i < size; i++) {
311 dnode_byteswap(buf);
312 buf++;
313 }
314 }
315
316 static int
317 free_range_compar(const void *node1, const void *node2)
318 {
319 const free_range_t *rp1 = node1;
320 const free_range_t *rp2 = node2;
321
322 if (rp1->fr_blkid < rp2->fr_blkid)
323 return (-1);
324 else if (rp1->fr_blkid > rp2->fr_blkid)
325 return (1);
326 else return (0);
327 }
328
329 void
330 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
331 {
332 ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
333
334 dnode_setdirty(dn, tx);
335 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
336 ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
337 (dn->dn_nblkptr-1) * sizeof (blkptr_t));
338 dn->dn_bonuslen = newsize;
339 if (newsize == 0)
340 dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
341 else
342 dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
343 rw_exit(&dn->dn_struct_rwlock);
344 }
345
346 void
347 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
348 {
/*
 * Schedule removal of this dnode's spill block in the given transaction:
 * stamp the txg's dn_rm_spillblk slot with DN_KILL_SPILLBLK for syncing
 * context to act on, and clear the in-core have-spill flag.
 * Caller must hold dn_struct_rwlock as writer and a hold on the dnode.
 */
void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}
366
/*
 * Set the in-core data block size fields from size (in bytes).
 * size must be a multiple of SPA_MINBLOCKSIZE, within [SPA_MINBLOCKSIZE,
 * SPA_MAXBLOCKSIZE], and small enough that the sector count fits in the
 * on-disk dn_datablkszsec field.  dn_datablkshift is only meaningful for
 * power-of-two sizes and is set to 0 otherwise.
 */
static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	/* highbit(size - 1) == log2(size) when size is a power of two */
	dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
}
379
380 static dnode_t *
381 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
382 uint64_t object, dnode_handle_t *dnh)
383 {
384 dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
385
386 ASSERT(!POINTER_IS_VALID(dn->dn_objset));
387 dn->dn_moved = 0;
388
389 /*
390 * Defer setting dn_objset until the dnode is ready to be a candidate
391 * for the dnode_move() callback.
392 */
393 dn->dn_object = object;
394 dn->dn_dbuf = db;
395 dn->dn_handle = dnh;
396 dn->dn_phys = dnp;
397
513 ASSERT(DMU_OT_IS_VALID(bonustype));
514 ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
515 ASSERT(dn->dn_type == DMU_OT_NONE);
516 ASSERT0(dn->dn_maxblkid);
517 ASSERT0(dn->dn_allocated_txg);
518 ASSERT0(dn->dn_assigned_txg);
519 ASSERT(refcount_is_zero(&dn->dn_tx_holds));
520 ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
521 ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
522
523 for (i = 0; i < TXG_SIZE; i++) {
524 ASSERT0(dn->dn_next_nblkptr[i]);
525 ASSERT0(dn->dn_next_nlevels[i]);
526 ASSERT0(dn->dn_next_indblkshift[i]);
527 ASSERT0(dn->dn_next_bonuslen[i]);
528 ASSERT0(dn->dn_next_bonustype[i]);
529 ASSERT0(dn->dn_rm_spillblk[i]);
530 ASSERT0(dn->dn_next_blksz[i]);
531 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
532 ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
533 ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
534 }
535
536 dn->dn_type = ot;
537 dnode_setdblksz(dn, blocksize);
538 dn->dn_indblkshift = ibs;
539 dn->dn_nlevels = 1;
540 if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
541 dn->dn_nblkptr = 1;
542 else
543 dn->dn_nblkptr = 1 +
544 ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
545 dn->dn_bonustype = bonustype;
546 dn->dn_bonuslen = bonuslen;
547 dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
548 dn->dn_compress = ZIO_COMPRESS_INHERIT;
549 dn->dn_dirtyctx = 0;
550
551 dn->dn_free_txg = 0;
552 if (dn->dn_dirtyctx_firstset) {
553 kmem_free(dn->dn_dirtyctx_firstset, 1);
677 ndn->dn_datablksz = odn->dn_datablksz;
678 ndn->dn_maxblkid = odn->dn_maxblkid;
679 bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
680 sizeof (odn->dn_next_nblkptr));
681 bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
682 sizeof (odn->dn_next_nlevels));
683 bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
684 sizeof (odn->dn_next_indblkshift));
685 bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
686 sizeof (odn->dn_next_bonustype));
687 bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
688 sizeof (odn->dn_rm_spillblk));
689 bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
690 sizeof (odn->dn_next_bonuslen));
691 bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
692 sizeof (odn->dn_next_blksz));
693 for (i = 0; i < TXG_SIZE; i++) {
694 list_move_tail(&ndn->dn_dirty_records[i],
695 &odn->dn_dirty_records[i]);
696 }
697 bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
698 ndn->dn_allocated_txg = odn->dn_allocated_txg;
699 ndn->dn_free_txg = odn->dn_free_txg;
700 ndn->dn_assigned_txg = odn->dn_assigned_txg;
701 ndn->dn_dirtyctx = odn->dn_dirtyctx;
702 ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
703 ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
704 refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
705 ASSERT(list_is_empty(&ndn->dn_dbufs));
706 list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
707 ndn->dn_dbufs_count = odn->dn_dbufs_count;
708 ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
709 ndn->dn_bonus = odn->dn_bonus;
710 ndn->dn_have_spill = odn->dn_have_spill;
711 ndn->dn_zio = odn->dn_zio;
712 ndn->dn_oldused = odn->dn_oldused;
713 ndn->dn_oldflags = odn->dn_oldflags;
714 ndn->dn_olduid = odn->dn_olduid;
715 ndn->dn_oldgid = odn->dn_oldgid;
716 ndn->dn_newuid = odn->dn_newuid;
717 ndn->dn_newgid = odn->dn_newgid;
740 list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
741 offsetof(dmu_buf_impl_t, db_link));
742 odn->dn_dbufs_count = 0;
743 odn->dn_unlisted_l0_blkid = 0;
744 odn->dn_bonus = NULL;
745 odn->dn_zfetch.zf_dnode = NULL;
746
747 /*
748 * Set the low bit of the objset pointer to ensure that dnode_move()
749 * recognizes the dnode as invalid in any subsequent callback.
750 */
751 POINTER_INVALIDATE(&odn->dn_objset);
752
753 /*
754 * Satisfy the destructor.
755 */
756 for (i = 0; i < TXG_SIZE; i++) {
757 list_create(&odn->dn_dirty_records[i],
758 sizeof (dbuf_dirty_record_t),
759 offsetof(dbuf_dirty_record_t, dr_dirty_node));
760 odn->dn_ranges[i].avl_root = NULL;
761 odn->dn_ranges[i].avl_numnodes = 0;
762 odn->dn_next_nlevels[i] = 0;
763 odn->dn_next_indblkshift[i] = 0;
764 odn->dn_next_bonustype[i] = 0;
765 odn->dn_rm_spillblk[i] = 0;
766 odn->dn_next_bonuslen[i] = 0;
767 odn->dn_next_blksz[i] = 0;
768 }
769 odn->dn_allocated_txg = 0;
770 odn->dn_free_txg = 0;
771 odn->dn_assigned_txg = 0;
772 odn->dn_dirtyctx = 0;
773 odn->dn_dirtyctx_firstset = NULL;
774 odn->dn_have_spill = B_FALSE;
775 odn->dn_zio = NULL;
776 odn->dn_oldused = 0;
777 odn->dn_oldflags = 0;
778 odn->dn_olduid = 0;
779 odn->dn_oldgid = 0;
780 odn->dn_newuid = 0;
781 odn->dn_newgid = 0;
1445 dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1446 if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1447 dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1448 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1449 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1450 list_remove(&dn->dn_dirty_records[txgoff], dr);
1451 list_insert_tail(&new->dt.di.dr_children, dr);
1452 dr->dr_parent = new;
1453 }
1454 }
1455 mutex_exit(&new->dt.di.dr_mtx);
1456 mutex_exit(&dn->dn_mtx);
1457 }
1458
1459 out:
1460 if (have_read)
1461 rw_downgrade(&dn->dn_struct_rwlock);
1462 }
1463
/*
 * Remove [blkid, blkid + nblks) from this tx's set of pending-free blocks
 * (dn_ranges), splitting or trimming any free_range_t entries that overlap
 * the region.  Caller must hold dn_mtx.
 */
void
dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
	avl_index_t where;
	free_range_t *rp;
	free_range_t rp_tofind;
	uint64_t endblk = blkid + nblks;

	ASSERT(MUTEX_HELD(&dn->dn_mtx));
	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */

	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
	    blkid, nblks, tx->tx_txg);
	/*
	 * Start from the range whose fr_blkid matches blkid exactly, else the
	 * nearest range before the insertion point (it may extend into the
	 * cleared region), else the nearest range after it.
	 */
	rp_tofind.fr_blkid = blkid;
	rp = avl_find(tree, &rp_tofind, &where);
	if (rp == NULL)
		rp = avl_nearest(tree, where, AVL_BEFORE);
	if (rp == NULL)
		rp = avl_nearest(tree, where, AVL_AFTER);

	/* Walk forward until ranges start past the end of the cleared region. */
	while (rp && (rp->fr_blkid <= blkid + nblks)) {
		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
		/* capture the successor before rp may be freed below */
		free_range_t *nrp = AVL_NEXT(tree, rp);

		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
			/* clear this entire range */
			avl_remove(tree, rp);
			kmem_free(rp, sizeof (free_range_t));
		} else if (blkid <= rp->fr_blkid &&
		    endblk > rp->fr_blkid && endblk < fr_endblk) {
			/* clear the beginning of this range */
			rp->fr_blkid = endblk;
			rp->fr_nblks = fr_endblk - endblk;
		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
		    endblk >= fr_endblk) {
			/* clear the end of this range */
			rp->fr_nblks = blkid - rp->fr_blkid;
		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
			/* clear a chunk out of this range */
			free_range_t *new_rp =
			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);

			new_rp->fr_blkid = endblk;
			new_rp->fr_nblks = fr_endblk - endblk;
			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
			rp->fr_nblks = blkid - rp->fr_blkid;
		}
		/* there may be no overlap */
		rp = nrp;
	}
}
1516
1517 void
1518 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1519 {
1520 dmu_buf_impl_t *db;
1521 uint64_t blkoff, blkid, nblks;
1522 int blksz, blkshift, head, tail;
1523 int trunc = FALSE;
1524 int epbs;
1525
1526 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1527 blksz = dn->dn_datablksz;
1528 blkshift = dn->dn_datablkshift;
1529 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1530
1531 if (len == DMU_OBJECT_END) {
1532 len = UINT64_MAX - off;
1533 trunc = TRUE;
1534 }
1535
1536 /*
1537 * First, block align the region to free:
1647 if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
1648 dmu_buf_will_dirty(&db->db, tx);
1649 dbuf_rele(db, FTAG);
1650 }
1651 if (trunc)
1652 last = dn->dn_maxblkid >> epbs;
1653 else
1654 last = (blkid + nblks - 1) >> epbs;
1655 if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1656 dmu_buf_will_dirty(&db->db, tx);
1657 dbuf_rele(db, FTAG);
1658 }
1659 }
1660
1661 done:
1662 /*
1663 * Add this range to the dnode range list.
1664 * We will finish up this free operation in the syncing phase.
1665 */
1666 mutex_enter(&dn->dn_mtx);
1667 dnode_clear_range(dn, blkid, nblks, tx);
1668 {
1669 free_range_t *rp, *found;
1670 avl_index_t where;
1671 avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1672
1673 /* Add new range to dn_ranges */
1674 rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1675 rp->fr_blkid = blkid;
1676 rp->fr_nblks = nblks;
1677 found = avl_find(tree, rp, &where);
1678 ASSERT(found == NULL);
1679 avl_insert(tree, rp, where);
1680 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1681 blkid, nblks, tx->tx_txg);
1682 }
1683 mutex_exit(&dn->dn_mtx);
1684
1685 dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1686 dnode_setdirty(dn, tx);
1687 out:
1688
1689 rw_exit(&dn->dn_struct_rwlock);
1690 }
1691
1692 static boolean_t
1693 dnode_spill_freed(dnode_t *dn)
1694 {
1695 int i;
1696
1697 mutex_enter(&dn->dn_mtx);
1698 for (i = 0; i < TXG_SIZE; i++) {
1699 if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
1700 break;
1701 }
1702 mutex_exit(&dn->dn_mtx);
1703 return (i < TXG_SIZE);
1704 }
1705
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
	free_range_t range_tofind;
	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
	int i;

	/* The bonus buffer is never covered by the free-range lists. */
	if (blkid == DMU_BONUS_BLKID)
		return (FALSE);

	/*
	 * If we're in the process of opening the pool, dp will not be
	 * set yet, but there shouldn't be anything dirty.
	 */
	if (dp == NULL)
		return (FALSE);

	/* The whole dnode is being freed; every block in it is freed. */
	if (dn->dn_free_txg)
		return (TRUE);

	/* Spill-block frees are tracked separately in dn_rm_spillblk. */
	if (blkid == DMU_SPILL_BLKID)
		return (dnode_spill_freed(dn));

	range_tofind.fr_blkid = blkid;
	mutex_enter(&dn->dn_mtx);
	for (i = 0; i < TXG_SIZE; i++) {
		free_range_t *range_found;
		avl_index_t idx;

		/*
		 * blkid is freed in this txg if some range starts exactly at
		 * blkid, or the nearest range starting before it extends far
		 * enough to cover it.
		 */
		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
		if (range_found) {
			ASSERT(range_found->fr_nblks > 0);
			break;
		}
		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
		if (range_found &&
		    range_found->fr_blkid + range_found->fr_nblks > blkid)
			break;
	}
	mutex_exit(&dn->dn_mtx);
	/* loop broke out early iff a covering range was found in some txg */
	return (i < TXG_SIZE);
}
1749
1750 /* call from syncing context when we actually write/free space for this dnode */
1751 void
1752 dnode_diduse_space(dnode_t *dn, int64_t delta)
1753 {
1754 uint64_t space;
1755 dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
1756 dn, dn->dn_phys,
1757 (u_longlong_t)dn->dn_phys->dn_used,
1758 (longlong_t)delta);
1759
1760 mutex_enter(&dn->dn_mtx);
1761 space = DN_USED_BYTES(dn->dn_phys);
1762 if (delta > 0) {
1763 ASSERT3U(space + delta, >=, space); /* no overflow */
1764 } else {
1765 ASSERT3U(space, >=, -delta); /* no underflow */
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 */
25
26 #include <sys/zfs_context.h>
27 #include <sys/dbuf.h>
28 #include <sys/dnode.h>
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dsl_dir.h>
34 #include <sys/dsl_dataset.h>
35 #include <sys/spa.h>
36 #include <sys/zio.h>
37 #include <sys/dmu_zfetch.h>
38 #include <sys/range_tree.h>
39
40 static kmem_cache_t *dnode_cache;
41 /*
42 * Define DNODE_STATS to turn on statistic gathering. By default, it is only
43 * turned on when DEBUG is also defined.
44 */
45 #ifdef DEBUG
46 #define DNODE_STATS
47 #endif /* DEBUG */
48
49 #ifdef DNODE_STATS
50 #define DNODE_STAT_ADD(stat) ((stat)++)
51 #else
52 #define DNODE_STAT_ADD(stat) /* nothing */
53 #endif /* DNODE_STATS */
54
55 static dnode_phys_t dnode_phys_zero;
56
57 int zfs_default_bs = SPA_MINBLOCKSHIFT;
58 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
59
72 cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
73
74 /*
75 * Every dbuf has a reference, and dropping a tracked reference is
76 * O(number of references), so don't track dn_holds.
77 */
78 refcount_create_untracked(&dn->dn_holds);
79 refcount_create(&dn->dn_tx_holds);
80 list_link_init(&dn->dn_link);
81
82 bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
83 bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
84 bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
85 bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
86 bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
87 bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
88 bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
89
90 for (i = 0; i < TXG_SIZE; i++) {
91 list_link_init(&dn->dn_dirty_link[i]);
92 dn->dn_free_ranges[i] = NULL;
93 list_create(&dn->dn_dirty_records[i],
94 sizeof (dbuf_dirty_record_t),
95 offsetof(dbuf_dirty_record_t, dr_dirty_node));
96 }
97
98 dn->dn_allocated_txg = 0;
99 dn->dn_free_txg = 0;
100 dn->dn_assigned_txg = 0;
101 dn->dn_dirtyctx = 0;
102 dn->dn_dirtyctx_firstset = NULL;
103 dn->dn_bonus = NULL;
104 dn->dn_have_spill = B_FALSE;
105 dn->dn_zio = NULL;
106 dn->dn_oldused = 0;
107 dn->dn_oldflags = 0;
108 dn->dn_olduid = 0;
109 dn->dn_oldgid = 0;
110 dn->dn_newuid = 0;
111 dn->dn_newgid = 0;
112 dn->dn_id_flags = 0;
120 return (0);
121 }
122
123 /* ARGSUSED */
124 static void
125 dnode_dest(void *arg, void *unused)
126 {
127 int i;
128 dnode_t *dn = arg;
129
130 rw_destroy(&dn->dn_struct_rwlock);
131 mutex_destroy(&dn->dn_mtx);
132 mutex_destroy(&dn->dn_dbufs_mtx);
133 cv_destroy(&dn->dn_notxholds);
134 refcount_destroy(&dn->dn_holds);
135 refcount_destroy(&dn->dn_tx_holds);
136 ASSERT(!list_link_active(&dn->dn_link));
137
138 for (i = 0; i < TXG_SIZE; i++) {
139 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
140 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
141 list_destroy(&dn->dn_dirty_records[i]);
142 ASSERT0(dn->dn_next_nblkptr[i]);
143 ASSERT0(dn->dn_next_nlevels[i]);
144 ASSERT0(dn->dn_next_indblkshift[i]);
145 ASSERT0(dn->dn_next_bonustype[i]);
146 ASSERT0(dn->dn_rm_spillblk[i]);
147 ASSERT0(dn->dn_next_bonuslen[i]);
148 ASSERT0(dn->dn_next_blksz[i]);
149 }
150
151 ASSERT0(dn->dn_allocated_txg);
152 ASSERT0(dn->dn_free_txg);
153 ASSERT0(dn->dn_assigned_txg);
154 ASSERT0(dn->dn_dirtyctx);
155 ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
156 ASSERT3P(dn->dn_bonus, ==, NULL);
157 ASSERT(!dn->dn_have_spill);
158 ASSERT3P(dn->dn_zio, ==, NULL);
159 ASSERT0(dn->dn_oldused);
160 ASSERT0(dn->dn_oldflags);
293 byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
294
295 }
296
297 void
298 dnode_buf_byteswap(void *vbuf, size_t size)
299 {
300 dnode_phys_t *buf = vbuf;
301 int i;
302
303 ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
304 ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
305
306 size >>= DNODE_SHIFT;
307 for (i = 0; i < size; i++) {
308 dnode_byteswap(buf);
309 buf++;
310 }
311 }
312
/*
 * Record a new bonus buffer length for this dnode in the given tx.
 * A newsize of 0 is stored as the sentinel DN_ZERO_BONUSLEN (rather than
 * 0) in dn_next_bonuslen — presumably so syncing context can tell "set
 * to zero" apart from "unchanged this txg"; confirm in dnode_sync.
 */
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);

	dnode_setdirty(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	/* bonus space shares room with the trailing block pointers */
	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
	dn->dn_bonuslen = newsize;
	if (newsize == 0)
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
	else
		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
	rw_exit(&dn->dn_struct_rwlock);
}
329
330 void
331 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
332 {
/*
 * Schedule removal of this dnode's spill block in the given transaction:
 * stamp the txg's dn_rm_spillblk slot with DN_KILL_SPILLBLK for syncing
 * context to act on, and clear the in-core have-spill flag.
 * Caller must hold dn_struct_rwlock as writer and a hold on the dnode.
 */
void
dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
	dnode_setdirty(dn, tx);
	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
	dn->dn_have_spill = B_FALSE;
}
350
/*
 * Set the in-core data block size fields from size (in bytes).
 * size must be a multiple of SPA_MINBLOCKSIZE, within [SPA_MINBLOCKSIZE,
 * SPA_MAXBLOCKSIZE], and small enough that the sector count fits in the
 * on-disk dn_datablkszsec field.  dn_datablkshift is only meaningful for
 * power-of-two sizes and is set to 0 otherwise.
 */
static void
dnode_setdblksz(dnode_t *dn, int size)
{
	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
	dn->dn_datablksz = size;
	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
	/* highbit64(size - 1) == log2(size) when size is a power of two */
	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
363
364 static dnode_t *
365 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
366 uint64_t object, dnode_handle_t *dnh)
367 {
368 dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
369
370 ASSERT(!POINTER_IS_VALID(dn->dn_objset));
371 dn->dn_moved = 0;
372
373 /*
374 * Defer setting dn_objset until the dnode is ready to be a candidate
375 * for the dnode_move() callback.
376 */
377 dn->dn_object = object;
378 dn->dn_dbuf = db;
379 dn->dn_handle = dnh;
380 dn->dn_phys = dnp;
381
497 ASSERT(DMU_OT_IS_VALID(bonustype));
498 ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
499 ASSERT(dn->dn_type == DMU_OT_NONE);
500 ASSERT0(dn->dn_maxblkid);
501 ASSERT0(dn->dn_allocated_txg);
502 ASSERT0(dn->dn_assigned_txg);
503 ASSERT(refcount_is_zero(&dn->dn_tx_holds));
504 ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
505 ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
506
507 for (i = 0; i < TXG_SIZE; i++) {
508 ASSERT0(dn->dn_next_nblkptr[i]);
509 ASSERT0(dn->dn_next_nlevels[i]);
510 ASSERT0(dn->dn_next_indblkshift[i]);
511 ASSERT0(dn->dn_next_bonuslen[i]);
512 ASSERT0(dn->dn_next_bonustype[i]);
513 ASSERT0(dn->dn_rm_spillblk[i]);
514 ASSERT0(dn->dn_next_blksz[i]);
515 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
516 ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
517 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
518 }
519
520 dn->dn_type = ot;
521 dnode_setdblksz(dn, blocksize);
522 dn->dn_indblkshift = ibs;
523 dn->dn_nlevels = 1;
524 if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
525 dn->dn_nblkptr = 1;
526 else
527 dn->dn_nblkptr = 1 +
528 ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
529 dn->dn_bonustype = bonustype;
530 dn->dn_bonuslen = bonuslen;
531 dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
532 dn->dn_compress = ZIO_COMPRESS_INHERIT;
533 dn->dn_dirtyctx = 0;
534
535 dn->dn_free_txg = 0;
536 if (dn->dn_dirtyctx_firstset) {
537 kmem_free(dn->dn_dirtyctx_firstset, 1);
661 ndn->dn_datablksz = odn->dn_datablksz;
662 ndn->dn_maxblkid = odn->dn_maxblkid;
663 bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
664 sizeof (odn->dn_next_nblkptr));
665 bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
666 sizeof (odn->dn_next_nlevels));
667 bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
668 sizeof (odn->dn_next_indblkshift));
669 bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
670 sizeof (odn->dn_next_bonustype));
671 bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
672 sizeof (odn->dn_rm_spillblk));
673 bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
674 sizeof (odn->dn_next_bonuslen));
675 bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
676 sizeof (odn->dn_next_blksz));
677 for (i = 0; i < TXG_SIZE; i++) {
678 list_move_tail(&ndn->dn_dirty_records[i],
679 &odn->dn_dirty_records[i]);
680 }
681 bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
682 sizeof (odn->dn_free_ranges));
683 ndn->dn_allocated_txg = odn->dn_allocated_txg;
684 ndn->dn_free_txg = odn->dn_free_txg;
685 ndn->dn_assigned_txg = odn->dn_assigned_txg;
686 ndn->dn_dirtyctx = odn->dn_dirtyctx;
687 ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
688 ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
689 refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
690 ASSERT(list_is_empty(&ndn->dn_dbufs));
691 list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
692 ndn->dn_dbufs_count = odn->dn_dbufs_count;
693 ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
694 ndn->dn_bonus = odn->dn_bonus;
695 ndn->dn_have_spill = odn->dn_have_spill;
696 ndn->dn_zio = odn->dn_zio;
697 ndn->dn_oldused = odn->dn_oldused;
698 ndn->dn_oldflags = odn->dn_oldflags;
699 ndn->dn_olduid = odn->dn_olduid;
700 ndn->dn_oldgid = odn->dn_oldgid;
701 ndn->dn_newuid = odn->dn_newuid;
702 ndn->dn_newgid = odn->dn_newgid;
725 list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
726 offsetof(dmu_buf_impl_t, db_link));
727 odn->dn_dbufs_count = 0;
728 odn->dn_unlisted_l0_blkid = 0;
729 odn->dn_bonus = NULL;
730 odn->dn_zfetch.zf_dnode = NULL;
731
732 /*
733 * Set the low bit of the objset pointer to ensure that dnode_move()
734 * recognizes the dnode as invalid in any subsequent callback.
735 */
736 POINTER_INVALIDATE(&odn->dn_objset);
737
738 /*
739 * Satisfy the destructor.
740 */
741 for (i = 0; i < TXG_SIZE; i++) {
742 list_create(&odn->dn_dirty_records[i],
743 sizeof (dbuf_dirty_record_t),
744 offsetof(dbuf_dirty_record_t, dr_dirty_node));
745 odn->dn_free_ranges[i] = NULL;
746 odn->dn_next_nlevels[i] = 0;
747 odn->dn_next_indblkshift[i] = 0;
748 odn->dn_next_bonustype[i] = 0;
749 odn->dn_rm_spillblk[i] = 0;
750 odn->dn_next_bonuslen[i] = 0;
751 odn->dn_next_blksz[i] = 0;
752 }
753 odn->dn_allocated_txg = 0;
754 odn->dn_free_txg = 0;
755 odn->dn_assigned_txg = 0;
756 odn->dn_dirtyctx = 0;
757 odn->dn_dirtyctx_firstset = NULL;
758 odn->dn_have_spill = B_FALSE;
759 odn->dn_zio = NULL;
760 odn->dn_oldused = 0;
761 odn->dn_oldflags = 0;
762 odn->dn_olduid = 0;
763 odn->dn_oldgid = 0;
764 odn->dn_newuid = 0;
765 odn->dn_newgid = 0;
1429 dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1430 if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1431 dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1432 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1433 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1434 list_remove(&dn->dn_dirty_records[txgoff], dr);
1435 list_insert_tail(&new->dt.di.dr_children, dr);
1436 dr->dr_parent = new;
1437 }
1438 }
1439 mutex_exit(&new->dt.di.dr_mtx);
1440 mutex_exit(&dn->dn_mtx);
1441 }
1442
1443 out:
1444 if (have_read)
1445 rw_downgrade(&dn->dn_struct_rwlock);
1446 }
1447
1448 void
1449 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1450 {
1451 dmu_buf_impl_t *db;
1452 uint64_t blkoff, blkid, nblks;
1453 int blksz, blkshift, head, tail;
1454 int trunc = FALSE;
1455 int epbs;
1456
1457 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1458 blksz = dn->dn_datablksz;
1459 blkshift = dn->dn_datablkshift;
1460 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1461
1462 if (len == DMU_OBJECT_END) {
1463 len = UINT64_MAX - off;
1464 trunc = TRUE;
1465 }
1466
1467 /*
1468 * First, block align the region to free:
1578 if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
1579 dmu_buf_will_dirty(&db->db, tx);
1580 dbuf_rele(db, FTAG);
1581 }
1582 if (trunc)
1583 last = dn->dn_maxblkid >> epbs;
1584 else
1585 last = (blkid + nblks - 1) >> epbs;
1586 if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1587 dmu_buf_will_dirty(&db->db, tx);
1588 dbuf_rele(db, FTAG);
1589 }
1590 }
1591
1592 done:
1593 /*
1594 * Add this range to the dnode range list.
1595 * We will finish up this free operation in the syncing phase.
1596 */
1597 mutex_enter(&dn->dn_mtx);
1598 int txgoff = tx->tx_txg & TXG_MASK;
1599 if (dn->dn_free_ranges[txgoff] == NULL) {
1600 dn->dn_free_ranges[txgoff] =
1601 range_tree_create(NULL, NULL, &dn->dn_mtx);
1602 }
1603 range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1604 range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1605 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1606 blkid, nblks, tx->tx_txg);
1607 mutex_exit(&dn->dn_mtx);
1608
1609 dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1610 dnode_setdirty(dn, tx);
1611 out:
1612
1613 rw_exit(&dn->dn_struct_rwlock);
1614 }
1615
1616 static boolean_t
1617 dnode_spill_freed(dnode_t *dn)
1618 {
1619 int i;
1620
1621 mutex_enter(&dn->dn_mtx);
1622 for (i = 0; i < TXG_SIZE; i++) {
1623 if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
1624 break;
1625 }
1626 mutex_exit(&dn->dn_mtx);
1627 return (i < TXG_SIZE);
1628 }
1629
1630 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
1631 uint64_t
1632 dnode_block_freed(dnode_t *dn, uint64_t blkid)
1633 {
1634 void *dp = spa_get_dsl(dn->dn_objset->os_spa);
1635 int i;
1636
1637 if (blkid == DMU_BONUS_BLKID)
1638 return (FALSE);
1639
1640 /*
1641 * If we're in the process of opening the pool, dp will not be
1642 * set yet, but there shouldn't be anything dirty.
1643 */
1644 if (dp == NULL)
1645 return (FALSE);
1646
1647 if (dn->dn_free_txg)
1648 return (TRUE);
1649
1650 if (blkid == DMU_SPILL_BLKID)
1651 return (dnode_spill_freed(dn));
1652
1653 mutex_enter(&dn->dn_mtx);
1654 for (i = 0; i < TXG_SIZE; i++) {
1655 if (dn->dn_free_ranges[i] != NULL &&
1656 range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
1657 break;
1658 }
1659 mutex_exit(&dn->dn_mtx);
1660 return (i < TXG_SIZE);
1661 }
1662
1663 /* call from syncing context when we actually write/free space for this dnode */
1664 void
1665 dnode_diduse_space(dnode_t *dn, int64_t delta)
1666 {
1667 uint64_t space;
1668 dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
1669 dn, dn->dn_phys,
1670 (u_longlong_t)dn->dn_phys->dn_used,
1671 (longlong_t)delta);
1672
1673 mutex_enter(&dn->dn_mtx);
1674 space = DN_USED_BYTES(dn->dn_phys);
1675 if (delta > 0) {
1676 ASSERT3U(space + delta, >=, space); /* no overflow */
1677 } else {
1678 ASSERT3U(space, >=, -delta); /* no underflow */
|