3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 RackTop Systems.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/dbuf.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_impl.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dmu_objset.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/spa.h>
39 #include <sys/zio.h>
40 #include <sys/dmu_zfetch.h>
41 #include <sys/range_tree.h>
42
43 static kmem_cache_t *dnode_cache;
44 /*
45 * Define DNODE_STATS to turn on statistic gathering. By default, it is only
46 * turned on when DEBUG is also defined.
47 */
48 #ifdef DEBUG
49 #define DNODE_STATS
50 #endif /* DEBUG */
51
52 #ifdef DNODE_STATS
53 #define DNODE_STAT_ADD(stat) ((stat)++)
54 #else
55 #define DNODE_STAT_ADD(stat) /* nothing */
56 #endif /* DNODE_STATS */
57
58 static dnode_phys_t dnode_phys_zero;
59
60 int zfs_default_bs = SPA_MINBLOCKSHIFT;
61 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
62
63 #ifdef _KERNEL
64 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
65 #endif /* _KERNEL */
66
67 static int
68 dbuf_compare(const void *x1, const void *x2)
69 {
70 const dmu_buf_impl_t *d1 = x1;
71 const dmu_buf_impl_t *d2 = x2;
72
73 if (d1->db_level < d2->db_level) {
74 return (-1);
75 }
76 if (d1->db_level > d2->db_level) {
77 return (1);
78 }
79
80 if (d1->db_blkid < d2->db_blkid) {
81 return (-1);
141 dn->dn_free_txg = 0;
142 dn->dn_assigned_txg = 0;
143 dn->dn_dirtyctx = 0;
144 dn->dn_dirtyctx_firstset = NULL;
145 dn->dn_bonus = NULL;
146 dn->dn_have_spill = B_FALSE;
147 dn->dn_zio = NULL;
148 dn->dn_oldused = 0;
149 dn->dn_oldflags = 0;
150 dn->dn_olduid = 0;
151 dn->dn_oldgid = 0;
152 dn->dn_newuid = 0;
153 dn->dn_newgid = 0;
154 dn->dn_id_flags = 0;
155
156 dn->dn_dbufs_count = 0;
157 avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
158 offsetof(dmu_buf_impl_t, db_link));
159
160 dn->dn_moved = 0;
161 return (0);
162 }
163
164 /* ARGSUSED */
165 static void
166 dnode_dest(void *arg, void *unused)
167 {
168 int i;
169 dnode_t *dn = arg;
170
171 rw_destroy(&dn->dn_struct_rwlock);
172 mutex_destroy(&dn->dn_mtx);
173 mutex_destroy(&dn->dn_dbufs_mtx);
174 cv_destroy(&dn->dn_notxholds);
175 refcount_destroy(&dn->dn_holds);
176 refcount_destroy(&dn->dn_tx_holds);
177 ASSERT(!list_link_active(&dn->dn_link));
178
179 for (i = 0; i < TXG_SIZE; i++) {
180 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
181 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
182 list_destroy(&dn->dn_dirty_records[i]);
183 ASSERT0(dn->dn_next_nblkptr[i]);
184 ASSERT0(dn->dn_next_nlevels[i]);
185 ASSERT0(dn->dn_next_indblkshift[i]);
186 ASSERT0(dn->dn_next_bonustype[i]);
187 ASSERT0(dn->dn_rm_spillblk[i]);
188 ASSERT0(dn->dn_next_bonuslen[i]);
189 ASSERT0(dn->dn_next_blksz[i]);
190 }
621
622 void
623 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
624 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
625 {
626 int nblkptr;
627
628 ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
629 ASSERT3U(blocksize, <=,
630 spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
631 ASSERT0(blocksize % SPA_MINBLOCKSIZE);
632 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
633 ASSERT(tx->tx_txg != 0);
634 ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
635 (bonustype != DMU_OT_NONE && bonuslen != 0) ||
636 (bonustype == DMU_OT_SA && bonuslen == 0));
637 ASSERT(DMU_OT_IS_VALID(bonustype));
638 ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
639
640 /* clean up any unreferenced dbufs */
641 dnode_evict_dbufs(dn);
642
643 dn->dn_id_flags = 0;
644
645 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
646 dnode_setdirty(dn, tx);
647 if (dn->dn_datablksz != blocksize) {
648 /* change blocksize */
649 ASSERT(dn->dn_maxblkid == 0 &&
650 (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
651 dnode_block_freed(dn, 0)));
652 dnode_setdblksz(dn, blocksize);
653 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
654 }
655 if (dn->dn_bonuslen != bonuslen)
656 dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
657
658 if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
659 nblkptr = 1;
660 else
661 nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
1250 */
1251 ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1252
1253 /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1254 if (refs == 0 && db != NULL) {
1255 /*
1256 * Another thread could add a hold to the dnode handle in
1257 * dnode_hold_impl() while holding the parent dbuf. Since the
1258 * hold on the parent dbuf prevents the handle from being
1259 * destroyed, the hold on the handle is OK. We can't yet assert
1260 * that the handle has zero references, but that will be
1261 * asserted anyway when the handle gets destroyed.
1262 */
1263 dbuf_rele(db, dnh);
1264 }
1265 }
1266
1267 void
1268 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1269 {
1270 objset_t *os = dn->dn_objset;
1271 uint64_t txg = tx->tx_txg;
1272
1273 if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1274 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1275 return;
1276 }
1277
1278 DNODE_VERIFY(dn);
1279
1280 #ifdef ZFS_DEBUG
1281 mutex_enter(&dn->dn_mtx);
1282 ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1283 ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1284 mutex_exit(&dn->dn_mtx);
1285 #endif
1286
1287 /*
1288 * Determine old uid/gid when necessary
1289 */
1308 ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1309
1310 dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1311 dn->dn_object, txg);
1312
1313 multilist_sublist_insert_head(mls, dn);
1314
1315 multilist_sublist_unlock(mls);
1316
1317 /*
1318 * The dnode maintains a hold on its containing dbuf as
1319 * long as there are holds on it. Each instantiated child
1320 * dbuf maintains a hold on the dnode. When the last child
1321 * drops its hold, the dnode will drop its hold on the
1322 * containing dbuf. We add a "dirty hold" here so that the
1323 * dnode will hang around after we finish processing its
1324 * children.
1325 */
1326 VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1327
1328 (void) dbuf_dirty(dn->dn_dbuf, tx);
1329
1330 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1331 }
1332
1333 void
1334 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1335 {
1336 mutex_enter(&dn->dn_mtx);
1337 if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1338 mutex_exit(&dn->dn_mtx);
1339 return;
1340 }
1341 dn->dn_free_txg = tx->tx_txg;
1342 mutex_exit(&dn->dn_mtx);
1343
1344 dnode_setdirty(dn, tx);
1345 }
1346
1347 /*
1348 * Try to change the block size for the indicated dnode. This can only
1349 * succeed if there are no blocks allocated or dirty beyond first block
1397 dnode_setdirty(dn, tx);
1398 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1399 if (ibs) {
1400 dn->dn_indblkshift = ibs;
1401 dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1402 }
1403 /* rele after we have fixed the blocksize in the dnode */
1404 if (db)
1405 dbuf_rele(db, FTAG);
1406
1407 rw_exit(&dn->dn_struct_rwlock);
1408 return (0);
1409
1410 fail:
1411 rw_exit(&dn->dn_struct_rwlock);
1412 return (SET_ERROR(ENOTSUP));
1413 }
1414
1415 /* read-holding callers must not rely on the lock being continuously held */
1416 void
1417 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
1418 {
1419 uint64_t txgoff = tx->tx_txg & TXG_MASK;
1420 int epbs, new_nlevels;
1421 uint64_t sz;
1422
1423 ASSERT(blkid != DMU_BONUS_BLKID);
1424
1425 ASSERT(have_read ?
1426 RW_READ_HELD(&dn->dn_struct_rwlock) :
1427 RW_WRITE_HELD(&dn->dn_struct_rwlock));
1428
1429 /*
1430 * if we have a read-lock, check to see if we need to do any work
1431 * before upgrading to a write-lock.
1432 */
1433 if (have_read) {
1434 if (blkid <= dn->dn_maxblkid)
1435 return;
1436
1437 if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1451 new_nlevels = 1;
1452 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1453 for (sz = dn->dn_nblkptr;
1454 sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1455 new_nlevels++;
1456
1457 if (new_nlevels > dn->dn_nlevels) {
1458 int old_nlevels = dn->dn_nlevels;
1459 dmu_buf_impl_t *db;
1460 list_t *list;
1461 dbuf_dirty_record_t *new, *dr, *dr_next;
1462
1463 dn->dn_nlevels = new_nlevels;
1464
1465 ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1466 dn->dn_next_nlevels[txgoff] = new_nlevels;
1467
1468 /* dirty the left indirects */
1469 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1470 ASSERT(db != NULL);
1471 new = dbuf_dirty(db, tx);
1472 dbuf_rele(db, FTAG);
1473
1474 /* transfer the dirty records to the new indirect */
1475 mutex_enter(&dn->dn_mtx);
1476 mutex_enter(&new->dt.di.dr_mtx);
1477 list = &dn->dn_dirty_records[txgoff];
1478 for (dr = list_head(list); dr; dr = dr_next) {
1479 dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1480 if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1481 dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1482 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1483 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1484 list_remove(&dn->dn_dirty_records[txgoff], dr);
1485 list_insert_tail(&new->dt.di.dr_children, dr);
1486 dr->dr_parent = new;
1487 }
1488 }
1489 mutex_exit(&new->dt.di.dr_mtx);
1490 mutex_exit(&dn->dn_mtx);
1491 }
1680 * this block in syncing context, it will use
1681 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
1682 * to the "failmode" property. dnode_next_offset()
1683 * doesn't have a flag to indicate MUSTSUCCEED.
1684 */
1685 if (err != 0)
1686 break;
1687
1688 dnode_dirty_l1(dn, i, tx);
1689 }
1690 }
1691
1692 done:
1693 /*
1694 * Add this range to the dnode range list.
1695 * We will finish up this free operation in the syncing phase.
1696 */
1697 mutex_enter(&dn->dn_mtx);
1698 int txgoff = tx->tx_txg & TXG_MASK;
1699 if (dn->dn_free_ranges[txgoff] == NULL) {
1700 dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
1701 }
1702 range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1703 range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1704 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1705 blkid, nblks, tx->tx_txg);
1706 mutex_exit(&dn->dn_mtx);
1707
1708 dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1709 dnode_setdirty(dn, tx);
1710 out:
1711
1712 rw_exit(&dn->dn_struct_rwlock);
1713 }
1714
1715 static boolean_t
1716 dnode_spill_freed(dnode_t *dn)
1717 {
1718 int i;
1719
1720 mutex_enter(&dn->dn_mtx);
1979 flags, offset, lvl, blkfill, txg);
1980 }
1981
1982 /*
1983 * There's always a "virtual hole" at the end of the object, even
1984 * if all BP's which physically exist are non-holes.
1985 */
1986 if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
1987 minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
1988 error = 0;
1989 }
1990
1991 if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
1992 initial_offset < *offset : initial_offset > *offset))
1993 error = SET_ERROR(ESRCH);
1994 out:
1995 if (!(flags & DNODE_FIND_HAVELOCK))
1996 rw_exit(&dn->dn_struct_rwlock);
1997
1998 return (error);
1999 }
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2017 RackTop Systems.
28 */
29
30 #include <sys/zfs_context.h>
31 #include <sys/dbuf.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu.h>
34 #include <sys/dmu_impl.h>
35 #include <sys/dmu_tx.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/dsl_dir.h>
38 #include <sys/dsl_dataset.h>
39 #include <sys/spa.h>
40 #include <sys/zio.h>
41 #include <sys/dmu_zfetch.h>
42 #include <sys/range_tree.h>
43
44 static void smartcomp_check_comp(dnode_smartcomp_t *sc);
45
46 static kmem_cache_t *dnode_cache;
47 /*
48 * Define DNODE_STATS to turn on statistic gathering. By default, it is only
49 * turned on when DEBUG is also defined.
50 */
51 #ifdef DEBUG
52 #define DNODE_STATS
53 #endif /* DEBUG */
54
55 #ifdef DNODE_STATS
56 #define DNODE_STAT_ADD(stat) ((stat)++)
57 #else
58 #define DNODE_STAT_ADD(stat) /* nothing */
59 #endif /* DNODE_STATS */
60
61 static dnode_phys_t dnode_phys_zero;
62
63 int zfs_default_bs = SPA_MINBLOCKSHIFT;
64 int zfs_default_ibs = DN_DFL_INDBLKSHIFT;
65
66 #ifdef _KERNEL
67 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
68 #endif /* _KERNEL */
69
70 static int
71 dbuf_compare(const void *x1, const void *x2)
72 {
73 const dmu_buf_impl_t *d1 = x1;
74 const dmu_buf_impl_t *d2 = x2;
75
76 if (d1->db_level < d2->db_level) {
77 return (-1);
78 }
79 if (d1->db_level > d2->db_level) {
80 return (1);
81 }
82
83 if (d1->db_blkid < d2->db_blkid) {
84 return (-1);
144 dn->dn_free_txg = 0;
145 dn->dn_assigned_txg = 0;
146 dn->dn_dirtyctx = 0;
147 dn->dn_dirtyctx_firstset = NULL;
148 dn->dn_bonus = NULL;
149 dn->dn_have_spill = B_FALSE;
150 dn->dn_zio = NULL;
151 dn->dn_oldused = 0;
152 dn->dn_oldflags = 0;
153 dn->dn_olduid = 0;
154 dn->dn_oldgid = 0;
155 dn->dn_newuid = 0;
156 dn->dn_newgid = 0;
157 dn->dn_id_flags = 0;
158
159 dn->dn_dbufs_count = 0;
160 avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
161 offsetof(dmu_buf_impl_t, db_link));
162
163 dn->dn_moved = 0;
164
165 bzero(&dn->dn_smartcomp, sizeof (dn->dn_smartcomp));
166 mutex_init(&dn->dn_smartcomp.sc_lock, NULL, MUTEX_DEFAULT, NULL);
167
168 return (0);
169 }
170
171 /* ARGSUSED */
172 static void
173 dnode_dest(void *arg, void *unused)
174 {
175 int i;
176 dnode_t *dn = arg;
177
178 mutex_destroy(&dn->dn_smartcomp.sc_lock);
179
180 rw_destroy(&dn->dn_struct_rwlock);
181 mutex_destroy(&dn->dn_mtx);
182 mutex_destroy(&dn->dn_dbufs_mtx);
183 cv_destroy(&dn->dn_notxholds);
184 refcount_destroy(&dn->dn_holds);
185 refcount_destroy(&dn->dn_tx_holds);
186 ASSERT(!list_link_active(&dn->dn_link));
187
188 for (i = 0; i < TXG_SIZE; i++) {
189 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
190 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
191 list_destroy(&dn->dn_dirty_records[i]);
192 ASSERT0(dn->dn_next_nblkptr[i]);
193 ASSERT0(dn->dn_next_nlevels[i]);
194 ASSERT0(dn->dn_next_indblkshift[i]);
195 ASSERT0(dn->dn_next_bonustype[i]);
196 ASSERT0(dn->dn_rm_spillblk[i]);
197 ASSERT0(dn->dn_next_bonuslen[i]);
198 ASSERT0(dn->dn_next_blksz[i]);
199 }
630
631 void
632 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
633 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
634 {
635 int nblkptr;
636
637 ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
638 ASSERT3U(blocksize, <=,
639 spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
640 ASSERT0(blocksize % SPA_MINBLOCKSIZE);
641 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
642 ASSERT(tx->tx_txg != 0);
643 ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
644 (bonustype != DMU_OT_NONE && bonuslen != 0) ||
645 (bonustype == DMU_OT_SA && bonuslen == 0));
646 ASSERT(DMU_OT_IS_VALID(bonustype));
647 ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
648
649 /* clean up any unreferenced dbufs */
650 dnode_evict_dbufs(dn, DBUF_EVICT_ALL);
651
652 dn->dn_id_flags = 0;
653
654 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
655 dnode_setdirty(dn, tx);
656 if (dn->dn_datablksz != blocksize) {
657 /* change blocksize */
658 ASSERT(dn->dn_maxblkid == 0 &&
659 (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
660 dnode_block_freed(dn, 0)));
661 dnode_setdblksz(dn, blocksize);
662 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
663 }
664 if (dn->dn_bonuslen != bonuslen)
665 dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
666
667 if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
668 nblkptr = 1;
669 else
670 nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
1259 */
1260 ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1261
1262 /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1263 if (refs == 0 && db != NULL) {
1264 /*
1265 * Another thread could add a hold to the dnode handle in
1266 * dnode_hold_impl() while holding the parent dbuf. Since the
1267 * hold on the parent dbuf prevents the handle from being
1268 * destroyed, the hold on the handle is OK. We can't yet assert
1269 * that the handle has zero references, but that will be
1270 * asserted anyway when the handle gets destroyed.
1271 */
1272 dbuf_rele(db, dnh);
1273 }
1274 }
1275
1276 void
1277 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1278 {
1279 dnode_setdirty_sc(dn, tx, B_TRUE);
1280 }
1281
1282 void
1283 dnode_setdirty_sc(dnode_t *dn, dmu_tx_t *tx, boolean_t usesc)
1284 {
1285 objset_t *os = dn->dn_objset;
1286 uint64_t txg = tx->tx_txg;
1287
1288 if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1289 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1290 return;
1291 }
1292
1293 DNODE_VERIFY(dn);
1294
1295 #ifdef ZFS_DEBUG
1296 mutex_enter(&dn->dn_mtx);
1297 ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1298 ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1299 mutex_exit(&dn->dn_mtx);
1300 #endif
1301
1302 /*
1303 * Determine old uid/gid when necessary
1304 */
1323 ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1324
1325 dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1326 dn->dn_object, txg);
1327
1328 multilist_sublist_insert_head(mls, dn);
1329
1330 multilist_sublist_unlock(mls);
1331
1332 /*
1333 * The dnode maintains a hold on its containing dbuf as
1334 * long as there are holds on it. Each instantiated child
1335 * dbuf maintains a hold on the dnode. When the last child
1336 * drops its hold, the dnode will drop its hold on the
1337 * containing dbuf. We add a "dirty hold" here so that the
1338 * dnode will hang around after we finish processing its
1339 * children.
1340 */
1341 VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1342
1343 (void) dbuf_dirty_sc(dn->dn_dbuf, tx, usesc);
1344 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1345 }
1346
1347 void
1348 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1349 {
1350 mutex_enter(&dn->dn_mtx);
1351 if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1352 mutex_exit(&dn->dn_mtx);
1353 return;
1354 }
1355 dn->dn_free_txg = tx->tx_txg;
1356 mutex_exit(&dn->dn_mtx);
1357
1358 dnode_setdirty(dn, tx);
1359 }
1360
1361 /*
1362 * Try to change the block size for the indicated dnode. This can only
1363 * succeed if there are no blocks allocated or dirty beyond first block
1411 dnode_setdirty(dn, tx);
1412 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1413 if (ibs) {
1414 dn->dn_indblkshift = ibs;
1415 dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1416 }
1417 /* rele after we have fixed the blocksize in the dnode */
1418 if (db)
1419 dbuf_rele(db, FTAG);
1420
1421 rw_exit(&dn->dn_struct_rwlock);
1422 return (0);
1423
1424 fail:
1425 rw_exit(&dn->dn_struct_rwlock);
1426 return (SET_ERROR(ENOTSUP));
1427 }
1428
1429 /* read-holding callers must not rely on the lock being continuously held */
1430 void
1431 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
1432 boolean_t usesc, boolean_t have_read)
1433 {
1434 uint64_t txgoff = tx->tx_txg & TXG_MASK;
1435 int epbs, new_nlevels;
1436 uint64_t sz;
1437
1438 ASSERT(blkid != DMU_BONUS_BLKID);
1439
1440 ASSERT(have_read ?
1441 RW_READ_HELD(&dn->dn_struct_rwlock) :
1442 RW_WRITE_HELD(&dn->dn_struct_rwlock));
1443
1444 /*
1445 * if we have a read-lock, check to see if we need to do any work
1446 * before upgrading to a write-lock.
1447 */
1448 if (have_read) {
1449 if (blkid <= dn->dn_maxblkid)
1450 return;
1451
1452 if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
1466 new_nlevels = 1;
1467 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1468 for (sz = dn->dn_nblkptr;
1469 sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1470 new_nlevels++;
1471
1472 if (new_nlevels > dn->dn_nlevels) {
1473 int old_nlevels = dn->dn_nlevels;
1474 dmu_buf_impl_t *db;
1475 list_t *list;
1476 dbuf_dirty_record_t *new, *dr, *dr_next;
1477
1478 dn->dn_nlevels = new_nlevels;
1479
1480 ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1481 dn->dn_next_nlevels[txgoff] = new_nlevels;
1482
1483 /* dirty the left indirects */
1484 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1485 ASSERT(db != NULL);
1486 new = dbuf_dirty_sc(db, tx, usesc);
1487 dbuf_rele(db, FTAG);
1488
1489 /* transfer the dirty records to the new indirect */
1490 mutex_enter(&dn->dn_mtx);
1491 mutex_enter(&new->dt.di.dr_mtx);
1492 list = &dn->dn_dirty_records[txgoff];
1493 for (dr = list_head(list); dr; dr = dr_next) {
1494 dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1495 if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1496 dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1497 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1498 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1499 list_remove(&dn->dn_dirty_records[txgoff], dr);
1500 list_insert_tail(&new->dt.di.dr_children, dr);
1501 dr->dr_parent = new;
1502 }
1503 }
1504 mutex_exit(&new->dt.di.dr_mtx);
1505 mutex_exit(&dn->dn_mtx);
1506 }
1695 * this block in syncing context, it will use
1696 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
1697 * to the "failmode" property. dnode_next_offset()
1698 * doesn't have a flag to indicate MUSTSUCCEED.
1699 */
1700 if (err != 0)
1701 break;
1702
1703 dnode_dirty_l1(dn, i, tx);
1704 }
1705 }
1706
1707 done:
1708 /*
1709 * Add this range to the dnode range list.
1710 * We will finish up this free operation in the syncing phase.
1711 */
1712 mutex_enter(&dn->dn_mtx);
1713 int txgoff = tx->tx_txg & TXG_MASK;
1714 if (dn->dn_free_ranges[txgoff] == NULL) {
1715 dn->dn_free_ranges[txgoff] =
1716 range_tree_create(NULL, NULL, &dn->dn_mtx);
1717 }
1718 range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1719 range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1720 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1721 blkid, nblks, tx->tx_txg);
1722 mutex_exit(&dn->dn_mtx);
1723
1724 dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1725 dnode_setdirty(dn, tx);
1726 out:
1727
1728 rw_exit(&dn->dn_struct_rwlock);
1729 }
1730
1731 static boolean_t
1732 dnode_spill_freed(dnode_t *dn)
1733 {
1734 int i;
1735
1736 mutex_enter(&dn->dn_mtx);
1995 flags, offset, lvl, blkfill, txg);
1996 }
1997
1998 /*
1999 * There's always a "virtual hole" at the end of the object, even
2000 * if all BP's which physically exist are non-holes.
2001 */
2002 if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2003 minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2004 error = 0;
2005 }
2006
2007 if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2008 initial_offset < *offset : initial_offset > *offset))
2009 error = SET_ERROR(ESRCH);
2010 out:
2011 if (!(flags & DNODE_FIND_HAVELOCK))
2012 rw_exit(&dn->dn_struct_rwlock);
2013
2014 return (error);
2015 }
2016
2017 /*
2019 * When in the compressing phase, we check our results every 1 MiB. If
2020 * the fraction of space saved drops below the threshold, we give up
2021 * trying to compress the file for a while. The length of the deny
2022 * interval is calculated from this interval value according to the
2023 * algorithm in smartcomp_check_comp.
2023 */
2024 uint64_t zfs_smartcomp_interval = 1 * 1024 * 1024;
2025
2026 /*
2027 * Minimum compression savings is 12.5% (100% / factor); below that we
2028 * consider compression to have failed.
2029 */
2030 uint64_t zfs_smartcomp_threshold_factor = 8;
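
/*
 * For example, with the default 1 MiB check interval above and a
 * threshold factor of 8, a checkpoint in smartcomp_check_comp only
 * counts as a success if the ~1 MiB of data examined compressed to
 * less than 7/8 of its original size, i.e. to under roughly 896 KiB.
 */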
2031
2032 /*
2033 * Maximum power-of-2 exponent on the deny interval and consequently
2034 * the maximum number of compression successes and failures we track.
2035 * Successive compression failures extend the deny interval, whereas
2036 * repeated successes make the algorithm more hesitant to start denying.
2037 */
2038 int64_t zfs_smartcomp_interval_exp = 5;
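
/*
 * With the defaults above, once the failure counter goes positive,
 * compression is denied for the next ~2 MiB written (1 MiB << 1); each
 * further consecutive failure doubles that interval, up to a cap of
 * ~32 MiB (1 MiB << 5). smartcomp_check_comp then jitters the chosen
 * interval by +-10% to avoid patterns.
 */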
2039
2040 /*
2041 * Callback invoked by the zio machinery when it wants to compress a data
2042 * block. If we are in the denying compression phase, we add the amount of
2043 * data written to our stats and check if we've denied enough data to
2044 * transition back into the compression phase.
2045 */
2046 boolean_t
2047 dnode_smartcomp_ask_cb(void *userinfo, const zio_t *zio)
2048 {
2049 dnode_t *dn = userinfo;
2050 dnode_smartcomp_t *sc;
2051 dnode_smartcomp_state_t old_state;
2052
2053 ASSERT(dn != NULL);
2054
2055 sc = &dn->dn_smartcomp;
2056 mutex_enter(&sc->sc_lock);
2057 old_state = sc->sc_state;
2058 if (sc->sc_state == DNODE_SMARTCOMP_DENYING) {
2059 sc->sc_orig_size += zio->io_orig_size;
2060 if (sc->sc_orig_size >= sc->sc_deny_interval) {
2061 /* time to retry compression on next call */
2062 sc->sc_state = DNODE_SMARTCOMP_COMPRESSING;
2063 sc->sc_size = 0;
2064 sc->sc_orig_size = 0;
2065 }
2066 }
2067 mutex_exit(&sc->sc_lock);
2068
2069 return (old_state != DNODE_SMARTCOMP_DENYING);
2070 }
2071
2072 /*
2073 * Callback invoked after compression has been performed to allow us to
2074 * monitor compression performance. If we're in a compressing phase, we
2075 * add the uncompressed and compressed data volumes to our state counters
2076 * and see if we need to recheck compression performance in
2077 * smartcomp_check_comp.
2078 */
2079 void
2080 dnode_smartcomp_result_cb(void *userinfo, const zio_t *zio)
2081 {
2082 dnode_t *dn = userinfo;
2083 dnode_smartcomp_t *sc;
2084 uint64_t io_size = zio->io_size, io_orig_size = zio->io_orig_size;
2085
2086 ASSERT(dn != NULL);
2087 sc = &dn->dn_smartcomp;
2088
2089 if (io_orig_size == 0)
2090 /* XXX: is this valid anyway? */
2091 return;
2092
2093 mutex_enter(&sc->sc_lock);
2094 if (sc->sc_state == DNODE_SMARTCOMP_COMPRESSING) {
2095 /* add last block's compression performance to our stats */
2096 sc->sc_size += io_size;
2097 sc->sc_orig_size += io_orig_size;
2098 /* time to recheck compression performance? */
2099 if (sc->sc_orig_size >= zfs_smartcomp_interval)
2100 smartcomp_check_comp(sc);
2101 }
2102 mutex_exit(&sc->sc_lock);
2103 }
2104
2105 /*
2106 * This function checks whether the data has been compressing well enough,
2107 * i.e. whether the compressed size stayed below the threshold. If so, we
2108 * reset or decrement the sc_comp_failures counter; if not, we increment
2109 * it and potentially start a compression deny phase.
2110 */
2111 static void
2112 smartcomp_check_comp(dnode_smartcomp_t *sc)
2113 {
2114 uint64_t threshold = sc->sc_orig_size -
2115 sc->sc_orig_size / zfs_smartcomp_threshold_factor;
2116
2117 ASSERT(MUTEX_HELD(&sc->sc_lock));
2118 if (sc->sc_size > threshold) {
2119 sc->sc_comp_failures =
2120 MIN(sc->sc_comp_failures + 1, zfs_smartcomp_interval_exp);
2121 if (sc->sc_comp_failures > 0) {
2122 /* consistently getting too little compression, stop */
2123 sc->sc_state = DNODE_SMARTCOMP_DENYING;
2124 sc->sc_deny_interval =
2125 zfs_smartcomp_interval << sc->sc_comp_failures;
2126 /* randomize the interval by +-10% to avoid patterns */
2127 sc->sc_deny_interval = (sc->sc_deny_interval -
2128 (sc->sc_deny_interval / 10)) +
2129 spa_get_random(sc->sc_deny_interval / 5 + 1);
2130 }
2131 } else {
2132 if (sc->sc_comp_failures > 0) {
2133 /*
2134 * We're biased for compression, so any success makes
2135 * us forget the file's past incompressibility.
2136 */
2137 sc->sc_comp_failures = 0;
2138 } else {
2139 sc->sc_comp_failures = MAX(sc->sc_comp_failures - 1,
2140 -zfs_smartcomp_interval_exp);
2141 }
2142 }
2143 /* reset state counters */
2144 sc->sc_size = 0;
2145 sc->sc_orig_size = 0;
2146 }
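
/*
 * Note that sc_comp_failures acts as a hysteresis counter bounded by
 * [-zfs_smartcomp_interval_exp, zfs_smartcomp_interval_exp]: a long run
 * of successful checks drives it down to the lower bound, so it then
 * takes zfs_smartcomp_interval_exp + 1 consecutive failed checks (six
 * with the defaults) before a deny phase starts, whereas a single
 * successful check clears any positive count back to zero.
 */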
2147
2148 /*
2149 * Prepares a zio_smartcomp_info_t structure for passing to zio_write or
2150 * arc_write, indicating whether smart compression should be applied to
2151 * the specified objset, dnode and buffer.
2152 */
2153 void
2154 dnode_setup_zio_smartcomp(dmu_buf_impl_t *db, zio_smartcomp_info_t *sc)
2155 {
2156 dnode_t *dn = DB_DNODE(db);
2157 objset_t *os = dn->dn_objset;
2158
2159 /* Only do smart compression on user data of plain files. */
2160 if (dn->dn_type == DMU_OT_PLAIN_FILE_CONTENTS && db->db_level == 0 &&
2161 os->os_smartcomp_enabled && os->os_compress != ZIO_COMPRESS_OFF) {
2162 sc->sc_ask = dnode_smartcomp_ask_cb;
2163 sc->sc_result = dnode_smartcomp_result_cb;
2164 sc->sc_userinfo = dn;
2165 } else {
2166 /*
2167 * Zeroing out the structure passed to zio_write will turn
2168 * smart compression off.
2169 */
2170 bzero(sc, sizeof (*sc));
2171 }
2172 }