Revert "8958 Update Intel ucode to 20180108 release"
This reverts commit 1adc3ffcd976ec0a34010cc7db08037a14c3ea4c.
NEX-15280 New default metadata block size is too large
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-5366 Race between unique_insert() and unique_remove() causes ZFS fsid change
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Dan Vatca <dan.vatca@gmail.com>
NEX-5058 WBC: Race between the purging of window and opening new one
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
NEX-2830 ZFS smart compression
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3266 5630 stale bonus buffer in recycled dnode_t leads to data corruption
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Will Andrews <will@freebsd.org>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan Fields <dan.fields@nexenta.com>
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.

  23  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  * Copyright 2017 RackTop Systems.
  27  */
  28 
  29 #include <sys/zfs_context.h>
  30 #include <sys/dbuf.h>
  31 #include <sys/dnode.h>
  32 #include <sys/dmu.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/dmu_objset.h>
  36 #include <sys/dsl_dir.h>
  37 #include <sys/dsl_dataset.h>
  38 #include <sys/spa.h>
  39 #include <sys/zio.h>
  40 #include <sys/dmu_zfetch.h>
  41 #include <sys/range_tree.h>
  42 


  43 static kmem_cache_t *dnode_cache;
  44 /*
  45  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  46  * turned on when DEBUG is also defined.
  47  */
  48 #ifdef  DEBUG
  49 #define DNODE_STATS
  50 #endif  /* DEBUG */
  51 
  52 #ifdef  DNODE_STATS
  53 #define DNODE_STAT_ADD(stat)                    ((stat)++)
  54 #else
  55 #define DNODE_STAT_ADD(stat)                    /* nothing */
  56 #endif  /* DNODE_STATS */
  57 
  58 static dnode_phys_t dnode_phys_zero;
  59 
  60 int zfs_default_bs = SPA_MINBLOCKSHIFT;
  61 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
  62 
  63 #ifdef  _KERNEL
  64 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
  65 #endif  /* _KERNEL */
  66 
  67 static int
  68 dbuf_compare(const void *x1, const void *x2)
  69 {
  70         const dmu_buf_impl_t *d1 = x1;
  71         const dmu_buf_impl_t *d2 = x2;
  72 
  73         if (d1->db_level < d2->db_level) {
  74                 return (-1);
  75         }
  76         if (d1->db_level > d2->db_level) {
  77                 return (1);
  78         }
  79 
  80         if (d1->db_blkid < d2->db_blkid) {
  81                 return (-1);


 141         dn->dn_free_txg = 0;
 142         dn->dn_assigned_txg = 0;
 143         dn->dn_dirtyctx = 0;
 144         dn->dn_dirtyctx_firstset = NULL;
 145         dn->dn_bonus = NULL;
 146         dn->dn_have_spill = B_FALSE;
 147         dn->dn_zio = NULL;
 148         dn->dn_oldused = 0;
 149         dn->dn_oldflags = 0;
 150         dn->dn_olduid = 0;
 151         dn->dn_oldgid = 0;
 152         dn->dn_newuid = 0;
 153         dn->dn_newgid = 0;
 154         dn->dn_id_flags = 0;
 155 
 156         dn->dn_dbufs_count = 0;
 157         avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 158             offsetof(dmu_buf_impl_t, db_link));
 159 
 160         dn->dn_moved = 0;
 161         return (0);
 162 }
 163 
 164 /* ARGSUSED */
 165 static void
 166 dnode_dest(void *arg, void *unused)
 167 {
 168         int i;
 169         dnode_t *dn = arg;
 170 


 171         rw_destroy(&dn->dn_struct_rwlock);
 172         mutex_destroy(&dn->dn_mtx);
 173         mutex_destroy(&dn->dn_dbufs_mtx);
 174         cv_destroy(&dn->dn_notxholds);
 175         refcount_destroy(&dn->dn_holds);
 176         refcount_destroy(&dn->dn_tx_holds);
 177         ASSERT(!list_link_active(&dn->dn_link));
 178 
 179         for (i = 0; i < TXG_SIZE; i++) {
 180                 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 181                 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 182                 list_destroy(&dn->dn_dirty_records[i]);
 183                 ASSERT0(dn->dn_next_nblkptr[i]);
 184                 ASSERT0(dn->dn_next_nlevels[i]);
 185                 ASSERT0(dn->dn_next_indblkshift[i]);
 186                 ASSERT0(dn->dn_next_bonustype[i]);
 187                 ASSERT0(dn->dn_rm_spillblk[i]);
 188                 ASSERT0(dn->dn_next_bonuslen[i]);
 189                 ASSERT0(dn->dn_next_blksz[i]);
 190         }


 621 
 622 void
 623 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 624     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 625 {
 626         int nblkptr;
 627 
 628         ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 629         ASSERT3U(blocksize, <=,
 630             spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 631         ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 632         ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 633         ASSERT(tx->tx_txg != 0);
 634         ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 635             (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 636             (bonustype == DMU_OT_SA && bonuslen == 0));
 637         ASSERT(DMU_OT_IS_VALID(bonustype));
 638         ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 639 
 640         /* clean up any unreferenced dbufs */
 641         dnode_evict_dbufs(dn);
 642 
 643         dn->dn_id_flags = 0;
 644 
 645         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 646         dnode_setdirty(dn, tx);
 647         if (dn->dn_datablksz != blocksize) {
 648                 /* change blocksize */
 649                 ASSERT(dn->dn_maxblkid == 0 &&
 650                     (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 651                     dnode_block_freed(dn, 0)));
 652                 dnode_setdblksz(dn, blocksize);
 653                 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 654         }
 655         if (dn->dn_bonuslen != bonuslen)
 656                 dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 657 
 658         if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 659                 nblkptr = 1;
 660         else
 661                 nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);


1250          */
1251         ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1252 
1253         /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1254         if (refs == 0 && db != NULL) {
1255                 /*
1256                  * Another thread could add a hold to the dnode handle in
1257                  * dnode_hold_impl() while holding the parent dbuf. Since the
1258                  * hold on the parent dbuf prevents the handle from being
1259                  * destroyed, the hold on the handle is OK. We can't yet assert
1260                  * that the handle has zero references, but that will be
1261                  * asserted anyway when the handle gets destroyed.
1262                  */
1263                 dbuf_rele(db, dnh);
1264         }
1265 }
1266 
1267 void
1268 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1269 {
1270         objset_t *os = dn->dn_objset;
1271         uint64_t txg = tx->tx_txg;
1272 
1273         if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1274                 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1275                 return;
1276         }
1277 
1278         DNODE_VERIFY(dn);
1279 
1280 #ifdef ZFS_DEBUG
1281         mutex_enter(&dn->dn_mtx);
1282         ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1283         ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1284         mutex_exit(&dn->dn_mtx);
1285 #endif
1286 
1287         /*
1288          * Determine old uid/gid when necessary
1289          */


1308         ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1309 
1310         dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1311             dn->dn_object, txg);
1312 
1313         multilist_sublist_insert_head(mls, dn);
1314 
1315         multilist_sublist_unlock(mls);
1316 
1317         /*
1318          * The dnode maintains a hold on its containing dbuf as
1319          * long as there are holds on it.  Each instantiated child
1320          * dbuf maintains a hold on the dnode.  When the last child
1321          * drops its hold, the dnode will drop its hold on the
1322          * containing dbuf. We add a "dirty hold" here so that the
1323          * dnode will hang around after we finish processing its
1324          * children.
1325          */
1326         VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1327 
1328         (void) dbuf_dirty(dn->dn_dbuf, tx);
1329 
1330         dsl_dataset_dirty(os->os_dsl_dataset, tx);
1331 }
1332 
1333 void
1334 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1335 {
1336         mutex_enter(&dn->dn_mtx);
1337         if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1338                 mutex_exit(&dn->dn_mtx);
1339                 return;
1340         }
1341         dn->dn_free_txg = tx->tx_txg;
1342         mutex_exit(&dn->dn_mtx);
1343 
1344         dnode_setdirty(dn, tx);
1345 }
1346 
1347 /*
1348  * Try to change the block size for the indicated dnode.  This can only
1349  * succeed if there are no blocks allocated or dirty beyond first block


1397         dnode_setdirty(dn, tx);
1398         dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1399         if (ibs) {
1400                 dn->dn_indblkshift = ibs;
1401                 dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1402         }
1403         /* rele after we have fixed the blocksize in the dnode */
1404         if (db)
1405                 dbuf_rele(db, FTAG);
1406 
1407         rw_exit(&dn->dn_struct_rwlock);
1408         return (0);
1409 
1410 fail:
1411         rw_exit(&dn->dn_struct_rwlock);
1412         return (SET_ERROR(ENOTSUP));
1413 }
1414 
1415 /* read-holding callers must not rely on the lock being continuously held */
1416 void
1417 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)

1418 {
1419         uint64_t txgoff = tx->tx_txg & TXG_MASK;
1420         int epbs, new_nlevels;
1421         uint64_t sz;
1422 
1423         ASSERT(blkid != DMU_BONUS_BLKID);
1424 
1425         ASSERT(have_read ?
1426             RW_READ_HELD(&dn->dn_struct_rwlock) :
1427             RW_WRITE_HELD(&dn->dn_struct_rwlock));
1428 
1429         /*
1430          * if we have a read-lock, check to see if we need to do any work
1431          * before upgrading to a write-lock.
1432          */
1433         if (have_read) {
1434                 if (blkid <= dn->dn_maxblkid)
1435                         return;
1436 
1437                 if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {


1451         new_nlevels = 1;
1452         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1453         for (sz = dn->dn_nblkptr;
1454             sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1455                 new_nlevels++;
1456 
1457         if (new_nlevels > dn->dn_nlevels) {
1458                 int old_nlevels = dn->dn_nlevels;
1459                 dmu_buf_impl_t *db;
1460                 list_t *list;
1461                 dbuf_dirty_record_t *new, *dr, *dr_next;
1462 
1463                 dn->dn_nlevels = new_nlevels;
1464 
1465                 ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1466                 dn->dn_next_nlevels[txgoff] = new_nlevels;
1467 
1468                 /* dirty the left indirects */
1469                 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1470                 ASSERT(db != NULL);
1471                 new = dbuf_dirty(db, tx);
1472                 dbuf_rele(db, FTAG);
1473 
1474                 /* transfer the dirty records to the new indirect */
1475                 mutex_enter(&dn->dn_mtx);
1476                 mutex_enter(&new->dt.di.dr_mtx);
1477                 list = &dn->dn_dirty_records[txgoff];
1478                 for (dr = list_head(list); dr; dr = dr_next) {
1479                         dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1480                         if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1481                             dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1482                             dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1483                                 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1484                                 list_remove(&dn->dn_dirty_records[txgoff], dr);
1485                                 list_insert_tail(&new->dt.di.dr_children, dr);
1486                                 dr->dr_parent = new;
1487                         }
1488                 }
1489                 mutex_exit(&new->dt.di.dr_mtx);
1490                 mutex_exit(&dn->dn_mtx);
1491         }


1680                          * this block in syncing context, it will use
1681                          * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
1682                          * to the "failmode" property.  dnode_next_offset()
1683                          * doesn't have a flag to indicate MUSTSUCCEED.
1684                          */
1685                         if (err != 0)
1686                                 break;
1687 
1688                         dnode_dirty_l1(dn, i, tx);
1689                 }
1690         }
1691 
1692 done:
1693         /*
1694          * Add this range to the dnode range list.
1695          * We will finish up this free operation in the syncing phase.
1696          */
1697         mutex_enter(&dn->dn_mtx);
1698         int txgoff = tx->tx_txg & TXG_MASK;
1699         if (dn->dn_free_ranges[txgoff] == NULL) {
1700                 dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);

1701         }
1702         range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1703         range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1704         dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1705             blkid, nblks, tx->tx_txg);
1706         mutex_exit(&dn->dn_mtx);
1707 
1708         dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1709         dnode_setdirty(dn, tx);
1710 out:
1711 
1712         rw_exit(&dn->dn_struct_rwlock);
1713 }
1714 
1715 static boolean_t
1716 dnode_spill_freed(dnode_t *dn)
1717 {
1718         int i;
1719 
1720         mutex_enter(&dn->dn_mtx);


1979                     flags, offset, lvl, blkfill, txg);
1980         }
1981 
1982         /*
1983          * There's always a "virtual hole" at the end of the object, even
1984          * if all BP's which physically exist are non-holes.
1985          */
1986         if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
1987             minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
1988                 error = 0;
1989         }
1990 
1991         if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
1992             initial_offset < *offset : initial_offset > *offset))
1993                 error = SET_ERROR(ESRCH);
1994 out:
1995         if (!(flags & DNODE_FIND_HAVELOCK))
1996                 rw_exit(&dn->dn_struct_rwlock);
1997 
1998         return (error);
1999 }


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2017 RackTop Systems.
  28  */
  29 
  30 #include <sys/zfs_context.h>
  31 #include <sys/dbuf.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu.h>
  34 #include <sys/dmu_impl.h>
  35 #include <sys/dmu_tx.h>
  36 #include <sys/dmu_objset.h>
  37 #include <sys/dsl_dir.h>
  38 #include <sys/dsl_dataset.h>
  39 #include <sys/spa.h>
  40 #include <sys/zio.h>
  41 #include <sys/dmu_zfetch.h>
  42 #include <sys/range_tree.h>
  43 
  44 static void smartcomp_check_comp(dnode_smartcomp_t *sc);
  45 
  46 static kmem_cache_t *dnode_cache;
  47 /*
  48  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  49  * turned on when DEBUG is also defined.
  50  */
  51 #ifdef  DEBUG
  52 #define DNODE_STATS
  53 #endif  /* DEBUG */
  54 
  55 #ifdef  DNODE_STATS
  56 #define DNODE_STAT_ADD(stat)                    ((stat)++)
  57 #else
  58 #define DNODE_STAT_ADD(stat)                    /* nothing */
  59 #endif  /* DNODE_STATS */
  60 
  61 static dnode_phys_t dnode_phys_zero;
  62 
  63 int zfs_default_bs = SPA_MINBLOCKSHIFT;
  64 int zfs_default_ibs = DN_DFL_INDBLKSHIFT;
  65 
  66 #ifdef  _KERNEL
  67 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
  68 #endif  /* _KERNEL */
  69 
  70 static int
  71 dbuf_compare(const void *x1, const void *x2)
  72 {
  73         const dmu_buf_impl_t *d1 = x1;
  74         const dmu_buf_impl_t *d2 = x2;
  75 
  76         if (d1->db_level < d2->db_level) {
  77                 return (-1);
  78         }
  79         if (d1->db_level > d2->db_level) {
  80                 return (1);
  81         }
  82 
  83         if (d1->db_blkid < d2->db_blkid) {
  84                 return (-1);


 144         dn->dn_free_txg = 0;
 145         dn->dn_assigned_txg = 0;
 146         dn->dn_dirtyctx = 0;
 147         dn->dn_dirtyctx_firstset = NULL;
 148         dn->dn_bonus = NULL;
 149         dn->dn_have_spill = B_FALSE;
 150         dn->dn_zio = NULL;
 151         dn->dn_oldused = 0;
 152         dn->dn_oldflags = 0;
 153         dn->dn_olduid = 0;
 154         dn->dn_oldgid = 0;
 155         dn->dn_newuid = 0;
 156         dn->dn_newgid = 0;
 157         dn->dn_id_flags = 0;
 158 
 159         dn->dn_dbufs_count = 0;
 160         avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 161             offsetof(dmu_buf_impl_t, db_link));
 162 
 163         dn->dn_moved = 0;
 164 
 165         bzero(&dn->dn_smartcomp, sizeof (dn->dn_smartcomp));
 166         mutex_init(&dn->dn_smartcomp.sc_lock, NULL, MUTEX_DEFAULT, NULL);
 167 
 168         return (0);
 169 }
 170 
 171 /* ARGSUSED */
 172 static void
 173 dnode_dest(void *arg, void *unused)
 174 {
 175         int i;
 176         dnode_t *dn = arg;
 177 
 178         mutex_destroy(&dn->dn_smartcomp.sc_lock);
 179 
 180         rw_destroy(&dn->dn_struct_rwlock);
 181         mutex_destroy(&dn->dn_mtx);
 182         mutex_destroy(&dn->dn_dbufs_mtx);
 183         cv_destroy(&dn->dn_notxholds);
 184         refcount_destroy(&dn->dn_holds);
 185         refcount_destroy(&dn->dn_tx_holds);
 186         ASSERT(!list_link_active(&dn->dn_link));
 187 
 188         for (i = 0; i < TXG_SIZE; i++) {
 189                 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 190                 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 191                 list_destroy(&dn->dn_dirty_records[i]);
 192                 ASSERT0(dn->dn_next_nblkptr[i]);
 193                 ASSERT0(dn->dn_next_nlevels[i]);
 194                 ASSERT0(dn->dn_next_indblkshift[i]);
 195                 ASSERT0(dn->dn_next_bonustype[i]);
 196                 ASSERT0(dn->dn_rm_spillblk[i]);
 197                 ASSERT0(dn->dn_next_bonuslen[i]);
 198                 ASSERT0(dn->dn_next_blksz[i]);
 199         }


 630 
 631 void
 632 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 633     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 634 {
 635         int nblkptr;
 636 
 637         ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 638         ASSERT3U(blocksize, <=,
 639             spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 640         ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 641         ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 642         ASSERT(tx->tx_txg != 0);
 643         ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
 644             (bonustype != DMU_OT_NONE && bonuslen != 0) ||
 645             (bonustype == DMU_OT_SA && bonuslen == 0));
 646         ASSERT(DMU_OT_IS_VALID(bonustype));
 647         ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 648 
 649         /* clean up any unreferenced dbufs */
 650         dnode_evict_dbufs(dn, DBUF_EVICT_ALL);
 651 
 652         dn->dn_id_flags = 0;
 653 
 654         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 655         dnode_setdirty(dn, tx);
 656         if (dn->dn_datablksz != blocksize) {
 657                 /* change blocksize */
 658                 ASSERT(dn->dn_maxblkid == 0 &&
 659                     (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
 660                     dnode_block_freed(dn, 0)));
 661                 dnode_setdblksz(dn, blocksize);
 662                 dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 663         }
 664         if (dn->dn_bonuslen != bonuslen)
 665                 dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 666 
 667         if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 668                 nblkptr = 1;
 669         else
 670                 nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);


1259          */
1260         ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
1261 
1262         /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
1263         if (refs == 0 && db != NULL) {
1264                 /*
1265                  * Another thread could add a hold to the dnode handle in
1266                  * dnode_hold_impl() while holding the parent dbuf. Since the
1267                  * hold on the parent dbuf prevents the handle from being
1268                  * destroyed, the hold on the handle is OK. We can't yet assert
1269                  * that the handle has zero references, but that will be
1270                  * asserted anyway when the handle gets destroyed.
1271                  */
1272                 dbuf_rele(db, dnh);
1273         }
1274 }
1275 
1276 void
1277 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
1278 {
1279         dnode_setdirty_sc(dn, tx, B_TRUE);
1280 }
1281 
1282 void
1283 dnode_setdirty_sc(dnode_t *dn, dmu_tx_t *tx, boolean_t usesc)
1284 {
1285         objset_t *os = dn->dn_objset;
1286         uint64_t txg = tx->tx_txg;
1287 
1288         if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
1289                 dsl_dataset_dirty(os->os_dsl_dataset, tx);
1290                 return;
1291         }
1292 
1293         DNODE_VERIFY(dn);
1294 
1295 #ifdef ZFS_DEBUG
1296         mutex_enter(&dn->dn_mtx);
1297         ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
1298         ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
1299         mutex_exit(&dn->dn_mtx);
1300 #endif
1301 
1302         /*
1303          * Determine old uid/gid when necessary
1304          */


1323         ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
1324 
1325         dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
1326             dn->dn_object, txg);
1327 
1328         multilist_sublist_insert_head(mls, dn);
1329 
1330         multilist_sublist_unlock(mls);
1331 
1332         /*
1333          * The dnode maintains a hold on its containing dbuf as
1334          * long as there are holds on it.  Each instantiated child
1335          * dbuf maintains a hold on the dnode.  When the last child
1336          * drops its hold, the dnode will drop its hold on the
1337          * containing dbuf. We add a "dirty hold" here so that the
1338          * dnode will hang around after we finish processing its
1339          * children.
1340          */
1341         VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
1342 
1343         (void) dbuf_dirty_sc(dn->dn_dbuf, tx, usesc);

1344         dsl_dataset_dirty(os->os_dsl_dataset, tx);
1345 }
1346 
1347 void
1348 dnode_free(dnode_t *dn, dmu_tx_t *tx)
1349 {
1350         mutex_enter(&dn->dn_mtx);
1351         if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
1352                 mutex_exit(&dn->dn_mtx);
1353                 return;
1354         }
1355         dn->dn_free_txg = tx->tx_txg;
1356         mutex_exit(&dn->dn_mtx);
1357 
1358         dnode_setdirty(dn, tx);
1359 }
1360 
1361 /*
1362  * Try to change the block size for the indicated dnode.  This can only
1363  * succeed if there are no blocks allocated or dirty beyond first block


1411         dnode_setdirty(dn, tx);
1412         dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
1413         if (ibs) {
1414                 dn->dn_indblkshift = ibs;
1415                 dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
1416         }
1417         /* rele after we have fixed the blocksize in the dnode */
1418         if (db)
1419                 dbuf_rele(db, FTAG);
1420 
1421         rw_exit(&dn->dn_struct_rwlock);
1422         return (0);
1423 
1424 fail:
1425         rw_exit(&dn->dn_struct_rwlock);
1426         return (SET_ERROR(ENOTSUP));
1427 }
1428 
1429 /* read-holding callers must not rely on the lock being continuously held */
1430 void
1431 dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
1432     boolean_t usesc, boolean_t have_read)
1433 {
1434         uint64_t txgoff = tx->tx_txg & TXG_MASK;
1435         int epbs, new_nlevels;
1436         uint64_t sz;
1437 
1438         ASSERT(blkid != DMU_BONUS_BLKID);
1439 
1440         ASSERT(have_read ?
1441             RW_READ_HELD(&dn->dn_struct_rwlock) :
1442             RW_WRITE_HELD(&dn->dn_struct_rwlock));
1443 
1444         /*
1445          * if we have a read-lock, check to see if we need to do any work
1446          * before upgrading to a write-lock.
1447          */
1448         if (have_read) {
1449                 if (blkid <= dn->dn_maxblkid)
1450                         return;
1451 
1452                 if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {


1466         new_nlevels = 1;
1467         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1468         for (sz = dn->dn_nblkptr;
1469             sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
1470                 new_nlevels++;
1471 
1472         if (new_nlevels > dn->dn_nlevels) {
1473                 int old_nlevels = dn->dn_nlevels;
1474                 dmu_buf_impl_t *db;
1475                 list_t *list;
1476                 dbuf_dirty_record_t *new, *dr, *dr_next;
1477 
1478                 dn->dn_nlevels = new_nlevels;
1479 
1480                 ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
1481                 dn->dn_next_nlevels[txgoff] = new_nlevels;
1482 
1483                 /* dirty the left indirects */
1484                 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
1485                 ASSERT(db != NULL);
1486                 new = dbuf_dirty_sc(db, tx, usesc);
1487                 dbuf_rele(db, FTAG);
1488 
1489                 /* transfer the dirty records to the new indirect */
1490                 mutex_enter(&dn->dn_mtx);
1491                 mutex_enter(&new->dt.di.dr_mtx);
1492                 list = &dn->dn_dirty_records[txgoff];
1493                 for (dr = list_head(list); dr; dr = dr_next) {
1494                         dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
1495                         if (dr->dr_dbuf->db_level != new_nlevels-1 &&
1496                             dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
1497                             dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
1498                                 ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
1499                                 list_remove(&dn->dn_dirty_records[txgoff], dr);
1500                                 list_insert_tail(&new->dt.di.dr_children, dr);
1501                                 dr->dr_parent = new;
1502                         }
1503                 }
1504                 mutex_exit(&new->dt.di.dr_mtx);
1505                 mutex_exit(&dn->dn_mtx);
1506         }


1695                          * this block in syncing context, it will use
1696                          * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
1697                          * to the "failmode" property.  dnode_next_offset()
1698                          * doesn't have a flag to indicate MUSTSUCCEED.
1699                          */
1700                         if (err != 0)
1701                                 break;
1702 
1703                         dnode_dirty_l1(dn, i, tx);
1704                 }
1705         }
1706 
1707 done:
1708         /*
1709          * Add this range to the dnode range list.
1710          * We will finish up this free operation in the syncing phase.
1711          */
1712         mutex_enter(&dn->dn_mtx);
1713         int txgoff = tx->tx_txg & TXG_MASK;
1714         if (dn->dn_free_ranges[txgoff] == NULL) {
1715                 dn->dn_free_ranges[txgoff] =
1716                     range_tree_create(NULL, NULL, &dn->dn_mtx);
1717         }
1718         range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
1719         range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
1720         dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1721             blkid, nblks, tx->tx_txg);
1722         mutex_exit(&dn->dn_mtx);
1723 
1724         dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1725         dnode_setdirty(dn, tx);
1726 out:
1727 
1728         rw_exit(&dn->dn_struct_rwlock);
1729 }
1730 
1731 static boolean_t
1732 dnode_spill_freed(dnode_t *dn)
1733 {
1734         int i;
1735 
1736         mutex_enter(&dn->dn_mtx);


1995                     flags, offset, lvl, blkfill, txg);
1996         }
1997 
1998         /*
1999          * There's always a "virtual hole" at the end of the object, even
2000          * if all BP's which physically exist are non-holes.
2001          */
2002         if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
2003             minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
2004                 error = 0;
2005         }
2006 
2007         if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
2008             initial_offset < *offset : initial_offset > *offset))
2009                 error = SET_ERROR(ESRCH);
2010 out:
2011         if (!(flags & DNODE_FIND_HAVELOCK))
2012                 rw_exit(&dn->dn_struct_rwlock);
2013 
2014         return (error);
2015 }
2016 
2017 /*
2018  * When in the compressing phase, we check our results every 1 MiB. If
2019  * compression ratio drops below the threshold factor, we give up trying
2020  * to compress the file for a while. The length of the interval is
2021  * calculated from this interval value according to the algorithm in
2022  * smartcomp_check_comp.
2023  */
2024 uint64_t zfs_smartcomp_interval = 1 * 1024 * 1024;
2025 
2026 /*
2027  * Minimum compression factor is 12.5% (100% / factor) - below that we
2028  * consider compression to have failed.
2029  */
2030 uint64_t zfs_smartcomp_threshold_factor = 8;
2031 
2032 /*
2033  * Maximum power-of-2 exponent on the deny interval and consequently
2034  * the maximum number of compression successes and failures we track.
2035  * Successive compression failures extend the deny interval, whereas
2036  * repeated successes makes the algorithm more hesitant to start denying.
2037  */
2038 int64_t zfs_smartcomp_interval_exp = 5;
2039 
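The arithmetic these three defaults imply can be sanity-checked with a small stand-alone program (not part of the patch; the SC_* macros below are illustrative stand-ins for the tunables above):

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-ins for the defaults in this patch. */
#define	SC_INTERVAL		(1ULL << 20)	/* zfs_smartcomp_interval, 1 MiB */
#define	SC_THRESH_FACTOR	8		/* zfs_smartcomp_threshold_factor */
#define	SC_INTERVAL_EXP		5		/* zfs_smartcomp_interval_exp */

int
main(void)
{
	/*
	 * A 1 MiB sample "passes" only if compression saved at least
	 * orig/8, i.e. the compressed size is at most 87.5% of the original.
	 */
	uint64_t orig = SC_INTERVAL;
	uint64_t threshold = orig - orig / SC_THRESH_FACTOR;

	printf("pass if compressed <= %llu of %llu bytes (%.1f%%)\n",
	    (unsigned long long)threshold, (unsigned long long)orig,
	    100.0 * threshold / orig);

	/*
	 * Each consecutive failed check doubles the deny interval, up to
	 * 2^5 = 32 MiB (before the +-10% randomization applied in
	 * smartcomp_check_comp()).
	 */
	for (int f = 1; f <= SC_INTERVAL_EXP; f++)
		printf("failures=%d -> deny compression for %llu MiB\n",
		    f, (unsigned long long)((SC_INTERVAL << f) >> 20));
	return (0);
}

In other words, with the defaults a 1 MiB sample must shrink to 896 KiB or less to count as a success, and repeated failures push the deny window from 2 MiB out to 32 MiB.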
2040 /*
2041  * Callback invoked by the zio machinery when it wants to compress a data
2042  * block. If we are in the denying compression phase, we add the amount of
2043  * data written to our stats and check if we've denied enough data to
2044  * transition back in to the compression phase again.
2045  */
2046 boolean_t
2047 dnode_smartcomp_ask_cb(void *userinfo, const zio_t *zio)
2048 {
2049         dnode_t *dn = userinfo;
2050         dnode_smartcomp_t *sc;
2051         dnode_smartcomp_state_t old_state;
2052 
2053         ASSERT(dn != NULL);
2054 
2055         sc = &dn->dn_smartcomp;
2056         mutex_enter(&sc->sc_lock);
2057         old_state = sc->sc_state;
2058         if (sc->sc_state == DNODE_SMARTCOMP_DENYING) {
2059                 sc->sc_orig_size += zio->io_orig_size;
2060                 if (sc->sc_orig_size >= sc->sc_deny_interval) {
2061                         /* time to retry compression on next call */
2062                         sc->sc_state = DNODE_SMARTCOMP_COMPRESSING;
2063                         sc->sc_size = 0;
2064                         sc->sc_orig_size = 0;
2065                 }
2066         }
2067         mutex_exit(&sc->sc_lock);
2068 
2069         return (old_state != DNODE_SMARTCOMP_DENYING);
2070 }
2071 
2072 /*
2073  * Callback invoked after compression has been performed to allow us to
2074  * monitor compression performance. If we're in a compressing phase, we
2075  * add the uncompressed and compressed data volumes to our state counters
2076  * and see if we need to recheck compression performance in
2077  * smartcomp_check_comp.
2078  */
2079 void
2080 dnode_smartcomp_result_cb(void *userinfo, const zio_t *zio)
2081 {
2082         dnode_t *dn = userinfo;
2083         dnode_smartcomp_t *sc;
2084         uint64_t io_size = zio->io_size, io_orig_size = zio->io_orig_size;
2085 
2086         ASSERT(dn != NULL);
2087         sc = &dn->dn_smartcomp;
2088 
2089         if (io_orig_size == 0)
2090                 /* XXX: is this valid anyway? */
2091                 return;
2092 
2093         mutex_enter(&sc->sc_lock);
2094         if (sc->sc_state == DNODE_SMARTCOMP_COMPRESSING) {
2095                 /* add last block's compression performance to our stats */
2096                 sc->sc_size += io_size;
2097                 sc->sc_orig_size += io_orig_size;
2098                 /* time to recheck compression performance? */
2099                 if (sc->sc_orig_size >= zfs_smartcomp_interval)
2100                         smartcomp_check_comp(sc);
2101         }
2102         mutex_exit(&sc->sc_lock);
2103 }
2104 
2105 /*
2106  * This function checks whether the compression we've been getting is above
2107  * the threshold value. If it is, we decrement the sc_comp_failures counter
2108  * to indicate compression success. If it isn't we increment the same
2109  * counter and potentially start a compression deny phase.
2110  */
2111 static void
2112 smartcomp_check_comp(dnode_smartcomp_t *sc)
2113 {
2114         uint64_t threshold = sc->sc_orig_size -
2115             sc->sc_orig_size / zfs_smartcomp_threshold_factor;
2116 
2117         ASSERT(MUTEX_HELD(&sc->sc_lock));
2118         if (sc->sc_size > threshold) {
2119                 sc->sc_comp_failures =
2120                     MIN(sc->sc_comp_failures + 1, zfs_smartcomp_interval_exp);
2121                 if (sc->sc_comp_failures > 0) {
2122                         /* consistently getting too little compression, stop */
2123                         sc->sc_state = DNODE_SMARTCOMP_DENYING;
2124                         sc->sc_deny_interval =
2125                             zfs_smartcomp_interval << sc->sc_comp_failures;
2126                         /* randomize the interval by +-10% to avoid patterns */
2127                         sc->sc_deny_interval = (sc->sc_deny_interval -
2128                             (sc->sc_deny_interval / 10)) +
2129                             spa_get_random(sc->sc_deny_interval / 5 + 1);
2130                 }
2131         } else {
2132                 if (sc->sc_comp_failures > 0) {
2133                         /*
2134                          * We're biased for compression, so any success makes
2135                          * us forget the file's past incompressibility.
2136                          */
2137                         sc->sc_comp_failures = 0;
2138                 } else {
2139                         sc->sc_comp_failures = MAX(sc->sc_comp_failures - 1,
2140                             -zfs_smartcomp_interval_exp);
2141                 }
2142         }
2143         /* reset state counters */
2144         sc->sc_size = 0;
2145         sc->sc_orig_size = 0;
2146 }
2147 
2148 /*
2149  * Prepares a zio_smartcomp_info_t structure for passing to zio_write or
2150  * arc_write depending on whether smart compression should be applied to
2151  * the specified objset, dnode and buffer.
2152  */
2153 extern void
2154 dnode_setup_zio_smartcomp(dmu_buf_impl_t *db, zio_smartcomp_info_t *sc)
2155 {
2156         dnode_t *dn = DB_DNODE(db);
2157         objset_t *os = dn->dn_objset;
2158 
2159         /* Only do smart compression on user data of plain files. */
2160         if (dn->dn_type == DMU_OT_PLAIN_FILE_CONTENTS && db->db_level == 0 &&
2161             os->os_smartcomp_enabled && os->os_compress != ZIO_COMPRESS_OFF) {
2162                 sc->sc_ask = dnode_smartcomp_ask_cb;
2163                 sc->sc_result = dnode_smartcomp_result_cb;
2164                 sc->sc_userinfo = dn;
2165         } else {
2166                 /*
2167                  * Zeroing out the structure passed to zio_write will turn
2168                  * smart compression off.
2169                  */
2170                 bzero(sc, sizeof (*sc));
2171         }
2172 }
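Taken together, the dnode_smartcomp_* callbacks implement a back-off state machine. The following user-space sketch reproduces that heuristic for illustration only; the types and helper names are simplified stand-ins rather than the kernel API, and the +-10% deny-interval randomization is omitted:

#include <stdio.h>
#include <stdint.h>

#define	SC_INTERVAL		(1ULL << 20)	/* zfs_smartcomp_interval */
#define	SC_THRESH_FACTOR	8		/* zfs_smartcomp_threshold_factor */
#define	SC_INTERVAL_EXP		5		/* zfs_smartcomp_interval_exp */

typedef enum { COMPRESSING, DENYING } sc_state_t;

typedef struct {
	sc_state_t	state;
	int64_t		failures;
	uint64_t	size;		/* compressed bytes in current sample */
	uint64_t	orig_size;	/* uncompressed bytes in current sample */
	uint64_t	deny_interval;
} sc_t;

/* Mirrors smartcomp_check_comp(), minus the interval randomization. */
static void
check_comp(sc_t *sc)
{
	uint64_t threshold = sc->orig_size - sc->orig_size / SC_THRESH_FACTOR;

	if (sc->size > threshold) {
		if (++sc->failures > SC_INTERVAL_EXP)
			sc->failures = SC_INTERVAL_EXP;
		if (sc->failures > 0) {
			sc->state = DENYING;
			sc->deny_interval = SC_INTERVAL << sc->failures;
		}
	} else if (sc->failures > 0) {
		/* any success makes us forget past incompressibility */
		sc->failures = 0;
	} else {
		sc->failures--;
		if (sc->failures < -SC_INTERVAL_EXP)
			sc->failures = -SC_INTERVAL_EXP;
	}
	sc->size = sc->orig_size = 0;
}

/* One written block: the combined effect of the ask and result callbacks. */
static void
write_block(sc_t *sc, uint64_t orig, uint64_t compressed)
{
	if (sc->state == DENYING) {
		sc->orig_size += orig;
		if (sc->orig_size >= sc->deny_interval) {
			sc->state = COMPRESSING;
			sc->size = sc->orig_size = 0;
		}
		return;
	}
	sc->size += compressed;
	sc->orig_size += orig;
	if (sc->orig_size >= SC_INTERVAL)
		check_comp(sc);
}

int
main(void)
{
	sc_t sc = { COMPRESSING, 0, 0, 0, 0 };
	int attempted = 0;

	/* 512 barely-compressible 128K blocks: savings stay below 12.5%. */
	for (int i = 0; i < 512; i++) {
		if (sc.state == COMPRESSING)
			attempted++;
		write_block(&sc, 128 << 10, 127 << 10);
	}
	printf("compression attempted on %d of 512 blocks, state=%s, "
	    "failures=%lld\n", attempted,
	    sc.state == DENYING ? "DENYING" : "COMPRESSING",
	    (long long)sc.failures);
	return (0);
}

With these inputs the sketch ends up attempting compression on only 40 of the 512 blocks before backing off to the maximum 32 MiB deny interval.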