Revert "8958 Update Intel ucode to 20180108 release"
This reverts commit 1adc3ffcd976ec0a34010cc7db08037a14c3ea4c.
NEX-15280 New default metadata block size is too large
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-15280 New default metadata block size is too large
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-5366 Race between unique_insert() and unique_remove() causes ZFS fsid change
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Dan Vatca <dan.vatca@gmail.com>
NEX-5058 WBC: Race between purging a window and opening a new one
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
NEX-2830 ZFS smart compression
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance when deleting a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3266 5630 stale bonus buffer in recycled dnode_t leads to data corruption
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Will Andrews <will@freebsd.org>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan Fields <dan.fields@nexenta.com>
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 RackTop Systems.
  */

@@ -38,10 +39,12 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/range_tree.h>
 
+static void smartcomp_check_comp(dnode_smartcomp_t *sc);
+
 static kmem_cache_t *dnode_cache;
 /*
  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  * turned on when DEBUG is also defined.
  */

@@ -56,11 +59,11 @@
 #endif  /* DNODE_STATS */
 
 static dnode_phys_t dnode_phys_zero;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
-int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+int zfs_default_ibs = DN_DFL_INDBLKSHIFT;
 
 #ifdef  _KERNEL
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif  /* _KERNEL */
 

@@ -156,10 +159,14 @@
         dn->dn_dbufs_count = 0;
         avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
             offsetof(dmu_buf_impl_t, db_link));
 
         dn->dn_moved = 0;
+
+        bzero(&dn->dn_smartcomp, sizeof (dn->dn_smartcomp));
+        mutex_init(&dn->dn_smartcomp.sc_lock, NULL, MUTEX_DEFAULT, NULL);
+
         return (0);
 }
 
 /* ARGSUSED */
 static void

@@ -166,10 +173,12 @@
 dnode_dest(void *arg, void *unused)
 {
         int i;
         dnode_t *dn = arg;
 
+        mutex_destroy(&dn->dn_smartcomp.sc_lock);
+
         rw_destroy(&dn->dn_struct_rwlock);
         mutex_destroy(&dn->dn_mtx);
         mutex_destroy(&dn->dn_dbufs_mtx);
         cv_destroy(&dn->dn_notxholds);
         refcount_destroy(&dn->dn_holds);

@@ -636,11 +645,11 @@
             (bonustype == DMU_OT_SA && bonuslen == 0));
         ASSERT(DMU_OT_IS_VALID(bonustype));
         ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
         /* clean up any unreferenced dbufs */
-        dnode_evict_dbufs(dn);
+        dnode_evict_dbufs(dn, DBUF_EVICT_ALL);
 
         dn->dn_id_flags = 0;
 
         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
         dnode_setdirty(dn, tx);

@@ -1265,10 +1274,16 @@
 }
 
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
+        dnode_setdirty_sc(dn, tx, B_TRUE);
+}
+
+void
+dnode_setdirty_sc(dnode_t *dn, dmu_tx_t *tx, boolean_t usesc)
+{
         objset_t *os = dn->dn_objset;
         uint64_t txg = tx->tx_txg;
 
         if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
                 dsl_dataset_dirty(os->os_dsl_dataset, tx);

@@ -1323,12 +1338,11 @@
          * dnode will hang around after we finish processing its
          * children.
          */
         VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
 
-        (void) dbuf_dirty(dn->dn_dbuf, tx);
-
+        (void) dbuf_dirty_sc(dn->dn_dbuf, tx, usesc);
         dsl_dataset_dirty(os->os_dsl_dataset, tx);
 }
 
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)

@@ -1412,11 +1426,12 @@
         return (SET_ERROR(ENOTSUP));
 }
 
 /* read-holding callers must not rely on the lock being continuously held */
 void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
+    boolean_t usesc, boolean_t have_read)
 {
         uint64_t txgoff = tx->tx_txg & TXG_MASK;
         int epbs, new_nlevels;
         uint64_t sz;
 

@@ -1466,11 +1481,11 @@
                 dn->dn_next_nlevels[txgoff] = new_nlevels;
 
                 /* dirty the left indirects */
                 db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
                 ASSERT(db != NULL);
-                new = dbuf_dirty(db, tx);
+                new = dbuf_dirty_sc(db, tx, usesc);
                 dbuf_rele(db, FTAG);
 
                 /* transfer the dirty records to the new indirect */
                 mutex_enter(&dn->dn_mtx);
                 mutex_enter(&new->dt.di.dr_mtx);

@@ -1695,11 +1710,12 @@
          * We will finish up this free operation in the syncing phase.
          */
         mutex_enter(&dn->dn_mtx);
         int txgoff = tx->tx_txg & TXG_MASK;
         if (dn->dn_free_ranges[txgoff] == NULL) {
-                dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL);
+                dn->dn_free_ranges[txgoff] =
+                    range_tree_create(NULL, NULL, &dn->dn_mtx);
         }
         range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
         range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
         dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
             blkid, nblks, tx->tx_txg);

@@ -1994,6 +2010,163 @@
 out:
         if (!(flags & DNODE_FIND_HAVELOCK))
                 rw_exit(&dn->dn_struct_rwlock);
 
         return (error);
+}
+
+/*
+ * When in the compressing phase, we check our results every 1 MiB. If
+ * compression ratio drops below the threshold factor, we give up trying
+ * to compress the file for a while. The length of the interval is
+ * calculated from this interval value according to the algorithm in
+ * smartcomp_check_comp.
+ */
+uint64_t zfs_smartcomp_interval = 1 * 1024 * 1024;
+
+/*
+ * Minimum compression factor is 12.5% (100% / factor) - below that we
+ * consider compression to have failed.
+ */
+uint64_t zfs_smartcomp_threshold_factor = 8;
+
+/*
+ * Maximum power-of-2 exponent on the deny interval and consequently
+ * the maximum number of compression successes and failures we track.
+ * Successive compression failures extend the deny interval, whereas
+ * repeated successes makes the algorithm more hesitant to start denying.
+ */
+int64_t zfs_smartcomp_interval_exp = 5;
+
+/*
+ * Callback invoked by the zio machinery when it wants to compress a data
+ * block. If we are in the denying compression phase, we add the amount of
+ * data written to our stats and check if we've denied enough data to
+ * transition back in to the compression phase again.
+ */
+boolean_t
+dnode_smartcomp_ask_cb(void *userinfo, const zio_t *zio)
+{
+        dnode_t *dn = userinfo;
+        dnode_smartcomp_t *sc;
+        dnode_smartcomp_state_t old_state;
+
+        ASSERT(dn != NULL);
+
+        sc = &dn->dn_smartcomp;
+        mutex_enter(&sc->sc_lock);
+        old_state = sc->sc_state;
+        if (sc->sc_state == DNODE_SMARTCOMP_DENYING) {
+                sc->sc_orig_size += zio->io_orig_size;
+                if (sc->sc_orig_size >= sc->sc_deny_interval) {
+                        /* time to retry compression on next call */
+                        sc->sc_state = DNODE_SMARTCOMP_COMPRESSING;
+                        sc->sc_size = 0;
+                        sc->sc_orig_size = 0;
+                }
+        }
+        mutex_exit(&sc->sc_lock);
+
+        return (old_state != DNODE_SMARTCOMP_DENYING);
+}
+
+/*
+ * Callback invoked after compression has been performed to allow us to
+ * monitor compression performance. If we're in a compressing phase, we
+ * add the uncompressed and compressed data volumes to our state counters
+ * and see if we need to recheck compression performance in
+ * smartcomp_check_comp.
+ */
+void
+dnode_smartcomp_result_cb(void *userinfo, const zio_t *zio)
+{
+        dnode_t *dn = userinfo;
+        dnode_smartcomp_t *sc;
+        uint64_t io_size = zio->io_size, io_orig_size = zio->io_orig_size;
+
+        ASSERT(dn != NULL);
+        sc = &dn->dn_smartcomp;
+
+        if (io_orig_size == 0)
+                /* XXX: is this valid anyway? */
+                return;
+
+        mutex_enter(&sc->sc_lock);
+        if (sc->sc_state == DNODE_SMARTCOMP_COMPRESSING) {
+                /* add last block's compression performance to our stats */
+                sc->sc_size += io_size;
+                sc->sc_orig_size += io_orig_size;
+                /* time to recheck compression performance? */
+                if (sc->sc_orig_size >= zfs_smartcomp_interval)
+                        smartcomp_check_comp(sc);
+        }
+        mutex_exit(&sc->sc_lock);
+}
+
+/*
+ * This function checks whether the compression we've been getting is above
+ * the threshold value. If it is, we decrement the sc_comp_failures counter
+ * to indicate compression success. If it isn't we increment the same
+ * counter and potentially start a compression deny phase.
+ */
+static void
+smartcomp_check_comp(dnode_smartcomp_t *sc)
+{
+        uint64_t threshold = sc->sc_orig_size -
+            sc->sc_orig_size / zfs_smartcomp_threshold_factor;
+
+        ASSERT(MUTEX_HELD(&sc->sc_lock));
+        if (sc->sc_size > threshold) {
+                sc->sc_comp_failures =
+                    MIN(sc->sc_comp_failures + 1, zfs_smartcomp_interval_exp);
+                if (sc->sc_comp_failures > 0) {
+                        /* consistently getting too little compression, stop */
+                        sc->sc_state = DNODE_SMARTCOMP_DENYING;
+                        sc->sc_deny_interval =
+                            zfs_smartcomp_interval << sc->sc_comp_failures;
+                        /* randomize the interval by +-10% to avoid patterns */
+                        sc->sc_deny_interval = (sc->sc_deny_interval -
+                            (sc->sc_deny_interval / 10)) +
+                            spa_get_random(sc->sc_deny_interval / 5 + 1);
+                }
+        } else {
+                if (sc->sc_comp_failures > 0) {
+                        /*
+                         * We're biased for compression, so any success makes
+                         * us forget the file's past incompressibility.
+                         */
+                        sc->sc_comp_failures = 0;
+                } else {
+                        sc->sc_comp_failures = MAX(sc->sc_comp_failures - 1,
+                            -zfs_smartcomp_interval_exp);
+                }
+        }
+        /* reset state counters */
+        sc->sc_size = 0;
+        sc->sc_orig_size = 0;
+}
+
+/*
+ * Prepares a zio_smartcomp_info_t structure for passing to zio_write or
+ * arc_write depending on whether smart compression should be applied to
+ * the specified objset, dnode and buffer.
+ */
+extern void
+dnode_setup_zio_smartcomp(dmu_buf_impl_t *db, zio_smartcomp_info_t *sc)
+{
+        dnode_t *dn = DB_DNODE(db);
+        objset_t *os = dn->dn_objset;
+
+        /* Only do smart compression on user data of plain files. */
+        if (dn->dn_type == DMU_OT_PLAIN_FILE_CONTENTS && db->db_level == 0 &&
+            os->os_smartcomp_enabled && os->os_compress != ZIO_COMPRESS_OFF) {
+                sc->sc_ask = dnode_smartcomp_ask_cb;
+                sc->sc_result = dnode_smartcomp_result_cb;
+                sc->sc_userinfo = dn;
+        } else {
+                /*
+                 * Zeroing out the structure passed to zio_write will turn
+                 * smart compression off.
+                 */
+                bzero(sc, sizeof (*sc));
+        }
 }