Print this page
Revert "8958 Update Intel ucode to 20180108 release"
This reverts commit 1adc3ffcd976ec0a34010cc7db08037a14c3ea4c.
NEX-15280 New default metadata block size is too large
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-15280 New default metadata block size is too large
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-5366 Race between unique_insert() and unique_remove() causes ZFS fsid change
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Dan Vatca <dan.vatca@gmail.com>
NEX-5058 WBC: Race between the purging of window and opening new one
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
NEX-2830 ZFS smart compression
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Gordon Ross <gordon.ross@nexenta.com>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing rm of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
NEX-3266 5630 stale bonus buffer in recycled dnode_t leads to data corruption
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Will Andrews <will@freebsd.org>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan Fields <dan.fields@nexenta.com>
SUP-507 Delete or truncate of large files delayed on datasets with small recordsize
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Ilya Usvyatsky <ilya.usvyatsky@nexenta.com>
Reviewed by: Tony Nguyen <tony.nguyen@nexenta.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

*** 18,27 **** --- 18,28 ---- * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 RackTop Systems. */
*** 38,47 **** --- 39,50 ---- #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu_zfetch.h> #include <sys/range_tree.h> + static void smartcomp_check_comp(dnode_smartcomp_t *sc); + static kmem_cache_t *dnode_cache; /* * Define DNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */
*** 56,66 **** #endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; ! int zfs_default_ibs = DN_MAX_INDBLKSHIFT; #ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ --- 59,69 ---- #endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; ! int zfs_default_ibs = DN_DFL_INDBLKSHIFT; #ifdef _KERNEL static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */
*** 156,165 **** --- 159,172 ---- dn->dn_dbufs_count = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; + + bzero(&dn->dn_smartcomp, sizeof (dn->dn_smartcomp)); + mutex_init(&dn->dn_smartcomp.sc_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); } /* ARGSUSED */ static void
*** 166,175 **** --- 173,184 ---- dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; + mutex_destroy(&dn->dn_smartcomp.sc_lock); + rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds);
*** 636,646 **** (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ ! dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); --- 645,655 ---- (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ ! dnode_evict_dbufs(dn, DBUF_EVICT_ALL); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx);
*** 1265,1274 **** --- 1274,1289 ---- } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { + dnode_setdirty_sc(dn, tx, B_TRUE); + } + + void + dnode_setdirty_sc(dnode_t *dn, dmu_tx_t *tx, boolean_t usesc) + { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx);
*** 1323,1334 **** * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); ! (void) dbuf_dirty(dn->dn_dbuf, tx); ! dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) --- 1338,1348 ---- * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); ! (void) dbuf_dirty_sc(dn->dn_dbuf, tx, usesc); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx)
*** 1412,1422 **** return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void ! dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; --- 1426,1437 ---- return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void ! dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, ! boolean_t usesc, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz;
*** 1466,1476 **** dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); ! new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); --- 1481,1491 ---- dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); ! new = dbuf_dirty_sc(db, tx, usesc); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx);
*** 1695,1705 **** * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { ! dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg); --- 1710,1721 ---- * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { ! dn->dn_free_ranges[txgoff] = ! range_tree_create(NULL, NULL, &dn->dn_mtx); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg);
*** 1994,1999 **** --- 2010,2172 ---- out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); + } + + /* + * When in the compressing phase, we check our results every 1 MiB. If + * compression ratio drops below the threshold factor, we give up trying + * to compress the file for a while. The length of the interval is + * calculated from this interval value according to the algorithm in + * smartcomp_check_comp. + */ + uint64_t zfs_smartcomp_interval = 1 * 1024 * 1024; + + /* + * Minimum compression factor is 12.5% (100% / factor) - below that we + * consider compression to have failed. + */ + uint64_t zfs_smartcomp_threshold_factor = 8; + + /* + * Maximum power-of-2 exponent on the deny interval and consequently + * the maximum number of compression successes and failures we track. + * Successive compression failures extend the deny interval, whereas + * repeated successes makes the algorithm more hesitant to start denying. + */ + int64_t zfs_smartcomp_interval_exp = 5; + + /* + * Callback invoked by the zio machinery when it wants to compress a data + * block. If we are in the denying compression phase, we add the amount of + * data written to our stats and check if we've denied enough data to + * transition back in to the compression phase again. 
+ */ + boolean_t + dnode_smartcomp_ask_cb(void *userinfo, const zio_t *zio) + { + dnode_t *dn = userinfo; + dnode_smartcomp_t *sc; + dnode_smartcomp_state_t old_state; + + ASSERT(dn != NULL); + + sc = &dn->dn_smartcomp; + mutex_enter(&sc->sc_lock); + old_state = sc->sc_state; + if (sc->sc_state == DNODE_SMARTCOMP_DENYING) { + sc->sc_orig_size += zio->io_orig_size; + if (sc->sc_orig_size >= sc->sc_deny_interval) { + /* time to retry compression on next call */ + sc->sc_state = DNODE_SMARTCOMP_COMPRESSING; + sc->sc_size = 0; + sc->sc_orig_size = 0; + } + } + mutex_exit(&sc->sc_lock); + + return (old_state != DNODE_SMARTCOMP_DENYING); + } + + /* + * Callback invoked after compression has been performed to allow us to + * monitor compression performance. If we're in a compressing phase, we + * add the uncompressed and compressed data volumes to our state counters + * and see if we need to recheck compression performance in + * smartcomp_check_comp. + */ + void + dnode_smartcomp_result_cb(void *userinfo, const zio_t *zio) + { + dnode_t *dn = userinfo; + dnode_smartcomp_t *sc; + uint64_t io_size = zio->io_size, io_orig_size = zio->io_orig_size; + + ASSERT(dn != NULL); + sc = &dn->dn_smartcomp; + + if (io_orig_size == 0) + /* XXX: is this valid anyway? */ + return; + + mutex_enter(&sc->sc_lock); + if (sc->sc_state == DNODE_SMARTCOMP_COMPRESSING) { + /* add last block's compression performance to our stats */ + sc->sc_size += io_size; + sc->sc_orig_size += io_orig_size; + /* time to recheck compression performance? */ + if (sc->sc_orig_size >= zfs_smartcomp_interval) + smartcomp_check_comp(sc); + } + mutex_exit(&sc->sc_lock); + } + + /* + * This function checks whether the compression we've been getting is above + * the threshold value. If it is, we decrement the sc_comp_failures counter + * to indicate compression success. If it isn't we increment the same + * counter and potentially start a compression deny phase. 
+ */ + static void + smartcomp_check_comp(dnode_smartcomp_t *sc) + { + uint64_t threshold = sc->sc_orig_size - + sc->sc_orig_size / zfs_smartcomp_threshold_factor; + + ASSERT(MUTEX_HELD(&sc->sc_lock)); + if (sc->sc_size > threshold) { + sc->sc_comp_failures = + MIN(sc->sc_comp_failures + 1, zfs_smartcomp_interval_exp); + if (sc->sc_comp_failures > 0) { + /* consistently getting too little compression, stop */ + sc->sc_state = DNODE_SMARTCOMP_DENYING; + sc->sc_deny_interval = + zfs_smartcomp_interval << sc->sc_comp_failures; + /* randomize the interval by +-10% to avoid patterns */ + sc->sc_deny_interval = (sc->sc_deny_interval - + (sc->sc_deny_interval / 10)) + + spa_get_random(sc->sc_deny_interval / 5 + 1); + } + } else { + if (sc->sc_comp_failures > 0) { + /* + * We're biased for compression, so any success makes + * us forget the file's past incompressibility. + */ + sc->sc_comp_failures = 0; + } else { + sc->sc_comp_failures = MAX(sc->sc_comp_failures - 1, + -zfs_smartcomp_interval_exp); + } + } + /* reset state counters */ + sc->sc_size = 0; + sc->sc_orig_size = 0; + } + + /* + * Prepares a zio_smartcomp_info_t structure for passing to zio_write or + * arc_write depending on whether smart compression should be applied to + * the specified objset, dnode and buffer. + */ + extern void + dnode_setup_zio_smartcomp(dmu_buf_impl_t *db, zio_smartcomp_info_t *sc) + { + dnode_t *dn = DB_DNODE(db); + objset_t *os = dn->dn_objset; + + /* Only do smart compression on user data of plain files. */ + if (dn->dn_type == DMU_OT_PLAIN_FILE_CONTENTS && db->db_level == 0 && + os->os_smartcomp_enabled && os->os_compress != ZIO_COMPRESS_OFF) { + sc->sc_ask = dnode_smartcomp_ask_cb; + sc->sc_result = dnode_smartcomp_result_cb; + sc->sc_userinfo = dn; + } else { + /* + * Zeroing out the structure passed to zio_write will turn + * smart compression off. + */ + bzero(sc, sizeof (*sc)); + } }