NEX-19083 backport OS-7314 zil_commit should omit cache thrash
9962 zil_commit should omit cache thrash
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
NEX-9752 backport illumos 6950 ARC should cache compressed data
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
6950 ARC should cache compressed data
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Don Brady <don.brady@intel.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
5269 zpool import slow
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */

@@ -89,11 +90,11 @@
 int zfs_commit_timeout_pct = 5;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
-int zil_replay_disable = 0;
+int zil_replay_disable = 0;    /* disable intent logging replay */
 
 /*
  * Tunable parameter for debugging or performance analysis.  Setting
  * zfs_nocacheflush will cause corruption on power loss if a volatile
  * out-of-order write cache is enabled.

@@ -516,11 +517,11 @@
         ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
         ASSERT3P(lwb->lwb_write_zio, ==, NULL);
         ASSERT3P(lwb->lwb_root_zio, ==, NULL);
         ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
         ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
-            lwb->lwb_state == LWB_STATE_DONE);
+            lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 
         /*
          * Clear the zilog's field to indicate this lwb is no longer
          * valid, and prevent use-after-free errors.
          */

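This hunk and those that follow split the old LWB_STATE_DONE into two states, so that completion of an lwb's write zio and completion of its cache flushes can be tracked separately. The resulting life cycle (a sketch; the authoritative definition is the lwb_state_t in sys/zil_impl.h) is approximately:

    typedef enum {
            LWB_STATE_CLOSED,       /* allocated, zios not yet created */
            LWB_STATE_OPENED,       /* zios created, itxs may be assigned */
            LWB_STATE_ISSUED,       /* write zio issued to disk */
            LWB_STATE_WRITE_DONE,   /* write done, flush possibly deferred */
            LWB_STATE_FLUSH_DONE    /* flushes done, waiters may be woken */
    } lwb_state_t;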
@@ -874,11 +875,12 @@
         mutex_enter(&zcw->zcw_lock);
         ASSERT(!list_link_active(&zcw->zcw_node));
         ASSERT3P(zcw->zcw_lwb, ==, NULL);
         ASSERT3P(lwb, !=, NULL);
         ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
-            lwb->lwb_state == LWB_STATE_ISSUED);
+            lwb->lwb_state == LWB_STATE_ISSUED ||
+            lwb->lwb_state == LWB_STATE_WRITE_DONE);
 
         list_insert_tail(&lwb->lwb_waiters, zcw);
         zcw->zcw_lwb = lwb;
         mutex_exit(&zcw->zcw_lock);
 }

@@ -920,20 +922,60 @@
                 }
         }
         mutex_exit(&lwb->lwb_vdev_lock);
 }
 
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+        avl_tree_t *src = &lwb->lwb_vdev_tree;
+        avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+        void *cookie = NULL;
+        zil_vdev_node_t *zv;
+
+        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+        ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+        ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+        /*
+         * The 'lwb' is at a point in its lifetime where its lwb_vdev_tree no
+         * longer needs the protection of lwb_vdev_lock (it will only be
+         * modified while holding zilog->zl_lock), as its writes and those of
+         * its children have all completed.  The younger 'nlwb', however, may
+         * still be waiting on future writes to additional vdevs.
+         */
+        mutex_enter(&nlwb->lwb_vdev_lock);
+        /*
+         * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+         * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+         */
+        while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+                avl_index_t where;
+
+                if (avl_find(dst, zv, &where) == NULL) {
+                        avl_insert(dst, zv, where);
+                } else {
+                        kmem_free(zv, sizeof (*zv));
+                }
+        }
+        mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
         lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 }
 
 /*
- * This function is a called after all VDEVs associated with a given lwb
+ * This function is called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zfs_nocacheflush" is set.
+ * as the lwb write completes, if "zfs_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via the zio dependencies
+ * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
  * any sudden loss of power. At this point, any threads waiting for the
  * lwb to reach this state are signalled, and the "waiter" structures

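zil_lwb_flush_defer() above drains the source tree with avl_destroy_nodes() and moves each node into the destination unless an equal node already exists there, freeing the duplicates. Below is a minimal userland sketch of the same merge idiom, assuming the illumos AVL library (the node_t type and vdev numbers are invented for illustration; compile with -lavl):

    #include <sys/avl.h>
    #include <stddef.h>
    #include <stdlib.h>
    #include <stdio.h>

    typedef struct {
            uint64_t n_vdev;        /* stand-in for zv_vdev */
            avl_node_t n_node;
    } node_t;

    static int
    node_cmp(const void *a, const void *b)
    {
            uint64_t va = ((const node_t *)a)->n_vdev;
            uint64_t vb = ((const node_t *)b)->n_vdev;

            return (va < vb ? -1 : (va > vb ? 1 : 0));
    }

    static void
    tree_add(avl_tree_t *t, uint64_t v)
    {
            node_t *n = malloc(sizeof (*n));

            n->n_vdev = v;
            avl_add(t, n);
    }

    int
    main(void)
    {
            avl_tree_t src, dst;
            void *cookie = NULL;
            node_t *n;

            avl_create(&src, node_cmp, sizeof (node_t),
                offsetof(node_t, n_node));
            avl_create(&dst, node_cmp, sizeof (node_t),
                offsetof(node_t, n_node));
            tree_add(&src, 1); tree_add(&src, 2); tree_add(&src, 3);
            tree_add(&dst, 2); tree_add(&dst, 4);

            /* Same loop shape as zil_lwb_flush_defer(). */
            while ((n = avl_destroy_nodes(&src, &cookie)) != NULL) {
                    avl_index_t where;

                    if (avl_find(&dst, n, &where) == NULL)
                            avl_insert(&dst, n, where);     /* moved */
                    else
                            free(n);                        /* duplicate */
            }
            printf("dst now tracks %lu vdevs\n", avl_numnodes(&dst));

            /* Prints "dst now tracks 4 vdevs": {2,4} plus {1,3}. */
            cookie = NULL;
            while ((n = avl_destroy_nodes(&dst, &cookie)) != NULL)
                    free(n);
            avl_destroy(&src);
            avl_destroy(&dst);
            return (0);
    }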
@@ -966,12 +1008,14 @@
 
         ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
         zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
 
         lwb->lwb_root_zio = NULL;
-        lwb->lwb_state = LWB_STATE_DONE;
 
+        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+        lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
         if (zilog->zl_last_lwb_opened == lwb) {
                 /*
                  * Remember the highest committed log sequence number
                  * for ztest. We only update this value when all the log
                  * writes succeeded, because ztest wants to ASSERT that

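Once an lwb reaches LWB_STATE_FLUSH_DONE, this callback marks each attached zil_commit_waiter_t done and broadcasts its condition variable, releasing the thread blocked in zil_commit(). A userland model of that handshake, using POSIX threads rather than the kernel's cv_* API (the toy_waiter_t fields loosely mirror zcw_lock, zcw_cv and zcw_done; not ZIL code):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    typedef struct {
            pthread_mutex_t zcw_lock;
            pthread_cond_t zcw_cv;
            int zcw_done;
    } toy_waiter_t;

    static void *
    flush_done(void *arg)
    {
            toy_waiter_t *zcw = arg;

            usleep(10000);          /* pretend the cache flush takes time */
            pthread_mutex_lock(&zcw->zcw_lock);
            zcw->zcw_done = 1;      /* lwb reached "FLUSH_DONE" */
            pthread_cond_broadcast(&zcw->zcw_cv);
            pthread_mutex_unlock(&zcw->zcw_lock);
            return (NULL);
    }

    int
    main(void)
    {
            toy_waiter_t zcw = {
                    PTHREAD_MUTEX_INITIALIZER,
                    PTHREAD_COND_INITIALIZER,
                    0
            };
            pthread_t tid;

            pthread_create(&tid, NULL, flush_done, &zcw);
            pthread_mutex_lock(&zcw.zcw_lock);
            while (!zcw.zcw_done)   /* wait loop, as in zil_commit_waiter() */
                    pthread_cond_wait(&zcw.zcw_cv, &zcw.zcw_lock);
            pthread_mutex_unlock(&zcw.zcw_lock);
            pthread_join(tid, NULL);
            printf("waiter woken after flush\n");
            return (0);
    }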
@@ -1007,18 +1051,21 @@
          */
         dmu_tx_commit(tx);
 }
 
 /*
- * This is called when an lwb write completes. This means, this specific
- * lwb was written to disk, and all dependent lwb have also been
- * written to disk.
- *
- * At this point, a DKIOCFLUSHWRITECACHE command hasn't been issued to
- * the VDEVs involved in writing out this specific lwb. The lwb will be
- * "done" once zil_lwb_flush_vdevs_done() is called, which occurs in the
- * zio completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
         lwb_t *lwb = zio->io_private;

@@ -1025,10 +1072,11 @@
         spa_t *spa = zio->io_spa;
         zilog_t *zilog = lwb->lwb_zilog;
         avl_tree_t *t = &lwb->lwb_vdev_tree;
         void *cookie = NULL;
         zil_vdev_node_t *zv;
+        lwb_t *nlwb;
 
         ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
         ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
         ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);

@@ -1038,14 +1086,15 @@
         ASSERT(!BP_IS_HOLE(zio->io_bp));
         ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
         abd_put(zio->io_abd);
 
-        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
-
         mutex_enter(&zilog->zl_lock);
+        ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+        lwb->lwb_state = LWB_STATE_WRITE_DONE;
         lwb->lwb_write_zio = NULL;
+        nlwb = list_next(&zilog->zl_lwb_list, lwb);
         mutex_exit(&zilog->zl_lock);
 
         if (avl_numnodes(t) == 0)
                 return;
 

@@ -1060,18 +1109,106 @@
                 while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
                         kmem_free(zv, sizeof (*zv));
                 return;
         }
 
+        /*
+         * If this lwb does not have any threads waiting for it to
+         * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+         * command to the vdevs written to by "this" lwb, and instead
+         * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+         * command for those vdevs. Thus, we merge the vdev tree of
+         * "this" lwb with the vdev tree of the "next" lwb in the list,
+         * and assume the "next" lwb will handle flushing the vdevs (or
+         * deferring the flush(es) again).
+         *
+         * This is a useful performance optimization, especially for
+         * workloads with lots of async write activity and little sync
+         * write and/or fsync activity, as it has the potential to
+         * coalesce multiple flush commands to a vdev into one.
+         */
+        if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+                zil_lwb_flush_defer(lwb, nlwb);
+                ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+                return;
+        }
+
         while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
                 vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
                 if (vd != NULL)
                         zio_flush(lwb->lwb_root_zio, vd);
                 kmem_free(zv, sizeof (*zv));
         }
 }
 
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+        lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+        ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+        ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+        /*
+         * The zilog's "zl_last_lwb_opened" field is used to build the
+         * lwb/zio dependency chain, which is used to preserve the
+         * ordering of lwb completions that is required by the semantics
+         * of the ZIL. Each new lwb zio becomes a parent of the
+         * "previous" lwb zio, such that the new lwb's zio cannot
+         * complete until the "previous" lwb's zio completes.
+         *
+         * This is required by the semantics of zil_commit(); the commit
+         * waiters attached to the lwbs will be woken in the lwb zio's
+         * completion callback, so this zio dependency graph ensures the
+         * waiters are woken in the correct order (the same order the
+         * lwbs were created).
+         */
+        if (last_lwb_opened != NULL &&
+            last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+                ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+                    last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+                    last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+                ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+                zio_add_child(lwb->lwb_root_zio,
+                    last_lwb_opened->lwb_root_zio);
+
+                /*
+                 * If the previous lwb's write hasn't already completed,
+                 * we also want to order the completion of the lwb write
+                 * zios (above, we only order the completion of the lwb
+                 * root zios). This is required because of how we can
+                 * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+                 *
+                 * When the DKIOCFLUSHWRITECACHE commands are deferred,
+                 * the previous lwb will rely on this lwb to flush the
+                 * vdevs written to by that previous lwb. Thus, we need
+                 * to ensure this lwb doesn't issue the flush until
+                 * after the previous lwb's write completes. We ensure
+                 * this ordering by setting the zio parent/child
+                 * relationship here.
+                 *
+                 * Without this relationship on the lwb's write zio,
+                 * it's possible for this lwb's write to complete prior
+                 * to the previous lwb's write completing; and thus, the
+                 * vdevs for the previous lwb would be flushed prior to
+                 * that lwb's data being written to those vdevs (the
+                 * vdevs are flushed in the lwb write zio's completion
+                 * handler, zil_lwb_write_done()).
+                 */
+                if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+                        ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+                            last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+                        ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+                        zio_add_child(lwb->lwb_write_zio,
+                            last_lwb_opened->lwb_write_zio);
+                }
+        }
+}
+
 /*
  * This function's purpose is to "open" an lwb such that it is ready to
  * accept new itxs being committed to it. To do this, the lwb's zio
  * structures are created, and linked to the lwb. This function is
  * idempotent; if the passed in lwb has already been opened, this

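To see why the deferral in zil_lwb_write_done() pays off, consider a chain of lwbs in which only the last one has a commit waiter: each intermediate lwb donates its dirty-vdev set to its successor, and the accumulated set is flushed once at the end. A toy userland model of that accounting (invented workload numbers, not ZFS code):

    #include <stdio.h>

    #define NVDEVS  4
    #define NLWBS   6

    typedef struct {
            int pending[NVDEVS];    /* vdevs with unflushed writes */
            int has_waiter;         /* is a zil_commit() thread waiting? */
    } toy_lwb_t;

    int
    main(void)
    {
            /* Five async lwbs followed by one with an fsync waiter. */
            toy_lwb_t lwbs[NLWBS] = {
                    { { 1, 0, 0, 0 }, 0 },
                    { { 0, 1, 0, 0 }, 0 },
                    { { 1, 1, 0, 0 }, 0 },
                    { { 0, 0, 1, 0 }, 0 },
                    { { 1, 0, 0, 1 }, 0 },
                    { { 0, 1, 0, 0 }, 1 },
            };
            int flushes = 0, naive = 0;

            /* Without deferral: every lwb flushes every vdev it wrote. */
            for (int i = 0; i < NLWBS; i++)
                    for (int v = 0; v < NVDEVS; v++)
                            naive += lwbs[i].pending[v];

            /* With deferral: waiter-less lwbs donate their vdev set. */
            for (int i = 0; i < NLWBS; i++) {
                    if (!lwbs[i].has_waiter && i + 1 < NLWBS) {
                            for (int v = 0; v < NVDEVS; v++) {
                                    lwbs[i + 1].pending[v] |=
                                        lwbs[i].pending[v];
                                    lwbs[i].pending[v] = 0;
                            }
                            continue;
                    }
                    /* One DKIOCFLUSHWRITECACHE per distinct dirty vdev. */
                    for (int v = 0; v < NVDEVS; v++) {
                            if (lwbs[i].pending[v]) {
                                    flushes++;
                                    lwbs[i].pending[v] = 0;
                            }
                    }
            }
            printf("flushes issued: %d (vs %d without deferral)\n",
                flushes, naive);        /* prints 4 vs 8 here */
            return (0);
    }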
@@ -1112,37 +1249,12 @@
                 ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 
                 lwb->lwb_state = LWB_STATE_OPENED;
 
                 mutex_enter(&zilog->zl_lock);
-
-                /*
-                 * The zilog's "zl_last_lwb_opened" field is used to
-                 * build the lwb/zio dependency chain, which is used to
-                 * preserve the ordering of lwb completions that is
-                 * required by the semantics of the ZIL. Each new lwb
-                 * zio becomes a parent of the "previous" lwb zio, such
-                 * that the new lwb's zio cannot complete until the
-                 * "previous" lwb's zio completes.
-                 *
-                 * This is required by the semantics of zil_commit();
-                 * the commit waiters attached to the lwbs will be woken
-                 * in the lwb zio's completion callback, so this zio
-                 * dependency graph ensures the waiters are woken in the
-                 * correct order (the same order the lwbs were created).
-                 */
-                lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
-                if (last_lwb_opened != NULL &&
-                    last_lwb_opened->lwb_state != LWB_STATE_DONE) {
-                        ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
-                            last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
-                        ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
-                        zio_add_child(lwb->lwb_root_zio,
-                            last_lwb_opened->lwb_root_zio);
-                }
+                zil_lwb_set_zio_dependency(zilog, lwb);
                 zilog->zl_last_lwb_opened = lwb;
-
                 mutex_exit(&zilog->zl_lock);
         }
 
         ASSERT3P(lwb->lwb_root_zio, !=, NULL);
         ASSERT3P(lwb->lwb_write_zio, !=, NULL);

@@ -1206,17 +1318,26 @@
          */
 
         tx = dmu_tx_create(zilog->zl_os);
 
         /*
-         * Since we are not going to create any new dirty data, and we
-         * can even help with clearing the existing dirty data, we
-         * should not be subject to the dirty data based delays. We
-         * use TXG_NOTHROTTLE to bypass the delay mechanism.
+         * Since we are not going to create any new dirty data and we can even
+         * help with clearing the existing dirty data, we should not be subject
+         * to the dirty data based delays.
+         * We (ab)use TXG_WAITED to bypass the delay mechanism.
+         * One side effect of using TXG_WAITED is that dmu_tx_assign() can
+         * fail if the pool is suspended.  That is a dire circumstance,
+         * so we return NULL to signal that the normal ZIL processing is not
+         * possible and txg_wait_synced() should be used to ensure that the data
+         * is on disk.
          */
-        VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
-
+        error = dmu_tx_assign(tx, TXG_WAITED);
+        if (error != 0) {
+                ASSERT3S(error, ==, EIO);
+                dmu_tx_abort(tx);
+                return (NULL);
+        }
         dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
         txg = dmu_tx_get_txg(tx);
 
         lwb->lwb_tx = tx;
 

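For context, the error path added above follows the standard DMU transaction contract: a tx that fails dmu_tx_assign() must be released with dmu_tx_abort(), never dmu_tx_commit(). The general consumer shape, as a simplified fragment (the dmu_tx_hold_*() calls and txg_how flag vary by caller):

    dmu_tx_t *tx = dmu_tx_create(os);
    /* dmu_tx_hold_*() calls declare the intended modifications here. */
    int error = dmu_tx_assign(tx, txg_how);
    if (error != 0) {
            dmu_tx_abort(tx);       /* unassigned tx: abort, never commit */
            return (error);
    }
    /* ... apply the modifications under the assigned open txg ... */
    dmu_tx_commit(tx);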
@@ -1838,11 +1959,12 @@
                         break;
 
                 mutex_enter(&zilog->zl_lock);
 
                 lwb_t *last_lwb = zilog->zl_last_lwb_opened;
-                if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_DONE) {
+                if (last_lwb == NULL ||
+                    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
                         /*
                          * All of the itxs this waiter was waiting on
                          * must have already completed (or there were
                          * never any itx's for it to wait on), so it's
                          * safe to skip this waiter and mark it done.

@@ -1919,11 +2041,12 @@
         lwb = list_tail(&zilog->zl_lwb_list);
         if (lwb == NULL) {
                 lwb = zil_create(zilog);
         } else {
                 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
         }
 
         while (itx = list_head(&zilog->zl_itx_commit_list)) {
                 lr_t *lrc = &itx->itx_lr;
                 uint64_t txg = lrc->lrc_txg;

@@ -2021,11 +2144,12 @@
                 }
         } else {
                 ASSERT(list_is_empty(&nolwb_waiters));
                 ASSERT3P(lwb, !=, NULL);
                 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
-                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_DONE);
+                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+                ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
                 /*
                  * At this point, the ZIL block pointed at by the "lwb"
                  * variable is in one of the following states: "closed"
                  * or "open".

@@ -2142,11 +2266,12 @@
          * point of this function is to issue the lwb). Additionally, we
          * do this prior to acquiring the zl_issuer_lock, to avoid
          * acquiring it when it's not necessary to do so.
          */
         if (lwb->lwb_state == LWB_STATE_ISSUED ||
-            lwb->lwb_state == LWB_STATE_DONE)
+            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+            lwb->lwb_state == LWB_STATE_FLUSH_DONE)
                 return;
 
         /*
          * In order to call zil_lwb_write_issue() we must hold the
          * zilog's "zl_issuer_lock". We can't simply acquire that lock,

@@ -2190,11 +2315,12 @@
          *
          * See the comment above the lwb_state_t structure definition for
          * more details on the lwb states, and locking requirements.
          */
         if (lwb->lwb_state == LWB_STATE_ISSUED ||
-            lwb->lwb_state == LWB_STATE_DONE)
+            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+            lwb->lwb_state == LWB_STATE_FLUSH_DONE)
                 goto out;
 
         ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 
         /*

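Read together, this hunk and the previous one update the two halves of the same optimistic pattern on the lwb-issuing path: an unlocked fast-path test of lwb_state, then a recheck after taking zl_issuer_lock, since another thread may have issued the lwb in between. Condensed from the two hunks above (a schematic fragment, not the verbatim function):

    /* Fast path: nothing to issue if the lwb is already past OPENED. */
    if (lwb->lwb_state == LWB_STATE_ISSUED ||
        lwb->lwb_state == LWB_STATE_WRITE_DONE ||
        lwb->lwb_state == LWB_STATE_FLUSH_DONE)
            return;

    mutex_enter(&zilog->zl_issuer_lock);

    /* Recheck under the lock: the state may have advanced meanwhile. */
    if (lwb->lwb_state == LWB_STATE_ISSUED ||
        lwb->lwb_state == LWB_STATE_WRITE_DONE ||
        lwb->lwb_state == LWB_STATE_FLUSH_DONE)
            goto out;

    ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
    /* ... issue the lwb ... */
    out:
            mutex_exit(&zilog->zl_issuer_lock);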
@@ -2363,11 +2489,12 @@
                          * is required.
                          */
 
                         IMPLY(lwb != NULL,
                             lwb->lwb_state == LWB_STATE_ISSUED ||
-                            lwb->lwb_state == LWB_STATE_DONE);
+                            lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+                            lwb->lwb_state == LWB_STATE_FLUSH_DONE);
                         cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
                 }
         }
 
         mutex_exit(&zcw->zcw_lock);

@@ -3005,17 +3132,17 @@
          * We need to use zil_commit_impl to ensure we wait for all
          * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
          * to disk before proceeding. If we used zil_commit instead, it
          * would just call txg_wait_synced(), because zl_suspend is set.
          * txg_wait_synced() doesn't wait for these lwb's to be
-         * LWB_STATE_DONE before returning.
+         * LWB_STATE_FLUSH_DONE before returning.
          */
         zil_commit_impl(zilog, 0);
 
         /*
-         * Now that we've ensured all lwb's are LWB_STATE_DONE, we use
-         * txg_wait_synced() to ensure the data from the zilog has
+         * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+         * use txg_wait_synced() to ensure the data from the zilog has
          * migrated to the main pool before calling zil_destroy().
          */
         txg_wait_synced(zilog->zl_dmu_pool, 0);
 
         zil_destroy(zilog, B_FALSE);

@@ -3215,11 +3342,11 @@
         return (B_FALSE);
 }
 
 /* ARGSUSED */
 int
-zil_reset(const char *osname, void *arg)
+zil_vdev_offline(const char *osname, void *arg)
 {
         int error;
 
         error = zil_suspend(osname, NULL);
         if (error != 0)