Print this page
NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
    9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
    Reviewed by: Matt Ahrens <matt@delphix.com>
    Reviewed by: Brad Lewis <brad.lewis@delphix.com>
    Reviewed by: Andriy Gapon <avg@FreeBSD.org>
    Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
        
@@ -20,10 +20,11 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/txg_impl.h>
 #include <sys/dmu_impl.h>
@@ -102,10 +103,19 @@
  * syncing state in order to reduce the latency of the administrative
  * activity. To complete the syncing state, ZFS writes out a new uberblock,
  * the root of the tree of blocks that comprise all state stored on the ZFS
  * pool. Finally, if there is a quiesced txg waiting, we signal that it can
  * now transition to the syncing state.
+ *
 + * It is possible to register a callback for a TX, so that the callback
 + * is invoked after the corresponding TX group has been synced to disk.
 + * Required callback and its optional argument can be registered by using
+ * dmu_tx_callback_register().
 + * All callbacks are executed asynchronously via taskq (see txg_dispatch_callbacks).
+ * There are 2 possible cases when a registered callback is called:
 + *  1) the corresponding TX is committed to disk (the first arg is 0)
+ *  2) the corresponding TX is aborted (the first arg is ECANCELED)
  */
 
 static void txg_sync_thread(void *arg);
 static void txg_quiesce_thread(void *arg);
 
@@ -327,11 +337,27 @@
         mutex_enter(&tc->tc_lock);
         list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
         mutex_exit(&tc->tc_lock);
 }
 
+/* This register function can be called only from sync-context */
 void
+txg_register_callbacks_sync(dsl_pool_t *dp, uint64_t txg, list_t *tx_callbacks)
+{
+        tx_state_t *tx = &dp->dp_tx;
+        tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
+        txg_handle_t th;
+
+        VERIFY3U(tx->tx_syncing_txg, ==, txg);
+
+        th.th_cpu = tc;
+        th.th_txg = txg;
+
+        txg_register_callbacks(&th, tx_callbacks);
+}
+
+void
 txg_rele_to_sync(txg_handle_t *th)
 {
         tx_cpu_t *tc = th->th_cpu;
         int g = th->th_txg & TXG_MASK;
 
@@ -442,10 +468,34 @@
                 (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
                     txg_do_callbacks, cb_list, TQ_SLEEP);
         }
 }
 
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+        tx_state_t *tx = &dp->dp_tx;
+        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+        return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+        tx_state_t *tx = &dp->dp_tx;
+        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+        return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+        tx_state_t *tx = &dp->dp_tx;
+        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+        return (tx->tx_quiesced_txg != 0);
+}
+
 static void
 txg_sync_thread(void *arg)
 {
         dsl_pool_t *dp = arg;
         spa_t *spa = dp->dp_spa;
@@ -468,11 +518,11 @@
                  */
                 timer = (delta >= timeout ? 0 : timeout - delta);
                 while (!dsl_scan_active(dp->dp_scan) &&
                     !tx->tx_exiting && timer > 0 &&
                     tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
-                    tx->tx_quiesced_txg == 0 &&
+                    !txg_has_quiesced_to_sync(dp) &&
                     dp->dp_dirty_total < zfs_dirty_data_sync) {
                         dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
                             tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
                         txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
                         delta = ddi_get_lbolt() - start;
@@ -481,11 +531,11 @@
 
                 /*
                  * Wait until the quiesce thread hands off a txg to us,
                  * prompting it to do so if necessary.
                  */
-                while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+                while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
                         if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
                                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
                         cv_broadcast(&tx->tx_quiesce_more_cv);
                         txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
                 }
@@ -496,10 +546,11 @@
                 /*
                  * Consume the quiesced txg which has been handed off to
                  * us.  This may cause the quiescing thread to now be
                  * able to quiesce another txg, so we must signal it.
                  */
+                ASSERT(tx->tx_quiesced_txg != 0);
                 txg = tx->tx_quiesced_txg;
                 tx->tx_quiesced_txg = 0;
                 tx->tx_syncing_txg = txg;
                 DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
                 cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -544,28 +595,31 @@
                  * the "quiesced, waiting to sync" txg has been consumed
                  * by the sync thread.
                  */
                 while (!tx->tx_exiting &&
                     (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
-                    tx->tx_quiesced_txg != 0))
+                    txg_has_quiesced_to_sync(dp)))
                         txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
 
                 if (tx->tx_exiting)
                         txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
 
                 txg = tx->tx_open_txg;
                 dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
                     txg, tx->tx_quiesce_txg_waiting,
                     tx->tx_sync_txg_waiting);
+                tx->tx_quiescing_txg = txg;
+
                 mutex_exit(&tx->tx_sync_lock);
                 txg_quiesce(dp, txg);
                 mutex_enter(&tx->tx_sync_lock);
 
                 /*
                  * Hand this txg off to the sync thread.
                  */
                 dprintf("quiesce done, handing off txg %llu\n", txg);
+                tx->tx_quiescing_txg = 0;
                 tx->tx_quiesced_txg = txg;
                 DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
                 cv_broadcast(&tx->tx_sync_more_cv);
                 cv_broadcast(&tx->tx_quiesce_done_cv);
         }
@@ -659,11 +713,12 @@
         tx_state_t *tx = &dp->dp_tx;
 
         ASSERT(!dsl_pool_config_held(dp));
 
         mutex_enter(&tx->tx_sync_lock);
-        if (tx->tx_syncing_txg == 0 &&
+        if (!txg_is_syncing(dp) &&
+            !txg_is_quiescing(dp) &&
             tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
             tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
             tx->tx_quiesced_txg <= tx->tx_synced_txg) {
                 tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
                 cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -820,12 +875,10 @@
         void *p = NULL;
 
         txg_verify(tl->tl_spa, txg);
         mutex_enter(&tl->tl_lock);
         if ((tn = tl->tl_head[t]) != NULL) {
-                ASSERT(tn->tn_member[t]);
-                ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
                 p = (char *)tn - tl->tl_offset;
                 tl->tl_head[t] = tn->tn_next[t];
                 tn->tn_next[t] = NULL;
                 tn->tn_member[t] = 0;
         }