Print this page
NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
    9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
    Reviewed by: Matt Ahrens <matt@delphix.com>
    Reviewed by: Brad Lewis <brad.lewis@delphix.com>
    Reviewed by: Andriy Gapon <avg@FreeBSD.org>
    Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/txg.c
          +++ new/usr/src/uts/common/fs/zfs/txg.c
↓ open down ↓ 14 lines elided ↑ open up ↑
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Portions Copyright 2011 Martin Matuska
  24   24   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
       25 + * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/zfs_context.h>
  28   29  #include <sys/txg_impl.h>
  29   30  #include <sys/dmu_impl.h>
  30   31  #include <sys/dmu_tx.h>
  31   32  #include <sys/dsl_pool.h>
  32   33  #include <sys/dsl_scan.h>
  33   34  #include <sys/zil.h>
  34   35  #include <sys/callb.h>
↓ open down ↓ 62 lines elided ↑ open up ↑
  97   98   * In addition to writing out user data, we must also execute synctasks during
  98   99   * the syncing context. A synctask is the mechanism by which some
  99  100   * administrative activities work such as creating and destroying snapshots or
 100  101   * datasets. Note that when a synctask is initiated it enters the open txg,
 101  102   * and ZFS then pushes that txg as quickly as possible to completion of the
 102  103   * syncing state in order to reduce the latency of the administrative
 103  104   * activity. To complete the syncing state, ZFS writes out a new uberblock,
 104  105   * the root of the tree of blocks that comprise all state stored on the ZFS
 105  106   * pool. Finally, if there is a quiesced txg waiting, we signal that it can
 106  107   * now transition to the syncing state.
      108 + *
      109 + * It is possible to register a callback for a TX, so the callback will be
      110 + * called after sync of the corresponding TX-group to disk.
       111 + * Required callback and its optional argument can be registered by using
      112 + * dmu_tx_callback_register().
       113 + * All callbacks are executed async via taskq (see txg_dispatch_callbacks).
      114 + * There are 2 possible cases when a registered callback is called:
       115 + *  1) the corresponding TX is committed to disk (the first arg is 0)
      116 + *  2) the corresponding TX is aborted (the first arg is ECANCELED)
 107  117   */
 108  118  
 109  119  static void txg_sync_thread(void *arg);
 110  120  static void txg_quiesce_thread(void *arg);
 111  121  
 112  122  int zfs_txg_timeout = 5;        /* max seconds worth of delta per txg */
 113  123  
 114  124  /*
 115  125   * Prepare the txg subsystem.
 116  126   */
↓ open down ↓ 205 lines elided ↑ open up ↑
 322  332  txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
 323  333  {
 324  334          tx_cpu_t *tc = th->th_cpu;
 325  335          int g = th->th_txg & TXG_MASK;
 326  336  
 327  337          mutex_enter(&tc->tc_lock);
 328  338          list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
 329  339          mutex_exit(&tc->tc_lock);
 330  340  }
 331  341  
      342 +/* This register function can be called only from sync-context */
 332  343  void
      344 +txg_register_callbacks_sync(dsl_pool_t *dp, uint64_t txg, list_t *tx_callbacks)
      345 +{
      346 +        tx_state_t *tx = &dp->dp_tx;
      347 +        tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
      348 +        txg_handle_t th;
      349 +
      350 +        VERIFY3U(tx->tx_syncing_txg, ==, txg);
      351 +
      352 +        th.th_cpu = tc;
      353 +        th.th_txg = txg;
      354 +
      355 +        txg_register_callbacks(&th, tx_callbacks);
      356 +}
      357 +
      358 +void
 333  359  txg_rele_to_sync(txg_handle_t *th)
 334  360  {
 335  361          tx_cpu_t *tc = th->th_cpu;
 336  362          int g = th->th_txg & TXG_MASK;
 337  363  
 338  364          mutex_enter(&tc->tc_lock);
 339  365          ASSERT(tc->tc_count[g] != 0);
 340  366          if (--tc->tc_count[g] == 0)
 341  367                  cv_broadcast(&tc->tc_cv[g]);
 342  368          mutex_exit(&tc->tc_lock);
↓ open down ↓ 94 lines elided ↑ open up ↑
 437  463                  list_create(cb_list, sizeof (dmu_tx_callback_t),
 438  464                      offsetof(dmu_tx_callback_t, dcb_node));
 439  465  
 440  466                  list_move_tail(cb_list, &tc->tc_callbacks[g]);
 441  467  
 442  468                  (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 443  469                      txg_do_callbacks, cb_list, TQ_SLEEP);
 444  470          }
 445  471  }
 446  472  
      473 +static boolean_t
      474 +txg_is_syncing(dsl_pool_t *dp)
      475 +{
      476 +        tx_state_t *tx = &dp->dp_tx;
      477 +        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
      478 +        return (tx->tx_syncing_txg != 0);
      479 +}
      480 +
      481 +static boolean_t
      482 +txg_is_quiescing(dsl_pool_t *dp)
      483 +{
      484 +        tx_state_t *tx = &dp->dp_tx;
      485 +        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
      486 +        return (tx->tx_quiescing_txg != 0);
      487 +}
      488 +
      489 +static boolean_t
      490 +txg_has_quiesced_to_sync(dsl_pool_t *dp)
      491 +{
      492 +        tx_state_t *tx = &dp->dp_tx;
      493 +        ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
      494 +        return (tx->tx_quiesced_txg != 0);
      495 +}
      496 +
 447  497  static void
 448  498  txg_sync_thread(void *arg)
 449  499  {
 450  500          dsl_pool_t *dp = arg;
 451  501          spa_t *spa = dp->dp_spa;
 452  502          tx_state_t *tx = &dp->dp_tx;
 453  503          callb_cpr_t cpr;
 454  504          uint64_t start, delta;
 455  505  
 456  506          txg_thread_enter(tx, &cpr);
↓ open down ↓ 6 lines elided ↑ open up ↑
 463  513  
 464  514                  /*
 465  515                   * We sync when we're scanning, there's someone waiting
 466  516                   * on us, or the quiesce thread has handed off a txg to
 467  517                   * us, or we have reached our timeout.
 468  518                   */
 469  519                  timer = (delta >= timeout ? 0 : timeout - delta);
 470  520                  while (!dsl_scan_active(dp->dp_scan) &&
 471  521                      !tx->tx_exiting && timer > 0 &&
 472  522                      tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 473      -                    tx->tx_quiesced_txg == 0 &&
      523 +                    !txg_has_quiesced_to_sync(dp) &&
 474  524                      dp->dp_dirty_total < zfs_dirty_data_sync) {
 475  525                          dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 476  526                              tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 477  527                          txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
 478  528                          delta = ddi_get_lbolt() - start;
 479  529                          timer = (delta > timeout ? 0 : timeout - delta);
 480  530                  }
 481  531  
 482  532                  /*
 483  533                   * Wait until the quiesce thread hands off a txg to us,
 484  534                   * prompting it to do so if necessary.
 485  535                   */
 486      -                while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
      536 +                while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
 487  537                          if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
 488  538                                  tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
 489  539                          cv_broadcast(&tx->tx_quiesce_more_cv);
 490  540                          txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
 491  541                  }
 492  542  
 493  543                  if (tx->tx_exiting)
 494  544                          txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
 495  545  
 496  546                  /*
 497  547                   * Consume the quiesced txg which has been handed off to
 498  548                   * us.  This may cause the quiescing thread to now be
 499  549                   * able to quiesce another txg, so we must signal it.
 500  550                   */
      551 +                ASSERT(tx->tx_quiesced_txg != 0);
 501  552                  txg = tx->tx_quiesced_txg;
 502  553                  tx->tx_quiesced_txg = 0;
 503  554                  tx->tx_syncing_txg = txg;
 504  555                  DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
 505  556                  cv_broadcast(&tx->tx_quiesce_more_cv);
 506  557  
 507  558                  dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 508  559                      txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
 509  560                  mutex_exit(&tx->tx_sync_lock);
 510  561  
↓ open down ↓ 28 lines elided ↑ open up ↑
 539  590  
 540  591                  /*
 541  592                   * We quiesce when there's someone waiting on us.
 542  593                   * However, we can only have one txg in "quiescing" or
 543  594                   * "quiesced, waiting to sync" state.  So we wait until
 544  595                   * the "quiesced, waiting to sync" txg has been consumed
 545  596                   * by the sync thread.
 546  597                   */
 547  598                  while (!tx->tx_exiting &&
 548  599                      (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
 549      -                    tx->tx_quiesced_txg != 0))
      600 +                    txg_has_quiesced_to_sync(dp)))
 550  601                          txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
 551  602  
 552  603                  if (tx->tx_exiting)
 553  604                          txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
 554  605  
 555  606                  txg = tx->tx_open_txg;
 556  607                  dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
 557  608                      txg, tx->tx_quiesce_txg_waiting,
 558  609                      tx->tx_sync_txg_waiting);
      610 +                tx->tx_quiescing_txg = txg;
      611 +
 559  612                  mutex_exit(&tx->tx_sync_lock);
 560  613                  txg_quiesce(dp, txg);
 561  614                  mutex_enter(&tx->tx_sync_lock);
 562  615  
 563  616                  /*
 564  617                   * Hand this txg off to the sync thread.
 565  618                   */
 566  619                  dprintf("quiesce done, handing off txg %llu\n", txg);
      620 +                tx->tx_quiescing_txg = 0;
 567  621                  tx->tx_quiesced_txg = txg;
 568  622                  DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
 569  623                  cv_broadcast(&tx->tx_sync_more_cv);
 570  624                  cv_broadcast(&tx->tx_quiesce_done_cv);
 571  625          }
 572  626  }
 573  627  
 574  628  /*
 575  629   * Delay this thread by delay nanoseconds if we are still in the open
 576  630   * transaction group and there is already a waiting txg quiescing or quiesced.
↓ open down ↓ 77 lines elided ↑ open up ↑
 654  708   * the pipeline by queiscing the open txg.
 655  709   */
 656  710  void
 657  711  txg_kick(dsl_pool_t *dp)
 658  712  {
 659  713          tx_state_t *tx = &dp->dp_tx;
 660  714  
 661  715          ASSERT(!dsl_pool_config_held(dp));
 662  716  
 663  717          mutex_enter(&tx->tx_sync_lock);
 664      -        if (tx->tx_syncing_txg == 0 &&
      718 +        if (!txg_is_syncing(dp) &&
      719 +            !txg_is_quiescing(dp) &&
 665  720              tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
 666  721              tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
 667  722              tx->tx_quiesced_txg <= tx->tx_synced_txg) {
 668  723                  tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
 669  724                  cv_broadcast(&tx->tx_quiesce_more_cv);
 670  725          }
 671  726          mutex_exit(&tx->tx_sync_lock);
 672  727  }
 673  728  
 674  729  boolean_t
↓ open down ↓ 140 lines elided ↑ open up ↑
 815  870  void *
 816  871  txg_list_remove(txg_list_t *tl, uint64_t txg)
 817  872  {
 818  873          int t = txg & TXG_MASK;
 819  874          txg_node_t *tn;
 820  875          void *p = NULL;
 821  876  
 822  877          txg_verify(tl->tl_spa, txg);
 823  878          mutex_enter(&tl->tl_lock);
 824  879          if ((tn = tl->tl_head[t]) != NULL) {
 825      -                ASSERT(tn->tn_member[t]);
 826      -                ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
 827  880                  p = (char *)tn - tl->tl_offset;
 828  881                  tl->tl_head[t] = tn->tn_next[t];
 829  882                  tn->tn_next[t] = NULL;
 830  883                  tn->tn_member[t] = 0;
 831  884          }
 832  885          mutex_exit(&tl->tl_lock);
 833  886  
 834  887          return (p);
 835  888  }
 836  889  
↓ open down ↓ 61 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX