NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
    9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
    Reviewed by: Matt Ahrens <matt@delphix.com>
    Reviewed by: Brad Lewis <brad.lewis@delphix.com>
    Reviewed by: Andriy Gapon <avg@FreeBSD.org>
    Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance deleting a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties

          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
[ 12 lines elided ]
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
       23 + * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  24   24   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2014 Integros [integros.com]
  26   26   */
  27   27  
  28   28  #include <sys/dmu.h>
  29   29  #include <sys/dmu_impl.h>
  30   30  #include <sys/dbuf.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/dmu_objset.h>
  33   33  #include <sys/dsl_dataset.h>
[ 260 lines elided ]
 294  294  
 295  295          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 296  296              object, THT_WRITE, off, len);
 297  297          if (txh != NULL) {
 298  298                  dmu_tx_count_write(txh, off, len);
 299  299                  dmu_tx_count_dnode(txh);
 300  300          }
 301  301  }
 302  302  
 303  303  void
 304      -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
 305      -{
 306      -        dmu_tx_hold_t *txh;
 307      -
 308      -        ASSERT(tx->tx_txg == 0);
 309      -        txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 310      -            object, THT_WRITE, 0, 0);
 311      -        if (txh == NULL)
 312      -                return;
 313      -
 314      -        dnode_t *dn = txh->txh_dnode;
 315      -        (void) refcount_add_many(&txh->txh_space_towrite,
 316      -            1ULL << dn->dn_indblkshift, FTAG);
 317      -        dmu_tx_count_dnode(txh);
 318      -}
 319      -
 320      -void
 321  304  dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
 322  305  {
 323  306          dmu_tx_hold_t *txh;
 324  307  
 325  308          ASSERT0(tx->tx_txg);
 326  309          ASSERT3U(len, <=, DMU_MAX_ACCESS);
 327  310          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 328  311  
 329  312          txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
 330  313          if (txh != NULL) {
[ 531 lines elided ]
 862  845   * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 863  846   * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 864  847   * to the MOS).
 865  848   *
 866  849   * Note that due to this algorithm, it is possible to exceed the allowed
 867  850   * usage by one transaction.  Also, as we approach the allowed usage,
 868  851   * we will allow a very limited amount of changes into each TXG, thus
 869  852   * decreasing performance.
 870  853   */
 871  854  static int
 872      -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
      855 +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 873  856  {
 874  857          spa_t *spa = tx->tx_pool->dp_spa;
 875  858  
 876  859          ASSERT0(tx->tx_txg);
 877  860  
 878  861          if (tx->tx_err)
 879  862                  return (tx->tx_err);
 880  863  
 881  864          if (spa_suspended(spa)) {
 882  865                  /*
 883  866                   * If the user has indicated a blocking failure mode
 884  867                   * then return ERESTART which will block in dmu_tx_wait().
 885  868                   * Otherwise, return EIO so that an error can get
 886  869                   * propagated back to the VOP calls.
 887  870                   *
 888  871                   * Note that we always honor the txg_how flag regardless
 889  872                   * of the failuremode setting.
 890  873                   */
 891  874                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 892      -                    !(txg_how & TXG_WAIT))
      875 +                    txg_how != TXG_WAIT)
 893  876                          return (SET_ERROR(EIO));
 894  877  
 895  878                  return (SET_ERROR(ERESTART));
 896  879          }
 897  880  
 898      -        if (!tx->tx_dirty_delayed &&
      881 +        if (!tx->tx_waited &&
 899  882              dsl_pool_need_dirty_delay(tx->tx_pool)) {
 900  883                  tx->tx_wait_dirty = B_TRUE;
 901  884                  return (SET_ERROR(ERESTART));
 902  885          }
 903  886  
 904  887          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 905  888          tx->tx_needassign_txh = NULL;
 906  889  
 907  890          /*
 908  891           * NB: No error returns are allowed after txg_hold_open, but
[ 67 lines elided ]
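For orientation, a minimal caller-side sketch of the suspended-pool contract described in the comment above (the helper name and error handling are illustrative, not part of this change): with failmode=continue a non-waiting assignment surfaces EIO so the error can reach the VOP caller, while the default failmode=wait yields ERESTART, which dmu_tx_wait() turns into a block until the pool resumes.

#include <sys/dmu.h>
#include <sys/errno.h>

/*
 * Hypothetical caller showing how dmu_tx_try_assign()'s two
 * suspended-pool error paths look from the outside.  Assumes tx
 * already has its holds set up.
 */
static int
example_assign_while_suspended(dmu_tx_t *tx)
{
	int err = dmu_tx_assign(tx, TXG_NOWAIT);

	if (err == EIO) {
		/* failmode=continue: propagate the error up the VOP stack */
		dmu_tx_abort(tx);
		return (err);
	}
	if (err == ERESTART) {
		/* failmode=wait: block here until the pool is resumed */
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		/* the caller would rebuild its holds and retry */
	}
	return (err);
}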
 976  959                  mutex_exit(&dn->dn_mtx);
 977  960          }
 978  961  
 979  962          txg_rele_to_sync(&tx->tx_txgh);
 980  963  
 981  964          tx->tx_lasttried_txg = tx->tx_txg;
 982  965          tx->tx_txg = 0;
 983  966  }
 984  967  
 985  968  /*
 986      - * Assign tx to a transaction group; txg_how is a bitmask:
      969 + * Assign tx to a transaction group.  txg_how can be one of:
 987  970   *
 988      - * If TXG_WAIT is set and the currently open txg is full, this function
 989      - * will wait until there's a new txg. This should be used when no locks
 990      - * are being held. With this bit set, this function will only fail if
 991      - * we're truly out of space (or over quota).
      971 + * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
      972 + *      a new one.  This should be used when you're not holding locks.
      973 + *      It will only fail if we're truly out of space (or over quota).
 992  974   *
 993      - * If TXG_WAIT is *not* set and we can't assign into the currently open
 994      - * txg without blocking, this function will return immediately with
 995      - * ERESTART. This should be used whenever locks are being held.  On an
 996      - * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 997      - * and try again.
      975 + * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
      976 + *      blocking, returns immediately with ERESTART.  This should be used
      977 + *      whenever you're holding locks.  On an ERESTART error, the caller
      978 + *      should drop locks, do a dmu_tx_wait(tx), and try again.
 998  979   *
 999      - * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
1000      - * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
1001      - * details on the throttle). This is used by the VFS operations, after
1002      - * they have already called dmu_tx_wait() (though most likely on a
1003      - * different tx).
      980 + * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
      981 + *      has already been called on behalf of this operation (though
      982 + *      most likely on a different tx).
1004  983   */
1005  984  int
1006      -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
      985 +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1007  986  {
1008  987          int err;
1009  988  
1010  989          ASSERT(tx->tx_txg == 0);
1011      -        ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
      990 +        ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
      991 +            txg_how == TXG_WAITED);
1012  992          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1013  993  
1014  994          /* If we might wait, we must not hold the config lock. */
1015      -        IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
      995 +        ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1016  996  
1017      -        if ((txg_how & TXG_NOTHROTTLE))
1018      -                tx->tx_dirty_delayed = B_TRUE;
      997 +        if (txg_how == TXG_WAITED)
      998 +                tx->tx_waited = B_TRUE;
1019  999  
1020 1000          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1021 1001                  dmu_tx_unassign(tx);
1022 1002  
1023      -                if (err != ERESTART || !(txg_how & TXG_WAIT))
     1003 +                if (err != ERESTART || txg_how != TXG_WAIT)
1024 1004                          return (err);
1025 1005  
1026 1006                  dmu_tx_wait(tx);
1027 1007          }
1028 1008  
1029 1009          txg_rele_to_quiesce(&tx->tx_txgh);
1030 1010  
1031 1011          return (0);
1032 1012  }
1033 1013  
[ 16 lines elided ]
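Case (1) above is the form for callers that hold no locks; a short sketch for reference (the function and its arguments are illustrative, modeled on typical existing consumers rather than taken from this change):

#include <sys/dmu.h>

/* Sketch of case (1): TXG_WAIT, for callers not holding locks. */
static int
example_unlocked_write(objset_t *os, uint64_t object, uint64_t off, int len)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, TXG_WAIT);	/* may sleep for a new txg */
	if (err != 0) {
		/* only out-of-space / over-quota failures get here */
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... dirty buffers against tx ... */
	dmu_tx_commit(tx);
	return (0);
}

Cases (2) and (3) are exercised together in the retry loop sketched after dmu_tx_wait() below.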
1050 1030                  while (dp->dp_dirty_total >= zfs_dirty_data_max)
1051 1031                          cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1052 1032                  uint64_t dirty = dp->dp_dirty_total;
1053 1033                  mutex_exit(&dp->dp_lock);
1054 1034  
1055 1035                  dmu_tx_delay(tx, dirty);
1056 1036  
1057 1037                  tx->tx_wait_dirty = B_FALSE;
1058 1038  
1059 1039                  /*
 1060      -                 * Note: setting tx_dirty_delayed only has effect if the
 1061      -                 * caller used TXG_WAIT.  Otherwise they are going to
 1062      -                 * destroy this tx and try again.  The common case,
 1063      -                 * zfs_write(), uses TXG_WAIT.
      1040 +                 * Note: setting tx_waited only has effect if the caller
      1041 +                 * used TXG_WAIT.  Otherwise they are going to destroy
      1042 +                 * this tx and try again.  The common case, zfs_write(),
      1043 +                 * uses TXG_WAIT.
1064 1044                   */
1065      -                tx->tx_dirty_delayed = B_TRUE;
     1045 +                tx->tx_waited = B_TRUE;
1066 1046          } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1067 1047                  /*
1068 1048                   * If the pool is suspended we need to wait until it
1069 1049                   * is resumed.  Note that it's possible that the pool
1070 1050                   * has become active after this thread has tried to
1071 1051                   * obtain a tx.  If that's the case then tx_lasttried_txg
1072 1052                   * would not have been set.
1073 1053                   */
1074 1054                  txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1075 1055          } else if (tx->tx_needassign_txh) {
[ 2 lines elided ]
1078 1058                   * transaction to complete.
1079 1059                   */
1080 1060                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1081 1061  
1082 1062                  mutex_enter(&dn->dn_mtx);
1083 1063                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1084 1064                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1085 1065                  mutex_exit(&dn->dn_mtx);
1086 1066                  tx->tx_needassign_txh = NULL;
1087 1067          } else {
1088      -                txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
     1068 +                /*
     1069 +                 * If we have a lot of dirty data just wait until we sync
     1070 +                 * out a TXG at which point we'll hopefully have synced
     1071 +                 * a portion of the changes.
     1072 +                 */
     1073 +                txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1089 1074          }
1090 1075  }
1091 1076  
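The tx_waited flag set in the dirty-delay branch above is what drives case (3): after one trip through dmu_tx_wait(), a caller passes TXG_WAITED so the write-throttle delay is not charged a second time. A sketch of that loop, modeled on zfs_write()-style consumers (names and the single hold are illustrative):

#include <sys/dmu.h>
#include <sys/errno.h>

/* Sketch of the TXG_NOWAIT / TXG_WAITED retry loop used by VOP callers. */
static int
example_vop_write(objset_t *os, uint64_t object, uint64_t off, int len)
{
	boolean_t waited = B_FALSE;
	dmu_tx_t *tx;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			waited = B_TRUE;	/* skip the delay on retry */
			/* drop any locks before waiting */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... dirty buffers against tx ... */
	dmu_tx_commit(tx);
	return (0);
}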
1092 1077  static void
1093 1078  dmu_tx_destroy(dmu_tx_t *tx)
1094 1079  {
1095 1080          dmu_tx_hold_t *txh;
1096 1081  
1097 1082          while ((txh = list_head(&tx->tx_holds)) != NULL) {
1098 1083                  dnode_t *dn = txh->txh_dnode;
[ 35 lines elided ]
1134 1119                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1135 1120                          dn->dn_assigned_txg = 0;
1136 1121                          cv_broadcast(&dn->dn_notxholds);
1137 1122                  }
1138 1123                  mutex_exit(&dn->dn_mtx);
1139 1124          }
1140 1125  
1141 1126          if (tx->tx_tempreserve_cookie)
1142 1127                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1143 1128  
1144      -        if (!list_is_empty(&tx->tx_callbacks))
1145      -                txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
     1129 +        if (!list_is_empty(&tx->tx_callbacks)) {
     1130 +                if (dmu_tx_is_syncing(tx)) {
     1131 +                        txg_register_callbacks_sync(tx->tx_pool,
     1132 +                            tx->tx_txg, &tx->tx_callbacks);
     1133 +                } else {
     1134 +                        txg_register_callbacks(&tx->tx_txgh,
     1135 +                            &tx->tx_callbacks);
     1136 +                }
     1137 +        }
1146 1138  
1147 1139          if (tx->tx_anyobj == FALSE)
1148 1140                  txg_rele_to_sync(&tx->tx_txgh);
1149 1141  
1150 1142          dmu_tx_destroy(tx);
1151 1143  }
1152 1144  
1153 1145  void
1154 1146  dmu_tx_abort(dmu_tx_t *tx)
1155 1147  {
[ 176 lines elided ]
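The new branch above addresses NEX-6859 from the commit log: callbacks attached to a transaction assigned in syncing context are routed through txg_register_callbacks_sync(), since the per-handle txg_register_callbacks() relies on the open-txg handle (tx_txgh) that a syncing tx never takes. For orientation, a sketch of the registration side using the existing dmu_tx_callback_register() interface (the callback body and names are illustrative):

#include <sys/dmu.h>

/* Hypothetical commit callback: runs once the txg commits or aborts. */
static void
example_commit_cb(void *dcb_data, int error)
{
	if (error != 0) {
		/* the transaction group was aborted; undo side effects */
		return;
	}
	/* data dirtied under the tx is now on stable storage */
}

/* Attach the callback to an assigned tx, before dmu_tx_commit(). */
static void
example_register_cb(dmu_tx_t *tx, void *dcb_data)
{
	/*
	 * Safe from open context and, with the fix above, also from
	 * syncing context.
	 */
	dmu_tx_callback_register(tx, example_commit_cb, dcb_data);
}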