NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
    9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
    Reviewed by: Matt Ahrens <matt@delphix.com>
    Reviewed by: Brad Lewis <brad.lewis@delphix.com>
    Reviewed by: Andriy Gapon <avg@FreeBSD.org>
    Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4582 update wrc test cases to allow using write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance on a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27 
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dbuf.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dsl_dataset.h>
  34 #include <sys/dsl_dir.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zap_impl.h>
  37 #include <sys/spa.h>
  38 #include <sys/sa.h>
  39 #include <sys/sa_impl.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/varargs.h>
  42 
  43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,


 284 }
 285 
 286 void
 287 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 288 {
 289         dmu_tx_hold_t *txh;
 290 
 291         ASSERT0(tx->tx_txg);
 292         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 293         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 294 
 295         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 296             object, THT_WRITE, off, len);
 297         if (txh != NULL) {
 298                 dmu_tx_count_write(txh, off, len);
 299                 dmu_tx_count_dnode(txh);
 300         }
 301 }
 302 
 303 void
 304 dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
 305 {
 306         dmu_tx_hold_t *txh;
 307 
 308         ASSERT(tx->tx_txg == 0);
 309         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 310             object, THT_WRITE, 0, 0);
 311         if (txh == NULL)
 312                 return;
 313 
 314         dnode_t *dn = txh->txh_dnode;
 315         (void) refcount_add_many(&txh->txh_space_towrite,
 316             1ULL << dn->dn_indblkshift, FTAG);
 317         dmu_tx_count_dnode(txh);
 318 }
 319 
 320 void
 321 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
 322 {
 323         dmu_tx_hold_t *txh;
 324 
 325         ASSERT0(tx->tx_txg);
 326         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 327         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 328 
 329         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
 330         if (txh != NULL) {
 331                 dmu_tx_count_write(txh, off, len);
 332                 dmu_tx_count_dnode(txh);
 333         }
 334 }
 335 
 336 /*
 337  * This function marks the transaction as being a "net free".  The end
 338  * result is that refquotas will be disabled for this transaction, and
 339  * this transaction will be able to use half of the pool space overhead
 340  * (see dsl_pool_adjustedsize()).  Therefore this function should only


 852  *
 853  *  - dd_tempreserved[], which is the sum of in-flight transactions'
 854  *    holds' txh_space_towrite (i.e. those transactions that have called
 855  *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 856  *
 857  *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 858  *
 859  * Note that all of these values are inflated by spa_get_worst_case_asize(),
 860  * which means that we may get ERESTART well before we are actually in danger
 861  * of running out of space, but this also mitigates any small inaccuracies
 862  * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 863  * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 864  * to the MOS).
 865  *
 866  * Note that due to this algorithm, it is possible to exceed the allowed
 867  * usage by one transaction.  Also, as we approach the allowed usage,
 868  * we will allow a very limited amount of changes into each TXG, thus
 869  * decreasing performance.
 870  */
 871 static int
 872 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 873 {
 874         spa_t *spa = tx->tx_pool->dp_spa;
 875 
 876         ASSERT0(tx->tx_txg);
 877 
 878         if (tx->tx_err)
 879                 return (tx->tx_err);
 880 
 881         if (spa_suspended(spa)) {
 882                 /*
 883                  * If the user has indicated a blocking failure mode
 884                  * then return ERESTART which will block in dmu_tx_wait().
 885                  * Otherwise, return EIO so that an error can get
 886                  * propagated back to the VOP calls.
 887                  *
 888                  * Note that we always honor the txg_how flag regardless
 889                  * of the failuremode setting.
 890                  */
 891                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 892                     !(txg_how & TXG_WAIT))
 893                         return (SET_ERROR(EIO));
 894 
 895                 return (SET_ERROR(ERESTART));
 896         }
 897 
 898         if (!tx->tx_dirty_delayed &&
 899             dsl_pool_need_dirty_delay(tx->tx_pool)) {
 900                 tx->tx_wait_dirty = B_TRUE;
 901                 return (SET_ERROR(ERESTART));
 902         }
 903 
 904         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 905         tx->tx_needassign_txh = NULL;
 906 
 907         /*
 908          * NB: No error returns are allowed after txg_hold_open, but
 909          * before processing the dnode holds, due to the
 910          * dmu_tx_unassign() logic.
 911          */
 912 
 913         uint64_t towrite = 0;
 914         uint64_t tohold = 0;
 915         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
 916             txh = list_next(&tx->tx_holds, txh)) {
 917                 dnode_t *dn = txh->txh_dnode;
 918                 if (dn != NULL) {


 966 
 967                 if (dn == NULL)
 968                         continue;
 969                 mutex_enter(&dn->dn_mtx);
 970                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 971 
 972                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 973                         dn->dn_assigned_txg = 0;
 974                         cv_broadcast(&dn->dn_notxholds);
 975                 }
 976                 mutex_exit(&dn->dn_mtx);
 977         }
 978 
 979         txg_rele_to_sync(&tx->tx_txgh);
 980 
 981         tx->tx_lasttried_txg = tx->tx_txg;
 982         tx->tx_txg = 0;
 983 }
 984 
 985 /*
 986  * Assign tx to a transaction group; txg_how is a bitmask:
 987  *
 988  * If TXG_WAIT is set and the currently open txg is full, this function
 989  * will wait until there's a new txg. This should be used when no locks
 990  * are being held. With this bit set, this function will only fail if
 991  * we're truly out of space (or over quota).
 992  *
 993  * If TXG_WAIT is *not* set and we can't assign into the currently open
 994  * txg without blocking, this function will return immediately with
 995  * ERESTART. This should be used whenever locks are being held.  On an
 996  * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
 997  * and try again.
 998  *
 999  * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
1000  * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
1001  * details on the throttle). This is used by the VFS operations, after
1002  * they have already called dmu_tx_wait() (though most likely on a
1003  * different tx).
1004  */
1005 int
1006 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
1007 {
1008         int err;
1009 
1010         ASSERT(tx->tx_txg == 0);
1011         ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
1012         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1013 
1014         /* If we might wait, we must not hold the config lock. */
1015         IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
1016 
1017         if ((txg_how & TXG_NOTHROTTLE))
1018                 tx->tx_dirty_delayed = B_TRUE;
1019 
1020         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1021                 dmu_tx_unassign(tx);
1022 
1023                 if (err != ERESTART || !(txg_how & TXG_WAIT))
1024                         return (err);
1025 
1026                 dmu_tx_wait(tx);
1027         }
1028 
1029         txg_rele_to_quiesce(&tx->tx_txgh);
1030 
1031         return (0);
1032 }
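
To make the flag semantics above concrete, here is a minimal caller sketch
(it is not part of dmu_tx.c; example_dmu_write and its os/object/off/len/buf
arguments are placeholders, and the headers already included at the top of
this file are assumed).  A caller that holds locks assigns with TXG_NOWAIT
(no flags); on ERESTART it drops its locks, waits in dmu_tx_wait(), and
reassigns with TXG_NOTHROTTLE set so the write throttle is not applied twice:

static int
example_dmu_write(objset_t *os, uint64_t object, uint64_t off, int len,
    void *buf)
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);

	/* Locks are held, so do not block inside dmu_tx_assign(). */
	err = dmu_tx_assign(tx,
	    waited ? (TXG_NOTHROTTLE | TXG_NOWAIT) : TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			/* Drop caller locks here, then wait and retry. */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}
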
1033 
1034 void
1035 dmu_tx_wait(dmu_tx_t *tx)
1036 {
1037         spa_t *spa = tx->tx_pool->dp_spa;
1038         dsl_pool_t *dp = tx->tx_pool;
1039 
1040         ASSERT(tx->tx_txg == 0);
1041         ASSERT(!dsl_pool_config_held(tx->tx_pool));
1042 
1043         if (tx->tx_wait_dirty) {
1044                 /*
1045                  * dmu_tx_try_assign() has determined that we need to wait
1046                  * because we've consumed much or all of the dirty buffer
1047                  * space.
1048                  */
1049                 mutex_enter(&dp->dp_lock);
1050                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1051                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1052                 uint64_t dirty = dp->dp_dirty_total;
1053                 mutex_exit(&dp->dp_lock);
1054 
1055                 dmu_tx_delay(tx, dirty);
1056 
1057                 tx->tx_wait_dirty = B_FALSE;
1058 
1059                 /*
1060                  * Note: setting tx_dirty_delayed only has effect if the
1061                  * caller used TXG_WAIT.  Otherwise they are going to
1062                  * destroy this tx and try again.  The common case,
1063                  * zfs_write(), uses TXG_WAIT.
1064                  */
1065                 tx->tx_dirty_delayed = B_TRUE;
1066         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1067                 /*
1068                  * If the pool is suspended we need to wait until it
1069                  * is resumed.  Note that it's possible that the pool
1070                  * has become active after this thread has tried to
1071                  * obtain a tx.  If that's the case then tx_lasttried_txg
1072                  * would not have been set.
1073                  */
1074                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1075         } else if (tx->tx_needassign_txh) {
1076                 /*
1077                  * A dnode is assigned to the quiescing txg.  Wait for its
1078                  * transaction to complete.
1079                  */
1080                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1081 
1082                 mutex_enter(&dn->dn_mtx);
1083                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1084                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1085                 mutex_exit(&dn->dn_mtx);
1086                 tx->tx_needassign_txh = NULL;
1087         } else {
1088                 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1089         }
1090 }
1091 
1092 static void
1093 dmu_tx_destroy(dmu_tx_t *tx)
1094 {
1095         dmu_tx_hold_t *txh;
1096 
1097         while ((txh = list_head(&tx->tx_holds)) != NULL) {
1098                 dnode_t *dn = txh->txh_dnode;
1099 
1100                 list_remove(&tx->tx_holds, txh);
1101                 refcount_destroy_many(&txh->txh_space_towrite,
1102                     refcount_count(&txh->txh_space_towrite));
1103                 refcount_destroy_many(&txh->txh_memory_tohold,
1104                     refcount_count(&txh->txh_memory_tohold));
1105                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1106                 if (dn != NULL)
1107                         dnode_rele(dn, tx);
1108         }


1124         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1125             txh = list_next(&tx->tx_holds, txh)) {
1126                 dnode_t *dn = txh->txh_dnode;
1127 
1128                 if (dn == NULL)
1129                         continue;
1130 
1131                 mutex_enter(&dn->dn_mtx);
1132                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1133 
1134                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1135                         dn->dn_assigned_txg = 0;
1136                         cv_broadcast(&dn->dn_notxholds);
1137                 }
1138                 mutex_exit(&dn->dn_mtx);
1139         }
1140 
1141         if (tx->tx_tempreserve_cookie)
1142                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1143 
1144         if (!list_is_empty(&tx->tx_callbacks))
1145                 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1146 
1147         if (tx->tx_anyobj == FALSE)
1148                 txg_rele_to_sync(&tx->tx_txgh);
1149 
1150         dmu_tx_destroy(tx);
1151 }
1152 
1153 void
1154 dmu_tx_abort(dmu_tx_t *tx)
1155 {
1156         ASSERT(tx->tx_txg == 0);
1157 
1158         /*
1159          * Call any registered callbacks with an error code.
1160          */
1161         if (!list_is_empty(&tx->tx_callbacks))
1162                 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1163 
1164         dmu_tx_destroy(tx);
1165 }
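
Because dmu_tx_abort() above hands ECANCELED to whatever commit callbacks
were registered on the tx, a consumer of dmu_tx_callback_register() typically
follows the sketch below (example_commit_cb, example_write_with_cb and their
arguments are placeholders, not part of dmu_tx.c; the callback is invoked
with 0 once the assigned txg has synced, or with an error such as ECANCELED
if the tx never reaches sync):

static void
example_commit_cb(void *arg, int error)
{
	/* error == 0 after the txg syncs; ECANCELED if the tx was aborted. */
}

static int
example_write_with_cb(objset_t *os, uint64_t object, uint64_t off, int len,
    void *buf, void *arg)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* no callbacks registered yet */
		return (err);
	}
	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_callback_register(tx, example_commit_cb, arg);
	dmu_tx_commit(tx);		/* callback fires once this txg syncs */
	return (0);
}
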




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27 
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dbuf.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dsl_dataset.h>
  34 #include <sys/dsl_dir.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zap_impl.h>
  37 #include <sys/spa.h>
  38 #include <sys/sa.h>
  39 #include <sys/sa_impl.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/varargs.h>
  42 
  43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,


 284 }
 285 
 286 void
 287 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 288 {
 289         dmu_tx_hold_t *txh;
 290 
 291         ASSERT0(tx->tx_txg);
 292         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 293         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 294 
 295         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 296             object, THT_WRITE, off, len);
 297         if (txh != NULL) {
 298                 dmu_tx_count_write(txh, off, len);
 299                 dmu_tx_count_dnode(txh);
 300         }
 301 }
 302 
 303 void
 304 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
 305 {
 306         dmu_tx_hold_t *txh;
 307 
 308         ASSERT0(tx->tx_txg);
 309         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 310         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 311 
 312         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
 313         if (txh != NULL) {
 314                 dmu_tx_count_write(txh, off, len);
 315                 dmu_tx_count_dnode(txh);
 316         }
 317 }
 318 
 319 /*
 320  * This function marks the transaction as being a "net free".  The end
 321  * result is that refquotas will be disabled for this transaction, and
 322  * this transaction will be able to use half of the pool space overhead
 323  * (see dsl_pool_adjustedsize()).  Therefore this function should only


 835  *
 836  *  - dd_tempreserved[], which is the sum of in-flight transactions'
 837  *    holds' txh_space_towrite (i.e. those transactions that have called
 838  *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 839  *
 840  *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 841  *
 842  * Note that all of these values are inflated by spa_get_worst_case_asize(),
 843  * which means that we may get ERESTART well before we are actually in danger
 844  * of running out of space, but this also mitigates any small inaccuracies
 845  * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 846  * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 847  * to the MOS).
 848  *
 849  * Note that due to this algorithm, it is possible to exceed the allowed
 850  * usage by one transaction.  Also, as we approach the allowed usage,
 851  * we will allow a very limited amount of changes into each TXG, thus
 852  * decreasing performance.
 853  */
 854 static int
 855 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 856 {
 857         spa_t *spa = tx->tx_pool->dp_spa;
 858 
 859         ASSERT0(tx->tx_txg);
 860 
 861         if (tx->tx_err)
 862                 return (tx->tx_err);
 863 
 864         if (spa_suspended(spa)) {
 865                 /*
 866                  * If the user has indicated a blocking failure mode
 867                  * then return ERESTART which will block in dmu_tx_wait().
 868                  * Otherwise, return EIO so that an error can get
 869                  * propagated back to the VOP calls.
 870                  *
 871                  * Note that we always honor the txg_how flag regardless
 872                  * of the failuremode setting.
 873                  */
 874                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 875                     txg_how != TXG_WAIT)
 876                         return (SET_ERROR(EIO));
 877 
 878                 return (SET_ERROR(ERESTART));
 879         }
 880 
 881         if (!tx->tx_waited &&
 882             dsl_pool_need_dirty_delay(tx->tx_pool)) {
 883                 tx->tx_wait_dirty = B_TRUE;
 884                 return (SET_ERROR(ERESTART));
 885         }
 886 
 887         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 888         tx->tx_needassign_txh = NULL;
 889 
 890         /*
 891          * NB: No error returns are allowed after txg_hold_open, but
 892          * before processing the dnode holds, due to the
 893          * dmu_tx_unassign() logic.
 894          */
 895 
 896         uint64_t towrite = 0;
 897         uint64_t tohold = 0;
 898         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
 899             txh = list_next(&tx->tx_holds, txh)) {
 900                 dnode_t *dn = txh->txh_dnode;
 901                 if (dn != NULL) {


 949 
 950                 if (dn == NULL)
 951                         continue;
 952                 mutex_enter(&dn->dn_mtx);
 953                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 954 
 955                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 956                         dn->dn_assigned_txg = 0;
 957                         cv_broadcast(&dn->dn_notxholds);
 958                 }
 959                 mutex_exit(&dn->dn_mtx);
 960         }
 961 
 962         txg_rele_to_sync(&tx->tx_txgh);
 963 
 964         tx->tx_lasttried_txg = tx->tx_txg;
 965         tx->tx_txg = 0;
 966 }
 967 
 968 /*
 969  * Assign tx to a transaction group.  txg_how can be one of:
 970  *
 971  * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
 972  *      a new one.  This should be used when you're not holding locks.
 973  *      It will only fail if we're truly out of space (or over quota).
 974  *
 975  * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
 976  *      blocking, returns immediately with ERESTART.  This should be used
 977  *      whenever you're holding locks.  On an ERESTART error, the caller
 978  *      should drop locks, do a dmu_tx_wait(tx), and try again.
 979  *
 980  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
 981  *      has already been called on behalf of this operation (though
 982  *      most likely on a different tx).
 983  */
 984 int
 985 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 986 {
 987         int err;
 988 
 989         ASSERT(tx->tx_txg == 0);
 990         ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
 991             txg_how == TXG_WAITED);
 992         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 993 
 994         /* If we might wait, we must not hold the config lock. */
 995         ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
 996 
 997         if (txg_how == TXG_WAITED)
 998                 tx->tx_waited = B_TRUE;
 999 
1000         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1001                 dmu_tx_unassign(tx);
1002 
1003                 if (err != ERESTART || txg_how != TXG_WAIT)
1004                         return (err);
1005 
1006                 dmu_tx_wait(tx);
1007         }
1008 
1009         txg_rele_to_quiesce(&tx->tx_txgh);
1010 
1011         return (0);
1012 }
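
For the txg_how_t form used in this version of the file, the corresponding
caller sketch (placeholders as before, not part of dmu_tx.c) passes
TXG_WAITED on the retry so that dmu_tx_try_assign() skips the dirty-data
delay the caller has already served in dmu_tx_wait():

static int
example_dmu_write(objset_t *os, uint64_t object, uint64_t off, int len,
    void *buf)
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);

	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err == ERESTART) {
		/* Drop caller locks here, then wait and retry. */
		waited = B_TRUE;
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		goto top;
	}
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}
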
1013 
1014 void
1015 dmu_tx_wait(dmu_tx_t *tx)
1016 {
1017         spa_t *spa = tx->tx_pool->dp_spa;
1018         dsl_pool_t *dp = tx->tx_pool;
1019 
1020         ASSERT(tx->tx_txg == 0);
1021         ASSERT(!dsl_pool_config_held(tx->tx_pool));
1022 
1023         if (tx->tx_wait_dirty) {
1024                 /*
1025                  * dmu_tx_try_assign() has determined that we need to wait
1026                  * because we've consumed much or all of the dirty buffer
1027                  * space.
1028                  */
1029                 mutex_enter(&dp->dp_lock);
1030                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1031                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1032                 uint64_t dirty = dp->dp_dirty_total;
1033                 mutex_exit(&dp->dp_lock);
1034 
1035                 dmu_tx_delay(tx, dirty);
1036 
1037                 tx->tx_wait_dirty = B_FALSE;
1038 
1039                 /*
1040                  * Note: setting tx_waited only has effect if the caller
1041                  * used TXG_WAIT.  Otherwise they are going to destroy
1042                  * this tx and try again.  The common case, zfs_write(),
1043                  * uses TXG_WAIT.
1044                  */
1045                 tx->tx_waited = B_TRUE;
1046         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1047                 /*
1048                  * If the pool is suspended we need to wait until it
1049                  * is resumed.  Note that it's possible that the pool
1050                  * has become active after this thread has tried to
1051                  * obtain a tx.  If that's the case then tx_lasttried_txg
1052                  * would not have been set.
1053                  */
1054                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1055         } else if (tx->tx_needassign_txh) {
1056                 /*
1057                  * A dnode is assigned to the quiescing txg.  Wait for its
1058                  * transaction to complete.
1059                  */
1060                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1061 
1062                 mutex_enter(&dn->dn_mtx);
1063                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1064                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1065                 mutex_exit(&dn->dn_mtx);
1066                 tx->tx_needassign_txh = NULL;
1067         } else {
1068                 /*
1069                  * If we have a lot of dirty data just wait until we sync
1070                  * out a TXG at which point we'll hopefully have synced
1071                  * a portion of the changes.
1072                  */
1073                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1074         }
1075 }
1076 
1077 static void
1078 dmu_tx_destroy(dmu_tx_t *tx)
1079 {
1080         dmu_tx_hold_t *txh;
1081 
1082         while ((txh = list_head(&tx->tx_holds)) != NULL) {
1083                 dnode_t *dn = txh->txh_dnode;
1084 
1085                 list_remove(&tx->tx_holds, txh);
1086                 refcount_destroy_many(&txh->txh_space_towrite,
1087                     refcount_count(&txh->txh_space_towrite));
1088                 refcount_destroy_many(&txh->txh_memory_tohold,
1089                     refcount_count(&txh->txh_memory_tohold));
1090                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1091                 if (dn != NULL)
1092                         dnode_rele(dn, tx);
1093         }


1109         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1110             txh = list_next(&tx->tx_holds, txh)) {
1111                 dnode_t *dn = txh->txh_dnode;
1112 
1113                 if (dn == NULL)
1114                         continue;
1115 
1116                 mutex_enter(&dn->dn_mtx);
1117                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1118 
1119                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1120                         dn->dn_assigned_txg = 0;
1121                         cv_broadcast(&dn->dn_notxholds);
1122                 }
1123                 mutex_exit(&dn->dn_mtx);
1124         }
1125 
1126         if (tx->tx_tempreserve_cookie)
1127                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1128 
1129         if (!list_is_empty(&tx->tx_callbacks)) {
1130                 if (dmu_tx_is_syncing(tx)) {
1131                         txg_register_callbacks_sync(tx->tx_pool,
1132                             tx->tx_txg, &tx->tx_callbacks);
1133                 } else {
1134                         txg_register_callbacks(&tx->tx_txgh,
1135                             &tx->tx_callbacks);
1136                 }
1137         }
1138 
1139         if (tx->tx_anyobj == FALSE)
1140                 txg_rele_to_sync(&tx->tx_txgh);
1141 
1142         dmu_tx_destroy(tx);
1143 }
1144 
1145 void
1146 dmu_tx_abort(dmu_tx_t *tx)
1147 {
1148         ASSERT(tx->tx_txg == 0);
1149 
1150         /*
1151          * Call any registered callbacks with an error code.
1152          */
1153         if (!list_is_empty(&tx->tx_callbacks))
1154                 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1155 
1156         dmu_tx_destroy(tx);
1157 }