big-one New usr/src/uts/common/fs/zfs/dmu

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  */
  27 
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_impl.h>
  30 #include <sys/dbuf.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dmu_objset.h>
  33 #include <sys/dsl_dataset.h>
  34 #include <sys/dsl_dir.h>
  35 #include <sys/dsl_pool.h>
  36 #include <sys/zap_impl.h>
  37 #include <sys/spa.h>
  38 #include <sys/sa.h>
  39 #include <sys/sa_impl.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/varargs.h>
  42 
/*
 * Signature of a hold routine: it receives the transaction, a dnode and
 * two 64-bit arguments whose meaning depends on the kind of hold.
 * NOTE(review): no user of this type is visible in this part of the file.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);
  45 
  46 
  47 dmu_tx_t *
  48 dmu_tx_create_dd(dsl_dir_t *dd)
  49 {
  50         dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  51         tx->tx_dir = dd;
  52         if (dd != NULL)
  53                 tx->tx_pool = dd->dd_pool;
  54         list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  55             offsetof(dmu_tx_hold_t, txh_node));
  56         list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  57             offsetof(dmu_tx_callback_t, dcb_node));
  58         tx->tx_start = gethrtime();
  59         return (tx);
  60 }
  61 
  62 dmu_tx_t *
  63 dmu_tx_create(objset_t *os)
  64 {
  65         dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  66         tx->tx_objset = os;
  67         return (tx);
  68 }
  69 
  70 dmu_tx_t *
  71 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  72 {
  73         dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  74 
  75         txg_verify(dp->dp_spa, txg);
  76         tx->tx_pool = dp;
  77         tx->tx_txg = txg;
  78         tx->tx_anyobj = TRUE;
  79 
  80         return (tx);
  81 }
  82 
  83 int
  84 dmu_tx_is_syncing(dmu_tx_t *tx)
  85 {
  86         return (tx->tx_anyobj);
  87 }
  88 
  89 int
  90 dmu_tx_private_ok(dmu_tx_t *tx)
  91 {
  92         return (tx->tx_anyobj);
  93 }
  94 
  95 static dmu_tx_hold_t *
  96 dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
  97     uint64_t arg1, uint64_t arg2)
  98 {
  99         dmu_tx_hold_t *txh;
 100 
 101         if (dn != NULL) {
 102                 (void) refcount_add(&dn->dn_holds, tx);
 103                 if (tx->tx_txg != 0) {
 104                         mutex_enter(&dn->dn_mtx);
 105                         /*
 106                          * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 107                          * problem, but there's no way for it to happen (for
 108                          * now, at least).
 109                          */
 110                         ASSERT(dn->dn_assigned_txg == 0);
 111                         dn->dn_assigned_txg = tx->tx_txg;
 112                         (void) refcount_add(&dn->dn_tx_holds, tx);
 113                         mutex_exit(&dn->dn_mtx);
 114                 }
 115         }
 116 
 117         txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 118         txh->txh_tx = tx;
 119         txh->txh_dnode = dn;
 120         refcount_create(&txh->txh_space_towrite);
 121         refcount_create(&txh->txh_memory_tohold);
 122         txh->txh_type = type;
 123         txh->txh_arg1 = arg1;
 124         txh->txh_arg2 = arg2;
 125         list_insert_tail(&tx->tx_holds, txh);
 126 
 127         return (txh);
 128 }
 129 
 130 static dmu_tx_hold_t *
 131 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 132     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 133 {
 134         dnode_t *dn = NULL;
 135         dmu_tx_hold_t *txh;
 136         int err;
 137 
 138         if (object != DMU_NEW_OBJECT) {
 139                 err = dnode_hold(os, object, FTAG, &dn);
 140                 if (err != 0) {
 141                         tx->tx_err = err;
 142                         return (NULL);
 143                 }
 144         }
 145         txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
 146         if (dn != NULL)
 147                 dnode_rele(dn, FTAG);
 148         return (txh);
 149 }
 150 
 151 void
 152 dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
 153 {
 154         /*
 155          * If we're syncing, they can manipulate any object anyhow, and
 156          * the hold on the dnode_t can cause problems.
 157          */
 158         if (!dmu_tx_is_syncing(tx))
 159                 (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
 160 }
 161 
 162 /*
 163  * This function reads specified data from disk.  The specified data will
 164  * be needed to perform the transaction -- i.e, it will be read after
 165  * we do dmu_tx_assign().  There are two reasons that we read the data now
 166  * (before dmu_tx_assign()):
 167  *
 168  * 1. Reading it now has potentially better performance.  The transaction
 169  * has not yet been assigned, so the TXG is not held open, and also the
 170  * caller typically has less locks held when calling dmu_tx_hold_*() than
 171  * after the transaction has been assigned.  This reduces the lock (and txg)
 172  * hold times, thus reducing lock contention.
 173  *
 174  * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
 175  * that are detected before they start making changes to the DMU state
 176  * (i.e. now).  Once the transaction has been assigned, and some DMU
 177  * state has been changed, it can be difficult to recover from an i/o
 178  * error (e.g. to undo the changes already made in memory at the DMU
 179  * layer).  Typically code to do so does not exist in the caller -- it
 180  * assumes that the data has already been cached and thus i/o errors are
 181  * not possible.
 182  *
 183  * It has been observed that the i/o initiated here can be a performance
 184  * problem, and it appears to be optional, because we don't look at the
 185  * data which is read.  However, removing this read would only serve to
 186  * move the work elsewhere (after the dmu_tx_assign()), where it may
 187  * have a greater impact on performance (in addition to the impact on
 188  * fault tolerance noted above).
 189  */
 190 static int
 191 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 192 {
 193         int err;
 194         dmu_buf_impl_t *db;
 195 
 196         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 197         db = dbuf_hold_level(dn, level, blkid, FTAG);
 198         rw_exit(&dn->dn_struct_rwlock);
 199         if (db == NULL)
 200                 return (SET_ERROR(EIO));
 201         err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 202         dbuf_rele(db, FTAG);
 203         return (err);
 204 }
 205 
 206 /* ARGSUSED */
 207 static void
 208 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 209 {
 210         dnode_t *dn = txh->txh_dnode;
 211         int err = 0;
 212 
 213         if (len == 0)
 214                 return;
 215 
 216         (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
 217 
 218         if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
 219                 err = SET_ERROR(EFBIG);
 220 
 221         if (dn == NULL)
 222                 return;
 223 
 224         /*
 225          * For i/o error checking, read the blocks that will be needed
 226          * to perform the write: the first and last level-0 blocks (if
 227          * they are not aligned, i.e. if they are partial-block writes),
 228          * and all the level-1 blocks.
 229          */
 230         if (dn->dn_maxblkid == 0) {
 231                 if (off < dn->dn_datablksz &&
 232                     (off > 0 || len < dn->dn_datablksz)) {
 233                         err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 234                         if (err != 0) {
 235                                 txh->txh_tx->tx_err = err;
 236                         }
 237                 }
 238         } else {
 239                 zio_t *zio = zio_root(dn->dn_objset->os_spa,
 240                     NULL, NULL, ZIO_FLAG_CANFAIL);
 241 
 242                 /* first level-0 block */
 243                 uint64_t start = off >> dn->dn_datablkshift;
 244                 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
 245                         err = dmu_tx_check_ioerr(zio, dn, 0, start);
 246                         if (err != 0) {
 247                                 txh->txh_tx->tx_err = err;
 248                         }
 249                 }
 250 
 251                 /* last level-0 block */
 252                 uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
 253                 if (end != start && end <= dn->dn_maxblkid &&
 254                     P2PHASE(off + len, dn->dn_datablksz)) {
 255                         err = dmu_tx_check_ioerr(zio, dn, 0, end);
 256                         if (err != 0) {
 257                                 txh->txh_tx->tx_err = err;
 258                         }
 259                 }
 260 
 261                 /* level-1 blocks */
 262                 if (dn->dn_nlevels > 1) {
 263                         int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 264                         for (uint64_t i = (start >> shft) + 1;
 265                             i < end >> shft; i++) {
 266                                 err = dmu_tx_check_ioerr(zio, dn, 1, i);
 267                                 if (err != 0) {
 268                                         txh->txh_tx->tx_err = err;
 269                                 }
 270                         }
 271                 }
 272 
 273                 err = zio_wait(zio);
 274                 if (err != 0) {
 275                         txh->txh_tx->tx_err = err;
 276                 }
 277         }
 278 }
 279 
 280 static void
 281 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 282 {
 283         (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
 284 }
 285 
 286 void
 287 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 288 {
 289         dmu_tx_hold_t *txh;
 290 
 291         ASSERT0(tx->tx_txg);
 292         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 293         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 294 
 295         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 296             object, THT_WRITE, off, len);
 297         if (txh != NULL) {
 298                 dmu_tx_count_write(txh, off, len);
 299                 dmu_tx_count_dnode(txh);
 300         }
 301 }
 302 
 303 void
 304 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
 305 {
 306         dmu_tx_hold_t *txh;
 307 
 308         ASSERT0(tx->tx_txg);
 309         ASSERT3U(len, <=, DMU_MAX_ACCESS);
 310         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 311 
 312         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
 313         if (txh != NULL) {
 314                 dmu_tx_count_write(txh, off, len);
 315                 dmu_tx_count_dnode(txh);
 316         }
 317 }
 318 
 319 /*
 320  * This function marks the transaction as being a "net free".  The end
 321  * result is that refquotas will be disabled for this transaction, and
 322  * this transaction will be able to use half of the pool space overhead
 323  * (see dsl_pool_adjustedsize()).  Therefore this function should only
 324  * be called for transactions that we expect will not cause a net increase
 325  * in the amount of space used (but it's OK if that is occasionally not true).
 326  */
 327 void
 328 dmu_tx_mark_netfree(dmu_tx_t *tx)
 329 {
 330         tx->tx_netfree = B_TRUE;
 331 }
 332 
 333 static void
 334 dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 335 {
 336         dmu_tx_t *tx;
 337         dnode_t *dn;
 338         int err;
 339 
 340         tx = txh->txh_tx;
 341         ASSERT(tx->tx_txg == 0);
 342 
 343         dn = txh->txh_dnode;
 344         dmu_tx_count_dnode(txh);
 345 
 346         if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
 347                 return;
 348         if (len == DMU_OBJECT_END)
 349                 len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
 350 
 351         /*
 352          * For i/o error checking, we read the first and last level-0
 353          * blocks if they are not aligned, and all the level-1 blocks.
 354          *
 355          * Note:  dbuf_free_range() assumes that we have not instantiated
 356          * any level-0 dbufs that will be completely freed.  Therefore we must
 357          * exercise care to not read or count the first and last blocks
 358          * if they are blocksize-aligned.
 359          */
 360         if (dn->dn_datablkshift == 0) {
 361                 if (off != 0 || len < dn->dn_datablksz)
 362                         dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 363         } else {
 364                 /* first block will be modified if it is not aligned */
 365                 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 366                         dmu_tx_count_write(txh, off, 1);
 367                 /* last block will be modified if it is not aligned */
 368                 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 369                         dmu_tx_count_write(txh, off + len, 1);
 370         }
 371 
 372         /*
 373          * Check level-1 blocks.
 374          */
 375         if (dn->dn_nlevels > 1) {
 376                 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 377                     SPA_BLKPTRSHIFT;
 378                 uint64_t start = off >> shift;
 379                 uint64_t end = (off + len) >> shift;
 380 
 381                 ASSERT(dn->dn_indblkshift != 0);
 382 
 383                 /*
 384                  * dnode_reallocate() can result in an object with indirect
 385                  * blocks having an odd data block size.  In this case,
 386                  * just check the single block.
 387                  */
 388                 if (dn->dn_datablkshift == 0)
 389                         start = end = 0;
 390 
 391                 zio_t *zio = zio_root(tx->tx_pool->dp_spa,
 392                     NULL, NULL, ZIO_FLAG_CANFAIL);
 393                 for (uint64_t i = start; i <= end; i++) {
 394                         uint64_t ibyte = i << shift;
 395                         err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 396                         i = ibyte >> shift;
 397                         if (err == ESRCH || i > end)
 398                                 break;
 399                         if (err != 0) {
 400                                 tx->tx_err = err;
 401                                 (void) zio_wait(zio);
 402                                 return;
 403                         }
 404 
 405                         (void) refcount_add_many(&txh->txh_memory_tohold,
 406                             1 << dn->dn_indblkshift, FTAG);
 407 
 408                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
 409                         if (err != 0) {
 410                                 tx->tx_err = err;
 411                                 (void) zio_wait(zio);
 412                                 return;
 413                         }
 414                 }
 415                 err = zio_wait(zio);
 416                 if (err != 0) {
 417                         tx->tx_err = err;
 418                         return;
 419                 }
 420         }
 421 }
 422 
 423 void
 424 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 425 {
 426         dmu_tx_hold_t *txh;
 427 
 428         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 429             object, THT_FREE, off, len);
 430         if (txh != NULL)
 431                 (void) dmu_tx_hold_free_impl(txh, off, len);
 432 }
 433 
 434 void
 435 dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
 436 {
 437         dmu_tx_hold_t *txh;
 438 
 439         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
 440         if (txh != NULL)
 441                 (void) dmu_tx_hold_free_impl(txh, off, len);
 442 }
 443 
 444 static void
 445 dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
 446 {
 447         dmu_tx_t *tx = txh->txh_tx;
 448         dnode_t *dn;
 449         int err;
 450 
 451         ASSERT(tx->tx_txg == 0);
 452 
 453         dn = txh->txh_dnode;
 454 
 455         dmu_tx_count_dnode(txh);
 456 
 457         /*
 458          * Modifying a almost-full microzap is around the worst case (128KB)
 459          *
 460          * If it is a fat zap, the worst case would be 7*16KB=112KB:
 461          * - 3 blocks overwritten: target leaf, ptrtbl block, header block
 462          * - 4 new blocks written if adding:
 463          *    - 2 blocks for possibly split leaves,
 464          *    - 2 grown ptrtbl blocks
 465          */
 466         (void) refcount_add_many(&txh->txh_space_towrite,
 467             MZAP_MAX_BLKSZ, FTAG);
 468 
 469         if (dn == NULL)
 470                 return;
 471 
 472         ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 473 
 474         if (dn->dn_maxblkid == 0 || name == NULL) {
 475                 /*
 476                  * This is a microzap (only one block), or we don't know
 477                  * the name.  Check the first block for i/o errors.
 478                  */
 479                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 480                 if (err != 0) {
 481                         tx->tx_err = err;
 482                 }
 483         } else {
 484                 /*
 485                  * Access the name so that we'll check for i/o errors to
 486                  * the leaf blocks, etc.  We ignore ENOENT, as this name
 487                  * may not yet exist.
 488                  */
 489                 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
 490                 if (err == EIO || err == ECKSUM || err == ENXIO) {
 491                         tx->tx_err = err;
 492                 }
 493         }
 494 }
 495 
 496 void
 497 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 498 {
 499         dmu_tx_hold_t *txh;
 500 
 501         ASSERT0(tx->tx_txg);
 502 
 503         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 504             object, THT_ZAP, add, (uintptr_t)name);
 505         if (txh != NULL)
 506                 dmu_tx_hold_zap_impl(txh, name);
 507 }
 508 
 509 void
 510 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
 511 {
 512         dmu_tx_hold_t *txh;
 513 
 514         ASSERT0(tx->tx_txg);
 515         ASSERT(dn != NULL);
 516 
 517         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
 518         if (txh != NULL)
 519                 dmu_tx_hold_zap_impl(txh, name);
 520 }
 521 
 522 void
 523 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 524 {
 525         dmu_tx_hold_t *txh;
 526 
 527         ASSERT(tx->tx_txg == 0);
 528 
 529         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 530             object, THT_BONUS, 0, 0);
 531         if (txh)
 532                 dmu_tx_count_dnode(txh);
 533 }
 534 
 535 void
 536 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
 537 {
 538         dmu_tx_hold_t *txh;
 539 
 540         ASSERT0(tx->tx_txg);
 541 
 542         txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
 543         if (txh)
 544                 dmu_tx_count_dnode(txh);
 545 }
 546 
 547 void
 548 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 549 {
 550         dmu_tx_hold_t *txh;
 551         ASSERT(tx->tx_txg == 0);
 552 
 553         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 554             DMU_NEW_OBJECT, THT_SPACE, space, 0);
 555 
 556         (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
 557 }
 558 
 559 #ifdef ZFS_DEBUG
 560 void
 561 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 562 {
 563         boolean_t match_object = B_FALSE;
 564         boolean_t match_offset = B_FALSE;
 565 
 566         DB_DNODE_ENTER(db);
 567         dnode_t *dn = DB_DNODE(db);
 568         ASSERT(tx->tx_txg != 0);
 569         ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 570         ASSERT3U(dn->dn_object, ==, db->db.db_object);
 571 
 572         if (tx->tx_anyobj) {
 573                 DB_DNODE_EXIT(db);
 574                 return;
 575         }
 576 
 577         /* XXX No checking on the meta dnode for now */
 578         if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 579                 DB_DNODE_EXIT(db);
 580                 return;
 581         }
 582 
 583         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
 584             txh = list_next(&tx->tx_holds, txh)) {
 585                 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 586                 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 587                         match_object = TRUE;
 588                 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 589                         int datablkshift = dn->dn_datablkshift ?
 590                             dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 591                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 592                         int shift = datablkshift + epbs * db->db_level;
 593                         uint64_t beginblk = shift >= 64 ? 0 :
 594                             (txh->txh_arg1 >> shift);
 595                         uint64_t endblk = shift >= 64 ? 0 :
 596                             ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 597                         uint64_t blkid = db->db_blkid;
 598 
 599                         /* XXX txh_arg2 better not be zero... */
 600 
 601                         dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 602                             txh->txh_type, beginblk, endblk);
 603 
 604                         switch (txh->txh_type) {
 605                         case THT_WRITE:
 606                                 if (blkid >= beginblk && blkid <= endblk)
 607                                         match_offset = TRUE;
 608                                 /*
 609                                  * We will let this hold work for the bonus
 610                                  * or spill buffer so that we don't need to
 611                                  * hold it when creating a new object.
 612                                  */
 613                                 if (blkid == DMU_BONUS_BLKID ||
 614                                     blkid == DMU_SPILL_BLKID)
 615                                         match_offset = TRUE;
 616                                 /*
 617                                  * They might have to increase nlevels,
 618                                  * thus dirtying the new TLIBs.  Or the
 619                                  * might have to change the block size,
 620                                  * thus dirying the new lvl=0 blk=0.
 621                                  */
 622                                 if (blkid == 0)
 623                                         match_offset = TRUE;
 624                                 break;
 625                         case THT_FREE:
 626                                 /*
 627                                  * We will dirty all the level 1 blocks in
 628                                  * the free range and perhaps the first and
 629                                  * last level 0 block.
 630                                  */
 631                                 if (blkid >= beginblk && (blkid <= endblk ||
 632                                     txh->txh_arg2 == DMU_OBJECT_END))
 633                                         match_offset = TRUE;
 634                                 break;
 635                         case THT_SPILL:
 636                                 if (blkid == DMU_SPILL_BLKID)
 637                                         match_offset = TRUE;
 638                                 break;
 639                         case THT_BONUS:
 640                                 if (blkid == DMU_BONUS_BLKID)
 641                                         match_offset = TRUE;
 642                                 break;
 643                         case THT_ZAP:
 644                                 match_offset = TRUE;
 645                                 break;
 646                         case THT_NEWOBJECT:
 647                                 match_object = TRUE;
 648                                 break;
 649                         default:
 650                                 ASSERT(!"bad txh_type");
 651                         }
 652                 }
 653                 if (match_object && match_offset) {
 654                         DB_DNODE_EXIT(db);
 655                         return;
 656                 }
 657         }
 658         DB_DNODE_EXIT(db);
 659         panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 660             (u_longlong_t)db->db.db_object, db->db_level,
 661             (u_longlong_t)db->db_blkid);
 662 }
 663 #endif
 664 
/*
 * Tunables for the transaction delay.
 *
 * zfs_delay_max_ns is the upper bound on the delay; it is initialized to
 * 100 milliseconds, i.e. 10 operations per second.  If we can't do
 * 10 iops, something is wrong.  Let us go ahead and hit
 * zfs_dirty_data_max.
 *
 * zfs_delay_resolution_ns is initialized to 100 microseconds.
 * NOTE(review): its consumer lies beyond the visible part of the file;
 * presumably it is the timer resolution used when sleeping.
 */
hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 671 
 672 /*
 673  * We delay transactions when we've determined that the backend storage
 674  * isn't able to accommodate the rate of incoming writes.
 675  *
 676  * If there is already a transaction waiting, we delay relative to when
 677  * that transaction finishes waiting.  This way the calculated min_time
 678  * is independent of the number of threads concurrently executing
 679  * transactions.
 680  *
 681  * If we are the only waiter, wait relative to when the transaction
 682  * started, rather than the current time.  This credits the transaction for
 683  * "time already served", e.g. reading indirect blocks.
 684  *
 685  * The minimum time for a transaction to take is calculated as:
 686  *     min_time = scale * (dirty - min) / (max - dirty)
 687  *     min_time is then capped at zfs_delay_max_ns.
 688  *
 689  * The delay has two degrees of freedom that can be adjusted via tunables.
 690  * The percentage of dirty data at which we start to delay is defined by
 691  * zfs_delay_min_dirty_percent. This should typically be at or above
 692  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 693  * delay after writing at full speed has failed to keep up with the incoming
 694  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 695  * speaking, this variable determines the amount of delay at the midpoint of
 696  * the curve.
 697  *
 698  * delay
 699  *  10ms +-------------------------------------------------------------*+
 700  *       |                                                             *|
 701  *   9ms +                                                             *+
 702  *       |                                                             *|
 703  *   8ms +                                                             *+
 704  *       |                                                            * |
 705  *   7ms +                                                            * +
 706  *       |                                                            * |
 707  *   6ms +                                                            * +
 708  *       |                                                            * |
 709  *   5ms +                                                           *  +
 710  *       |                                                           *  |
 711  *   4ms +                                                           *  +
 712  *       |                                                           *  |
 713  *   3ms +                                                          *   +
 714  *       |                                                          *   |
 715  *   2ms +                                              (midpoint) *    +
 716  *       |                                                  |    **     |
 717  *   1ms +                                                  v ***       +
 718  *       |             zfs_delay_scale ---------->     ********         |
 719  *     0 +-------------------------------------*********----------------+
 720  *       0%                    <- zfs_dirty_data_max ->               100%
 721  *
 722  * Note that since the delay is added to the outstanding time remaining on the
 723  * most recent transaction, the delay is effectively the inverse of IOPS.
 724  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
 725  * was chosen such that small changes in the amount of accumulated dirty data
 726  * in the first 3/4 of the curve yield relatively small differences in the
 727  * amount of delay.
 728  *
 729  * The effects can be easier to understand when the amount of delay is
 730  * represented on a log scale:
 731  *
 732  * delay
 733  * 100ms +-------------------------------------------------------------++
 734  *       +                                                              +
 735  *       |                                                              |
 736  *       +                                                             *+
 737  *  10ms +                                                             *+
 738  *       +                                                           ** +
 739  *       |                                              (midpoint)  **  |
 740  *       +                                                  |     **    +
 741  *   1ms +                                                  v ****      +
 742  *       +             zfs_delay_scale ---------->        *****         +
 743  *       |                                             ****             |
 744  *       +                                          ****                +
 745  * 100us +                                        **                    +
 746  *       +                                       *                      +
 747  *       |                                      *                       |
 748  *       +                                     *                        +
 749  *  10us +                                     *                        +
 750  *       +                                                              +
 751  *       |                                                              |
 752  *       +                                                              +
 753  *       +--------------------------------------------------------------+
 754  *       0%                    <- zfs_dirty_data_max ->               100%
 755  *
 756  * Note here that only as the amount of dirty data approaches its limit does
 757  * the delay start to increase rapidly. The goal of a properly tuned system
 758  * should be to keep the amount of dirty data out of that range by first
 759  * ensuring that the appropriate limits are set for the I/O scheduler to reach
 760  * optimal throughput on the backend storage, and then by changing the value
 761  * of zfs_delay_scale to increase the steepness of the curve.
 762  */
 763 static void
 764 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 765 {
 766         dsl_pool_t *dp = tx->tx_pool;
 767         uint64_t delay_min_bytes =
 768             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 769         hrtime_t wakeup, min_tx_time, now;
 770 
 771         if (dirty <= delay_min_bytes)
 772                 return;
 773 
 774         /*
 775          * The caller has already waited until we are under the max.
 776          * We make them pass us the amount of dirty data so we don't
 777          * have to handle the case of it being >= the max, which could
 778          * cause a divide-by-zero if it's == the max.
 779          */
 780         ASSERT3U(dirty, <, zfs_dirty_data_max);
 781 
 782         now = gethrtime();
 783         min_tx_time = zfs_delay_scale *
 784             (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
 785         if (now > tx->tx_start + min_tx_time)
 786                 return;
 787 
 788         min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
 789 
 790         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
 791             uint64_t, min_tx_time);
 792 
 793         mutex_enter(&dp->dp_lock);
 794         wakeup = MAX(tx->tx_start + min_tx_time,
 795             dp->dp_last_wakeup + min_tx_time);
 796         dp->dp_last_wakeup = wakeup;
 797         mutex_exit(&dp->dp_lock);
 798 
 799 #ifdef _KERNEL
 800         mutex_enter(&curthread->t_delay_lock);
 801         while (cv_timedwait_hires(&curthread->t_delay_cv,
 802             &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
 803             CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
 804                 continue;
 805         mutex_exit(&curthread->t_delay_lock);
 806 #else
 807         hrtime_t delta = wakeup - gethrtime();
 808         struct timespec ts;
 809         ts.tv_sec = delta / NANOSEC;
 810         ts.tv_nsec = delta % NANOSEC;
 811         (void) nanosleep(&ts, NULL);
 812 #endif
 813 }
 814 
 815 /*
 816  * This routine attempts to assign the transaction to a transaction group.
 817  * To do so, we must determine if there is sufficient free space on disk.
 818  *
 819  * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
 820  * on it), then it is assumed that there is sufficient free space,
 821  * unless there's insufficient slop space in the pool (see the comment
 822  * above spa_slop_shift in spa_misc.c).
 823  *
 824  * If it is not a "netfree" transaction, then if the data already on disk
 825  * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
 826  * ENOSPC.  Otherwise, if the current rough estimate of pending changes,
 827  * plus the rough estimate of this transaction's changes, may exceed the
 828  * allowed usage, then this will fail with ERESTART, which will cause the
 829  * caller to wait for the pending changes to be written to disk (by waiting
 830  * for the next TXG to open), and then check the space usage again.
 831  *
 832  * The rough estimate of pending changes is comprised of the sum of:
 833  *
 834  *  - this transaction's holds' txh_space_towrite
 835  *
 836  *  - dd_tempreserved[], which is the sum of in-flight transactions'
 837  *    holds' txh_space_towrite (i.e. those transactions that have called
 838  *    dmu_tx_assign() but not yet called dmu_tx_commit()).
 839  *
 840  *  - dd_space_towrite[], which is the amount of dirtied dbufs.
 841  *
 842  * Note that all of these values are inflated by spa_get_worst_case_asize(),
 843  * which means that we may get ERESTART well before we are actually in danger
 844  * of running out of space, but this also mitigates any small inaccuracies
 845  * in the rough estimate (e.g. txh_space_towrite doesn't take into account
 846  * indirect blocks, and dd_space_towrite[] doesn't take into account changes
 847  * to the MOS).
 848  *
 849  * Note that due to this algorithm, it is possible to exceed the allowed
 850  * usage by one transaction.  Also, as we approach the allowed usage,
 851  * we will allow a very limited amount of changes into each TXG, thus
 852  * decreasing performance.
 853  */
 854 static int
 855 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 856 {
 857         spa_t *spa = tx->tx_pool->dp_spa;
 858 
 859         ASSERT0(tx->tx_txg);
 860 
 861         if (tx->tx_err)
 862                 return (tx->tx_err);
 863 
 864         if (spa_suspended(spa)) {
 865                 /*
 866                  * If the user has indicated a blocking failure mode
 867                  * then return ERESTART which will block in dmu_tx_wait().
 868                  * Otherwise, return EIO so that an error can get
 869                  * propagated back to the VOP calls.
 870                  *
 871                  * Note that we always honor the txg_how flag regardless
 872                  * of the failuremode setting.
 873                  */
 874                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 875                     txg_how != TXG_WAIT)
 876                         return (SET_ERROR(EIO));
 877 
 878                 return (SET_ERROR(ERESTART));
 879         }
 880 
 881         if (!tx->tx_waited &&
 882             dsl_pool_need_dirty_delay(tx->tx_pool)) {
 883                 tx->tx_wait_dirty = B_TRUE;
 884                 return (SET_ERROR(ERESTART));
 885         }
 886 
 887         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 888         tx->tx_needassign_txh = NULL;
 889 
 890         /*
 891          * NB: No error returns are allowed after txg_hold_open, but
 892          * before processing the dnode holds, due to the
 893          * dmu_tx_unassign() logic.
 894          */
 895 
 896         uint64_t towrite = 0;
 897         uint64_t tohold = 0;
 898         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
 899             txh = list_next(&tx->tx_holds, txh)) {
 900                 dnode_t *dn = txh->txh_dnode;
 901                 if (dn != NULL) {
 902                         mutex_enter(&dn->dn_mtx);
 903                         if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 904                                 mutex_exit(&dn->dn_mtx);
 905                                 tx->tx_needassign_txh = txh;
 906                                 return (SET_ERROR(ERESTART));
 907                         }
 908                         if (dn->dn_assigned_txg == 0)
 909                                 dn->dn_assigned_txg = tx->tx_txg;
 910                         ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 911                         (void) refcount_add(&dn->dn_tx_holds, tx);
 912                         mutex_exit(&dn->dn_mtx);
 913                 }
 914                 towrite += refcount_count(&txh->txh_space_towrite);
 915                 tohold += refcount_count(&txh->txh_memory_tohold);
 916         }
 917 
 918         /* needed allocation: worst-case estimate of write space */
 919         uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
 920         /* calculate memory footprint estimate */
 921         uint64_t memory = towrite + tohold;
 922 
 923         if (tx->tx_dir != NULL && asize != 0) {
 924                 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
 925                     asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
 926                 if (err != 0)
 927                         return (err);
 928         }
 929 
 930         return (0);
 931 }
 932 
 933 static void
 934 dmu_tx_unassign(dmu_tx_t *tx)
 935 {
 936         if (tx->tx_txg == 0)
 937                 return;
 938 
 939         txg_rele_to_quiesce(&tx->tx_txgh);
 940 
 941         /*
 942          * Walk the transaction's hold list, removing the hold on the
 943          * associated dnode, and notifying waiters if the refcount drops to 0.
 944          */
 945         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
 946             txh != tx->tx_needassign_txh;
 947             txh = list_next(&tx->tx_holds, txh)) {
 948                 dnode_t *dn = txh->txh_dnode;
 949 
 950                 if (dn == NULL)
 951                         continue;
 952                 mutex_enter(&dn->dn_mtx);
 953                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 954 
 955                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
 956                         dn->dn_assigned_txg = 0;
 957                         cv_broadcast(&dn->dn_notxholds);
 958                 }
 959                 mutex_exit(&dn->dn_mtx);
 960         }
 961 
 962         txg_rele_to_sync(&tx->tx_txgh);
 963 
 964         tx->tx_lasttried_txg = tx->tx_txg;
 965         tx->tx_txg = 0;
 966 }
 967 
 968 /*
 969  * Assign tx to a transaction group.  txg_how can be one of:
 970  *
 971  * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
 972  *      a new one.  This should be used when you're not holding locks.
 973  *      It will only fail if we're truly out of space (or over quota).
 974  *
 975  * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
 976  *      blocking, returns immediately with ERESTART.  This should be used
 977  *      whenever you're holding locks.  On an ERESTART error, the caller
 978  *      should drop locks, do a dmu_tx_wait(tx), and try again.
 979  *
 980  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
 981  *      has already been called on behalf of this operation (though
 982  *      most likely on a different tx).
 983  */
 984 int
 985 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 986 {
 987         int err;
 988 
 989         ASSERT(tx->tx_txg == 0);
 990         ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
 991             txg_how == TXG_WAITED);
 992         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 993 
 994         /* If we might wait, we must not hold the config lock. */
 995         ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
 996 
 997         if (txg_how == TXG_WAITED)
 998                 tx->tx_waited = B_TRUE;
 999 
1000         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1001                 dmu_tx_unassign(tx);
1002 
1003                 if (err != ERESTART || txg_how != TXG_WAIT)
1004                         return (err);
1005 
1006                 dmu_tx_wait(tx);
1007         }
1008 
1009         txg_rele_to_quiesce(&tx->tx_txgh);
1010 
1011         return (0);
1012 }
1013 
1014 void
1015 dmu_tx_wait(dmu_tx_t *tx)
1016 {
1017         spa_t *spa = tx->tx_pool->dp_spa;
1018         dsl_pool_t *dp = tx->tx_pool;
1019 
1020         ASSERT(tx->tx_txg == 0);
1021         ASSERT(!dsl_pool_config_held(tx->tx_pool));
1022 
1023         if (tx->tx_wait_dirty) {
1024                 /*
1025                  * dmu_tx_try_assign() has determined that we need to wait
1026                  * because we've consumed much or all of the dirty buffer
1027                  * space.
1028                  */
1029                 mutex_enter(&dp->dp_lock);
1030                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1031                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1032                 uint64_t dirty = dp->dp_dirty_total;
1033                 mutex_exit(&dp->dp_lock);
1034 
1035                 dmu_tx_delay(tx, dirty);
1036 
1037                 tx->tx_wait_dirty = B_FALSE;
1038 
1039                 /*
1040                  * Note: setting tx_waited only has effect if the caller
1041                  * used TX_WAIT.  Otherwise they are going to destroy
1042                  * this tx and try again.  The common case, zfs_write(),
1043                  * uses TX_WAIT.
1044                  */
1045                 tx->tx_waited = B_TRUE;
1046         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1047                 /*
1048                  * If the pool is suspended we need to wait until it
1049                  * is resumed.  Note that it's possible that the pool
1050                  * has become active after this thread has tried to
1051                  * obtain a tx.  If that's the case then tx_lasttried_txg
1052                  * would not have been set.
1053                  */
1054                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1055         } else if (tx->tx_needassign_txh) {
1056                 /*
1057                  * A dnode is assigned to the quiescing txg.  Wait for its
1058                  * transaction to complete.
1059                  */
1060                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1061 
1062                 mutex_enter(&dn->dn_mtx);
1063                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1064                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1065                 mutex_exit(&dn->dn_mtx);
1066                 tx->tx_needassign_txh = NULL;
1067         } else {
1068                 /*
1069                  * If we have a lot of dirty data just wait until we sync
1070                  * out a TXG at which point we'll hopefully have synced
1071                  * a portion of the changes.
1072                  */
1073                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1074         }
1075 }
1076 
1077 static void
1078 dmu_tx_destroy(dmu_tx_t *tx)
1079 {
1080         dmu_tx_hold_t *txh;
1081 
1082         while ((txh = list_head(&tx->tx_holds)) != NULL) {
1083                 dnode_t *dn = txh->txh_dnode;
1084 
1085                 list_remove(&tx->tx_holds, txh);
1086                 refcount_destroy_many(&txh->txh_space_towrite,
1087                     refcount_count(&txh->txh_space_towrite));
1088                 refcount_destroy_many(&txh->txh_memory_tohold,
1089                     refcount_count(&txh->txh_memory_tohold));
1090                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1091                 if (dn != NULL)
1092                         dnode_rele(dn, tx);
1093         }
1094 
1095         list_destroy(&tx->tx_callbacks);
1096         list_destroy(&tx->tx_holds);
1097         kmem_free(tx, sizeof (dmu_tx_t));
1098 }
1099 
1100 void
1101 dmu_tx_commit(dmu_tx_t *tx)
1102 {
1103         ASSERT(tx->tx_txg != 0);
1104 
1105         /*
1106          * Go through the transaction's hold list and remove holds on
1107          * associated dnodes, notifying waiters if no holds remain.
1108          */
1109         for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1110             txh = list_next(&tx->tx_holds, txh)) {
1111                 dnode_t *dn = txh->txh_dnode;
1112 
1113                 if (dn == NULL)
1114                         continue;
1115 
1116                 mutex_enter(&dn->dn_mtx);
1117                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1118 
1119                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1120                         dn->dn_assigned_txg = 0;
1121                         cv_broadcast(&dn->dn_notxholds);
1122                 }
1123                 mutex_exit(&dn->dn_mtx);
1124         }
1125 
1126         if (tx->tx_tempreserve_cookie)
1127                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1128 
1129         if (!list_is_empty(&tx->tx_callbacks)) {
1130                 if (dmu_tx_is_syncing(tx)) {
1131                         txg_register_callbacks_sync(tx->tx_pool,
1132                             tx->tx_txg, &tx->tx_callbacks);
1133                 } else {
1134                         txg_register_callbacks(&tx->tx_txgh,
1135                             &tx->tx_callbacks);
1136                 }
1137         }
1138 
1139         if (tx->tx_anyobj == FALSE)
1140                 txg_rele_to_sync(&tx->tx_txgh);
1141 
1142         dmu_tx_destroy(tx);
1143 }
1144 
1145 void
1146 dmu_tx_abort(dmu_tx_t *tx)
1147 {
1148         ASSERT(tx->tx_txg == 0);
1149 
1150         /*
1151          * Call any registered callbacks with an error code.
1152          */
1153         if (!list_is_empty(&tx->tx_callbacks))
1154                 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1155 
1156         dmu_tx_destroy(tx);
1157 }
1158 
1159 uint64_t
1160 dmu_tx_get_txg(dmu_tx_t *tx)
1161 {
1162         ASSERT(tx->tx_txg != 0);
1163         return (tx->tx_txg);
1164 }
1165 
1166 dsl_pool_t *
1167 dmu_tx_pool(dmu_tx_t *tx)
1168 {
1169         ASSERT(tx->tx_pool != NULL);
1170         return (tx->tx_pool);
1171 }
1172 
1173 void
1174 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1175 {
1176         dmu_tx_callback_t *dcb;
1177 
1178         dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1179 
1180         dcb->dcb_func = func;
1181         dcb->dcb_data = data;
1182 
1183         list_insert_tail(&tx->tx_callbacks, dcb);
1184 }
1185 
1186 /*
1187  * Call all the commit callbacks on a list, with a given error code.
1188  */
1189 void
1190 dmu_tx_do_callbacks(list_t *cb_list, int error)
1191 {
1192         dmu_tx_callback_t *dcb;
1193 
1194         while ((dcb = list_head(cb_list)) != NULL) {
1195                 list_remove(cb_list, dcb);
1196                 dcb->dcb_func(dcb->dcb_data, error);
1197                 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1198         }
1199 }
1200 
/*
 * dmu_tx_hold_sa_create() is the interface for holding a bunch of
 * attributes at once; it is used when creating new files.  Its attrsize
 * argument is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */
1209 
1210 /*
1211  * hold necessary attribute name for attribute registration.
1212  * should be a very rare case where this is needed.  If it does
1213  * happen it would only happen on the first write to the file system.
1214  */
1215 static void
1216 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1217 {
1218         if (!sa->sa_need_attr_registration)
1219                 return;
1220 
1221         for (int i = 0; i != sa->sa_num_attrs; i++) {
1222                 if (!sa->sa_attr_table[i].sa_registered) {
1223                         if (sa->sa_reg_attr_obj)
1224                                 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1225                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1226                         else
1227                                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1228                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1229                 }
1230         }
1231 }
1232 
1233 void
1234 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1235 {
1236         dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
1237             tx->tx_objset, object, THT_SPILL, 0, 0);
1238 
1239         (void) refcount_add_many(&txh->txh_space_towrite,
1240             SPA_OLD_MAXBLOCKSIZE, FTAG);
1241 }
1242 
1243 void
1244 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1245 {
1246         sa_os_t *sa = tx->tx_objset->os_sa;
1247 
1248         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1249 
1250         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1251                 return;
1252 
1253         if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1254                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1255         } else {
1256                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1257                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1258                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1259                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1260         }
1261 
1262         dmu_tx_sa_registration_hold(sa, tx);
1263 
1264         if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1265                 return;
1266 
1267         (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1268             THT_SPILL, 0, 0);
1269 }
1270 
1271 /*
1272  * Hold SA attribute
1273  *
1274  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1275  *
1276  * variable_size is the total size of all variable sized attributes
1277  * passed to this function.  It is not the total size of all
1278  * variable size attributes that *may* exist on this object.
1279  */
1280 void
1281 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1282 {
1283         uint64_t object;
1284         sa_os_t *sa = tx->tx_objset->os_sa;
1285 
1286         ASSERT(hdl != NULL);
1287 
1288         object = sa_handle_object(hdl);
1289 
1290         dmu_tx_hold_bonus(tx, object);
1291 
1292         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1293                 return;
1294 
1295         if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1296             tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1297                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1298                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1299                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1300                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1301         }
1302 
1303         dmu_tx_sa_registration_hold(sa, tx);
1304 
1305         if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1306                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1307 
1308         if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1309                 ASSERT(tx->tx_txg == 0);
1310                 dmu_tx_hold_spill(tx, object);
1311         } else {
1312                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1313                 dnode_t *dn;
1314 
1315                 DB_DNODE_ENTER(db);
1316                 dn = DB_DNODE(db);
1317                 if (dn->dn_have_spill) {
1318                         ASSERT(tx->tx_txg == 0);
1319                         dmu_tx_hold_spill(tx, object);
1320                 }
1321                 DB_DNODE_EXIT(db);
1322         }
1323 }