NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4582 update wrc test cases to allow using write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties
--- old/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 */
27 27
28 28 #include <sys/dmu.h>
29 29 #include <sys/dmu_impl.h>
30 30 #include <sys/dbuf.h>
31 31 #include <sys/dmu_tx.h>
32 32 #include <sys/dmu_objset.h>
33 33 #include <sys/dsl_dataset.h>
34 34 #include <sys/dsl_dir.h>
35 35 #include <sys/dsl_pool.h>
36 36 #include <sys/zap_impl.h>
37 37 #include <sys/spa.h>
38 38 #include <sys/sa.h>
39 39 #include <sys/sa_impl.h>
40 40 #include <sys/zfs_context.h>
41 41 #include <sys/varargs.h>
42 42
43 43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44 44 uint64_t arg1, uint64_t arg2);
45 45
46 46
47 47 dmu_tx_t *
48 48 dmu_tx_create_dd(dsl_dir_t *dd)
49 49 {
50 50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51 51 tx->tx_dir = dd;
52 52 if (dd != NULL)
53 53 tx->tx_pool = dd->dd_pool;
54 54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55 55 offsetof(dmu_tx_hold_t, txh_node));
56 56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57 57 offsetof(dmu_tx_callback_t, dcb_node));
58 58 tx->tx_start = gethrtime();
59 59 return (tx);
60 60 }
61 61
62 62 dmu_tx_t *
63 63 dmu_tx_create(objset_t *os)
64 64 {
65 65 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
66 66 tx->tx_objset = os;
67 67 return (tx);
68 68 }
69 69
70 70 dmu_tx_t *
71 71 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
72 72 {
73 73 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
74 74
75 75 txg_verify(dp->dp_spa, txg);
76 76 tx->tx_pool = dp;
77 77 tx->tx_txg = txg;
78 78 tx->tx_anyobj = TRUE;
79 79
80 80 return (tx);
81 81 }
82 82
83 83 int
84 84 dmu_tx_is_syncing(dmu_tx_t *tx)
85 85 {
86 86 return (tx->tx_anyobj);
87 87 }
88 88
89 89 int
90 90 dmu_tx_private_ok(dmu_tx_t *tx)
91 91 {
92 92 return (tx->tx_anyobj);
93 93 }
94 94
95 95 static dmu_tx_hold_t *
96 96 dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
97 97 uint64_t arg1, uint64_t arg2)
98 98 {
99 99 dmu_tx_hold_t *txh;
100 100
101 101 if (dn != NULL) {
102 102 (void) refcount_add(&dn->dn_holds, tx);
103 103 if (tx->tx_txg != 0) {
104 104 mutex_enter(&dn->dn_mtx);
105 105 /*
106 106 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
107 107 * problem, but there's no way for it to happen (for
108 108 * now, at least).
109 109 */
110 110 ASSERT(dn->dn_assigned_txg == 0);
111 111 dn->dn_assigned_txg = tx->tx_txg;
112 112 (void) refcount_add(&dn->dn_tx_holds, tx);
113 113 mutex_exit(&dn->dn_mtx);
114 114 }
115 115 }
116 116
117 117 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
118 118 txh->txh_tx = tx;
119 119 txh->txh_dnode = dn;
120 120 refcount_create(&txh->txh_space_towrite);
121 121 refcount_create(&txh->txh_memory_tohold);
122 122 txh->txh_type = type;
123 123 txh->txh_arg1 = arg1;
124 124 txh->txh_arg2 = arg2;
125 125 list_insert_tail(&tx->tx_holds, txh);
126 126
127 127 return (txh);
128 128 }
129 129
130 130 static dmu_tx_hold_t *
131 131 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
132 132 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
133 133 {
134 134 dnode_t *dn = NULL;
135 135 dmu_tx_hold_t *txh;
136 136 int err;
137 137
138 138 if (object != DMU_NEW_OBJECT) {
139 139 err = dnode_hold(os, object, FTAG, &dn);
140 140 if (err != 0) {
141 141 tx->tx_err = err;
142 142 return (NULL);
143 143 }
144 144 }
145 145 txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
146 146 if (dn != NULL)
147 147 dnode_rele(dn, FTAG);
148 148 return (txh);
149 149 }
150 150
151 151 void
152 152 dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
153 153 {
154 154 /*
155 155 * If we're syncing, they can manipulate any object anyhow, and
156 156 * the hold on the dnode_t can cause problems.
157 157 */
158 158 if (!dmu_tx_is_syncing(tx))
159 159 (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
160 160 }
161 161
162 162 /*
163 163 * This function reads specified data from disk. The specified data will
164 164 * be needed to perform the transaction -- i.e, it will be read after
165 165 * we do dmu_tx_assign(). There are two reasons that we read the data now
166 166 * (before dmu_tx_assign()):
167 167 *
168 168 * 1. Reading it now has potentially better performance. The transaction
169 169 * has not yet been assigned, so the TXG is not held open, and also the
170 170 * caller typically has less locks held when calling dmu_tx_hold_*() than
171 171 * after the transaction has been assigned. This reduces the lock (and txg)
172 172 * hold times, thus reducing lock contention.
173 173 *
174 174 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
175 175 * that are detected before they start making changes to the DMU state
176 176 * (i.e. now). Once the transaction has been assigned, and some DMU
177 177 * state has been changed, it can be difficult to recover from an i/o
178 178 * error (e.g. to undo the changes already made in memory at the DMU
179 179 * layer). Typically code to do so does not exist in the caller -- it
180 180 * assumes that the data has already been cached and thus i/o errors are
181 181 * not possible.
182 182 *
183 183 * It has been observed that the i/o initiated here can be a performance
184 184 * problem, and it appears to be optional, because we don't look at the
185 185 * data which is read. However, removing this read would only serve to
186 186 * move the work elsewhere (after the dmu_tx_assign()), where it may
187 187 * have a greater impact on performance (in addition to the impact on
188 188 * fault tolerance noted above).
189 189 */
190 190 static int
191 191 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
192 192 {
193 193 int err;
194 194 dmu_buf_impl_t *db;
195 195
196 196 rw_enter(&dn->dn_struct_rwlock, RW_READER);
197 197 db = dbuf_hold_level(dn, level, blkid, FTAG);
198 198 rw_exit(&dn->dn_struct_rwlock);
199 199 if (db == NULL)
200 200 return (SET_ERROR(EIO));
201 201 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
202 202 dbuf_rele(db, FTAG);
203 203 return (err);
204 204 }
205 205
206 206 /* ARGSUSED */
207 207 static void
208 208 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
209 209 {
210 210 dnode_t *dn = txh->txh_dnode;
211 211 int err = 0;
212 212
213 213 if (len == 0)
214 214 return;
215 215
216 216 (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
217 217
218 218 if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
219 219 err = SET_ERROR(EFBIG);
220 220
221 221 if (dn == NULL)
222 222 return;
223 223
224 224 /*
225 225 * For i/o error checking, read the blocks that will be needed
226 226 * to perform the write: the first and last level-0 blocks (if
227 227 * they are not aligned, i.e. if they are partial-block writes),
228 228 * and all the level-1 blocks.
229 229 */
230 230 if (dn->dn_maxblkid == 0) {
231 231 if (off < dn->dn_datablksz &&
232 232 (off > 0 || len < dn->dn_datablksz)) {
233 233 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
234 234 if (err != 0) {
235 235 txh->txh_tx->tx_err = err;
236 236 }
237 237 }
238 238 } else {
239 239 zio_t *zio = zio_root(dn->dn_objset->os_spa,
240 240 NULL, NULL, ZIO_FLAG_CANFAIL);
241 241
242 242 /* first level-0 block */
243 243 uint64_t start = off >> dn->dn_datablkshift;
244 244 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
245 245 err = dmu_tx_check_ioerr(zio, dn, 0, start);
246 246 if (err != 0) {
247 247 txh->txh_tx->tx_err = err;
248 248 }
249 249 }
250 250
251 251 /* last level-0 block */
252 252 uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
253 253 if (end != start && end <= dn->dn_maxblkid &&
254 254 P2PHASE(off + len, dn->dn_datablksz)) {
255 255 err = dmu_tx_check_ioerr(zio, dn, 0, end);
256 256 if (err != 0) {
257 257 txh->txh_tx->tx_err = err;
258 258 }
259 259 }
260 260
261 261 /* level-1 blocks */
262 262 if (dn->dn_nlevels > 1) {
263 263 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
264 264 for (uint64_t i = (start >> shft) + 1;
265 265 i < end >> shft; i++) {
266 266 err = dmu_tx_check_ioerr(zio, dn, 1, i);
267 267 if (err != 0) {
268 268 txh->txh_tx->tx_err = err;
269 269 }
270 270 }
271 271 }
272 272
273 273 err = zio_wait(zio);
274 274 if (err != 0) {
275 275 txh->txh_tx->tx_err = err;
276 276 }
277 277 }
278 278 }
279 279
280 280 static void
281 281 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
282 282 {
283 283 (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
284 284 }
285 285
286 286 void
287 287 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
288 288 {
289 289 dmu_tx_hold_t *txh;
290 290
291 291 ASSERT0(tx->tx_txg);
292 292 ASSERT3U(len, <=, DMU_MAX_ACCESS);
293 293 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
294 294
295 295 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
296 296 object, THT_WRITE, off, len);
297 297 if (txh != NULL) {
298 298 dmu_tx_count_write(txh, off, len);
299 299 dmu_tx_count_dnode(txh);
300 300 }
301 301 }
302 302
303 303 void
304 -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
305 -{
306 - dmu_tx_hold_t *txh;
307 -
308 - ASSERT(tx->tx_txg == 0);
309 - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
310 - object, THT_WRITE, 0, 0);
311 - if (txh == NULL)
312 - return;
313 -
314 - dnode_t *dn = txh->txh_dnode;
315 - (void) refcount_add_many(&txh->txh_space_towrite,
316 - 1ULL << dn->dn_indblkshift, FTAG);
317 - dmu_tx_count_dnode(txh);
318 -}
319 -
320 -void
321 304 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
322 305 {
323 306 dmu_tx_hold_t *txh;
324 307
325 308 ASSERT0(tx->tx_txg);
326 309 ASSERT3U(len, <=, DMU_MAX_ACCESS);
327 310 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
328 311
329 312 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
330 313 if (txh != NULL) {
331 314 dmu_tx_count_write(txh, off, len);
332 315 dmu_tx_count_dnode(txh);
333 316 }
334 317 }
335 318
336 319 /*
337 320 * This function marks the transaction as being a "net free". The end
338 321 * result is that refquotas will be disabled for this transaction, and
339 322 * this transaction will be able to use half of the pool space overhead
340 323 * (see dsl_pool_adjustedsize()). Therefore this function should only
341 324 * be called for transactions that we expect will not cause a net increase
342 325 * in the amount of space used (but it's OK if that is occasionally not true).
343 326 */
344 327 void
345 328 dmu_tx_mark_netfree(dmu_tx_t *tx)
346 329 {
347 330 tx->tx_netfree = B_TRUE;
348 331 }
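
For reference, a minimal sketch of how a free-only caller (e.g. a truncate or remove path) might use this; the object and range values are placeholders and this code is not part of the patch:

	dmu_tx_t *tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);	/* we expect a net decrease in space used */
	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... free the range under this tx ... */
	dmu_tx_commit(tx);
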
349 332
350 333 static void
351 334 dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
352 335 {
353 336 dmu_tx_t *tx;
354 337 dnode_t *dn;
355 338 int err;
356 339
357 340 tx = txh->txh_tx;
358 341 ASSERT(tx->tx_txg == 0);
359 342
360 343 dn = txh->txh_dnode;
361 344 dmu_tx_count_dnode(txh);
362 345
363 346 if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
364 347 return;
365 348 if (len == DMU_OBJECT_END)
366 349 len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
367 350
368 351 /*
369 352 * For i/o error checking, we read the first and last level-0
370 353 * blocks if they are not aligned, and all the level-1 blocks.
371 354 *
372 355 * Note: dbuf_free_range() assumes that we have not instantiated
373 356 * any level-0 dbufs that will be completely freed. Therefore we must
374 357 * exercise care to not read or count the first and last blocks
375 358 * if they are blocksize-aligned.
376 359 */
377 360 if (dn->dn_datablkshift == 0) {
378 361 if (off != 0 || len < dn->dn_datablksz)
379 362 dmu_tx_count_write(txh, 0, dn->dn_datablksz);
380 363 } else {
381 364 /* first block will be modified if it is not aligned */
382 365 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
383 366 dmu_tx_count_write(txh, off, 1);
384 367 /* last block will be modified if it is not aligned */
385 368 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
386 369 dmu_tx_count_write(txh, off + len, 1);
387 370 }
388 371
389 372 /*
390 373 * Check level-1 blocks.
391 374 */
392 375 if (dn->dn_nlevels > 1) {
393 376 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
394 377 SPA_BLKPTRSHIFT;
395 378 uint64_t start = off >> shift;
396 379 uint64_t end = (off + len) >> shift;
397 380
398 381 ASSERT(dn->dn_indblkshift != 0);
399 382
400 383 /*
401 384 * dnode_reallocate() can result in an object with indirect
402 385 * blocks having an odd data block size. In this case,
403 386 * just check the single block.
404 387 */
405 388 if (dn->dn_datablkshift == 0)
406 389 start = end = 0;
407 390
408 391 zio_t *zio = zio_root(tx->tx_pool->dp_spa,
409 392 NULL, NULL, ZIO_FLAG_CANFAIL);
410 393 for (uint64_t i = start; i <= end; i++) {
411 394 uint64_t ibyte = i << shift;
412 395 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
413 396 i = ibyte >> shift;
414 397 if (err == ESRCH || i > end)
415 398 break;
416 399 if (err != 0) {
417 400 tx->tx_err = err;
418 401 (void) zio_wait(zio);
419 402 return;
420 403 }
421 404
422 405 (void) refcount_add_many(&txh->txh_memory_tohold,
423 406 1 << dn->dn_indblkshift, FTAG);
424 407
425 408 err = dmu_tx_check_ioerr(zio, dn, 1, i);
426 409 if (err != 0) {
427 410 tx->tx_err = err;
428 411 (void) zio_wait(zio);
429 412 return;
430 413 }
431 414 }
432 415 err = zio_wait(zio);
433 416 if (err != 0) {
434 417 tx->tx_err = err;
435 418 return;
436 419 }
437 420 }
438 421 }
439 422
440 423 void
441 424 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
442 425 {
443 426 dmu_tx_hold_t *txh;
444 427
445 428 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
446 429 object, THT_FREE, off, len);
447 430 if (txh != NULL)
448 431 (void) dmu_tx_hold_free_impl(txh, off, len);
449 432 }
450 433
451 434 void
452 435 dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
453 436 {
454 437 dmu_tx_hold_t *txh;
455 438
456 439 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
457 440 if (txh != NULL)
458 441 (void) dmu_tx_hold_free_impl(txh, off, len);
459 442 }
460 443
461 444 static void
462 445 dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
463 446 {
464 447 dmu_tx_t *tx = txh->txh_tx;
465 448 dnode_t *dn;
466 449 int err;
467 450
468 451 ASSERT(tx->tx_txg == 0);
469 452
470 453 dn = txh->txh_dnode;
471 454
472 455 dmu_tx_count_dnode(txh);
473 456
474 457 /*
 475 458  * Modifying an almost-full microzap is around the worst case (128KB)
476 459 *
477 460 * If it is a fat zap, the worst case would be 7*16KB=112KB:
478 461 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
479 462 * - 4 new blocks written if adding:
480 463 * - 2 blocks for possibly split leaves,
481 464 * - 2 grown ptrtbl blocks
482 465 */
483 466 (void) refcount_add_many(&txh->txh_space_towrite,
484 467 MZAP_MAX_BLKSZ, FTAG);
485 468
486 469 if (dn == NULL)
487 470 return;
488 471
489 472 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
490 473
491 474 if (dn->dn_maxblkid == 0 || name == NULL) {
492 475 /*
493 476 * This is a microzap (only one block), or we don't know
494 477 * the name. Check the first block for i/o errors.
495 478 */
496 479 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
497 480 if (err != 0) {
498 481 tx->tx_err = err;
499 482 }
500 483 } else {
501 484 /*
502 485 * Access the name so that we'll check for i/o errors to
503 486 * the leaf blocks, etc. We ignore ENOENT, as this name
504 487 * may not yet exist.
505 488 */
506 489 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
507 490 if (err == EIO || err == ECKSUM || err == ENXIO) {
508 491 tx->tx_err = err;
509 492 }
510 493 }
511 494 }
512 495
513 496 void
514 497 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
515 498 {
516 499 dmu_tx_hold_t *txh;
517 500
518 501 ASSERT0(tx->tx_txg);
519 502
520 503 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
521 504 object, THT_ZAP, add, (uintptr_t)name);
522 505 if (txh != NULL)
523 506 dmu_tx_hold_zap_impl(txh, name);
524 507 }
525 508
526 509 void
527 510 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
528 511 {
529 512 dmu_tx_hold_t *txh;
530 513
531 514 ASSERT0(tx->tx_txg);
532 515 ASSERT(dn != NULL);
533 516
534 517 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
535 518 if (txh != NULL)
536 519 dmu_tx_hold_zap_impl(txh, name);
537 520 }
538 521
539 522 void
540 523 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
541 524 {
542 525 dmu_tx_hold_t *txh;
543 526
544 527 ASSERT(tx->tx_txg == 0);
545 528
546 529 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
547 530 object, THT_BONUS, 0, 0);
548 531 if (txh)
549 532 dmu_tx_count_dnode(txh);
550 533 }
551 534
552 535 void
553 536 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
554 537 {
555 538 dmu_tx_hold_t *txh;
556 539
557 540 ASSERT0(tx->tx_txg);
558 541
559 542 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
560 543 if (txh)
561 544 dmu_tx_count_dnode(txh);
562 545 }
563 546
564 547 void
565 548 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
566 549 {
567 550 dmu_tx_hold_t *txh;
568 551 ASSERT(tx->tx_txg == 0);
569 552
570 553 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
571 554 DMU_NEW_OBJECT, THT_SPACE, space, 0);
572 555
573 556 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
574 557 }
575 558
576 559 #ifdef ZFS_DEBUG
577 560 void
578 561 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
579 562 {
580 563 boolean_t match_object = B_FALSE;
581 564 boolean_t match_offset = B_FALSE;
582 565
583 566 DB_DNODE_ENTER(db);
584 567 dnode_t *dn = DB_DNODE(db);
585 568 ASSERT(tx->tx_txg != 0);
586 569 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
587 570 ASSERT3U(dn->dn_object, ==, db->db.db_object);
588 571
589 572 if (tx->tx_anyobj) {
590 573 DB_DNODE_EXIT(db);
591 574 return;
592 575 }
593 576
594 577 /* XXX No checking on the meta dnode for now */
595 578 if (db->db.db_object == DMU_META_DNODE_OBJECT) {
596 579 DB_DNODE_EXIT(db);
597 580 return;
598 581 }
599 582
600 583 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
601 584 txh = list_next(&tx->tx_holds, txh)) {
602 585 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
603 586 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
604 587 match_object = TRUE;
605 588 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
606 589 int datablkshift = dn->dn_datablkshift ?
607 590 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
608 591 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
609 592 int shift = datablkshift + epbs * db->db_level;
610 593 uint64_t beginblk = shift >= 64 ? 0 :
611 594 (txh->txh_arg1 >> shift);
612 595 uint64_t endblk = shift >= 64 ? 0 :
613 596 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
614 597 uint64_t blkid = db->db_blkid;
615 598
616 599 /* XXX txh_arg2 better not be zero... */
617 600
618 601 dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
619 602 txh->txh_type, beginblk, endblk);
620 603
621 604 switch (txh->txh_type) {
622 605 case THT_WRITE:
623 606 if (blkid >= beginblk && blkid <= endblk)
624 607 match_offset = TRUE;
625 608 /*
626 609 * We will let this hold work for the bonus
627 610 * or spill buffer so that we don't need to
628 611 * hold it when creating a new object.
629 612 */
630 613 if (blkid == DMU_BONUS_BLKID ||
631 614 blkid == DMU_SPILL_BLKID)
632 615 match_offset = TRUE;
633 616 /*
634 617 * They might have to increase nlevels,
635 618 * thus dirtying the new TLIBs. Or the
 636 619  * thus dirtying the new TLIBs. Or they
637 620 * thus dirying the new lvl=0 blk=0.
 638 621  * thus dirtying the new lvl=0 blk=0.
639 622 if (blkid == 0)
640 623 match_offset = TRUE;
641 624 break;
642 625 case THT_FREE:
643 626 /*
644 627 * We will dirty all the level 1 blocks in
645 628 * the free range and perhaps the first and
646 629 * last level 0 block.
647 630 */
648 631 if (blkid >= beginblk && (blkid <= endblk ||
649 632 txh->txh_arg2 == DMU_OBJECT_END))
650 633 match_offset = TRUE;
651 634 break;
652 635 case THT_SPILL:
653 636 if (blkid == DMU_SPILL_BLKID)
654 637 match_offset = TRUE;
655 638 break;
656 639 case THT_BONUS:
657 640 if (blkid == DMU_BONUS_BLKID)
658 641 match_offset = TRUE;
659 642 break;
660 643 case THT_ZAP:
661 644 match_offset = TRUE;
662 645 break;
663 646 case THT_NEWOBJECT:
664 647 match_object = TRUE;
665 648 break;
666 649 default:
667 650 ASSERT(!"bad txh_type");
668 651 }
669 652 }
670 653 if (match_object && match_offset) {
671 654 DB_DNODE_EXIT(db);
672 655 return;
673 656 }
674 657 }
675 658 DB_DNODE_EXIT(db);
676 659 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
677 660 (u_longlong_t)db->db.db_object, db->db_level,
678 661 (u_longlong_t)db->db_blkid);
679 662 }
680 663 #endif
681 664
682 665 /*
683 666 * If we can't do 10 iops, something is wrong. Let us go ahead
684 667 * and hit zfs_dirty_data_max.
685 668 */
686 669 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
687 670 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
688 671
689 672 /*
690 673 * We delay transactions when we've determined that the backend storage
691 674 * isn't able to accommodate the rate of incoming writes.
692 675 *
693 676 * If there is already a transaction waiting, we delay relative to when
694 677 * that transaction finishes waiting. This way the calculated min_time
695 678 * is independent of the number of threads concurrently executing
696 679 * transactions.
697 680 *
698 681 * If we are the only waiter, wait relative to when the transaction
699 682 * started, rather than the current time. This credits the transaction for
700 683 * "time already served", e.g. reading indirect blocks.
701 684 *
702 685 * The minimum time for a transaction to take is calculated as:
703 686 * min_time = scale * (dirty - min) / (max - dirty)
704 687 * min_time is then capped at zfs_delay_max_ns.
705 688 *
706 689 * The delay has two degrees of freedom that can be adjusted via tunables.
707 690 * The percentage of dirty data at which we start to delay is defined by
708 691 * zfs_delay_min_dirty_percent. This should typically be at or above
709 692 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
710 693 * delay after writing at full speed has failed to keep up with the incoming
711 694 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
712 695 * speaking, this variable determines the amount of delay at the midpoint of
713 696 * the curve.
714 697 *
715 698 * delay
716 699 * 10ms +-------------------------------------------------------------*+
717 700 * | *|
718 701 * 9ms + *+
719 702 * | *|
720 703 * 8ms + *+
721 704 * | * |
722 705 * 7ms + * +
723 706 * | * |
724 707 * 6ms + * +
725 708 * | * |
726 709 * 5ms + * +
727 710 * | * |
728 711 * 4ms + * +
729 712 * | * |
730 713 * 3ms + * +
731 714 * | * |
732 715 * 2ms + (midpoint) * +
733 716 * | | ** |
734 717 * 1ms + v *** +
735 718 * | zfs_delay_scale ----------> ******** |
736 719 * 0 +-------------------------------------*********----------------+
737 720 * 0% <- zfs_dirty_data_max -> 100%
738 721 *
739 722 * Note that since the delay is added to the outstanding time remaining on the
740 723 * most recent transaction, the delay is effectively the inverse of IOPS.
741 724 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
742 725 * was chosen such that small changes in the amount of accumulated dirty data
743 726 * in the first 3/4 of the curve yield relatively small differences in the
744 727 * amount of delay.
745 728 *
746 729 * The effects can be easier to understand when the amount of delay is
747 730 * represented on a log scale:
748 731 *
749 732 * delay
750 733 * 100ms +-------------------------------------------------------------++
751 734 * + +
752 735 * | |
753 736 * + *+
754 737 * 10ms + *+
755 738 * + ** +
756 739 * | (midpoint) ** |
757 740 * + | ** +
758 741 * 1ms + v **** +
759 742 * + zfs_delay_scale ----------> ***** +
760 743 * | **** |
761 744 * + **** +
762 745 * 100us + ** +
763 746 * + * +
764 747 * | * |
765 748 * + * +
766 749 * 10us + * +
767 750 * + +
768 751 * | |
769 752 * + +
770 753 * +--------------------------------------------------------------+
771 754 * 0% <- zfs_dirty_data_max -> 100%
772 755 *
773 756 * Note here that only as the amount of dirty data approaches its limit does
774 757 * the delay start to increase rapidly. The goal of a properly tuned system
775 758 * should be to keep the amount of dirty data out of that range by first
776 759 * ensuring that the appropriate limits are set for the I/O scheduler to reach
777 760 * optimal throughput on the backend storage, and then by changing the value
778 761 * of zfs_delay_scale to increase the steepness of the curve.
779 762 */
780 763 static void
781 764 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
782 765 {
783 766 dsl_pool_t *dp = tx->tx_pool;
784 767 uint64_t delay_min_bytes =
785 768 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
786 769 hrtime_t wakeup, min_tx_time, now;
787 770
788 771 if (dirty <= delay_min_bytes)
789 772 return;
790 773
791 774 /*
792 775 * The caller has already waited until we are under the max.
793 776 * We make them pass us the amount of dirty data so we don't
794 777 * have to handle the case of it being >= the max, which could
795 778 * cause a divide-by-zero if it's == the max.
796 779 */
797 780 ASSERT3U(dirty, <, zfs_dirty_data_max);
798 781
799 782 now = gethrtime();
800 783 min_tx_time = zfs_delay_scale *
801 784 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
802 785 if (now > tx->tx_start + min_tx_time)
803 786 return;
804 787
805 788 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
806 789
807 790 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
808 791 uint64_t, min_tx_time);
809 792
810 793 mutex_enter(&dp->dp_lock);
811 794 wakeup = MAX(tx->tx_start + min_tx_time,
812 795 dp->dp_last_wakeup + min_tx_time);
813 796 dp->dp_last_wakeup = wakeup;
814 797 mutex_exit(&dp->dp_lock);
815 798
816 799 #ifdef _KERNEL
817 800 mutex_enter(&curthread->t_delay_lock);
818 801 while (cv_timedwait_hires(&curthread->t_delay_cv,
819 802 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
820 803 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
821 804 continue;
822 805 mutex_exit(&curthread->t_delay_lock);
823 806 #else
824 807 hrtime_t delta = wakeup - gethrtime();
825 808 struct timespec ts;
826 809 ts.tv_sec = delta / NANOSEC;
827 810 ts.tv_nsec = delta % NANOSEC;
828 811 (void) nanosleep(&ts, NULL);
829 812 #endif
830 813 }
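
As a worked example of the min_time formula above, assuming illustrative tunable values (not values set by this patch) of zfs_dirty_data_max = 4 GiB, zfs_delay_min_dirty_percent = 60 (so delay_min_bytes = 2.4 GiB) and zfs_delay_scale = 500000, a transaction arriving with 3.6 GiB dirty would be held back by at least:

	min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
	    (zfs_dirty_data_max - dirty)
	            = 500000 * (3.6 GiB - 2.4 GiB) / (4 GiB - 3.6 GiB)
	            = 1500000 ns  (1.5 ms)

capped at zfs_delay_max_ns (100 ms above), and applied relative to tx_start or the previous waiter's wakeup as described in the comment.
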
831 814
832 815 /*
833 816 * This routine attempts to assign the transaction to a transaction group.
834 817 * To do so, we must determine if there is sufficient free space on disk.
835 818 *
836 819 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
837 820 * on it), then it is assumed that there is sufficient free space,
838 821 * unless there's insufficient slop space in the pool (see the comment
839 822 * above spa_slop_shift in spa_misc.c).
840 823 *
841 824 * If it is not a "netfree" transaction, then if the data already on disk
842 825 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
843 826 * ENOSPC. Otherwise, if the current rough estimate of pending changes,
844 827 * plus the rough estimate of this transaction's changes, may exceed the
845 828 * allowed usage, then this will fail with ERESTART, which will cause the
846 829 * caller to wait for the pending changes to be written to disk (by waiting
847 830 * for the next TXG to open), and then check the space usage again.
848 831 *
849 832 * The rough estimate of pending changes is comprised of the sum of:
850 833 *
851 834 * - this transaction's holds' txh_space_towrite
852 835 *
853 836 * - dd_tempreserved[], which is the sum of in-flight transactions'
854 837 * holds' txh_space_towrite (i.e. those transactions that have called
855 838 * dmu_tx_assign() but not yet called dmu_tx_commit()).
856 839 *
857 840 * - dd_space_towrite[], which is the amount of dirtied dbufs.
858 841 *
859 842 * Note that all of these values are inflated by spa_get_worst_case_asize(),
860 843 * which means that we may get ERESTART well before we are actually in danger
861 844 * of running out of space, but this also mitigates any small inaccuracies
862 845 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
863 846 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
864 847 * to the MOS).
865 848 *
866 849 * Note that due to this algorithm, it is possible to exceed the allowed
867 850 * usage by one transaction. Also, as we approach the allowed usage,
868 851 * we will allow a very limited amount of changes into each TXG, thus
869 852 * decreasing performance.
870 853 */
871 854 static int
872 -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
855 +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
873 856 {
874 857 spa_t *spa = tx->tx_pool->dp_spa;
875 858
876 859 ASSERT0(tx->tx_txg);
877 860
878 861 if (tx->tx_err)
879 862 return (tx->tx_err);
880 863
881 864 if (spa_suspended(spa)) {
882 865 /*
883 866 * If the user has indicated a blocking failure mode
884 867 * then return ERESTART which will block in dmu_tx_wait().
885 868 * Otherwise, return EIO so that an error can get
886 869 * propagated back to the VOP calls.
887 870 *
888 871 * Note that we always honor the txg_how flag regardless
889 872 * of the failuremode setting.
890 873 */
891 874 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
892 - !(txg_how & TXG_WAIT))
875 + txg_how != TXG_WAIT)
893 876 return (SET_ERROR(EIO));
894 877
895 878 return (SET_ERROR(ERESTART));
896 879 }
897 880
898 - if (!tx->tx_dirty_delayed &&
881 + if (!tx->tx_waited &&
899 882 dsl_pool_need_dirty_delay(tx->tx_pool)) {
900 883 tx->tx_wait_dirty = B_TRUE;
901 884 return (SET_ERROR(ERESTART));
902 885 }
903 886
904 887 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
905 888 tx->tx_needassign_txh = NULL;
906 889
907 890 /*
908 891 * NB: No error returns are allowed after txg_hold_open, but
909 892 * before processing the dnode holds, due to the
910 893 * dmu_tx_unassign() logic.
911 894 */
912 895
913 896 uint64_t towrite = 0;
914 897 uint64_t tohold = 0;
915 898 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
916 899 txh = list_next(&tx->tx_holds, txh)) {
917 900 dnode_t *dn = txh->txh_dnode;
918 901 if (dn != NULL) {
919 902 mutex_enter(&dn->dn_mtx);
920 903 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
921 904 mutex_exit(&dn->dn_mtx);
922 905 tx->tx_needassign_txh = txh;
923 906 return (SET_ERROR(ERESTART));
924 907 }
925 908 if (dn->dn_assigned_txg == 0)
926 909 dn->dn_assigned_txg = tx->tx_txg;
927 910 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
928 911 (void) refcount_add(&dn->dn_tx_holds, tx);
929 912 mutex_exit(&dn->dn_mtx);
930 913 }
931 914 towrite += refcount_count(&txh->txh_space_towrite);
932 915 tohold += refcount_count(&txh->txh_memory_tohold);
933 916 }
934 917
935 918 /* needed allocation: worst-case estimate of write space */
936 919 uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
937 920 /* calculate memory footprint estimate */
938 921 uint64_t memory = towrite + tohold;
939 922
940 923 if (tx->tx_dir != NULL && asize != 0) {
941 924 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
942 925 asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
943 926 if (err != 0)
944 927 return (err);
945 928 }
946 929
947 930 return (0);
948 931 }
949 932
950 933 static void
951 934 dmu_tx_unassign(dmu_tx_t *tx)
952 935 {
953 936 if (tx->tx_txg == 0)
954 937 return;
955 938
956 939 txg_rele_to_quiesce(&tx->tx_txgh);
957 940
958 941 /*
959 942 * Walk the transaction's hold list, removing the hold on the
960 943 * associated dnode, and notifying waiters if the refcount drops to 0.
961 944 */
962 945 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
963 946 txh != tx->tx_needassign_txh;
964 947 txh = list_next(&tx->tx_holds, txh)) {
965 948 dnode_t *dn = txh->txh_dnode;
966 949
967 950 if (dn == NULL)
968 951 continue;
969 952 mutex_enter(&dn->dn_mtx);
970 953 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
971 954
972 955 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
973 956 dn->dn_assigned_txg = 0;
974 957 cv_broadcast(&dn->dn_notxholds);
975 958 }
976 959 mutex_exit(&dn->dn_mtx);
977 960 }
978 961
979 962 txg_rele_to_sync(&tx->tx_txgh);
980 963
981 964 tx->tx_lasttried_txg = tx->tx_txg;
982 965 tx->tx_txg = 0;
983 966 }
984 967
985 968 /*
986 - * Assign tx to a transaction group; txg_how is a bitmask:
969 + * Assign tx to a transaction group. txg_how can be one of:
987 970 *
988 - * If TXG_WAIT is set and the currently open txg is full, this function
989 - * will wait until there's a new txg. This should be used when no locks
990 - * are being held. With this bit set, this function will only fail if
991 - * we're truly out of space (or over quota).
971 + * (1) TXG_WAIT. If the current open txg is full, waits until there's
972 + * a new one. This should be used when you're not holding locks.
973 + * It will only fail if we're truly out of space (or over quota).
992 974 *
993 - * If TXG_WAIT is *not* set and we can't assign into the currently open
994 - * txg without blocking, this function will return immediately with
995 - * ERESTART. This should be used whenever locks are being held. On an
996 - * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
997 - * and try again.
975 + * (2) TXG_NOWAIT. If we can't assign into the current open txg without
976 + * blocking, returns immediately with ERESTART. This should be used
977 + * whenever you're holding locks. On an ERESTART error, the caller
978 + * should drop locks, do a dmu_tx_wait(tx), and try again.
998 979 *
999 - * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
1000 - * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
1001 - * details on the throttle). This is used by the VFS operations, after
1002 - * they have already called dmu_tx_wait() (though most likely on a
1003 - * different tx).
980 + * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
981 + * has already been called on behalf of this operation (though
982 + * most likely on a different tx).
1004 983 */
1005 984 int
1006 -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
985 +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1007 986 {
1008 987 int err;
1009 988
1010 989 ASSERT(tx->tx_txg == 0);
1011 - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
990 + ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
991 + txg_how == TXG_WAITED);
1012 992 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1013 993
1014 994 /* If we might wait, we must not hold the config lock. */
1015 - IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
995 + ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1016 996
1017 - if ((txg_how & TXG_NOTHROTTLE))
1018 - tx->tx_dirty_delayed = B_TRUE;
997 + if (txg_how == TXG_WAITED)
998 + tx->tx_waited = B_TRUE;
1019 999
1020 1000 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1021 1001 dmu_tx_unassign(tx);
1022 1002
1023 - if (err != ERESTART || !(txg_how & TXG_WAIT))
1003 + if (err != ERESTART || txg_how != TXG_WAIT)
1024 1004 return (err);
1025 1005
1026 1006 dmu_tx_wait(tx);
1027 1007 }
1028 1008
1029 1009 txg_rele_to_quiesce(&tx->tx_txgh);
1030 1010
1031 1011 return (0);
1032 1012 }
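
The retry loop implied by case (2) above typically looks like the following caller-side sketch (not code from this patch; `waited`, `object`, `off` and `len` are hypothetical locals):

	boolean_t waited = B_FALSE;
top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... dirty buffers under this tx ... */
	dmu_tx_commit(tx);
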
1033 1013
1034 1014 void
1035 1015 dmu_tx_wait(dmu_tx_t *tx)
1036 1016 {
1037 1017 spa_t *spa = tx->tx_pool->dp_spa;
1038 1018 dsl_pool_t *dp = tx->tx_pool;
1039 1019
1040 1020 ASSERT(tx->tx_txg == 0);
1041 1021 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1042 1022
1043 1023 if (tx->tx_wait_dirty) {
1044 1024 /*
1045 1025 * dmu_tx_try_assign() has determined that we need to wait
1046 1026 * because we've consumed much or all of the dirty buffer
1047 1027 * space.
1048 1028 */
1049 1029 mutex_enter(&dp->dp_lock);
1050 1030 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1051 1031 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1052 1032 uint64_t dirty = dp->dp_dirty_total;
1053 1033 mutex_exit(&dp->dp_lock);
1054 1034
1055 1035 dmu_tx_delay(tx, dirty);
1056 1036
1057 1037 tx->tx_wait_dirty = B_FALSE;
1058 1038
1059 1039 /*
1060 - * Note: setting tx_dirty_delayed only has effect if the
1061 - * caller used TX_WAIT. Otherwise they are going to
1062 - * destroy this tx and try again. The common case,
1063 - * zfs_write(), uses TX_WAIT.
1040 + * Note: setting tx_waited only has effect if the caller
1041 + * used TX_WAIT. Otherwise they are going to destroy
1042 + * this tx and try again. The common case, zfs_write(),
1043 + * uses TX_WAIT.
1064 1044 */
1065 - tx->tx_dirty_delayed = B_TRUE;
1045 + tx->tx_waited = B_TRUE;
1066 1046 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1067 1047 /*
1068 1048 * If the pool is suspended we need to wait until it
1069 1049 * is resumed. Note that it's possible that the pool
1070 1050 * has become active after this thread has tried to
1071 1051 * obtain a tx. If that's the case then tx_lasttried_txg
1072 1052 * would not have been set.
1073 1053 */
1074 1054 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1075 1055 } else if (tx->tx_needassign_txh) {
1076 1056 /*
1077 1057 * A dnode is assigned to the quiescing txg. Wait for its
1078 1058 * transaction to complete.
1079 1059 */
1080 1060 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1081 1061
1082 1062 mutex_enter(&dn->dn_mtx);
1083 1063 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1084 1064 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1085 1065 mutex_exit(&dn->dn_mtx);
1086 1066 tx->tx_needassign_txh = NULL;
1087 1067 } else {
1088 - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1068 + /*
1069 + * If we have a lot of dirty data just wait until we sync
1070 + * out a TXG at which point we'll hopefully have synced
1071 + * a portion of the changes.
1072 + */
1073 + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1089 1074 }
1090 1075 }
1091 1076
1092 1077 static void
1093 1078 dmu_tx_destroy(dmu_tx_t *tx)
1094 1079 {
1095 1080 dmu_tx_hold_t *txh;
1096 1081
1097 1082 while ((txh = list_head(&tx->tx_holds)) != NULL) {
1098 1083 dnode_t *dn = txh->txh_dnode;
1099 1084
1100 1085 list_remove(&tx->tx_holds, txh);
1101 1086 refcount_destroy_many(&txh->txh_space_towrite,
1102 1087 refcount_count(&txh->txh_space_towrite));
1103 1088 refcount_destroy_many(&txh->txh_memory_tohold,
1104 1089 refcount_count(&txh->txh_memory_tohold));
1105 1090 kmem_free(txh, sizeof (dmu_tx_hold_t));
1106 1091 if (dn != NULL)
1107 1092 dnode_rele(dn, tx);
1108 1093 }
1109 1094
1110 1095 list_destroy(&tx->tx_callbacks);
1111 1096 list_destroy(&tx->tx_holds);
1112 1097 kmem_free(tx, sizeof (dmu_tx_t));
1113 1098 }
1114 1099
1115 1100 void
1116 1101 dmu_tx_commit(dmu_tx_t *tx)
1117 1102 {
1118 1103 ASSERT(tx->tx_txg != 0);
1119 1104
1120 1105 /*
1121 1106 * Go through the transaction's hold list and remove holds on
1122 1107 * associated dnodes, notifying waiters if no holds remain.
1123 1108 */
1124 1109 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1125 1110 txh = list_next(&tx->tx_holds, txh)) {
1126 1111 dnode_t *dn = txh->txh_dnode;
1127 1112
1128 1113 if (dn == NULL)
1129 1114 continue;
1130 1115
1131 1116 mutex_enter(&dn->dn_mtx);
1132 1117 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1133 1118
1134 1119 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1135 1120 dn->dn_assigned_txg = 0;
1136 1121 cv_broadcast(&dn->dn_notxholds);
1137 1122 }
1138 1123 mutex_exit(&dn->dn_mtx);
1139 1124 }
1140 1125
1141 1126 if (tx->tx_tempreserve_cookie)
1142 1127 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1143 1128
1144 - if (!list_is_empty(&tx->tx_callbacks))
1145 - txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1129 + if (!list_is_empty(&tx->tx_callbacks)) {
1130 + if (dmu_tx_is_syncing(tx)) {
1131 + txg_register_callbacks_sync(tx->tx_pool,
1132 + tx->tx_txg, &tx->tx_callbacks);
1133 + } else {
1134 + txg_register_callbacks(&tx->tx_txgh,
1135 + &tx->tx_callbacks);
1136 + }
1137 + }
1146 1138
1147 1139 if (tx->tx_anyobj == FALSE)
1148 1140 txg_rele_to_sync(&tx->tx_txgh);
1149 1141
1150 1142 dmu_tx_destroy(tx);
1151 1143 }
1152 1144
1153 1145 void
1154 1146 dmu_tx_abort(dmu_tx_t *tx)
1155 1147 {
1156 1148 ASSERT(tx->tx_txg == 0);
1157 1149
1158 1150 /*
1159 1151 * Call any registered callbacks with an error code.
1160 1152 */
1161 1153 if (!list_is_empty(&tx->tx_callbacks))
1162 1154 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1163 1155
1164 1156 dmu_tx_destroy(tx);
1165 1157 }
1166 1158
1167 1159 uint64_t
1168 1160 dmu_tx_get_txg(dmu_tx_t *tx)
1169 1161 {
1170 1162 ASSERT(tx->tx_txg != 0);
1171 1163 return (tx->tx_txg);
1172 1164 }
1173 1165
1174 1166 dsl_pool_t *
1175 1167 dmu_tx_pool(dmu_tx_t *tx)
1176 1168 {
1177 1169 ASSERT(tx->tx_pool != NULL);
1178 1170 return (tx->tx_pool);
1179 1171 }
1180 1172
1181 1173 void
1182 1174 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1183 1175 {
1184 1176 dmu_tx_callback_t *dcb;
1185 1177
1186 1178 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1187 1179
1188 1180 dcb->dcb_func = func;
1189 1181 dcb->dcb_data = data;
1190 1182
1191 1183 list_insert_tail(&tx->tx_callbacks, dcb);
1192 1184 }
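
For context, a sketch of how a commit callback is registered; my_commit_cb is a hypothetical function, not part of this patch. The callback is invoked with error == 0 once the assigned txg has synced, or with ECANCELED from dmu_tx_abort() above:

	static void
	my_commit_cb(void *arg, int error)
	{
		if (error == 0)
			cmn_err(CE_NOTE, "commit callback: changes on stable storage");
	}

	/* after dmu_tx_assign() succeeds: */
	dmu_tx_callback_register(tx, my_commit_cb, NULL);
	dmu_tx_commit(tx);
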
1193 1185
1194 1186 /*
1195 1187 * Call all the commit callbacks on a list, with a given error code.
1196 1188 */
1197 1189 void
1198 1190 dmu_tx_do_callbacks(list_t *cb_list, int error)
1199 1191 {
1200 1192 dmu_tx_callback_t *dcb;
1201 1193
1202 1194 while ((dcb = list_head(cb_list)) != NULL) {
1203 1195 list_remove(cb_list, dcb);
1204 1196 dcb->dcb_func(dcb->dcb_data, error);
1205 1197 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1206 1198 }
1207 1199 }
1208 1200
1209 1201 /*
1210 1202 * Interface to hold a bunch of attributes.
1211 1203 * used for creating new files.
1212 1204 * attrsize is the total size of all attributes
1213 1205 * to be added during object creation
1214 1206 *
1215 1207 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1216 1208 */
1217 1209
1218 1210 /*
1219 1211 * hold necessary attribute name for attribute registration.
1220 1212 * should be a very rare case where this is needed. If it does
1221 1213 * happen it would only happen on the first write to the file system.
1222 1214 */
1223 1215 static void
1224 1216 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1225 1217 {
1226 1218 if (!sa->sa_need_attr_registration)
1227 1219 return;
1228 1220
1229 1221 for (int i = 0; i != sa->sa_num_attrs; i++) {
1230 1222 if (!sa->sa_attr_table[i].sa_registered) {
1231 1223 if (sa->sa_reg_attr_obj)
1232 1224 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1233 1225 B_TRUE, sa->sa_attr_table[i].sa_name);
1234 1226 else
1235 1227 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1236 1228 B_TRUE, sa->sa_attr_table[i].sa_name);
1237 1229 }
1238 1230 }
1239 1231 }
1240 1232
1241 1233 void
1242 1234 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1243 1235 {
1244 1236 dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
1245 1237 tx->tx_objset, object, THT_SPILL, 0, 0);
1246 1238
1247 1239 (void) refcount_add_many(&txh->txh_space_towrite,
1248 1240 SPA_OLD_MAXBLOCKSIZE, FTAG);
1249 1241 }
1250 1242
1251 1243 void
1252 1244 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1253 1245 {
1254 1246 sa_os_t *sa = tx->tx_objset->os_sa;
1255 1247
1256 1248 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1257 1249
1258 1250 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1259 1251 return;
1260 1252
1261 1253 if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1262 1254 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1263 1255 } else {
1264 1256 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1265 1257 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1266 1258 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1267 1259 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1268 1260 }
1269 1261
1270 1262 dmu_tx_sa_registration_hold(sa, tx);
1271 1263
1272 1264 if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1273 1265 return;
1274 1266
1275 1267 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1276 1268 THT_SPILL, 0, 0);
1277 1269 }
1278 1270
1279 1271 /*
1280 1272 * Hold SA attribute
1281 1273 *
1282 1274 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1283 1275 *
1284 1276 * variable_size is the total size of all variable sized attributes
1285 1277 * passed to this function. It is not the total size of all
1286 1278 * variable size attributes that *may* exist on this object.
1287 1279 */
1288 1280 void
1289 1281 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1290 1282 {
1291 1283 uint64_t object;
1292 1284 sa_os_t *sa = tx->tx_objset->os_sa;
1293 1285
1294 1286 ASSERT(hdl != NULL);
1295 1287
1296 1288 object = sa_handle_object(hdl);
1297 1289
1298 1290 dmu_tx_hold_bonus(tx, object);
1299 1291
1300 1292 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1301 1293 return;
1302 1294
1303 1295 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1304 1296 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1305 1297 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1306 1298 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1307 1299 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1308 1300 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1309 1301 }
1310 1302
1311 1303 dmu_tx_sa_registration_hold(sa, tx);
1312 1304
1313 1305 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1314 1306 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1315 1307
1316 1308 if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1317 1309 ASSERT(tx->tx_txg == 0);
1318 1310 dmu_tx_hold_spill(tx, object);
1319 1311 } else {
1320 1312 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1321 1313 dnode_t *dn;
1322 1314
1323 1315 DB_DNODE_ENTER(db);
1324 1316 dn = DB_DNODE(db);
1325 1317 if (dn->dn_have_spill) {
1326 1318 ASSERT(tx->tx_txg == 0);
1327 1319 dmu_tx_hold_spill(tx, object);
1328 1320 }
1329 1321 DB_DNODE_EXIT(db);
1330 1322 }
1331 1323 }
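
To round out the interface comment above, a rough sketch of the single-attribute update path this hold supports (attr_id and value are placeholders, and the code is illustrative rather than taken from the patch):

	tx = dmu_tx_create(os);
	dmu_tx_hold_sa(tx, hdl, B_FALSE);	/* not growing the SA */
	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	(void) sa_update(hdl, attr_id, &value, sizeof (value), tx);
	dmu_tx_commit(tx);
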