2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
    
          --- old/usr/src/uts/common/fs/zfs/dmu_tx.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - */
  24      -/*
  25   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
       24 + * Copyright (c) 2012 by Delphix. All rights reserved.
  26   25   */
  27   26  
  28   27  #include <sys/dmu.h>
  29   28  #include <sys/dmu_impl.h>
  30   29  #include <sys/dbuf.h>
  31   30  #include <sys/dmu_tx.h>
  32   31  #include <sys/dmu_objset.h>
  33   32  #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  34   33  #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  35   34  #include <sys/dsl_pool.h>
  36   35  #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  37   36  #include <sys/spa.h>
  38   37  #include <sys/sa.h>
  39   38  #include <sys/sa_impl.h>
  40   39  #include <sys/zfs_context.h>
  41   40  #include <sys/varargs.h>
  42   41  
  43   42  typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  44   43      uint64_t arg1, uint64_t arg2);
  45   44  
  46   45  
  47   46  dmu_tx_t *
  48   47  dmu_tx_create_dd(dsl_dir_t *dd)
  49   48  {
  50   49          dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  51   50          tx->tx_dir = dd;
  52   51          if (dd)
  53   52                  tx->tx_pool = dd->dd_pool;
  54   53          list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  55   54              offsetof(dmu_tx_hold_t, txh_node));
  56   55          list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  57   56              offsetof(dmu_tx_callback_t, dcb_node));
  58   57  #ifdef ZFS_DEBUG
  59   58          refcount_create(&tx->tx_space_written);
  60   59          refcount_create(&tx->tx_space_freed);
  61   60  #endif
  62   61          return (tx);
  63   62  }
  64   63  
  65   64  dmu_tx_t *
  66   65  dmu_tx_create(objset_t *os)
  67   66  {
  68   67          dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  69   68          tx->tx_objset = os;
  70   69          tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  71   70          return (tx);
  72   71  }
  73   72  
  74   73  dmu_tx_t *
  75   74  dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  76   75  {
  77   76          dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  78   77  
  79   78          ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  80   79          tx->tx_pool = dp;
  81   80          tx->tx_txg = txg;
  82   81          tx->tx_anyobj = TRUE;
  83   82  
  84   83          return (tx);
  85   84  }
  86   85  
  87   86  int
  88   87  dmu_tx_is_syncing(dmu_tx_t *tx)
  89   88  {
  90   89          return (tx->tx_anyobj);
  91   90  }
  92   91  
  93   92  int
  94   93  dmu_tx_private_ok(dmu_tx_t *tx)
  95   94  {
  96   95          return (tx->tx_anyobj);
  97   96  }
  98   97  
  99   98  static dmu_tx_hold_t *
 100   99  dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 101  100      enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 102  101  {
 103  102          dmu_tx_hold_t *txh;
 104  103          dnode_t *dn = NULL;
 105  104          int err;
 106  105  
 107  106          if (object != DMU_NEW_OBJECT) {
 108  107                  err = dnode_hold(os, object, tx, &dn);
 109  108                  if (err) {
 110  109                          tx->tx_err = err;
 111  110                          return (NULL);
 112  111                  }
 113  112  
 114  113                  if (err == 0 && tx->tx_txg != 0) {
 115  114                          mutex_enter(&dn->dn_mtx);
 116  115                          /*
 117  116                           * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 118  117                           * problem, but there's no way for it to happen (for
 119  118                           * now, at least).
 120  119                           */
 121  120                          ASSERT(dn->dn_assigned_txg == 0);
 122  121                          dn->dn_assigned_txg = tx->tx_txg;
 123  122                          (void) refcount_add(&dn->dn_tx_holds, tx);
 124  123                          mutex_exit(&dn->dn_mtx);
 125  124                  }
 126  125          }
 127  126  
 128  127          txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 129  128          txh->txh_tx = tx;
 130  129          txh->txh_dnode = dn;
 131  130  #ifdef ZFS_DEBUG
 132  131          txh->txh_type = type;
 133  132          txh->txh_arg1 = arg1;
 134  133          txh->txh_arg2 = arg2;
 135  134  #endif
 136  135          list_insert_tail(&tx->tx_holds, txh);
 137  136  
 138  137          return (txh);
 139  138  }
 140  139  
 141  140  void
 142  141  dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 143  142  {
 144  143          /*
 145  144           * If we're syncing, they can manipulate any object anyhow, and
 146  145           * the hold on the dnode_t can cause problems.
 147  146           */
 148  147          if (!dmu_tx_is_syncing(tx)) {
 149  148                  (void) dmu_tx_hold_object_impl(tx, os,
 150  149                      object, THT_NEWOBJECT, 0, 0);
 151  150          }
 152  151  }
 153  152  
 154  153  static int
 155  154  dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 156  155  {
 157  156          int err;
 158  157          dmu_buf_impl_t *db;
 159  158  
 160  159          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 161  160          db = dbuf_hold_level(dn, level, blkid, FTAG);
 162  161          rw_exit(&dn->dn_struct_rwlock);
 163  162          if (db == NULL)
 164  163                  return (EIO);
 165  164          err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 166  165          dbuf_rele(db, FTAG);
 167  166          return (err);
 168  167  }
 169  168  
 170  169  static void
 171  170  dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 172  171      int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 173  172  {
 174  173          objset_t *os = dn->dn_objset;
 175  174          dsl_dataset_t *ds = os->os_dsl_dataset;
 176  175          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 177  176          dmu_buf_impl_t *parent = NULL;
 178  177          blkptr_t *bp = NULL;
 179  178          uint64_t space;
 180  179  
 181  180          if (level >= dn->dn_nlevels || history[level] == blkid)
 182  181                  return;
 183  182  
 184  183          history[level] = blkid;
 185  184  
 186  185          space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 187  186  
 188  187          if (db == NULL || db == dn->dn_dbuf) {
 189  188                  ASSERT(level != 0);
 190  189                  db = NULL;
 191  190          } else {
 192  191                  ASSERT(DB_DNODE(db) == dn);
 193  192                  ASSERT(db->db_level == level);
 194  193                  ASSERT(db->db.db_size == space);
 195  194                  ASSERT(db->db_blkid == blkid);
 196  195                  bp = db->db_blkptr;
 197  196                  parent = db->db_parent;
 198  197          }
 199  198  
 200  199          freeable = (bp && (freeable ||
 201  200              dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 202  201  
 203  202          if (freeable)
 204  203                  txh->txh_space_tooverwrite += space;
 205  204          else
 206  205                  txh->txh_space_towrite += space;
 207  206          if (bp)
 208  207                  txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 209  208  
 210  209          dmu_tx_count_twig(txh, dn, parent, level + 1,
 211  210              blkid >> epbs, freeable, history);
 212  211  }
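
      The recursion above climbs one level per call, dividing the block id
      by 2^epbs each time. A minimal sketch of that arithmetic, assuming
      16K indirect blocks (dn_indblkshift == 14; SPA_BLKPTRSHIFT is fixed
      at 7 for 128-byte block pointers) -- these values are illustrative,
      not taken from this diff:

          uint64_t blkid = 1000;          /* assumed data block id */
          int epbs = 14 - 7;              /* dn_indblkshift - SPA_BLKPTRSHIFT */
          uint64_t l1 = blkid >> epbs;    /* 1000 >> 7 == 7: its L1 indirect */
          uint64_t l2 = l1 >> epbs;       /* 7 >> 7 == 0: its L2 indirect */
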
 213  212  
 214  213  /* ARGSUSED */
 215  214  static void
 216  215  dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 217  216  {
 218  217          dnode_t *dn = txh->txh_dnode;
 219  218          uint64_t start, end, i;
 220  219          int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 221  220          int err = 0;
 222  221  
 223  222          if (len == 0)
 224  223                  return;
 225  224  
 226  225          min_bs = SPA_MINBLOCKSHIFT;
 227  226          max_bs = SPA_MAXBLOCKSHIFT;
 228  227          min_ibs = DN_MIN_INDBLKSHIFT;
 229  228          max_ibs = DN_MAX_INDBLKSHIFT;
 230  229  
 231  230          if (dn) {
 232  231                  uint64_t history[DN_MAX_LEVELS];
 233  232                  int nlvls = dn->dn_nlevels;
 234  233                  int delta;
 235  234  
 236  235                  /*
 237  236                   * For i/o error checking, read the first and last level-0
 238  237                   * blocks (if they are not aligned), and all the level-1 blocks.
 239  238                   */
 240  239                  if (dn->dn_maxblkid == 0) {
 241  240                          delta = dn->dn_datablksz;
 242  241                          start = (off < dn->dn_datablksz) ? 0 : 1;
 243  242                          end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 244  243                          if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 245  244                                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 246  245                                  if (err)
 247  246                                          goto out;
 248  247                                  delta -= off;
 249  248                          }
 250  249                  } else {
 251  250                          zio_t *zio = zio_root(dn->dn_objset->os_spa,
 252  251                              NULL, NULL, ZIO_FLAG_CANFAIL);
 253  252  
 254  253                          /* first level-0 block */
 255  254                          start = off >> dn->dn_datablkshift;
 256  255                          if (P2PHASE(off, dn->dn_datablksz) ||
 257  256                              len < dn->dn_datablksz) {
 258  257                                  err = dmu_tx_check_ioerr(zio, dn, 0, start);
 259  258                                  if (err)
 260  259                                          goto out;
 261  260                          }
 262  261  
 263  262                          /* last level-0 block */
 264  263                          end = (off+len-1) >> dn->dn_datablkshift;
 265  264                          if (end != start && end <= dn->dn_maxblkid &&
 266  265                              P2PHASE(off+len, dn->dn_datablksz)) {
 267  266                                  err = dmu_tx_check_ioerr(zio, dn, 0, end);
 268  267                                  if (err)
 269  268                                          goto out;
 270  269                          }
 271  270  
 272  271                          /* level-1 blocks */
 273  272                          if (nlvls > 1) {
 274  273                                  int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 275  274                                  for (i = (start>>shft)+1; i < end>>shft; i++) {
 276  275                                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 277  276                                          if (err)
 278  277                                                  goto out;
 279  278                                  }
 280  279                          }
 281  280  
 282  281                          err = zio_wait(zio);
 283  282                          if (err)
 284  283                                  goto out;
 285  284                          delta = P2NPHASE(off, dn->dn_datablksz);
 286  285                  }
 287  286  
 288  287                  if (dn->dn_maxblkid > 0) {
 289  288                          /*
 290  289                           * The blocksize can't change,
 291  290                           * so we can make a more precise estimate.
 292  291                           */
 293  292                          ASSERT(dn->dn_datablkshift != 0);
 294  293                          min_bs = max_bs = dn->dn_datablkshift;
 295  294                          min_ibs = max_ibs = dn->dn_indblkshift;
 296  295                  } else if (dn->dn_indblkshift > max_ibs) {
 297  296                          /*
 298  297                           * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
 299  298                           * the code will still work correctly on older pools.
 300  299                           */
 301  300                          min_ibs = max_ibs = dn->dn_indblkshift;
 302  301                  }
 303  302  
 304  303                  /*
 305  304                   * If this write is not off the end of the file
 306  305                   * we need to account for overwrites/unref.
 307  306                   */
 308  307                  if (start <= dn->dn_maxblkid) {
 309  308                          for (int l = 0; l < DN_MAX_LEVELS; l++)
 310  309                                  history[l] = -1ULL;
 311  310                  }
 312  311                  while (start <= dn->dn_maxblkid) {
 313  312                          dmu_buf_impl_t *db;
 314  313  
 315  314                          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 316  315                          err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 317  316                          rw_exit(&dn->dn_struct_rwlock);
 318  317  
 319  318                          if (err) {
 320  319                                  txh->txh_tx->tx_err = err;
 321  320                                  return;
 322  321                          }
 323  322  
 324  323                          dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 325  324                              history);
 326  325                          dbuf_rele(db, FTAG);
 327  326                          if (++start > end) {
 328  327                                  /*
 329  328                                   * Account for new indirects appearing
 330  329                                   * before this IO gets assigned into a txg.
 331  330                                   */
 332  331                                  bits = 64 - min_bs;
 333  332                                  epbs = min_ibs - SPA_BLKPTRSHIFT;
 334  333                                  for (bits -= epbs * (nlvls - 1);
 335  334                                      bits >= 0; bits -= epbs)
 336  335                                          txh->txh_fudge += 1ULL << max_ibs;
 337  336                                  goto out;
 338  337                          }
 339  338                          off += delta;
 340  339                          if (len >= delta)
 341  340                                  len -= delta;
 342  341                          delta = dn->dn_datablksz;
 343  342                  }
 344  343          }
 345  344  
 346  345          /*
 347  346           * 'end' is the last thing we will access, not one past.
 348  347           * This way we won't overflow when accessing the last byte.
 349  348           */
 350  349          start = P2ALIGN(off, 1ULL << max_bs);
 351  350          end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 352  351          txh->txh_space_towrite += end - start + 1;
 353  352  
 354  353          start >>= min_bs;
 355  354          end >>= min_bs;
 356  355  
 357  356          epbs = min_ibs - SPA_BLKPTRSHIFT;
 358  357  
 359  358          /*
 360  359           * The object contains at most 2^(64 - min_bs) blocks,
 361  360           * and each indirect level maps 2^epbs.
 362  361           */
 363  362          for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 364  363                  start >>= epbs;
 365  364                  end >>= epbs;
 366  365                  ASSERT3U(end, >=, start);
 367  366                  txh->txh_space_towrite += (end - start + 1) << max_ibs;
 368  367                  if (start != 0) {
 369  368                          /*
 370  369                           * We also need a new blkid=0 indirect block
 371  370                           * to reference any existing file data.
 372  371                           */
 373  372                          txh->txh_space_towrite += 1ULL << max_ibs;
 374  373                  }
 375  374          }
 376  375  
 377  376  out:
 378  377          if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 379  378              2 * DMU_MAX_ACCESS)
 380  379                  err = EFBIG;
 381  380  
 382  381          if (err)
 383  382                  txh->txh_tx->tx_err = err;
 384  383  }
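
      The tail of dmu_tx_count_write() charges one worst-case indirect
      block per level over the accessed range. As a hedged, self-contained
      illustration of just the indirect-level part of that estimate (the
      block-size constants below are assumptions, not values read from a
      live dnode):

          static uint64_t
          worst_case_indirects(uint64_t off, uint64_t len)
          {
                  int min_bs = 9;         /* SPA_MINBLOCKSHIFT: 512-byte blocks */
                  int max_ibs = 14;       /* DN_MAX_INDBLKSHIFT: 16K indirects */
                  int epbs = 10 - 7;      /* DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT */
                  uint64_t start = off >> min_bs;
                  uint64_t end = (off + len - 1) >> min_bs;
                  uint64_t space = 0;
                  int bits;

                  for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
                          start >>= epbs;
                          end >>= epbs;
                          space += (end - start + 1) << max_ibs;
                          if (start != 0)
                                  space += 1ULL << max_ibs; /* new blkid=0 indirect */
                  }
                  return (space);
          }
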
 385  384  
 386  385  static void
 387  386  dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 388  387  {
 389  388          dnode_t *dn = txh->txh_dnode;
 390  389          dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 391  390          uint64_t space = mdn->dn_datablksz +
 392  391              ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 393  392  
 394  393          if (dn && dn->dn_dbuf->db_blkptr &&
 395  394              dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 396  395              dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 397  396                  txh->txh_space_tooverwrite += space;
 398  397                  txh->txh_space_tounref += space;
 399  398          } else {
 400  399                  txh->txh_space_towrite += space;
 401  400                  if (dn && dn->dn_dbuf->db_blkptr)
 402  401                          txh->txh_space_tounref += space;
 403  402          }
 404  403  }
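
      With assumed sizes, the charge computed above works out as follows
      (hypothetical numbers, not values from this diff):

          /*
           * Assumed: 16K meta-dnode data blocks (dn_datablksz == 16384),
           * 16K indirects (dn_indblkshift == 14), dn_nlevels == 3.
           */
          uint64_t space = 16384 + ((3 - 1) << 14);  /* 16K + 2*16K == 48K */
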
 405  404  
 406  405  void
 407  406  dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 408  407  {
 409  408          dmu_tx_hold_t *txh;
 410  409  
 411  410          ASSERT(tx->tx_txg == 0);
 412  411          ASSERT(len < DMU_MAX_ACCESS);
 413  412          ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 414  413  
 415  414          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 416  415              object, THT_WRITE, off, len);
 417  416          if (txh == NULL)
 418  417                  return;
 419  418  
 420  419          dmu_tx_count_write(txh, off, len);
 421  420          dmu_tx_count_dnode(txh);
 422  421  }
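
      A minimal usage sketch, assuming the standard DMU entry points from
      dmu.h (dmu_write() plus the tx lifecycle implemented in this file);
      error handling is the minimum required:

          static int
          example_write(objset_t *os, uint64_t object, uint64_t off,
              uint64_t len, const void *buf)
          {
                  dmu_tx_t *tx = dmu_tx_create(os);
                  int error;

                  dmu_tx_hold_write(tx, object, off, (int)len);
                  error = dmu_tx_assign(tx, TXG_WAIT);
                  if (error) {
                          dmu_tx_abort(tx);  /* drops holds; tx never ran */
                          return (error);
                  }
                  dmu_write(os, object, off, len, buf, tx);
                  dmu_tx_commit(tx);
                  return (0);
          }
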
 423  422  
 424  423  static void
 425  424  dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 426  425  {
 427  426          uint64_t blkid, nblks, lastblk;
 428  427          uint64_t space = 0, unref = 0, skipped = 0;
 429  428          dnode_t *dn = txh->txh_dnode;
 430  429          dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 431  430          spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 432  431          int epbs;
 433  432  
 434  433          if (dn->dn_nlevels == 0)
 435  434                  return;
 436  435  
 437  436          /*
 438  437           * The struct_rwlock protects us against dn_nlevels
 439  438           * changing, in case (against all odds) we manage to dirty &
 440  439           * sync out the changes after we check for being dirty.
 441  440           * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 442  441           */
 443  442          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 444  443          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 445  444          if (dn->dn_maxblkid == 0) {
 446  445                  if (off == 0 && len >= dn->dn_datablksz) {
 447  446                          blkid = 0;
 448  447                          nblks = 1;
 449  448                  } else {
 450  449                          rw_exit(&dn->dn_struct_rwlock);
 451  450                          return;
 452  451                  }
 453  452          } else {
 454  453                  blkid = off >> dn->dn_datablkshift;
 455  454                  nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 456  455  
 457  456                  if (blkid >= dn->dn_maxblkid) {
 458  457                          rw_exit(&dn->dn_struct_rwlock);
 459  458                          return;
 460  459                  }
 461  460                  if (blkid + nblks > dn->dn_maxblkid)
 462  461                          nblks = dn->dn_maxblkid - blkid;
 463  462  
 464  463          }
 465  464          if (dn->dn_nlevels == 1) {
 466  465                  int i;
 467  466                  for (i = 0; i < nblks; i++) {
 468  467                          blkptr_t *bp = dn->dn_phys->dn_blkptr;
 469  468                          ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 470  469                          bp += blkid + i;
 471  470                          if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 472  471                                  dprintf_bp(bp, "can free old%s", "");
 473  472                                  space += bp_get_dsize(spa, bp);
 474  473                          }
 475  474                          unref += BP_GET_ASIZE(bp);
 476  475                  }
 477  476                  nblks = 0;
 478  477          }
 479  478  
 480  479          /*
 481  480           * Add in memory requirements of higher-level indirects.
 482  481           * This assumes a worst-possible scenario for dn_nlevels.
 483  482           */
 484  483          {
 485  484                  uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
 486  485                  int level = (dn->dn_nlevels > 1) ? 2 : 1;
 487  486  
 488  487                  while (level++ < DN_MAX_LEVELS) {
 489  488                          txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
 490  489                          blkcnt = 1 + (blkcnt >> epbs);
 491  490                  }
 492  491                  ASSERT(blkcnt <= dn->dn_nblkptr);
 493  492          }
 494  493  
 495  494          lastblk = blkid + nblks - 1;
 496  495          while (nblks) {
 497  496                  dmu_buf_impl_t *dbuf;
 498  497                  uint64_t ibyte, new_blkid;
 499  498                  int epb = 1 << epbs;
 500  499                  int err, i, blkoff, tochk;
 501  500                  blkptr_t *bp;
 502  501  
 503  502                  ibyte = blkid << dn->dn_datablkshift;
 504  503                  err = dnode_next_offset(dn,
 505  504                      DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 506  505                  new_blkid = ibyte >> dn->dn_datablkshift;
 507  506                  if (err == ESRCH) {
 508  507                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 509  508                          break;
 510  509                  }
 511  510                  if (err) {
 512  511                          txh->txh_tx->tx_err = err;
 513  512                          break;
 514  513                  }
 515  514                  if (new_blkid > lastblk) {
 516  515                          skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 517  516                          break;
 518  517                  }
 519  518  
 520  519                  if (new_blkid > blkid) {
 521  520                          ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 522  521                          skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 523  522                          nblks -= new_blkid - blkid;
 524  523                          blkid = new_blkid;
 525  524                  }
 526  525                  blkoff = P2PHASE(blkid, epb);
 527  526                  tochk = MIN(epb - blkoff, nblks);
 528  527  
 529  528                  err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 530  529                  if (err) {
 531  530                          txh->txh_tx->tx_err = err;
 532  531                          break;
 533  532                  }
 534  533  
 535  534                  txh->txh_memory_tohold += dbuf->db.db_size;
 536  535  
 537  536                  /*
 538  537                   * We don't check memory_tohold against DMU_MAX_ACCESS because
 539  538                   * memory_tohold is an over-estimation (especially the >L1
 540  539                   * indirect blocks), so it could fail.  Callers should have
 541  540                   * already verified that they will not be holding too much
 542  541                   * memory.
 543  542                   */
 544  543  
 545  544                  err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 546  545                  if (err != 0) {
 547  546                          txh->txh_tx->tx_err = err;
 548  547                          dbuf_rele(dbuf, FTAG);
 549  548                          break;
 550  549                  }
 551  550  
 552  551                  bp = dbuf->db.db_data;
 553  552                  bp += blkoff;
 554  553  
 555  554                  for (i = 0; i < tochk; i++) {
 556  555                          if (dsl_dataset_block_freeable(ds, &bp[i],
 557  556                              bp[i].blk_birth)) {
 558  557                                  dprintf_bp(&bp[i], "can free old%s", "");
 559  558                                  space += bp_get_dsize(spa, &bp[i]);
 560  559                          }
 561  560                          unref += BP_GET_ASIZE(bp);
 562  561                  }
 563  562                  dbuf_rele(dbuf, FTAG);
 564  563  
 565  564                  blkid += tochk;
 566  565                  nblks -= tochk;
 567  566          }
 568  567          rw_exit(&dn->dn_struct_rwlock);
 569  568  
 570  569          /* account for new level 1 indirect blocks that might show up */
 571  570          if (skipped > 0) {
 572  571                  txh->txh_fudge += skipped << dn->dn_indblkshift;
 573  572                  skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 574  573                  txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 575  574          }
 576  575          txh->txh_space_tofree += space;
 577  576          txh->txh_space_tounref += unref;
 578  577  }
 579  578  
 580  579  void
 581  580  dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 582  581  {
 583  582          dmu_tx_hold_t *txh;
 584  583          dnode_t *dn;
 585  584          uint64_t start, end, i;
 586  585          int err, shift;
 587  586          zio_t *zio;
 588  587  
 589  588          ASSERT(tx->tx_txg == 0);
 590  589  
 591  590          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 592  591              object, THT_FREE, off, len);
 593  592          if (txh == NULL)
 594  593                  return;
 595  594          dn = txh->txh_dnode;
 596  595  
 597  596          /* first block */
 598  597          if (off != 0)
 599  598                  dmu_tx_count_write(txh, off, 1);
 600  599          /* last block */
 601  600          if (len != DMU_OBJECT_END)
 602  601                  dmu_tx_count_write(txh, off+len, 1);
 603  602  
 604  603          dmu_tx_count_dnode(txh);
 605  604  
 606  605          if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 607  606                  return;
 608  607          if (len == DMU_OBJECT_END)
 609  608                  len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 610  609  
 611  610          /*
 612  611           * For i/o error checking, read the first and last level-0
 613  612           * blocks, and all the level-1 blocks.  The above count_write's
 614  613           * have already taken care of the level-0 blocks.
 615  614           */
 616  615          if (dn->dn_nlevels > 1) {
 617  616                  shift = dn->dn_datablkshift + dn->dn_indblkshift -
 618  617                      SPA_BLKPTRSHIFT;
 619  618                  start = off >> shift;
 620  619                  end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
 621  620  
 622  621                  zio = zio_root(tx->tx_pool->dp_spa,
 623  622                      NULL, NULL, ZIO_FLAG_CANFAIL);
 624  623                  for (i = start; i <= end; i++) {
 625  624                          uint64_t ibyte = i << shift;
 626  625                          err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 627  626                          i = ibyte >> shift;
 628  627                          if (err == ESRCH)
 629  628                                  break;
 630  629                          if (err) {
 631  630                                  tx->tx_err = err;
 632  631                                  return;
 633  632                          }
 634  633  
 635  634                          err = dmu_tx_check_ioerr(zio, dn, 1, i);
 636  635                          if (err) {
 637  636                                  tx->tx_err = err;
 638  637                                  return;
 639  638                          }
 640  639                  }
 641  640                  err = zio_wait(zio);
 642  641                  if (err) {
 643  642                          tx->tx_err = err;
 644  643                          return;
 645  644                  }
 646  645          }
 647  646  
 648  647          dmu_tx_count_free(txh, off, len);
 649  648  }
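
      A hedged usage sketch pairing this hold with dmu_free_range();
      DMU_OBJECT_END covers the whole object:

          static int
          example_free_all(objset_t *os, uint64_t object)
          {
                  dmu_tx_t *tx = dmu_tx_create(os);
                  int error;

                  dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
                  error = dmu_tx_assign(tx, TXG_WAIT);
                  if (error) {
                          dmu_tx_abort(tx);
                          return (error);
                  }
                  error = dmu_free_range(os, object, 0, DMU_OBJECT_END, tx);
                  dmu_tx_commit(tx);
                  return (error);
          }
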
 650  649  
 651  650  void
 652  651  dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 653  652  {
 654  653          dmu_tx_hold_t *txh;
 655  654          dnode_t *dn;
 656  655          uint64_t nblocks;
 657  656          int epbs, err;
 658  657  
 659  658          ASSERT(tx->tx_txg == 0);
 660  659  
 661  660          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 662  661              object, THT_ZAP, add, (uintptr_t)name);
 663  662          if (txh == NULL)
 664  663                  return;
 665  664          dn = txh->txh_dnode;
 666  665  
 667  666          dmu_tx_count_dnode(txh);
 668  667  
 669  668          if (dn == NULL) {
 670  669                  /*
 671  670                   * We will be able to fit a new object's entries into one leaf
 672  671                   * block.  So there will be at most 2 blocks total,
 673  672                   * including the header block.
 674  673                   */
 675  674                  dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 676  675                  return;
 677  676          }
 678  677  
 679      -        ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
      678 +        ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 680  679  
 681  680          if (dn->dn_maxblkid == 0 && !add) {
 682  681                  blkptr_t *bp;
 683  682  
 684  683                  /*
  685  684                   * If there is only one block (i.e. this is a micro-zap)
 686  685                   * and we are not adding anything, the accounting is simple.
 687  686                   */
 688  687                  err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 689  688                  if (err) {
 690  689                          tx->tx_err = err;
 691  690                          return;
 692  691                  }
 693  692  
 694  693                  /*
 695  694                   * Use max block size here, since we don't know how much
 696  695                   * the size will change between now and the dbuf dirty call.
 697  696                   */
 698  697                  bp = &dn->dn_phys->dn_blkptr[0];
 699  698                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 700  699                      bp, bp->blk_birth))
 701  700                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
 702  701                  else
 703  702                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
 704  703                  if (!BP_IS_HOLE(bp))
 705  704                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 706  705                  return;
 707  706          }
 708  707  
 709  708          if (dn->dn_maxblkid > 0 && name) {
 710  709                  /*
 711  710                   * access the name in this fat-zap so that we'll check
 712  711                   * for i/o errors to the leaf blocks, etc.
 713  712                   */
 714  713                  err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 715  714                      8, 0, NULL);
 716  715                  if (err == EIO) {
 717  716                          tx->tx_err = err;
 718  717                          return;
 719  718                  }
 720  719          }
 721  720  
 722  721          err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 723  722              &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 724  723  
 725  724          /*
 726  725           * If the modified blocks are scattered to the four winds,
 727  726           * we'll have to modify an indirect twig for each.
 728  727           */
 729  728          epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 730  729          for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 731  730                  if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
 732  731                          txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 733  732                  else
 734  733                          txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 735  734  }
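
      A hedged usage sketch pairing this hold with zap_add(); passing the
      name lets the hold pre-read the affected leaf blocks as described
      above:

          static int
          example_zap_add(objset_t *os, uint64_t zapobj, const char *name,
              uint64_t value)
          {
                  dmu_tx_t *tx = dmu_tx_create(os);
                  int error;

                  dmu_tx_hold_zap(tx, zapobj, B_TRUE, name); /* add == B_TRUE */
                  error = dmu_tx_assign(tx, TXG_WAIT);
                  if (error) {
                          dmu_tx_abort(tx);
                          return (error);
                  }
                  error = zap_add(os, zapobj, name, sizeof (uint64_t), 1,
                      &value, tx);
                  dmu_tx_commit(tx);
                  return (error);
          }
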
 736  735  
 737  736  void
 738  737  dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 739  738  {
 740  739          dmu_tx_hold_t *txh;
 741  740  
 742  741          ASSERT(tx->tx_txg == 0);
 743  742  
 744  743          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 745  744              object, THT_BONUS, 0, 0);
 746  745          if (txh)
 747  746                  dmu_tx_count_dnode(txh);
 748  747  }
 749  748  
 750  749  void
 751  750  dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 752  751  {
 753  752          dmu_tx_hold_t *txh;
 754  753          ASSERT(tx->tx_txg == 0);
 755  754  
 756  755          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 757  756              DMU_NEW_OBJECT, THT_SPACE, space, 0);
 758  757  
 759  758          txh->txh_space_towrite += space;
 760  759  }
 761  760  
 762  761  int
 763  762  dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 764  763  {
 765  764          dmu_tx_hold_t *txh;
 766  765          int holds = 0;
 767  766  
 768  767          /*
 769  768           * By asserting that the tx is assigned, we're counting the
 770  769           * number of dn_tx_holds, which is the same as the number of
 771  770           * dn_holds.  Otherwise, we'd be counting dn_holds, but
 772  771           * dn_tx_holds could be 0.
 773  772           */
 774  773          ASSERT(tx->tx_txg != 0);
 775  774  
 776  775          /* if (tx->tx_anyobj == TRUE) */
 777  776                  /* return (0); */
 778  777  
 779  778          for (txh = list_head(&tx->tx_holds); txh;
 780  779              txh = list_next(&tx->tx_holds, txh)) {
 781  780                  if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 782  781                          holds++;
 783  782          }
 784  783  
 785  784          return (holds);
 786  785  }
 787  786  
 788  787  #ifdef ZFS_DEBUG
 789  788  void
 790  789  dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 791  790  {
 792  791          dmu_tx_hold_t *txh;
 793  792          int match_object = FALSE, match_offset = FALSE;
 794  793          dnode_t *dn;
 795  794  
 796  795          DB_DNODE_ENTER(db);
 797  796          dn = DB_DNODE(db);
 798  797          ASSERT(tx->tx_txg != 0);
 799  798          ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 800  799          ASSERT3U(dn->dn_object, ==, db->db.db_object);
 801  800  
 802  801          if (tx->tx_anyobj) {
 803  802                  DB_DNODE_EXIT(db);
 804  803                  return;
 805  804          }
 806  805  
 807  806          /* XXX No checking on the meta dnode for now */
 808  807          if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 809  808                  DB_DNODE_EXIT(db);
 810  809                  return;
 811  810          }
 812  811  
 813  812          for (txh = list_head(&tx->tx_holds); txh;
 814  813              txh = list_next(&tx->tx_holds, txh)) {
 815  814                  ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
 816  815                  if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 817  816                          match_object = TRUE;
 818  817                  if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 819  818                          int datablkshift = dn->dn_datablkshift ?
 820  819                              dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 821  820                          int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 822  821                          int shift = datablkshift + epbs * db->db_level;
 823  822                          uint64_t beginblk = shift >= 64 ? 0 :
 824  823                              (txh->txh_arg1 >> shift);
 825  824                          uint64_t endblk = shift >= 64 ? 0 :
 826  825                              ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 827  826                          uint64_t blkid = db->db_blkid;
 828  827  
 829  828                          /* XXX txh_arg2 better not be zero... */
 830  829  
 831  830                          dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 832  831                              txh->txh_type, beginblk, endblk);
 833  832  
 834  833                          switch (txh->txh_type) {
 835  834                          case THT_WRITE:
 836  835                                  if (blkid >= beginblk && blkid <= endblk)
 837  836                                          match_offset = TRUE;
 838  837                                  /*
 839  838                                   * We will let this hold work for the bonus
 840  839                                   * or spill buffer so that we don't need to
 841  840                                   * hold it when creating a new object.
 842  841                                   */
 843  842                                  if (blkid == DMU_BONUS_BLKID ||
 844  843                                      blkid == DMU_SPILL_BLKID)
 845  844                                          match_offset = TRUE;
 846  845                                  /*
 847  846                                   * They might have to increase nlevels,
  848  847                                   * thus dirtying the new TLIBs.  Or they
  849  848                                   * might have to change the block size,
  850  849                                   * thus dirtying the new lvl=0 blk=0.
 851  850                                   */
 852  851                                  if (blkid == 0)
 853  852                                          match_offset = TRUE;
 854  853                                  break;
 855  854                          case THT_FREE:
 856  855                                  /*
 857  856                                   * We will dirty all the level 1 blocks in
 858  857                                   * the free range and perhaps the first and
 859  858                                   * last level 0 block.
 860  859                                   */
 861  860                                  if (blkid >= beginblk && (blkid <= endblk ||
 862  861                                      txh->txh_arg2 == DMU_OBJECT_END))
 863  862                                          match_offset = TRUE;
 864  863                                  break;
 865  864                          case THT_SPILL:
 866  865                                  if (blkid == DMU_SPILL_BLKID)
 867  866                                          match_offset = TRUE;
 868  867                                  break;
 869  868                          case THT_BONUS:
 870  869                                  if (blkid == DMU_BONUS_BLKID)
 871  870                                          match_offset = TRUE;
 872  871                                  break;
 873  872                          case THT_ZAP:
 874  873                                  match_offset = TRUE;
 875  874                                  break;
 876  875                          case THT_NEWOBJECT:
 877  876                                  match_object = TRUE;
 878  877                                  break;
 879  878                          default:
 880  879                                  ASSERT(!"bad txh_type");
 881  880                          }
 882  881                  }
 883  882                  if (match_object && match_offset) {
 884  883                          DB_DNODE_EXIT(db);
 885  884                          return;
 886  885                  }
 887  886          }
 888  887          DB_DNODE_EXIT(db);
 889  888          panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 890  889              (u_longlong_t)db->db.db_object, db->db_level,
 891  890              (u_longlong_t)db->db_blkid);
 892  891  }
 893  892  #endif
 894  893  
 895  894  static int
 896  895  dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 897  896  {
 898  897          dmu_tx_hold_t *txh;
 899  898          spa_t *spa = tx->tx_pool->dp_spa;
 900  899          uint64_t memory, asize, fsize, usize;
 901  900          uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 902  901  
 903  902          ASSERT3U(tx->tx_txg, ==, 0);
 904  903  
 905  904          if (tx->tx_err)
 906  905                  return (tx->tx_err);
 907  906  
 908  907          if (spa_suspended(spa)) {
 909  908                  /*
 910  909                   * If the user has indicated a blocking failure mode
 911  910                   * then return ERESTART which will block in dmu_tx_wait().
 912  911                   * Otherwise, return EIO so that an error can get
 913  912                   * propagated back to the VOP calls.
 914  913                   *
 915  914                   * Note that we always honor the txg_how flag regardless
 916  915                   * of the failuremode setting.
 917  916                   */
 918  917                  if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 919  918                      txg_how != TXG_WAIT)
 920  919                          return (EIO);
 921  920  
 922  921                  return (ERESTART);
 923  922          }
 924  923  
 925  924          tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 926  925          tx->tx_needassign_txh = NULL;
 927  926  
 928  927          /*
 929  928           * NB: No error returns are allowed after txg_hold_open, but
 930  929           * before processing the dnode holds, due to the
 931  930           * dmu_tx_unassign() logic.
 932  931           */
 933  932  
 934  933          towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
 935  934          for (txh = list_head(&tx->tx_holds); txh;
 936  935              txh = list_next(&tx->tx_holds, txh)) {
 937  936                  dnode_t *dn = txh->txh_dnode;
 938  937                  if (dn != NULL) {
 939  938                          mutex_enter(&dn->dn_mtx);
 940  939                          if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 941  940                                  mutex_exit(&dn->dn_mtx);
 942  941                                  tx->tx_needassign_txh = txh;
 943  942                                  return (ERESTART);
 944  943                          }
 945  944                          if (dn->dn_assigned_txg == 0)
 946  945                                  dn->dn_assigned_txg = tx->tx_txg;
 947  946                          ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 948  947                          (void) refcount_add(&dn->dn_tx_holds, tx);
 949  948                          mutex_exit(&dn->dn_mtx);
 950  949                  }
 951  950                  towrite += txh->txh_space_towrite;
 952  951                  tofree += txh->txh_space_tofree;
 953  952                  tooverwrite += txh->txh_space_tooverwrite;
 954  953                  tounref += txh->txh_space_tounref;
 955  954                  tohold += txh->txh_memory_tohold;
 956  955                  fudge += txh->txh_fudge;
 957  956          }
 958  957  
 959  958          /*
 960  959           * NB: This check must be after we've held the dnodes, so that
 961  960           * the dmu_tx_unassign() logic will work properly
 962  961           */
 963  962          if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
 964  963                  return (ERESTART);
 965  964  
 966  965          /*
 967  966           * If a snapshot has been taken since we made our estimates,
 968  967           * assume that we won't be able to free or overwrite anything.
 969  968           */
 970  969          if (tx->tx_objset &&
 971  970              dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
 972  971              tx->tx_lastsnap_txg) {
 973  972                  towrite += tooverwrite;
 974  973                  tooverwrite = tofree = 0;
 975  974          }
 976  975  
 977  976          /* needed allocation: worst-case estimate of write space */
 978  977          asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
 979  978          /* freed space estimate: worst-case overwrite + free estimate */
 980  979          fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
 981  980          /* convert unrefd space to worst-case estimate */
 982  981          usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
 983  982          /* calculate memory footprint estimate */
 984  983          memory = towrite + tooverwrite + tohold;
 985  984  
 986  985  #ifdef ZFS_DEBUG
 987  986          /*
 988  987           * Add in 'tohold' to account for our dirty holds on this memory
 989  988           * XXX - the "fudge" factor is to account for skipped blocks that
 990  989           * we missed because dnode_next_offset() misses in-core-only blocks.
 991  990           */
 992  991          tx->tx_space_towrite = asize +
 993  992              spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
 994  993          tx->tx_space_tofree = tofree;
 995  994          tx->tx_space_tooverwrite = tooverwrite;
 996  995          tx->tx_space_tounref = tounref;
 997  996  #endif
 998  997  
 999  998          if (tx->tx_dir && asize != 0) {
1000  999                  int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1001 1000                      asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1002 1001                  if (err)
1003 1002                          return (err);
1004 1003          }
1005 1004  
1006 1005          return (0);
1007 1006  }
1008 1007  
1009 1008  static void
1010 1009  dmu_tx_unassign(dmu_tx_t *tx)
1011 1010  {
1012 1011          dmu_tx_hold_t *txh;
1013 1012  
1014 1013          if (tx->tx_txg == 0)
1015 1014                  return;
1016 1015  
1017 1016          txg_rele_to_quiesce(&tx->tx_txgh);
1018 1017  
1019 1018          for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1020 1019              txh = list_next(&tx->tx_holds, txh)) {
1021 1020                  dnode_t *dn = txh->txh_dnode;
1022 1021  
1023 1022                  if (dn == NULL)
1024 1023                          continue;
1025 1024                  mutex_enter(&dn->dn_mtx);
1026 1025                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1027 1026  
1028 1027                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1029 1028                          dn->dn_assigned_txg = 0;
1030 1029                          cv_broadcast(&dn->dn_notxholds);
1031 1030                  }
1032 1031                  mutex_exit(&dn->dn_mtx);
1033 1032          }
1034 1033  
1035 1034          txg_rele_to_sync(&tx->tx_txgh);
1036 1035  
1037 1036          tx->tx_lasttried_txg = tx->tx_txg;
1038 1037          tx->tx_txg = 0;
1039 1038  }
1040 1039  
1041 1040  /*
1042 1041   * Assign tx to a transaction group.  txg_how can be one of:
1043 1042   *
1044 1043   * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1045 1044   *      a new one.  This should be used when you're not holding locks.
1046 1045   *      If will only fail if we're truly out of space (or over quota).
1047 1046   *
1048 1047   * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1049 1048   *      blocking, returns immediately with ERESTART.  This should be used
1050 1049   *      whenever you're holding locks.  On an ERESTART error, the caller
1051 1050   *      should drop locks, do a dmu_tx_wait(tx), and try again.
1052 1051   *
1053 1052   * (3)  A specific txg.  Use this if you need to ensure that multiple
1054 1053   *      transactions all sync in the same txg.  Like TXG_NOWAIT, it
1055 1054   *      returns ERESTART if it can't assign you into the requested txg.
1056 1055   */
1057 1056  int
1058 1057  dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
1059 1058  {
1060 1059          int err;
1061 1060  
1062 1061          ASSERT(tx->tx_txg == 0);
1063 1062          ASSERT(txg_how != 0);
1064 1063          ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1065 1064  
1066 1065          while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1067 1066                  dmu_tx_unassign(tx);
1068 1067  
1069 1068                  if (err != ERESTART || txg_how != TXG_WAIT)
1070 1069                          return (err);
1071 1070  
1072 1071                  dmu_tx_wait(tx);
1073 1072          }
1074 1073  
1075 1074          txg_rele_to_quiesce(&tx->tx_txgh);
1076 1075  
1077 1076          return (0);
1078 1077  }
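
      The TXG_NOWAIT convention documented above implies a retry loop in
      callers; a minimal sketch of that pattern (a real caller would drop
      its own locks before dmu_tx_wait()):

          static int
          example_nowait_update(objset_t *os, uint64_t object,
              uint64_t off, int len)
          {
                  dmu_tx_t *tx;
                  int error;

          top:
                  tx = dmu_tx_create(os);
                  dmu_tx_hold_write(tx, object, off, len);
                  error = dmu_tx_assign(tx, TXG_NOWAIT);
                  if (error) {
                          if (error == ERESTART) {
                                  /* drop caller-held locks before waiting */
                                  dmu_tx_wait(tx);
                                  dmu_tx_abort(tx);
                                  goto top;
                          }
                          dmu_tx_abort(tx);
                          return (error);
                  }
                  /* ... dirty the held range here ... */
                  dmu_tx_commit(tx);
                  return (0);
          }
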
1079 1078  
1080 1079  void
1081 1080  dmu_tx_wait(dmu_tx_t *tx)
1082 1081  {
1083 1082          spa_t *spa = tx->tx_pool->dp_spa;
1084 1083  
1085 1084          ASSERT(tx->tx_txg == 0);
1086 1085  
1087 1086          /*
1088 1087           * It's possible that the pool has become active after this thread
1089 1088           * has tried to obtain a tx. If that's the case then his
1090 1089           * tx_lasttried_txg would not have been assigned.
1091 1090           */
1092 1091          if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1093 1092                  txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
1094 1093          } else if (tx->tx_needassign_txh) {
1095 1094                  dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1096 1095  
1097 1096                  mutex_enter(&dn->dn_mtx);
1098 1097                  while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1099 1098                          cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1100 1099                  mutex_exit(&dn->dn_mtx);
1101 1100                  tx->tx_needassign_txh = NULL;
1102 1101          } else {
1103 1102                  txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1104 1103          }
1105 1104  }
1106 1105  
1107 1106  void
1108 1107  dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1109 1108  {
1110 1109  #ifdef ZFS_DEBUG
1111 1110          if (tx->tx_dir == NULL || delta == 0)
1112 1111                  return;
1113 1112  
1114 1113          if (delta > 0) {
1115 1114                  ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1116 1115                      tx->tx_space_towrite);
1117 1116                  (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1118 1117          } else {
1119 1118                  (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1120 1119          }
1121 1120  #endif
1122 1121  }
1123 1122  
1124 1123  void
1125 1124  dmu_tx_commit(dmu_tx_t *tx)
1126 1125  {
1127 1126          dmu_tx_hold_t *txh;
1128 1127  
1129 1128          ASSERT(tx->tx_txg != 0);
1130 1129  
1131 1130          while (txh = list_head(&tx->tx_holds)) {
1132 1131                  dnode_t *dn = txh->txh_dnode;
1133 1132  
1134 1133                  list_remove(&tx->tx_holds, txh);
1135 1134                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1136 1135                  if (dn == NULL)
1137 1136                          continue;
1138 1137                  mutex_enter(&dn->dn_mtx);
1139 1138                  ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1140 1139  
1141 1140                  if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1142 1141                          dn->dn_assigned_txg = 0;
1143 1142                          cv_broadcast(&dn->dn_notxholds);
1144 1143                  }
1145 1144                  mutex_exit(&dn->dn_mtx);
1146 1145                  dnode_rele(dn, tx);
1147 1146          }
1148 1147  
1149 1148          if (tx->tx_tempreserve_cookie)
1150 1149                  dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1151 1150  
1152 1151          if (!list_is_empty(&tx->tx_callbacks))
1153 1152                  txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1154 1153  
1155 1154          if (tx->tx_anyobj == FALSE)
1156 1155                  txg_rele_to_sync(&tx->tx_txgh);
1157 1156  
1158 1157          list_destroy(&tx->tx_callbacks);
1159 1158          list_destroy(&tx->tx_holds);
1160 1159  #ifdef ZFS_DEBUG
1161 1160          dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1162 1161              tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1163 1162              tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1164 1163          refcount_destroy_many(&tx->tx_space_written,
1165 1164              refcount_count(&tx->tx_space_written));
1166 1165          refcount_destroy_many(&tx->tx_space_freed,
1167 1166              refcount_count(&tx->tx_space_freed));
1168 1167  #endif
1169 1168          kmem_free(tx, sizeof (dmu_tx_t));
1170 1169  }
1171 1170  
1172 1171  void
1173 1172  dmu_tx_abort(dmu_tx_t *tx)
1174 1173  {
1175 1174          dmu_tx_hold_t *txh;
1176 1175  
1177 1176          ASSERT(tx->tx_txg == 0);
1178 1177  
1179 1178          while (txh = list_head(&tx->tx_holds)) {
1180 1179                  dnode_t *dn = txh->txh_dnode;
1181 1180  
1182 1181                  list_remove(&tx->tx_holds, txh);
1183 1182                  kmem_free(txh, sizeof (dmu_tx_hold_t));
1184 1183                  if (dn != NULL)
1185 1184                          dnode_rele(dn, tx);
1186 1185          }
1187 1186  
1188 1187          /*
1189 1188           * Call any registered callbacks with an error code.
1190 1189           */
1191 1190          if (!list_is_empty(&tx->tx_callbacks))
1192 1191                  dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1193 1192  
1194 1193          list_destroy(&tx->tx_callbacks);
1195 1194          list_destroy(&tx->tx_holds);
1196 1195  #ifdef ZFS_DEBUG
1197 1196          refcount_destroy_many(&tx->tx_space_written,
1198 1197              refcount_count(&tx->tx_space_written));
1199 1198          refcount_destroy_many(&tx->tx_space_freed,
1200 1199              refcount_count(&tx->tx_space_freed));
1201 1200  #endif
1202 1201          kmem_free(tx, sizeof (dmu_tx_t));
1203 1202  }
1204 1203  
1205 1204  uint64_t
1206 1205  dmu_tx_get_txg(dmu_tx_t *tx)
1207 1206  {
1208 1207          ASSERT(tx->tx_txg != 0);
1209 1208          return (tx->tx_txg);
1210 1209  }
1211 1210  
1212 1211  void
1213 1212  dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1214 1213  {
1215 1214          dmu_tx_callback_t *dcb;
1216 1215  
1217 1216          dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1218 1217  
1219 1218          dcb->dcb_func = func;
1220 1219          dcb->dcb_data = data;
1221 1220  
1222 1221          list_insert_tail(&tx->tx_callbacks, dcb);
1223 1222  }
1224 1223  
1225 1224  /*
1226 1225   * Call all the commit callbacks on a list, with a given error code.
1227 1226   */
1228 1227  void
1229 1228  dmu_tx_do_callbacks(list_t *cb_list, int error)
1230 1229  {
1231 1230          dmu_tx_callback_t *dcb;
1232 1231  
1233 1232          while ((dcb = list_head(cb_list)) != NULL) {
1234 1233                  list_remove(cb_list, dcb);
1235 1234                  dcb->dcb_func(dcb->dcb_data, error);
1236 1235                  kmem_free(dcb, sizeof (dmu_tx_callback_t));
1237 1236          }
1238 1237  }
1239 1238  
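/*
 * A hedged sketch of the callback contract implemented above; the
 * function `my_done` and its `arg` are illustrative names only.  The
 * callback receives 0 once the txg reaches stable storage, or an
 * error such as ECANCELED if the transaction is aborted:
 *
 *	static void
 *	my_done(void *arg, int error)
 *	{
 *		if (error == 0)
 *			... the write is durable ...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_done, arg);
 *	dmu_tx_commit(tx);	-- my_done fires when the txg syncs
 */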
1240 1239  /*
1241 1240   * Interface to hold a bunch of attributes,
1242 1241   * used for creating new files.
1243 1242   * attrsize is the total size of all attributes
1244 1243   * to be added during object creation.
1245 1244   *
1246 1245   * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
1247 1246   */
1248 1247  
1249 1248  /*
1250 1249   * Hold the necessary attribute name for attribute registration.
1251 1250   * This should be a very rare case where it is needed.  If it does
1252 1251   * happen, it would only happen on the first write to the file system.
1253 1252   */
1254 1253  static void
1255 1254  dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1256 1255  {
1257 1256          int i;
1258 1257  
1259 1258          if (!sa->sa_need_attr_registration)
1260 1259                  return;
1261 1260  
1262 1261          for (i = 0; i != sa->sa_num_attrs; i++) {
1263 1262                  if (!sa->sa_attr_table[i].sa_registered) {
1264 1263                          if (sa->sa_reg_attr_obj)
1265 1264                                  dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1266 1265                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1267 1266                          else
1268 1267                                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1269 1268                                      B_TRUE, sa->sa_attr_table[i].sa_name);
1270 1269                  }
1271 1270          }
1272 1271  }
1273 1272  
1274 1273  
1275 1274  void
1276 1275  dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1277 1276  {
1278 1277          dnode_t *dn;
1279 1278          dmu_tx_hold_t *txh;
1280 1279  
1281 1280          txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1282 1281              THT_SPILL, 0, 0);
1283 1282  
1284 1283          dn = txh->txh_dnode;
1285 1284  
1286 1285          if (dn == NULL)
1287 1286                  return;
1288 1287  
1289 1288          /* If blkptr doesn't exist then add space to towrite */
1290 1289          if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1291 1290                  txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1292 1291          } else {
1293 1292                  blkptr_t *bp;
1294 1293  
1295 1294                  bp = &dn->dn_phys->dn_spill;
1296 1295                  if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1297 1296                      bp, bp->blk_birth))
1298 1297                          txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
1299 1298                  else
1300 1299                          txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
1301 1300                  if (!BP_IS_HOLE(bp))
1302 1301                          txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
1303 1302          }
1304 1303  }
1305 1304  
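/*
 * Accounting sketch for the spill hold above, assuming the current
 * 128K SPA_MAXBLOCKSIZE: a dnode with no spill pointer charges 128K
 * to towrite; one whose existing spill block is freeable charges 128K
 * to tooverwrite instead; and any non-hole spill block additionally
 * charges 128K to tounref.
 */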
1306 1305  void
1307 1306  dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1308 1307  {
1309 1308          sa_os_t *sa = tx->tx_objset->os_sa;
1310 1309  
1311 1310          dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1312 1311  
1313 1312          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1314 1313                  return;
1315 1314  
1316 1315          if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1317 1316                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1318 1317          else {
1319 1318                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1320 1319                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1321 1320                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1322 1321                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1323 1322          }
1324 1323  
1325 1324          dmu_tx_sa_registration_hold(sa, tx);
1326 1325  
1327 1326          if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1328 1327                  return;
1329 1328  
1330 1329          (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1331 1330              THT_SPILL, 0, 0);
1332 1331  }
1333 1332  
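/*
 * A sketch of reserving for a new object's initial attribute bundle;
 * `os` and `total_attr_size` are placeholder names, not taken from
 * this file:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_sa_create(tx, total_attr_size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		... create the object and populate its SA bundle ...
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */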
1334 1333  /*
1335 1334   * Hold SA attribute
1336 1335   *
1337 1336   * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1338 1337   *
1339 1338   * If may_grow is set, the attribute data may grow beyond its current
1340 1339   * size; the layout ZAP object is then held as well, and the object's
1341 1340   * spill block is reserved.
1342 1341   */
1343 1342  void
1344 1343  dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1345 1344  {
1346 1345          uint64_t object;
1347 1346          sa_os_t *sa = tx->tx_objset->os_sa;
1348 1347  
1349 1348          ASSERT(hdl != NULL);
1350 1349  
1351 1350          object = sa_handle_object(hdl);
1352 1351  
1353 1352          dmu_tx_hold_bonus(tx, object);
1354 1353  
1355 1354          if (tx->tx_objset->os_sa->sa_master_obj == 0)
1356 1355                  return;
1357 1356  
1358 1357          if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1359 1358              tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1360 1359                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1361 1360                  dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1362 1361                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1363 1362                  dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1364 1363          }
1365 1364  
1366 1365          dmu_tx_sa_registration_hold(sa, tx);
1367 1366  
1368 1367          if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1369 1368                  dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1370 1369  
1371 1370          if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1372 1371                  ASSERT(tx->tx_txg == 0);
1373 1372                  dmu_tx_hold_spill(tx, object);
1374 1373          } else {
1375 1374                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1376 1375                  dnode_t *dn;
1377 1376  
1378 1377                  DB_DNODE_ENTER(db);
1379 1378                  dn = DB_DNODE(db);
1380 1379                  if (dn->dn_have_spill) {
1381 1380                          ASSERT(tx->tx_txg == 0);
1382 1381                          dmu_tx_hold_spill(tx, object);
1383 1382                  }
1384 1383                  DB_DNODE_EXIT(db);
1385 1384          }
1386 1385  }
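/*
 * A sketch of updating a single attribute through an SA handle; `os`,
 * `hdl`, `attr`, and `val` are placeholders, and B_FALSE indicates
 * the attribute bundle will not grow:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_sa(tx, hdl, B_FALSE);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error == 0) {
 *		error = sa_update(hdl, attr, &val, sizeof (val), tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */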
  
| 697 lines elided |