5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
--- old/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ new/usr/src/uts/common/fs/zfs/dsl_pool.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 Steven Hartland. All rights reserved.
       25 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25   26   */
  26   27  
  27   28  #include <sys/dsl_pool.h>
  28   29  #include <sys/dsl_dataset.h>
  29   30  #include <sys/dsl_prop.h>
  30   31  #include <sys/dsl_dir.h>
  31   32  #include <sys/dsl_synctask.h>
  32   33  #include <sys/dsl_scan.h>
  33   34  #include <sys/dnode.h>
  34   35  #include <sys/dmu_tx.h>
  35   36  #include <sys/dmu_objset.h>
  36   37  #include <sys/arc.h>
  37   38  #include <sys/zap.h>
  38   39  #include <sys/zio.h>
  39   40  #include <sys/zfs_context.h>
  40   41  #include <sys/fs/zfs.h>
  41   42  #include <sys/zfs_znode.h>
  42   43  #include <sys/spa_impl.h>
  43   44  #include <sys/dsl_deadlist.h>
  44   45  #include <sys/bptree.h>
  45   46  #include <sys/zfeature.h>
  46   47  #include <sys/zil_impl.h>
  47   48  #include <sys/dsl_userhold.h>
  48   49  
  49   50  /*
  50   51   * ZFS Write Throttle
  51   52   * ------------------
  52   53   *
  53   54   * ZFS must limit the rate of incoming writes to the rate at which it is able
  54   55   * to sync data modifications to the backend storage. Throttling by too much
  55   56   * creates an artificial limit; throttling by too little can only be sustained
  56   57   * for short periods and would lead to highly lumpy performance. On a per-pool
  57   58   * basis, ZFS tracks the amount of modified (dirty) data. As operations change
  58   59   * data, the amount of dirty data increases; as ZFS syncs out data, the amount
  59   60   * of dirty data decreases. When the amount of dirty data exceeds a
  60   61   * predetermined threshold further modifications are blocked until the amount
  61   62   * of dirty data decreases (as data is synced out).
  62   63   *
  63   64   * The limit on dirty data is tunable, and should be adjusted according to
  64   65   * both the IO capacity and available memory of the system. The larger the
  65   66   * window, the more ZFS is able to aggregate and amortize metadata (and data)
  66   67   * changes. However, memory is a limited resource, and allowing for more dirty
  67   68   * data comes at the cost of keeping other useful data in memory (for example
  68   69   * ZFS data cached by the ARC).
  69   70   *
  70   71   * Implementation
  71   72   *
  72   73   * As buffers are modified dsl_pool_willuse_space() increments both the per-
  73   74   * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
  74   75   * dirty space used; dsl_pool_dirty_space() decrements those values as data
  75   76   * is synced out from dsl_pool_sync(). While only the poolwide value is
  76   77   * relevant, the per-txg value is useful for debugging. The tunable
  77   78   * zfs_dirty_data_max determines the dirty space limit. Once that value is
  78   79   * exceeded, new writes are halted until space frees up.
  79   80   *
  80   81   * The zfs_dirty_data_sync tunable dictates the threshold at which we
  81   82   * ensure that there is a txg syncing (see the comment in txg.c for a full
  82   83   * description of transaction group stages).
  83   84   *
  84   85   * The IO scheduler uses both the dirty space limit and current amount of
  85   86   * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
  86   87   * issues. See the comment in vdev_queue.c for details of the IO scheduler.
  87   88   *
  88   89   * The delay is also calculated based on the amount of dirty data.  See the
  89   90   * comment above dmu_tx_delay() for details.
  90   91   */
  91   92  
  92   93  /*
  93   94   * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  94   95   * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
  95   96   */
  96   97  uint64_t zfs_dirty_data_max;
  97   98  uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
  98   99  int zfs_dirty_data_max_percent = 10;
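
[Editor's note] As a worked example of the tunables above (illustration only, not
part of this change): on a system with 16 GiB of memory, zfs_dirty_data_max
starts out at 10% of 16 GiB, i.e. about 1.6 GiB; on a 64 GiB system the 10%
figure (6.4 GiB) is capped at zfs_dirty_data_max_max (4 GiB). Either result can
still be overridden in /etc/system.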
  99  100  
 100  101  /*
 101  102   * If there is at least this much dirty data, push out a txg.
 102  103   */
 103  104  uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
 104  105  
 105  106  /*
 106  107   * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
 107  108   * and delay each transaction.
 108  109   * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
 109  110   */
 110  111  int zfs_delay_min_dirty_percent = 60;
 111  112  
 112  113  /*
 113  114   * This controls how quickly the delay approaches infinity.
 114  115   * Larger values cause it to delay more for a given amount of dirty data.
 115  116   * Therefore larger values will cause there to be less dirty data for a
 116  117   * given throughput.
 117  118   *
 118  119   * For the smoothest delay, this value should be about 1 billion divided
 119  120   * by the maximum number of operations per second.  This will smoothly
 120  121   * handle between 10x and 1/10th this number.
 121  122   *
 122  123   * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
 123  124   * multiply in dmu_tx_delay().
 124  125   */
 125  126  uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
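
[Editor's note] Plugging in the default above (illustration only): zfs_delay_scale
= 1,000,000,000 / 2000 = 500,000, i.e. the default is tuned for a pool that can
sustain roughly 2,000 operations per second and, per the comment, should behave
smoothly from roughly 200 up to 20,000 operations per second. See the comment
above dmu_tx_delay() for how this scale factor is applied to the dirty-data level.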
 126  127  
 127  128  
 128  129  hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
 129  130  hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
 130  131  
 131  132  int
 132  133  dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 133  134  {
 134  135          uint64_t obj;
 135  136          int err;
 136  137  
 137  138          err = zap_lookup(dp->dp_meta_objset,
 138  139              dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 139  140              name, sizeof (obj), 1, &obj);
 140  141          if (err)
 141  142                  return (err);
 142  143  
 143  144          return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 144  145  }
 145  146  
 146  147  static dsl_pool_t *
 147  148  dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 148  149  {
 149  150          dsl_pool_t *dp;
 150  151          blkptr_t *bp = spa_get_rootblkptr(spa);
 151  152  
 152  153          dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 153  154          dp->dp_spa = spa;
 154  155          dp->dp_meta_rootbp = *bp;
 155  156          rrw_init(&dp->dp_config_rwlock, B_TRUE);
 156  157          txg_init(dp, txg);
 157  158  
 158  159          txg_list_create(&dp->dp_dirty_datasets,
 159  160              offsetof(dsl_dataset_t, ds_dirty_link));
 160  161          txg_list_create(&dp->dp_dirty_zilogs,
 161  162              offsetof(zilog_t, zl_dirty_link));
 162  163          txg_list_create(&dp->dp_dirty_dirs,
 163  164              offsetof(dsl_dir_t, dd_dirty_link));
 164  165          txg_list_create(&dp->dp_sync_tasks,
 165  166              offsetof(dsl_sync_task_t, dst_node));
 166  167  
 167  168          mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 168  169          cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 169  170  
 170  171          dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 171  172              1, 4, 0);
 172  173  
 173  174          return (dp);
 174  175  }
 175  176  
 176  177  int
 177  178  dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 178  179  {
 179  180          int err;
 180  181          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 181  182  
 182  183          err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 183  184              &dp->dp_meta_objset);
 184  185          if (err != 0)
 185  186                  dsl_pool_close(dp);
 186  187          else
 187  188                  *dpp = dp;
 188  189  
 189  190          return (err);
 190  191  }
 191  192  
 192  193  int
 193  194  dsl_pool_open(dsl_pool_t *dp)
 194  195  {
 195  196          int err;
 196  197          dsl_dir_t *dd;
 197  198          dsl_dataset_t *ds;
 198  199          uint64_t obj;
 199  200  
 200  201          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 201  202          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 202  203              DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 203  204              &dp->dp_root_dir_obj);
 204  205          if (err)
 205  206                  goto out;
 206  207  
 207  208          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 208  209              NULL, dp, &dp->dp_root_dir);
 209  210          if (err)
 210  211                  goto out;
 211  212  
 212  213          err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 213  214          if (err)
 214  215                  goto out;
 215  216  
 216  217          if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 217  218                  err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 218  219                  if (err)
 219  220                          goto out;
 220  221                  err = dsl_dataset_hold_obj(dp,
 221  222                      dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 222  223                  if (err == 0) {
 223  224                          err = dsl_dataset_hold_obj(dp,
 224  225                              dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 225  226                              &dp->dp_origin_snap);
 226  227                          dsl_dataset_rele(ds, FTAG);
 227  228                  }
 228  229                  dsl_dir_rele(dd, dp);
 229  230                  if (err)
 230  231                          goto out;
 231  232          }
 232  233  
 233  234          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 234  235                  err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 235  236                      &dp->dp_free_dir);
 236  237                  if (err)
 237  238                          goto out;
 238  239  
 239  240                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 240  241                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 241  242                  if (err)
 242  243                          goto out;
 243  244                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 244  245                      dp->dp_meta_objset, obj));
 245  246          }
 246  247  
 247  248          /*
 248  249           * Note: errors ignored, because the leak dir will not exist if we
 249  250           * have not encountered a leak yet.
 250  251           */
 251  252          (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
 252  253              &dp->dp_leak_dir);
 253  254  
 254  255          if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 255  256                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 256  257                      DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 257  258                      &dp->dp_bptree_obj);
 258  259                  if (err != 0)
 259  260                          goto out;
 260  261          }
 261  262  
 262  263          if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 263  264                  err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 264  265                      DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
 265  266                      &dp->dp_empty_bpobj);
 266  267                  if (err != 0)
 267  268                          goto out;
 268  269          }
 269  270  
 270  271          err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 271  272              DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 272  273              &dp->dp_tmp_userrefs_obj);
 273  274          if (err == ENOENT)
 274  275                  err = 0;
 275  276          if (err)
 276  277                  goto out;
 277  278  
 278  279          err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 279  280  
 280  281  out:
 281  282          rrw_exit(&dp->dp_config_rwlock, FTAG);
 282  283          return (err);
 283  284  }
 284  285  
 285  286  void
 286  287  dsl_pool_close(dsl_pool_t *dp)
 287  288  {
 288  289          /*
 289  290           * Drop our references from dsl_pool_open().
 290  291           *
 291  292           * Since we held the origin_snap from "syncing" context (which
 292  293           * includes pool-opening context), it actually only got a "ref"
 293  294           * and not a hold, so just drop that here.
 294  295           */
 295  296          if (dp->dp_origin_snap)
 296  297                  dsl_dataset_rele(dp->dp_origin_snap, dp);
 297  298          if (dp->dp_mos_dir)
 298  299                  dsl_dir_rele(dp->dp_mos_dir, dp);
 299  300          if (dp->dp_free_dir)
 300  301                  dsl_dir_rele(dp->dp_free_dir, dp);
 301  302          if (dp->dp_leak_dir)
 302  303                  dsl_dir_rele(dp->dp_leak_dir, dp);
 303  304          if (dp->dp_root_dir)
 304  305                  dsl_dir_rele(dp->dp_root_dir, dp);
 305  306  
 306  307          bpobj_close(&dp->dp_free_bpobj);
 307  308  
 308  309          /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 309  310          if (dp->dp_meta_objset)
 310  311                  dmu_objset_evict(dp->dp_meta_objset);
 311  312  
 312  313          txg_list_destroy(&dp->dp_dirty_datasets);
 313  314          txg_list_destroy(&dp->dp_dirty_zilogs);
 314  315          txg_list_destroy(&dp->dp_sync_tasks);
 315  316          txg_list_destroy(&dp->dp_dirty_dirs);
 316  317  
 317  318          arc_flush(dp->dp_spa);
 318  319          txg_fini(dp);
 319  320          dsl_scan_fini(dp);
      321 +        dmu_buf_user_evict_wait();
      322 +
 320  323          rrw_destroy(&dp->dp_config_rwlock);
 321  324          mutex_destroy(&dp->dp_lock);
 322  325          taskq_destroy(dp->dp_vnrele_taskq);
 323  326          if (dp->dp_blkstats)
 324  327                  kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 325  328          kmem_free(dp, sizeof (dsl_pool_t));
 326  329  }
 327  330  
 328  331  dsl_pool_t *
 329  332  dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
 330  333  {
 331  334          int err;
 332  335          dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 333  336          dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 334  337          objset_t *os;
 335  338          dsl_dataset_t *ds;
 336  339          uint64_t obj;
 337  340  
 338  341          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 339  342  
 340  343          /* create and open the MOS (meta-objset) */
 341  344          dp->dp_meta_objset = dmu_objset_create_impl(spa,
 342  345              NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
 343  346  
 344  347          /* create the pool directory */
 345  348          err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 346  349              DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
 347  350          ASSERT0(err);
 348  351  
 349  352          /* Initialize scan structures */
 350  353          VERIFY0(dsl_scan_init(dp, txg));
 351  354  
 352  355          /* create and open the root dir */
 353  356          dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 354  357          VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 355  358              NULL, dp, &dp->dp_root_dir));
 356  359  
 357  360          /* create and open the meta-objset dir */
 358  361          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
 359  362          VERIFY0(dsl_pool_open_special_dir(dp,
 360  363              MOS_DIR_NAME, &dp->dp_mos_dir));
 361  364  
 362  365          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 363  366                  /* create and open the free dir */
 364  367                  (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 365  368                      FREE_DIR_NAME, tx);
 366  369                  VERIFY0(dsl_pool_open_special_dir(dp,
 367  370                      FREE_DIR_NAME, &dp->dp_free_dir));
 368  371  
 369  372                  /* create and open the free_bplist */
 370  373                  obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
 371  374                  VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 372  375                      DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
 373  376                  VERIFY0(bpobj_open(&dp->dp_free_bpobj,
 374  377                      dp->dp_meta_objset, obj));
 375  378          }
 376  379  
 377  380          if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 378  381                  dsl_pool_create_origin(dp, tx);
 379  382  
 380  383          /* create the root dataset */
 381  384          obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 382  385  
 383  386          /* create the root objset */
 384  387          VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 385  388          os = dmu_objset_create_impl(dp->dp_spa, ds,
 386  389              dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 387  390  #ifdef _KERNEL
 388  391          zfs_create_fs(os, kcred, zplprops, tx);
 389  392  #endif
 390  393          dsl_dataset_rele(ds, FTAG);
 391  394  
 392  395          dmu_tx_commit(tx);
 393  396  
 394  397          rrw_exit(&dp->dp_config_rwlock, FTAG);
 395  398  
 396  399          return (dp);
 397  400  }
 398  401  
 399  402  /*
 400  403   * Account for the meta-objset space in its placeholder dsl_dir.
 401  404   */
 402  405  void
 403  406  dsl_pool_mos_diduse_space(dsl_pool_t *dp,
 404  407      int64_t used, int64_t comp, int64_t uncomp)
 405  408  {
 406  409          ASSERT3U(comp, ==, uncomp); /* it's all metadata */
 407  410          mutex_enter(&dp->dp_lock);
 408  411          dp->dp_mos_used_delta += used;
 409  412          dp->dp_mos_compressed_delta += comp;
 410  413          dp->dp_mos_uncompressed_delta += uncomp;
 411  414          mutex_exit(&dp->dp_lock);
 412  415  }
 413  416  
 414  417  static int
 415  418  deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 416  419  {
 417  420          dsl_deadlist_t *dl = arg;
 418  421          dsl_deadlist_insert(dl, bp, tx);
 419  422          return (0);
 420  423  }
 421  424  
 422  425  static void
 423  426  dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
 424  427  {
 425  428          zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 426  429          dmu_objset_sync(dp->dp_meta_objset, zio, tx);
 427  430          VERIFY0(zio_wait(zio));
 428  431          dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 429  432          spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 430  433  }
 431  434  
 432  435  static void
 433  436  dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 434  437  {
 435  438          ASSERT(MUTEX_HELD(&dp->dp_lock));
 436  439  
 437  440          if (delta < 0)
 438  441                  ASSERT3U(-delta, <=, dp->dp_dirty_total);
 439  442  
 440  443          dp->dp_dirty_total += delta;
 441  444  
 442  445          /*
 443  446           * Note: we signal even when increasing dp_dirty_total.
 444  447           * This ensures forward progress -- each thread wakes the next waiter.
 445  448           */
 446  449          if (dp->dp_dirty_total <= zfs_dirty_data_max)
 447  450                  cv_signal(&dp->dp_spaceavail_cv);
 448  451  }
 449  452  
 450  453  void
 451  454  dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 452  455  {
 453  456          zio_t *zio;
 454  457          dmu_tx_t *tx;
 455  458          dsl_dir_t *dd;
 456  459          dsl_dataset_t *ds;
 457  460          objset_t *mos = dp->dp_meta_objset;
 458  461          list_t synced_datasets;
 459  462  
 460  463          list_create(&synced_datasets, sizeof (dsl_dataset_t),
 461  464              offsetof(dsl_dataset_t, ds_synced_link));
 462  465  
 463  466          tx = dmu_tx_create_assigned(dp, txg);
 464  467  
 465  468          /*
 466  469           * Write out all dirty blocks of dirty datasets.
 467  470           */
 468  471          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 469  472          while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 470  473                  /*
 471  474                   * We must not sync any non-MOS datasets twice, because
 472  475                   * we may have taken a snapshot of them.  However, we
 473  476                   * may sync newly-created datasets on pass 2.
 474  477                   */
 475  478                  ASSERT(!list_link_active(&ds->ds_synced_link));
 476  479                  list_insert_tail(&synced_datasets, ds);
 477  480                  dsl_dataset_sync(ds, zio, tx);
 478  481          }
 479  482          VERIFY0(zio_wait(zio));
 480  483  
 481  484          /*
 482  485           * We have written all of the accounted dirty data, so our
 483  486           * dp_space_towrite should now be zero.  However, some seldom-used
 484  487           * code paths do not adhere to this (e.g. dbuf_undirty(), also
 485  488           * rounding error in dbuf_write_physdone).
 486  489           * Shore up the accounting of any dirtied space now.
 487  490           */
 488  491          dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
 489  492  
 490  493          /*
 491  494           * After the data blocks have been written (ensured by the zio_wait()
 492  495           * above), update the user/group space accounting.
 493  496           */
 494  497          for (ds = list_head(&synced_datasets); ds != NULL;
 495  498              ds = list_next(&synced_datasets, ds)) {
 496  499                  dmu_objset_do_userquota_updates(ds->ds_objset, tx);
 497  500          }
 498  501  
 499  502          /*
 500  503           * Sync the datasets again to push out the changes due to
 501  504           * userspace updates.  This must be done before we process the
 502  505           * sync tasks, so that any snapshots will have the correct
 503  506           * user accounting information (and we won't get confused
 504  507           * about which blocks are part of the snapshot).
 505  508           */
 506  509          zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 507  510          while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 508  511                  ASSERT(list_link_active(&ds->ds_synced_link));
 509  512                  dmu_buf_rele(ds->ds_dbuf, ds);
 510  513                  dsl_dataset_sync(ds, zio, tx);
 511  514          }
 512  515          VERIFY0(zio_wait(zio));
 513  516  
 514  517          /*
 515  518           * Now that the datasets have been completely synced, we can
 516  519           * clean up our in-memory structures accumulated while syncing:
 517  520           *
 518  521           *  - move dead blocks from the pending deadlist to the on-disk deadlist
 519  522           *  - release hold from dsl_dataset_dirty()
 520  523           */
 521  524          while ((ds = list_remove_head(&synced_datasets)) != NULL) {
 522  525                  objset_t *os = ds->ds_objset;
 523  526                  bplist_iterate(&ds->ds_pending_deadlist,
 524  527                      deadlist_enqueue_cb, &ds->ds_deadlist, tx);
 525  528                  ASSERT(!dmu_objset_is_dirty(os, txg));
 526  529                  dmu_buf_rele(ds->ds_dbuf, ds);
 527  530          }
 528  531          while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 529  532                  dsl_dir_sync(dd, tx);
 530  533          }
 531  534  
 532  535          /*
 533  536           * The MOS's space is accounted for in the pool/$MOS
 534  537           * (dp_mos_dir).  We can't modify the mos while we're syncing
 535  538           * it, so we remember the deltas and apply them here.
 536  539           */
 537  540          if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
 538  541              dp->dp_mos_uncompressed_delta != 0) {
 539  542                  dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
 540  543                      dp->dp_mos_used_delta,
 541  544                      dp->dp_mos_compressed_delta,
 542  545                      dp->dp_mos_uncompressed_delta, tx);
 543  546                  dp->dp_mos_used_delta = 0;
 544  547                  dp->dp_mos_compressed_delta = 0;
 545  548                  dp->dp_mos_uncompressed_delta = 0;
 546  549          }
 547  550  
 548  551          if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 549  552              list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 550  553                  dsl_pool_sync_mos(dp, tx);
 551  554          }
 552  555  
 553  556          /*
 554  557           * If we modify a dataset in the same txg that we want to destroy it,
 555  558           * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
 556  559           * dsl_dir_destroy_check() will fail if there are unexpected holds.
 557  560           * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
 558  561           * and clearing the hold on it) before we process the sync_tasks.
 559  562           * The MOS data dirtied by the sync_tasks will be synced on the next
 560  563           * pass.
 561  564           */
 562  565          if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
 563  566                  dsl_sync_task_t *dst;
 564  567                  /*
 565  568                   * No more sync tasks should have been added while we
 566  569                   * were syncing.
 567  570                   */
 568  571                  ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
 569  572                  while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
 570  573                          dsl_sync_task_sync(dst, tx);
 571  574          }
 572  575  
 573  576          dmu_tx_commit(tx);
 574  577  
 575  578          DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 576  579  }
 577  580  
 578  581  void
 579  582  dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 580  583  {
 581  584          zilog_t *zilog;
 582  585  
 583  586          while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
 584  587                  dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 585  588                  zil_clean(zilog, txg);
 586  589                  ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
 587  590                  dmu_buf_rele(ds->ds_dbuf, zilog);
 588  591          }
 589  592          ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 590  593  }
 591  594  
 592  595  /*
 593  596   * TRUE if the current thread is the tx_sync_thread or if we
 594  597   * are being called from SPA context during pool initialization.
 595  598   */
 596  599  int
 597  600  dsl_pool_sync_context(dsl_pool_t *dp)
 598  601  {
 599  602          return (curthread == dp->dp_tx.tx_sync_thread ||
 600  603              spa_is_initializing(dp->dp_spa));
 601  604  }
 602  605  
 603  606  uint64_t
 604  607  dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 605  608  {
 606  609          uint64_t space, resv;
 607  610  
 608  611          /*
 609  612           * If we're trying to assess whether it's OK to do a free,
 610  613           * cut the reservation in half to allow forward progress
 611  614           * (e.g. make it possible to rm(1) files from a full pool).
 612  615           */
 613  616          space = spa_get_dspace(dp->dp_spa);
 614  617          resv = spa_get_slop_space(dp->dp_spa);
 615  618          if (netfree)
 616  619                  resv >>= 1;
 617  620  
 618  621          return (space - resv);
 619  622  }
 620  623  
 621  624  boolean_t
 622  625  dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 623  626  {
 624  627          uint64_t delay_min_bytes =
 625  628              zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
 626  629          boolean_t rv;
 627  630  
 628  631          mutex_enter(&dp->dp_lock);
 629  632          if (dp->dp_dirty_total > zfs_dirty_data_sync)
 630  633                  txg_kick(dp);
 631  634          rv = (dp->dp_dirty_total > delay_min_bytes);
 632  635          mutex_exit(&dp->dp_lock);
 633  636          return (rv);
 634  637  }
 635  638  
 636  639  void
 637  640  dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 638  641  {
 639  642          if (space > 0) {
 640  643                  mutex_enter(&dp->dp_lock);
 641  644                  dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
 642  645                  dsl_pool_dirty_delta(dp, space);
 643  646                  mutex_exit(&dp->dp_lock);
 644  647          }
 645  648  }
 646  649  
 647  650  void
 648  651  dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 649  652  {
 650  653          ASSERT3S(space, >=, 0);
 651  654          if (space == 0)
 652  655                  return;
 653  656          mutex_enter(&dp->dp_lock);
 654  657          if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
 655  658                  /* XXX writing something we didn't dirty? */
 656  659                  space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 657  660          }
 658  661          ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
 659  662          dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
 660  663          ASSERT3U(dp->dp_dirty_total, >=, space);
 661  664          dsl_pool_dirty_delta(dp, -space);
 662  665          mutex_exit(&dp->dp_lock);
 663  666  }
 664  667  
 665  668  /* ARGSUSED */
 666  669  static int
 667  670  upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 668  671  {
 669  672          dmu_tx_t *tx = arg;
 670  673          dsl_dataset_t *ds, *prev = NULL;
 671  674          int err;
 672  675  
 673  676          err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 674  677          if (err)
 675  678                  return (err);
 676  679  
 677  680          while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
 678  681                  err = dsl_dataset_hold_obj(dp,
 679  682                      dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 680  683                  if (err) {
 681  684                          dsl_dataset_rele(ds, FTAG);
 682  685                          return (err);
 683  686                  }
 684  687  
 685  688                  if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 686  689                          break;
 687  690                  dsl_dataset_rele(ds, FTAG);
 688  691                  ds = prev;
 689  692                  prev = NULL;
 690  693          }
 691  694  
 692  695          if (prev == NULL) {
 693  696                  prev = dp->dp_origin_snap;
 694  697  
 695  698                  /*
 696  699                   * The $ORIGIN can't have any data, or the accounting
 697  700                   * will be wrong.
 698  701                   */
 699  702                  ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
 700  703  
 701  704                  /* The origin doesn't get attached to itself */
 702  705                  if (ds->ds_object == prev->ds_object) {
 703  706                          dsl_dataset_rele(ds, FTAG);
 704  707                          return (0);
 705  708                  }
 706  709  
 707  710                  dmu_buf_will_dirty(ds->ds_dbuf, tx);
 708  711                  dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
 709  712                  dsl_dataset_phys(ds)->ds_prev_snap_txg =
 710  713                      dsl_dataset_phys(prev)->ds_creation_txg;
 711  714  
 712  715                  dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 713  716                  dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 714  717  
 715  718                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 716  719                  dsl_dataset_phys(prev)->ds_num_children++;
 717  720  
 718  721                  if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 719  722                          ASSERT(ds->ds_prev == NULL);
 720  723                          VERIFY0(dsl_dataset_hold_obj(dp,
 721  724                              dsl_dataset_phys(ds)->ds_prev_snap_obj,
 722  725                              ds, &ds->ds_prev));
 723  726                  }
 724  727          }
 725  728  
 726  729          ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
 727  730          ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 728  731  
 729  732          if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 730  733                  dmu_buf_will_dirty(prev->ds_dbuf, tx);
 731  734                  dsl_dataset_phys(prev)->ds_next_clones_obj =
 732  735                      zap_create(dp->dp_meta_objset,
 733  736                      DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 734  737          }
 735  738          VERIFY0(zap_add_int(dp->dp_meta_objset,
 736  739              dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 737  740  
 738  741          dsl_dataset_rele(ds, FTAG);
 739  742          if (prev != dp->dp_origin_snap)
 740  743                  dsl_dataset_rele(prev, FTAG);
 741  744          return (0);
 742  745  }
 743  746  
 744  747  void
 745  748  dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 746  749  {
 747  750          ASSERT(dmu_tx_is_syncing(tx));
 748  751          ASSERT(dp->dp_origin_snap != NULL);
 749  752  
 750  753          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
 751  754              tx, DS_FIND_CHILDREN));
 752  755  }
 753  756  
 754  757  /* ARGSUSED */
 755  758  static int
 756  759  upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 757  760  {
 758  761          dmu_tx_t *tx = arg;
 759  762          objset_t *mos = dp->dp_meta_objset;
 760  763  
 761  764          if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
 762  765                  dsl_dataset_t *origin;
 763  766  
 764  767                  VERIFY0(dsl_dataset_hold_obj(dp,
 765  768                      dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
 766  769  
 767  770                  if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
 768  771                          dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
 769  772                          dsl_dir_phys(origin->ds_dir)->dd_clones =
 770  773                              zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
 771  774                              0, tx);
 772  775                  }
 773  776  
 774  777                  VERIFY0(zap_add_int(dp->dp_meta_objset,
 775  778                      dsl_dir_phys(origin->ds_dir)->dd_clones,
 776  779                      ds->ds_object, tx));
 777  780  
 778  781                  dsl_dataset_rele(origin, FTAG);
 779  782          }
 780  783          return (0);
 781  784  }
 782  785  
 783  786  void
 784  787  dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
 785  788  {
 786  789          ASSERT(dmu_tx_is_syncing(tx));
 787  790          uint64_t obj;
 788  791  
 789  792          (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
 790  793          VERIFY0(dsl_pool_open_special_dir(dp,
 791  794              FREE_DIR_NAME, &dp->dp_free_dir));
 792  795  
 793  796          /*
 794  797           * We can't use bpobj_alloc(), because spa_version() still
 795  798           * returns the old version, and we need a new-version bpobj with
 796  799           * subobj support.  So call dmu_object_alloc() directly.
 797  800           */
 798  801          obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
 799  802              SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
 800  803          VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 801  804              DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
 802  805          VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
 803  806  
 804  807          VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 805  808              upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
 806  809  }
 807  810  
 808  811  void
 809  812  dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 810  813  {
 811  814          uint64_t dsobj;
 812  815          dsl_dataset_t *ds;
 813  816  
 814  817          ASSERT(dmu_tx_is_syncing(tx));
 815  818          ASSERT(dp->dp_origin_snap == NULL);
 816  819          ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 817  820  
 818  821          /* create the origin dir, ds, & snap-ds */
 819  822          dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 820  823              NULL, 0, kcred, tx);
 821  824          VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 822  825          dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
 823  826          VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 824  827              dp, &dp->dp_origin_snap));
 825  828          dsl_dataset_rele(ds, FTAG);
 826  829  }
 827  830  
 828  831  taskq_t *
 829  832  dsl_pool_vnrele_taskq(dsl_pool_t *dp)
 830  833  {
 831  834          return (dp->dp_vnrele_taskq);
 832  835  }
 833  836  
 834  837  /*
 835  838   * Walk through the pool-wide zap object of temporary snapshot user holds
 836  839   * and release them.
 837  840   */
 838  841  void
 839  842  dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
 840  843  {
 841  844          zap_attribute_t za;
 842  845          zap_cursor_t zc;
 843  846          objset_t *mos = dp->dp_meta_objset;
 844  847          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 845  848          nvlist_t *holds;
 846  849  
 847  850          if (zapobj == 0)
 848  851                  return;
 849  852          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 850  853  
 851  854          holds = fnvlist_alloc();
 852  855  
 853  856          for (zap_cursor_init(&zc, mos, zapobj);
 854  857              zap_cursor_retrieve(&zc, &za) == 0;
 855  858              zap_cursor_advance(&zc)) {
 856  859                  char *htag;
 857  860                  nvlist_t *tags;
 858  861  
 859  862                  htag = strchr(za.za_name, '-');
 860  863                  *htag = '\0';
 861  864                  ++htag;
 862  865                  if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
 863  866                          tags = fnvlist_alloc();
 864  867                          fnvlist_add_boolean(tags, htag);
 865  868                          fnvlist_add_nvlist(holds, za.za_name, tags);
 866  869                          fnvlist_free(tags);
 867  870                  } else {
 868  871                          fnvlist_add_boolean(tags, htag);
 869  872                  }
 870  873          }
 871  874          dsl_dataset_user_release_tmp(dp, holds);
 872  875          fnvlist_free(holds);
 873  876          zap_cursor_fini(&zc);
 874  877  }
 875  878  
 876  879  /*
 877  880   * Create the pool-wide zap object for storing temporary snapshot holds.
 878  881   */
 879  882  void
 880  883  dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 881  884  {
 882  885          objset_t *mos = dp->dp_meta_objset;
 883  886  
 884  887          ASSERT(dp->dp_tmp_userrefs_obj == 0);
 885  888          ASSERT(dmu_tx_is_syncing(tx));
 886  889  
 887  890          dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 888  891              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 889  892  }
 890  893  
 891  894  static int
 892  895  dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 893  896      const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 894  897  {
 895  898          objset_t *mos = dp->dp_meta_objset;
 896  899          uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 897  900          char *name;
 898  901          int error;
 899  902  
 900  903          ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 901  904          ASSERT(dmu_tx_is_syncing(tx));
 902  905  
 903  906          /*
 904  907           * If the pool was created prior to SPA_VERSION_USERREFS, the
 905  908           * zap object for temporary holds might not exist yet.
 906  909           */
 907  910          if (zapobj == 0) {
 908  911                  if (holding) {
 909  912                          dsl_pool_user_hold_create_obj(dp, tx);
 910  913                          zapobj = dp->dp_tmp_userrefs_obj;
 911  914                  } else {
 912  915                          return (SET_ERROR(ENOENT));
 913  916                  }
 914  917          }
 915  918  
 916  919          name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 917  920          if (holding)
 918  921                  error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 919  922          else
 920  923                  error = zap_remove(mos, zapobj, name, tx);
 921  924          strfree(name);
 922  925  
 923  926          return (error);
 924  927  }
 925  928  
 926  929  /*
 927  930   * Add a temporary hold for the given dataset object and tag.
 928  931   */
 929  932  int
 930  933  dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 931  934      uint64_t now, dmu_tx_t *tx)
 932  935  {
 933  936          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 934  937  }
 935  938  
 936  939  /*
 937  940   * Release a temporary hold for the given dataset object and tag.
 938  941   */
 939  942  int
 940  943  dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
 941  944      dmu_tx_t *tx)
 942  945  {
 943  946          return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
 944  947              tx, B_FALSE));
 945  948  }
 946  949  
 947  950  /*
 948  951   * DSL Pool Configuration Lock
 949  952   *
 950  953   * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
 951  954   * creation / destruction / rename / property setting).  It must be held for
 952  955   * read to hold a dataset or dsl_dir.  I.e. you must call
 953  956   * dsl_pool_config_enter() or dsl_pool_hold() before calling
 954  957   * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
 955  958   * must be held continuously until all datasets and dsl_dirs are released.
 956  959   *
 957  960   * The only exception to this rule is that if a "long hold" is placed on
 958  961   * a dataset, then the dp_config_rwlock may be dropped while the dataset
 959  962   * is still held.  The long hold will prevent the dataset from being
 960  963   * destroyed -- the destroy will fail with EBUSY.  A long hold can be
 961  964   * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
 962  965   * (by calling dsl_{dataset,objset}_{try}own{_obj}).
 963  966   *
 964  967   * Legitimate long-holders (including owners) should be long-running, cancelable
 965  968   * tasks that should cause "zfs destroy" to fail.  This includes DMU
 966  969   * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
 967  970   * "zfs send", and "zfs diff".  There are several other long-holders whose
 968  971   * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
 969  972   *
 970  973   * The usual formula for long-holding would be:
 971  974   * dsl_pool_hold()
 972  975   * dsl_dataset_hold()
 973  976   * ... perform checks ...
 974  977   * dsl_dataset_long_hold()
 975  978   * dsl_pool_rele()
 976  979   * ... perform long-running task ...
 977  980   * dsl_dataset_long_rele()
 978  981   * dsl_dataset_rele()
 979  982   *
 980  983   * Note that when the long hold is released, the dataset is still held but
 981  984   * the pool is not held.  The dataset may change arbitrarily during this time
 982  985   * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
 983  986   * dataset except release it.
 984  987   *
 985  988   * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
 986  989   * or modifying operations.
 987  990   *
 988  991   * Modifying operations should generally use dsl_sync_task().  The synctask
 989  992   * infrastructure enforces proper locking strategy with respect to the
 990  993   * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
 991  994   *
 992  995   * Read-only operations will manually hold the pool, then the dataset, obtain
 993  996   * information from the dataset, then release the pool and dataset.
 994  997   * dmu_objset_{hold,rele}() are convenience routines that also do the pool
 995  998   * hold/rele.
 996  999   */
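
[Editor's note] A minimal sketch of the long-hold sequence described in the
comment above (illustration only, not part of this change). Error paths are kept,
the long-running work itself is elided, and example_long_hold() is a hypothetical
wrapper; the helpers called are the existing routines in this file and in
dsl_dataset.c:

static int
example_long_hold(const char *name)
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        int error;

        error = dsl_pool_hold(name, FTAG, &dp);
        if (error != 0)
                return (error);
        error = dsl_dataset_hold(dp, name, FTAG, &ds);
        if (error != 0) {
                dsl_pool_rele(dp, FTAG);
                return (error);
        }
        /* ... perform checks while dp_config_rwlock is still held ... */
        dsl_dataset_long_hold(ds, FTAG);        /* "zfs destroy" now fails with EBUSY */
        dsl_pool_rele(dp, FTAG);                /* drop dp_config_rwlock */

        /* ... perform the long-running, cancelable task ... */

        dsl_dataset_long_rele(ds, FTAG);
        /* The dataset may now change arbitrarily; only release it. */
        dsl_dataset_rele(ds, FTAG);
        return (0);
}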
 997 1000  
 998 1001  int
 999 1002  dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
1000 1003  {
1001 1004          spa_t *spa;
1002 1005          int error;
1003 1006  
1004 1007          error = spa_open(name, &spa, tag);
1005 1008          if (error == 0) {
1006 1009                  *dp = spa_get_dsl(spa);
1007 1010                  dsl_pool_config_enter(*dp, tag);
1008 1011          }
1009 1012          return (error);
1010 1013  }
1011 1014  
1012 1015  void
1013 1016  dsl_pool_rele(dsl_pool_t *dp, void *tag)
1014 1017  {
1015 1018          dsl_pool_config_exit(dp, tag);
1016 1019          spa_close(dp->dp_spa, tag);
1017 1020  }
1018 1021  
1019 1022  void
1020 1023  dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
1021 1024  {
1022 1025          /*
1023 1026           * We use a "reentrant" reader-writer lock, but not reentrantly.
1024 1027           *
1025 1028           * The rrwlock can (with the track_all flag) track all reading threads,
1026 1029           * which is very useful for debugging which code path failed to release
1027 1030           * the lock, and for verifying that the *current* thread does hold
1028 1031           * the lock.
1029 1032           *
1030 1033           * (Unlike a rwlock, which knows that N threads hold it for
1031 1034           * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
1032 1035           * if any thread holds it for read, even if this thread doesn't).
1033 1036           */
1034 1037          ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
1035 1038          rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
1036 1039  }
1037 1040  
1038 1041  void
1039 1042  dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
1040 1043  {
1041 1044          rrw_exit(&dp->dp_config_rwlock, tag);
1042 1045  }
1043 1046  
1044 1047  boolean_t
1045 1048  dsl_pool_config_held(dsl_pool_t *dp)
1046 1049  {
1047 1050          return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
1048 1051  }
  