10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
    
      
          --- old/usr/src/uts/common/fs/zfs/spa_checkpoint.c
          +++ new/usr/src/uts/common/fs/zfs/spa_checkpoint.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2017 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Storage Pool Checkpoint
  28   28   *
  29   29   * A storage pool checkpoint can be thought of as a pool-wide snapshot or
  30   30   * a stable version of extreme rewind that guarantees no blocks from the
  31   31   * checkpointed state will have been overwritten. It remembers the entire
  32   32   * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
  33   33   * point that it was taken and the user can rewind back to that point even if
  34   34   * they applied destructive operations on their datasets or even enabled new
  35   35   * zpool on-disk features. If a pool has a checkpoint that is no longer
  36   36   * needed, the user can discard it.
  37   37   *
  38   38   * == On disk data structures used ==
  39   39   *
  40   40   * - The pool has a new feature flag and a new entry in the MOS. The feature
  41   41   *   flag is set to active when we create the checkpoint and remains active
  42   42   *   until the checkpoint is fully discarded. The entry in the MOS config
  43   43   *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
  44   44   *   references the state of the pool when we take the checkpoint. The entry
  45   45   *   remains populated until we start discarding the checkpoint or we rewind
  46   46   *   back to it.
  47   47   *
  48   48   * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
  49   49   *   which persists until the checkpoint is fully discarded. The space map
  50   50   *   contains entries that have been freed in the current state of the pool
  51   51   *   but we want to keep around in case we decide to rewind to the checkpoint.
  52   52   *   [see vdev_checkpoint_sm]
  53   53   *
  54   54   * - Each metaslab's ms_sm space map behaves the same as without the
  55   55   *   checkpoint, with the only exception being the scenario when we free
  56   56   *   blocks that belong to the checkpoint. In this case, these blocks remain
  57   57   *   ALLOCATED in the metaslab's space map and they are added as FREE in the
  58   58   *   vdev's checkpoint space map.
  59   59   *
  60   60   * - Each uberblock has a field (ub_checkpoint_txg) that holds the txg at which
  61   61   *   the uberblock was checkpointed. For normal uberblocks this field is 0.
  62   62   *
  63   63   * == Overview of operations ==
  64   64   *
  65   65   * - To create a checkpoint, we first wait for the current TXG to be synced,
  66   66   *   so we can use the most recently synced uberblock (spa_ubsync) as the
  67   67   *   checkpointed uberblock. Then we use an early synctask to place that
  68   68   *   uberblock in MOS config, increment the feature flag for the checkpoint
  69   69   *   (marking it active), and set spa_checkpoint_txg (see its use below)
  70   70   *   to the TXG of the checkpointed uberblock. We use an early synctask for
  71   71   *   the aforementioned operations to ensure that no blocks were dirtied
  72   72   *   between the current TXG and the TXG of the checkpointed uberblock
  73   73   *   (e.g. the previous txg).
  74   74   *
  75   75   * - When a checkpoint exists, we need to ensure that the blocks that
  76   76   *   belong to the checkpoint are freed but never reused. This means that
  77   77   *   these blocks should never end up in the ms_allocatable or the ms_freeing
  78   78   *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
  79   79   *   ms_checkpointing tree is used in addition to the aforementioned ones.
  80   80   *
  81   81   *   Whenever a block is freed and we find out that it is referenced by the
  82   82   *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
  83   83   *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
  84   84   *   This way, we divide the blocks that are being freed into checkpointed
  85   85   *   and not-checkpointed blocks.
  86   86   *
  87   87   *   In order to persist these frees, we write the extents from the
  88   88   *   ms_freeing tree to the ms_sm as usual, and the extents from the
  89   89   *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
  90   90   *   checkpointed extents will remain allocated in the metaslab's ms_sm space
  91   91   *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
  92   92   *   when we discard the checkpoint, we can find the entries that have
  93   93   *   actually been freed in vdev_checkpoint_sm.
  94   94   *   [see spa_checkpoint_discard_thread_sync()]
  95   95   *
  96   96   * - To discard the checkpoint we use an early synctask to delete the
  97   97   *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
  98   98   *   and wake up the discarding zthr (an open-context async thread).
  99   99   *   We use an early synctask to ensure that the operation happens before any
 100  100   *   new data end up in the checkpoint's data structures.
 101  101   *
 102  102   *   Once the synctask is done and the discarding zthr is awake, we discard
 103  103   *   the checkpointed data over multiple TXGs by having the zthr prefetch
 104  104   *   entries from vdev_checkpoint_sm and then start a synctask that places
 105  105   *   them as free blocks into their respective ms_allocatable and ms_sm
 106  106   *   structures.
 107  107   *   [see spa_checkpoint_discard_thread()]
 108  108   *
 109  109   *   When there are no entries left in the vdev_checkpoint_sm of all
 110  110   *   top-level vdevs, a final synctask runs that decrements the feature flag.
 111  111   *
 112  112   * - To rewind to the checkpoint, we first use the current uberblock and
 113  113   *   open the MOS so we can access the checkpointed uberblock from the MOS
 114  114   *   config. After we retrieve the checkpointed uberblock, we use it as the
 115  115   *   current uberblock for the pool by writing it to disk with an updated
 116  116   *   TXG, opening its version of the MOS, and moving on as usual from there.
 117  117   *   [see spa_ld_checkpoint_rewind()]
 118  118   *
 119  119   *   An important note on rewinding to the checkpoint has to do with how we
 120  120   *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
 121  121   *   blocks that have not been claimed by the time we took the checkpoint
 122  122   *   as they should no longer be valid.
 123  123   *   [see comment in zil_claim()]
 124  124   *
 125  125   * == Miscellaneous information ==
 126  126   *
 127  127   * - In the hypothetical event that we take a checkpoint, remove a vdev,
 128  128   *   and attempt to rewind, the rewind would fail as the checkpointed
 129  129   *   uberblock would reference data in the removed device. For this reason
 130  130   *   and others of similar nature, we disallow the following operations that
 131  131   *   can change the config:
 132  132   *      vdev removal and attach/detach, mirror splitting, and pool reguid.
 133  133   *
 134  134   * - As most of the checkpoint logic is implemented in the SPA and doesn't
 135  135   *   distinguish datasets when it comes to space accounting, having a
 136  136   *   checkpoint can potentially break the boundaries set by dataset
 137  137   *   reservations.
 138  138   */
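The block comment above describes how frees are split between the ms_freeing and ms_checkpointing trees. As a reader's aid, the sketch below condenses that classification into a single hypothetical helper; the function name and signature are assumptions for illustration only and are not the actual metaslab free path.

    /*
     * Illustrative sketch only (hypothetical helper, not part of this
     * change): route a freed segment either to the checkpoint tree or to
     * the normal freeing tree, based on its birth txg relative to
     * spa_checkpoint_txg.
     */
    static void
    checkpoint_aware_free(spa_t *spa, metaslab_t *ms, uint64_t offset,
        uint64_t size, uint64_t birth_txg)
    {
            mutex_enter(&ms->ms_lock);
            if (spa->spa_checkpoint_txg != 0 &&
                birth_txg <= spa->spa_checkpoint_txg) {
                    /*
                     * The block is referenced by the checkpoint: it stays
                     * ALLOCATED in ms_sm and is recorded in ms_checkpointing,
                     * which is later written as FREE entries into the vdev's
                     * vdev_checkpoint_sm.
                     */
                    range_tree_add(ms->ms_checkpointing, offset, size);
            } else {
                    /* Not part of the checkpoint: normal free path. */
                    range_tree_add(ms->ms_freeing, offset, size);
            }
            mutex_exit(&ms->ms_lock);
    }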
 139  139  
 140  140  #include <sys/dmu_tx.h>
 141  141  #include <sys/dsl_dir.h>
 142  142  #include <sys/dsl_synctask.h>
 143  143  #include <sys/metaslab_impl.h>
 144  144  #include <sys/spa.h>
 145  145  #include <sys/spa_impl.h>
 146  146  #include <sys/spa_checkpoint.h>
 147  147  #include <sys/vdev_impl.h>
 148  148  #include <sys/zap.h>
 149  149  #include <sys/zfeature.h>
 150  150  
 151  151  /*
 152  152   * The following parameter limits the amount of memory to be used for the
 153  153   * prefetching of the checkpoint space map done on each vdev while
 154  154   * discarding the checkpoint.
 155  155   *
 156  156   * It exists because top-level vdevs with long checkpoint
 157  157   * space maps can potentially take up a lot of memory depending on the
 158  158   * amount of checkpointed data that has been freed within them while
 159  159   * the pool had a checkpoint.
 160  160   */
 161  161  uint64_t        zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
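To make the default concrete, the arithmetic below works through how spa_checkpoint_discard_thread_sync() (further down in this file) turns the 16 MiB limit into a per-synctask entry cap; the numbers are just the defaults worked through, not new tunables.

    /*
     * Worked example with the default limit, assuming the worst case of
     * two-word (16-byte) space map entries:
     *
     *   max_entry_limit = (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1
     *                   = (16,777,216 / 16) / 2
     *                   = 524,288 entries per discarding synctask
     *
     * Once that many entries have been processed, the callback returns
     * EINTR and the remaining entries are handled by a later synctask.
     */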
 162  162  
 163  163  int
 164  164  spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
 165  165  {
 166  166          if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 167  167                  return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
 168  168  
 169  169          bzero(pcs, sizeof (pool_checkpoint_stat_t));
 170  170  
 171  171          int error = zap_contains(spa_meta_objset(spa),
 172  172              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
 173  173          ASSERT(error == 0 || error == ENOENT);
 174  174  
 175  175          if (error == ENOENT)
 176  176                  pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
 177  177          else
 178  178                  pcs->pcs_state = CS_CHECKPOINT_EXISTS;
 179  179  
 180  180          pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
 181  181          pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
 182  182  
 183  183          return (0);
 184  184  }
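A hedged sketch of how a caller might consume these stats follows; the surrounding context is hypothetical, but the state logic mirrors the function above: the DMU_POOL_ZPOOL_CHECKPOINT entry is removed when discarding starts, so "feature active but entry absent" reads back as CS_CHECKPOINT_DISCARDING.

    /*
     * Hypothetical caller (illustration only, not part of this change);
     * 'spa' is assumed to be an open, in-scope spa_t pointer.
     */
    pool_checkpoint_stat_t pcs;
    int err = spa_checkpoint_get_stats(spa, &pcs);
    if (err == 0) {
            if (pcs.pcs_state == CS_CHECKPOINT_DISCARDING)
                    zfs_dbgmsg("discarding checkpoint, %llu bytes left",
                        (u_longlong_t)pcs.pcs_space);
            else
                    zfs_dbgmsg("checkpoint exists, taken at %llu, %llu bytes",
                        (u_longlong_t)pcs.pcs_start_time,
                        (u_longlong_t)pcs.pcs_space);
    } else {
            ASSERT3S(err, ==, ZFS_ERR_NO_CHECKPOINT);
    }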
 185  185  
 186  186  static void
 187  187  spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
 188  188  {
 189  189          spa_t *spa = arg;
 190  190  
 191  191          spa->spa_checkpoint_info.sci_timestamp = 0;
 192  192  
 193  193          spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 194  194  
 195  195          spa_history_log_internal(spa, "spa discard checkpoint", tx,
 196  196              "finished discarding checkpointed state from the pool");
 197  197  }
 198  198  
 199  199  typedef struct spa_checkpoint_discard_sync_callback_arg {
 200  200          vdev_t *sdc_vd;
 201  201          uint64_t sdc_txg;
 202  202          uint64_t sdc_entry_limit;
 203  203  } spa_checkpoint_discard_sync_callback_arg_t;
 204  204  
 205  205  static int
 206  206  spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
 207  207  {
 208  208          spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
 209  209          vdev_t *vd = sdc->sdc_vd;
 210  210          metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 211  211          uint64_t end = sme->sme_offset + sme->sme_run;
 212  212  
 213  213          if (sdc->sdc_entry_limit == 0)
 214  214                  return (EINTR);
 215  215  
 216  216          /*
 217  217           * Since the space map is not condensed, we know that
 218  218           * none of its entries is crossing the boundaries of
 219  219           * its respective metaslab.
 220  220           *
 221  221           * That said, there is no fundamental requirement that
 222  222           * the checkpoint's space map entries should not cross
 223  223           * metaslab boundaries. So if needed we could add code
 224  224           * that handles metaslab-crossing segments in the future.
 225  225           */
 226  226          VERIFY3U(sme->sme_type, ==, SM_FREE);
 227  227          VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 228  228          VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 229  229  
 230  230          /*
 231  231           * At this point we should not be processing any
 232  232           * other frees concurrently, so the lock is technically
 233  233           * unnecessary. We use the lock anyway though to
 234  234           * potentially save ourselves from future headaches.
 235  235           */
 236  236          mutex_enter(&ms->ms_lock);
 237  237          if (range_tree_is_empty(ms->ms_freeing))
 238  238                  vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
 239  239          range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
 240  240          mutex_exit(&ms->ms_lock);
 241  241  
 242  242          ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
 243  243              sme->sme_run);
 244  244          ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
 245  245  
 246  246          vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
 247  247          vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
 248  248          sdc->sdc_entry_limit--;
 249  249  
 250  250          return (0);
 251  251  }
 252  252  
 253  253  static void
 254  254  spa_checkpoint_accounting_verify(spa_t *spa)
 255  255  {
 256  256          vdev_t *rvd = spa->spa_root_vdev;
 257  257          uint64_t ckpoint_sm_space_sum = 0;
 258  258          uint64_t vs_ckpoint_space_sum = 0;
 259  259  
 260  260          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 261  261                  vdev_t *vd = rvd->vdev_child[c];
 262  262  
 263  263                  if (vd->vdev_checkpoint_sm != NULL) {
 264  264                          ckpoint_sm_space_sum +=
 265      -                            -vd->vdev_checkpoint_sm->sm_alloc;
      265 +                            -space_map_allocated(vd->vdev_checkpoint_sm);
 266  266                          vs_ckpoint_space_sum +=
 267  267                              vd->vdev_stat.vs_checkpoint_space;
 268  268                          ASSERT3U(ckpoint_sm_space_sum, ==,
 269  269                              vs_ckpoint_space_sum);
 270  270                  } else {
 271  271                          ASSERT0(vd->vdev_stat.vs_checkpoint_space);
 272  272                  }
 273  273          }
 274  274          ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
 275  275  }
 276  276  
 277  277  static void
 278  278  spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 279  279  {
 280  280          vdev_t *vd = arg;
 281  281          int error;
 282  282  
 283  283          /*
 284  284           * The space map callback is applied only to non-debug entries.
 285  285           * Because the number of debug entries is less than or equal to the
 286  286           * number of non-debug entries, we want to ensure that we only
 287  287           * read what we prefetched from open-context.
 288  288           *
 289  289           * Thus, we set the maximum number of entries that the space map
 290  290           * callback will be applied to at half the entries that could fit in the
 291  291           * imposed memory limit.
 292  292           *
 293  293           * Note that since this is a conservative estimate we also
 294  294           * assume the worst case scenario in our computation where each
 295  295           * entry is two words.
 296  296           */
 297  297          uint64_t max_entry_limit =
 298  298              (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
 299  299  
 300  300          /*
 301  301           * Iterate from the end of the space map towards the beginning,
 302  302           * placing its entries on ms_freeing and removing them from the
 303  303           * space map. The iteration stops if one of the following
 304  304           * conditions is true:
 305  305           *
 306  306           * 1] We reached the beginning of the space map. At this point
 307  307           *    the space map should be completely empty and
 308  308           *    space_map_incremental_destroy should have returned 0.
 309  309           *    The next step would be to free and close the space map
 310  310           *    and remove its entry from its vdev's top zap. This allows
 311  311           *    spa_checkpoint_discard_thread() to move on to the next vdev.
 312  312           *
 313  313           * 2] We reached the memory limit (amount of memory used to hold
 314  314           *    space map entries in memory) and space_map_incremental_destroy
 315  315           *    returned EINTR. This means that there are entries remaining
 316  316           *    in the space map that will be cleared in a future invocation
 317  317           *    of this function by spa_checkpoint_discard_thread().
 318  318           */
 319  319          spa_checkpoint_discard_sync_callback_arg_t sdc;
 320  320          sdc.sdc_vd = vd;
 321  321          sdc.sdc_txg = tx->tx_txg;
 322  322          sdc.sdc_entry_limit = max_entry_limit;
 323  323  
 324  324          uint64_t words_before =
 325  325              space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 326  326  
 327  327          error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 328  328              spa_checkpoint_discard_sync_callback, &sdc, tx);
 329  329  
 330  330          uint64_t words_after =
 331  331              space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 332  332  
 333  333  #ifdef DEBUG
 334  334          spa_checkpoint_accounting_verify(vd->vdev_spa);
 335  335  #endif
 336  336  
 337  337          zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
 338  338              "deleted %llu words - %llu words are left",
 339  339              tx->tx_txg, vd->vdev_id, (words_before - words_after),
 340  340              words_after);
 341  341  
 342  342          if (error != EINTR) {
 343  343                  if (error != 0) {
 344  344                          zfs_panic_recover("zfs: error %d was returned "
 345  345                              "while incrementally destroying the checkpoint "
 346  346                              "space map of vdev %llu\n",
 347  347                              error, vd->vdev_id);
 348  348                  }
 349  349                  ASSERT0(words_after);
 350      -                ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
      350 +                ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
 351  351                  ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 352  352  
 353  353                  space_map_free(vd->vdev_checkpoint_sm, tx);
 354  354                  space_map_close(vd->vdev_checkpoint_sm);
 355  355                  vd->vdev_checkpoint_sm = NULL;
 356  356  
 357  357                  VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
 358  358                      vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 359  359          }
 360  360  }
 361  361  
 362  362  static boolean_t
 363  363  spa_checkpoint_discard_is_done(spa_t *spa)
 364  364  {
 365  365          vdev_t *rvd = spa->spa_root_vdev;
 366  366  
 367  367          ASSERT(!spa_has_checkpoint(spa));
 368  368          ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
 369  369  
 370  370          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 371  371                  if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
 372  372                          return (B_FALSE);
 373  373                  ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
 374  374          }
 375  375  
 376  376          return (B_TRUE);
 377  377  }
 378  378  
 379  379  /* ARGSUSED */
 380  380  boolean_t
 381  381  spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
 382  382  {
 383  383          spa_t *spa = arg;
 384  384  
 385  385          if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 386  386                  return (B_FALSE);
 387  387  
 388  388          if (spa_has_checkpoint(spa))
 389  389                  return (B_FALSE);
 390  390  
 391  391          return (B_TRUE);
 392  392  }
 393  393  
 394  394  int
 395  395  spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
 396  396  {
 397  397          spa_t *spa = arg;
 398  398          vdev_t *rvd = spa->spa_root_vdev;
 399  399  
 400  400          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 401  401                  vdev_t *vd = rvd->vdev_child[c];
 402  402  
 403  403                  while (vd->vdev_checkpoint_sm != NULL) {
 404  404                          space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
 405  405                          int numbufs;
 406  406                          dmu_buf_t **dbp;
 407  407  
 408  408                          if (zthr_iscancelled(zthr))
 409  409                                  return (0);
 410  410  
 411  411                          ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
 412  412  
 413  413                          uint64_t size = MIN(space_map_length(checkpoint_sm),
 414  414                              zfs_spa_discard_memory_limit);
 415  415                          uint64_t offset =
 416  416                              space_map_length(checkpoint_sm) - size;
 417  417  
 418  418                          /*
 419  419                           * Ensure that the part of the space map that will
 420  420                           * be destroyed by the synctask, is prefetched in
 421  421                           * memory before the synctask runs.
 422  422                           */
 423  423                          int error = dmu_buf_hold_array_by_bonus(
 424  424                              checkpoint_sm->sm_dbuf, offset, size,
 425  425                              B_TRUE, FTAG, &numbufs, &dbp);
 426  426                          if (error != 0) {
 427  427                                  zfs_panic_recover("zfs: error %d was returned "
 428  428                                      "while prefetching checkpoint space map "
 429  429                                      "entries of vdev %llu\n",
 430  430                                      error, vd->vdev_id);
 431  431                          }
 432  432  
 433  433                          VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 434  434                              spa_checkpoint_discard_thread_sync, vd,
 435  435                              0, ZFS_SPACE_CHECK_NONE));
 436  436  
 437  437                          dmu_buf_rele_array(dbp, numbufs, FTAG);
 438  438                  }
 439  439          }
 440  440  
 441  441          VERIFY(spa_checkpoint_discard_is_done(spa));
 442  442          VERIFY0(spa->spa_checkpoint_info.sci_dspace);
 443  443          VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 444  444              spa_checkpoint_discard_complete_sync, spa,
 445  445              0, ZFS_SPACE_CHECK_NONE));
 446  446  
 447  447          return (0);
 448  448  }
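To illustrate the prefetch windowing in spa_checkpoint_discard_thread() above, here is the computation worked through with an assumed space map size; only the 40 MiB figure is made up, the rest follows from the code.

    /*
     * Worked illustration (space map size assumed): with a 40 MiB
     * vdev_checkpoint_sm and the default 16 MiB memory limit, the first
     * iteration computes
     *
     *   size   = MIN(40 MiB, 16 MiB) = 16 MiB
     *   offset = 40 MiB - 16 MiB     = 24 MiB
     *
     * so it prefetches the last 16 MiB of the space map and lets the
     * synctask destroy those entries from the end backwards.  The next
     * iteration sees a 24 MiB map and repeats, until the map is empty,
     * freed, and its VDEV_TOP_ZAP_POOL_CHECKPOINT_SM entry removed.
     */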
 449  449  
 450  450  
 451  451  /* ARGSUSED */
 452  452  static int
 453  453  spa_checkpoint_check(void *arg, dmu_tx_t *tx)
 454  454  {
 455  455          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 456  456  
 457  457          if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
 458  458                  return (SET_ERROR(ENOTSUP));
 459  459  
 460  460          if (!spa_top_vdevs_spacemap_addressable(spa))
 461  461                  return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
 462  462  
 463  463          if (spa->spa_vdev_removal != NULL)
 464  464                  return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
 465  465  
 466  466          if (spa->spa_checkpoint_txg != 0)
 467  467                  return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
 468  468  
 469  469          if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 470  470                  return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 471  471  
 472  472          return (0);
 473  473  }
 474  474  
 475  475  /* ARGSUSED */
 476  476  static void
 477  477  spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
 478  478  {
 479  479          dsl_pool_t *dp = dmu_tx_pool(tx);
 480  480          spa_t *spa = dp->dp_spa;
 481  481          uberblock_t checkpoint = spa->spa_ubsync;
 482  482  
 483  483          /*
 484  484           * At this point, there should not be a checkpoint in the MOS.
 485  485           */
 486  486          ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 487  487              DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
 488  488  
 489  489          ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
 490  490          ASSERT0(spa->spa_checkpoint_info.sci_dspace);
 491  491  
 492  492          /*
 493  493           * Since the checkpointed uberblock is the one that just got synced
 494  494           * (we use spa_ubsync), its txg must be equal to the txg that we are
 495  495           * currently syncing, minus 1.
 496  496           */
 497  497          ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
 498  498  
 499  499          /*
 500  500           * Once the checkpoint is in place, we need to ensure that none of
 501  501           * its blocks will be marked for reuse after it has been freed.
 502  502           * When there is a checkpoint and a block is freed, we compare its
 503  503           * birth txg to the txg of the checkpointed uberblock to see if the
 504  504           * block is part of the checkpoint or not. Therefore, we have to set
 505  505           * spa_checkpoint_txg before any frees happen in this txg (which is
 506  506           * why this is done as an early_synctask as explained in the comment
 507  507           * in spa_checkpoint()).
 508  508           */
 509  509          spa->spa_checkpoint_txg = checkpoint.ub_txg;
 510  510          spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 511  511  
 512  512          checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
 513  513          VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
 514  514              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 515  515              sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 516  516              &checkpoint, tx));
 517  517  
 518  518          /*
 519  519           * Increment the feature refcount and thus activate the feature.
 520  520           * Note that the feature will be deactivated when we've
 521  521           * completely discarded all checkpointed state (both vdev
 522  522           * space maps and uberblock).
 523  523           */
 524  524          spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 525  525  
 526  526          spa_history_log_internal(spa, "spa checkpoint", tx,
 527  527              "checkpointed uberblock txg=%llu", checkpoint.ub_txg);
 528  528  }
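The assertions and comments above pin down the relationship between the checkpointed uberblock and the syncing txg. A small timeline, with txg numbers assumed purely for illustration, may help:

    /*
     * Timeline sketch (txg numbers assumed):
     *
     *   txg 100  last txg synced before the command; spa_ubsync.ub_txg = 100
     *   txg 101  spa_checkpoint_sync() runs as an early synctask:
     *              - checkpoint = spa_ubsync          (ub_txg == 100)
     *              - spa_checkpoint_txg = 100
     *              - uberblock written to DMU_POOL_ZPOOL_CHECKPOINT
     *              - feature refcount incremented
     *            only after that do frees for txg 101 proceed, so any block
     *            with birth txg <= 100 is routed to ms_checkpointing
     *            instead of ms_freeing.
     */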
 529  529  
 530  530  /*
 531  531   * Create a checkpoint for the pool.
 532  532   */
 533  533  int
 534  534  spa_checkpoint(const char *pool)
 535  535  {
 536  536          int error;
 537  537          spa_t *spa;
 538  538  
 539  539          error = spa_open(pool, &spa, FTAG);
 540  540          if (error != 0)
 541  541                  return (error);
 542  542  
 543  543          mutex_enter(&spa->spa_vdev_top_lock);
 544  544  
 545  545          /*
 546  546           * Wait for current syncing txg to finish so the latest synced
 547  547           * uberblock (spa_ubsync) has all the changes that we expect
 548  548           * to see if we were to revert later to the checkpoint. In other
 549  549           * words we want the checkpointed uberblock to include/reference
 550  550           * all the changes that were pending at the time that we issued
 551  551           * the checkpoint command.
 552  552           */
 553  553          txg_wait_synced(spa_get_dsl(spa), 0);
 554  554  
 555  555          /*
 556  556           * As the checkpointed uberblock references blocks from the previous
 557  557           * txg (spa_ubsync), we want to ensure that we are not freeing any of
 558  558           * these blocks in the same txg that the following synctask will
 559  559           * run. Thus, we run it as an early synctask, so the dirty changes
 560  560           * that are synced to disk afterwards during zios and other synctasks
 561  561           * do not reuse checkpointed blocks.
 562  562           */
 563  563          error = dsl_early_sync_task(pool, spa_checkpoint_check,
 564  564              spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
 565  565  
 566  566          mutex_exit(&spa->spa_vdev_top_lock);
 567  567  
 568  568          spa_close(spa, FTAG);
 569  569          return (error);
 570  570  }
 571  571  
 572  572  /* ARGSUSED */
 573  573  static int
 574  574  spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
 575  575  {
 576  576          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 577  577  
 578  578          if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 579  579                  return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
 580  580  
 581  581          if (spa->spa_checkpoint_txg == 0)
 582  582                  return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 583  583  
 584  584          VERIFY0(zap_contains(spa_meta_objset(spa),
 585  585              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
 586  586  
 587  587          return (0);
 588  588  }
 589  589  
 590  590  /* ARGSUSED */
 591  591  static void
 592  592  spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
 593  593  {
 594  594          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 595  595  
 596  596          VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 597  597              DMU_POOL_ZPOOL_CHECKPOINT, tx));
 598  598  
 599  599          spa->spa_checkpoint_txg = 0;
 600  600  
 601  601          zthr_wakeup(spa->spa_checkpoint_discard_zthr);
 602  602  
 603  603          spa_history_log_internal(spa, "spa discard checkpoint", tx,
 604  604              "started discarding checkpointed state from the pool");
 605  605  }
 606  606  
 607  607  /*
 608  608   * Discard the checkpoint from a pool.
 609  609   */
 610  610  int
 611  611  spa_checkpoint_discard(const char *pool)
 612  612  {
 613  613          /*
 614  614           * Similarly to spa_checkpoint(), we want our synctask to run
 615  615           * before any pending dirty data are written to disk so they
 616  616           * won't end up in the checkpoint's data structures (e.g.
 617  617           * ms_checkpointing and vdev_checkpoint_sm) and re-create any
 618  618           * space maps that the discarding open-context thread has
 619  619           * deleted.
 620  620           * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
 621  621           */
 622  622          return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
 623  623              spa_checkpoint_discard_sync, NULL, 0,
 624  624              ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
 625  625  }
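spa_checkpoint() and spa_checkpoint_discard() are the kernel entry points behind the pool checkpoint create and discard operations (exposed to users through the zpool checkpoint command and its discard option). The sketch below is a hypothetical in-kernel caller showing how completion of a discard can be observed; it is illustrative only, and the pool name and error handling are assumptions.

    /*
     * Hypothetical lifecycle sketch (not part of this change);
     * "tank" is an assumed pool name.
     */
    spa_t *spa;
    VERIFY0(spa_open("tank", &spa, FTAG));

    VERIFY0(spa_checkpoint("tank"));            /* take a checkpoint */
    /* ... destructive changes we might later want to rewind ... */
    VERIFY0(spa_checkpoint_discard("tank"));    /* start discarding it */

    /*
     * The checkpointed space is reclaimed asynchronously by the zthr over
     * many TXGs; the feature flag stays active until
     * spa_checkpoint_discard_complete_sync() decrements it.
     */
    while (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
            txg_wait_synced(spa_get_dsl(spa), 0);

    spa_close(spa, FTAG);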
  