10592 Wdiff usr/src/uts/common/fs/zfs/vdev_initialize.c

Print this page

10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/vdev_initialize.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_initialize.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2016 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  #include <sys/spa.h>
  27   27  #include <sys/spa_impl.h>
  28   28  #include <sys/txg.h>
  29   29  #include <sys/vdev_impl.h>
  30   30  #include <sys/refcount.h>
  31   31  #include <sys/metaslab_impl.h>
  32   32  #include <sys/dsl_synctask.h>
  33   33  #include <sys/zap.h>
  34   34  #include <sys/dmu_tx.h>
  35   35  
  36   36  /*
  37   37   * Maximum number of metaslabs per group that can be initialized
  38   38   * simultaneously.
            * vdev_initialize_mg_mark() blocks while a group's mg_ms_initializing
            * count has reached this value.
  39   39   */
  40   40  int max_initialize_ms = 3;
  41   41  
  42   42  /*
  43   43   * Value that is written to disk during initialization.
            * vdev_initialize_block_fill() stores it into every 8-byte word of the
            * filler buffer.
  44   44   */
  45   45  uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
  46   46  
  47   47  /* maximum number of I/Os outstanding per leaf vdev */
           /* (vdev_initialize_write() waits while the inflight count is >= this) */
  48   48  int zfs_initialize_limit = 1;
  49   49  
  50   50  /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
           /* (also the size of the filler buffer built by vdev_initialize_block_alloc) */
  51   51  uint64_t zfs_initialize_chunk_size = 1024 * 1024;
  52   52  
  53   53  static boolean_t
  54   54  vdev_initialize_should_stop(vdev_t *vd)
  55   55  {
                   /*
                    * Report whether initialization of this vdev should stop: an
                    * exit was requested, the vdev is no longer writeable, the vdev
                    * was detached, or its top-level vdev is being removed.
                    */
  56   56          return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
  57   57              vd->vdev_detached || vd->vdev_top->vdev_removing);
  58   58  }
  59   59  
  60   60  static void
  61   61  vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
  62   62  {
                   /*
                    * Sync task that persists the initialize progress of one vdev
                    * into its leaf ZAP. "arg" is a heap-allocated uint64_t holding
                    * the vdev guid; this function frees it.
                    */
  63   63          /*
  64   64           * We pass in the guid instead of the vdev_t since the vdev may
  65   65           * have been freed prior to the sync task being processed. This
  66   66           * happens when a vdev is detached as we call spa_config_vdev_exit(),
  67   67           * stop the initializing thread, schedule the sync task, and free
  68   68           * the vdev. Later when the scheduled sync task is invoked, it would
  69   69           * find that the vdev has been freed.
  70   70           */
  71   71          uint64_t guid = *(uint64_t *)arg;
  72   72          uint64_t txg = dmu_tx_get_txg(tx);
  73   73          kmem_free(arg, sizeof (uint64_t));
  74   74  
  75   75          vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
                   /* Nothing to record if the vdev is gone, being removed, or not concrete. */
  76   76          if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
  77   77                  return;
  78   78  
                   /* Consume this txg's offset slot and reset it to 0 for reuse. */
  79   79          uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
  80   80          vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
  81   81  
  82   82          VERIFY(vd->vdev_leaf_zap != 0);
  83   83  
  84   84          objset_t *mos = vd->vdev_spa->spa_meta_objset;
  85   85  
                   /* A zero slot means no write recorded an offset in this txg. */
  86   86          if (last_offset > 0) {
  87   87                  vd->vdev_initialize_last_offset = last_offset;
  88   88                  VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
  89   89                      VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
  90   90                      sizeof (last_offset), 1, &last_offset, tx));
  91   91          }
  92   92          if (vd->vdev_initialize_action_time > 0) {
  93   93                  uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
  94   94                  VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
  95   95                      VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
  96   96                      1, &val, tx));
  97   97          }
  98   98  
                   /* The state is always written, even when the offset/time are not. */
  99   99          uint64_t initialize_state = vd->vdev_initialize_state;
 100  100          VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
 101  101              VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
 102  102              &initialize_state, tx));
 103  103  }
 104  104  
 105  105  static void
 106  106  vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
 107  107  {
                   /*
                    * Move the vdev to new_state, schedule a sync task that persists
                    * the change, and log the transition to the pool history.
                    * Caller must hold vdev_initialize_lock. Does nothing if the
                    * vdev is already in new_state; panics on an unknown state.
                    */
 108  108          ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 109  109          spa_t *spa = vd->vdev_spa;
 110  110  
 111  111          if (new_state == vd->vdev_initialize_state)
 112  112                  return;
 113  113  
 114  114          /*
 115  115           * Copy the vd's guid, this will be freed by the sync task.
 116  116           */
 117  117          uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 118  118          *guid = vd->vdev_guid;
 119  119  
 120  120          /*
 121  121           * Preserve the original start time when leaving the SUSPENDED
 122  122           * state; for any other current state, stamp the action time now.
                    * (The test is on the state we are leaving, not on new_state.)
 123  123           */
 123  123          if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
 124  124                  vd->vdev_initialize_action_time = gethrestime_sec();
 125  125          }
 126  126          vd->vdev_initialize_state = new_state;
 127  127  
 128  128          dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 129  129          VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 130  130          dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
 131  131              guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
 132  132  
 133  133          switch (new_state) {
 134  134          case VDEV_INITIALIZE_ACTIVE:
 135  135                  spa_history_log_internal(spa, "initialize", tx,
 136  136                      "vdev=%s activated", vd->vdev_path);
 137  137                  break;
 138  138          case VDEV_INITIALIZE_SUSPENDED:
 139  139                  spa_history_log_internal(spa, "initialize", tx,
 140  140                      "vdev=%s suspended", vd->vdev_path);
 141  141                  break;
 142  142          case VDEV_INITIALIZE_CANCELED:
 143  143                  spa_history_log_internal(spa, "initialize", tx,
 144  144                      "vdev=%s canceled", vd->vdev_path);
 145  145                  break;
 146  146          case VDEV_INITIALIZE_COMPLETE:
 147  147                  spa_history_log_internal(spa, "initialize", tx,
 148  148                      "vdev=%s complete", vd->vdev_path);
 149  149                  break;
 150  150          default:
 151  151                  panic("invalid state %llu", (unsigned long long)new_state);
 152  152          }
 153  153  
 154  154          dmu_tx_commit(tx);
 155  155  }
 156  156  
 157  157  static void
 158  158  vdev_initialize_cb(zio_t *zio)
 159  159  {
                   /*
                    * Completion callback for the writes issued by
                    * vdev_initialize_write(). Updates progress accounting, drops
                    * one inflight count, wakes waiters on vdev_initialize_io_cv,
                    * and releases the SCL_STATE_ALL hold taken by the issuer.
                    */
 160  160          vdev_t *vd = zio->io_vd;
 161  161          mutex_enter(&vd->vdev_initialize_io_lock);
 162  162          if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
 163  163                  /*
 164  164                   * The I/O failed because the vdev was unavailable; roll the
 165  165                   * last offset back. (This works because spa_sync waits on
 166  166                   * spa_txg_zio before it runs sync tasks.)
 167  167                   */
 168  168                  uint64_t *off =
 169  169                      &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
 170  170                  *off = MIN(*off, zio->io_offset);
 171  171          } else {
 172  172                  /*
 173  173                   * Since initializing is best-effort, we ignore I/O errors and
 174  174                   * rely on vdev_probe to determine if the errors are more
 175  175                   * critical.
 176  176                   */
 177  177                  if (zio->io_error != 0)
 178  178                          vd->vdev_stat.vs_initialize_errors++;
 179  179  
                           /* Counted as done even when the write reported an error. */
 180  180                  vd->vdev_initialize_bytes_done += zio->io_orig_size;
 181  181          }
 182  182          ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 183  183          vd->vdev_initialize_inflight--;
 184  184          cv_broadcast(&vd->vdev_initialize_io_cv);
 185  185          mutex_exit(&vd->vdev_initialize_io_lock);
 186  186  
                   /* Pairs with the spa_config_enter() in vdev_initialize_write(). */
 187  187          spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 188  188  }
 189  189  
 190  190  /* Takes care of physical writing and limiting # of concurrent ZIOs. */
           /*
            * Issues one asynchronous write of "size" bytes of "data" at offset
            * "start" on vd. Returns 0 once the write has been issued, or EINTR
            * (without issuing it) when vdev_initialize_should_stop() is true.
            */
 191  191  static int
 192  192  vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
 193  193  {
 194  194          spa_t *spa = vd->vdev_spa;
 195  195  
 196  196          /* Limit inflight initializing I/Os */
 197  197          mutex_enter(&vd->vdev_initialize_io_lock);
 198  198          while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
 199  199                  cv_wait(&vd->vdev_initialize_io_cv,
 200  200                      &vd->vdev_initialize_io_lock);
 201  201          }
 202  202          vd->vdev_initialize_inflight++;
 203  203          mutex_exit(&vd->vdev_initialize_io_lock);
 204  204  
 205  205          dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 206  206          VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 207  207          uint64_t txg = dmu_tx_get_txg(tx);
 208  208  
                   /*
                    * This hold (tag: vd) is released by vdev_initialize_cb() when
                    * the write completes, or below on the early-exit path.
                    */
 209  209          spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
 210  210          mutex_enter(&vd->vdev_initialize_lock);
 211  211  
                   /* A zero slot means no offset has been recorded for this txg yet. */
 212  212          if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
 213  213                  uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
 214  214                  *guid = vd->vdev_guid;
 215  215  
 216  216                  /* This is the first write of this txg. */
 217  217                  dsl_sync_task_nowait(spa_get_dsl(spa),
 218  218                      vdev_initialize_zap_update_sync, guid, 2,
 219  219                      ZFS_SPACE_CHECK_RESERVED, tx);
 220  220          }
 221  221  
 222  222          /*
 223  223           * We know the vdev struct will still be around since all
 224  224           * consumers of vdev_free must stop the initialization first.
 225  225           */
 226  226          if (vdev_initialize_should_stop(vd)) {
                           /* Undo the inflight reservation taken above; no I/O is issued. */
 227  227                  mutex_enter(&vd->vdev_initialize_io_lock);
 228  228                  ASSERT3U(vd->vdev_initialize_inflight, >, 0);
 229  229                  vd->vdev_initialize_inflight--;
 230  230                  mutex_exit(&vd->vdev_initialize_io_lock);
 231  231                  spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 232  232                  mutex_exit(&vd->vdev_initialize_lock);
 233  233                  dmu_tx_commit(tx);
 234  234                  return (SET_ERROR(EINTR));
 235  235          }
 236  236          mutex_exit(&vd->vdev_initialize_lock);
 237  237  
                   /*
                    * Record the end of this write as the offset to persist for
                    * this txg.
                    * NOTE(review): vdev_initialize_ranges() passes a "start" that
                    * already includes VDEV_LABEL_START_SIZE, whereas
                    * vdev_initialize_range_add() compares the persisted offset
                    * against ranges without that addition -- confirm both use the
                    * same offset base.
                    */
 238  238          vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
 239  239          zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
 240  240              size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
 241  241              ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
 242  242          /* vdev_initialize_cb releases SCL_STATE_ALL */
 243  243  
 244  244          dmu_tx_commit(tx);
 245  245  
 246  246          return (0);
 247  247  }
 248  248  
 249  249  /*
 250  250   * Translate a logical range to the physical range for the specified vdev_t.
 251  251   * This function is initially called with a leaf vdev and will walk each
 252  252   * parent vdev until it reaches a top-level vdev. Once the top-level is
 253  253   * reached the physical range is initialized and the recursive function
 254  254   * begins to unwind. As it unwinds it calls the parent's vdev specific
 255  255   * translation function to do the real conversion.
            *
            * logical_rs is only read. physical_rs is written with the result; if
            * vd is itself the top-level vdev it is simply a copy of logical_rs.
 256  256   */
 257  257  void
 258  258  vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
 259  259  {
 260  260          /*
 261  261           * Walk up the vdev tree
 262  262           */
 263  263          if (vd != vd->vdev_top) {
 264  264                  vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
 265  265          } else {
 266  266                  /*
 267  267                   * We've reached the top-level vdev, initialize the
 268  268                   * physical range to the logical range and start to
 269  269                   * unwind.
 270  270                   */
 271  271                  physical_rs->rs_start = logical_rs->rs_start;
 272  272                  physical_rs->rs_end = logical_rs->rs_end;
 273  273                  return;
 274  274          }
 275  275  
 276  276          vdev_t *pvd = vd->vdev_parent;
 277  277          ASSERT3P(pvd, !=, NULL);
 278  278          ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
 279  279  
 280  280          /*
 281  281           * As this recursive function unwinds, translate the logical
 282  282           * range into its physical components by calling the
 283  283           * vdev specific translate function.
                    * At this point physical_rs holds the range as translated for
                    * the parent; it is the input here and is then overwritten with
                    * the result for vd.
 284  284           */
 285  285          range_seg_t intermediate = { 0 };
 286  286          pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
 287  287  
 288  288          physical_rs->rs_start = intermediate.rs_start;
 289  289          physical_rs->rs_end = intermediate.rs_end;
 290  290  }
 291  291  
 292  292  /*
 293  293   * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 294  294   * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 295  295   * allocation will guarantee these for us.
            *
            * The third argument is unused. Always returns 0.
 296  296   */
 297  297  /* ARGSUSED */
 298  298  static int
 299  299  vdev_initialize_block_fill(void *buf, size_t len, void *unused)
 300  300  {
 301  301          ASSERT0(len % sizeof (uint64_t));
                   /* i is a byte offset into buf, advanced one 64-bit word at a time. */
 302  302          for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
 303  303                  *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
 304  304          }
 305  305          return (0);
 306  306  }
 307  307  
 308  308  static abd_t *
 309  309  vdev_initialize_block_alloc(void)
 310  310  {
                   /*
                    * Build the filler buffer used for every initializing write:
                    * zfs_initialize_chunk_size bytes, each 64-bit word set to
                    * zfs_initialize_value. The caller releases the returned ABD
                    * with vdev_initialize_block_free().
                    *
                    * The parameter list is spelled (void) so that this definition
                    * is a prototype; an empty () leaves the arguments unchecked.
                    */
 311  311          /* Allocate ABD for filler data */
 312  312          abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
 313  313  
 314  314          ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
 315  315          (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
 316  316              vdev_initialize_block_fill, NULL);
 317  317  
 318  318          return (data);
 319  319  }
 320  320  
 321  321  static void
 322  322  vdev_initialize_block_free(abd_t *data)
 323  323  {
                   /* Release the filler buffer from vdev_initialize_block_alloc(). */
 324  324          abd_free(data);
 325  325  }
 326  326  
 327  327  static int
 328  328  vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 329  329  {
                   /*
                    * Write "data" over every segment currently held in
                    * vd->vdev_initialize_tree, in AVL order. Returns 0 when all
                    * writes were issued, or the first non-zero error returned by
                    * vdev_initialize_write(), at which point the walk stops.
                    */
 330  330          avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
 331  331  
 332  332          for (range_seg_t *rs = avl_first(rt); rs != NULL;
 333  333              rs = AVL_NEXT(rt, rs)) {
 334  334                  uint64_t size = rs->rs_end - rs->rs_start;
 335  335  
 336  336                  /* Split range into legally-sized physical chunks */
                           /* (ceiling division; the size - 1 form relies on size > 0) */
 337  337                  uint64_t writes_required =
 338  338                      ((size - 1) / zfs_initialize_chunk_size) + 1;
 339  339  
 340  340                  for (uint64_t w = 0; w < writes_required; w++) {
 341  341                          int error;
 342  342  
                                   /*
                                    * Tree offsets are shifted by
                                    * VDEV_LABEL_START_SIZE to form the device
                                    * offset; the last chunk may be shorter than
                                    * zfs_initialize_chunk_size.
                                    */
 343  343                          error = vdev_initialize_write(vd,
 344  344                              VDEV_LABEL_START_SIZE + rs->rs_start +
 345  345                              (w * zfs_initialize_chunk_size),
 346  346                              MIN(size - (w * zfs_initialize_chunk_size),
 347  347                              zfs_initialize_chunk_size), data);
 348  348                          if (error != 0)
 349  349                                  return (error);
 350  350                  }
 351  351          }
 352  352          return (0);
 353  353  }
 354  354  
 355  355  static void
 356  356  vdev_initialize_mg_wait(metaslab_group_t *mg)
 357  357  {
                   /*
                    * Block until no other thread has mg_initialize_updating set.
                    * Caller must hold mg_ms_initialize_lock; cv_wait() is handed
                    * that same lock while sleeping.
                    */
 358  358          ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
 359  359          while (mg->mg_initialize_updating) {
 360  360                  cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
 361  361          }
 362  362  }
 363  363  
 364  364  static void
 365  365  vdev_initialize_mg_mark(metaslab_group_t *mg)
 366  366  {
                   /*
                    * Claim one of the group's max_initialize_ms initializing slots,
                    * waiting on mg_ms_initialize_cv until one is available. Caller
                    * must hold mg_ms_initialize_lock and have already set
                    * mg_initialize_updating.
                    */
 367  367          ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
 368  368          ASSERT(mg->mg_initialize_updating);
 369  369  
 370  370          while (mg->mg_ms_initializing >= max_initialize_ms) {
 371  371                  cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
 372  372          }
 373  373          mg->mg_ms_initializing++;
 374  374          ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
 375  375  }
 376  376  
 377  377  /*
 378  378   * Mark the metaslab as being initialized to prevent any allocations
 379  379   * on this metaslab. We must also track how many metaslabs are currently
 380  380   * being initialized within a metaslab group and limit them to prevent
 381  381   * allocation failures from occurring because all metaslabs are being
 382  382   * initialized.
            *
            * Must be called without ms_lock held. Takes mg_ms_initialize_lock and
            * then ms_lock, in that order.
 383  383   */
 384  384  static void
 385  385  vdev_initialize_ms_mark(metaslab_t *msp)
 386  386  {
 387  387          ASSERT(!MUTEX_HELD(&msp->ms_lock));
 388  388          metaslab_group_t *mg = msp->ms_group;
 389  389  
 390  390          mutex_enter(&mg->mg_ms_initialize_lock);
 391  391  
 392  392          /*
 393  393           * To keep an accurate count of how many threads are initializing
 394  394           * a specific metaslab group, we only allow one thread to mark
 395  395           * the metaslab group at a time. This ensures that the value of
 396  396           * ms_initializing will be accurate when we decide to mark a metaslab
 397  397           * group as being initialized. To do this we force all other threads
 398  398           * to wait till the metaslab group's mg_initialize_updating flag is
 399  399           * no longer set.
 400  400           */
 401  401          vdev_initialize_mg_wait(mg);
 402  402          mg->mg_initialize_updating = B_TRUE;
                   /* Only the first marker of a metaslab takes a group slot. */
 403  403          if (msp->ms_initializing == 0) {
 404  404                  vdev_initialize_mg_mark(mg);
 405  405          }
 406  406          mutex_enter(&msp->ms_lock);
 407  407          msp->ms_initializing++;
 408  408          mutex_exit(&msp->ms_lock);
 409  409  
 410  410          mg->mg_initialize_updating = B_FALSE;
 411  411          cv_broadcast(&mg->mg_ms_initialize_cv);
 412  412          mutex_exit(&mg->mg_ms_initialize_lock);
 413  413  }
 414  414  
 415  415  static void
 416  416  vdev_initialize_ms_unmark(metaslab_t *msp)
 417  417  {
                   /*
                    * Undo one vdev_initialize_ms_mark(). When the metaslab's own
                    * count drops to zero, its group slot is released and waiters
                    * on mg_ms_initialize_cv are woken. Must be called without
                    * ms_lock held.
                    */
 418  418          ASSERT(!MUTEX_HELD(&msp->ms_lock));
 419  419          metaslab_group_t *mg = msp->ms_group;
 420  420          mutex_enter(&mg->mg_ms_initialize_lock);
 421  421          mutex_enter(&msp->ms_lock);
 422  422          if (--msp->ms_initializing == 0) {
 423  423                  mg->mg_ms_initializing--;
 424  424                  cv_broadcast(&mg->mg_ms_initialize_cv);
 425  425          }
 426  426          mutex_exit(&msp->ms_lock);
 427  427          mutex_exit(&mg->mg_ms_initialize_lock);
 428  428  }
 429  429  
 430  430  static void
 431  431  vdev_initialize_calculate_progress(vdev_t *vd)
 432  432  {
                   /*
                    * Recompute vdev_initialize_bytes_est and
                    * vdev_initialize_bytes_done from scratch by walking every
                    * metaslab of the top-level vdev and comparing its range on vd
                    * against vdev_initialize_last_offset. Caller must hold
                    * SCL_CONFIG as reader or writer.
                    */
 433  433          ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 434  434              spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));

↓ open down ↓

434 lines elided

↑ open up ↑

 435  435          ASSERT(vd->vdev_leaf_zap != 0);
 436  436  
 437  437          vd->vdev_initialize_bytes_est = 0;
 438  438          vd->vdev_initialize_bytes_done = 0;
 439  439  
 440  440          for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
 441  441                  metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 442  442                  mutex_enter(&msp->ms_lock);
 443  443  
 444  444                  uint64_t ms_free = msp->ms_size -
 445      -                    space_map_allocated(msp->ms_sm);
      445 +                    metaslab_allocated_space(msp);
 446  446  
                           /* On raidz the free space is split evenly across the children. */
 447  447                  if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
 448  448                          ms_free /= vd->vdev_top->vdev_children;
 449  449  
 450  450                  /*
 451  451                   * Convert the metaslab range to a physical range
 452  452                   * on our vdev. We use this to determine if we are
 453  453                   * in the middle of this metaslab range.
 454  454                   */
 455  455                  range_seg_t logical_rs, physical_rs;

 456  456                  logical_rs.rs_start = msp->ms_start;
 457  457                  logical_rs.rs_end = msp->ms_start + msp->ms_size;
 458  458                  vdev_xlate(vd, &logical_rs, &physical_rs);
 459  459  
                           /* Not reached yet: all free space is still to do. */
 460  460                  if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
 461  461                          vd->vdev_initialize_bytes_est += ms_free;
 462  462                          mutex_exit(&msp->ms_lock);
 463  463                          continue;
 464  464                  } else if (vd->vdev_initialize_last_offset >
 465  465                      physical_rs.rs_end) {
                                   /* Already passed: all free space counts as done. */
 466  466                          vd->vdev_initialize_bytes_done += ms_free;
 467  467                          vd->vdev_initialize_bytes_est += ms_free;
 468  468                          mutex_exit(&msp->ms_lock);
 469  469                          continue;
 470  470                  }
 471  471  
 472  472                  /*
 473  473                   * If we get here, we're in the middle of initializing this
 474  474                   * metaslab. Load it and walk the free tree for more accurate
 475  475                   * progress estimation.
 476  476                   */
 477  477                  VERIFY0(metaslab_load(msp));
 478  478  
 479  479                  for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
 480  480                      rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 481  481                          logical_rs.rs_start = rs->rs_start;
 482  482                          logical_rs.rs_end = rs->rs_end;
 483  483                          vdev_xlate(vd, &logical_rs, &physical_rs);
 484  484  
 485  485                          uint64_t size = physical_rs.rs_end -
 486  486                              physical_rs.rs_start;
 487  487                          vd->vdev_initialize_bytes_est += size;
                                   /*
                                    * A segment ending exactly at the last offset
                                    * is complete: vdev_initialize_range_add()
                                    * skips segments with rs_end <= last offset,
                                    * so count it as done here as well.
                                    */
 488  488                          if (vd->vdev_initialize_last_offset >=
 489  489                              physical_rs.rs_end) {
 490  490                                  vd->vdev_initialize_bytes_done += size;
 491  491                          } else if (vd->vdev_initialize_last_offset >
 492  492                              physical_rs.rs_start &&
 493  493                              vd->vdev_initialize_last_offset <
 494  494                              physical_rs.rs_end) {
                                           /* Partially done: count up to the last offset. */
 495  495                                  vd->vdev_initialize_bytes_done +=
 496  496                                      vd->vdev_initialize_last_offset -
 497  497                                      physical_rs.rs_start;
 498  498                          }
 499  499                  }
 500  500                  mutex_exit(&msp->ms_lock);
 501  501          }
 502  502  }
 503  503  
 504  504  static void
 505  505  vdev_initialize_load(vdev_t *vd)
 506  506  {
                   /*
                    * Restore vdev_initialize_last_offset from the leaf ZAP when the
                    * vdev is in the ACTIVE or SUSPENDED state, then recompute the
                    * progress counters. Caller must hold SCL_CONFIG as reader or
                    * writer.
                    */
 507  507          ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
 508  508              spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
 509  509          ASSERT(vd->vdev_leaf_zap != 0);
 510  510  
 511  511          if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
 512  512              vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
 513  513                  int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 514  514                      vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
 515  515                      sizeof (vd->vdev_initialize_last_offset), 1,
 516  516                      &vd->vdev_initialize_last_offset);
                           /* A missing entry (ENOENT) is tolerated; err is only asserted on. */
 517  517                  ASSERT(err == 0 || err == ENOENT);
 518  518          }
 519  519  
 520  520          vdev_initialize_calculate_progress(vd);
 521  521  }
 522  522  
 523  523  
 524  524  /*
 525  525   * Convert the logical range into a physical range and add it to our
 526  526   * avl tree.
            *
            * "arg" is the leaf vdev_t; its signature matches the callback that
            * vdev_initialize_thread() passes to range_tree_walk(). Ranges at or
            * below vdev_initialize_last_offset are skipped, and a range that
            * straddles it is trimmed to start there.
 527  527   */
 528  528  void
 529  529  vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
 530  530  {
 531  531          vdev_t *vd = arg;
 532  532          range_seg_t logical_rs, physical_rs;
 533  533          logical_rs.rs_start = start;
 534  534          logical_rs.rs_end = start + size;
 535  535  
 536  536          ASSERT(vd->vdev_ops->vdev_op_leaf);
 537  537          vdev_xlate(vd, &logical_rs, &physical_rs);
 538  538  
                   /* A leaf that is its own top-level vdev needs no translation. */
 539  539          IMPLY(vd->vdev_top == vd,
 540  540              logical_rs.rs_start == physical_rs.rs_start);
 541  541          IMPLY(vd->vdev_top == vd,
 542  542              logical_rs.rs_end == physical_rs.rs_end);
 543  543  
 544  544          /* Only add segments that we have not visited yet */
 545  545          if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
 546  546                  return;
 547  547  
 548  548          /* Pick up where we left off mid-range. */
 549  549          if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
 550  550                  zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
 551  551                      "(%llu, %llu)", vd->vdev_path,
 552  552                      (u_longlong_t)physical_rs.rs_start,
 553  553                      (u_longlong_t)physical_rs.rs_end,
 554  554                      (u_longlong_t)vd->vdev_initialize_last_offset,
 555  555                      (u_longlong_t)physical_rs.rs_end);
 556  556                  ASSERT3U(physical_rs.rs_end, >,
 557  557                      vd->vdev_initialize_last_offset);
 558  558                  physical_rs.rs_start = vd->vdev_initialize_last_offset;
 559  559          }
 560  560          ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
 561  561  
 562  562          /*
 563  563           * With raidz, it's possible that the logical range does not live on
 564  564           * this leaf vdev. We only add the physical range to this vdev's tree
 565  565           * if it has a length greater than 0.
 566  566           */
 567  567          if (physical_rs.rs_end > physical_rs.rs_start) {
 568  568                  range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
 569  569                      physical_rs.rs_end - physical_rs.rs_start);
 570  570          } else {
 571  571                  ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
 572  572          }
 573  573  }
 574  574  
 575  575  static void
 576  576  vdev_initialize_thread(void *arg)
 577  577  {
                   /*
                    * Body of the per-vdev initializing thread created by
                    * vdev_initialize(); "arg" is the vdev_t. For each metaslab of
                    * the top-level vdev it collects the allocatable ranges that map
                    * to this vdev and writes the filler pattern over them, then
                    * waits for outstanding I/O, marks the vdev COMPLETE if it was
                    * not asked to exit, and clears vdev_initialize_thread.
                    */
 578  578          vdev_t *vd = arg;
 579  579          spa_t *spa = vd->vdev_spa;
 580  580          int error = 0;
 581  581          uint64_t ms_count = 0;
 582  582  
 583  583          ASSERT(vdev_is_concrete(vd));
 584  584          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 585  585  
 586  586          vd->vdev_initialize_last_offset = 0;
 587  587          vdev_initialize_load(vd);
 588  588  
 589  589          abd_t *deadbeef = vdev_initialize_block_alloc();
 590  590  
 591  591          vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
 592  592  
 593  593          for (uint64_t i = 0; !vd->vdev_detached &&
 594  594              i < vd->vdev_top->vdev_ms_count; i++) {
 595  595                  metaslab_t *msp = vd->vdev_top->vdev_ms[i];
 596  596  
 597  597                  /*
 598  598                   * If we've expanded the top-level vdev or it's our
 599  599                   * first pass, calculate our progress.
 600  600                   */
 601  601                  if (vd->vdev_top->vdev_ms_count != ms_count) {
 602  602                          vdev_initialize_calculate_progress(vd);
 603  603                          ms_count = vd->vdev_top->vdev_ms_count;
 604  604                  }
 605  605  
 606  606                  vdev_initialize_ms_mark(msp);
 607  607                  mutex_enter(&msp->ms_lock);
 608  608                  VERIFY0(metaslab_load(msp));
 609  609  
                           /* Fill vdev_initialize_tree with this metaslab's pending ranges. */
 610  610                  range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 611  611                      vd);
 612  612                  mutex_exit(&msp->ms_lock);
 613  613  
                           /* SCL_CONFIG is dropped while the writes are being issued. */
 614  614                  spa_config_exit(spa, SCL_CONFIG, FTAG);
 615  615                  error = vdev_initialize_ranges(vd, deadbeef);
 616  616                  vdev_initialize_ms_unmark(msp);
 617  617                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 618  618  
                           /* Empty the tree before checking error so it is clean either way. */
 619  619                  range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
 620  620                  if (error != 0)
 621  621                          break;
 622  622          }
 623  623  
 624  624          spa_config_exit(spa, SCL_CONFIG, FTAG);
                   /* Drain: vdev_initialize_cb() broadcasts as each write completes. */
 625  625          mutex_enter(&vd->vdev_initialize_io_lock);
 626  626          while (vd->vdev_initialize_inflight > 0) {
 627  627                  cv_wait(&vd->vdev_initialize_io_cv,
 628  628                      &vd->vdev_initialize_io_lock);
 629  629          }
 630  630          mutex_exit(&vd->vdev_initialize_io_lock);
 631  631  
                   /* No I/O references the filler buffer or the tree any more. */
 632  632          range_tree_destroy(vd->vdev_initialize_tree);
 633  633          vdev_initialize_block_free(deadbeef);
 634  634          vd->vdev_initialize_tree = NULL;
 635  635  
 636  636          mutex_enter(&vd->vdev_initialize_lock);
                   /* Only a run that was not asked to exit on a writeable vdev is COMPLETE. */
 637  637          if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
 638  638                  vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
 639  639          }
 640  640          ASSERT(vd->vdev_initialize_thread != NULL ||
 641  641              vd->vdev_initialize_inflight == 0);
 642  642  
 643  643          /*
 644  644           * Drop the vdev_initialize_lock while we sync out the
 645  645           * txg since it's possible that a device might be trying to
 646  646           * come online and must check to see if it needs to restart an
 647  647           * initialization. That thread will be holding the spa_config_lock
 648  648           * which would prevent the txg_wait_synced from completing.
 649  649           */
 650  650          mutex_exit(&vd->vdev_initialize_lock);
 651  651          txg_wait_synced(spa_get_dsl(spa), 0);
 652  652          mutex_enter(&vd->vdev_initialize_lock);
 653  653  
                   /* Signal threads waiting on vdev_initialize_cv that this thread is done. */
 654  654          vd->vdev_initialize_thread = NULL;
 655  655          cv_broadcast(&vd->vdev_initialize_cv);
 656  656          mutex_exit(&vd->vdev_initialize_lock);
 657  657  }
 658  658  
 659  659  /*
 660  660   * Initiates a device. Caller must hold vdev_initialize_lock.
 661  661   * Device must be a leaf and not already be initializing.
            *
            * Marks the vdev VDEV_INITIALIZE_ACTIVE and then creates the worker
            * thread (running vdev_initialize_thread with vd as its argument),
            * storing the returned handle in vd->vdev_initialize_thread.
            *
            * The assertions below additionally require that vd is concrete, is
            * not detached, has no exit request pending, and that its top-level
            * vdev is not being removed.
 662  662   */
 663  663  void
 664  664  vdev_initialize(vdev_t *vd)
 665  665  {
 666  666          ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 667  667          ASSERT(vd->vdev_ops->vdev_op_leaf);
 668  668          ASSERT(vdev_is_concrete(vd));
 669  669          ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 670  670          ASSERT(!vd->vdev_detached);
 671  671          ASSERT(!vd->vdev_initialize_exit_wanted);
 672  672          ASSERT(!vd->vdev_top->vdev_removing);
 673  673  
                   /*
                    * Change the state first, then record the thread handle.
                    * vdev_initialize_stop() waits for vdev_initialize_thread to go
                    * back to NULL, so it must be non-NULL while the worker exists.
                    */
 674  674          vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
 675  675          vd->vdev_initialize_thread = thread_create(NULL, 0,
 676  676              vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 677  677  }
 678  678  
 679  679  /*
 680  680   * Stop initializing a device, with the resultant initializing state being
 681  681   * tgt_state. Blocks until the initializing thread has exited.
 682  682   * Caller must hold vdev_initialize_lock and must not be writing to the spa
 683  683   * config, as the initializing thread may try to enter the config as a reader
 684  684   * before exiting.
            *
            * If no initialize thread exists this is a no-op, unless tgt_state is
            * VDEV_INITIALIZE_CANCELED, in which case the state change is still
            * applied.
 685  685   */
 686  686  void
 687  687  vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
 688  688  {
 689  689          spa_t *spa = vd->vdev_spa;
 690  690          ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
 691  691  
 692  692          ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
 693  693          ASSERT(vd->vdev_ops->vdev_op_leaf);
 694  694          ASSERT(vdev_is_concrete(vd));
 695  695  
 696  696          /*
 697  697           * Allow cancel requests to proceed even if the initialize thread
 698  698           * has stopped.
 699  699           */
 700  700          if (vd->vdev_initialize_thread == NULL &&
 701  701              tgt_state != VDEV_INITIALIZE_CANCELED) {
 702  702                  return;
 703  703          }
 704  704  
                   /*
                    * Record the target state and raise the exit request, then wait
                    * on vdev_initialize_cv until vdev_initialize_thread is cleared.
                    * When no thread is running the loop body never executes.
                    */
 705  705          vdev_initialize_change_state(vd, tgt_state);
 706  706          vd->vdev_initialize_exit_wanted = B_TRUE;
 707  707          while (vd->vdev_initialize_thread != NULL)
 708  708                  cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
 709  709  
 710  710          ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
                   /* Clear the request; vdev_initialize() asserts it is not set. */
 711  711          vd->vdev_initialize_exit_wanted = B_FALSE;
 712  712  }
 713  713  
 714  714  static void
 715  715  vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
 716  716  {
                   /*
                    * Recursive worker for vdev_initialize_stop_all().
                    *
                    * A concrete leaf is stopped with vdev_initialize_stop(), called
                    * with that leaf's vdev_initialize_lock held, and the recursion
                    * ends there. Any other vdev only forwards tgt_state to each of
                    * its children.
                    */
 717  717          if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
 718  718                  mutex_enter(&vd->vdev_initialize_lock);
 719  719                  vdev_initialize_stop(vd, tgt_state);
 720  720                  mutex_exit(&vd->vdev_initialize_lock);
 721  721                  return;
 722  722          }
 723  723  
 724  724          for (uint64_t i = 0; i < vd->vdev_children; i++) {
 725  725                  vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
 726  726          }
 727  727  }
 728  728  
 729  729  /*
 730  730   * Convenience function to stop initializing of a vdev tree and set all
 731  731   * initialize thread pointers to NULL.
            *
            * Every concrete leaf under vd is stopped with tgt_state as its
            * resulting state. Afterwards, if the pool's spa_sync_on flag is set,
            * this also calls txg_wait_synced() before returning.
 732  732   */
 733  733  void
 734  734  vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
 735  735  {
 736  736          vdev_initialize_stop_all_impl(vd, tgt_state);
 737  737  
 738  738          if (vd->vdev_spa->spa_sync_on) {
 739  739                  /* Make sure that our state has been synced to disk */
 740  740                  txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
 741  741          }
 742  742  }
 743  743  
 744  744  void
 745  745  vdev_initialize_restart(vdev_t *vd)
 746  746  {
                   /*
                    * Walks the vdev tree rooted at vd. For each vdev that has a leaf
                    * ZAP, reloads the persisted initialize state and action time and
                    * then either only loads the saved progress (state is SUSPENDED,
                    * or the vdev is offline) or restarts initialization (state is
                    * ACTIVE and the vdev is writeable).
                    *
                    * As asserted below, the caller must hold spa_namespace_lock and
                    * must not hold any spa config lock as writer.
                    */
 747  747          ASSERT(MUTEX_HELD(&spa_namespace_lock));
 748  748          ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 749  749  
 750  750          if (vd->vdev_leaf_zap != 0) {
 751  751                  mutex_enter(&vd->vdev_initialize_lock);
                           /*
                            * Look up the persisted state. ENOENT is tolerated;
                            * presumably the VDEV_INITIALIZE_NONE initializer is
                            * then left in place -- confirm against zap_lookup().
                            */
 752  752                  uint64_t initialize_state = VDEV_INITIALIZE_NONE;
 753  753                  int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 754  754                      vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
 755  755                      sizeof (initialize_state), 1, &initialize_state);
 756  756                  ASSERT(err == 0 || err == ENOENT);
 757  757                  vd->vdev_initialize_state = initialize_state;
 758  758  
                           /* Likewise for the action time, initialized to 0 here. */
 759  759                  uint64_t timestamp = 0;
 760  760                  err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 761  761                      vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
 762  762                      sizeof (timestamp), 1, &timestamp);
 763  763                  ASSERT(err == 0 || err == ENOENT);
 764  764                  vd->vdev_initialize_action_time = (time_t)timestamp;
 765  765  
 766  766                  if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
 767  767                      vd->vdev_offline) {
 768  768                          /* load progress for reporting, but don't resume */
 769  769                          vdev_initialize_load(vd);
 770  770                  } else if (vd->vdev_initialize_state ==
 771  771                      VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
                                   /* vdev_initialize() requires the lock taken above. */
 772  772                          vdev_initialize(vd);
 773  773                  }
 774  774  
 775  775                  mutex_exit(&vd->vdev_initialize_lock);
 776  776          }
 777  777  
                   /* Recurse so that every vdev in the subtree is visited. */
 778  778          for (uint64_t i = 0; i < vd->vdev_children; i++) {
 779  779                  vdev_initialize_restart(vd->vdev_child[i]);
 780  780          }
 781  781  }

↓ open down ↓

326 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX