Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
        
*** 16,27 ****
--- 16,29 ----
   * fields enclosed by brackets "[]" replaced with your own identifying
   * information: Portions Copyright [yyyy] [name of copyright owner]
   *
   * CDDL HEADER END
   */
+ 
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright (c) 2012 by Delphix. All rights reserved.
   */
  
  /*
   * Virtual Device Labels
   * ---------------------
*** 119,128 ****
--- 121,132 ----
   *      name            Pool name
   *      state           Pool state
   *      txg             Transaction group in which this label was written
   *      pool_guid       Unique identifier for this pool
   *      vdev_tree       An nvlist describing vdev tree.
+  *      features_for_read
+  *                      An nvlist of the features necessary for reading the MOS.
   *
   * Each leaf device label also contains the following:
   *
   *      top_guid        Unique ID for top-level vdev in which this is contained
   *      guid            Unique ID for the leaf vdev
*** 426,437 ****
              rvd->vdev_children) == 0);
  
          kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
  }
  
  nvlist_t *
! vdev_label_read_config(vdev_t *vd)
  {
          spa_t *spa = vd->vdev_spa;
          nvlist_t *config = NULL;
          vdev_phys_t *vp;
          zio_t *zio;
--- 430,446 ----
              rvd->vdev_children) == 0);
  
          kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
  }
  
+ /*
+  * Returns the configuration from the label of the given vdev. If 'label' is
+  * VDEV_BEST_LABEL, each label of the vdev will be read until a valid
+  * configuration is found; otherwise, only the specified label will be read.
+  */
  nvlist_t *
! vdev_label_read_config(vdev_t *vd, int label)
  {
          spa_t *spa = vd->vdev_spa;
          nvlist_t *config = NULL;
          vdev_phys_t *vp;
          zio_t *zio;
*** 445,454 ****
--- 454,465 ----
  
          vp = zio_buf_alloc(sizeof (vdev_phys_t));
  
  retry:
          for (int l = 0; l < VDEV_LABELS; l++) {
+                 if (label >= 0 && label < VDEV_LABELS && label != l)
+                         continue;
  
                  zio = zio_root(spa, NULL, NULL, flags);
  
                  vdev_label_read(zio, vd, l, vp,
                      offsetof(vdev_label_t, vl_vdev_phys),
*** 494,504 ****
                  *l2cache_guid = 0ULL;
  
          /*
           * Read the label, if any, and perform some basic sanity checks.
           */
!         if ((label = vdev_label_read_config(vd)) == NULL)
                  return (B_FALSE);
  
          (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
              &vdtxg);
  
--- 505,515 ----
                  *l2cache_guid = 0ULL;
  
          /*
           * Read the label, if any, and perform some basic sanity checks.
           */
!         if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL)
                  return (B_FALSE);
  
          (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
              &vdtxg);
  
*** 831,841 ****
   * Consider the following situation: txg is safely synced to disk.  We've
   * written the first uberblock for txg + 1, and then we lose power.  When we
   * come back up, we fail to see the uberblock for txg + 1 because, say,
   * it was on a mirrored device and the replica to which we wrote txg + 1
   * is now offline.  If we then make some changes and sync txg + 1, and then
!  * the missing replica comes back, then for a new seconds we'll have two
   * conflicting uberblocks on disk with the same txg.  The solution is simple:
   * among uberblocks with equal txg, choose the one with the latest timestamp.
   */
  static int
  vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
--- 842,852 ----
   * Consider the following situation: txg is safely synced to disk.  We've
   * written the first uberblock for txg + 1, and then we lose power.  When we
   * come back up, we fail to see the uberblock for txg + 1 because, say,
   * it was on a mirrored device and the replica to which we wrote txg + 1
   * is now offline.  If we then make some changes and sync txg + 1, and then
!  * the missing replica comes back, then for a few seconds we'll have two
   * conflicting uberblocks on disk with the same txg.  The solution is simple:
   * among uberblocks with equal txg, choose the one with the latest timestamp.
   */
  static int
  vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
*** 851,900 ****
                  return (1);
  
          return (0);
  }
  
  static void
  vdev_uberblock_load_done(zio_t *zio)
  {
          spa_t *spa = zio->io_spa;
          zio_t *rio = zio->io_private;
          uberblock_t *ub = zio->io_data;
!         uberblock_t *ubbest = rio->io_private;
  
!         ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
  
          if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                  mutex_enter(&rio->io_lock);
                  if (ub->ub_txg <= spa->spa_load_max_txg &&
!                     vdev_uberblock_compare(ub, ubbest) > 0)
!                         *ubbest = *ub;
                  mutex_exit(&rio->io_lock);
          }
  
          zio_buf_free(zio->io_data, zio->io_size);
  }
  
! void
! vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
  {
-         spa_t *spa = vd->vdev_spa;
-         vdev_t *rvd = spa->spa_root_vdev;
-         int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
-             ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
- 
-         if (vd == rvd) {
-                 ASSERT(zio == NULL);
-                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-                 zio = zio_root(spa, NULL, ubbest, flags);
-                 bzero(ubbest, sizeof (uberblock_t));
-         }
- 
-         ASSERT(zio != NULL);
- 
          for (int c = 0; c < vd->vdev_children; c++)
!                 vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
  
          if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                  for (int l = 0; l < VDEV_LABELS; l++) {
                          for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
                                  vdev_label_read(zio, vd, l,
--- 862,915 ----
                  return (1);
  
          return (0);
  }
  
+ struct ubl_cbdata {
+         uberblock_t     *ubl_ubbest;    /* Best uberblock */
+         vdev_t          *ubl_vd;        /* vdev associated with the above */
+         int             ubl_label;      /* Label associated with the above */
+ };
+ 
  static void
  vdev_uberblock_load_done(zio_t *zio)
  {
+         vdev_t *vd = zio->io_vd;
          spa_t *spa = zio->io_spa;
          zio_t *rio = zio->io_private;
          uberblock_t *ub = zio->io_data;
!         struct ubl_cbdata *cbp = rio->io_private;
  
!         ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
  
          if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
                  mutex_enter(&rio->io_lock);
                  if (ub->ub_txg <= spa->spa_load_max_txg &&
!                     vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
!                         /*
!                          * Keep track of the vdev and label in which this
!                          * uberblock was found. We will use this information
!                          * later to obtain the config nvlist associated with
!                          * this uberblock.
!                          */
!                         *cbp->ubl_ubbest = *ub;
!                         cbp->ubl_vd = vd;
!                         cbp->ubl_label = vdev_label_number(vd->vdev_psize,
!                             zio->io_offset);
!                 }
                  mutex_exit(&rio->io_lock);
          }
  
          zio_buf_free(zio->io_data, zio->io_size);
  }
  
! static void
! vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
!     struct ubl_cbdata *cbp)
  {
          for (int c = 0; c < vd->vdev_children; c++)
!                 vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
  
          if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
                  for (int l = 0; l < VDEV_LABELS; l++) {
                          for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
                                  vdev_label_read(zio, vd, l,
*** 903,917 ****
                                      VDEV_UBERBLOCK_SIZE(vd),
                                      vdev_uberblock_load_done, zio, flags);
                          }
                  }
          }
  
!         if (vd == rvd) {
                  (void) zio_wait(zio);
!                 spa_config_exit(spa, SCL_ALL, FTAG);
          }
  }
  
  /*
   * On success, increment root zio's count of good writes.
   * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
--- 918,966 ----
                                      VDEV_UBERBLOCK_SIZE(vd),
                                      vdev_uberblock_load_done, zio, flags);
                          }
                  }
          }
+ }
  
! /*
!  * Reads the 'best' uberblock from disk along with its configuration. First,
!  * we read the uberblock array of every label of every readable leaf vdev,
!  * keeping only the best uberblock seen. Then, we read the configuration from
!  * the first label of the same parity as that uberblock's label to yield one.
!  */
! void
! vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
! {
!         int i;
!         zio_t *zio;
!         spa_t *spa = rvd->vdev_spa;
!         struct ubl_cbdata cb;
!         int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
!             ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
! 
!         ASSERT(ub);
!         ASSERT(config);
! 
!         bzero(ub, sizeof (uberblock_t));
!         *config = NULL;
! 
!         cb.ubl_ubbest = ub;
!         cb.ubl_vd = NULL;
! 
!         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
!         zio = zio_root(spa, NULL, &cb, flags);
!         vdev_uberblock_load_impl(zio, rvd, flags, &cb);
          (void) zio_wait(zio);
!         if (cb.ubl_vd != NULL) {
!                 for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) {
!                         *config = vdev_label_read_config(cb.ubl_vd, i);
!                         if (*config != NULL)
!                                 break;
                  }
+         }
+         spa_config_exit(spa, SCL_ALL, FTAG);
  }
  
  /*
   * On success, increment root zio's count of good writes.
   * We only get credit for writes to known-visible vdevs; see spa_vdev_add().