Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/vdev_label.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_label.c
↓ open down ↓ 10 lines elided ↑ open up ↑
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       24 + * Copyright (c) 2012 by Delphix. All rights reserved.
  23   25   */
  24   26  
  25   27  /*
  26   28   * Virtual Device Labels
  27   29   * ---------------------
  28   30   *
  29   31   * The vdev label serves several distinct purposes:
  30   32   *
  31   33   *      1. Uniquely identify this device as part of a ZFS pool and confirm its
  32   34   *         identity within the pool.
↓ open down ↓ 81 lines elided ↑ open up ↑
 114  116   * -------------------------
 115  117   *
 116  118   * The nvlist describing the pool and vdev contains the following elements:
 117  119   *
 118  120   *      version         ZFS on-disk version
 119  121   *      name            Pool name
 120  122   *      state           Pool state
 121  123   *      txg             Transaction group in which this label was written
 122  124   *      pool_guid       Unique identifier for this pool
 123  125   *      vdev_tree       An nvlist describing vdev tree.
      126 + *      features_for_read
      127 + *                      An nvlist of the features necessary for reading the MOS.
 124  128   *
 125  129   * Each leaf device label also contains the following:
 126  130   *
 127  131   *      top_guid        Unique ID for top-level vdev in which this is contained
 128  132   *      guid            Unique ID for the leaf vdev
 129  133   *
 130  134   * The 'vs' configuration follows the format described in 'spa_config.c'.
 131  135   */
 132  136  
 133  137  #include <sys/zfs_context.h>
↓ open down ↓ 287 lines elided ↑ open up ↑
 421  425                  VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
 422  426                      array, idx) == 0);
 423  427          }
 424  428  
 425  429          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 426  430              rvd->vdev_children) == 0);
 427  431  
 428  432          kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 429  433  }
 430  434  
      435 +/*
      436 + * Returns the configuration from the label of the given vdev. If 'label' is
      437 + * VDEV_BEST_LABEL, each label of the vdev will be read until a valid
      438 + * configuration is found; otherwise, only the specified label will be read.
      439 + */
 431  440  nvlist_t *
 432      -vdev_label_read_config(vdev_t *vd)
      441 +vdev_label_read_config(vdev_t *vd, int label)
 433  442  {
 434  443          spa_t *spa = vd->vdev_spa;
 435  444          nvlist_t *config = NULL;
 436  445          vdev_phys_t *vp;
 437  446          zio_t *zio;
 438  447          int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 439  448              ZIO_FLAG_SPECULATIVE;
 440  449  
 441  450          ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 442  451  
 443  452          if (!vdev_readable(vd))
 444  453                  return (NULL);
 445  454  
 446  455          vp = zio_buf_alloc(sizeof (vdev_phys_t));
 447  456  
 448  457  retry:
 449  458          for (int l = 0; l < VDEV_LABELS; l++) {
      459 +                if (label >= 0 && label < VDEV_LABELS && label != l)
      460 +                        continue;
 450  461  
 451  462                  zio = zio_root(spa, NULL, NULL, flags);
 452  463  
 453  464                  vdev_label_read(zio, vd, l, vp,
 454  465                      offsetof(vdev_label_t, vl_vdev_phys),
 455  466                      sizeof (vdev_phys_t), NULL, NULL, flags);
 456  467  
 457  468                  if (zio_wait(zio) == 0 &&
 458  469                      nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
 459  470                      &config, 0) == 0)
↓ open down ↓ 29 lines elided ↑ open up ↑
 489  500          nvlist_t *label;
 490  501  
 491  502          if (spare_guid)
 492  503                  *spare_guid = 0ULL;
 493  504          if (l2cache_guid)
 494  505                  *l2cache_guid = 0ULL;
 495  506  
 496  507          /*
 497  508           * Read the label, if any, and perform some basic sanity checks.
 498  509           */
 499      -        if ((label = vdev_label_read_config(vd)) == NULL)
      510 +        if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL)
 500  511                  return (B_FALSE);
 501  512  
 502  513          (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
 503  514              &vdtxg);
 504  515  
 505  516          if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 506  517              &state) != 0 ||
 507  518              nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
 508  519              &device_guid) != 0) {
 509  520                  nvlist_free(label);
↓ open down ↓ 316 lines elided ↑ open up ↑
 826  837   * uberblock load/sync
 827  838   * ==========================================================================
 828  839   */
 829  840  
 830  841  /*
 831  842   * Consider the following situation: txg is safely synced to disk.  We've
 832  843   * written the first uberblock for txg + 1, and then we lose power.  When we
 833  844   * come back up, we fail to see the uberblock for txg + 1 because, say,
 834  845   * it was on a mirrored device and the replica to which we wrote txg + 1
 835  846   * is now offline.  If we then make some changes and sync txg + 1, and then
 836      - * the missing replica comes back, then for a new seconds we'll have two
      847 + * the missing replica comes back, then for a few seconds we'll have two
 837  848   * conflicting uberblocks on disk with the same txg.  The solution is simple:
 838  849   * among uberblocks with equal txg, choose the one with the latest timestamp.
 839  850   */
 840  851  static int
 841  852  vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 842  853  {
 843  854          if (ub1->ub_txg < ub2->ub_txg)
 844  855                  return (-1);
 845  856          if (ub1->ub_txg > ub2->ub_txg)
 846  857                  return (1);
 847  858  
 848  859          if (ub1->ub_timestamp < ub2->ub_timestamp)
 849  860                  return (-1);
 850  861          if (ub1->ub_timestamp > ub2->ub_timestamp)
 851  862                  return (1);
 852  863  
 853  864          return (0);
 854  865  }
 855  866  
      867 +struct ubl_cbdata {
      868 +        uberblock_t     *ubl_ubbest;    /* Best uberblock */
      869 +        vdev_t          *ubl_vd;        /* vdev associated with the above */
      870 +        int             ubl_label;      /* Label associated with the above */
      871 +};
      872 +
 856  873  static void
 857  874  vdev_uberblock_load_done(zio_t *zio)
 858  875  {
      876 +        vdev_t *vd = zio->io_vd;
 859  877          spa_t *spa = zio->io_spa;
 860  878          zio_t *rio = zio->io_private;
 861  879          uberblock_t *ub = zio->io_data;
 862      -        uberblock_t *ubbest = rio->io_private;
      880 +        struct ubl_cbdata *cbp = rio->io_private;
 863  881  
 864      -        ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
      882 +        ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 865  883  
 866  884          if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 867  885                  mutex_enter(&rio->io_lock);
 868  886                  if (ub->ub_txg <= spa->spa_load_max_txg &&
 869      -                    vdev_uberblock_compare(ub, ubbest) > 0)
 870      -                        *ubbest = *ub;
      887 +                    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
      888 +                        /*
      889 +                         * Keep track of the vdev and label in which this
      890 +                         * uberblock was found. We will use this information
      891 +                         * later to obtain the config nvlist associated with
      892 +                         * this uberblock.
      893 +                         */
      894 +                        *cbp->ubl_ubbest = *ub;
      895 +                        cbp->ubl_vd = vd;
      896 +                        cbp->ubl_label = vdev_label_number(vd->vdev_psize,
      897 +                            zio->io_offset);
      898 +                }
 871  899                  mutex_exit(&rio->io_lock);
 872  900          }
 873  901  
 874  902          zio_buf_free(zio->io_data, zio->io_size);
 875  903  }
 876  904  
 877      -void
 878      -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
      905 +static void
      906 +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
      907 +    struct ubl_cbdata *cbp)
 879  908  {
 880      -        spa_t *spa = vd->vdev_spa;
 881      -        vdev_t *rvd = spa->spa_root_vdev;
 882      -        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 883      -            ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 884      -
 885      -        if (vd == rvd) {
 886      -                ASSERT(zio == NULL);
 887      -                spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 888      -                zio = zio_root(spa, NULL, ubbest, flags);
 889      -                bzero(ubbest, sizeof (uberblock_t));
 890      -        }
 891      -
 892      -        ASSERT(zio != NULL);
 893      -
 894  909          for (int c = 0; c < vd->vdev_children; c++)
 895      -                vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
      910 +                vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
 896  911  
 897  912          if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 898  913                  for (int l = 0; l < VDEV_LABELS; l++) {
 899  914                          for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 900  915                                  vdev_label_read(zio, vd, l,
 901  916                                      zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
 902  917                                      VDEV_UBERBLOCK_OFFSET(vd, n),
 903  918                                      VDEV_UBERBLOCK_SIZE(vd),
 904  919                                      vdev_uberblock_load_done, zio, flags);
 905  920                          }
 906  921                  }
 907  922          }
      923 +}
 908  924  
 909      -        if (vd == rvd) {
 910      -                (void) zio_wait(zio);
 911      -                spa_config_exit(spa, SCL_ALL, FTAG);
      925 +/*
      926 + * Reads the 'best' uberblock from disk along with its associated
      927 + * configuration. First, we read every uberblock in each label of each vdev,
      928 + * keeping the one with the highest txg (ties broken by timestamp). Then, we
      929 + * read the configuration from a label of the same parity as that uberblock's.
      930 + */
      931 +void
      932 +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
      933 +{
      934 +        int i;
      935 +        zio_t *zio;
      936 +        spa_t *spa = rvd->vdev_spa;
      937 +        struct ubl_cbdata cb;
      938 +        int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
      939 +            ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
      940 +
      941 +        ASSERT(ub);
      942 +        ASSERT(config);
      943 +
      944 +        bzero(ub, sizeof (uberblock_t));
      945 +        *config = NULL;
      946 +
      947 +        cb.ubl_ubbest = ub;
      948 +        cb.ubl_vd = NULL;
      949 +
      950 +        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
      951 +        zio = zio_root(spa, NULL, &cb, flags);
      952 +        vdev_uberblock_load_impl(zio, rvd, flags, &cb);
      953 +        (void) zio_wait(zio);
      954 +        if (cb.ubl_vd != NULL) {
      955 +                for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) {
      956 +                        *config = vdev_label_read_config(cb.ubl_vd, i);
      957 +                        if (*config != NULL)
      958 +                                break;
      959 +                }
 912  960          }
      961 +        spa_config_exit(spa, SCL_ALL, FTAG);
 913  962  }
 914  963  
 915  964  /*
 916  965   * On success, increment root zio's count of good writes.
 917  966   * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
 918  967   */
 919  968  static void
 920  969  vdev_uberblock_sync_done(zio_t *zio)
 921  970  {
 922  971          uint64_t *good_writes = zio->io_private;
↓ open down ↓ 294 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX