Print this page
NEX-7397 Hot spare didn't kick in automatically when one of the drives in the pool went "Faulty" (is_ssd fix)
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-2846 Enable Automatic/Intelligent Hot Sparing capability
Reviewed by: Jeffry Molanus <jeffry.molanus@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
6414 vdev_config_sync could be simpler
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
6368 Remove superfluous statement
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Will Andrews <will@freebsd.org>
Approved by: Robert Mustacchi <rm@joyent.com>
6386 Fix function call with uninitialized value in vdev_inuse
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
6328 Fix cstyle errors in zfs codebase (fix studio)
6328 Fix cstyle errors in zfs codebase
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Approved by: Robert Mustacchi <rm@joyent.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-3541 Implement persistent L2ARC
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/uts/common/fs/zfs/sys/spa.h
4121 vdev_label_init should treat request as succeeded when pool is read only
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Fixup merge results
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.

  25  */
  26 
  27 /*
  28  * Virtual Device Labels
  29  * ---------------------
  30  *
  31  * The vdev label serves several distinct purposes:
  32  *
  33  *      1. Uniquely identify this device as part of a ZFS pool and confirm its
  34  *         identity within the pool.
  35  *
  36  *      2. Verify that all the devices given in a configuration are present
  37  *         within the pool.
  38  *
  39  *      3. Determine the uberblock for the pool.
  40  *
  41  *      4. In case of an import operation, determine the configuration of the
  42  *         toplevel vdev of which it is a part.
  43  *
  44  *      5. If an import operation cannot find all the devices in the pool,


 126  *      features_for_read
 127  *                      An nvlist of the features necessary for reading the MOS.
 128  *
 129  * Each leaf device label also contains the following:
 130  *
 131  *      top_guid        Unique ID for top-level vdev in which this is contained
 132  *      guid            Unique ID for the leaf vdev
 133  *
 134  * The 'vs' configuration follows the format described in 'spa_config.c'.
 135  */
 136 
 137 #include <sys/zfs_context.h>
 138 #include <sys/spa.h>
 139 #include <sys/spa_impl.h>
 140 #include <sys/dmu.h>
 141 #include <sys/zap.h>
 142 #include <sys/vdev.h>
 143 #include <sys/vdev_impl.h>
 144 #include <sys/uberblock_impl.h>
 145 #include <sys/metaslab.h>
 146 #include <sys/metaslab_impl.h>
 147 #include <sys/zio.h>
 148 #include <sys/dsl_scan.h>
 149 #include <sys/abd.h>
 150 #include <sys/fs/zfs.h>
 151 
 152 /*
 153  * Basic routines to read and write from a vdev label.
 154  * Used throughout the rest of this file.
 155  */
 156 uint64_t
 157 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
 158 {
 159         ASSERT(offset < sizeof (vdev_label_t));
 160         ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
 161 
 162         return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
 163             0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
 164 }
 165 
 166 /*


 200         ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
 201             (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
 202             (SCL_CONFIG | SCL_STATE) &&
 203             dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
 204         ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 205 
 206         zio_nowait(zio_write_phys(zio, vd,
 207             vdev_label_offset(vd->vdev_psize, l, offset),
 208             size, buf, ZIO_CHECKSUM_LABEL, done, private,
 209             ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
 210 }
 211 
 212 /*
 213  * Generate the nvlist representing this vdev's config.
 214  */
 215 nvlist_t *
 216 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 217     vdev_config_flag_t flags)
 218 {
 219         nvlist_t *nv = NULL;
 220         vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 221 
 222         nv = fnvlist_alloc();
 223 
 224         fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
 225         if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 226                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
 227         fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 228 
 229         if (vd->vdev_path != NULL)
 230                 fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 231 
 232         if (vd->vdev_devid != NULL)
 233                 fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 234 
 235         if (vd->vdev_physpath != NULL)
 236                 fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 237                     vd->vdev_physpath);
 238 
 239         if (vd->vdev_fru != NULL)
 240                 fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);


 248                  * into a crufty old storage pool.
 249                  */
 250                 ASSERT(vd->vdev_nparity == 1 ||
 251                     (vd->vdev_nparity <= 2 &&
 252                     spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
 253                     (vd->vdev_nparity <= 3 &&
 254                     spa_version(spa) >= SPA_VERSION_RAIDZ3));
 255 
 256                 /*
 257                  * Note that we'll add the nparity tag even on storage pools
 258                  * that only support a single parity device -- older software
 259                  * will just ignore it.
 260                  */
 261                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
 262         }
 263 
 264         if (vd->vdev_wholedisk != -1ULL)
 265                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 266                     vd->vdev_wholedisk);
 267 
 268         if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
 269                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 270 
 271         if (vd->vdev_isspare)
 272                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 273 
 274         if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 275             vd == vd->vdev_top) {
 276                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 277                     vd->vdev_ms_array);
 278                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 279                     vd->vdev_ms_shift);
 280                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 281                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 282                     vd->vdev_asize);
 283                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 284                 if (vd->vdev_removing) {


 285                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
 286                             vd->vdev_removing);
 287                 }
 288         }
 289 







 290         if (vd->vdev_dtl_sm != NULL) {
 291                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
 292                     space_map_object(vd->vdev_dtl_sm));
 293         }
 294 
 295         if (vic->vic_mapping_object != 0) {
 296                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 297                     vic->vic_mapping_object);
 298         }
 299 
 300         if (vic->vic_births_object != 0) {
 301                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 302                     vic->vic_births_object);
 303         }
 304 
 305         if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
 306                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 307                     vic->vic_prev_indirect_vdev);
 308         }
 309 
 310         if (vd->vdev_crtxg)
 311                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 312 
 313         if (flags & VDEV_CONFIG_MOS) {
 314                 if (vd->vdev_leaf_zap != 0) {
 315                         ASSERT(vd->vdev_ops->vdev_op_leaf);
 316                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
 317                             vd->vdev_leaf_zap);
 318                 }
 319 
 320                 if (vd->vdev_top_zap != 0) {
 321                         ASSERT(vd == vd->vdev_top);
 322                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 323                             vd->vdev_top_zap);
 324                 }
 325         }
 326 
 327         if (getstats) {
 328                 vdev_stat_t vs;

 329 
 330                 vdev_get_stats(vd, &vs);
 331                 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 332                     (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
 333 
 334                 /* provide either current or previous scan information */
 335                 pool_scan_stat_t ps;
 336                 if (spa_scan_get_stats(spa, &ps) == 0) {
 337                         fnvlist_add_uint64_array(nv,
 338                             ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
 339                             sizeof (pool_scan_stat_t) / sizeof (uint64_t));
 340                 }
 341 
 342                 pool_removal_stat_t prs;
 343                 if (spa_removal_get_stats(spa, &prs) == 0) {
 344                         fnvlist_add_uint64_array(nv,
 345                             ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
 346                             sizeof (prs) / sizeof (uint64_t));
 347                 }
 348 
 349                 /*
 350                  * Note: this can be called from open context
 351                  * (spa_get_stats()), so we need the rwlock to prevent
 352                  * the mapping from being changed by condensing.
 353                  */
 354                 rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
 355                 if (vd->vdev_indirect_mapping != NULL) {
 356                         ASSERT(vd->vdev_indirect_births != NULL);
 357                         vdev_indirect_mapping_t *vim =
 358                             vd->vdev_indirect_mapping;
 359                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 360                             vdev_indirect_mapping_size(vim));
 361                 }
 362                 rw_exit(&vd->vdev_indirect_rwlock);
 363                 if (vd->vdev_mg != NULL &&
 364                     vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
 365                         /*
 366                          * Compute approximately how much memory would be used
 367                          * for the indirect mapping if this device were to
 368                          * be removed.
 369                          *
 370                          * Note: If the frag metric is invalid, then not
 371                          * enough metaslabs have been converted to have
 372                          * histograms.
 373                          */
 374                         uint64_t seg_count = 0;
 375 
 376                         /*
 377                          * There are the same number of allocated segments
 378                          * as free segments, so we will have at least one
 379                          * entry per free segment.
 380                          */
 381                         for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 382                                 seg_count += vd->vdev_mg->mg_histogram[i];
 383                         }
 384 
 385                         /*
 386                          * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
 387                          * so we need at least one entry per SPA_MAXBLOCKSIZE
 388                          * of allocated data.
 389                          */
 390                         seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
 391 
 392                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 393                             seg_count *
 394                             sizeof (vdev_indirect_mapping_entry_phys_t));
 395                 }
 396         }
 397 
 398         if (!vd->vdev_ops->vdev_op_leaf) {
 399                 nvlist_t **child;
 400                 int c, idx;
 401 
 402                 ASSERT(!vd->vdev_ishole);
 403 
 404                 child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 405                     KM_SLEEP);
 406 
 407                 for (c = 0, idx = 0; c < vd->vdev_children; c++) {
 408                         vdev_t *cvd = vd->vdev_child[c];
 409 
 410                         /*
 411                          * If we're generating an nvlist of removing
 412                          * vdevs then skip over any device which is
 413                          * not being removed.
 414                          */
 415                         if ((flags & VDEV_CONFIG_REMOVING) &&
 416                             !cvd->vdev_removing)
 417                                 continue;


 449                 if (vd->vdev_ishole)
 450                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 451 
 452                 switch (vd->vdev_stat.vs_aux) {
 453                 case VDEV_AUX_ERR_EXCEEDED:
 454                         aux = "err_exceeded";
 455                         break;
 456 
 457                 case VDEV_AUX_EXTERNAL:
 458                         aux = "external";
 459                         break;
 460                 }
 461 
 462                 if (aux != NULL)
 463                         fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 464 
 465                 if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
 466                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
 467                             vd->vdev_orig_guid);
 468                 }





 469         }

 470 
 471         return (nv);
 472 }
 473 
 474 /*
 475  * Generate a view of the top-level vdevs.  If we currently have holes
 476  * in the namespace, then generate an array which contains a list of holey
 477  * vdevs.  Additionally, add the number of top-level children that currently
 478  * exist.
 479  */
 480 void
 481 vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 482 {
 483         vdev_t *rvd = spa->spa_root_vdev;
 484         uint64_t *array;
 485         uint_t c, idx;
 486 
 487         array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 488 
 489         for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 490                 vdev_t *tvd = rvd->vdev_child[c];
 491 
 492                 if (tvd->vdev_ishole) {
 493                         array[idx++] = c;
 494                 }
 495         }
 496 
 497         if (idx) {
 498                 VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
 499                     array, idx) == 0);
 500         }
 501 
 502         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 503             rvd->vdev_children) == 0);
 504 
 505         kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 506 }
 507 
 508 /*
 509  * Returns the configuration from the label of the given vdev. For vdevs
 510  * which don't have a txg value stored on their label (i.e. spares/cache)
 511  * or have not been completely initialized (txg = 0) just return
 512  * the configuration from the first valid label we find. Otherwise,
 513  * find the most up-to-date label that does not exceed the specified
 514  * 'txg' value.
 515  */


1040         ASSERT(ub);
1041         ASSERT(config);
1042 
1043         bzero(ub, sizeof (uberblock_t));
1044         *config = NULL;
1045 
1046         cb.ubl_ubbest = ub;
1047         cb.ubl_vd = NULL;
1048 
1049         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1050         zio = zio_root(spa, NULL, &cb, flags);
1051         vdev_uberblock_load_impl(zio, rvd, flags, &cb);
1052         (void) zio_wait(zio);
1053 
1054         /*
1055          * It's possible that the best uberblock was discovered on a label
1056          * that has a configuration which was written in a future txg.
1057          * Search all labels on this vdev to find the configuration that
1058          * matches the txg for our uberblock.
1059          */
1060         if (cb.ubl_vd != NULL) {
1061                 vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
1062                     "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
1063 
1064                 *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
1065                 if (*config == NULL && spa->spa_extreme_rewind) {
1066                         vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
1067                             "Trying again without txg restrictions.");
1068                         *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
1069                 }
1070                 if (*config == NULL) {
1071                         vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
1072                 }
1073         }
1074         spa_config_exit(spa, SCL_ALL, FTAG);
1075 }
1076 
1077 /*
1078  * On success, increment root zio's count of good writes.
1079  * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
1080  */
1081 static void
1082 vdev_uberblock_sync_done(zio_t *zio)
1083 {
1084         uint64_t *good_writes = zio->io_private;
1085 
1086         if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
1087                 atomic_inc_64(good_writes);
1088 }
1089 
1090 /*
1091  * Write the uberblock to all labels of all leaves of the specified vdev.
1092  */
1093 static void
1094 vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
1095 {
1096         for (uint64_t c = 0; c < vd->vdev_children; c++)
1097                 vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
1098 
1099         if (!vd->vdev_ops->vdev_op_leaf)
1100                 return;
1101 
1102         if (!vdev_writeable(vd))
1103                 return;
1104 
1105         int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
1106 
1107         /* Copy the uberblock_t into the ABD */
1108         abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
1109         abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
1110         abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
1111 
1112         for (int l = 0; l < VDEV_LABELS; l++)
1113                 vdev_label_write(zio, vd, l, ub_abd,
1114                     VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
1115                     vdev_uberblock_sync_done, zio->io_private,
1116                     flags | ZIO_FLAG_DONT_PROPAGATE);


vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
	spa_t *spa = svd[0]->vdev_spa;
	zio_t *zio;
	uint64_t good_writes = 0;	/* bumped by vdev_uberblock_sync_done */

	/* Write the uberblock to every label of every leaf of each svd[]. */
	zio = zio_root(spa, NULL, &good_writes, flags);

	for (int v = 0; v < svdcount; v++)
		vdev_uberblock_sync(zio, ub, svd[v], flags);

	(void) zio_wait(zio);

	/*
	 * Flush the uberblocks to disk.  This ensures that the odd labels
	 * are no longer needed (because the new uberblocks and the even
	 * labels are safely on disk), so it is safe to overwrite them.
	 */
	zio = zio_root(spa, NULL, NULL, flags);

	for (int v = 0; v < svdcount; v++) {
		if (vdev_writeable(svd[v])) {
			zio_flush(zio, svd[v]);
		}
	}

	(void) zio_wait(zio);

	/* Succeed iff at least one uberblock made it to stable storage. */
	return (good_writes >= 1 ? 0 : EIO);
}
1153 
1154 /*
1155  * On success, increment the count of good writes for our top-level vdev.
1156  */
1157 static void
1158 vdev_label_sync_done(zio_t *zio)
1159 {
1160         uint64_t *good_writes = zio->io_private;
1161 
1162         if (zio->io_error == 0)
1163                 atomic_inc_64(good_writes);
1164 }
1165 
1166 /*
1167  * If there weren't enough good writes, indicate failure to the parent.




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Virtual Device Labels
  30  * ---------------------
  31  *
  32  * The vdev label serves several distinct purposes:
  33  *
  34  *      1. Uniquely identify this device as part of a ZFS pool and confirm its
  35  *         identity within the pool.
  36  *
  37  *      2. Verify that all the devices given in a configuration are present
  38  *         within the pool.
  39  *
  40  *      3. Determine the uberblock for the pool.
  41  *
  42  *      4. In case of an import operation, determine the configuration of the
  43  *         toplevel vdev of which it is a part.
  44  *
  45  *      5. If an import operation cannot find all the devices in the pool,


 127  *      features_for_read
 128  *                      An nvlist of the features necessary for reading the MOS.
 129  *
 130  * Each leaf device label also contains the following:
 131  *
 132  *      top_guid        Unique ID for top-level vdev in which this is contained
 133  *      guid            Unique ID for the leaf vdev
 134  *
 135  * The 'vs' configuration follows the format described in 'spa_config.c'.
 136  */
 137 
 138 #include <sys/zfs_context.h>
 139 #include <sys/spa.h>
 140 #include <sys/spa_impl.h>
 141 #include <sys/dmu.h>
 142 #include <sys/zap.h>
 143 #include <sys/vdev.h>
 144 #include <sys/vdev_impl.h>
 145 #include <sys/uberblock_impl.h>
 146 #include <sys/metaslab.h>

 147 #include <sys/zio.h>
 148 #include <sys/dsl_scan.h>
 149 #include <sys/abd.h>
 150 #include <sys/fs/zfs.h>
 151 
 152 /*
 153  * Basic routines to read and write from a vdev label.
 154  * Used throughout the rest of this file.
 155  */
/*
 * Translate (label index l, offset within that label) into an absolute
 * byte offset on a device of physical size psize.  Labels with index
 * below VDEV_LABELS / 2 live at the front of the device; the remainder
 * are packed at the tail end.
 */
uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	/* offset must lie within a single label */
	ASSERT(offset < sizeof (vdev_label_t));
	/* psize must be label-aligned */
	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);

	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
}
 165 
 166 /*


 200         ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
 201             (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
 202             (SCL_CONFIG | SCL_STATE) &&
 203             dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
 204         ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 205 
 206         zio_nowait(zio_write_phys(zio, vd,
 207             vdev_label_offset(vd->vdev_psize, l, offset),
 208             size, buf, ZIO_CHECKSUM_LABEL, done, private,
 209             ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
 210 }
 211 
 212 /*
 213  * Generate the nvlist representing this vdev's config.
 214  */
 215 nvlist_t *
 216 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
 217     vdev_config_flag_t flags)
 218 {
 219         nvlist_t *nv = NULL;

 220 
 221         nv = fnvlist_alloc();
 222 
 223         fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
 224         if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 225                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
 226         fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 227 
 228         if (vd->vdev_path != NULL)
 229                 fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 230 
 231         if (vd->vdev_devid != NULL)
 232                 fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 233 
 234         if (vd->vdev_physpath != NULL)
 235                 fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 236                     vd->vdev_physpath);
 237 
 238         if (vd->vdev_fru != NULL)
 239                 fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);


 247                  * into a crufty old storage pool.
 248                  */
 249                 ASSERT(vd->vdev_nparity == 1 ||
 250                     (vd->vdev_nparity <= 2 &&
 251                     spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
 252                     (vd->vdev_nparity <= 3 &&
 253                     spa_version(spa) >= SPA_VERSION_RAIDZ3));
 254 
 255                 /*
 256                  * Note that we'll add the nparity tag even on storage pools
 257                  * that only support a single parity device -- older software
 258                  * will just ignore it.
 259                  */
 260                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
 261         }
 262 
 263         if (vd->vdev_wholedisk != -1ULL)
 264                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 265                     vd->vdev_wholedisk);
 266 
 267         if (vd->vdev_not_present)
 268                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 269 
 270         if (vd->vdev_isspare)
 271                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 272 
 273         if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 274             vd == vd->vdev_top) {
 275                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 276                     vd->vdev_ms_array);
 277                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 278                     vd->vdev_ms_shift);
 279                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 280                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 281                     vd->vdev_asize);
 282                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 283                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPECIAL,
 284                     vd->vdev_isspecial);
 285                 if (vd->vdev_removing)
 286                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
 287                             vd->vdev_removing);
 288         }

 289 
 290         if (flags & VDEV_CONFIG_L2CACHE)
 291                 /* indicate that we support L2ARC persistency */
 292                 VERIFY(nvlist_add_boolean_value(nv,
 293                     ZPOOL_CONFIG_L2CACHE_PERSISTENT, B_TRUE) == 0);
 294 
 295         fnvlist_add_boolean_value(nv, ZPOOL_CONFIG_IS_SSD, vd->vdev_is_ssd);
 296 
 297         if (vd->vdev_dtl_sm != NULL) {
 298                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
 299                     space_map_object(vd->vdev_dtl_sm));
 300         }
 301 















 302         if (vd->vdev_crtxg)
 303                 fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 304 
 305         if (flags & VDEV_CONFIG_MOS) {
 306                 if (vd->vdev_leaf_zap != 0) {
 307                         ASSERT(vd->vdev_ops->vdev_op_leaf);
 308                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
 309                             vd->vdev_leaf_zap);
 310                 }
 311 
 312                 if (vd->vdev_top_zap != 0) {
 313                         ASSERT(vd == vd->vdev_top);
 314                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 315                             vd->vdev_top_zap);
 316                 }
 317         }
 318 
 319         if (getstats) {
 320                 vdev_stat_t vs;
 321                 pool_scan_stat_t ps;
 322 
 323                 vdev_get_stats(vd, &vs);
 324                 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 325                     (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
 326 
 327                 /* provide either current or previous scan information */

 328                 if (spa_scan_get_stats(spa, &ps) == 0) {
 329                         fnvlist_add_uint64_array(nv,
 330                             ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
 331                             sizeof (pool_scan_stat_t) / sizeof (uint64_t));
 332                 }






 333         }
 334 

















































 335         if (!vd->vdev_ops->vdev_op_leaf) {
 336                 nvlist_t **child;
 337                 int c, idx;
 338 
 339                 ASSERT(!vd->vdev_ishole);
 340 
 341                 child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 342                     KM_SLEEP);
 343 
 344                 for (c = 0, idx = 0; c < vd->vdev_children; c++) {
 345                         vdev_t *cvd = vd->vdev_child[c];
 346 
 347                         /*
 348                          * If we're generating an nvlist of removing
 349                          * vdevs then skip over any device which is
 350                          * not being removed.
 351                          */
 352                         if ((flags & VDEV_CONFIG_REMOVING) &&
 353                             !cvd->vdev_removing)
 354                                 continue;


 386                 if (vd->vdev_ishole)
 387                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 388 
 389                 switch (vd->vdev_stat.vs_aux) {
 390                 case VDEV_AUX_ERR_EXCEEDED:
 391                         aux = "err_exceeded";
 392                         break;
 393 
 394                 case VDEV_AUX_EXTERNAL:
 395                         aux = "external";
 396                         break;
 397                 }
 398 
 399                 if (aux != NULL)
 400                         fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 401 
 402                 if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
 403                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
 404                             vd->vdev_orig_guid);
 405                 }
 406 
 407                 /* grab per-leaf-vdev trim stats */
 408                 if (getstats) {
 409                         fnvlist_add_uint64(nv, ZPOOL_CONFIG_TRIM_PROG,
 410                             vd->vdev_trim_prog);
 411                 }
 412         }
 413 
 414         return (nv);
 415 }
 416 
 417 /*
 418  * Generate a view of the top-level vdevs.  If we currently have holes
 419  * in the namespace, then generate an array which contains a list of holey
 420  * vdevs.  Additionally, add the number of top-level children that currently
 421  * exist.
 422  */
 423 void
 424 vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 425 {
 426         vdev_t *rvd = spa->spa_root_vdev;
 427         uint64_t *array;
 428         uint_t c, idx;
 429 
 430         array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 431 
 432         for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 433                 vdev_t *tvd = rvd->vdev_child[c];
 434 
 435                 if (tvd->vdev_ishole)
 436                         array[idx++] = c;
 437         }

 438 
 439         if (idx) {
 440                 VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
 441                     array, idx) == 0);
 442         }
 443 
 444         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 445             rvd->vdev_children) == 0);
 446 
 447         kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 448 }
 449 
 450 /*
 451  * Returns the configuration from the label of the given vdev. For vdevs
 452  * which don't have a txg value stored on their label (i.e. spares/cache)
 453  * or have not been completely initialized (txg = 0) just return
 454  * the configuration from the first valid label we find. Otherwise,
 455  * find the most up-to-date label that does not exceed the specified
 456  * 'txg' value.
 457  */


 982         ASSERT(ub);
 983         ASSERT(config);
 984 
 985         bzero(ub, sizeof (uberblock_t));
 986         *config = NULL;
 987 
 988         cb.ubl_ubbest = ub;
 989         cb.ubl_vd = NULL;
 990 
 991         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 992         zio = zio_root(spa, NULL, &cb, flags);
 993         vdev_uberblock_load_impl(zio, rvd, flags, &cb);
 994         (void) zio_wait(zio);
 995 
 996         /*
 997          * It's possible that the best uberblock was discovered on a label
 998          * that has a configuration which was written in a future txg.
 999          * Search all labels on this vdev to find the configuration that
1000          * matches the txg for our uberblock.
1001          */
1002         if (cb.ubl_vd != NULL)



1003                 *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);









1004         spa_config_exit(spa, SCL_ALL, FTAG);
1005 }
1006 
1007 /*
1008  * On success, increment root zio's count of good writes.
1009  * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
1010  */
1011 static void
1012 vdev_uberblock_sync_done(zio_t *zio)
1013 {
1014         uint64_t *good_writes = zio->io_private;
1015 
1016         if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
1017                 atomic_inc_64(good_writes);
1018 }
1019 
1020 /*
1021  * Write the uberblock to all labels of all leaves of the specified vdev.
1022  */
1023 static void
1024 vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
1025 {
1026         for (int c = 0; c < vd->vdev_children; c++)
1027                 vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
1028 
1029         if (!vd->vdev_ops->vdev_op_leaf)
1030                 return;
1031 
1032         if (!vdev_writeable(vd))
1033                 return;
1034 
1035         int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
1036 
1037         /* Copy the uberblock_t into the ABD */
1038         abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
1039         abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
1040         abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
1041 
1042         for (int l = 0; l < VDEV_LABELS; l++)
1043                 vdev_label_write(zio, vd, l, ub_abd,
1044                     VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
1045                     vdev_uberblock_sync_done, zio->io_private,
1046                     flags | ZIO_FLAG_DONT_PROPAGATE);


1053 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
1054 {
1055         spa_t *spa = svd[0]->vdev_spa;
1056         zio_t *zio;
1057         uint64_t good_writes = 0;
1058 
1059         zio = zio_root(spa, NULL, &good_writes, flags);
1060 
1061         for (int v = 0; v < svdcount; v++)
1062                 vdev_uberblock_sync(zio, ub, svd[v], flags);
1063 
1064         (void) zio_wait(zio);
1065 
1066         /*
1067          * Flush the uberblocks to disk.  This ensures that the odd labels
1068          * are no longer needed (because the new uberblocks and the even
1069          * labels are safely on disk), so it is safe to overwrite them.
1070          */
1071         zio = zio_root(spa, NULL, NULL, flags);
1072 
1073         for (int v = 0; v < svdcount; v++)

1074                 zio_flush(zio, svd[v]);


1075 
1076         (void) zio_wait(zio);
1077 
1078         return (good_writes >= 1 ? 0 : EIO);
1079 }
1080 
1081 /*
1082  * On success, increment the count of good writes for our top-level vdev.
1083  */
1084 static void
1085 vdev_label_sync_done(zio_t *zio)
1086 {
1087         uint64_t *good_writes = zio->io_private;
1088 
1089         if (zio->io_error == 0)
1090                 atomic_inc_64(good_writes);
1091 }
1092 
1093 /*
1094  * If there weren't enough good writes, indicate failure to the parent.