/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>

#ifdef  _KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif  /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
        zti_mode_fixed,                 /* value is # of threads (min 1) */
        zti_mode_online_percent,        /* value is % of online CPUs */
        zti_mode_batch,                 /* cpu-intensive; value is ignored */
        zti_mode_null,                  /* don't create a taskq */
        zti_nmodes
} zti_modes_t;

#define ZTI_FIX(n)      { zti_mode_fixed, (n) }
#define ZTI_PCT(n)      { zti_mode_online_percent, (n) }
#define ZTI_BATCH       { zti_mode_batch, 0 }
#define ZTI_NULL        { zti_mode_null, 0 }

#define ZTI_ONE         ZTI_FIX(1)

typedef struct zio_taskq_info {
        enum zti_modes zti_mode;
        uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
        "issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 *      NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
        /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
        { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },
        { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
        { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
        { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
};

static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
id_t            zio_taskq_psrset_bind = PS_NONE;
boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
uint_t          zio_taskq_basedc = 80;          /* base duty cycle */

boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME  "$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
        const char *propname = zpool_prop_to_name(prop);
        nvlist_t *propval;

        VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

        if (strval != NULL)
                VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
        else
                VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

        VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
        nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
        uint64_t size;
        uint64_t alloc;
        uint64_t cap, version;
        zprop_source_t src = ZPROP_SRC_NONE;
        spa_config_dirent_t *dp;

        ASSERT(MUTEX_HELD(&spa->spa_props_lock));

        if (spa->spa_root_vdev != NULL) {
                alloc = metaslab_class_get_alloc(spa_normal_class(spa));
                size = metaslab_class_get_space(spa_normal_class(spa));
                spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
                    size - alloc, src);
                spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
                    (spa_mode(spa) == FREAD), src);

                cap = (size == 0) ? 0 : (alloc * 100 / size);
                spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
                    ddt_get_pool_dedup_ratio(spa), src);

                spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
                    spa->spa_root_vdev->vdev_state, src);

                version = spa_version(spa);
                if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
                        src = ZPROP_SRC_DEFAULT;
                else
                        src = ZPROP_SRC_LOCAL;
                spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
        }

        spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

        if (spa->spa_comment != NULL) {
                spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
                    0, ZPROP_SRC_LOCAL);
        }

        if (spa->spa_root != NULL)
                spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
                    0, ZPROP_SRC_LOCAL);

        if ((dp = list_head(&spa->spa_config_list)) != NULL) {
                if (dp->scd_path == NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            "none", 0, ZPROP_SRC_LOCAL);
                } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                            dp->scd_path, 0, ZPROP_SRC_LOCAL);
                }
        }
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
        objset_t *mos = spa->spa_meta_objset;
        zap_cursor_t zc;
        zap_attribute_t za;
        int err;

        VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        mutex_enter(&spa->spa_props_lock);

        /*
         * Get properties from the spa config.
         */
        spa_prop_get_config(spa, nvp);

        /* If no pool property object, no more props to get. */
        if (mos == NULL || spa->spa_pool_props_object == 0) {
                mutex_exit(&spa->spa_props_lock);
                return (0);
        }

        /*
         * Get properties from the MOS pool property object.
         */
        for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
            (err = zap_cursor_retrieve(&zc, &za)) == 0;
            zap_cursor_advance(&zc)) {
                uint64_t intval = 0;
                char *strval = NULL;
                zprop_source_t src = ZPROP_SRC_DEFAULT;
                zpool_prop_t prop;

                if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
                        continue;

                switch (za.za_integer_length) {
                case 8:
                        /* integer property */
                        if (za.za_first_integer !=
                            zpool_prop_default_numeric(prop))
                                src = ZPROP_SRC_LOCAL;

                        if (prop == ZPOOL_PROP_BOOTFS) {
                                dsl_pool_t *dp;
                                dsl_dataset_t *ds = NULL;

                                dp = spa_get_dsl(spa);
                                rw_enter(&dp->dp_config_rwlock, RW_READER);
                                if (err = dsl_dataset_hold_obj(dp,
                                    za.za_first_integer, FTAG, &ds)) {
                                        rw_exit(&dp->dp_config_rwlock);
                                        break;
                                }

                                strval = kmem_alloc(
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
                                    KM_SLEEP);
                                dsl_dataset_name(ds, strval);
                                dsl_dataset_rele(ds, FTAG);
                                rw_exit(&dp->dp_config_rwlock);
                        } else {
                                strval = NULL;
                                intval = za.za_first_integer;
                        }

                        spa_prop_add_list(*nvp, prop, strval, intval, src);

                        if (strval != NULL)
                                kmem_free(strval,
                                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

                        break;

                case 1:
                        /* string property */
                        strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
                        err = zap_lookup(mos, spa->spa_pool_props_object,
                            za.za_name, 1, za.za_num_integers, strval);
                        if (err) {
                                kmem_free(strval, za.za_num_integers);
                                break;
                        }
                        spa_prop_add_list(*nvp, prop, strval, 0, src);
                        kmem_free(strval, za.za_num_integers);
                        break;

                default:
                        break;
                }
        }
        zap_cursor_fini(&zc);
        mutex_exit(&spa->spa_props_lock);
out:
        if (err && err != ENOENT) {
                nvlist_free(*nvp);
                *nvp = NULL;
                return (err);
        }

        return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
        nvpair_t *elem;
        int error = 0, reset_bootfs = 0;
        uint64_t objnum;

        elem = NULL;
        while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
                zpool_prop_t prop;
                char *propname, *strval;
                uint64_t intval;
                objset_t *os;
                char *slash, *check;

                propname = nvpair_name(elem);

                if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
                        return (EINVAL);

                switch (prop) {
                case ZPOOL_PROP_VERSION:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error &&
                            (intval < spa_version(spa) || intval > SPA_VERSION))
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_DELEGATION:
                case ZPOOL_PROP_AUTOREPLACE:
                case ZPOOL_PROP_LISTSNAPS:
                case ZPOOL_PROP_AUTOEXPAND:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && intval > 1)
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_BOOTFS:
                        /*
                         * If the pool version is less than SPA_VERSION_BOOTFS,
                         * or the pool is still being created (version == 0),
                         * the bootfs property cannot be set.
                         */
                        if (spa_version(spa) < SPA_VERSION_BOOTFS) {
                                error = ENOTSUP;
                                break;
                        }

                        /*
                         * Make sure the vdev config is bootable
                         */
                        if (!vdev_is_bootable(spa->spa_root_vdev)) {
                                error = ENOTSUP;
                                break;
                        }

                        reset_bootfs = 1;

                        error = nvpair_value_string(elem, &strval);

                        if (!error) {
                                uint64_t compress;

                                if (strval == NULL || strval[0] == '\0') {
                                        objnum = zpool_prop_default_numeric(
                                            ZPOOL_PROP_BOOTFS);
                                        break;
                                }

                                if (error = dmu_objset_hold(strval, FTAG, &os))
                                        break;

                                /* Must be ZPL and not gzip compressed. */

                                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                                        error = ENOTSUP;
                                } else if ((error = dsl_prop_get_integer(strval,
                                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                                    &compress, NULL)) == 0 &&
                                    !BOOTFS_COMPRESS_VALID(compress)) {
                                        error = ENOTSUP;
                                } else {
                                        objnum = dmu_objset_id(os);
                                }
                                dmu_objset_rele(os, FTAG);
                        }
                        break;

                case ZPOOL_PROP_FAILUREMODE:
                        error = nvpair_value_uint64(elem, &intval);
                        if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
                            intval > ZIO_FAILURE_MODE_PANIC))
                                error = EINVAL;

                        /*
                         * This is a special case which only occurs when
                         * the pool has completely failed. This allows
                         * the user to change the in-core failmode property
                         * without syncing it out to disk (I/Os might
                         * currently be blocked). We do this by returning
                         * EIO to the caller (spa_prop_set) to trick it
                         * into thinking we encountered a property validation
                         * error.
                         */
                        if (!error && spa_suspended(spa)) {
                                spa->spa_failmode = intval;
                                error = EIO;
                        }
                        break;

                case ZPOOL_PROP_CACHEFILE:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;

                        if (strval[0] == '\0')
                                break;

                        if (strcmp(strval, "none") == 0)
                                break;

                        if (strval[0] != '/') {
                                error = EINVAL;
                                break;
                        }

                        slash = strrchr(strval, '/');
                        ASSERT(slash != NULL);

                        if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
                            strcmp(slash, "/..") == 0)
                                error = EINVAL;
                        break;

                case ZPOOL_PROP_COMMENT:
                        if ((error = nvpair_value_string(elem, &strval)) != 0)
                                break;
                        for (check = strval; *check != '\0'; check++) {
                                /*
                                 * The kernel doesn't have an easy isprint()
                                 * check.  For this kernel check, we merely
                                 * check ASCII apart from DEL.  Fix this if
                                 * there is an easy-to-use kernel isprint().
                                 */
                                if (*check >= 0x7f) {
                                        error = EINVAL;
                                        break;
                                }
                        }
                        if (strlen(strval) > ZPROP_MAX_COMMENT)
                                error = E2BIG;
                        break;

                case ZPOOL_PROP_DEDUPDITTO:
                        if (spa_version(spa) < SPA_VERSION_DEDUP)
                                error = ENOTSUP;
                        else
                                error = nvpair_value_uint64(elem, &intval);
                        if (error == 0 &&
                            intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
                                error = EINVAL;
                        break;
                }

                if (error)
                        break;
        }

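        /*
         * If a bootfs dataset name was validated above, swap the string
         * value in 'props' for the dataset's object number so that the
         * sync task stores the objnum in the MOS.
         */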
        if (!error && reset_bootfs) {
                error = nvlist_remove(props,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

                if (!error) {
                        error = nvlist_add_uint64(props,
                            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
                }
        }

        return (error);
}

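/*
 * Update the in-core cachefile dirent for this pool from the given property
 * list, and optionally request an async config sync to write it out.
 */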
void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
        char *cachefile;
        spa_config_dirent_t *dp;

        if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
            &cachefile) != 0)
                return;

        dp = kmem_alloc(sizeof (spa_config_dirent_t),
            KM_SLEEP);

        if (cachefile[0] == '\0')
                dp->scd_path = spa_strdup(spa_config_path);
        else if (strcmp(cachefile, "none") == 0)
                dp->scd_path = NULL;
        else
                dp->scd_path = spa_strdup(cachefile);

        list_insert_head(&spa->spa_config_list, dp);
        if (need_sync)
                spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

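/*
 * Set zpool properties.  Properties that affect only the in-core state of
 * the pool (cachefile, altroot, readonly) don't require a sync task;
 * everything else is pushed out via spa_sync_props().
 */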
int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
        int error;
        nvpair_t *elem;
        boolean_t need_sync = B_FALSE;
        zpool_prop_t prop;

        if ((error = spa_prop_validate(spa, nvp)) != 0)
                return (error);

        elem = NULL;
        while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
                if ((prop = zpool_name_to_prop(
                    nvpair_name(elem))) == ZPROP_INVAL)
                        return (EINVAL);

                if (prop == ZPOOL_PROP_CACHEFILE ||
                    prop == ZPOOL_PROP_ALTROOT ||
                    prop == ZPOOL_PROP_READONLY)
                        continue;

                need_sync = B_TRUE;
                break;
        }

        if (need_sync)
                return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
                    spa, nvp, 3));
        else
                return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
        if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
                VERIFY(zap_remove(spa->spa_meta_objset,
                    spa->spa_pool_props_object,
                    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
                spa->spa_bootfs = 0;
        }
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
        uint64_t        oldguid, newguid;
        uint64_t        txg;

        if (!(spa_mode_global & FWRITE))
                return (EROFS);

        txg = spa_vdev_enter(spa);

        if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
                return (spa_vdev_exit(spa, NULL, txg, ENXIO));

        oldguid = spa_guid(spa);
        newguid = spa_generate_guid(NULL);
        ASSERT3U(oldguid, !=, newguid);

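        /*
         * The vdev guid sum is just the sum of the guids in the vdev
         * tree, so substituting the root vdev's guid shifts the sum by
         * the difference between the new and old values.
         */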
        spa->spa_root_vdev->vdev_guid = newguid;
        spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);

        vdev_config_dirty(spa->spa_root_vdev);

        spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);

        return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

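/*
 * Comparison function for the AVL trees in the error lists; entries are
 * ordered by their on-disk bookmarks.
 */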
static int
spa_error_entry_compare(const void *a, const void *b)
{
        spa_error_entry_t *sa = (spa_error_entry_t *)a;
        spa_error_entry_t *sb = (spa_error_entry_t *)b;
        int ret;

        ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_t));

        if (ret < 0)
                return (-1);
        else if (ret > 0)
                return (1);
        else
                return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
        ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

        bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
        bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
        uint_t flags = 0;
        boolean_t batch = B_FALSE;

        switch (mode) {
        case zti_mode_null:
                return (NULL);          /* no taskq needed */

        case zti_mode_fixed:
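                /*
                 * The ASSERT documents the invariant on debug builds; the
                 * MAX() below keeps a non-debug build from creating a
                 * zero-thread taskq if a zero value ever slips through.
                 */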
                ASSERT3U(value, >=, 1);
                value = MAX(value, 1);
                break;

        case zti_mode_batch:
                batch = B_TRUE;
                flags |= TASKQ_THREADS_CPU_PCT;
                value = zio_taskq_batch_pct;
                break;

        case zti_mode_online_percent:
                flags |= TASKQ_THREADS_CPU_PCT;
                break;

        default:
                panic("unrecognized mode for %s taskq (%u:%u) in "
                    "spa_activate()",
                    name, mode, value);
                break;
        }

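        /*
         * With SDC scheduling enabled and a dedicated pool process, create
         * the taskq under the duty-cycle cap; otherwise fall back to an
         * ordinary fixed-priority taskq.
         */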
        if (zio_taskq_sysdc && spa->spa_proc != &p0) {
                if (batch)
                        flags |= TASKQ_DC_BATCH;

                return (taskq_create_sysdc(name, value, 50, INT_MAX,
                    spa->spa_proc, zio_taskq_basedc, flags));
        }
        return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
            spa->spa_proc, flags));
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
                        enum zti_modes mode = ztip->zti_mode;
                        uint_t value = ztip->zti_value;
                        char name[32];

                        (void) snprintf(name, sizeof (name),
                            "%s_%s", zio_type_name[t], zio_taskq_types[q]);

                        spa->spa_zio_taskq[t][q] =
                            spa_taskq_create(spa, name, mode, value);
                }
        }
}

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
        callb_cpr_t cprinfo;

        spa_t *spa = arg;
        user_t *pu = PTOU(curproc);

        CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
            spa->spa_name);

        ASSERT(curproc != &p0);
        (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
            "zpool-%s", spa->spa_name);
        (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

        /* bind this thread to the requested psrset */
        if (zio_taskq_psrset_bind != PS_NONE) {
                pool_lock();
                mutex_enter(&cpu_lock);
                mutex_enter(&pidlock);
                mutex_enter(&curproc->p_lock);

                if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
                    0, NULL, NULL) == 0) {
                        curthread->t_bind_pset = zio_taskq_psrset_bind;
                } else {
                        cmn_err(CE_WARN,
                            "Couldn't bind process for zfs pool \"%s\" to "
                            "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
                }

                mutex_exit(&curproc->p_lock);
                mutex_exit(&pidlock);
                mutex_exit(&cpu_lock);
                pool_unlock();
        }

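        /* Run the pool process under the system duty-cycle (SDC) class. */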
        if (zio_taskq_sysdc) {
                sysdc_thread_enter(curthread, 100, 0);
        }

        spa->spa_proc = curproc;
        spa->spa_did = curthread->t_did;

        spa_create_zio_taskqs(spa);

        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

        spa->spa_proc_state = SPA_PROC_ACTIVE;
        cv_broadcast(&spa->spa_proc_cv);

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        while (spa->spa_proc_state == SPA_PROC_ACTIVE)
                cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

        ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
        spa->spa_proc_state = SPA_PROC_GONE;
        spa->spa_proc = &p0;
        cv_broadcast(&spa->spa_proc_cv);
        CALLB_CPR_EXIT(&cprinfo);   /* drops spa_proc_lock */

        mutex_enter(&curproc->p_lock);
        lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_mode = mode;

        spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
        spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

        /* Try to create a covering process */
        mutex_enter(&spa->spa_proc_lock);
        ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
        ASSERT(spa->spa_proc == &p0);
        spa->spa_did = 0;

        /* Only create a process if we're going to be around a while. */
        if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
                if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
                    NULL, 0) == 0) {
                        spa->spa_proc_state = SPA_PROC_CREATED;
                        while (spa->spa_proc_state == SPA_PROC_CREATED) {
                                cv_wait(&spa->spa_proc_cv,
                                    &spa->spa_proc_lock);
                        }
                        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                        ASSERT(spa->spa_proc != &p0);
                        ASSERT(spa->spa_did != 0);
                } else {
#ifdef _KERNEL
                        cmn_err(CE_WARN,
                            "Couldn't create process for zfs pool \"%s\"\n",
                            spa->spa_name);
#endif
                }
        }
        mutex_exit(&spa->spa_proc_lock);

        /* If we didn't create a process, we need to create our taskqs. */
        if (spa->spa_proc == &p0) {
                spa_create_zio_taskqs(spa);
        }

        list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_config_dirty_node));
        list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_state_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);
        ASSERT(spa->spa_async_zio_root == NULL);
        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_config_dirty_list);
        list_destroy(&spa->spa_state_dirty_list);

        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
                        if (spa->spa_zio_taskq[t][q] != NULL)
                                taskq_destroy(spa->spa_zio_taskq[t][q]);
                        spa->spa_zio_taskq[t][q] = NULL;
                }
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        metaslab_class_destroy(spa->spa_log_class);
        spa->spa_log_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues.  Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;

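        /*
         * Tell spa_thread() to deactivate, then wait for it to acknowledge
         * that the process is gone before tearing down the rest of the
         * state.
         */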
        mutex_enter(&spa->spa_proc_lock);
        if (spa->spa_proc_state != SPA_PROC_NONE) {
                ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
                spa->spa_proc_state = SPA_PROC_DEACTIVATE;
                cv_broadcast(&spa->spa_proc_cv);
                while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
                        ASSERT(spa->spa_proc != &p0);
                        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
                }
                ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
                spa->spa_proc_state = SPA_PROC_NONE;
        }
        ASSERT(spa->spa_proc == &p0);
        mutex_exit(&spa->spa_proc_lock);

        /*
         * We want to make sure spa_thread() has actually exited the ZFS
         * module, so that the module can't be unloaded out from underneath
         * it.
         */
        if (spa->spa_did != 0) {
                thread_join(spa->spa_did);
                spa->spa_did = 0;
        }
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
        nvlist_t **child;
        uint_t children;
        int error;

        if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
                return (error);

        if ((*vdp)->vdev_ops->vdev_op_leaf)
                return (0);

        error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children);

        if (error == ENOENT)
                return (0);

        if (error) {
                vdev_free(*vdp);
                *vdp = NULL;
                return (EINVAL);
        }

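        /*
         * Recursively parse each child vdev; on any failure, free the
         * partially constructed tree rooted at *vdp.
         */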
        for (int c = 0; c < children; c++) {
                vdev_t *vd;
                if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
                    atype)) != 0) {
                        vdev_free(*vdp);
                        *vdp = NULL;
                        return (error);
                }
        }

        ASSERT(*vdp != NULL);

        return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding async I/O to complete.
         */
        if (spa->spa_async_zio_root != NULL) {
                (void) zio_wait(spa->spa_async_zio_root);
                spa->spa_async_zio_root = NULL;
        }

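        /*
         * Close the deferred-free bpobj.
         */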
        bpobj_close(&spa->spa_deferred_bpobj);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
                spa->spa_meta_objset = NULL;
        }

        ddt_unload(spa);

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

        /*
         * Drop and purge level 2 cache
         */
        spa_l2cache_drop(spa);

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        ASSERT(spa->spa_root_vdev == NULL);

        for (i = 0; i < spa->spa_spares.sav_count; i++)
                vdev_free(spa->spa_spares.sav_vdevs[i]);
        if (spa->spa_spares.sav_vdevs) {
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));
                spa->spa_spares.sav_vdevs = NULL;
        }
        if (spa->spa_spares.sav_config) {
                nvlist_free(spa->spa_spares.sav_config);
                spa->spa_spares.sav_config = NULL;
        }
        spa->spa_spares.sav_count = 0;

        for (i = 0; i < spa->spa_l2cache.sav_count; i++)
                vdev_free(spa->spa_l2cache.sav_vdevs[i]);
        if (spa->spa_l2cache.sav_vdevs) {
                kmem_free(spa->spa_l2cache.sav_vdevs,
                    spa->spa_l2cache.sav_count * sizeof (void *));
                spa->spa_l2cache.sav_vdevs = NULL;
        }
        if (spa->spa_l2cache.sav_config) {
                nvlist_free(spa->spa_l2cache.sav_config);
                spa->spa_l2cache.sav_config = NULL;
        }
        spa->spa_l2cache.sav_count = 0;

        spa->spa_async_suspended = 0;

        if (spa->spa_comment != NULL) {
                spa_strfree(spa->spa_comment);
                spa->spa_comment = NULL;
        }

        spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;
        int i;
        vdev_t *vd, *tvd;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        /*
         * First, close and free any existing spare vdevs.
         */
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                vd = spa->spa_spares.sav_vdevs[i];

                /* Undo the call to spa_activate() below */
                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL && tvd->vdev_isspare)
                        spa_spare_remove(tvd);
                vdev_close(vd);
                vdev_free(vd);
        }

        if (spa->spa_spares.sav_vdevs)
                kmem_free(spa->spa_spares.sav_vdevs,
                    spa->spa_spares.sav_count * sizeof (void *));

        if (spa->spa_spares.sav_config == NULL)
                nspares = 0;
        else
                VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        spa->spa_spares.sav_count = (int)nspares;
        spa->spa_spares.sav_vdevs = NULL;

        if (nspares == 0)
                return;

        /*
         * Construct the array of vdevs, opening them to get status in the
         * process.  For each spare, there are potentially two different
         * vdev_t structures associated with it: one in the list of spares
         * (used only for basic validation purposes) and one in the active
         * vdev configuration (if it's spared in).  During this phase we open
         * and validate each vdev on the spare list.  If the vdev also exists
         * in the active configuration, then we also mark this vdev as an
         * active spare.
         */
        spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++) {
                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares.sav_vdevs[i] = vd;

                if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
                    B_FALSE)) != NULL) {
                        if (!tvd->vdev_isspare)
                                spa_spare_add(tvd);

                        /*
                         * We only mark the spare active if we were successfully
                         * able to load the vdev.  Otherwise, importing a pool
                         * with a bad active spare would result in strange
                         * behavior, because multiple pools would think the
                         * spare is actively in use.
                         *
                         * There is a vulnerability here to an equally bizarre
                         * circumstance, where a dead active spare is later
                         * brought back to life (onlined or otherwise).  Given
                         * the rarity of this scenario, and the extra complexity
                         * it adds, we ignore the possibility.
                         */
                        if (!vdev_is_dead(tvd))
                                spa_spare_activate(tvd);
                }

                vd->vdev_top = vd;
                vd->vdev_aux = &spa->spa_spares;

                if (vdev_open(vd) != 0)
                        continue;

                if (vdev_validate_aux(vd) == 0)
                        spa_spare_add(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
            KM_SLEEP);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                spares[i] = vdev_config_generate(spa,
                    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
        for (i = 0; i < spa->spa_spares.sav_count; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
        nvlist_t **l2cache;
        uint_t nl2cache;
        int i, j, oldnvdevs;
        uint64_t guid;
        vdev_t *vd, **oldvdevs, **newvdevs;
        spa_aux_vdev_t *sav = &spa->spa_l2cache;

        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

        if (sav->sav_config != NULL) {
                VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
                    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
                newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
        } else {
                nl2cache = 0;
        }

        oldvdevs = sav->sav_vdevs;
        oldnvdevs = sav->sav_count;
        sav->sav_vdevs = NULL;
        sav->sav_count = 0;

        /*
         * Process new nvlist of vdevs.
         */
        for (i = 0; i < nl2cache; i++) {
                VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
                    &guid) == 0);

                newvdevs[i] = NULL;
                for (j = 0; j < oldnvdevs; j++) {
                        vd = oldvdevs[j];
                        if (vd != NULL && guid == vd->vdev_guid) {
                                /*
                                 * Retain previous vdev for add/remove ops.
                                 */
                                newvdevs[i] = vd;
                                oldvdevs[j] = NULL;
                                break;
                        }
                }

                if (newvdevs[i] == NULL) {
                        /*
                         * Create new vdev
                         */
                        VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
                            VDEV_ALLOC_L2CACHE) == 0);
                        ASSERT(vd != NULL);
                        newvdevs[i] = vd;

                        /*
                         * Commit this vdev as an l2cache device,
                         * even if it fails to open.
                         */
                        spa_l2cache_add(vd);

                        vd->vdev_top = vd;
                        vd->vdev_aux = sav;

                        spa_l2cache_activate(vd);

                        if (vdev_open(vd) != 0)
                                continue;

                        (void) vdev_validate_aux(vd);

                        if (!vdev_is_dead(vd))
                                l2arc_add_vdev(spa, vd);
                }
        }

        /*
         * Purge vdevs that were dropped
         */
        for (i = 0; i < oldnvdevs; i++) {
                uint64_t pool;

                vd = oldvdevs[i];
                if (vd != NULL) {
                        if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
                            pool != 0ULL && l2arc_vdev_present(vd))
                                l2arc_remove_vdev(vd);
                        (void) vdev_close(vd);
                        spa_l2cache_remove(vd);
                }
        }

        if (oldvdevs)
                kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

        if (sav->sav_config == NULL)
                goto out;

        sav->sav_vdevs = newvdevs;
        sav->sav_count = (int)nl2cache;

        /*
         * Recompute the stashed list of l2cache devices, with status
         * information this time.
         */
        VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
        for (i = 0; i < sav->sav_count; i++)
                l2cache[i] = vdev_config_generate(spa,
                    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
        VERIFY(nvlist_add_nvlist_array(sav->sav_config,
            ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
        for (i = 0; i < sav->sav_count; i++)
                nvlist_free(l2cache[i]);
        if (sav->sav_count)
                kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

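/*
 * Read a packed nvlist from the MOS.  The object's bonus buffer holds the
 * packed size; the object data holds the packed nvlist itself.
 */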
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
        dmu_buf_t *db;
        char *packed = NULL;
        size_t nvsize = 0;
        int error;
        *value = NULL;

        VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
        nvsize = *(uint64_t *)db->db_data;
        dmu_buf_rele(db, FTAG);

        packed = kmem_alloc(nvsize, KM_SLEEP);
        error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
            DMU_READ_PREFETCH);
        if (error == 0)
                error = nvlist_unpack(packed, nvsize, value, 0);
        kmem_free(packed, nvsize);

        return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
        for (int c = 0; c < vd->vdev_children; c++)
                spa_check_removed(vd->vdev_child[c]);

        if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
                zfs_post_autoreplace(vd->vdev_spa, vd);
                spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
        }
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
        vdev_t *mrvd, *rvd = spa->spa_root_vdev;
        nvlist_t *nv;

        VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
        VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

        ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

        /*
         * If we're doing a normal import, then build up any additional
         * diagnostic information about missing devices in this config.
         * We'll pass this up to the user for further processing.
         */
        if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
                nvlist_t **child, *nv;
                uint64_t idx = 0;

                child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
                    KM_SLEEP);
                VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

                for (int c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *tvd = rvd->vdev_child[c];
                        vdev_t *mtvd = mrvd->vdev_child[c];

                        if (tvd->vdev_ops == &vdev_missing_ops &&
                            mtvd->vdev_ops != &vdev_missing_ops &&
                            mtvd->vdev_islog)
                                child[idx++] = vdev_config_generate(spa, mtvd,
                                    B_FALSE, 0);
                }

                if (idx) {
                        VERIFY(nvlist_add_nvlist_array(nv,
                            ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
                        VERIFY(nvlist_add_nvlist(spa->spa_load_info,
                            ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

                        for (int i = 0; i < idx; i++)
                                nvlist_free(child[i]);
                }
                nvlist_free(nv);
                kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
        }

        /*
         * Compare the root vdev tree with the information we have
         * from the MOS config (mrvd). Check each top-level vdev
         * with the corresponding MOS config top-level (mtvd).
         */
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                vdev_t *mtvd = mrvd->vdev_child[c];

                /*
                 * Resolve any "missing" vdevs in the current configuration.
                 * If we find that the MOS config has more accurate information
                 * about the top-level vdev then use that vdev instead.
                 */
                if (tvd->vdev_ops == &vdev_missing_ops &&
                    mtvd->vdev_ops != &vdev_missing_ops) {

                        if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
                                continue;

                        /*
                         * Device specific actions.
                         */
                        if (mtvd->vdev_islog) {
                                spa_set_log_state(spa, SPA_LOG_CLEAR);
                        } else {
                                /*
                                 * XXX - once we have 'readonly' pool
                                 * support we should be able to handle
                                 * missing data devices by transitioning
                                 * the pool to readonly.
                                 */
                                continue;
                        }

                        /*
                         * Swap the missing vdev with the data we were
                         * able to obtain from the MOS config.
                         */
                        vdev_remove_child(rvd, tvd);
                        vdev_remove_child(mrvd, mtvd);

                        vdev_add_child(rvd, mtvd);
                        vdev_add_child(mrvd, tvd);

                        spa_config_exit(spa, SCL_ALL, FTAG);
                        vdev_load(mtvd);
                        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

                        vdev_reopen(rvd);
                } else if (mtvd->vdev_islog) {
                        /*
                         * Load the slog device's state from the MOS config
                         * since it's possible that the label does not
                         * contain the most up-to-date information.
                         */
                        vdev_load_log_state(tvd, mtvd);
                        vdev_reopen(tvd);
                }
        }
        vdev_free(mrvd);
        spa_config_exit(spa, SCL_ALL, FTAG);

        /*
         * Ensure we were able to validate the config.
         */
        return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static int
spa_check_logs(spa_t *spa)
{
        switch (spa->spa_log_state) {
        case SPA_LOG_MISSING:
                /* need to recheck in case slog has been restored */
        case SPA_LOG_UNKNOWN:
                if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
                    DS_FIND_CHILDREN)) {
                        spa_set_log_state(spa, SPA_LOG_MISSING);
                        return (1);
                }
                break;
        }
        return (0);
}

1502 static boolean_t
1503 spa_passivate_log(spa_t *spa)
1504 {
1505         vdev_t *rvd = spa->spa_root_vdev;
1506         boolean_t slog_found = B_FALSE;
1507 
1508         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1509 
1510         if (!spa_has_slogs(spa))
1511                 return (B_FALSE);
1512 
1513         for (int c = 0; c < rvd->vdev_children; c++) {
1514                 vdev_t *tvd = rvd->vdev_child[c];
1515                 metaslab_group_t *mg = tvd->vdev_mg;
1516 
1517                 if (tvd->vdev_islog) {
1518                         metaslab_group_passivate(mg);
1519                         slog_found = B_TRUE;
1520                 }
1521         }
1522 
1523         return (slog_found);
1524 }
1525 
1526 static void
1527 spa_activate_log(spa_t *spa)
1528 {
1529         vdev_t *rvd = spa->spa_root_vdev;
1530 
1531         ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1532 
1533         for (int c = 0; c < rvd->vdev_children; c++) {
1534                 vdev_t *tvd = rvd->vdev_child[c];
1535                 metaslab_group_t *mg = tvd->vdev_mg;
1536 
1537                 if (tvd->vdev_islog)
1538                         metaslab_group_activate(mg);
1539         }
1540 }
1541 
1542 int
1543 spa_offline_log(spa_t *spa)
1544 {
1545         int error = 0;
1546 
1547         if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1548             NULL, DS_FIND_CHILDREN)) == 0) {
1549 
1550                 /*
1551                  * We successfully offlined the log device, sync out the
1552                  * current txg so that the "stubby" block can be removed
1553                  * by zil_sync().
1554                  */
1555                 txg_wait_synced(spa->spa_dsl_pool, 0);
1556         }
1557         return (error);
1558 }
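
/*
 * Hedged sketch of how the three log helpers above compose (an
 * illustration only, not a transcription of any specific caller):
 * with SCL_ALLOC held as writer, passivate the slog metaslab groups
 * so no new allocations land on them, drop the lock while the log
 * chains are synced out, and reactivate the groups if the operation
 * has to be undone:
 *
 *	if (spa_passivate_log(spa)) {		(SCL_ALLOC held as writer)
 *		error = spa_offline_log(spa);	(SCL_ALLOC dropped here)
 *		if (error != 0)
 *			spa_activate_log(spa);	(undo on failure)
 *	}
 */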
1559 
1560 static void
1561 spa_aux_check_removed(spa_aux_vdev_t *sav)
1562 {
1563         for (int i = 0; i < sav->sav_count; i++)
1564                 spa_check_removed(sav->sav_vdevs[i]);
1565 }
1566 
1567 void
1568 spa_claim_notify(zio_t *zio)
1569 {
1570         spa_t *spa = zio->io_spa;
1571 
1572         if (zio->io_error)
1573                 return;
1574 
1575         mutex_enter(&spa->spa_props_lock);       /* any mutex will do */
1576         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1577                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1578         mutex_exit(&spa->spa_props_lock);
1579 }
1580 
1581 typedef struct spa_load_error {
1582         uint64_t        sle_meta_count;
1583         uint64_t        sle_data_count;
1584 } spa_load_error_t;
1585 
1586 static void
1587 spa_load_verify_done(zio_t *zio)
1588 {
1589         blkptr_t *bp = zio->io_bp;
1590         spa_load_error_t *sle = zio->io_private;
1591         dmu_object_type_t type = BP_GET_TYPE(bp);
1592         int error = zio->io_error;
1593 
1594         if (error) {
1595                 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1596                     type != DMU_OT_INTENT_LOG)
1597                         atomic_add_64(&sle->sle_meta_count, 1);
1598                 else
1599                         atomic_add_64(&sle->sle_data_count, 1);
1600         }
1601         zio_data_buf_free(zio->io_data, zio->io_size);
1602 }
1603 
1604 /*ARGSUSED*/
1605 static int
1606 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1607     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1608 {
1609         if (bp != NULL) {
1610                 zio_t *rio = arg;
1611                 size_t size = BP_GET_PSIZE(bp);
1612                 void *data = zio_data_buf_alloc(size);
1613 
1614                 zio_nowait(zio_read(rio, spa, bp, data, size,
1615                     spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1616                     ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1617                     ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1618         }
1619         return (0);
1620 }
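
/*
 * Data flow of the verification pass, in outline (a sketch of the
 * callback above and spa_load_verify() below):
 *
 *	traverse_pool()
 *	    -> spa_load_verify_cb()		one zio_read() per blkptr,
 *						hung off the root zio
 *		-> spa_load_verify_done()	free the buffer; tally the
 *						error into sle_meta_count
 *						or sle_data_count
 *	zio_wait(rio)				then compare the counts
 *						against the rewind policy
 */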
1621 
1622 static int
1623 spa_load_verify(spa_t *spa)
1624 {
1625         zio_t *rio;
1626         spa_load_error_t sle = { 0 };
1627         zpool_rewind_policy_t policy;
1628         boolean_t verify_ok = B_FALSE;
1629         int error;
1630 
1631         zpool_get_rewind_policy(spa->spa_config, &policy);
1632 
1633         if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1634                 return (0);
1635 
1636         rio = zio_root(spa, NULL, &sle,
1637             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1638 
1639         error = traverse_pool(spa, spa->spa_verify_min_txg,
1640             TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1641 
1642         (void) zio_wait(rio);
1643 
1644         spa->spa_load_meta_errors = sle.sle_meta_count;
1645         spa->spa_load_data_errors = sle.sle_data_count;
1646 
1647         if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1648             sle.sle_data_count <= policy.zrp_maxdata) {
1649                 int64_t loss = 0;
1650 
1651                 verify_ok = B_TRUE;
1652                 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1653                 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1654 
1655                 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1656                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1657                     ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1658                 VERIFY(nvlist_add_int64(spa->spa_load_info,
1659                     ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1660                 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1661                     ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1662         } else {
1663                 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1664         }
1665 
1666         if (error) {
1667                 if (error != ENXIO && error != EIO)
1668                         error = EIO;
1669                 return (error);
1670         }
1671 
1672         return (verify_ok ? 0 : EIO);
1673 }
1674 
1675 /*
1676  * Find a value in the pool props object.
1677  */
1678 static void
1679 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1680 {
1681         (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1682             zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1683 }
1684 
1685 /*
1686  * Find a value in the pool directory object.
1687  */
1688 static int
1689 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1690 {
1691         return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1692             name, sizeof (uint64_t), 1, val));
1693 }
1694 
1695 static int
1696 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1697 {
1698         vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1699         return (err);
1700 }
1701 
1702 /*
1703  * Fix up config after a partly-completed split.  This is done with the
1704  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
1705  * pool have that entry in their config, but only the splitting one contains
1706  * a list of all the guids of the vdevs that are being split off.
1707  *
1708  * This function determines what to do with that list: either rejoin
1709  * all the disks to the pool, or complete the splitting process.  To attempt
1710  * the rejoin, each disk that is offlined is marked online again, and
1711  * we do a reopen() call.  If the vdev label for every disk that was
1712  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1713  * then we call vdev_split() on each disk, and complete the split.
1714  *
1715  * Otherwise we leave the config alone, with all the vdevs in place in
1716  * the original pool.
1717  */
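
/*
 * For illustration, the ZPOOL_CONFIG_SPLIT nvlist consulted below has
 * roughly this shape (hypothetical guids; a zero entry marks a hole
 * vdev):
 *
 *	ZPOOL_CONFIG_SPLIT {
 *		ZPOOL_CONFIG_SPLIT_LIST = [ 0x1a2b3c..., 0x0, 0x4d5e6f... ]
 *		...
 *	}
 */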
1718 static void
1719 spa_try_repair(spa_t *spa, nvlist_t *config)
1720 {
1721         uint_t extracted;
1722         uint64_t *glist;
1723         uint_t i, gcount;
1724         nvlist_t *nvl;
1725         vdev_t **vd;
1726         boolean_t attempt_reopen;
1727 
1728         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1729                 return;
1730 
1731         /* check that the config is complete */
1732         if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1733             &glist, &gcount) != 0)
1734                 return;
1735 
1736         vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1737 
1738         /* attempt to online all the vdevs & validate */
1739         attempt_reopen = B_TRUE;
1740         for (i = 0; i < gcount; i++) {
1741                 if (glist[i] == 0)      /* vdev is hole */
1742                         continue;
1743 
1744                 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1745                 if (vd[i] == NULL) {
1746                         /*
1747                          * Don't bother attempting to reopen the disks;
1748                          * just do the split.
1749                          */
1750                         attempt_reopen = B_FALSE;
1751                 } else {
1752                         /* attempt to re-online it */
1753                         vd[i]->vdev_offline = B_FALSE;
1754                 }
1755         }
1756 
1757         if (attempt_reopen) {
1758                 vdev_reopen(spa->spa_root_vdev);
1759 
1760                 /* check each device to see what state it's in */
1761                 for (extracted = 0, i = 0; i < gcount; i++) {
1762                         if (vd[i] != NULL &&
1763                             vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1764                                 break;
1765                         ++extracted;
1766                 }
1767         }
1768 
1769         /*
1770          * If every disk has been moved to the new pool, or if we never
1771          * even attempted to look at them, then we split them off for
1772          * good.
1773          */
1774         if (!attempt_reopen || gcount == extracted) {
1775                 for (i = 0; i < gcount; i++)
1776                         if (vd[i] != NULL)
1777                                 vdev_split(vd[i]);
1778                 vdev_reopen(spa->spa_root_vdev);
1779         }
1780 
1781         kmem_free(vd, gcount * sizeof (vdev_t *));
1782 }
1783 
1784 static int
1785 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1786     boolean_t mosconfig)
1787 {
1788         nvlist_t *config = spa->spa_config;
1789         char *ereport = FM_EREPORT_ZFS_POOL;
1790         char *comment;
1791         int error;
1792         uint64_t pool_guid;
1793         nvlist_t *nvl;
1794 
1795         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1796                 return (EINVAL);
1797 
1798         ASSERT(spa->spa_comment == NULL);
1799         if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1800                 spa->spa_comment = spa_strdup(comment);
1801 
1802         /*
1803          * Versioning wasn't explicitly added to the label until later, so if
1804          * it's not present treat it as the initial version.
1805          */
1806         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1807             &spa->spa_ubsync.ub_version) != 0)
1808                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1809 
1810         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1811             &spa->spa_config_txg);
1812 
1813         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1814             spa_guid_exists(pool_guid, 0)) {
1815                 error = EEXIST;
1816         } else {
1817                 spa->spa_config_guid = pool_guid;
1818 
1819                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1820                     &nvl) == 0) {
1821                         VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1822                             KM_SLEEP) == 0);
1823                 }
1824 
1825                 gethrestime(&spa->spa_loaded_ts);
1826                 error = spa_load_impl(spa, pool_guid, config, state, type,
1827                     mosconfig, &ereport);
1828         }
1829 
1830         spa->spa_minref = refcount_count(&spa->spa_refcount);
1831         if (error) {
1832                 if (error != EEXIST) {
1833                         spa->spa_loaded_ts.tv_sec = 0;
1834                         spa->spa_loaded_ts.tv_nsec = 0;
1835                 }
1836                 if (error != EBADF) {
1837                         zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1838                 }
1839         }
1840         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1841         spa->spa_ena = 0;
1842 
1843         return (error);
1844 }
1845 
1846 /*
1847  * Load an existing storage pool, using the pool's builtin spa_config as a
1848  * source of configuration information.
1849  */
1850 static int
1851 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1852     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1853     char **ereport)
1854 {
1855         int error = 0;
1856         nvlist_t *nvroot = NULL;
1857         vdev_t *rvd;
1858         uberblock_t *ub = &spa->spa_uberblock;
1859         uint64_t children, config_cache_txg = spa->spa_config_txg;
1860         int orig_mode = spa->spa_mode;
1861         int parse;
1862         uint64_t obj;
1863 
1864         /*
1865          * If this is an untrusted config, access the pool in read-only mode.
1866          * This prevents things like resilvering recently removed devices.
1867          */
1868         if (!mosconfig)
1869                 spa->spa_mode = FREAD;
1870 
1871         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1872 
1873         spa->spa_load_state = state;
1874 
1875         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1876                 return (EINVAL);
1877 
1878         parse = (type == SPA_IMPORT_EXISTING ?
1879             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1880 
1881         /*
1882          * Create "The Godfather" zio to hold all async IOs
1883          */
1884         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1885             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1886 
1887         /*
1888          * Parse the configuration into a vdev tree.  We explicitly set the
1889          * value that will be returned by spa_version() since parsing the
1890          * configuration requires knowing the version number.
1891          */
1892         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1893         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1894         spa_config_exit(spa, SCL_ALL, FTAG);
1895 
1896         if (error != 0)
1897                 return (error);
1898 
1899         ASSERT(spa->spa_root_vdev == rvd);
1900 
1901         if (type != SPA_IMPORT_ASSEMBLE) {
1902                 ASSERT(spa_guid(spa) == pool_guid);
1903         }
1904 
1905         /*
1906          * Try to open all vdevs, loading each label in the process.
1907          */
1908         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1909         error = vdev_open(rvd);
1910         spa_config_exit(spa, SCL_ALL, FTAG);
1911         if (error != 0)
1912                 return (error);
1913 
1914         /*
1915          * We need to validate the vdev labels against the configuration that
1916          * we have in hand, which is dependent on the setting of mosconfig. If
1917          * mosconfig is true then we're validating the vdev labels based on
1918          * that config.  Otherwise, we're validating against the cached config
1919          * (zpool.cache) that was read when we loaded the zfs module, and then
1920          * later we will recursively call spa_load() and validate against
1921          * the vdev config.
1922          *
1923          * If we're assembling a new pool that's been split off from an
1924          * existing pool, the labels haven't yet been updated so we skip
1925          * validation for now.
1926          */
1927         if (type != SPA_IMPORT_ASSEMBLE) {
1928                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1929                 error = vdev_validate(rvd);
1930                 spa_config_exit(spa, SCL_ALL, FTAG);
1931 
1932                 if (error != 0)
1933                         return (error);
1934 
1935                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1936                         return (ENXIO);
1937         }
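
        /*
         * In outline, the untrusted-config path that begins further
         * below (the "if (!mosconfig)" block) works as a two-pass
         * load; this is a sketch of the flow, not new behavior:
         *
         *      spa_load_impl(mosconfig == B_FALSE)     read-only (FREAD)
         *          -> load_nvlist(spa_config_object)   read the MOS config
         *          -> spa_config_set(spa, nvconfig)
         *          -> spa_load(..., B_TRUE)            reload, this time
         *                                              trusting the MOS config
         */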
1938 
1939         /*
1940          * Find the best uberblock.
1941          */
1942         vdev_uberblock_load(NULL, rvd, ub);
1943 
1944         /*
1945          * If we weren't able to find a single valid uberblock, return failure.
1946          */
1947         if (ub->ub_txg == 0)
1948                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1949 
1950         /*
1951          * If the pool is newer than the code, we can't open it.
1952          */
1953         if (ub->ub_version > SPA_VERSION)
1954                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1955 
1956         /*
1957          * If the vdev guid sum doesn't match the uberblock, we have an
1958          * incomplete configuration.  We first check to see if the pool
1959  * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
1960  * If it is, defer the vdev_guid_sum check until later so we
1961          * can handle missing vdevs.
1962          */
1963         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1964             &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1965             rvd->vdev_guid_sum != ub->ub_guid_sum)
1966                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1967 
1968         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1969                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1970                 spa_try_repair(spa, config);
1971                 spa_config_exit(spa, SCL_ALL, FTAG);
1972                 nvlist_free(spa->spa_config_splitting);
1973                 spa->spa_config_splitting = NULL;
1974         }
1975 
1976         /*
1977          * Initialize internal SPA structures.
1978          */
1979         spa->spa_state = POOL_STATE_ACTIVE;
1980         spa->spa_ubsync = spa->spa_uberblock;
1981         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1982             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1983         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1984             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
1985         spa->spa_claim_max_txg = spa->spa_first_txg;
1986         spa->spa_prev_software_version = ub->ub_software_version;
1987 
1988         error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1989         if (error)
1990                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1991         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1992 
1993         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
1994                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1995 
1996         if (!mosconfig) {
1997                 uint64_t hostid;
1998                 nvlist_t *policy = NULL, *nvconfig;
1999 
2000                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2001                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2002 
2003                 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2004                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2005                         char *hostname;
2006                         unsigned long myhostid = 0;
2007 
2008                         VERIFY(nvlist_lookup_string(nvconfig,
2009                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2010 
2011 #ifdef  _KERNEL
2012                         myhostid = zone_get_hostid(NULL);
2013 #else   /* _KERNEL */
2014                         /*
2015                          * We're emulating the system's hostid in userland, so
2016                          * we can't use zone_get_hostid().
2017                          */
2018                         (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2019 #endif  /* _KERNEL */
2020                         if (hostid != 0 && myhostid != 0 &&
2021                             hostid != myhostid) {
2022                                 nvlist_free(nvconfig);
2023                                 cmn_err(CE_WARN, "pool '%s' could not be "
2024                                     "loaded as it was last accessed by "
2025                                     "another system (host: %s hostid: 0x%lx). "
2026                                     "See: http://www.sun.com/msg/ZFS-8000-EY",
2027                                     spa_name(spa), hostname,
2028                                     (unsigned long)hostid);
2029                                 return (EBADF);
2030                         }
2031                 }
2032                 if (nvlist_lookup_nvlist(spa->spa_config,
2033                     ZPOOL_REWIND_POLICY, &policy) == 0)
2034                         VERIFY(nvlist_add_nvlist(nvconfig,
2035                             ZPOOL_REWIND_POLICY, policy) == 0);
2036 
2037                 spa_config_set(spa, nvconfig);
2038                 spa_unload(spa);
2039                 spa_deactivate(spa);
2040                 spa_activate(spa, orig_mode);
2041 
2042                 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2043         }
2044 
2045         if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2046                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2047         error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2048         if (error != 0)
2049                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2050 
2051         /*
2052          * Load the bit that tells us to use the new accounting function
2053          * (raid-z deflation).  If we have an older pool, this will not
2054          * be present.
2055          */
2056         error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2057         if (error != 0 && error != ENOENT)
2058                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2059 
2060         error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2061             &spa->spa_creation_version);
2062         if (error != 0 && error != ENOENT)
2063                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2064 
2065         /*
2066          * Load the persistent error log.  If we have an older pool, this will
2067          * not be present.
2068          */
2069         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2070         if (error != 0 && error != ENOENT)
2071                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2072 
2073         error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2074             &spa->spa_errlog_scrub);
2075         if (error != 0 && error != ENOENT)
2076                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2077 
2078         /*
2079          * Load the history object.  If we have an older pool, this
2080          * will not be present.
2081          */
2082         error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2083         if (error != 0 && error != ENOENT)
2084                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2085 
2086         /*
2087          * If we're assembling the pool from the split-off vdevs of
2088          * an existing pool, we don't want to attach the spares & cache
2089          * devices.
2090          */
2091 
2092         /*
2093          * Load any hot spares for this pool.
2094          */
2095         error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2096         if (error != 0 && error != ENOENT)
2097                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2098         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2099                 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2100                 if (load_nvlist(spa, spa->spa_spares.sav_object,
2101                     &spa->spa_spares.sav_config) != 0)
2102                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2103 
2104                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2105                 spa_load_spares(spa);
2106                 spa_config_exit(spa, SCL_ALL, FTAG);
2107         } else if (error == 0) {
2108                 spa->spa_spares.sav_sync = B_TRUE;
2109         }
2110 
2111         /*
2112          * Load any level 2 ARC devices for this pool.
2113          */
2114         error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2115             &spa->spa_l2cache.sav_object);
2116         if (error != 0 && error != ENOENT)
2117                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2118         if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2119                 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2120                 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2121                     &spa->spa_l2cache.sav_config) != 0)
2122                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2123 
2124                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2125                 spa_load_l2cache(spa);
2126                 spa_config_exit(spa, SCL_ALL, FTAG);
2127         } else if (error == 0) {
2128                 spa->spa_l2cache.sav_sync = B_TRUE;
2129         }
2130 
2131         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2132 
2133         error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2134         if (error && error != ENOENT)
2135                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2136 
2137         if (error == 0) {
2138                 uint64_t autoreplace;
2139 
2140                 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2141                 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2142                 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2143                 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2144                 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2145                 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2146                     &spa->spa_dedup_ditto);
2147 
2148                 spa->spa_autoreplace = (autoreplace != 0);
2149         }
2150 
2151         /*
2152          * If the 'autoreplace' property is set, then post a resource notifying
2153          * the ZFS DE that it should not issue any faults for unopenable
2154          * devices.  We also iterate over the vdevs, and post a sysevent for any
2155          * unopenable vdevs so that the normal autoreplace handler can take
2156          * over.
2157          */
2158         if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2159                 spa_check_removed(spa->spa_root_vdev);
2160                 /*
2161                  * For the import case, this is done in spa_import(), because
2162                  * at this point we're using the spare definitions from
2163                  * the MOS config, not necessarily from the userland config.
2164                  */
2165                 if (state != SPA_LOAD_IMPORT) {
2166                         spa_aux_check_removed(&spa->spa_spares);
2167                         spa_aux_check_removed(&spa->spa_l2cache);
2168                 }
2169         }
2170 
2171         /*
2172          * Load the vdev state for all toplevel vdevs.
2173          */
2174         vdev_load(rvd);
2175 
2176         /*
2177          * Propagate the leaf DTLs we just loaded all the way up the tree.
2178          */
2179         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2180         vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2181         spa_config_exit(spa, SCL_ALL, FTAG);
2182 
2183         /*
2184          * Load the DDTs (dedup tables).
2185          */
2186         error = ddt_load(spa);
2187         if (error != 0)
2188                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2189 
2190         spa_update_dspace(spa);
2191 
2192         /*
2193          * Validate the config, using the MOS config to fill in any
2194          * information which might be missing.  If we fail to validate
2195          * the config then declare the pool unfit for use. If we're
2196          * assembling a pool from a split, the log is not transferred
2197          * over.
2198          */
2199         if (type != SPA_IMPORT_ASSEMBLE) {
2200                 nvlist_t *nvconfig;
2201 
2202                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2203                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2204 
2205                 if (!spa_config_valid(spa, nvconfig)) {
2206                         nvlist_free(nvconfig);
2207                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2208                             ENXIO));
2209                 }
2210                 nvlist_free(nvconfig);
2211 
2212                 /*
2213                  * Now that we've validated the config, check the state of the
2214                  * root vdev.  If it can't be opened, it indicates one or
2215                  * more toplevel vdevs are faulted.
2216                  */
2217                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2218                         return (ENXIO);
2219 
2220                 if (spa_check_logs(spa)) {
2221                         *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2222                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2223                 }
2224         }
2225 
2226         /*
2227          * We've successfully opened the pool, verify that we're ready
2228          * to start pushing transactions.
2229          */
2230         if (state != SPA_LOAD_TRYIMPORT) {
2231                 if ((error = spa_load_verify(spa)) != 0)
2232                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2233                             error));
2234         }
2235 
2236         if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2237             spa->spa_load_max_txg == UINT64_MAX)) {
2238                 dmu_tx_t *tx;
2239                 int need_update = B_FALSE;
2240 
2241                 ASSERT(state != SPA_LOAD_TRYIMPORT);
2242 
2243                 /*
2244                  * Claim log blocks that haven't been committed yet.
2245                  * This must all happen in a single txg.
2246                  * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2247                  * invoked from zil_claim_log_block()'s i/o done callback.
2248                  * Price of rollback is that we abandon the log.
2249                  */
2250                 spa->spa_claiming = B_TRUE;
2251 
2252                 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2253                     spa_first_txg(spa));
2254                 (void) dmu_objset_find(spa_name(spa),
2255                     zil_claim, tx, DS_FIND_CHILDREN);
2256                 dmu_tx_commit(tx);
2257 
2258                 spa->spa_claiming = B_FALSE;
2259 
2260                 spa_set_log_state(spa, SPA_LOG_GOOD);
2261                 spa->spa_sync_on = B_TRUE;
2262                 txg_sync_start(spa->spa_dsl_pool);
2263 
2264                 /*
2265                  * Wait for all claims to sync.  We sync up to the highest
2266                  * claimed log block birth time so that claimed log blocks
2267                  * don't appear to be from the future.  spa_claim_max_txg
2268                  * will have been set for us by either zil_check_log_chain()
2269                  * (invoked from spa_check_logs()) or zil_claim() above.
2270                  */
2271                 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2272 
2273                 /*
2274                  * If the config cache is stale, or we have uninitialized
2275                  * metaslabs (see spa_vdev_add()), then update the config.
2276                  *
2277                  * If this is a verbatim import, trust the current
2278                  * in-core spa_config and update the disk labels.
2279                  */
2280                 if (config_cache_txg != spa->spa_config_txg ||
2281                     state == SPA_LOAD_IMPORT ||
2282                     state == SPA_LOAD_RECOVER ||
2283                     (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2284                         need_update = B_TRUE;
2285 
2286                 for (int c = 0; c < rvd->vdev_children; c++)
2287                         if (rvd->vdev_child[c]->vdev_ms_array == 0)
2288                                 need_update = B_TRUE;
2289 
2290                 /*
2291                  * Update the config cache asynchronously in case we're the
2292                  * root pool, in which case the config cache isn't writable yet.
2293                  */
2294                 if (need_update)
2295                         spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2296 
2297                 /*
2298                  * Check all DTLs to see if anything needs resilvering.
2299                  */
2300                 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2301                     vdev_resilver_needed(rvd, NULL, NULL))
2302                         spa_async_request(spa, SPA_ASYNC_RESILVER);
2303 
2304                 /*
2305                  * Delete any inconsistent datasets.
2306                  */
2307                 (void) dmu_objset_find(spa_name(spa),
2308                     dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2309 
2310                 /*
2311                  * Clean up any stale temporary dataset userrefs.
2312                  */
2313                 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2314         }
2315 
2316         return (0);
2317 }
2318 
2319 static int
2320 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2321 {
2322         int mode = spa->spa_mode;
2323 
2324         spa_unload(spa);
2325         spa_deactivate(spa);
2326 
2327         spa->spa_load_max_txg--;
2328 
2329         spa_activate(spa, mode);
2330         spa_async_suspend(spa);
2331 
2332         return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2333 }
2334 
2335 static int
2336 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2337     uint64_t max_request, int rewind_flags)
2338 {
2339         nvlist_t *config = NULL;
2340         int load_error, rewind_error;
2341         uint64_t safe_rewind_txg;
2342         uint64_t min_txg;
2343 
2344         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2345                 spa->spa_load_max_txg = spa->spa_load_txg;
2346                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2347         } else {
2348                 spa->spa_load_max_txg = max_request;
2349         }
2350 
2351         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2352             mosconfig);
2353         if (load_error == 0)
2354                 return (0);
2355 
2356         if (spa->spa_root_vdev != NULL)
2357                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2358 
2359         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2360         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2361 
2362         if (rewind_flags & ZPOOL_NEVER_REWIND) {
2363                 nvlist_free(config);
2364                 return (load_error);
2365         }
2366 
2367         /* Price of rolling back is discarding txgs, including log */
2368         if (state == SPA_LOAD_RECOVER)
2369                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2370 
2371         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2372         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2373         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2374             TXG_INITIAL : safe_rewind_txg;
2375 
2376         /*
2377          * Continue as long as we're finding errors, we're still within
2378          * the acceptable rewind range, and we're still finding uberblocks
2379          */
2380         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2381             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2382                 if (spa->spa_load_max_txg < safe_rewind_txg)
2383                         spa->spa_extreme_rewind = B_TRUE;
2384                 rewind_error = spa_load_retry(spa, state, mosconfig);
2385         }
2386 
2387         spa->spa_extreme_rewind = B_FALSE;
2388         spa->spa_load_max_txg = UINT64_MAX;
2389 
2390         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2391                 spa_config_set(spa, config);
2392 
2393         return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
2394 }
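
/*
 * Worked example of the rewind window above (hypothetical numbers,
 * assuming TXG_DEFER_SIZE is 2): if the last synced uberblock is txg
 * 100, then safe_rewind_txg is 98, and a normal rewind retries with
 * spa_load_max_txg of 99, then 98.  Only with ZPOOL_EXTREME_REWIND
 * does the loop push spa_load_max_txg below safe_rewind_txg (setting
 * spa_extreme_rewind) and keep trying down toward TXG_INITIAL.
 */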
2395 
2396 /*
2397  * Pool Open/Import
2398  *
2399  * The import case is identical to an open except that the configuration is sent
2400  * down from userland, instead of grabbed from the configuration cache.  For the
2401  * case of an open, the pool configuration will exist in the
2402  * POOL_STATE_UNINITIALIZED state.
2403  *
2404  * The stats information (gen/count/ustats) is used to gather vdev statistics at
2405  * the same time we open the pool, without having to keep the spa_t around in some
2406  * ambiguous state.
2407  */
2408 static int
2409 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2410     nvlist_t **config)
2411 {
2412         spa_t *spa;
2413         spa_load_state_t state = SPA_LOAD_OPEN;
2414         int error;
2415         int locked = B_FALSE;
2416 
2417         *spapp = NULL;
2418 
2419         /*
2420          * As disgusting as this is, we need to support recursive calls to this
2421          * function because dsl_dir_open() is called during spa_load(), and ends
2422          * up calling spa_open() again.  The real fix is to figure out how to
2423          * avoid dsl_dir_open() calling this in the first place.
2424          */
2425         if (mutex_owner(&spa_namespace_lock) != curthread) {
2426                 mutex_enter(&spa_namespace_lock);
2427                 locked = B_TRUE;
2428         }
2429 
2430         if ((spa = spa_lookup(pool)) == NULL) {
2431                 if (locked)
2432                         mutex_exit(&spa_namespace_lock);
2433                 return (ENOENT);
2434         }
2435 
2436         if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2437                 zpool_rewind_policy_t policy;
2438 
2439                 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2440                     &policy);
2441                 if (policy.zrp_request & ZPOOL_DO_REWIND)
2442                         state = SPA_LOAD_RECOVER;
2443 
2444                 spa_activate(spa, spa_mode_global);
2445 
2446                 if (state != SPA_LOAD_RECOVER)
2447                         spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2448 
2449                 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2450                     policy.zrp_request);
2451 
2452                 if (error == EBADF) {
2453                          * If vdev_validate() returns failure (indicated by
2454                          * EBADF), then one of the vdev labels indicates that
2455                          * the pool has been exported or destroyed.  If
2456                          * that the pool has been exported or destroyed.  If
2457                          * this is the case, the config cache is out of sync and
2458                          * we should remove the pool from the namespace.
2459                          */
2460                         spa_unload(spa);
2461                         spa_deactivate(spa);
2462                         spa_config_sync(spa, B_TRUE, B_TRUE);
2463                         spa_remove(spa);
2464                         if (locked)
2465                                 mutex_exit(&spa_namespace_lock);
2466                         return (ENOENT);
2467                 }
2468 
2469                 if (error) {
2470                         /*
2471                          * We can't open the pool, but we still have useful
2472                          * information: the state of each vdev after the
2473                          * attempted vdev_open().  Return this to the user.
2474                          */
2475                         if (config != NULL && spa->spa_config) {
2476                                 VERIFY(nvlist_dup(spa->spa_config, config,
2477                                     KM_SLEEP) == 0);
2478                                 VERIFY(nvlist_add_nvlist(*config,
2479                                     ZPOOL_CONFIG_LOAD_INFO,
2480                                     spa->spa_load_info) == 0);
2481                         }
2482                         spa_unload(spa);
2483                         spa_deactivate(spa);
2484                         spa->spa_last_open_failed = error;
2485                         if (locked)
2486                                 mutex_exit(&spa_namespace_lock);
2487                         *spapp = NULL;
2488                         return (error);
2489                 }
2490         }
2491 
2492         spa_open_ref(spa, tag);
2493 
2494         if (config != NULL)
2495                 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2496 
2497         /*
2498          * If we've recovered the pool, pass back any information we
2499          * gathered while doing the load.
2500          */
2501         if (state == SPA_LOAD_RECOVER) {
2502                 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2503                     spa->spa_load_info) == 0);
2504         }
2505 
2506         if (locked) {
2507                 spa->spa_last_open_failed = 0;
2508                 spa->spa_last_ubsync_txg = 0;
2509                 spa->spa_load_txg = 0;
2510                 mutex_exit(&spa_namespace_lock);
2511         }
2512 
2513         *spapp = spa;
2514 
2515         return (0);
2516 }
2517 
2518 int
2519 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2520     nvlist_t **config)
2521 {
2522         return (spa_open_common(name, spapp, tag, policy, config));
2523 }
2524 
2525 int
2526 spa_open(const char *name, spa_t **spapp, void *tag)
2527 {
2528         return (spa_open_common(name, spapp, tag, NULL, NULL));
2529 }
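
/*
 * Typical consumer pattern for the open interfaces above (a minimal
 * sketch; "tank" is a hypothetical pool name):
 *
 *	spa_t *spa;
 *	int error;
 *
 *	if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *		return (error);
 *	... operate on the pool ...
 *	spa_close(spa, FTAG);
 */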
2530 
2531 /*
2532  * Look up the given spa_t, incrementing the inject count in the process,
2533  * preventing it from being exported or destroyed.
2534  */
2535 spa_t *
2536 spa_inject_addref(char *name)
2537 {
2538         spa_t *spa;
2539 
2540         mutex_enter(&spa_namespace_lock);
2541         if ((spa = spa_lookup(name)) == NULL) {
2542                 mutex_exit(&spa_namespace_lock);
2543                 return (NULL);
2544         }
2545         spa->spa_inject_ref++;
2546         mutex_exit(&spa_namespace_lock);
2547 
2548         return (spa);
2549 }
2550 
2551 void
2552 spa_inject_delref(spa_t *spa)
2553 {
2554         mutex_enter(&spa_namespace_lock);
2555         spa->spa_inject_ref--;
2556         mutex_exit(&spa_namespace_lock);
2557 }
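
/*
 * Sketch of the intended bracketing for the inject hooks above
 * (error handling elided; the real consumer is the fault-injection
 * ioctl path):
 *
 *	spa_t *spa;
 *
 *	if ((spa = spa_inject_addref(name)) == NULL)
 *		return (ENOENT);
 *	... install or remove injection handlers ...
 *	spa_inject_delref(spa);
 */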
2558 
2559 /*
2560  * Add spare device information to the nvlist.
2561  */
2562 static void
2563 spa_add_spares(spa_t *spa, nvlist_t *config)
2564 {
2565         nvlist_t **spares;
2566         uint_t i, nspares;
2567         nvlist_t *nvroot;
2568         uint64_t guid;
2569         vdev_stat_t *vs;
2570         uint_t vsc;
2571         uint64_t pool;
2572 
2573         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2574 
2575         if (spa->spa_spares.sav_count == 0)
2576                 return;
2577 
2578         VERIFY(nvlist_lookup_nvlist(config,
2579             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2580         VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2581             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2582         if (nspares != 0) {
2583                 VERIFY(nvlist_add_nvlist_array(nvroot,
2584                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2585                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2586                     ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2587 
2588                 /*
2589                  * Go through and find any spares which have since been
2590                  * repurposed as active spares.  If this is the case, update
2591                  * their status appropriately.
2592                  */
2593                 for (i = 0; i < nspares; i++) {
2594                         VERIFY(nvlist_lookup_uint64(spares[i],
2595                             ZPOOL_CONFIG_GUID, &guid) == 0);
2596                         if (spa_spare_exists(guid, &pool, NULL) &&
2597                             pool != 0ULL) {
2598                                 VERIFY(nvlist_lookup_uint64_array(
2599                                     spares[i], ZPOOL_CONFIG_VDEV_STATS,
2600                                     (uint64_t **)&vs, &vsc) == 0);
2601                                 vs->vs_state = VDEV_STATE_CANT_OPEN;
2602                                 vs->vs_aux = VDEV_AUX_SPARED;
2603                         }
2604                 }
2605         }
2606 }
2607 
2608 /*
2609  * Add l2cache device information to the nvlist, including vdev stats.
2610  */
2611 static void
2612 spa_add_l2cache(spa_t *spa, nvlist_t *config)
2613 {
2614         nvlist_t **l2cache;
2615         uint_t i, j, nl2cache;
2616         nvlist_t *nvroot;
2617         uint64_t guid;
2618         vdev_t *vd;
2619         vdev_stat_t *vs;
2620         uint_t vsc;
2621 
2622         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2623 
2624         if (spa->spa_l2cache.sav_count == 0)
2625                 return;
2626 
2627         VERIFY(nvlist_lookup_nvlist(config,
2628             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2629         VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
2630             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2631         if (nl2cache != 0) {
2632                 VERIFY(nvlist_add_nvlist_array(nvroot,
2633                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2634                 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2635                     ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2636 
2637                 /*
2638                  * Update level 2 cache device stats.
2639                  */
2640 
2641                 for (i = 0; i < nl2cache; i++) {
2642                         VERIFY(nvlist_lookup_uint64(l2cache[i],
2643                             ZPOOL_CONFIG_GUID, &guid) == 0);
2644 
2645                         vd = NULL;
2646                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2647                                 if (guid ==
2648                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2649                                         vd = spa->spa_l2cache.sav_vdevs[j];
2650                                         break;
2651                                 }
2652                         }
2653                         ASSERT(vd != NULL);
2654 
2655                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2656                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2657                             == 0);
2658                         vdev_get_stats(vd, vs);
2659                 }
2660         }
2661 }
2662 
2663 int
2664 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
2665 {
2666         int error;
2667         spa_t *spa;
2668 
2669         *config = NULL;
2670         error = spa_open_common(name, &spa, FTAG, NULL, config);
2671 
2672         if (spa != NULL) {
2673                 /*
2674                  * This still leaves a window of inconsistency where the spares
2675                  * or l2cache devices could change and the config would be
2676                  * self-inconsistent.
2677                  */
2678                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2679 
2680                 if (*config != NULL) {
2681                         uint64_t loadtimes[2];
2682 
2683                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2684                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2685                         VERIFY(nvlist_add_uint64_array(*config,
2686                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2687 
2688                         VERIFY(nvlist_add_uint64(*config,
2689                             ZPOOL_CONFIG_ERRCOUNT,
2690                             spa_get_errlog_size(spa)) == 0);
2691 
2692                         if (spa_suspended(spa))
2693                                 VERIFY(nvlist_add_uint64(*config,
2694                                     ZPOOL_CONFIG_SUSPENDED,
2695                                     spa->spa_failmode) == 0);
2696 
2697                         spa_add_spares(spa, *config);
2698                         spa_add_l2cache(spa, *config);
2699                 }
2700         }
2701 
2702         /*
2703          * We want to get the alternate root even for faulted pools, so we cheat
2704          * and call spa_lookup() directly.
2705          */
2706         if (altroot) {
2707                 if (spa == NULL) {
2708                         mutex_enter(&spa_namespace_lock);
2709                         spa = spa_lookup(name);
2710                         if (spa)
2711                                 spa_altroot(spa, altroot, buflen);
2712                         else
2713                                 altroot[0] = '\0';
2714                         spa = NULL;
2715                         mutex_exit(&spa_namespace_lock);
2716                 } else {
2717                         spa_altroot(spa, altroot, buflen);
2718                 }
2719         }
2720 
2721         if (spa != NULL) {
2722                 spa_config_exit(spa, SCL_CONFIG, FTAG);
2723                 spa_close(spa, FTAG);
2724         }
2725 
2726         return (error);
2727 }
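
/*
 * Note for callers of spa_get_stats(): *config may come back non-NULL
 * even when an error is returned (e.g. for a faulted pool), so a
 * consumer should be prepared to consume and free it either way.  A
 * hedged sketch of the bracketing:
 *
 *	nvlist_t *config;
 *	int error = spa_get_stats(name, &config, altroot, buflen);
 *
 *	if (config != NULL) {
 *		... hand the config back to userland ...
 *		nvlist_free(config);
 *	}
 */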
2728 
2729 /*
2730  * Validate that the auxiliary device array is well formed.  We must have an
2731  * array of nvlists, each of which describes a valid leaf vdev.  If this is an
2732  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
2733  * specified, as long as they are well-formed.
2734  */
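
/*
 * For example, a request that adds a single spare arrives as an
 * nvroot of roughly this shape (hypothetical device path; the
 * "config" argument below selects the array, e.g. ZPOOL_CONFIG_SPARES):
 *
 *	vdev_tree {
 *		ZPOOL_CONFIG_SPARES = [
 *			{ type = "disk", path = "/dev/dsk/c1t2d0s0", ... }
 *		]
 *	}
 */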
2735 static int
2736 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
2737     spa_aux_vdev_t *sav, const char *config, uint64_t version,
2738     vdev_labeltype_t label)
2739 {
2740         nvlist_t **dev;
2741         uint_t i, ndev;
2742         vdev_t *vd;
2743         int error;
2744 
2745         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2746 
2747         /*
2748          * It's acceptable to have no devs specified.
2749          */
2750         if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
2751                 return (0);
2752 
2753         if (ndev == 0)
2754                 return (EINVAL);
2755 
2756         /*
2757          * Make sure the pool is formatted with a version that supports this
2758          * device type.
2759          */
2760         if (spa_version(spa) < version)
2761                 return (ENOTSUP);
2762 
2763         /*
2764          * Set the pending device list so we correctly handle device in-use
2765          * checking.
2766          */
2767         sav->sav_pending = dev;
2768         sav->sav_npending = ndev;
2769 
2770         for (i = 0; i < ndev; i++) {
2771                 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
2772                     mode)) != 0)
2773                         goto out;
2774 
2775                 if (!vd->vdev_ops->vdev_op_leaf) {
2776                         vdev_free(vd);
2777                         error = EINVAL;
2778                         goto out;
2779                 }
2780 
2781                 /*
2782                  * The L2ARC currently only supports disk devices in
2783                  * kernel context.  For user-level testing, any device is allowed.
2784                  */
2785 #ifdef _KERNEL
2786                 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
2787                     strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
2788                         error = ENOTBLK;
2789                         goto out;
2790                 }
2791 #endif
2792                 vd->vdev_top = vd;
2793 
2794                 if ((error = vdev_open(vd)) == 0 &&
2795                     (error = vdev_label_init(vd, crtxg, label)) == 0) {
2796                         VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
2797                             vd->vdev_guid) == 0);
2798                 }
2799 
2800                 vdev_free(vd);
2801 
2802                 if (error &&
2803                     (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
2804                         goto out;
2805                 else
2806                         error = 0;
2807         }
2808 
2809 out:
2810         sav->sav_pending = NULL;
2811         sav->sav_npending = 0;
2812         return (error);
2813 }
2814 
2815 static int
2816 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2817 {
2818         int error;
2819 
2820         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2821 
2822         if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2823             &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2824             VDEV_LABEL_SPARE)) != 0) {
2825                 return (error);
2826         }
2827 
2828         return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2829             &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2830             VDEV_LABEL_L2CACHE));
2831 }
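
     /*
      * Illustrative sketch (not part of the pool logic): a caller such as
      * zfs_ioc_vdev_add() hands these routines an nvroot whose aux arrays
      * were built roughly as follows; the device path is hypothetical:
      *
      *         nvlist_t *spare;
      *
      *         VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
      *         VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
      *             VDEV_TYPE_DISK) == 0);
      *         VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH,
      *             "/dev/dsk/c0t1d0s0") == 0);
      *         VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
      *             &spare, 1) == 0);
      *
      * spa_validate_aux_devs() parses each array element with
      * spa_config_parse(), insists that it describe a leaf vdev, and
      * stamps the resulting GUID back into the caller's nvlist via
      * vdev_label_init().
      */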
2832 
2833 static void
2834 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2835     const char *config)
2836 {
2837         int i;
2838 
2839         if (sav->sav_config != NULL) {
2840                 nvlist_t **olddevs;
2841                 uint_t oldndevs;
2842                 nvlist_t **newdevs;
2843 
2844                 /*
2845                  * Generate new dev list by concatenating with the
2846                  * current dev list.
2847                  */
2848                 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2849                     &olddevs, &oldndevs) == 0);
2850 
2851                 newdevs = kmem_alloc(sizeof (void *) *
2852                     (ndevs + oldndevs), KM_SLEEP);
2853                 for (i = 0; i < oldndevs; i++)
2854                         VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2855                             KM_SLEEP) == 0);
2856                 for (i = 0; i < ndevs; i++)
2857                         VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2858                             KM_SLEEP) == 0);
2859 
2860                 VERIFY(nvlist_remove(sav->sav_config, config,
2861                     DATA_TYPE_NVLIST_ARRAY) == 0);
2862 
2863                 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2864                     config, newdevs, ndevs + oldndevs) == 0);
2865                 for (i = 0; i < oldndevs + ndevs; i++)
2866                         nvlist_free(newdevs[i]);
2867                 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2868         } else {
2869                 /*
2870                  * Generate a new dev list.
2871                  */
2872                 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2873                     KM_SLEEP) == 0);
2874                 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2875                     devs, ndevs) == 0);
2876         }
2877 }
2878 
2879 /*
2880  * Stop and drop level 2 ARC devices
2881  */
2882 void
2883 spa_l2cache_drop(spa_t *spa)
2884 {
2885         vdev_t *vd;
2886         int i;
2887         spa_aux_vdev_t *sav = &spa->spa_l2cache;
2888 
2889         for (i = 0; i < sav->sav_count; i++) {
2890                 uint64_t pool;
2891 
2892                 vd = sav->sav_vdevs[i];
2893                 ASSERT(vd != NULL);
2894 
2895                 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2896                     pool != 0ULL && l2arc_vdev_present(vd))
2897                         l2arc_remove_vdev(vd);
2898                 if (vd->vdev_isl2cache)
2899                         spa_l2cache_remove(vd);
2900                 vdev_clear_stats(vd);
2901                 (void) vdev_close(vd);
2902         }
2903 }
2904 
2905 /*
2906  * Pool Creation
2907  */
2908 int
2909 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2910     const char *history_str, nvlist_t *zplprops)
2911 {
2912         spa_t *spa;
2913         char *altroot = NULL;
2914         vdev_t *rvd;
2915         dsl_pool_t *dp;
2916         dmu_tx_t *tx;
2917         int error = 0;
2918         uint64_t txg = TXG_INITIAL;
2919         nvlist_t **spares, **l2cache;
2920         uint_t nspares, nl2cache;
2921         uint64_t version, obj;
2922 
2923         /*
2924          * If this pool already exists, return failure.
2925          */
2926         mutex_enter(&spa_namespace_lock);
2927         if (spa_lookup(pool) != NULL) {
2928                 mutex_exit(&spa_namespace_lock);
2929                 return (EEXIST);
2930         }
2931 
2932         /*
2933          * Allocate a new spa_t structure.
2934          */
2935         (void) nvlist_lookup_string(props,
2936             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2937         spa = spa_add(pool, NULL, altroot);
2938         spa_activate(spa, spa_mode_global);
2939 
2940         if (props && (error = spa_prop_validate(spa, props))) {
2941                 spa_deactivate(spa);
2942                 spa_remove(spa);
2943                 mutex_exit(&spa_namespace_lock);
2944                 return (error);
2945         }
2946 
2947         if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2948             &version) != 0)
2949                 version = SPA_VERSION;
2950         ASSERT(version <= SPA_VERSION);
2951 
2952         spa->spa_first_txg = txg;
2953         spa->spa_uberblock.ub_txg = txg - 1;
2954         spa->spa_uberblock.ub_version = version;
2955         spa->spa_ubsync = spa->spa_uberblock;
2956 
2957         /*
2958          * Create "The Godfather" zio to hold all async IOs
2959          */
2960         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2961             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2962 
2963         /*
2964          * Create the root vdev.
2965          */
2966         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2967 
2968         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2969 
2970         ASSERT(error != 0 || rvd != NULL);
2971         ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2972 
2973         if (error == 0 && !zfs_allocatable_devs(nvroot))
2974                 error = EINVAL;
2975 
2976         if (error == 0 &&
2977             (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2978             (error = spa_validate_aux(spa, nvroot, txg,
2979             VDEV_ALLOC_ADD)) == 0) {
2980                 for (int c = 0; c < rvd->vdev_children; c++) {
2981                         vdev_metaslab_set_size(rvd->vdev_child[c]);
2982                         vdev_expand(rvd->vdev_child[c], txg);
2983                 }
2984         }
2985 
2986         spa_config_exit(spa, SCL_ALL, FTAG);
2987 
2988         if (error != 0) {
2989                 spa_unload(spa);
2990                 spa_deactivate(spa);
2991                 spa_remove(spa);
2992                 mutex_exit(&spa_namespace_lock);
2993                 return (error);
2994         }
2995 
2996         /*
2997          * Get the list of spares, if specified.
2998          */
2999         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3000             &spares, &nspares) == 0) {
3001                 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3002                     KM_SLEEP) == 0);
3003                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3004                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3005                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3006                 spa_load_spares(spa);
3007                 spa_config_exit(spa, SCL_ALL, FTAG);
3008                 spa->spa_spares.sav_sync = B_TRUE;
3009         }
3010 
3011         /*
3012          * Get the list of level 2 cache devices, if specified.
3013          */
3014         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3015             &l2cache, &nl2cache) == 0) {
3016                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3017                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
3018                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3019                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3020                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3021                 spa_load_l2cache(spa);
3022                 spa_config_exit(spa, SCL_ALL, FTAG);
3023                 spa->spa_l2cache.sav_sync = B_TRUE;
3024         }
3025 
3026         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3027         spa->spa_meta_objset = dp->dp_meta_objset;
3028 
3029         /*
3030          * Create DDTs (dedup tables).
3031          */
3032         ddt_create(spa);
3033 
3034         spa_update_dspace(spa);
3035 
3036         tx = dmu_tx_create_assigned(dp, txg);
3037 
3038         /*
3039          * Create the pool config object.
3040          */
3041         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3042             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3043             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3044 
3045         if (zap_add(spa->spa_meta_objset,
3046             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3047             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3048                 cmn_err(CE_PANIC, "failed to add pool config");
3049         }
3050 
3051         if (zap_add(spa->spa_meta_objset,
3052             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3053             sizeof (uint64_t), 1, &version, tx) != 0) {
3054                 cmn_err(CE_PANIC, "failed to add pool version");
3055         }
3056 
3057         /* Newly created pools with the right version are always deflated. */
3058         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3059                 spa->spa_deflate = TRUE;
3060                 if (zap_add(spa->spa_meta_objset,
3061                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3062                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3063                         cmn_err(CE_PANIC, "failed to add deflate");
3064                 }
3065         }
3066 
3067         /*
3068          * Create the deferred-free bpobj.  Turn off compression
3069          * because sync-to-convergence takes longer if the blocksize
3070          * keeps changing.
3071          */
3072         obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3073         dmu_object_set_compress(spa->spa_meta_objset, obj,
3074             ZIO_COMPRESS_OFF, tx);
3075         if (zap_add(spa->spa_meta_objset,
3076             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3077             sizeof (uint64_t), 1, &obj, tx) != 0) {
3078                 cmn_err(CE_PANIC, "failed to add bpobj");
3079         }
3080         VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3081             spa->spa_meta_objset, obj));
3082 
3083         /*
3084          * Create the pool's history object.
3085          */
3086         if (version >= SPA_VERSION_ZPOOL_HISTORY)
3087                 spa_history_create_obj(spa, tx);
3088 
3089         /*
3090          * Set pool properties.
3091          */
3092         spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3093         spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3094         spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3095         spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3096 
3097         if (props != NULL) {
3098                 spa_configfile_set(spa, props, B_FALSE);
3099                 spa_sync_props(spa, props, tx);
3100         }
3101 
3102         dmu_tx_commit(tx);
3103 
3104         spa->spa_sync_on = B_TRUE;
3105         txg_sync_start(spa->spa_dsl_pool);
3106 
3107         /*
3108          * We explicitly wait for the first transaction to complete so that our
3109          * bean counters are appropriately updated.
3110          */
3111         txg_wait_synced(spa->spa_dsl_pool, txg);
3112 
3113         spa_config_sync(spa, B_FALSE, B_TRUE);
3114 
3115         if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
3116                 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
3117         spa_history_log_version(spa, LOG_POOL_CREATE);
3118 
3119         spa->spa_minref = refcount_count(&spa->spa_refcount);
3120 
3121         mutex_exit(&spa_namespace_lock);
3122 
3123         return (0);
3124 }
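
     /*
      * Illustrative sketch: the ioctl path drives pool creation roughly
      * as follows (the local variable names are assumptions; in the
      * kernel, zfs_ioc_pool_create() unpacks these nvlists from the
      * ioctl before calling in):
      *
      *         nvlist_t *nvroot, *props, *zplprops;
      *         char *history_str;
      *         int error;
      *
      *         error = spa_create("tank", nvroot, props, history_str,
      *             zplprops);
      *
      * On success the pool is live: spa_sync_on is set, the txg sync
      * thread is running, and the on-disk config cache includes the pool.
      */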
3125 
3126 #ifdef _KERNEL
3127 /*
3128  * Get the root pool information from the root disk, then import the root pool
3129  * during system boot.
3130  */
3131 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3132 
3133 static nvlist_t *
3134 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3135 {
3136         nvlist_t *config;
3137         nvlist_t *nvtop, *nvroot;
3138         uint64_t pgid;
3139 
3140         if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3141                 return (NULL);
3142 
3143         /*
3144          * Add this top-level vdev to the child array.
3145          */
3146         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3147             &nvtop) == 0);
3148         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3149             &pgid) == 0);
3150         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3151 
3152         /*
3153          * Put this pool's top-level vdevs into a root vdev.
3154          */
3155         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3156         VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3157             VDEV_TYPE_ROOT) == 0);
3158         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3159         VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3160         VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3161             &nvtop, 1) == 0);
3162 
3163         /*
3164          * Replace the existing vdev_tree with the new root vdev in
3165          * this pool's configuration (remove the old, add the new).
3166          */
3167         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3168         nvlist_free(nvroot);
3169         return (config);
3170 }
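
     /*
      * For illustration, the config returned above has roughly this
      * shape (values hypothetical):
      *
      *         name='rpool'
      *         pool_guid=<pgid>
      *         vdev_tree
      *                 type='root'
      *                 id=0
      *                 guid=<pgid>
      *                 children[0]
      *                         <the label's original top-level vdev,
      *                          e.g. a 'disk' vdev carrying the boot
      *                          device's path>
      */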
3171 
3172 /*
3173  * Walk the vdev tree and see if we can find a device with "better"
3174  * configuration. A configuration is "better" if the label on that
3175  * device has a more recent txg.
3176  */
3177 static void
3178 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3179 {
3180         for (int c = 0; c < vd->vdev_children; c++)
3181                 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3182 
3183         if (vd->vdev_ops->vdev_op_leaf) {
3184                 nvlist_t *label;
3185                 uint64_t label_txg;
3186 
3187                 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3188                     &label) != 0)
3189                         return;
3190 
3191                 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3192                     &label_txg) == 0);
3193 
3194                 /*
3195                  * Do we have a better boot device?
3196                  */
3197                 if (label_txg > *txg) {
3198                         *txg = label_txg;
3199                         *avd = vd;
3200                 }
3201                 nvlist_free(label);
3202         }
3203 }
3204 
3205 /*
3206  * Import a root pool.
3207  *
3208  * For x86, devpath_list will consist of the devid and/or physpath name of
3209  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3210  * The GRUB "findroot" command will return the vdev we should boot.
3211  *
3212  * For Sparc, devpath_list consists of the physpath name of the booting
3213  * device, whether the root pool is a single-device or a mirrored pool.
3214  * e.g.
3215  *      "/pci@1f,0/ide@d/disk@0,0:a"
3216  */
3217 int
3218 spa_import_rootpool(char *devpath, char *devid)
3219 {
3220         spa_t *spa;
3221         vdev_t *rvd, *bvd, *avd = NULL;
3222         nvlist_t *config, *nvtop;
3223         uint64_t guid, txg;
3224         char *pname;
3225         int error;
3226 
3227         /*
3228          * Read the label from the boot device and generate a configuration.
3229          */
3230         config = spa_generate_rootconf(devpath, devid, &guid);
3231 #if defined(_OBP) && defined(_KERNEL)
3232         if (config == NULL) {
3233                 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3234                         /* iscsi boot */
3235                         get_iscsi_bootpath_phy(devpath);
3236                         config = spa_generate_rootconf(devpath, devid, &guid);
3237                 }
3238         }
3239 #endif
3240         if (config == NULL) {
3241                 cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
3242                     devpath);
3243                 return (EIO);
3244         }
3245 
3246         VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3247             &pname) == 0);
3248         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3249 
3250         mutex_enter(&spa_namespace_lock);
3251         if ((spa = spa_lookup(pname)) != NULL) {
3252                 /*
3253                  * Remove the existing root pool from the namespace so that we
3254                  * can replace it with the correct config we just read in.
3255                  */
3256                 spa_remove(spa);
3257         }
3258 
3259         spa = spa_add(pname, config, NULL);
3260         spa->spa_is_root = B_TRUE;
3261         spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3262 
3263         /*
3264          * Build up a vdev tree based on the boot device's label config.
3265          */
3266         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3267             &nvtop) == 0);
3268         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3269         error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3270             VDEV_ALLOC_ROOTPOOL);
3271         spa_config_exit(spa, SCL_ALL, FTAG);
3272         if (error) {
3273                 mutex_exit(&spa_namespace_lock);
3274                 nvlist_free(config);
3275                 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3276                     pname);
3277                 return (error);
3278         }
3279 
3280         /*
3281          * Get the boot vdev.
3282          */
3283         if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3284                 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3285                     (u_longlong_t)guid);
3286                 error = ENOENT;
3287                 goto out;
3288         }
3289 
3290         /*
3291          * Determine if there is a better boot device.
3292          */
3293         avd = bvd;
3294         spa_alt_rootvdev(rvd, &avd, &txg);
3295         if (avd != bvd) {
3296                 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3297                     "try booting from '%s'", avd->vdev_path);
3298                 error = EINVAL;
3299                 goto out;
3300         }
3301 
3302         /*
3303          * If the boot device is part of a spare vdev then ensure that
3304          * we're booting off the active spare.
3305          */
3306         if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3307             !bvd->vdev_isspare) {
3308                 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3309                     "try booting from '%s'",
3310                     bvd->vdev_parent->
3311                     vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3312                 error = EINVAL;
3313                 goto out;
3314         }
3315 
3316         error = 0;
3317         spa_history_log_version(spa, LOG_POOL_IMPORT);
3318 out:
3319         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3320         vdev_free(rvd);
3321         spa_config_exit(spa, SCL_ALL, FTAG);
3322         mutex_exit(&spa_namespace_lock);
3323 
3324         nvlist_free(config);
3325         return (error);
3326 }
3327 
3328 #endif
3329 
3330 /*
3331  * Import a non-root pool into the system.
3332  */
3333 int
3334 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3335 {
3336         spa_t *spa;
3337         char *altroot = NULL;
3338         spa_load_state_t state = SPA_LOAD_IMPORT;
3339         zpool_rewind_policy_t policy;
3340         uint64_t mode = spa_mode_global;
3341         uint64_t readonly = B_FALSE;
3342         int error;
3343         nvlist_t *nvroot;
3344         nvlist_t **spares, **l2cache;
3345         uint_t nspares, nl2cache;
3346 
3347         /*
3348          * If a pool with this name exists, return failure.
3349          */
3350         mutex_enter(&spa_namespace_lock);
3351         if (spa_lookup(pool) != NULL) {
3352                 mutex_exit(&spa_namespace_lock);
3353                 return (EEXIST);
3354         }
3355 
3356         /*
3357          * Create and initialize the spa structure.
3358          */
3359         (void) nvlist_lookup_string(props,
3360             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3361         (void) nvlist_lookup_uint64(props,
3362             zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3363         if (readonly)
3364                 mode = FREAD;
3365         spa = spa_add(pool, config, altroot);
3366         spa->spa_import_flags = flags;
3367 
3368         /*
3369          * Verbatim import - Take a pool and insert it into the namespace
3370          * as if it had been loaded at boot.
3371          */
3372         if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3373                 if (props != NULL)
3374                         spa_configfile_set(spa, props, B_FALSE);
3375 
3376                 spa_config_sync(spa, B_FALSE, B_TRUE);
3377 
3378                 mutex_exit(&spa_namespace_lock);
3379                 spa_history_log_version(spa, LOG_POOL_IMPORT);
3380 
3381                 return (0);
3382         }
3383 
3384         spa_activate(spa, mode);
3385 
3386         /*
3387          * Don't start async tasks until we know everything is healthy.
3388          */
3389         spa_async_suspend(spa);
3390 
3391         zpool_get_rewind_policy(config, &policy);
3392         if (policy.zrp_request & ZPOOL_DO_REWIND)
3393                 state = SPA_LOAD_RECOVER;
3394 
3395         /*
3396          * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
3397          * because the user-supplied config is actually the one to trust when
3398          * doing an import.
3399          */
3400         if (state != SPA_LOAD_RECOVER)
3401                 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3402 
3403         error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3404             policy.zrp_request);
3405 
3406         /*
3407          * Propagate anything learned while loading the pool and pass it
3408          * back to caller (i.e. rewind info, missing devices, etc).
3409          */
3410         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3411             spa->spa_load_info) == 0);
3412 
3413         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3414         /*
3415          * Toss any existing sparelist, as it no longer has any validity
3416          * and conflicts with spa_has_spare().
3417          */
3418         if (spa->spa_spares.sav_config) {
3419                 nvlist_free(spa->spa_spares.sav_config);
3420                 spa->spa_spares.sav_config = NULL;
3421                 spa_load_spares(spa);
3422         }
3423         if (spa->spa_l2cache.sav_config) {
3424                 nvlist_free(spa->spa_l2cache.sav_config);
3425                 spa->spa_l2cache.sav_config = NULL;
3426                 spa_load_l2cache(spa);
3427         }
3428 
3429         VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3430             &nvroot) == 0);
3431         if (error == 0)
3432                 error = spa_validate_aux(spa, nvroot, -1ULL,
3433                     VDEV_ALLOC_SPARE);
3434         if (error == 0)
3435                 error = spa_validate_aux(spa, nvroot, -1ULL,
3436                     VDEV_ALLOC_L2CACHE);
3437         spa_config_exit(spa, SCL_ALL, FTAG);
3438 
3439         if (props != NULL)
3440                 spa_configfile_set(spa, props, B_FALSE);
3441 
3442         if (error != 0 || (props && spa_writeable(spa) &&
3443             (error = spa_prop_set(spa, props)))) {
3444                 spa_unload(spa);
3445                 spa_deactivate(spa);
3446                 spa_remove(spa);
3447                 mutex_exit(&spa_namespace_lock);
3448                 return (error);
3449         }
3450 
3451         spa_async_resume(spa);
3452 
3453         /*
3454          * Override any spares and level 2 cache devices as specified by
3455          * the user, as these may have correct device names/devids, etc.
3456          */
3457         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3458             &spares, &nspares) == 0) {
3459                 if (spa->spa_spares.sav_config)
3460                         VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3461                             ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3462                 else
3463                         VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3464                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
3465                 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3466                     ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3467                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3468                 spa_load_spares(spa);
3469                 spa_config_exit(spa, SCL_ALL, FTAG);
3470                 spa->spa_spares.sav_sync = B_TRUE;
3471         }
3472         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3473             &l2cache, &nl2cache) == 0) {
3474                 if (spa->spa_l2cache.sav_config)
3475                         VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3476                             ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3477                 else
3478                         VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3479                             NV_UNIQUE_NAME, KM_SLEEP) == 0);
3480                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3481                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3482                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3483                 spa_load_l2cache(spa);
3484                 spa_config_exit(spa, SCL_ALL, FTAG);
3485                 spa->spa_l2cache.sav_sync = B_TRUE;
3486         }
3487 
3488         /*
3489          * Check for any removed devices.
3490          */
3491         if (spa->spa_autoreplace) {
3492                 spa_aux_check_removed(&spa->spa_spares);
3493                 spa_aux_check_removed(&spa->spa_l2cache);
3494         }
3495 
3496         if (spa_writeable(spa)) {
3497                 /*
3498                  * Update the config cache to include the newly-imported pool.
3499                  */
3500                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3501         }
3502 
3503         /*
3504          * It's possible that the pool was expanded while it was exported.
3505          * We kick off an async task to handle this for us.
3506          */
3507         spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3508 
3509         mutex_exit(&spa_namespace_lock);
3510         spa_history_log_version(spa, LOG_POOL_IMPORT);
3511 
3512         return (0);
3513 }
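
     /*
      * Illustrative call (hypothetical pool name; the config typically
      * comes from a userland label scan handed in through the import
      * ioctl):
      *
      *         error = spa_import("tank", config, NULL, ZFS_IMPORT_NORMAL);
      *
      * Passing ZFS_IMPORT_VERBATIM instead takes the early return above:
      * the supplied config is trusted outright and spa_load() is skipped.
      */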
3514 
3515 nvlist_t *
3516 spa_tryimport(nvlist_t *tryconfig)
3517 {
3518         nvlist_t *config = NULL;
3519         char *poolname;
3520         spa_t *spa;
3521         uint64_t state;
3522         int error;
3523 
3524         if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3525                 return (NULL);
3526 
3527         if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3528                 return (NULL);
3529 
3530         /*
3531          * Create and initialize the spa structure.
3532          */
3533         mutex_enter(&spa_namespace_lock);
3534         spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3535         spa_activate(spa, FREAD);
3536 
3537         /*
3538          * Pass off the heavy lifting to spa_load().
3539          * Pass TRUE for mosconfig because the user-supplied config
3540          * is actually the one to trust when doing an import.
3541          */
3542         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3543 
3544         /*
3545          * If 'tryconfig' was at least parsable, return the current config.
3546          */
3547         if (spa->spa_root_vdev != NULL) {
3548                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3549                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3550                     poolname) == 0);
3551                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3552                     state) == 0);
3553                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3554                     spa->spa_uberblock.ub_timestamp) == 0);
3555 
3556                 /*
3557                  * If the bootfs property exists on this pool then we
3558                  * copy it out so that external consumers can tell which
3559                  * pools are bootable.
3560                  */
3561                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3562                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3563 
3564                         /*
3565                          * We have to play games with the name since the
3566                          * pool was opened as TRYIMPORT_NAME.
3567                          */
3568                         if (dsl_dsobj_to_dsname(spa_name(spa),
3569                             spa->spa_bootfs, tmpname) == 0) {
3570                                 char *cp;
3571                                 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3572 
3573                                 cp = strchr(tmpname, '/');
3574                                 if (cp == NULL) {
3575                                         (void) strlcpy(dsname, tmpname,
3576                                             MAXPATHLEN);
3577                                 } else {
3578                                         (void) snprintf(dsname, MAXPATHLEN,
3579                                             "%s/%s", poolname, ++cp);
3580                                 }
3581                                 VERIFY(nvlist_add_string(config,
3582                                     ZPOOL_CONFIG_BOOTFS, dsname) == 0);
3583                                 kmem_free(dsname, MAXPATHLEN);
3584                         }
3585                         kmem_free(tmpname, MAXPATHLEN);
3586                 }
3587 
3588                 /*
3589                  * Add the list of hot spares and level 2 cache devices.
3590                  */
3591                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3592                 spa_add_spares(spa, config);
3593                 spa_add_l2cache(spa, config);
3594                 spa_config_exit(spa, SCL_CONFIG, FTAG);
3595         }
3596 
3597         spa_unload(spa);
3598         spa_deactivate(spa);
3599         spa_remove(spa);
3600         mutex_exit(&spa_namespace_lock);
3601 
3602         return (config);
3603 }
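
     /*
      * Illustrative sketch of a consumer of the nvlist returned above
      * (error handling elided):
      *
      *         char *name;
      *         uint64_t state;
      *         nvlist_t *config = spa_tryimport(tryconfig);
      *
      *         if (config != NULL) {
      *                 VERIFY(nvlist_lookup_string(config,
      *                     ZPOOL_CONFIG_POOL_NAME, &name) == 0);
      *                 VERIFY(nvlist_lookup_uint64(config,
      *                     ZPOOL_CONFIG_POOL_STATE, &state) == 0);
      *                 nvlist_free(config);
      *         }
      */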
3604 
3605 /*
3606  * Pool export/destroy
3607  *
3608  * The act of destroying or exporting a pool is very simple.  We make sure there
3609  * is no more pending I/O and any references to the pool are gone.  Then, we
3610  * update the pool state and sync all the labels to disk, removing the
3611  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
3612  * we don't sync the labels or remove the configuration from the cache.
3613  */
3614 static int
3615 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
3616     boolean_t force, boolean_t hardforce)
3617 {
3618         spa_t *spa;
3619 
3620         if (oldconfig)
3621                 *oldconfig = NULL;
3622 
3623         if (!(spa_mode_global & FWRITE))
3624                 return (EROFS);
3625 
3626         mutex_enter(&spa_namespace_lock);
3627         if ((spa = spa_lookup(pool)) == NULL) {
3628                 mutex_exit(&spa_namespace_lock);
3629                 return (ENOENT);
3630         }
3631 
3632         /*
3633          * Put a hold on the pool, drop the namespace lock, stop async tasks,
3634          * reacquire the namespace lock, and see if we can export.
3635          */
3636         spa_open_ref(spa, FTAG);
3637         mutex_exit(&spa_namespace_lock);
3638         spa_async_suspend(spa);
3639         mutex_enter(&spa_namespace_lock);
3640         spa_close(spa, FTAG);
3641 
3642         /*
3643          * The pool will be in core if it's openable,
3644          * in which case we can modify its state.
3645          */
3646         if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
3647                 /*
3648                  * Objsets may be open only because they're dirty, so we
3649                  * have to force it to sync before checking spa_refcnt.
3650                  */
3651                 txg_wait_synced(spa->spa_dsl_pool, 0);
3652 
3653                 /*
3654                  * A pool cannot be exported or destroyed if there are active
3655                  * references.  If we are resetting a pool, allow references by
3656                  * fault injection handlers.
3657                  */
3658                 if (!spa_refcount_zero(spa) ||
3659                     (spa->spa_inject_ref != 0 &&
3660                     new_state != POOL_STATE_UNINITIALIZED)) {
3661                         spa_async_resume(spa);
3662                         mutex_exit(&spa_namespace_lock);
3663                         return (EBUSY);
3664                 }
3665 
3666                 /*
3667                  * A pool cannot be exported if it has an active shared spare.
3668                  * This is to prevent other pools from stealing the active
3669                  * spare from an exported pool.  The user can still force
3670                  * the export if desired.
3671                  */
3672                 if (!force && new_state == POOL_STATE_EXPORTED &&
3673                     spa_has_active_shared_spare(spa)) {
3674                         spa_async_resume(spa);
3675                         mutex_exit(&spa_namespace_lock);
3676                         return (EXDEV);
3677                 }
3678 
3679                 /*
3680                  * We want this to be reflected on every label,
3681                  * so mark them all dirty.  spa_unload() will do the
3682                  * final sync that pushes these changes out.
3683                  */
3684                 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
3685                         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3686                         spa->spa_state = new_state;
3687                         spa->spa_final_txg = spa_last_synced_txg(spa) +
3688                             TXG_DEFER_SIZE + 1;
3689                         vdev_config_dirty(spa->spa_root_vdev);
3690                         spa_config_exit(spa, SCL_ALL, FTAG);
3691                 }
3692         }
3693 
3694         spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
3695 
3696         if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3697                 spa_unload(spa);
3698                 spa_deactivate(spa);
3699         }
3700 
3701         if (oldconfig && spa->spa_config)
3702                 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
3703 
3704         if (new_state != POOL_STATE_UNINITIALIZED) {
3705                 if (!hardforce)
3706                         spa_config_sync(spa, B_TRUE, B_TRUE);
3707                 spa_remove(spa);
3708         }
3709         mutex_exit(&spa_namespace_lock);
3710 
3711         return (0);
3712 }
3713 
3714 /*
3715  * Destroy a storage pool.
3716  */
3717 int
3718 spa_destroy(char *pool)
3719 {
3720         return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3721             B_FALSE, B_FALSE));
3722 }
3723 
3724 /*
3725  * Export a storage pool.
3726  */
3727 int
3728 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3729     boolean_t hardforce)
3730 {
3731         return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3732             force, hardforce));
3733 }
3734 
3735 /*
3736  * Similar to spa_export(), this unloads the spa_t without actually removing it
3737  * from the namespace in any way.
3738  */
3739 int
3740 spa_reset(char *pool)
3741 {
3742         return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3743             B_FALSE, B_FALSE));
3744 }
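
     /*
      * Summary of the spa_export_common() entry points and the arguments
      * each one fixes (new_state, force, hardforce):
      *
      *         spa_destroy()   POOL_STATE_DESTROYED       B_FALSE  B_FALSE
      *         spa_export()    POOL_STATE_EXPORTED        caller   caller
      *         spa_reset()     POOL_STATE_UNINITIALIZED   B_FALSE  B_FALSE
      */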
3745 
3746 /*
3747  * ==========================================================================
3748  * Device manipulation
3749  * ==========================================================================
3750  */
3751 
3752 /*
3753  * Add a device to a storage pool.
3754  */
3755 int
3756 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
3757 {
3758         uint64_t txg, id;
3759         int error;
3760         vdev_t *rvd = spa->spa_root_vdev;
3761         vdev_t *vd, *tvd;
3762         nvlist_t **spares, **l2cache;
3763         uint_t nspares, nl2cache;
3764 
3765         ASSERT(spa_writeable(spa));
3766 
3767         txg = spa_vdev_enter(spa);
3768 
3769         if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
3770             VDEV_ALLOC_ADD)) != 0)
3771                 return (spa_vdev_exit(spa, NULL, txg, error));
3772 
3773         spa->spa_pending_vdev = vd;  /* spa_vdev_exit() will clear this */
3774 
3775         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
3776             &nspares) != 0)
3777                 nspares = 0;
3778 
3779         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
3780             &nl2cache) != 0)
3781                 nl2cache = 0;
3782 
3783         if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
3784                 return (spa_vdev_exit(spa, vd, txg, EINVAL));
3785 
3786         if (vd->vdev_children != 0 &&
3787             (error = vdev_create(vd, txg, B_FALSE)) != 0)
3788                 return (spa_vdev_exit(spa, vd, txg, error));
3789 
3790         /*
3791          * We must validate the spares and l2cache devices after checking the
3792          * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
3793          */
3794         if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
3795                 return (spa_vdev_exit(spa, vd, txg, error));
3796 
3797         /*
3798          * Transfer each new top-level vdev from vd to rvd.
3799          */
3800         for (int c = 0; c < vd->vdev_children; c++) {
3801 
3802                 /*
3803                  * Set the vdev id to the first hole, if one exists.
3804                  */
3805                 for (id = 0; id < rvd->vdev_children; id++) {
3806                         if (rvd->vdev_child[id]->vdev_ishole) {
3807                                 vdev_free(rvd->vdev_child[id]);
3808                                 break;
3809                         }
3810                 }
3811                 tvd = vd->vdev_child[c];
3812                 vdev_remove_child(vd, tvd);
3813                 tvd->vdev_id = id;
3814                 vdev_add_child(rvd, tvd);
3815                 vdev_config_dirty(tvd);
3816         }
3817 
3818         if (nspares != 0) {
3819                 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
3820                     ZPOOL_CONFIG_SPARES);
3821                 spa_load_spares(spa);
3822                 spa->spa_spares.sav_sync = B_TRUE;
3823         }
3824 
3825         if (nl2cache != 0) {
3826                 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
3827                     ZPOOL_CONFIG_L2CACHE);
3828                 spa_load_l2cache(spa);
3829                 spa->spa_l2cache.sav_sync = B_TRUE;
3830         }
3831 
3832         /*
3833          * We have to be careful when adding new vdevs to an existing pool.
3834          * If other threads start allocating from these vdevs before we
3835          * sync the config cache, and we lose power, then upon reboot we may
3836          * fail to open the pool because there are DVAs that the config cache
3837          * can't translate.  Therefore, we first add the vdevs without
3838          * initializing metaslabs; sync the config cache (via spa_vdev_exit());
3839          * and then let spa_config_update() initialize the new metaslabs.
3840          *
3841          * spa_load() checks for added-but-not-initialized vdevs, so that
3842          * if we lose power at any point in this sequence, the remaining
3843          * steps will be completed the next time we load the pool.
3844          */
3845         (void) spa_vdev_exit(spa, vd, txg, 0);
3846 
3847         mutex_enter(&spa_namespace_lock);
3848         spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3849         mutex_exit(&spa_namespace_lock);
3850 
3851         return (0);
3852 }
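
     /*
      * Illustrative sketch (hypothetical device paths): adding a two-way
      * mirror as a new top-level vdev.  The nvroot handed in mirrors the
      * layout zpool(1M) builds:
      *
      *         root
      *           children[0]: type='mirror'
      *             children[0]: type='disk' path='/dev/dsk/c1t0d0s0'
      *             children[1]: type='disk' path='/dev/dsk/c1t1d0s0'
      *
      *         error = spa_vdev_add(spa, nvroot);
      */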
3853 
3854 /*
3855  * Attach a device to a mirror.  The arguments are the path to any device
3856  * in the mirror, and the nvroot for the new device.  If the path specifies
3857  * a device that is not mirrored, we automatically insert the mirror vdev.
3858  *
3859  * If 'replacing' is specified, the new device is intended to replace the
3860  * existing device; in this case the two devices are made into their own
3861  * mirror using the 'replacing' vdev, which is functionally identical to
3862  * the mirror vdev (it actually reuses all the same ops) but has a few
3863  * extra rules: you can't attach to it after it's been created, and upon
3864  * completion of resilvering, the first disk (the one being replaced)
3865  * is automatically detached.
3866  */
3867 int
3868 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3869 {
3870         uint64_t txg, dtl_max_txg;
3871         vdev_t *rvd = spa->spa_root_vdev;
3872         vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3873         vdev_ops_t *pvops;
3874         char *oldvdpath, *newvdpath;
3875         int newvd_isspare;
3876         int error;
3877 
3878         ASSERT(spa_writeable(spa));
3879 
3880         txg = spa_vdev_enter(spa);
3881 
3882         oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3883 
3884         if (oldvd == NULL)
3885                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3886 
3887         if (!oldvd->vdev_ops->vdev_op_leaf)
3888                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3889 
3890         pvd = oldvd->vdev_parent;
3891 
3892         if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3893             VDEV_ALLOC_ADD)) != 0)
3894                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3895 
3896         if (newrootvd->vdev_children != 1)
3897                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3898 
3899         newvd = newrootvd->vdev_child[0];
3900 
3901         if (!newvd->vdev_ops->vdev_op_leaf)
3902                 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3903 
3904         if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3905                 return (spa_vdev_exit(spa, newrootvd, txg, error));
3906 
3907         /*
3908          * Spares can't replace logs
3909          */
3910         if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3911                 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3912 
3913         if (!replacing) {
3914                 /*
3915                  * For attach, the only allowable parent is a mirror or the root
3916                  * vdev.
3917                  */
3918                 if (pvd->vdev_ops != &vdev_mirror_ops &&
3919                     pvd->vdev_ops != &vdev_root_ops)
3920                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3921 
3922                 pvops = &vdev_mirror_ops;
3923         } else {
3924                 /*
3925                  * Active hot spares can only be replaced by inactive hot
3926                  * spares.
3927                  */
3928                 if (pvd->vdev_ops == &vdev_spare_ops &&
3929                     oldvd->vdev_isspare &&
3930                     !spa_has_spare(spa, newvd->vdev_guid))
3931                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3932 
3933                 /*
3934                  * If the source is a hot spare, and the parent isn't already a
3935                  * spare, then we want to create a new hot spare.  Otherwise, we
3936                  * want to create a replacing vdev.  The user is not allowed to
3937                  * attach to a spared vdev child unless the 'isspare' state is
3938                  * the same (spare replaces spare, non-spare replaces
3939                  * non-spare).
3940                  */
3941                 if (pvd->vdev_ops == &vdev_replacing_ops &&
3942                     spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
3943                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3944                 } else if (pvd->vdev_ops == &vdev_spare_ops &&
3945                     newvd->vdev_isspare != oldvd->vdev_isspare) {
3946                         return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3947                 }
3948 
3949                 if (newvd->vdev_isspare)
3950                         pvops = &vdev_spare_ops;
3951                 else
3952                         pvops = &vdev_replacing_ops;
3953         }
3954 
3955         /*
3956          * Make sure the new device is big enough.
3957          */
3958         if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3959                 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3960 
3961         /*
3962          * The new device cannot have a higher alignment requirement
3963          * than the top-level vdev.
3964          */
3965         if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3966                 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3967 
3968         /*
3969          * If this is an in-place replacement, update oldvd's path and devid
3970          * to make it distinguishable from newvd, and unopenable from now on.
3971          */
3972         if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3973                 spa_strfree(oldvd->vdev_path);
3974                 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3975                     KM_SLEEP);
3976                 (void) sprintf(oldvd->vdev_path, "%s/%s",
3977                     newvd->vdev_path, "old");
3978                 if (oldvd->vdev_devid != NULL) {
3979                         spa_strfree(oldvd->vdev_devid);
3980                         oldvd->vdev_devid = NULL;
3981                 }
3982         }
3983 
3984         /* mark the device being resilvered */
3985         newvd->vdev_resilvering = B_TRUE;
3986 
3987         /*
3988          * If the parent is not a mirror, or if we're replacing, insert the new
3989          * mirror/replacing/spare vdev above oldvd.
3990          */
3991         if (pvd->vdev_ops != pvops)
3992                 pvd = vdev_add_parent(oldvd, pvops);
3993 
3994         ASSERT(pvd->vdev_top->vdev_parent == rvd);
3995         ASSERT(pvd->vdev_ops == pvops);
3996         ASSERT(oldvd->vdev_parent == pvd);
3997 
3998         /*
3999          * Extract the new device from its root and add it to pvd.
4000          */
4001         vdev_remove_child(newrootvd, newvd);
4002         newvd->vdev_id = pvd->vdev_children;
4003         newvd->vdev_crtxg = oldvd->vdev_crtxg;
4004         vdev_add_child(pvd, newvd);
4005 
4006         tvd = newvd->vdev_top;
4007         ASSERT(pvd->vdev_top == tvd);
4008         ASSERT(tvd->vdev_parent == rvd);
4009 
4010         vdev_config_dirty(tvd);
4011 
4012         /*
4013          * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4014          * for any dmu_sync-ed blocks.  It will propagate upward when
4015          * spa_vdev_exit() calls vdev_dtl_reassess().
4016          */
4017         dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4018 
4019         vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4020             dtl_max_txg - TXG_INITIAL);
4021 
4022         if (newvd->vdev_isspare) {
4023                 spa_spare_activate(newvd);
4024                 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4025         }
4026 
4027         oldvdpath = spa_strdup(oldvd->vdev_path);
4028         newvdpath = spa_strdup(newvd->vdev_path);
4029         newvd_isspare = newvd->vdev_isspare;
4030 
4031         /*
4032          * Mark newvd's DTL dirty in this txg.
4033          */
4034         vdev_dirty(tvd, VDD_DTL, newvd, txg);
4035 
4036         /*
4037          * Restart the resilver
4038          */
4039         dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4040 
4041         /*
4042          * Commit the config
4043          */
4044         (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4045 
4046         spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
4047             "%s vdev=%s %s vdev=%s",
4048             replacing && newvd_isspare ? "spare in" :
4049             replacing ? "replace" : "attach", newvdpath,
4050             replacing ? "for" : "to", oldvdpath);
4051 
4052         spa_strfree(oldvdpath);
4053         spa_strfree(newvdpath);
4054 
4055         if (spa->spa_bootfs)
4056                 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4057 
4058         return (0);
4059 }
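
     /*
      * Illustrative call patterns ('guid' names the existing leaf and
      * nvroot holds exactly one new leaf):
      *
      *         error = spa_vdev_attach(spa, guid, nvroot, B_FALSE); attach
      *         error = spa_vdev_attach(spa, guid, nvroot, B_TRUE);  replace
      *
      * An attach grows (or creates) a mirror above the target; a replace
      * inserts a 'replacing' vdev that detaches the old leaf once
      * resilvering completes, as described above.
      */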
4060 
4061 /*
4062  * Detach a device from a mirror or replacing vdev.
4063  * If 'replace_done' is specified, only detach if the parent
4064  * is a replacing vdev.
4065  */
4066 int
4067 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4068 {
4069         uint64_t txg;
4070         int error;
4071         vdev_t *rvd = spa->spa_root_vdev;
4072         vdev_t *vd, *pvd, *cvd, *tvd;
4073         boolean_t unspare = B_FALSE;
4074         uint64_t unspare_guid;
4075         char *vdpath;
4076 
4077         ASSERT(spa_writeable(spa));
4078 
4079         txg = spa_vdev_enter(spa);
4080 
4081         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4082 
4083         if (vd == NULL)
4084                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4085 
4086         if (!vd->vdev_ops->vdev_op_leaf)
4087                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4088 
4089         pvd = vd->vdev_parent;
4090 
4091         /*
4092          * If the parent/child relationship is not as expected, don't do it.
4093          * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4094          * vdev that's replacing B with C.  The user's intent in replacing
4095          * is to go from M(A,B) to M(A,C).  If the user decides to cancel
4096          * the replace by detaching C, the expected behavior is to end up
4097          * M(A,B).  But suppose that right after deciding to detach C,
4098          * the replacement of B completes.  We would have M(A,C), and then
4099          * ask to detach C, which would leave us with just A -- not what
4100          * the user wanted.  To prevent this, we make sure that the
4101          * parent/child relationship hasn't changed -- in this example,
4102          * that C's parent is still the replacing vdev R.
4103          */
4104         if (pvd->vdev_guid != pguid && pguid != 0)
4105                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4106 
4107         /*
4108          * Only 'replacing' or 'spare' vdevs can be replaced.
4109          */
4110         if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4111             pvd->vdev_ops != &vdev_spare_ops)
4112                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4113 
4114         ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4115             spa_version(spa) >= SPA_VERSION_SPARES);
4116 
4117         /*
4118          * Only mirror, replacing, and spare vdevs support detach.
4119          */
4120         if (pvd->vdev_ops != &vdev_replacing_ops &&
4121             pvd->vdev_ops != &vdev_mirror_ops &&
4122             pvd->vdev_ops != &vdev_spare_ops)
4123                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4124 
4125         /*
4126          * If this device has the only valid copy of some data,
4127          * we cannot safely detach it.
4128          */
4129         if (vdev_dtl_required(vd))
4130                 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4131 
4132         ASSERT(pvd->vdev_children >= 2);
4133 
4134         /*
4135          * If we are detaching the second disk from a replacing vdev, then
4136          * check to see if we changed the original vdev's path to have "/old"
4137          * at the end in spa_vdev_attach().  If so, undo that change now.
4138          */
4139         if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4140             vd->vdev_path != NULL) {
4141                 size_t len = strlen(vd->vdev_path);
4142 
4143                 for (int c = 0; c < pvd->vdev_children; c++) {
4144                         cvd = pvd->vdev_child[c];
4145 
4146                         if (cvd == vd || cvd->vdev_path == NULL)
4147                                 continue;
4148 
4149                         if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4150                             strcmp(cvd->vdev_path + len, "/old") == 0) {
4151                                 spa_strfree(cvd->vdev_path);
4152                                 cvd->vdev_path = spa_strdup(vd->vdev_path);
4153                                 break;
4154                         }
4155                 }
4156         }
4157 
4158         /*
4159          * If we are detaching the original disk from a spare, then it implies
4160          * that the spare should become a real disk, and be removed from the
4161          * active spare list for the pool.
4162          */
4163         if (pvd->vdev_ops == &vdev_spare_ops &&
4164             vd->vdev_id == 0 &&
4165             pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4166                 unspare = B_TRUE;
4167 
4168         /*
4169          * Erase the disk labels so the disk can be used for other things.
4170          * This must be done after all other error cases are handled,
4171          * but before we disembowel vd (so we can still do I/O to it).
4172          * But if we can't do it, don't treat the error as fatal --
4173          * it may be that the unwritability of the disk is the reason
4174          * it's being detached!
4175          */
4176         error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4177 
4178         /*
4179          * Remove vd from its parent and compact the parent's children.
4180          */
4181         vdev_remove_child(pvd, vd);
4182         vdev_compact_children(pvd);
4183 
4184         /*
4185          * Remember one of the remaining children so we can get tvd below.
4186          */
4187         cvd = pvd->vdev_child[pvd->vdev_children - 1];
4188 
4189         /*
4190          * If we need to remove the remaining child from the list of hot spares,
4191          * do it now, marking the vdev as no longer a spare in the process.
4192          * We must do this before vdev_remove_parent(), because that can
4193          * change the GUID if it creates a new toplevel GUID.  For a similar
4194          * reason, we must remove the spare now, in the same txg as the detach;
4195          * otherwise someone could attach a new sibling, change the GUID, and
4196          * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4197          */
4198         if (unspare) {
4199                 ASSERT(cvd->vdev_isspare);
4200                 spa_spare_remove(cvd);
4201                 unspare_guid = cvd->vdev_guid;
4202                 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4203                 cvd->vdev_unspare = B_TRUE;
4204         }
4205 
4206         /*
4207          * If the parent mirror/replacing vdev only has one child,
4208          * the parent is no longer needed.  Remove it from the tree.
4209          */
4210         if (pvd->vdev_children == 1) {
4211                 if (pvd->vdev_ops == &vdev_spare_ops)
4212                         cvd->vdev_unspare = B_FALSE;
4213                 vdev_remove_parent(cvd);
4214                 cvd->vdev_resilvering = B_FALSE;
4215         }
4216 
4218         /*
4219          * We don't set tvd until now because the parent we just removed
4220          * may have been the previous top-level vdev.
4221          */
4222         tvd = cvd->vdev_top;
4223         ASSERT(tvd->vdev_parent == rvd);
4224 
4225         /*
4226          * Reevaluate the parent vdev state.
4227          */
4228         vdev_propagate_state(cvd);
4229 
4230         /*
4231          * If the 'autoexpand' property is set on the pool then automatically
4232          * try to expand the size of the pool. For example if the device we
4233          * just detached was smaller than the others, it may be possible to
4234          * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4235          * first so that we can obtain the updated sizes of the leaf vdevs.
4236          */
4237         if (spa->spa_autoexpand) {
4238                 vdev_reopen(tvd);
4239                 vdev_expand(tvd, txg);
4240         }
4241 
4242         vdev_config_dirty(tvd);
4243 
4244         /*
4245          * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
4246          * vd->vdev_detached is set and free vd's DTL object in syncing context.
4247          * But first make sure we're not on any *other* txg's DTL list, to
4248          * prevent vd from being accessed after it's freed.
4249          */
4250         vdpath = spa_strdup(vd->vdev_path);
4251         for (int t = 0; t < TXG_SIZE; t++)
4252                 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4253         vd->vdev_detached = B_TRUE;
4254         vdev_dirty(tvd, VDD_DTL, vd, txg);
4255 
4256         spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4257 
4258         /* hang on to the spa before we release the lock */
4259         spa_open_ref(spa, FTAG);
4260 
4261         error = spa_vdev_exit(spa, vd, txg, 0);
4262 
4263         spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
4264             "vdev=%s", vdpath);
4265         spa_strfree(vdpath);
4266 
4267         /*
4268          * If this was the removal of the original device in a hot spare vdev,
4269          * then we want to go through and remove the device from the hot spare
4270          * list of every other pool.
4271          */
4272         if (unspare) {
4273                 spa_t *altspa = NULL;
4274 
4275                 mutex_enter(&spa_namespace_lock);
4276                 while ((altspa = spa_next(altspa)) != NULL) {
4277                         if (altspa->spa_state != POOL_STATE_ACTIVE ||
4278                             altspa == spa)
4279                                 continue;
4280 
4281                         spa_open_ref(altspa, FTAG);
4282                         mutex_exit(&spa_namespace_lock);
4283                         (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4284                         mutex_enter(&spa_namespace_lock);
4285                         spa_close(altspa, FTAG);
4286                 }
4287                 mutex_exit(&spa_namespace_lock);
4288 
4289                 /* search the rest of the vdevs for spares to remove */
4290                 spa_vdev_resilver_done(spa);
4291         }
4292 
4293         /* all done with the spa; OK to release */
4294         mutex_enter(&spa_namespace_lock);
4295         spa_close(spa, FTAG);
4296         mutex_exit(&spa_namespace_lock);
4297 
4298         return (error);
4299 }
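
/*
 * The tail of spa_vdev_detach() above shows a pattern used whenever a
 * routine must keep using a spa_t after dropping the locks that pin it:
 * take a reference before spa_vdev_exit(), then release it under the
 * namespace lock once the spa is no longer needed.  A minimal sketch
 * (illustrative only, not part of the build):
 *
 *	spa_open_ref(spa, FTAG);			keep spa alive
 *	error = spa_vdev_exit(spa, vd, txg, 0);		drop locks, sync txg
 *	... spa may still be used here, e.g. for history logging ...
 *	mutex_enter(&spa_namespace_lock);
 *	spa_close(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);
 */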
4300 
4301 /*
4302  * Split a set of devices from their mirrors, and create a new pool from them.
4303  */
4304 int
4305 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4306     nvlist_t *props, boolean_t exp)
4307 {
4308         int error = 0;
4309         uint64_t txg, *glist;
4310         spa_t *newspa;
4311         uint_t c, children, lastlog;
4312         nvlist_t **child, *nvl, *tmp;
4313         dmu_tx_t *tx;
4314         char *altroot = NULL;
4315         vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
4316         boolean_t activate_slog;
4317 
4318         ASSERT(spa_writeable(spa));
4319 
4320         txg = spa_vdev_enter(spa);
4321 
4322         /* clear the log and flush everything up to now */
4323         activate_slog = spa_passivate_log(spa);
4324         (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4325         error = spa_offline_log(spa);
4326         txg = spa_vdev_config_enter(spa);
4327 
4328         if (activate_slog)
4329                 spa_activate_log(spa);
4330 
4331         if (error != 0)
4332                 return (spa_vdev_exit(spa, NULL, txg, error));
4333 
4334         /* check new spa name before going any further */
4335         if (spa_lookup(newname) != NULL)
4336                 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4337 
4338         /*
4339          * scan through all the children to ensure they're all mirrors
4340          */
4341         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4342             nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4343             &children) != 0)
4344                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4345 
4346         /* first, check to ensure we've got the right child count */
4347         rvd = spa->spa_root_vdev;
4348         lastlog = 0;
4349         for (c = 0; c < rvd->vdev_children; c++) {
4350                 vdev_t *vd = rvd->vdev_child[c];
4351 
4352                 /* don't count the holes & logs as children */
4353                 if (vd->vdev_islog || vd->vdev_ishole) {
4354                         if (lastlog == 0)
4355                                 lastlog = c;
4356                         continue;
4357                 }
4358 
4359                 lastlog = 0;
4360         }
4361         if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4362                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4363 
4364         /* next, ensure no spare or cache devices are part of the split */
4365         if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4366             nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4367                 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4368 
4369         vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4370         glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4371 
4372         /* then, loop over each vdev and validate it */
4373         for (c = 0; c < children; c++) {
4374                 uint64_t is_hole = 0;
4375 
4376                 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4377                     &is_hole);
4378 
4379                 if (is_hole != 0) {
4380                         if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4381                             spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4382                                 continue;
4383                         } else {
4384                                 error = EINVAL;
4385                                 break;
4386                         }
4387                 }
4388 
4389                 /* which disk is going to be split? */
4390                 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4391                     &glist[c]) != 0) {
4392                         error = EINVAL;
4393                         break;
4394                 }
4395 
4396                 /* look it up in the spa */
4397                 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4398                 if (vml[c] == NULL) {
4399                         error = ENODEV;
4400                         break;
4401                 }
4402 
4403                 /* make sure there's nothing stopping the split */
4404                 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4405                     vml[c]->vdev_islog ||
4406                     vml[c]->vdev_ishole ||
4407                     vml[c]->vdev_isspare ||
4408                     vml[c]->vdev_isl2cache ||
4409                     !vdev_writeable(vml[c]) ||
4410                     vml[c]->vdev_children != 0 ||
4411                     vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4412                     c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4413                         error = EINVAL;
4414                         break;
4415                 }
4416 
4417                 if (vdev_dtl_required(vml[c])) {
4418                         error = EBUSY;
4419                         break;
4420                 }
4421 
4422                 /* we need certain info from the top level */
4423                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4424                     vml[c]->vdev_top->vdev_ms_array) == 0);
4425                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4426                     vml[c]->vdev_top->vdev_ms_shift) == 0);
4427                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4428                     vml[c]->vdev_top->vdev_asize) == 0);
4429                 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4430                     vml[c]->vdev_top->vdev_ashift) == 0);
4431         }
4432 
4433         if (error != 0) {
4434                 kmem_free(vml, children * sizeof (vdev_t *));
4435                 kmem_free(glist, children * sizeof (uint64_t));
4436                 return (spa_vdev_exit(spa, NULL, txg, error));
4437         }
4438 
4439         /* stop writers from using the disks */
4440         for (c = 0; c < children; c++) {
4441                 if (vml[c] != NULL)
4442                         vml[c]->vdev_offline = B_TRUE;
4443         }
4444         vdev_reopen(spa->spa_root_vdev);
4445 
4446         /*
4447          * Temporarily record the splitting vdevs in the spa config.  This
4448          * will disappear once the config is regenerated.
4449          */
4450         VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4451         VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4452             glist, children) == 0);
4453         kmem_free(glist, children * sizeof (uint64_t));
4454 
4455         mutex_enter(&spa->spa_props_lock);
4456         VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4457             nvl) == 0);
4458         mutex_exit(&spa->spa_props_lock);
4459         spa->spa_config_splitting = nvl;
4460         vdev_config_dirty(spa->spa_root_vdev);
4461 
4462         /* configure and create the new pool */
4463         VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4464         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4465             exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4466         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4467             spa_version(spa)) == 0);
4468         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4469             spa->spa_config_txg) == 0);
4470         VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4471             spa_generate_guid(NULL)) == 0);
4472         (void) nvlist_lookup_string(props,
4473             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4474 
4475         /* add the new pool to the namespace */
4476         newspa = spa_add(newname, config, altroot);
4477         newspa->spa_config_txg = spa->spa_config_txg;
4478         spa_set_log_state(newspa, SPA_LOG_CLEAR);
4479 
4480         /* release the spa config lock, retaining the namespace lock */
4481         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4482 
4483         if (zio_injection_enabled)
4484                 zio_handle_panic_injection(spa, FTAG, 1);
4485 
4486         spa_activate(newspa, spa_mode_global);
4487         spa_async_suspend(newspa);
4488 
4489         /* create the new pool from the disks of the original pool */
4490         error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4491         if (error)
4492                 goto out;
4493 
4494         /* if that worked, generate a real config for the new pool */
4495         if (newspa->spa_root_vdev != NULL) {
4496                 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4497                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
4498                 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4499                     ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4500                 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4501                     B_TRUE));
4502         }
4503 
4504         /* set the props */
4505         if (props != NULL) {
4506                 spa_configfile_set(newspa, props, B_FALSE);
4507                 error = spa_prop_set(newspa, props);
4508                 if (error)
4509                         goto out;
4510         }
4511 
4512         /* flush everything */
4513         txg = spa_vdev_config_enter(newspa);
4514         vdev_config_dirty(newspa->spa_root_vdev);
4515         (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4516 
4517         if (zio_injection_enabled)
4518                 zio_handle_panic_injection(spa, FTAG, 2);
4519 
4520         spa_async_resume(newspa);
4521 
4522         /* finally, update the original pool's config */
4523         txg = spa_vdev_config_enter(spa);
4524         tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4525         error = dmu_tx_assign(tx, TXG_WAIT);
4526         if (error != 0)
4527                 dmu_tx_abort(tx);
4528         for (c = 0; c < children; c++) {
4529                 if (vml[c] != NULL) {
4530                         vdev_split(vml[c]);
4531                         if (error == 0)
4532                                 spa_history_log_internal(LOG_POOL_VDEV_DETACH,
4533                                     spa, tx, "vdev=%s",
4534                                     vml[c]->vdev_path);
4535                         vdev_free(vml[c]);
4536                 }
4537         }
4538         vdev_config_dirty(spa->spa_root_vdev);
4539         spa->spa_config_splitting = NULL;
4540         nvlist_free(nvl);
4541         if (error == 0)
4542                 dmu_tx_commit(tx);
4543         (void) spa_vdev_exit(spa, NULL, txg, 0);
4544 
4545         if (zio_injection_enabled)
4546                 zio_handle_panic_injection(spa, FTAG, 3);
4547 
4548         /* split is complete; log a history record */
4549         spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
4550             "split new pool %s from pool %s", newname, spa_name(spa));
4551 
4552         kmem_free(vml, children * sizeof (vdev_t *));
4553 
4554         /* if we're not going to mount the filesystems in userland, export */
4555         if (exp)
4556                 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
4557                     B_FALSE, B_FALSE);
4558 
4559         return (error);
4560 
4561 out:
4562         spa_unload(newspa);
4563         spa_deactivate(newspa);
4564         spa_remove(newspa);
4565 
4566         txg = spa_vdev_config_enter(spa);
4567 
4568         /* re-online all offlined disks */
4569         for (c = 0; c < children; c++) {
4570                 if (vml[c] != NULL)
4571                         vml[c]->vdev_offline = B_FALSE;
4572         }
4573         vdev_reopen(spa->spa_root_vdev);
4574 
4575         nvlist_free(spa->spa_config_splitting);
4576         spa->spa_config_splitting = NULL;
4577         (void) spa_vdev_exit(spa, NULL, txg, error);
4578 
4579         kmem_free(vml, children * sizeof (vdev_t *));
4580         return (error);
4581 }
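
/*
 * Usage sketch (illustrative; in illumos the caller is the zpool-split
 * ioctl path): split one leaf from each top-level mirror into a new
 * pool and leave it exported for later import elsewhere.
 *
 *	nvlist_t *config = ...;		vdev tree, one leaf per mirror
 *	error = spa_vdev_split_mirror(spa, "newpool", config, NULL, B_TRUE);
 */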
4582 
4583 static nvlist_t *
4584 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
4585 {
4586         for (int i = 0; i < count; i++) {
4587                 uint64_t guid;
4588 
4589                 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4590                     &guid) == 0);
4591 
4592                 if (guid == target_guid)
4593                         return (nvpp[i]);
4594         }
4595 
4596         return (NULL);
4597 }
4598 
4599 static void
4600 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4601         nvlist_t *dev_to_remove)
4602 {
4603         nvlist_t **newdev = NULL;
4604 
4605         if (count > 1)
4606                 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4607 
4608         for (int i = 0, j = 0; i < count; i++) {
4609                 if (dev[i] == dev_to_remove)
4610                         continue;
4611                 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4612         }
4613 
4614         VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4615         VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4616 
4617         for (int i = 0; i < count - 1; i++)
4618                 nvlist_free(newdev[i]);
4619 
4620         if (count > 1)
4621                 kmem_free(newdev, (count - 1) * sizeof (void *));
4622 }
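
/*
 * spa_vdev_remove_aux() rebuilds the entire array because nvlists offer
 * no way to delete a single element of a DATA_TYPE_NVLIST_ARRAY in
 * place.  A hedged sketch of the spare-removal call site below, using
 * hypothetical locals:
 *
 *	nvlist_t **spares, *nv;
 *	uint_t nspares;
 *	VERIFY(nvlist_lookup_nvlist_array(sav_config, ZPOOL_CONFIG_SPARES,
 *	    &spares, &nspares) == 0);
 *	if ((nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL)
 *		spa_vdev_remove_aux(sav_config, ZPOOL_CONFIG_SPARES,
 *		    spares, nspares, nv);
 */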
4623 
4624 /*
4625  * Evacuate the device.
4626  */
4627 static int
4628 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
4629 {
4630         uint64_t txg;
4631         int error = 0;
4632 
4633         ASSERT(MUTEX_HELD(&spa_namespace_lock));
4634         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4635         ASSERT(vd == vd->vdev_top);
4636 
4637         /*
4638          * Evacuate the device.  We don't hold the config lock as writer
4639          * since we need to do I/O, but we do keep the spa_namespace_lock
4640          * held.  Once this completes, the device should no longer have
4641          * any blocks allocated on it.
4642          */
4643         if (vd->vdev_islog) {
4644                 if (vd->vdev_stat.vs_alloc != 0)
4645                         error = spa_offline_log(spa);
4646         } else {
4647                 error = ENOTSUP;
4648         }
4649 
4650         if (error)
4651                 return (error);
4652 
4653         /*
4654          * The evacuation succeeded.  Remove any remaining MOS metadata
4655          * associated with this vdev, and wait for these changes to sync.
4656          */
4657         ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
4658         txg = spa_vdev_config_enter(spa);
4659         vd->vdev_removing = B_TRUE;
4660         vdev_dirty(vd, 0, NULL, txg);
4661         vdev_config_dirty(vd);
4662         spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4663 
4664         return (0);
4665 }
4666 
4667 /*
4668  * Complete the removal by cleaning up the namespace.
4669  */
4670 static void
4671 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
4672 {
4673         vdev_t *rvd = spa->spa_root_vdev;
4674         uint64_t id = vd->vdev_id;
4675         boolean_t last_vdev = (id == (rvd->vdev_children - 1));
4676 
4677         ASSERT(MUTEX_HELD(&spa_namespace_lock));
4678         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4679         ASSERT(vd == vd->vdev_top);
4680 
4681         /*
4682          * Only remove devices that are empty.
4683          */
4684         if (vd->vdev_stat.vs_alloc != 0)
4685                 return;
4686 
4687         (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4688 
4689         if (list_link_active(&vd->vdev_state_dirty_node))
4690                 vdev_state_clean(vd);
4691         if (list_link_active(&vd->vdev_config_dirty_node))
4692                 vdev_config_clean(vd);
4693 
4694         vdev_free(vd);
4695 
4696         if (last_vdev) {
4697                 vdev_compact_children(rvd);
4698         } else {
4699                 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
4700                 vdev_add_child(rvd, vd);
4701         }
4702         vdev_config_dirty(rvd);
4703 
4704         /*
4705          * Reassess the health of our root vdev.
4706          */
4707         vdev_reopen(rvd);
4708 }
4709 
4710 /*
4711  * Remove a device from the pool -
4712  *
4713  * Removing a device from the vdev namespace requires several steps
4714  * and can take a significant amount of time.  As a result we use
4715  * the spa_vdev_config_[enter/exit] functions which allow us to
4716  * grab and release the spa_config_lock while still holding the namespace
4717  * lock.  During each step the configuration is synced out.
4718  */
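
/*
 * A minimal sketch of that pattern (illustrative only), matching the
 * shape of the log-device branch of spa_vdev_remove() below:
 *
 *	txg = spa_vdev_enter(spa);			all locks held
 *	...
 *	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);	drop config lock,
 *							wait for txg to sync
 *	error = spa_vdev_remove_evacuate(spa, vd);	slow work, unlocked
 *	txg = spa_vdev_config_enter(spa);		re-grab config lock
 *	...
 *	return (spa_vdev_exit(spa, NULL, txg, error));	release everything
 */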
4719 
4720 /*
4721  * Remove a device from the pool.  Currently, this supports removing only hot
4722  * spares, slogs, and level 2 ARC devices.
4723  */
4724 int
4725 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
4726 {
4727         vdev_t *vd;
4728         metaslab_group_t *mg;
4729         nvlist_t **spares, **l2cache, *nv;
4730         uint64_t txg = 0;
4731         uint_t nspares, nl2cache;
4732         int error = 0;
4733         boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
4734 
4735         ASSERT(spa_writeable(spa));
4736 
4737         if (!locked)
4738                 txg = spa_vdev_enter(spa);
4739 
4740         vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4741 
4742         if (spa->spa_spares.sav_vdevs != NULL &&
4743             nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4744             ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
4745             (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
4746                 /*
4747                  * Only remove the hot spare if it's not currently in use
4748                  * in this pool.
4749                  */
4750                 if (vd == NULL || unspare) {
4751                         spa_vdev_remove_aux(spa->spa_spares.sav_config,
4752                             ZPOOL_CONFIG_SPARES, spares, nspares, nv);
4753                         spa_load_spares(spa);
4754                         spa->spa_spares.sav_sync = B_TRUE;
4755                 } else {
4756                         error = EBUSY;
4757                 }
4758         } else if (spa->spa_l2cache.sav_vdevs != NULL &&
4759             nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4760             ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
4761             (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
4762                 /*
4763                  * Cache devices can always be removed.
4764                  */
4765                 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
4766                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
4767                 spa_load_l2cache(spa);
4768                 spa->spa_l2cache.sav_sync = B_TRUE;
4769         } else if (vd != NULL && vd->vdev_islog) {
4770                 ASSERT(!locked);
4771                 ASSERT(vd == vd->vdev_top);
4772 
4773                 /*
4774                  * XXX - Once we have bp-rewrite this should
4775                  * become the common case.
4776                  */
4777 
4778                 mg = vd->vdev_mg;
4779 
4780                 /*
4781                  * Stop allocating from this vdev.
4782                  */
4783                 metaslab_group_passivate(mg);
4784 
4785                 /*
4786                  * Wait for the youngest allocations and frees to sync,
4787                  * and then wait for the deferral of those frees to finish.
4788                  */
4789                 spa_vdev_config_exit(spa, NULL,
4790                     txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
4791 
4792                 /*
4793                  * Attempt to evacuate the vdev.
4794                  */
4795                 error = spa_vdev_remove_evacuate(spa, vd);
4796 
4797                 txg = spa_vdev_config_enter(spa);
4798 
4799                 /*
4800                  * If we couldn't evacuate the vdev, unwind.
4801                  */
4802                 if (error) {
4803                         metaslab_group_activate(mg);
4804                         return (spa_vdev_exit(spa, NULL, txg, error));
4805                 }
4806 
4807                 /*
4808                  * Clean up the vdev namespace.
4809                  */
4810                 spa_vdev_remove_from_namespace(spa, vd);
4811 
4812         } else if (vd != NULL) {
4813                 /*
4814                  * Normal vdevs cannot be removed (yet).
4815                  */
4816                 error = ENOTSUP;
4817         } else {
4818                 /*
4819                  * There is no vdev of any kind with the specified guid.
4820                  */
4821                 error = ENOENT;
4822         }
4823 
4824         if (!locked)
4825                 return (spa_vdev_exit(spa, NULL, txg, error));
4826 
4827         return (error);
4828 }
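
/*
 * Usage sketch (illustrative): a cache device can be pulled by guid at
 * any time, but a hot spare that is currently in use in this pool is
 * only removed when unspare is B_TRUE, as spa_vdev_detach() does above;
 * otherwise the call fails with EBUSY.
 *
 *	error = spa_vdev_remove(spa, guid, B_FALSE);	l2cache device
 *	error = spa_vdev_remove(spa, guid, B_TRUE);	in-use hot spare
 */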
4829 
4830 /*
4831  * Find any device that's done replacing, or a vdev marked 'unspare' that's
4832  * currently spared, so we can detach it.
4833  */
4834 static vdev_t *
4835 spa_vdev_resilver_done_hunt(vdev_t *vd)
4836 {
4837         vdev_t *newvd, *oldvd;
4838 
4839         for (int c = 0; c < vd->vdev_children; c++) {
4840                 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4841                 if (oldvd != NULL)
4842                         return (oldvd);
4843         }
4844 
4845         /*
4846          * Check for a completed replacement.  We always consider the first
4847          * vdev in the list to be the oldest vdev, and the last one to be
4848          * the newest (see spa_vdev_attach() for how that works).  In
4849          * the case where the newest vdev is faulted, we will not automatically
4850          * remove it after a resilver completes.  This is OK as it will require
4851          * user intervention to determine which disk the admin wishes to keep.
4852          */
4853         if (vd->vdev_ops == &vdev_replacing_ops) {
4854                 ASSERT(vd->vdev_children > 1);
4855 
4856                 newvd = vd->vdev_child[vd->vdev_children - 1];
4857                 oldvd = vd->vdev_child[0];
4858 
4859                 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4860                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4861                     !vdev_dtl_required(oldvd))
4862                         return (oldvd);
4863         }
4864 
4865         /*
4866          * Check for a completed resilver with the 'unspare' flag set.
4867          */
4868         if (vd->vdev_ops == &vdev_spare_ops) {
4869                 vdev_t *first = vd->vdev_child[0];
4870                 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4871 
4872                 if (last->vdev_unspare) {
4873                         oldvd = first;
4874                         newvd = last;
4875                 } else if (first->vdev_unspare) {
4876                         oldvd = last;
4877                         newvd = first;
4878                 } else {
4879                         oldvd = NULL;
4880                 }
4881 
4882                 if (oldvd != NULL &&
4883                     vdev_dtl_empty(newvd, DTL_MISSING) &&
4884                     vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4885                     !vdev_dtl_required(oldvd))
4886                         return (oldvd);
4887 
4888                 /*
4889                  * If there are more than two spares attached to a disk,
4890                  * and those spares are not required, then we want to
4891                  * attempt to free them up now so that they can be used
4892                  * by other pools.  Once we're back down to a single
4893                  * disk+spare, we stop removing them.
4894                  */
4895                 if (vd->vdev_children > 2) {
4896                         newvd = vd->vdev_child[1];
4897 
4898                         if (newvd->vdev_isspare && last->vdev_isspare &&
4899                             vdev_dtl_empty(last, DTL_MISSING) &&
4900                             vdev_dtl_empty(last, DTL_OUTAGE) &&
4901                             !vdev_dtl_required(newvd))
4902                                 return (newvd);
4903                 }
4904         }
4905 
4906         return (NULL);
4907 }
4908 
4909 static void
4910 spa_vdev_resilver_done(spa_t *spa)
4911 {
4912         vdev_t *vd, *pvd, *ppvd;
4913         uint64_t guid, sguid, pguid, ppguid;
4914 
4915         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4916 
4917         while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4918                 pvd = vd->vdev_parent;
4919                 ppvd = pvd->vdev_parent;
4920                 guid = vd->vdev_guid;
4921                 pguid = pvd->vdev_guid;
4922                 ppguid = ppvd->vdev_guid;
4923                 sguid = 0;
4924                 /*
4925                  * If we have just finished replacing a hot spared device, then
4926                  * we need to detach the parent's first child (the original hot
4927                  * spare) as well.
4928                  */
4929                 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
4930                     ppvd->vdev_children == 2) {
4931                         ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4932                         sguid = ppvd->vdev_child[1]->vdev_guid;
4933                 }
4934                 spa_config_exit(spa, SCL_ALL, FTAG);
4935                 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4936                         return;
4937                 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4938                         return;
4939                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4940         }
4941 
4942         spa_config_exit(spa, SCL_ALL, FTAG);
4943 }
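
/*
 * Note the lock dance in the loop above: spa_vdev_detach() begins with
 * spa_vdev_enter(), which itself takes spa_namespace_lock and SCL_ALL,
 * so this function must drop SCL_ALL before each detach and re-acquire
 * it before hunting for the next candidate.
 */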
4944 
4945 /*
4946  * Update the stored path or FRU for this vdev.
4947  */
4948 int
4949 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
4950     boolean_t ispath)
4951 {
4952         vdev_t *vd;
4953         boolean_t sync = B_FALSE;
4954 
4955         ASSERT(spa_writeable(spa));
4956 
4957         spa_vdev_state_enter(spa, SCL_ALL);
4958 
4959         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4960                 return (spa_vdev_state_exit(spa, NULL, ENOENT));
4961 
4962         if (!vd->vdev_ops->vdev_op_leaf)
4963                 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4964 
4965         if (ispath) {
4966                 /* vdev_path may be NULL; treat that as a change */
4967                 if (vd->vdev_path == NULL) {
4968                         vd->vdev_path = spa_strdup(value);
4969                         sync = B_TRUE;
4970                 } else if (strcmp(value, vd->vdev_path) != 0) {
                             spa_strfree(vd->vdev_path);
                             vd->vdev_path = spa_strdup(value);
                             sync = B_TRUE;
                     }
4971         } else {
4972                 if (vd->vdev_fru == NULL) {
4973                         vd->vdev_fru = spa_strdup(value);
4974                         sync = B_TRUE;
4975                 } else if (strcmp(value, vd->vdev_fru) != 0) {
4976                         spa_strfree(vd->vdev_fru);
4977                         vd->vdev_fru = spa_strdup(value);
4978                         sync = B_TRUE;
4979                 }
4980         }
4981 
4982         return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4983 }
4984 
4985 int
4986 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
4987 {
4988         return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
4989 }
4990 
4991 int
4992 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
4993 {
4994         return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
4995 }
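
/*
 * Usage sketch (illustrative values): both wrappers funnel into
 * spa_vdev_set_common(), which only dirties the vdev state, and hence
 * rewrites the label, when the value actually changed.
 *
 *	(void) spa_vdev_setpath(spa, guid, "/dev/dsk/c0t1d0s0");
 *	(void) spa_vdev_setfru(spa, guid, "hc://:chassis=0:bay=1");
 */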
4996 
4997 /*
4998  * ==========================================================================
4999  * SPA Scanning
5000  * ==========================================================================
5001  */
5002 
5003 int
5004 spa_scan_stop(spa_t *spa)
5005 {
5006         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5007         if (dsl_scan_resilvering(spa->spa_dsl_pool))
5008                 return (EBUSY);
5009         return (dsl_scan_cancel(spa->spa_dsl_pool));
5010 }
5011 
5012 int
5013 spa_scan(spa_t *spa, pool_scan_func_t func)
5014 {
5015         ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5016 
5017         if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5018                 return (ENOTSUP);
5019 
5020         /*
5021          * If a resilver was requested, but there is no DTL on a
5022          * writeable leaf device, we have nothing to do.
5023          */
5024         if (func == POOL_SCAN_RESILVER &&
5025             !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5026                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5027                 return (0);
5028         }
5029 
5030         return (dsl_scan(spa->spa_dsl_pool, func));
5031 }
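
/*
 * Usage sketch (illustrative): start a scrub, then cancel it.  Per the
 * ASSERTs above, neither call may be made with the config lock held as
 * writer; both are normally driven from the zfs ioctl path.
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *	...
 *	error = spa_scan_stop(spa);	EBUSY while resilvering
 */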
5032 
5033 /*
5034  * ==========================================================================
5035  * SPA async task processing
5036  * ==========================================================================
5037  */
5038 
5039 static void
5040 spa_async_remove(spa_t *spa, vdev_t *vd)
5041 {
5042         if (vd->vdev_remove_wanted) {
5043                 vd->vdev_remove_wanted = B_FALSE;
5044                 vd->vdev_delayed_close = B_FALSE;
5045                 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5046 
5047                 /*
5048                  * We want to clear the stats, but we don't want to do a full
5049                  * vdev_clear() as that will cause us to throw away
5050                  * degraded/faulted state as well as attempt to reopen the
5051                  * device, all of which is a waste.
5052                  */
5053                 vd->vdev_stat.vs_read_errors = 0;
5054                 vd->vdev_stat.vs_write_errors = 0;
5055                 vd->vdev_stat.vs_checksum_errors = 0;
5056 
5057                 vdev_state_dirty(vd->vdev_top);
5058         }
5059 
5060         for (int c = 0; c < vd->vdev_children; c++)
5061                 spa_async_remove(spa, vd->vdev_child[c]);
5062 }
5063 
5064 static void
5065 spa_async_probe(spa_t *spa, vdev_t *vd)
5066 {
5067         if (vd->vdev_probe_wanted) {
5068                 vd->vdev_probe_wanted = B_FALSE;
5069                 vdev_reopen(vd);        /* vdev_open() does the actual probe */
5070         }
5071 
5072         for (int c = 0; c < vd->vdev_children; c++)
5073                 spa_async_probe(spa, vd->vdev_child[c]);
5074 }
5075 
5076 static void
5077 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5078 {
5079         sysevent_id_t eid;
5080         nvlist_t *attr;
5081         char *physpath;
5082 
5083         if (!spa->spa_autoexpand)
5084                 return;
5085 
5086         for (int c = 0; c < vd->vdev_children; c++) {
5087                 vdev_t *cvd = vd->vdev_child[c];
5088                 spa_async_autoexpand(spa, cvd);
5089         }
5090 
5091         if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5092                 return;
5093 
5094         physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5095         (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5096 
5097         VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5098         VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5099 
5100         (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5101             ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5102 
5103         nvlist_free(attr);
5104         kmem_free(physpath, MAXPATHLEN);
5105 }
5106 
5107 static void
5108 spa_async_thread(spa_t *spa)
5109 {
5110         int tasks;
5111 
5112         ASSERT(spa->spa_sync_on);
5113 
5114         mutex_enter(&spa->spa_async_lock);
5115         tasks = spa->spa_async_tasks;
5116         spa->spa_async_tasks = 0;
5117         mutex_exit(&spa->spa_async_lock);
5118 
5119         /*
5120          * See if the config needs to be updated.
5121          */
5122         if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5123                 uint64_t old_space, new_space;
5124 
5125                 mutex_enter(&spa_namespace_lock);
5126                 old_space = metaslab_class_get_space(spa_normal_class(spa));
5127                 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5128                 new_space = metaslab_class_get_space(spa_normal_class(spa));
5129                 mutex_exit(&spa_namespace_lock);
5130 
5131                 /*
5132                  * If the pool grew as a result of the config update,
5133                  * then log an internal history event.
5134                  */
5135                 if (new_space != old_space) {
5136                         spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
5137                             spa, NULL,
5138                             "pool '%s' size: %llu(+%llu)",
5139                             spa_name(spa), new_space, new_space - old_space);
5140                 }
5141         }
5142 
5143         /*
5144          * See if any devices need to be marked REMOVED.
5145          */
5146         if (tasks & SPA_ASYNC_REMOVE) {
5147                 spa_vdev_state_enter(spa, SCL_NONE);
5148                 spa_async_remove(spa, spa->spa_root_vdev);
5149                 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5150                         spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5151                 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5152                         spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5153                 (void) spa_vdev_state_exit(spa, NULL, 0);
5154         }
5155 
5156         if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5157                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5158                 spa_async_autoexpand(spa, spa->spa_root_vdev);
5159                 spa_config_exit(spa, SCL_CONFIG, FTAG);
5160         }
5161 
5162         /*
5163          * See if any devices need to be probed.
5164          */
5165         if (tasks & SPA_ASYNC_PROBE) {
5166                 spa_vdev_state_enter(spa, SCL_NONE);
5167                 spa_async_probe(spa, spa->spa_root_vdev);
5168                 (void) spa_vdev_state_exit(spa, NULL, 0);
5169         }
5170 
5171         /*
5172          * If any devices are done replacing, detach them.
5173          */
5174         if (tasks & SPA_ASYNC_RESILVER_DONE)
5175                 spa_vdev_resilver_done(spa);
5176 
5177         /*
5178          * Kick off a resilver.
5179          */
5180         if (tasks & SPA_ASYNC_RESILVER)
5181                 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5182 
5183         /*
5184          * Let the world know that we're done.
5185          */
5186         mutex_enter(&spa->spa_async_lock);
5187         spa->spa_async_thread = NULL;
5188         cv_broadcast(&spa->spa_async_cv);
5189         mutex_exit(&spa->spa_async_lock);
5190         thread_exit();
5191 }
5192 
5193 void
5194 spa_async_suspend(spa_t *spa)
5195 {
5196         mutex_enter(&spa->spa_async_lock);
5197         spa->spa_async_suspended++;
5198         while (spa->spa_async_thread != NULL)
5199                 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5200         mutex_exit(&spa->spa_async_lock);
5201 }
5202 
5203 void
5204 spa_async_resume(spa_t *spa)
5205 {
5206         mutex_enter(&spa->spa_async_lock);
5207         ASSERT(spa->spa_async_suspended != 0);
5208         spa->spa_async_suspended--;
5209         mutex_exit(&spa->spa_async_lock);
5210 }
5211 
5212 static void
5213 spa_async_dispatch(spa_t *spa)
5214 {
5215         mutex_enter(&spa->spa_async_lock);
5216         if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5217             spa->spa_async_thread == NULL &&
5218             rootdir != NULL && !vn_is_readonly(rootdir))
5219                 spa->spa_async_thread = thread_create(NULL, 0,
5220                     spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5221         mutex_exit(&spa->spa_async_lock);
5222 }
5223 
5224 void
5225 spa_async_request(spa_t *spa, int task)
5226 {
5227         zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5228         mutex_enter(&spa->spa_async_lock);
5229         spa->spa_async_tasks |= task;
5230         mutex_exit(&spa->spa_async_lock);
5231 }
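
/*
 * The async machinery is a bitmask handshake: producers OR a task into
 * spa_async_tasks under spa_async_lock, and spa_async_dispatch() later
 * spawns spa_async_thread() to consume the whole mask.  Illustrative
 * producer:
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 *
 * The request is deferred rather than synchronous; spa_async_suspend()
 * waits out any thread that is already running and blocks new dispatch.
 */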
5232 
5233 /*
5234  * ==========================================================================
5235  * SPA syncing routines
5236  * ==========================================================================
5237  */
5238 
5239 static int
5240 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5241 {
5242         bpobj_t *bpo = arg;
5243         bpobj_enqueue(bpo, bp, tx);
5244         return (0);
5245 }
5246 
5247 static int
5248 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5249 {
5250         zio_t *zio = arg;
5251 
5252         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5253             zio->io_flags));
5254         return (0);
5255 }
5256 
5257 static void
5258 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5259 {
5260         char *packed = NULL;
5261         size_t bufsize;
5262         size_t nvsize = 0;
5263         dmu_buf_t *db;
5264 
5265         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5266 
5267         /*
5268          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5269          * information.  This avoids the dbuf_will_dirty() path and
5270          * saves us a pre-read to get data we don't actually care about.
5271          */
5272         bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5273         packed = kmem_alloc(bufsize, KM_SLEEP);
5274 
5275         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5276             KM_SLEEP) == 0);
5277         bzero(packed + nvsize, bufsize - nvsize);
5278 
5279         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5280 
5281         kmem_free(packed, bufsize);
5282 
5283         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5284         dmu_buf_will_dirty(db, tx);
5285         *(uint64_t *)db->db_data = nvsize;
5286         dmu_buf_rele(db, FTAG);
5287 }
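
/*
 * The on-disk layout written above is: the packed XDR nvlist in the
 * object's data blocks, and the true packed size in the 64-bit bonus
 * buffer.  A reader reverses it roughly as follows (sketch, error
 * handling elided; cf. load_nvlist() earlier in this file):
 *
 *	VERIFY(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db) == 0);
 *	nvsize = *(uint64_t *)db->db_data;
 *	dmu_buf_rele(db, FTAG);
 *	packed = kmem_alloc(nvsize, KM_SLEEP);
 *	VERIFY(dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
 *	    DMU_READ_PREFETCH) == 0);
 *	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
 *	kmem_free(packed, nvsize);
 */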
5288 
5289 static void
5290 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5291     const char *config, const char *entry)
5292 {
5293         nvlist_t *nvroot;
5294         nvlist_t **list;
5295         int i;
5296 
5297         if (!sav->sav_sync)
5298                 return;
5299 
5300         /*
5301          * Update the MOS nvlist describing the list of available devices.
5302          * spa_validate_aux() will have already made sure this nvlist is
5303          * valid and the vdevs are labeled appropriately.
5304          */
5305         if (sav->sav_object == 0) {
5306                 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5307                     DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5308                     sizeof (uint64_t), tx);
5309                 VERIFY(zap_update(spa->spa_meta_objset,
5310                     DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5311                     &sav->sav_object, tx) == 0);
5312         }
5313 
5314         VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5315         if (sav->sav_count == 0) {
5316                 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5317         } else {
5318                 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5319                 for (i = 0; i < sav->sav_count; i++)
5320                         list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5321                             B_FALSE, VDEV_CONFIG_L2CACHE);
5322                 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5323                     sav->sav_count) == 0);
5324                 for (i = 0; i < sav->sav_count; i++)
5325                         nvlist_free(list[i]);
5326                 kmem_free(list, sav->sav_count * sizeof (void *));
5327         }
5328 
5329         spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5330         nvlist_free(nvroot);
5331 
5332         sav->sav_sync = B_FALSE;
5333 }
5334 
5335 static void
5336 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5337 {
5338         nvlist_t *config;
5339 
5340         if (list_is_empty(&spa->spa_config_dirty_list))
5341                 return;
5342 
5343         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5344 
5345         config = spa_config_generate(spa, spa->spa_root_vdev,
5346             dmu_tx_get_txg(tx), B_FALSE);
5347 
5348         spa_config_exit(spa, SCL_STATE, FTAG);
5349 
5350         if (spa->spa_config_syncing)
5351                 nvlist_free(spa->spa_config_syncing);
5352         spa->spa_config_syncing = config;
5353 
5354         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5355 }
5356 
5357 /*
5358  * Set zpool properties.
5359  */
5360 static void
5361 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5362 {
5363         spa_t *spa = arg1;
5364         objset_t *mos = spa->spa_meta_objset;
5365         nvlist_t *nvp = arg2;
5366         nvpair_t *elem;
5367         uint64_t intval;
5368         char *strval;
5369         zpool_prop_t prop;
5370         const char *propname;
5371         zprop_type_t proptype;
5372 
5373         mutex_enter(&spa->spa_props_lock);
5374 
5375         elem = NULL;
5376         while ((elem = nvlist_next_nvpair(nvp, elem))) {
5377                 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5378                 case ZPOOL_PROP_VERSION:
5379                         /*
5380                          * Only set version for non-zpool-creation cases
5381                          * (set/import). spa_create() needs special care
5382                          * for version setting.
5383                          */
5384                         if (tx->tx_txg != TXG_INITIAL) {
5385                                 VERIFY(nvpair_value_uint64(elem,
5386                                     &intval) == 0);
5387                                 ASSERT(intval <= SPA_VERSION);
5388                                 ASSERT(intval >= spa_version(spa));
5389                                 spa->spa_uberblock.ub_version = intval;
5390                                 vdev_config_dirty(spa->spa_root_vdev);
5391                         }
5392                         break;
5393 
5394                 case ZPOOL_PROP_ALTROOT:
5395                         /*
5396                          * 'altroot' is a non-persistent property. It should
5397                          * have been set temporarily at creation or import time.
5398                          */
5399                         ASSERT(spa->spa_root != NULL);
5400                         break;
5401 
5402                 case ZPOOL_PROP_READONLY:
5403                 case ZPOOL_PROP_CACHEFILE:
5404                         /*
5405                          * 'readonly' and 'cachefile' are also non-persistent
5406                          * properties.
5407                          */
5408                         break;
5409                 case ZPOOL_PROP_COMMENT:
5410                         VERIFY(nvpair_value_string(elem, &strval) == 0);
5411                         if (spa->spa_comment != NULL)
5412                                 spa_strfree(spa->spa_comment);
5413                         spa->spa_comment = spa_strdup(strval);
5414                         /*
5415                          * We need to dirty the configuration on all the vdevs
5416                          * so that their labels get updated.  It's unnecessary
5417                          * to do this for pool creation since the vdev's
5418                          * configuration has already been dirtied.
5419                          */
5420                         if (tx->tx_txg != TXG_INITIAL)
5421                                 vdev_config_dirty(spa->spa_root_vdev);
5422                         break;
5423                 default:
5424                         /*
5425                          * Set pool property values in the poolprops mos object.
5426                          */
5427                         if (spa->spa_pool_props_object == 0) {
5428                                 VERIFY((spa->spa_pool_props_object =
5429                                     zap_create(mos, DMU_OT_POOL_PROPS,
5430                                     DMU_OT_NONE, 0, tx)) > 0);
5431 
5432                                 VERIFY(zap_update(mos,
5433                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5434                                     8, 1, &spa->spa_pool_props_object, tx)
5435                                     == 0);
5436                         }
5437 
5438                         /* normalize the property name */
5439                         propname = zpool_prop_to_name(prop);
5440                         proptype = zpool_prop_get_type(prop);
5441 
5442                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
5443                                 ASSERT(proptype == PROP_TYPE_STRING);
5444                                 VERIFY(nvpair_value_string(elem, &strval) == 0);
5445                                 VERIFY(zap_update(mos,
5446                                     spa->spa_pool_props_object, propname,
5447                                     1, strlen(strval) + 1, strval, tx) == 0);
5448 
5449                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5450                                 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5451 
5452                                 if (proptype == PROP_TYPE_INDEX) {
5453                                         const char *unused;
5454                                         VERIFY(zpool_prop_index_to_string(
5455                                             prop, intval, &unused) == 0);
5456                                 }
5457                                 VERIFY(zap_update(mos,
5458                                     spa->spa_pool_props_object, propname,
5459                                     8, 1, &intval, tx) == 0);
5460                         } else {
5461                                 ASSERT(0); /* not allowed */
5462                         }
5463 
5464                         switch (prop) {
5465                         case ZPOOL_PROP_DELEGATION:
5466                                 spa->spa_delegation = intval;
5467                                 break;
5468                         case ZPOOL_PROP_BOOTFS:
5469                                 spa->spa_bootfs = intval;
5470                                 break;
5471                         case ZPOOL_PROP_FAILUREMODE:
5472                                 spa->spa_failmode = intval;
5473                                 break;
5474                         case ZPOOL_PROP_AUTOEXPAND:
5475                                 spa->spa_autoexpand = intval;
5476                                 if (tx->tx_txg != TXG_INITIAL)
5477                                         spa_async_request(spa,
5478                                             SPA_ASYNC_AUTOEXPAND);
5479                                 break;
5480                         case ZPOOL_PROP_DEDUPDITTO:
5481                                 spa->spa_dedup_ditto = intval;
5482                                 break;
5483                         default:
5484                                 break;
5485                         }
5486                 }
5487 
5488                 /* log internal history if this is not a zpool create */
5489                 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5490                     tx->tx_txg != TXG_INITIAL) {
5491                         spa_history_log_internal(LOG_POOL_PROPSET,
5492                             spa, tx, "%s %lld %s",
5493                             nvpair_name(elem), intval, spa_name(spa));
5494                 }
5495         }
5496 
5497         mutex_exit(&spa->spa_props_lock);
5498 }
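
/*
 * spa_sync_props() is the syncing half of a sync task; its open-context
 * half is spa_prop_set(), which validates the nvlist before scheduling
 * this function.  Sketch of the producer side (illustrative):
 *
 *	nvlist_t *props;
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 */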
5499 
5500 /*
5501  * Perform one-time upgrade on-disk changes.  spa_version() does not
5502  * reflect the new version this txg, so there must be no changes this
5503  * txg to anything that the upgrade code depends on after it executes.
5504  * Therefore this must be called after dsl_pool_sync() does the sync
5505  * tasks.
5506  */
5507 static void
5508 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
5509 {
5510         dsl_pool_t *dp = spa->spa_dsl_pool;
5511 
5512         ASSERT(spa->spa_sync_pass == 1);
5513 
5514         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5515             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5516                 dsl_pool_create_origin(dp, tx);
5517 
5518                 /* Keeping the origin open increases spa_minref */
5519                 spa->spa_minref += 3;
5520         }
5521 
5522         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5523             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5524                 dsl_pool_upgrade_clones(dp, tx);
5525         }
5526 
5527         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5528             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5529                 dsl_pool_upgrade_dir_clones(dp, tx);
5530 
5531                 /* Keeping the freedir open increases spa_minref */
5532                 spa->spa_minref += 3;
5533         }
5534 }
5535 
5536 /*
5537  * Sync the specified transaction group.  New blocks may be dirtied as
5538  * part of the process, so we iterate until it converges.
5539  */
5540 void
5541 spa_sync(spa_t *spa, uint64_t txg)
5542 {
5543         dsl_pool_t *dp = spa->spa_dsl_pool;
5544         objset_t *mos = spa->spa_meta_objset;
5545         bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5546         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5547         vdev_t *rvd = spa->spa_root_vdev;
5548         vdev_t *vd;
5549         dmu_tx_t *tx;
5550         int error;
5551 
5552         VERIFY(spa_writeable(spa));
5553 
5554         /*
5555          * Lock out configuration changes.
5556          */
5557         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5558 
5559         spa->spa_syncing_txg = txg;
5560         spa->spa_sync_pass = 0;
5561 
5562         /*
5563          * If there are any pending vdev state changes, convert them
5564          * into config changes that go out with this transaction group.
5565          */
5566         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5567         while (list_head(&spa->spa_state_dirty_list) != NULL) {
5568                 /*
5569                  * We need the write lock here because, for aux vdevs,
5570                  * calling vdev_config_dirty() modifies sav_config.
5571                  * This is ugly and will become unnecessary when we
5572                  * eliminate the aux vdev wart by integrating all vdevs
5573                  * into the root vdev tree.
5574                  */
5575                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5576                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
5577                 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
5578                         vdev_state_clean(vd);
5579                         vdev_config_dirty(vd);
5580                 }
5581                 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5582                 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
5583         }
5584         spa_config_exit(spa, SCL_STATE, FTAG);
5585 
5586         tx = dmu_tx_create_assigned(dp, txg);
5587 
5588         /*
5589          * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
5590          * set spa_deflate if we have no raid-z vdevs.
5591          */
5592         if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
5593             spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
5594                 int i;
5595 
5596                 for (i = 0; i < rvd->vdev_children; i++) {
5597                         vd = rvd->vdev_child[i];
5598                         if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
5599                                 break;
5600                 }
5601                 if (i == rvd->vdev_children) {
5602                         spa->spa_deflate = TRUE;
5603                         VERIFY(0 == zap_add(spa->spa_meta_objset,
5604                             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5605                             sizeof (uint64_t), 1, &spa->spa_deflate, tx));
5606                 }
5607         }
5608 
5609         /*
5610          * If anything has changed in this txg, or if someone is waiting
5611          * for this txg to sync (eg, spa_vdev_remove()), push the
5612          * deferred frees from the previous txg.  If not, leave them
5613          * alone so that we don't generate work on an otherwise idle
5614          * system.
5615          */
5616         if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
5617             !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
5618             !txg_list_empty(&dp->dp_sync_tasks, txg) ||
5619             ((dsl_scan_active(dp->dp_scan) ||
5620             txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
5621                 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5622                 VERIFY3U(bpobj_iterate(defer_bpo,
5623                     spa_free_sync_cb, zio, tx), ==, 0);
5624                 VERIFY3U(zio_wait(zio), ==, 0);
5625         }
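        /*
         * In the block above, bpobj_iterate() walks (and empties) the
         * deferred bpobj, invoking spa_free_sync_cb() on each block
         * pointer; the callback issues the actual free as a child of the
         * root zio, and zio_wait() blocks until every free has completed.
         */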
5626 
5627         /*
5628          * Iterate to convergence.
5629          */
5630         do {
5631                 int pass = ++spa->spa_sync_pass;
5632 
5633                 spa_sync_config_object(spa, tx);
5634                 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
5635                     ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
5636                 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
5637                     ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
5638                 spa_errlog_sync(spa, txg);
5639                 dsl_pool_sync(dp, txg);
5640 
5641                 if (pass <= SYNC_PASS_DEFERRED_FREE) {
5642                         zio_t *zio = zio_root(spa, NULL, NULL, 0);
5643                         bplist_iterate(free_bpl, spa_free_sync_cb,
5644                             zio, tx);
5645                         VERIFY(zio_wait(zio) == 0);
5646                 } else {
5647                         bplist_iterate(free_bpl, bpobj_enqueue_cb,
5648                             defer_bpo, tx);
5649                 }
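                /*
                 * Only the first SYNC_PASS_DEFERRED_FREE passes perform
                 * frees directly; later passes just requeue the blocks
                 * onto the deferred bpobj.  Freeing dirties space maps,
                 * so unbounded freeing could keep the MOS dirty forever
                 * and prevent this loop from converging.
                 */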
5650 
5651                 ddt_sync(spa, txg);
5652                 dsl_scan_sync(dp, tx);
5653 
                while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
                    != NULL)
5655                         vdev_sync(vd, txg);
5656 
5657                 if (pass == 1)
5658                         spa_sync_upgrades(spa, tx);
5659 
5660         } while (dmu_objset_is_dirty(mos, txg));
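        /*
         * Convergence note: each pass can itself dirty the MOS again
         * (e.g. writing dirty data allocates space, which dirties space
         * maps), so we iterate until a pass completes with the MOS clean
         * for this txg.
         */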
5661 
5662         /*
5663          * Rewrite the vdev configuration (which includes the uberblock)
5664          * to commit the transaction group.
5665          *
5666          * If there are no dirty vdevs, we sync the uberblock to a few
5667          * random top-level vdevs that are known to be visible in the
5668          * config cache (see spa_vdev_add() for a complete description).
5669          * If there *are* dirty vdevs, sync the uberblock to all vdevs.
5670          */
5671         for (;;) {
5672                 /*
5673                  * We hold SCL_STATE to prevent vdev open/close/etc.
5674                  * while we're attempting to write the vdev labels.
5675                  */
5676                 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5677 
5678                 if (list_is_empty(&spa->spa_config_dirty_list)) {
5679                         vdev_t *svd[SPA_DVAS_PER_BP];
5680                         int svdcount = 0;
5681                         int children = rvd->vdev_children;
5682                         int c0 = spa_get_random(children);
5683 
5684                         for (int c = 0; c < children; c++) {
5685                                 vd = rvd->vdev_child[(c0 + c) % children];
5686                                 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
5687                                         continue;
5688                                 svd[svdcount++] = vd;
5689                                 if (svdcount == SPA_DVAS_PER_BP)
5690                                         break;
5691                         }
5692                         error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
5693                         if (error != 0)
5694                                 error = vdev_config_sync(svd, svdcount, txg,
5695                                     B_TRUE);
5696                 } else {
5697                         error = vdev_config_sync(rvd->vdev_child,
5698                             rvd->vdev_children, txg, B_FALSE);
5699                         if (error != 0)
5700                                 error = vdev_config_sync(rvd->vdev_child,
5701                                     rvd->vdev_children, txg, B_TRUE);
5702                 }
5703 
5704                 spa_config_exit(spa, SCL_STATE, FTAG);
5705 
5706                 if (error == 0)
5707                         break;
5708                 zio_suspend(spa, NULL);
5709                 zio_resume_wait(spa);
5710         }
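        /*
         * A failed config sync cannot abandon the txg; instead,
         * zio_suspend() stops all I/O to the pool and zio_resume_wait()
         * blocks until the pool is resumed (e.g. after the underlying
         * devices recover), at which point we retry indefinitely.
         */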
5711         dmu_tx_commit(tx);
5712 
5713         /*
5714          * Clear the dirty config list.
5715          */
5716         while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
5717                 vdev_config_clean(vd);
5718 
5719         /*
5720          * Now that the new config has synced transactionally,
5721          * let it become visible to the config cache.
5722          */
5723         if (spa->spa_config_syncing != NULL) {
5724                 spa_config_set(spa, spa->spa_config_syncing);
5725                 spa->spa_config_txg = txg;
5726                 spa->spa_config_syncing = NULL;
5727         }
5728 
5729         spa->spa_ubsync = spa->spa_uberblock;
5730 
5731         dsl_pool_sync_done(dp, txg);
5732 
5733         /*
5734          * Update usable space statistics.
5735          */
        while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
            TXG_CLEAN(txg))) != NULL)
5737                 vdev_sync_done(vd, txg);
5738 
5739         spa_update_dspace(spa);
5740 
5741         /*
5742          * It had better be the case that we didn't dirty anything
5743          * since vdev_config_sync().
5744          */
5745         ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
5746         ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
5747         ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
5748 
5749         spa->spa_sync_pass = 0;
5750 
5751         spa_config_exit(spa, SCL_CONFIG, FTAG);
5752 
5753         spa_handle_ignored_writes(spa);
5754 
5755         /*
5756          * If any async tasks have been requested, kick them off.
5757          */
5758         spa_async_dispatch(spa);
5759 }
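
/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * spa_sync() is driven by the txg sync thread and is never called
 * directly by consumers.  A consumer that needs its change on stable
 * storage commits its transaction and waits for the txg instead:
 *
 *	uint64_t txg = dmu_tx_get_txg(tx);
 *	dmu_tx_commit(tx);
 *	txg_wait_synced(spa_get_dsl(spa), txg);
 */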
5760 
5761 /*
5762  * Sync all pools.  We don't want to hold the namespace lock across these
5763  * operations, so we take a reference on the spa_t and drop the lock during the
5764  * sync.
5765  */
5766 void
5767 spa_sync_allpools(void)
5768 {
5769         spa_t *spa = NULL;
5770         mutex_enter(&spa_namespace_lock);
5771         while ((spa = spa_next(spa)) != NULL) {
5772                 if (spa_state(spa) != POOL_STATE_ACTIVE ||
5773                     !spa_writeable(spa) || spa_suspended(spa))
5774                         continue;
5775                 spa_open_ref(spa, FTAG);
5776                 mutex_exit(&spa_namespace_lock);
5777                 txg_wait_synced(spa_get_dsl(spa), 0);
5778                 mutex_enter(&spa_namespace_lock);
5779                 spa_close(spa, FTAG);
5780         }
5781         mutex_exit(&spa_namespace_lock);
5782 }
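
/*
 * The spa_open_ref()/spa_close() pair above is what makes dropping
 * spa_namespace_lock safe: the extra reference prevents a concurrent
 * spa_remove() from freeing the spa_t while we sleep in
 * txg_wait_synced() without the lock held.
 */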
5783 
5784 /*
5785  * ==========================================================================
5786  * Miscellaneous routines
5787  * ==========================================================================
5788  */
5789 
5790 /*
5791  * Remove all pools in the system.
5792  */
5793 void
5794 spa_evict_all(void)
5795 {
5796         spa_t *spa;
5797 
5798         /*
5799          * Remove all cached state.  All pools should be closed now,
5800          * so every spa in the AVL tree should be unreferenced.
5801          */
5802         mutex_enter(&spa_namespace_lock);
5803         while ((spa = spa_next(NULL)) != NULL) {
5804                 /*
5805                  * Stop async tasks.  The async thread may need to detach
5806                  * a device that's been replaced, which requires grabbing
5807                  * spa_namespace_lock, so we must drop it here.
5808                  */
5809                 spa_open_ref(spa, FTAG);
5810                 mutex_exit(&spa_namespace_lock);
5811                 spa_async_suspend(spa);
5812                 mutex_enter(&spa_namespace_lock);
5813                 spa_close(spa, FTAG);
5814 
5815                 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5816                         spa_unload(spa);
5817                         spa_deactivate(spa);
5818                 }
5819                 spa_remove(spa);
5820         }
5821         mutex_exit(&spa_namespace_lock);
5822 }
5823 
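/*
 * Find the named vdev anywhere in the pool: in the root vdev tree, or,
 * if 'aux' is set, among the l2cache and spare devices as well.
 */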
5824 vdev_t *
5825 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5826 {
5827         vdev_t *vd;
5828         int i;
5829 
5830         if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5831                 return (vd);
5832 
5833         if (aux) {
5834                 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5835                         vd = spa->spa_l2cache.sav_vdevs[i];
5836                         if (vd->vdev_guid == guid)
5837                                 return (vd);
5838                 }
5839 
5840                 for (i = 0; i < spa->spa_spares.sav_count; i++) {
5841                         vd = spa->spa_spares.sav_vdevs[i];
5842                         if (vd->vdev_guid == guid)
5843                                 return (vd);
5844                 }
5845         }
5846 
5847         return (NULL);
5848 }
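
/*
 * Illustrative sketch (hypothetical caller): finding any device in the
 * pool by guid, including hot spares and l2cache devices.  Some SCL lock
 * must be held so the vdev tree and aux lists are stable; the exact
 * locking is the caller's responsibility.
 *
 *	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 *	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 *	if (vd != NULL)
 *		... operate on the vdev ...
 *	spa_config_exit(spa, SCL_ALL, FTAG);
 */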
5849 
5850 void
5851 spa_upgrade(spa_t *spa, uint64_t version)
5852 {
5853         ASSERT(spa_writeable(spa));
5854 
5855         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5856 
        /*
         * This should only be called for a non-faulted pool, and since a
         * pool from a future version would not have opened at all, the
         * current uberblock version can never exceed SPA_VERSION.
         */
5862         ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5863         ASSERT(version >= spa->spa_uberblock.ub_version);
5864 
5865         spa->spa_uberblock.ub_version = version;
5866         vdev_config_dirty(spa->spa_root_vdev);
5867 
5868         spa_config_exit(spa, SCL_ALL, FTAG);
5869 
5870         txg_wait_synced(spa_get_dsl(spa), 0);
5871 }
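
/*
 * Illustrative sketch (hypothetical caller, e.g. the pool-upgrade ioctl
 * path): upgrades only move forward, and only when requested:
 *
 *	if (newversion > spa_version(spa))
 *		spa_upgrade(spa, newversion);
 *
 * The txg_wait_synced() above ensures the new version is in the labels
 * on disk before spa_upgrade() returns.
 */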
5872 
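/*
 * Determine whether 'guid' names one of this pool's hot spares, either
 * one already in the configuration (sav_vdevs) or one still pending
 * addition (sav_pending).
 */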
5873 boolean_t
5874 spa_has_spare(spa_t *spa, uint64_t guid)
5875 {
5876         int i;
5877         uint64_t spareguid;
5878         spa_aux_vdev_t *sav = &spa->spa_spares;
5879 
5880         for (i = 0; i < sav->sav_count; i++)
5881                 if (sav->sav_vdevs[i]->vdev_guid == guid)
5882                         return (B_TRUE);
5883 
5884         for (i = 0; i < sav->sav_npending; i++) {
5885                 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5886                     &spareguid) == 0 && spareguid == guid)
5887                         return (B_TRUE);
5888         }
5889 
5890         return (B_FALSE);
5891 }
5892 
/*
 * Check if a pool has an active shared spare device.
 * Note: an active spare is referenced twice, once as a spare and once as
 * a replacing vdev, so a reference count greater than 2 means the spare
 * is shared with another pool.
 */
5897 static boolean_t
5898 spa_has_active_shared_spare(spa_t *spa)
5899 {
5900         int i, refcnt;
5901         uint64_t pool;
5902         spa_aux_vdev_t *sav = &spa->spa_spares;
5903 
5904         for (i = 0; i < sav->sav_count; i++) {
5905                 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5906                     &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5907                     refcnt > 2)
5908                         return (B_TRUE);
5909         }
5910 
5911         return (B_FALSE);
5912 }
5913 
5914 /*
5915  * Post a sysevent corresponding to the given event.  The 'name' must be one of
5916  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
5917  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
5918  * in the userland libzpool, as we don't want consumers to misinterpret ztest
5919  * or zdb as real changes.
5920  */
5921 void
5922 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5923 {
5924 #ifdef _KERNEL
5925         sysevent_t              *ev;
5926         sysevent_attr_list_t    *attr = NULL;
5927         sysevent_value_t        value;
5928         sysevent_id_t           eid;
5929 
5930         ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5931             SE_SLEEP);
5932 
5933         value.value_type = SE_DATA_TYPE_STRING;
5934         value.value.sv_string = spa_name(spa);
5935         if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5936                 goto done;
5937 
5938         value.value_type = SE_DATA_TYPE_UINT64;
5939         value.value.sv_uint64 = spa_guid(spa);
5940         if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5941                 goto done;
5942 
5943         if (vd) {
5944                 value.value_type = SE_DATA_TYPE_UINT64;
5945                 value.value.sv_uint64 = vd->vdev_guid;
5946                 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5947                     SE_SLEEP) != 0)
5948                         goto done;
5949 
5950                 if (vd->vdev_path) {
5951                         value.value_type = SE_DATA_TYPE_STRING;
5952                         value.value.sv_string = vd->vdev_path;
5953                         if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5954                             &value, SE_SLEEP) != 0)
5955                                 goto done;
5956                 }
5957         }
5958 
5959         if (sysevent_attach_attributes(ev, attr) != 0)
5960                 goto done;
5961         attr = NULL;
5962 
5963         (void) log_sysevent(ev, SE_SLEEP, &eid);
5964 
5965 done:
5966         if (attr)
5967                 sysevent_free_attr(attr);
5968         sysevent_free(ev);
5969 #endif
5970 }
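
/*
 * Illustrative sketch (hypothetical call site): posting a vdev-removal
 * event uses one of the EC_ZFS subclass names from
 * sys/sysevent/eventdefs.h:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 */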