2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>


  45 #include <sys/metaslab.h>
  46 #include <sys/metaslab_impl.h>
  47 #include <sys/uberblock_impl.h>
  48 #include <sys/txg.h>
  49 #include <sys/avl.h>
  50 #include <sys/dmu_traverse.h>
  51 #include <sys/dmu_objset.h>
  52 #include <sys/unique.h>
  53 #include <sys/dsl_pool.h>
  54 #include <sys/dsl_dataset.h>
  55 #include <sys/dsl_dir.h>
  56 #include <sys/dsl_prop.h>
  57 #include <sys/dsl_synctask.h>
  58 #include <sys/fs/zfs.h>
  59 #include <sys/arc.h>
  60 #include <sys/callb.h>
  61 #include <sys/systeminfo.h>
  62 #include <sys/spa_boot.h>
  63 #include <sys/zfs_ioctl.h>
  64 #include <sys/dsl_scan.h>

  65 
  66 #ifdef  _KERNEL
  67 #include <sys/bootprops.h>
  68 #include <sys/callb.h>
  69 #include <sys/cpupart.h>
  70 #include <sys/pool.h>
  71 #include <sys/sysdc.h>
  72 #include <sys/zone.h>
  73 #endif  /* _KERNEL */
  74 
  75 #include "zfs_prop.h"
  76 #include "zfs_comutil.h"
  77 
  78 typedef enum zti_modes {
  79         zti_mode_fixed,                 /* value is # of threads (min 1) */
  80         zti_mode_online_percent,        /* value is % of online CPUs */
  81         zti_mode_batch,                 /* cpu-intensive; value is ignored */
  82         zti_mode_null,                  /* don't create a taskq */
  83         zti_nmodes
  84 } zti_modes_t;


  96 } zio_taskq_info_t;
  97 
  98 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  99         "issue", "issue_high", "intr", "intr_high"
 100 };
 101 
 102 /*
 103  * Define the taskq threads for the following I/O types:
 104  *      NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 105  */
 106 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 107         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 108         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 109         { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },
 110         { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
 111         { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 112         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 113         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 114 };
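Each cell in the table above pairs a zti mode with a value; the taskq-creation code that consumes it is elided from this hunk. A minimal sketch of that dispatch, assuming (hypothetically) that zio_taskq_info_t carries zti_mode and zti_value fields, and using the stock taskq_create() interface:

        /* sketch only -- ztip, tq and name are hypothetical locals */
        switch (ztip->zti_mode) {
        case zti_mode_fixed:
                /* fixed thread count, minimum of one */
                tq = taskq_create(name, MAX(ztip->zti_value, 1),
                    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
                break;
        case zti_mode_online_percent:
                /* zti_value is a percentage of online CPUs */
                tq = taskq_create(name, ztip->zti_value, maxclsyspri,
                    50, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
                break;
        case zti_mode_null:
                tq = NULL;      /* don't create a taskq */
                break;
        }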
 115 

 116 static dsl_syncfunc_t spa_sync_props;
 117 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 118 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 119     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 120     char **ereport);
 121 static void spa_vdev_resilver_done(spa_t *spa);
 122 
 123 uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 124 id_t            zio_taskq_psrset_bind = PS_NONE;
 125 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 126 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 127 
 128 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 129 
 130 /*
 131  * This (illegal) pool name is used when temporarily importing a spa_t in order
 132  * to get the vdev stats associated with the imported devices.
 133  */
 134 #define TRYIMPORT_NAME  "$import"
 135 


 151 
 152         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 153         VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 154 
 155         if (strval != NULL)
 156                 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 157         else
 158                 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 159 
 160         VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 161         nvlist_free(propval);
 162 }
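spa_prop_add_list() nests a small {ZPROP_VALUE, ZPROP_SOURCE} nvlist under the property's name. A hedged consumer-side sketch (not part of this change) reading one numeric entry back out of such a list nvl:

        nvlist_t *propval;
        uint64_t value, source;

        if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(ZPOOL_PROP_SIZE),
            &propval) == 0) {
                (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &value);
                (void) nvlist_lookup_uint64(propval, ZPROP_SOURCE, &source);
        }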
 163 
 164 /*
 165  * Get property values from the spa configuration.
 166  */
 167 static void
 168 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 169 {
 170         vdev_t *rvd = spa->spa_root_vdev;

 171         uint64_t size;
 172         uint64_t alloc;
 173         uint64_t space;
 174         uint64_t cap, version;
 175         zprop_source_t src = ZPROP_SRC_NONE;
 176         spa_config_dirent_t *dp;
 177 
 178         ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 179 
 180         if (rvd != NULL) {
 181                 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 182                 size = metaslab_class_get_space(spa_normal_class(spa));
 183                 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 184                 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 185                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 186                 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 187                     size - alloc, src);
 188 
 189                 space = 0;
 190                 for (int c = 0; c < rvd->vdev_children; c++) {


 197                 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 198                     (spa_mode(spa) == FREAD), src);
 199 
 200                 cap = (size == 0) ? 0 : (alloc * 100 / size);
 201                 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 202 
 203                 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 204                     ddt_get_pool_dedup_ratio(spa), src);
 205 
 206                 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 207                     rvd->vdev_state, src);
 208 
 209                 version = spa_version(spa);
 210                 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 211                         src = ZPROP_SRC_DEFAULT;
 212                 else
 213                         src = ZPROP_SRC_LOCAL;
 214                 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 215         }
 216 
 217         spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 218 
 219         if (spa->spa_comment != NULL) {
 220                 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 221                     0, ZPROP_SRC_LOCAL);
 222         }
 223 
 224         if (spa->spa_root != NULL)
 225                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 226                     0, ZPROP_SRC_LOCAL);
 227 
 228         if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 229                 if (dp->scd_path == NULL) {
 230                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 231                             "none", 0, ZPROP_SRC_LOCAL);
 232                 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 233                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 234                             dp->scd_path, 0, ZPROP_SRC_LOCAL);
 235                 }
 236         }


 336 out:
 337         if (err && err != ENOENT) {
 338                 nvlist_free(*nvp);
 339                 *nvp = NULL;
 340                 return (err);
 341         }
 342 
 343         return (0);
 344 }
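The out: path frees and NULLs *nvp on any error other than ENOENT, so callers need no cleanup of their own on failure. A hedged caller sketch, assuming the enclosing function is spa_prop_get():

        nvlist_t *props = NULL;

        if (spa_prop_get(spa, &props) == 0) {
                /* ... consume props ... */
                nvlist_free(props);
        }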
 345 
 346 /*
 347  * Validate the given pool properties nvlist and modify the list
 348  * for the property values to be set.
 349  */
 350 static int
 351 spa_prop_validate(spa_t *spa, nvlist_t *props)
 352 {
 353         nvpair_t *elem;
 354         int error = 0, reset_bootfs = 0;
 355         uint64_t objnum;

 356 
 357         elem = NULL;
 358         while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 359                 zpool_prop_t prop;
 360                 char *propname, *strval;
 361                 uint64_t intval;
 362                 objset_t *os;
 363                 char *slash, *check;

 364 
 365                 propname = nvpair_name(elem);





 366 
 367                 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
 368                         return (EINVAL);





 369 
 370                 switch (prop) {
 371                 case ZPOOL_PROP_VERSION:
 372                         error = nvpair_value_uint64(elem, &intval);
 373                         if (!error &&
 374                             (intval < spa_version(spa) || intval > SPA_VERSION))


 375                                 error = EINVAL;
 376                         break;
 377 
 378                 case ZPOOL_PROP_DELEGATION:
 379                 case ZPOOL_PROP_AUTOREPLACE:
 380                 case ZPOOL_PROP_LISTSNAPS:
 381                 case ZPOOL_PROP_AUTOEXPAND:
 382                         error = nvpair_value_uint64(elem, &intval);
 383                         if (!error && intval > 1)
 384                                 error = EINVAL;
 385                         break;
 386 
 387                 case ZPOOL_PROP_BOOTFS:
 388                         /*
 389                          * If the pool version is less than SPA_VERSION_BOOTFS,
 390                          * or the pool is still being created (version == 0),
 391                          * the bootfs property cannot be set.
 392                          */
 393                         if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 394                                 error = ENOTSUP;
 395                                 break;
 396                         }
 397 
 398                         /*
 399                          * Make sure the vdev config is bootable
 400                          */
 401                         if (!vdev_is_bootable(spa->spa_root_vdev)) {
 402                                 error = ENOTSUP;
 403                                 break;
 404                         }
 405 
 406                         reset_bootfs = 1;
 407 
 408                         error = nvpair_value_string(elem, &strval);
 409 
 410                         if (!error) {

 411                                 uint64_t compress;
 412 
 413                                 if (strval == NULL || strval[0] == '\0') {
 414                                         objnum = zpool_prop_default_numeric(
 415                                             ZPOOL_PROP_BOOTFS);
 416                                         break;
 417                                 }
 418 
 419                                 if (error = dmu_objset_hold(strval, FTAG, &os))
 420                                         break;
 421 
 422                                 /* Must be ZPL and not gzip compressed. */
 423 
 424                                 if (dmu_objset_type(os) != DMU_OST_ZFS) {
 425                                         error = ENOTSUP;
 426                                 } else if ((error = dsl_prop_get_integer(strval,
 427                                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 428                                     &compress, NULL)) == 0 &&
 429                                     !BOOTFS_COMPRESS_VALID(compress)) {
 430                                         error = ENOTSUP;


 540 
 541         dp = kmem_alloc(sizeof (spa_config_dirent_t),
 542             KM_SLEEP);
 543 
 544         if (cachefile[0] == '\0')
 545                 dp->scd_path = spa_strdup(spa_config_path);
 546         else if (strcmp(cachefile, "none") == 0)
 547                 dp->scd_path = NULL;
 548         else
 549                 dp->scd_path = spa_strdup(cachefile);
 550 
 551         list_insert_head(&spa->spa_config_list, dp);
 552         if (need_sync)
 553                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 554 }
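For reference, the three scd_path encodings produced by the cachefile handling above (assuming spa_config_path defaults to /etc/zfs/zpool.cache):

        /*
         *   cachefile value    resulting scd_path
         *   "" (default)       spa_strdup(spa_config_path)
         *   "none"             NULL -- no cache file is written
         *   "/custom/path"     spa_strdup("/custom/path")
         */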
 555 
 556 int
 557 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 558 {
 559         int error;
 560         nvpair_t *elem;
 561         boolean_t need_sync = B_FALSE;
 562         zpool_prop_t prop;
 563 
 564         if ((error = spa_prop_validate(spa, nvp)) != 0)
 565                 return (error);
 566 
 567         elem = NULL;
 568         while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 569                 if ((prop = zpool_name_to_prop(
 570                     nvpair_name(elem))) == ZPROP_INVAL)
 571                         return (EINVAL);
 572 
 573                 if (prop == ZPOOL_PROP_CACHEFILE ||
 574                     prop == ZPOOL_PROP_ALTROOT ||
 575                     prop == ZPOOL_PROP_READONLY)
 576                         continue;
 577 
 578                 need_sync = B_TRUE;
 579                 break;
 580         }
 581 
 582         if (need_sync)
 583                 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 584                     spa, nvp, 3));
 585         else

 586                 return (0);
 587 }
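cachefile, altroot and readonly are consumed immediately, so only the remaining properties force the spa_sync_props sync task. A hedged usage sketch for one syncing property:

        nvlist_t *props;

        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(props,
            zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
        error = spa_prop_set(spa, props);       /* runs spa_sync_props */
        nvlist_free(props);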
 588 
 589 /*
 590  * If the bootfs property value is dsobj, clear it.
 591  */
 592 void
 593 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 594 {
 595         if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 596                 VERIFY(zap_remove(spa->spa_meta_objset,
 597                     spa->spa_pool_props_object,
 598                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 599                 spa->spa_bootfs = 0;
 600         }
 601 }
 602 
 603 /*
 604  * Change the GUID for the pool.  This is done so that we can later
 605  * re-import a pool built from a clone of our own vdevs.  We will modify


1590         mutex_enter(&spa->spa_props_lock);       /* any mutex will do */
1591         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1592                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1593         mutex_exit(&spa->spa_props_lock);
1594 }
1595 
1596 typedef struct spa_load_error {
1597         uint64_t        sle_meta_count;
1598         uint64_t        sle_data_count;
1599 } spa_load_error_t;
1600 
1601 static void
1602 spa_load_verify_done(zio_t *zio)
1603 {
1604         blkptr_t *bp = zio->io_bp;
1605         spa_load_error_t *sle = zio->io_private;
1606         dmu_object_type_t type = BP_GET_TYPE(bp);
1607         int error = zio->io_error;
1608 
1609         if (error) {
1610                 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1611                     type != DMU_OT_INTENT_LOG)
1612                         atomic_add_64(&sle->sle_meta_count, 1);
1613                 else
1614                         atomic_add_64(&sle->sle_data_count, 1);
1615         }
1616         zio_data_buf_free(zio->io_data, zio->io_size);
1617 }
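The callback buckets each failed read as a metadata or a data error; the elided caller (spa_load_verify()) compares the totals against the user-supplied rewind policy. A hedged sketch of that comparison, assuming zpool_rewind_policy_t fields named zrp_maxmeta and zrp_maxdata:

        verify_ok = (sle.sle_meta_count <= policy.zrp_maxmeta &&
            sle.sle_data_count <= policy.zrp_maxdata);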
1618 
1619 /*ARGSUSED*/
1620 static int
1621 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1622     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1623 {
1624         if (bp != NULL) {
1625                 zio_t *rio = arg;
1626                 size_t size = BP_GET_PSIZE(bp);
1627                 void *data = zio_data_buf_alloc(size);
1628 
1629                 zio_nowait(zio_read(rio, spa, bp, data, size,
1630                     spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,


1820          */
1821         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1822             &spa->spa_ubsync.ub_version) != 0)
1823                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1824 
1825         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1826             &spa->spa_config_txg);
1827 
1828         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1829             spa_guid_exists(pool_guid, 0)) {
1830                 error = EEXIST;
1831         } else {
1832                 spa->spa_config_guid = pool_guid;
1833 
1834                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1835                     &nvl) == 0) {
1836                         VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1837                             KM_SLEEP) == 0);
1838                 }
1839 



1840                 gethrestime(&spa->spa_loaded_ts);
1841                 error = spa_load_impl(spa, pool_guid, config, state, type,
1842                     mosconfig, &ereport);
1843         }
1844 
1845         spa->spa_minref = refcount_count(&spa->spa_refcount);
1846         if (error) {
1847                 if (error != EEXIST) {
1848                         spa->spa_loaded_ts.tv_sec = 0;
1849                         spa->spa_loaded_ts.tv_nsec = 0;
1850                 }
1851                 if (error != EBADF) {
1852                         zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1853                 }
1854         }
1855         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1856         spa->spa_ena = 0;
1857 
1858         return (error);
1859 }
1860 
1861 /*
1862  * Load an existing storage pool, using the pool's builtin spa_config as a
1863  * source of configuration information.
1864  */
1865 static int
1866 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1867     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1868     char **ereport)
1869 {
1870         int error = 0;
1871         nvlist_t *nvroot = NULL;

1872         vdev_t *rvd;
1873         uberblock_t *ub = &spa->spa_uberblock;
1874         uint64_t children, config_cache_txg = spa->spa_config_txg;
1875         int orig_mode = spa->spa_mode;
1876         int parse;
1877         uint64_t obj;

1878 
1879         /*
1880          * If this is an untrusted config, access the pool in read-only mode.
1881          * This prevents things like resilvering recently removed devices.
1882          */
1883         if (!mosconfig)
1884                 spa->spa_mode = FREAD;
1885 
1886         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1887 
1888         spa->spa_load_state = state;
1889 
1890         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1891                 return (EINVAL);
1892 
1893         parse = (type == SPA_IMPORT_EXISTING ?
1894             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1895 
1896         /*
1897          * Create "The Godfather" zio to hold all async IOs


1937          *
1938          * If we're assembling a new pool that's been split off from an
1939          * existing pool, the labels haven't yet been updated so we skip
1940          * validation for now.
1941          */
1942         if (type != SPA_IMPORT_ASSEMBLE) {
1943                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1944                 error = vdev_validate(rvd, mosconfig);
1945                 spa_config_exit(spa, SCL_ALL, FTAG);
1946 
1947                 if (error != 0)
1948                         return (error);
1949 
1950                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1951                         return (ENXIO);
1952         }
1953 
1954         /*
1955          * Find the best uberblock.
1956          */
1957         vdev_uberblock_load(NULL, rvd, ub);
1958 
1959         /*
1960          * If we weren't able to find a single valid uberblock, return failure.
1961          */
1962         if (ub->ub_txg == 0)

1963                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));

1964 
1965         /*
1966          * If the pool is newer than the code, we can't open it.
1967          */
1968         if (ub->ub_version > SPA_VERSION)

1969                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));

1970 



1971         /*
1972          * If the vdev guid sum doesn't match the uberblock, we have an
1973          * incomplete configuration.  We first check to see if the pool
1974  * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
1975          * If it is, defer the vdev_guid_sum check till later so we
1976          * can handle missing vdevs.
1977          */
1978         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1979             &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1980             rvd->vdev_guid_sum != ub->ub_guid_sum)
1981                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1982 
1983         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1984                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1985                 spa_try_repair(spa, config);
1986                 spa_config_exit(spa, SCL_ALL, FTAG);
1987                 nvlist_free(spa->spa_config_splitting);
1988                 spa->spa_config_splitting = NULL;
1989         }
1990 
1991         /*
1992          * Initialize internal SPA structures.
1993          */
1994         spa->spa_state = POOL_STATE_ACTIVE;
1995         spa->spa_ubsync = spa->spa_uberblock;
1996         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1997             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1998         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1999             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2000         spa->spa_claim_max_txg = spa->spa_first_txg;
2001         spa->spa_prev_software_version = ub->ub_software_version;
2002 
2003         error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2004         if (error)
2005                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2006         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2007 
2008         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2009                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2010 
2011         if (!mosconfig) {
2012                 uint64_t hostid;
2013                 nvlist_t *policy = NULL, *nvconfig;
2014 
2015                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2016                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2017 
2018                 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2019                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2020                         char *hostname;
2021                         unsigned long myhostid = 0;
2022 
2023                         VERIFY(nvlist_lookup_string(nvconfig,
2024                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2025 
2026 #ifdef  _KERNEL
2027                         myhostid = zone_get_hostid(NULL);
2028 #else   /* _KERNEL */
2029                         /*
2030                          * We're emulating the system's hostid in userland, so


2208          * Validate the config, using the MOS config to fill in any
2209          * information which might be missing.  If we fail to validate
2210          * the config then declare the pool unfit for use. If we're
2211          * assembling a pool from a split, the log is not transferred
2212          * over.
2213          */
2214         if (type != SPA_IMPORT_ASSEMBLE) {
2215                 nvlist_t *nvconfig;
2216 
2217                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2218                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2219 
2220                 if (!spa_config_valid(spa, nvconfig)) {
2221                         nvlist_free(nvconfig);
2222                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2223                             ENXIO));
2224                 }
2225                 nvlist_free(nvconfig);
2226 
2227                 /*
2228                  * Now that we've validated the config, check the state of the
2229                  * root vdev.  If it can't be opened, it indicates one or
2230                  * more toplevel vdevs are faulted.
2231                  */
2232                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2233                         return (ENXIO);
2234 
2235                 if (spa_check_logs(spa)) {
2236                         *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2237                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2238                 }
2239         }
2240 



2241         /*
2242          * We've successfully opened the pool; verify that we're ready
2243          * to start pushing transactions.
2244          */
2245         if (state != SPA_LOAD_TRYIMPORT) {
2246                 if (error = spa_load_verify(spa))
2247                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2248                             error));
2249         }
2250 
2251         if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2252             spa->spa_load_max_txg == UINT64_MAX)) {
2253                 dmu_tx_t *tx;
2254                 int need_update = B_FALSE;
2255 
2256                 ASSERT(state != SPA_LOAD_TRYIMPORT);
2257 
2258                 /*
2259                  * Claim log blocks that haven't been committed yet.
2260                  * This must all happen in a single txg.
2261                  * Note: spa_claim_max_txg is updated by spa_claim_notify(),


2330 
2331         return (0);
2332 }
2333 
2334 static int
2335 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2336 {
2337         int mode = spa->spa_mode;
2338 
2339         spa_unload(spa);
2340         spa_deactivate(spa);
2341 
2342         spa->spa_load_max_txg--;
2343 
2344         spa_activate(spa, mode);
2345         spa_async_suspend(spa);
2346 
2347         return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2348 }
2349 







2350 static int
2351 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2352     uint64_t max_request, int rewind_flags)
2353 {

2354         nvlist_t *config = NULL;
2355         int load_error, rewind_error;
2356         uint64_t safe_rewind_txg;
2357         uint64_t min_txg;
2358 
2359         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2360                 spa->spa_load_max_txg = spa->spa_load_txg;
2361                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2362         } else {
2363                 spa->spa_load_max_txg = max_request;
2364         }
2365 
2366         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2367             mosconfig);
2368         if (load_error == 0)
2369                 return (0);
2370 
2371         if (spa->spa_root_vdev != NULL)
2372                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2373 
2374         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2375         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2376 
2377         if (rewind_flags & ZPOOL_NEVER_REWIND) {
2378                 nvlist_free(config);
2379                 return (load_error);
2380         }
2381 

2382         /* Price of rolling back is discarding txgs, including log */
2383         if (state == SPA_LOAD_RECOVER)
2384                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2385 
2386         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2387         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2388         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2389             TXG_INITIAL : safe_rewind_txg;
2390 
2391         /*
2392          * Continue as long as we're finding errors, we're still within
2393          * the acceptable rewind range, and we're still finding uberblocks
2394          */
2395         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2396             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2397                 if (spa->spa_load_max_txg < safe_rewind_txg)
2398                         spa->spa_extreme_rewind = B_TRUE;
2399                 rewind_error = spa_load_retry(spa, state, mosconfig);
2400         }
2401 
2402         spa->spa_extreme_rewind = B_FALSE;
2403         spa->spa_load_max_txg = UINT64_MAX;
2404 
2405         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2406                 spa_config_set(spa, config);
2407 
2408         return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
2409 }
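A worked example of the rewind window above, assuming TXG_DEFER_SIZE == 2 (its value in txg.h):

        /*
         * If the newest uberblock is at txg 1000:
         *   spa_load_max_txg = 1000
         *   safe_rewind_txg  =  998   (1000 - TXG_DEFER_SIZE)
         * A plain SPA_LOAD_RECOVER retries with spa_load_max_txg of
         * 999, then 998; only ZPOOL_EXTREME_REWIND lets min_txg fall
         * all the way back to TXG_INITIAL.
         */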
2410 
2411 /*
2412  * Pool Open/Import
2413  *
2414  * The import case is identical to an open except that the configuration is sent
2415  * down from userland, instead of grabbed from the configuration cache.  For the
2416  * case of an open, the pool configuration will exist in the
2417  * POOL_STATE_UNINITIALIZED state.
2418  *
2419  * The stats information (gen/count/ustats) is used to gather vdev statistics
2420  * at the same time we open the pool, without having to keep the spa_t around
2421  * in some ambiguous state.
2422  */
2423 static int
2424 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2425     nvlist_t **config)
2426 {
2427         spa_t *spa;
2428         spa_load_state_t state = SPA_LOAD_OPEN;


2658                             ZPOOL_CONFIG_GUID, &guid) == 0);
2659 
2660                         vd = NULL;
2661                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2662                                 if (guid ==
2663                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2664                                         vd = spa->spa_l2cache.sav_vdevs[j];
2665                                         break;
2666                                 }
2667                         }
2668                         ASSERT(vd != NULL);
2669 
2670                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2671                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2672                             == 0);
2673                         vdev_get_stats(vd, vs);
2674                 }
2675         }
2676 }
2677 
2678 int
2679 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)

2680 {
2681         int error;
2682         spa_t *spa;
2683 
2684         *config = NULL;
2685         error = spa_open_common(name, &spa, FTAG, NULL, config);
2686 
2687         if (spa != NULL) {
2688                 /*
2689                  * This still leaves a window of inconsistency where the spares
2690                  * or l2cache devices could change and the config would be
2691                  * self-inconsistent.
2692                  */
2693                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2694 
2695                 if (*config != NULL) {
2696                         uint64_t loadtimes[2];
2697 
2698                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2699                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2700                         VERIFY(nvlist_add_uint64_array(*config,
2701                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2702 
2703                         VERIFY(nvlist_add_uint64(*config,
2704                             ZPOOL_CONFIG_ERRCOUNT,
2705                             spa_get_errlog_size(spa)) == 0);
2706 
2707                         if (spa_suspended(spa))
2708                                 VERIFY(nvlist_add_uint64(*config,
2709                                     ZPOOL_CONFIG_SUSPENDED,
2710                                     spa->spa_failmode) == 0);
2711 
2712                         spa_add_spares(spa, *config);
2713                         spa_add_l2cache(spa, *config);

2714                 }
2715         }
2716 
2717         /*
2718          * We want to get the alternate root even for faulted pools, so we cheat
2719          * and call spa_lookup() directly.
2720          */
2721         if (altroot) {
2722                 if (spa == NULL) {
2723                         mutex_enter(&spa_namespace_lock);
2724                         spa = spa_lookup(name);
2725                         if (spa)
2726                                 spa_altroot(spa, altroot, buflen);
2727                         else
2728                                 altroot[0] = '\0';
2729                         spa = NULL;
2730                         mutex_exit(&spa_namespace_lock);
2731                 } else {
2732                         spa_altroot(spa, altroot, buflen);
2733                 }


2914         }
2915 }
2916 
2917 /*
2918  * Pool Creation
2919  */
2920 int
2921 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2922     const char *history_str, nvlist_t *zplprops)
2923 {
2924         spa_t *spa;
2925         char *altroot = NULL;
2926         vdev_t *rvd;
2927         dsl_pool_t *dp;
2928         dmu_tx_t *tx;
2929         int error = 0;
2930         uint64_t txg = TXG_INITIAL;
2931         nvlist_t **spares, **l2cache;
2932         uint_t nspares, nl2cache;
2933         uint64_t version, obj;

2934 
2935         /*
2936          * If this pool already exists, return failure.
2937          */
2938         mutex_enter(&spa_namespace_lock);
2939         if (spa_lookup(pool) != NULL) {
2940                 mutex_exit(&spa_namespace_lock);
2941                 return (EEXIST);
2942         }
2943 
2944         /*
2945          * Allocate a new spa_t structure.
2946          */
2947         (void) nvlist_lookup_string(props,
2948             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2949         spa = spa_add(pool, NULL, altroot);
2950         spa_activate(spa, spa_mode_global);
2951 
2952         if (props && (error = spa_prop_validate(spa, props))) {
2953                 spa_deactivate(spa);
2954                 spa_remove(spa);
2955                 mutex_exit(&spa_namespace_lock);
2956                 return (error);
2957         }
2958 
2959         if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2960             &version) != 0)







2961                 version = SPA_VERSION;
2962         ASSERT(version <= SPA_VERSION);

2963 
2964         spa->spa_first_txg = txg;
2965         spa->spa_uberblock.ub_txg = txg - 1;
2966         spa->spa_uberblock.ub_version = version;
2967         spa->spa_ubsync = spa->spa_uberblock;
2968 
2969         /*
2970          * Create "The Godfather" zio to hold all async IOs
2971          */
2972         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2973             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2974 
2975         /*
2976          * Create the root vdev.
2977          */
2978         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2979 
2980         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2981 
2982         ASSERT(error != 0 || rvd != NULL);


3018                 spa_load_spares(spa);
3019                 spa_config_exit(spa, SCL_ALL, FTAG);
3020                 spa->spa_spares.sav_sync = B_TRUE;
3021         }
3022 
3023         /*
3024          * Get the list of level 2 cache devices, if specified.
3025          */
3026         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3027             &l2cache, &nl2cache) == 0) {
3028                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3029                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
3030                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3031                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3032                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3033                 spa_load_l2cache(spa);
3034                 spa_config_exit(spa, SCL_ALL, FTAG);
3035                 spa->spa_l2cache.sav_sync = B_TRUE;
3036         }
3037 

3038         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3039         spa->spa_meta_objset = dp->dp_meta_objset;

3040 
3041         /*
3042          * Create DDTs (dedup tables).
3043          */
3044         ddt_create(spa);
3045 
3046         spa_update_dspace(spa);
3047 
3048         tx = dmu_tx_create_assigned(dp, txg);
3049 
3050         /*
3051          * Create the pool config object.
3052          */
3053         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3054             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3055             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3056 
3057         if (zap_add(spa->spa_meta_objset,
3058             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3059             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3060                 cmn_err(CE_PANIC, "failed to add pool config");
3061         }
3062 



3063         if (zap_add(spa->spa_meta_objset,
3064             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3065             sizeof (uint64_t), 1, &version, tx) != 0) {
3066                 cmn_err(CE_PANIC, "failed to add pool version");
3067         }
3068 
3069         /* Newly created pools with the right version are always deflated. */
3070         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3071                 spa->spa_deflate = TRUE;
3072                 if (zap_add(spa->spa_meta_objset,
3073                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3074                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3075                         cmn_err(CE_PANIC, "failed to add deflate");
3076                 }
3077         }
3078 
3079         /*
3080          * Create the deferred-free bpobj.  Turn off compression
3081          * because sync-to-convergence takes longer if the blocksize
3082          * keeps changing.


3233         vdev_t *rvd, *bvd, *avd = NULL;
3234         nvlist_t *config, *nvtop;
3235         uint64_t guid, txg;
3236         char *pname;
3237         int error;
3238 
3239         /*
3240          * Read the label from the boot device and generate a configuration.
3241          */
3242         config = spa_generate_rootconf(devpath, devid, &guid);
3243 #if defined(_OBP) && defined(_KERNEL)
3244         if (config == NULL) {
3245                 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3246                         /* iscsi boot */
3247                         get_iscsi_bootpath_phy(devpath);
3248                         config = spa_generate_rootconf(devpath, devid, &guid);
3249                 }
3250         }
3251 #endif
3252         if (config == NULL) {
3253                 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3254                     devpath);
3255                 return (EIO);
3256         }
3257 
3258         VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3259             &pname) == 0);
3260         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3261 
3262         mutex_enter(&spa_namespace_lock);
3263         if ((spa = spa_lookup(pname)) != NULL) {
3264                 /*
3265                  * Remove the existing root pool from the namespace so that we
3266                  * can replace it with the correct config we just read in.
3267                  */
3268                 spa_remove(spa);
3269         }
3270 
3271         spa = spa_add(pname, config, NULL);
3272         spa->spa_is_root = B_TRUE;
3273         spa->spa_import_flags = ZFS_IMPORT_VERBATIM;


3547         spa_activate(spa, FREAD);
3548 
3549         /*
3550          * Pass off the heavy lifting to spa_load().
3551          * Pass TRUE for mosconfig because the user-supplied config
3552          * is actually the one to trust when doing an import.
3553          */
3554         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3555 
3556         /*
3557          * If 'tryconfig' was at least parsable, return the current config.
3558          */
3559         if (spa->spa_root_vdev != NULL) {
3560                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3561                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3562                     poolname) == 0);
3563                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3564                     state) == 0);
3565                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3566                     spa->spa_uberblock.ub_timestamp) == 0);


3567 
3568                 /*
3569                  * If the bootfs property exists on this pool then we
3570                  * copy it out so that external consumers can tell which
3571                  * pools are bootable.
3572                  */
3573                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3574                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3575 
3576                         /*
3577                          * We have to play games with the name since the
3578                          * pool was opened as TRYIMPORT_NAME.
3579                          */
3580                         if (dsl_dsobj_to_dsname(spa_name(spa),
3581                             spa->spa_bootfs, tmpname) == 0) {
3582                                 char *cp;
3583                                 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3584 
3585                                 cp = strchr(tmpname, '/');
3586                                 if (cp == NULL) {


5264         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5265             zio->io_flags));
5266         return (0);
5267 }
5268 
5269 static void
5270 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5271 {
5272         char *packed = NULL;
5273         size_t bufsize;
5274         size_t nvsize = 0;
5275         dmu_buf_t *db;
5276 
5277         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5278 
5279         /*
5280          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5281          * information.  This avoids the dbuf_will_dirty() path and
5282          * saves us a pre-read to get data we don't actually care about.
5283          */
5284         bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5285         packed = kmem_alloc(bufsize, KM_SLEEP);
5286 
5287         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5288             KM_SLEEP) == 0);
5289         bzero(packed + nvsize, bufsize - nvsize);
5290 
5291         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5292 
5293         kmem_free(packed, bufsize);
5294 
5295         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5296         dmu_buf_will_dirty(db, tx);
5297         *(uint64_t *)db->db_data = nvsize;
5298         dmu_buf_rele(db, FTAG);
5299 }
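A worked example of the block rounding above, assuming SPA_CONFIG_BLOCKSIZE is 1 << 14 (16K, per spa.h):

        /*
         * nvsize == 9000  =>  bufsize == P2ROUNDUP(9000, 16384) == 16384;
         * the 7384 trailing bytes are bzero()ed so a whole block is
         * written and the dbuf pre-read is avoided.
         */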
5300 
5301 static void
5302 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5303     const char *config, const char *entry)
5304 {


5349 {
5350         nvlist_t *config;
5351 
5352         if (list_is_empty(&spa->spa_config_dirty_list))
5353                 return;
5354 
5355         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5356 
5357         config = spa_config_generate(spa, spa->spa_root_vdev,
5358             dmu_tx_get_txg(tx), B_FALSE);
5359 
5360         spa_config_exit(spa, SCL_STATE, FTAG);
5361 
5362         if (spa->spa_config_syncing)
5363                 nvlist_free(spa->spa_config_syncing);
5364         spa->spa_config_syncing = config;
5365 
5366         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5367 }
5368 
5369 /*
5370  * Set zpool properties.
5371  */
5372 static void
5373 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5374 {
5375         spa_t *spa = arg1;
5376         objset_t *mos = spa->spa_meta_objset;
5377         nvlist_t *nvp = arg2;
5378         nvpair_t *elem;




5379         uint64_t intval;
5380         char *strval;
5381         zpool_prop_t prop;
5382         const char *propname;
5383         zprop_type_t proptype;

5384 
5385         mutex_enter(&spa->spa_props_lock);
5386 
5387         elem = NULL;
5388         while ((elem = nvlist_next_nvpair(nvp, elem))) {
5389                 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5390                 case ZPOOL_PROP_VERSION:

5391                         /*
5392                          * Only set version for non-zpool-creation cases
5393                          * (set/import). spa_create() needs special care
5394                          * for version setting.
5395                          */
5396                         if (tx->tx_txg != TXG_INITIAL) {
5397                                 VERIFY(nvpair_value_uint64(elem,
5398                                     &intval) == 0);
5399                                 ASSERT(intval <= SPA_VERSION);
5400                                 ASSERT(intval >= spa_version(spa));
5401                                 spa->spa_uberblock.ub_version = intval;
5402                                 vdev_config_dirty(spa->spa_root_vdev);
5403                         }
5404                         break;
5405 
5406                 case ZPOOL_PROP_ALTROOT:
5407                         /*
5408                          * 'altroot' is a non-persistent property. It should
5409                          * have been set temporarily at creation or import time.
5410                          */
5411                         ASSERT(spa->spa_root != NULL);
5412                         break;
5413 
5414                 case ZPOOL_PROP_READONLY:
5415                 case ZPOOL_PROP_CACHEFILE:
5416                         /*
5417                          * 'readonly' and 'cachefile' are also non-persistent
5418                          * properties.
5419                          */
5420                         break;
5421                 case ZPOOL_PROP_COMMENT:
5422                         VERIFY(nvpair_value_string(elem, &strval) == 0);
5423                         if (spa->spa_comment != NULL)
5424                                 spa_strfree(spa->spa_comment);
5425                         spa->spa_comment = spa_strdup(strval);
5426                         /*
5427                          * We need to dirty the configuration on all the vdevs
5428                          * so that their labels get updated.  It's unnecessary
5429                          * to do this for pool creation since the vdev's
5430                          * configuration has already been dirtied.
5431                          */
5432                         if (tx->tx_txg != TXG_INITIAL)
5433                                 vdev_config_dirty(spa->spa_root_vdev);
5434                         break;
5435                 default:
5436                         /*
5437                          * Set pool property values in the poolprops mos object.
5438                          */
5439                         if (spa->spa_pool_props_object == 0) {
5440                                 VERIFY((spa->spa_pool_props_object =
5441                                     zap_create(mos, DMU_OT_POOL_PROPS,
5442                                     DMU_OT_NONE, 0, tx)) > 0);
5443 
5444                                 VERIFY(zap_update(mos,
5445                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5446                                     8, 1, &spa->spa_pool_props_object, tx)
5447                                     == 0);
5448                         }
5449 
5450                         /* normalize the property name */
5451                         propname = zpool_prop_to_name(prop);
5452                         proptype = zpool_prop_get_type(prop);
5453 
5454                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
5455                                 ASSERT(proptype == PROP_TYPE_STRING);
5456                                 VERIFY(nvpair_value_string(elem, &strval) == 0);
5457                                 VERIFY(zap_update(mos,
5458                                     spa->spa_pool_props_object, propname,
5459                                     1, strlen(strval) + 1, strval, tx) == 0);
5460 
5461                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5462                                 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5463 
5464                                 if (proptype == PROP_TYPE_INDEX) {
5465                                         const char *unused;
5466                                         VERIFY(zpool_prop_index_to_string(
5467                                             prop, intval, &unused) == 0);


5526         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5527             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5528                 dsl_pool_create_origin(dp, tx);
5529 
5530                 /* Keeping the origin open increases spa_minref */
5531                 spa->spa_minref += 3;
5532         }
5533 
5534         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5535             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5536                 dsl_pool_upgrade_clones(dp, tx);
5537         }
5538 
5539         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5540             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5541                 dsl_pool_upgrade_dir_clones(dp, tx);
5542 
5543                 /* Keeping the freedir open increases spa_minref */
5544                 spa->spa_minref += 3;
5545         }





5546 }
5547 
5548 /*
5549  * Sync the specified transaction group.  New blocks may be dirtied as
5550  * part of the process, so we iterate until it converges.
5551  */
5552 void
5553 spa_sync(spa_t *spa, uint64_t txg)
5554 {
5555         dsl_pool_t *dp = spa->spa_dsl_pool;
5556         objset_t *mos = spa->spa_meta_objset;
5557         bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5558         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5559         vdev_t *rvd = spa->spa_root_vdev;
5560         vdev_t *vd;
5561         dmu_tx_t *tx;
5562         int error;
5563 
5564         VERIFY(spa_writeable(spa));
5565 




  45 #include <sys/metaslab.h>
  46 #include <sys/metaslab_impl.h>
  47 #include <sys/uberblock_impl.h>
  48 #include <sys/txg.h>
  49 #include <sys/avl.h>
  50 #include <sys/dmu_traverse.h>
  51 #include <sys/dmu_objset.h>
  52 #include <sys/unique.h>
  53 #include <sys/dsl_pool.h>
  54 #include <sys/dsl_dataset.h>
  55 #include <sys/dsl_dir.h>
  56 #include <sys/dsl_prop.h>
  57 #include <sys/dsl_synctask.h>
  58 #include <sys/fs/zfs.h>
  59 #include <sys/arc.h>
  60 #include <sys/callb.h>
  61 #include <sys/systeminfo.h>
  62 #include <sys/spa_boot.h>
  63 #include <sys/zfs_ioctl.h>
  64 #include <sys/dsl_scan.h>
  65 #include <sys/zfeature.h>
  66 
  67 #ifdef  _KERNEL
  68 #include <sys/bootprops.h>
  69 #include <sys/callb.h>
  70 #include <sys/cpupart.h>
  71 #include <sys/pool.h>
  72 #include <sys/sysdc.h>
  73 #include <sys/zone.h>
  74 #endif  /* _KERNEL */
  75 
  76 #include "zfs_prop.h"
  77 #include "zfs_comutil.h"
  78 
  79 typedef enum zti_modes {
  80         zti_mode_fixed,                 /* value is # of threads (min 1) */
  81         zti_mode_online_percent,        /* value is % of online CPUs */
  82         zti_mode_batch,                 /* cpu-intensive; value is ignored */
  83         zti_mode_null,                  /* don't create a taskq */
  84         zti_nmodes
  85 } zti_modes_t;


  97 } zio_taskq_info_t;
  98 
  99 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 100         "issue", "issue_high", "intr", "intr_high"
 101 };
 102 
 103 /*
 104  * Define the taskq threads for the following I/O types:
 105  *      NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 106  */
 107 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 108         /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 109         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 110         { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },
 111         { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
 112         { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 113         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 114         { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 115 };
 116 
 117 static dsl_syncfunc_t spa_sync_version;
 118 static dsl_syncfunc_t spa_sync_props;
 119 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 120 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 121     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 122     char **ereport);
 123 static void spa_vdev_resilver_done(spa_t *spa);
 124 
 125 uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 126 id_t            zio_taskq_psrset_bind = PS_NONE;
 127 boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 128 uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 129 
 130 boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 131 
 132 /*
 133  * This (illegal) pool name is used when temporarily importing a spa_t in order
 134  * to get the vdev stats associated with the imported devices.
 135  */
 136 #define TRYIMPORT_NAME  "$import"
 137 


 153 
 154         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 155         VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 156 
 157         if (strval != NULL)
 158                 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 159         else
 160                 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 161 
 162         VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 163         nvlist_free(propval);
 164 }
 165 
 166 /*
 167  * Get property values from the spa configuration.
 168  */
 169 static void
 170 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 171 {
 172         vdev_t *rvd = spa->spa_root_vdev;
 173         dsl_pool_t *pool = spa->spa_dsl_pool;
 174         uint64_t size;
 175         uint64_t alloc;
 176         uint64_t space;
 177         uint64_t cap, version;
 178         zprop_source_t src = ZPROP_SRC_NONE;
 179         spa_config_dirent_t *dp;
 180 
 181         ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 182 
 183         if (rvd != NULL) {
 184                 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 185                 size = metaslab_class_get_space(spa_normal_class(spa));
 186                 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 187                 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 188                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 189                 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 190                     size - alloc, src);
 191 
 192                 space = 0;
 193                 for (int c = 0; c < rvd->vdev_children; c++) {


 200                 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 201                     (spa_mode(spa) == FREAD), src);
 202 
 203                 cap = (size == 0) ? 0 : (alloc * 100 / size);
 204                 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 205 
 206                 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 207                     ddt_get_pool_dedup_ratio(spa), src);
 208 
 209                 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 210                     rvd->vdev_state, src);
 211 
 212                 version = spa_version(spa);
 213                 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 214                         src = ZPROP_SRC_DEFAULT;
 215                 else
 216                         src = ZPROP_SRC_LOCAL;
 217                 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 218         }
 219 
 220         if (pool != NULL) {
 221                 dsl_dir_t *freedir = pool->dp_free_dir;
 222 
 223                 /*
 224                  * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
 225                  * pools created before this version open with freedir == NULL.
 226                  */
 227                 if (freedir != NULL) {
 228                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 229                             freedir->dd_phys->dd_used_bytes, src);
 230                 } else {
 231                         spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 232                             NULL, 0, src);
 233                 }
 234         }
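The new freeing property surfaces the bytes still owed to an asynchronous destroy (issue 2619 above). A hedged consumer sketch reading it back out of the props nvlist built here:

        nvlist_t *propval;
        uint64_t freeing;

        if (nvlist_lookup_nvlist(props,
            zpool_prop_to_name(ZPOOL_PROP_FREEING), &propval) == 0)
                (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, &freeing);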
 235 
 236         spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 237 
 238         if (spa->spa_comment != NULL) {
 239                 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 240                     0, ZPROP_SRC_LOCAL);
 241         }
 242 
 243         if (spa->spa_root != NULL)
 244                 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 245                     0, ZPROP_SRC_LOCAL);
 246 
 247         if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 248                 if (dp->scd_path == NULL) {
 249                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 250                             "none", 0, ZPROP_SRC_LOCAL);
 251                 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 252                         spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 253                             dp->scd_path, 0, ZPROP_SRC_LOCAL);
 254                 }
 255         }
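
/*
 * Review sketch (not part of spa.c): one way a consumer could walk the
 * nvlist assembled above.  It assumes each property is a nested nvlist
 * carrying its value under ZPROP_VALUE, matching what spa_prop_add_list()
 * builds; example_dump_props() itself is hypothetical.
 */
static void
example_dump_props(nvlist_t *nvp)
{
        nvpair_t *elem = NULL;

        while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
                nvlist_t *propval;
                uint64_t intval;

                VERIFY(nvpair_value_nvlist(elem, &propval) == 0);
                if (nvlist_lookup_uint64(propval, ZPROP_VALUE, &intval) == 0)
                        cmn_err(CE_NOTE, "%s=%llu", nvpair_name(elem),
                            (u_longlong_t)intval);
        }
}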


 355 out:
 356         if (err && err != ENOENT) {
 357                 nvlist_free(*nvp);
 358                 *nvp = NULL;
 359                 return (err);
 360         }
 361 
 362         return (0);
 363 }
 364 
 365 /*
 366  * Validate the given pool properties nvlist and modify the list
 367  * for the property values to be set.
 368  */
 369 static int
 370 spa_prop_validate(spa_t *spa, nvlist_t *props)
 371 {
 372         nvpair_t *elem;
 373         int error = 0, reset_bootfs = 0;
 374         uint64_t objnum;
 375         boolean_t has_feature = B_FALSE;
 376 
 377         elem = NULL;
 378         while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {


 379                 uint64_t intval;
 380                 char *strval, *slash, *check, *fname;
 381                 const char *propname = nvpair_name(elem);
 382                 zpool_prop_t prop = zpool_name_to_prop(propname);
 383 
 384                 switch (prop) {
 385                 case ZPROP_INVAL:
 386                         if (!zpool_prop_feature(propname)) {
 387                                 error = EINVAL;
 388                                 break;
 389                         }
 390 
 391                         /*
 392                          * Sanitize the input.
 393                          */
 394                         if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 395                                 error = EINVAL;
 396                                 break;
 397                         }
 398 
 399                         if (nvpair_value_uint64(elem, &intval) != 0) {
 400                                 error = EINVAL;
 401                                 break;
 402                         }
 403 
 404                         if (intval != 0) {
 405                                 error = EINVAL;
 406                                 break;
 407                         }
 408 
 409                         fname = strchr(propname, '@') + 1;
 410                         if (zfeature_lookup_name(fname, NULL) != 0) {
 411                                 error = EINVAL;
 412                                 break;
 413                         }
 414 
 415                         has_feature = B_TRUE;
 416                         break;
 417 
 418                 case ZPOOL_PROP_VERSION:
 419                         error = nvpair_value_uint64(elem, &intval);
 420                         if (!error &&
 421                             (intval < spa_version(spa) ||
 422                             intval > SPA_VERSION_BEFORE_FEATURES ||
 423                             has_feature))
 424                                 error = EINVAL;
 425                         break;
 426 
 427                 case ZPOOL_PROP_DELEGATION:
 428                 case ZPOOL_PROP_AUTOREPLACE:
 429                 case ZPOOL_PROP_LISTSNAPS:
 430                 case ZPOOL_PROP_AUTOEXPAND:
 431                         error = nvpair_value_uint64(elem, &intval);
 432                         if (!error && intval > 1)
 433                                 error = EINVAL;
 434                         break;
 435 
 436                 case ZPOOL_PROP_BOOTFS:
 437                         /*
 438                          * If the pool version is less than SPA_VERSION_BOOTFS,
 439                          * or the pool is still being created (version == 0),
 440                          * the bootfs property cannot be set.
 441                          */
 442                         if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 443                                 error = ENOTSUP;
 444                                 break;
 445                         }
 446 
 447                         /*
 448                          * Make sure the vdev config is bootable
 449                          */
 450                         if (!vdev_is_bootable(spa->spa_root_vdev)) {
 451                                 error = ENOTSUP;
 452                                 break;
 453                         }
 454 
 455                         reset_bootfs = 1;
 456 
 457                         error = nvpair_value_string(elem, &strval);
 458 
 459                         if (!error) {
 460                                 objset_t *os;
 461                                 uint64_t compress;
 462 
 463                                 if (strval == NULL || strval[0] == '\0') {
 464                                         objnum = zpool_prop_default_numeric(
 465                                             ZPOOL_PROP_BOOTFS);
 466                                         break;
 467                                 }
 468 
 469                                 if ((error = dmu_objset_hold(strval, FTAG, &os)) != 0)
 470                                         break;
 471 
 472                                 /* Must be ZPL and not gzip compressed. */
 473 
 474                                 if (dmu_objset_type(os) != DMU_OST_ZFS) {
 475                                         error = ENOTSUP;
 476                                 } else if ((error = dsl_prop_get_integer(strval,
 477                                     zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 478                                     &compress, NULL)) == 0 &&
 479                                     !BOOTFS_COMPRESS_VALID(compress)) {
 480                                         error = ENOTSUP;


 590 
 591         dp = kmem_alloc(sizeof (spa_config_dirent_t),
 592             KM_SLEEP);
 593 
 594         if (cachefile[0] == '\0')
 595                 dp->scd_path = spa_strdup(spa_config_path);
 596         else if (strcmp(cachefile, "none") == 0)
 597                 dp->scd_path = NULL;
 598         else
 599                 dp->scd_path = spa_strdup(cachefile);
 600 
 601         list_insert_head(&spa->spa_config_list, dp);
 602         if (need_sync)
 603                 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 604 }
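
/*
 * Review note (illustrative, not part of spa.c): the cachefile string
 * handed in above maps onto scd_path as follows:
 *
 *      ""      -> spa_strdup(spa_config_path)  (the default cache file)
 *      "none"  -> NULL                         (don't cache this config)
 *      other   -> spa_strdup(cachefile)        (a custom cache file)
 */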
 605 
 606 int
 607 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 608 {
 609         int error;
 610         nvpair_t *elem = NULL;
 611         boolean_t need_sync = B_FALSE;

 612 
 613         if ((error = spa_prop_validate(spa, nvp)) != 0)
 614                 return (error);
 615 

 616         while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 617                 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));


 618 
 619                 if (prop == ZPOOL_PROP_CACHEFILE ||
 620                     prop == ZPOOL_PROP_ALTROOT ||
 621                     prop == ZPOOL_PROP_READONLY)
 622                         continue;
 623 
 624                 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
 625                         uint64_t ver;
 626 
 627                         if (prop == ZPOOL_PROP_VERSION) {
 628                                 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 629                         } else {
 630                                 ASSERT(zpool_prop_feature(nvpair_name(elem)));
 631                                 ver = SPA_VERSION_FEATURES;
 632                                 need_sync = B_TRUE;
 633                         }
 634 
 635                         /* Save time if the version is already set. */
 636                         if (ver == spa_version(spa))
 637                                 continue;
 638 
 639                         /*
 640                          * In addition to the pool directory object, we might
 641                          * create the pool properties object, the features for
 642                          * read object, the features for write object, or the
 643                          * feature descriptions object.
 644                          */
 645                         error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
 646                             spa_sync_version, spa, &ver, 6);
 647                         if (error)
 648                                 return (error);
 649                         continue;
 650                 }
 651 
 652                 need_sync = B_TRUE;
 653                 break;
 654         }
 655 
 656         if (need_sync) {
 657                 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 658                     spa, nvp, 6));
 659         }
 660 
 661         return (0);
 662 }
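
/*
 * Review sketch (not part of spa.c): a minimal spa_prop_set() caller,
 * setting one numeric property.  The helper name is hypothetical and
 * error handling is reduced to passing the result back.
 */
static int
example_set_autoexpand(spa_t *spa, uint64_t on)
{
        nvlist_t *props;
        int err;

        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(props,
            zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), on) == 0);
        err = spa_prop_set(spa, props); /* validates, then syncs */
        nvlist_free(props);
        return (err);
}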
 663 
 664 /*
 665  * If the bootfs property value is dsobj, clear it.
 666  */
 667 void
 668 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 669 {
 670         if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 671                 VERIFY(zap_remove(spa->spa_meta_objset,
 672                     spa->spa_pool_props_object,
 673                     zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 674                 spa->spa_bootfs = 0;
 675         }
 676 }
 677 
 678 /*
 679  * Change the GUID for the pool.  This is done so that we can later
 680  * re-import a pool built from a clone of our own vdevs.  We will modify


1665         mutex_enter(&spa->spa_props_lock);       /* any mutex will do */
1666         if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1667                 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1668         mutex_exit(&spa->spa_props_lock);
1669 }
1670 
1671 typedef struct spa_load_error {
1672         uint64_t        sle_meta_count;
1673         uint64_t        sle_data_count;
1674 } spa_load_error_t;
1675 
1676 static void
1677 spa_load_verify_done(zio_t *zio)
1678 {
1679         blkptr_t *bp = zio->io_bp;
1680         spa_load_error_t *sle = zio->io_private;
1681         dmu_object_type_t type = BP_GET_TYPE(bp);
1682         int error = zio->io_error;
1683 
1684         if (error) {
1685                 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1686                     type != DMU_OT_INTENT_LOG)
1687                         atomic_add_64(&sle->sle_meta_count, 1);
1688                 else
1689                         atomic_add_64(&sle->sle_data_count, 1);
1690         }
1691         zio_data_buf_free(zio->io_data, zio->io_size);
1692 }
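
/*
 * Review note: the classification above counts an errored block as a
 * metadata error when it sits above level 0 or belongs to a metadata
 * object type (intent-log blocks excepted); everything else is a data
 * error.  The two counters let the caller (spa_load_verify()) apply
 * separate error budgets to metadata and data when deciding whether a
 * rewound txg is acceptable.
 */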
1693 
1694 /*ARGSUSED*/
1695 static int
1696 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1697     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1698 {
1699         if (bp != NULL) {
1700                 zio_t *rio = arg;
1701                 size_t size = BP_GET_PSIZE(bp);
1702                 void *data = zio_data_buf_alloc(size);
1703 
1704                 zio_nowait(zio_read(rio, spa, bp, data, size,
1705                     spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,


1895          */
1896         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1897             &spa->spa_ubsync.ub_version) != 0)
1898                 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1899 
1900         (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1901             &spa->spa_config_txg);
1902 
1903         if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1904             spa_guid_exists(pool_guid, 0)) {
1905                 error = EEXIST;
1906         } else {
1907                 spa->spa_config_guid = pool_guid;
1908 
1909                 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1910                     &nvl) == 0) {
1911                         VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1912                             KM_SLEEP) == 0);
1913                 }
1914 
1915                 nvlist_free(spa->spa_load_info);
1916                 spa->spa_load_info = fnvlist_alloc();
1917 
1918                 gethrestime(&spa->spa_loaded_ts);
1919                 error = spa_load_impl(spa, pool_guid, config, state, type,
1920                     mosconfig, &ereport);
1921         }
1922 
1923         spa->spa_minref = refcount_count(&spa->spa_refcount);
1924         if (error) {
1925                 if (error != EEXIST) {
1926                         spa->spa_loaded_ts.tv_sec = 0;
1927                         spa->spa_loaded_ts.tv_nsec = 0;
1928                 }
1929                 if (error != EBADF) {
1930                         zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1931                 }
1932         }
1933         spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1934         spa->spa_ena = 0;
1935 
1936         return (error);
1937 }
1938 
1939 /*
1940  * Load an existing storage pool, using the pool's built-in spa_config as a
1941  * source of configuration information.
1942  */
1943 static int
1944 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1945     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1946     char **ereport)
1947 {
1948         int error = 0;
1949         nvlist_t *nvroot = NULL;
1950         nvlist_t *label;
1951         vdev_t *rvd;
1952         uberblock_t *ub = &spa->spa_uberblock;
1953         uint64_t children, config_cache_txg = spa->spa_config_txg;
1954         int orig_mode = spa->spa_mode;
1955         int parse;
1956         uint64_t obj;
1957         boolean_t missing_feat_write = B_FALSE;
1958 
1959         /*
1960          * If this is an untrusted config, access the pool in read-only mode.
1961          * This prevents things like resilvering recently removed devices.
1962          */
1963         if (!mosconfig)
1964                 spa->spa_mode = FREAD;
1965 
1966         ASSERT(MUTEX_HELD(&spa_namespace_lock));
1967 
1968         spa->spa_load_state = state;
1969 
1970         if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1971                 return (EINVAL);
1972 
1973         parse = (type == SPA_IMPORT_EXISTING ?
1974             VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1975 
1976         /*
1977          * Create "The Godfather" zio to hold all async IOs


2017          *
2018          * If we're assembling a new pool that's been split off from an
2019          * existing pool, the labels haven't yet been updated so we skip
2020          * validation for now.
2021          */
2022         if (type != SPA_IMPORT_ASSEMBLE) {
2023                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2024                 error = vdev_validate(rvd, mosconfig);
2025                 spa_config_exit(spa, SCL_ALL, FTAG);
2026 
2027                 if (error != 0)
2028                         return (error);
2029 
2030                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2031                         return (ENXIO);
2032         }
2033 
2034         /*
2035          * Find the best uberblock.
2036          */
2037         vdev_uberblock_load(rvd, ub, &label);
2038 
2039         /*
2040          * If we weren't able to find a single valid uberblock, return failure.
2041          */
2042         if (ub->ub_txg == 0) {
2043                 nvlist_free(label);
2044                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2045         }
2046 
2047         /*
2048          * If the pool has an unsupported version we can't open it.
2049          */
2050         if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2051                 nvlist_free(label);
2052                 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2053         }
2054 
2055         if (ub->ub_version >= SPA_VERSION_FEATURES) {
2056                 nvlist_t *features;
2057 
2058                 /*
2059                  * If we weren't able to find what's necessary for reading the
2060                  * MOS in the label, return failure.
2061                  */
2062                 if (label == NULL || nvlist_lookup_nvlist(label,
2063                     ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2064                         nvlist_free(label);
2065                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2066                             ENXIO));
2067                 }
2068 
2069                 /*
2070                  * Update our in-core representation with the definitive values
2071                  * from the label.
2072                  */
2073                 nvlist_free(spa->spa_label_features);
2074                 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2075         }
2076 
2077         nvlist_free(label);
2078 
2079         /*
2080          * Look through entries in the label nvlist's features_for_read. If
2081  * there is a feature listed there which we don't understand, then we
2082  * cannot open the pool.
2083          */
2084         if (ub->ub_version >= SPA_VERSION_FEATURES) {
2085                 nvlist_t *unsup_feat;
2086 
2087                 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2088                     0);
2089 
2090                 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2091                     NULL); nvp != NULL;
2092                     nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2093                         if (!zfeature_is_supported(nvpair_name(nvp))) {
2094                                 VERIFY(nvlist_add_string(unsup_feat,
2095                                     nvpair_name(nvp), "") == 0);
2096                         }
2097                 }
2098 
2099                 if (!nvlist_empty(unsup_feat)) {
2100                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2101                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2102                         nvlist_free(unsup_feat);
2103                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2104                             ENOTSUP));
2105                 }
2106 
2107                 nvlist_free(unsup_feat);
2108         }
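
/*
 * Review sketch (hypothetical feature name): if features_for_read
 * listed "com.example:future_feat" and this binary didn't know it,
 * the loop above would leave load_info looking roughly like
 *
 *      unsup_feat = { "com.example:future_feat" -> "" }
 *
 * which userland ("zpool import") can print as the reason the pool
 * cannot be opened.
 */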
2109 
2110         /*
2111          * If the vdev guid sum doesn't match the uberblock, we have an
2112          * incomplete configuration.  We first check to see if the pool
2113  * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
2114  * If it is, defer the vdev_guid_sum check until later so we
2115          * can handle missing vdevs.
2116          */
2117         if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2118             &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2119             rvd->vdev_guid_sum != ub->ub_guid_sum)
2120                 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2121 
2122         if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2123                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2124                 spa_try_repair(spa, config);
2125                 spa_config_exit(spa, SCL_ALL, FTAG);
2126                 nvlist_free(spa->spa_config_splitting);
2127                 spa->spa_config_splitting = NULL;
2128         }
2129 
2130         /*
2131          * Initialize internal SPA structures.
2132          */
2133         spa->spa_state = POOL_STATE_ACTIVE;
2134         spa->spa_ubsync = spa->spa_uberblock;
2135         spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2136             TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2137         spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2138             spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2139         spa->spa_claim_max_txg = spa->spa_first_txg;
2140         spa->spa_prev_software_version = ub->ub_software_version;
2141 
2142         error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2143         if (error)
2144                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2145         spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2146 
2147         if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2148                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2149 
2150         if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2151                 boolean_t missing_feat_read = B_FALSE;
2152                 nvlist_t *unsup_feat;
2153 
2154                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2155                     &spa->spa_feat_for_read_obj) != 0) {
2156                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2157                 }
2158 
2159                 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2160                     &spa->spa_feat_for_write_obj) != 0) {
2161                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2162                 }
2163 
2164                 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2165                     &spa->spa_feat_desc_obj) != 0) {
2166                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2167                 }
2168 
2169                 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2170                     0);
2171 
2172                 if (!feature_is_supported(spa->spa_meta_objset,
2173                     spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2174                     unsup_feat))
2175                         missing_feat_read = B_TRUE;
2176 
2177                 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2178                         if (!feature_is_supported(spa->spa_meta_objset,
2179                             spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2180                             unsup_feat))
2181                                 missing_feat_write = B_TRUE;
2182                 }
2183 
2184                 if (!nvlist_empty(unsup_feat)) {
2185                         VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2186                             ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2187                 }
2188 
2189                 nvlist_free(unsup_feat);
2190 
2191                 if (!missing_feat_read) {
2192                         fnvlist_add_boolean(spa->spa_load_info,
2193                             ZPOOL_CONFIG_CAN_RDONLY);
2194                 }
2195 
2196                 /*
2197                  * If the state is SPA_LOAD_TRYIMPORT, our objective is
2198                  * twofold: to determine whether the pool is available for
2199                  * import in read-write mode and (if it is not) whether the
2200                  * pool is available for import in read-only mode. If the pool
2201                  * is available for import in read-write mode, it is displayed
2202                  * as available in userland; if it is not available for import
2203                  * in read-only mode, it is displayed as unavailable in
2204                  * userland. If the pool is available for import in read-only
2205                  * mode but not read-write mode, it is displayed as unavailable
2206                  * in userland with a special note that the pool is actually
2207                  * available for open in read-only mode.
2208                  *
2209                  * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2210                  * missing a feature for write, we must first determine whether
2211                  * the pool can be opened read-only before returning to
2212                  * userland in order to know whether to display the
2213                  * abovementioned note.
2214                  */
2215                 if (missing_feat_read || (missing_feat_write &&
2216                     spa_writeable(spa))) {
2217                         return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2218                             ENOTSUP));
2219                 }
2220         }
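
/*
 * Review note: the net effect of the feature checks above, by case:
 *
 *      missing_feat_read                        -> fail, even read-only
 *      missing_feat_write && spa_writeable(spa) -> fail the r/w open
 *      missing_feat_write, pool opened read-only
 *          or state == SPA_LOAD_TRYIMPORT       -> keep loading; the
 *              missing_feat_write check further below reports it once
 *              the pool has proven openable read-only
 */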
2221 
2222         spa->spa_is_initializing = B_TRUE;
2223         error = dsl_pool_open(spa->spa_dsl_pool);
2224         spa->spa_is_initializing = B_FALSE;
2225         if (error != 0)
2226                 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2227 
2228         if (!mosconfig) {
2229                 uint64_t hostid;
2230                 nvlist_t *policy = NULL, *nvconfig;
2231 
2232                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2233                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2234 
2235                 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2236                     ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2237                         char *hostname;
2238                         unsigned long myhostid = 0;
2239 
2240                         VERIFY(nvlist_lookup_string(nvconfig,
2241                             ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2242 
2243 #ifdef  _KERNEL
2244                         myhostid = zone_get_hostid(NULL);
2245 #else   /* _KERNEL */
2246                         /*
2247                          * We're emulating the system's hostid in userland, so


2425          * Validate the config, using the MOS config to fill in any
2426          * information which might be missing.  If we fail to validate
2427          * the config then declare the pool unfit for use. If we're
2428          * assembling a pool from a split, the log is not transferred
2429          * over.
2430          */
2431         if (type != SPA_IMPORT_ASSEMBLE) {
2432                 nvlist_t *nvconfig;
2433 
2434                 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2435                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2436 
2437                 if (!spa_config_valid(spa, nvconfig)) {
2438                         nvlist_free(nvconfig);
2439                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2440                             ENXIO));
2441                 }
2442                 nvlist_free(nvconfig);
2443 
2444                 /*
2445                  * Now that we've validated the config, check the state of the
2446                  * root vdev.  If it can't be opened, it indicates one or
2447                  * more toplevel vdevs are faulted.
2448                  */
2449                 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2450                         return (ENXIO);
2451 
2452                 if (spa_check_logs(spa)) {
2453                         *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2454                         return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2455                 }
2456         }
2457 
2458         if (missing_feat_write) {
2459                 ASSERT(state == SPA_LOAD_TRYIMPORT);
2460 
2461                 /*
2462                  * At this point, we know that we can open the pool in
2463                  * read-only mode but not read-write mode. We now have enough
2464                  * information and can return to userland.
2465                  */
2466                 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2467         }
2468 
2469         /*
2470          * We've successfully opened the pool; verify that we're ready
2471          * to start pushing transactions.
2472          */
2473         if (state != SPA_LOAD_TRYIMPORT) {
2474                 if ((error = spa_load_verify(spa)) != 0)
2475                         return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2476                             error));
2477         }
2478 
2479         if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2480             spa->spa_load_max_txg == UINT64_MAX)) {
2481                 dmu_tx_t *tx;
2482                 int need_update = B_FALSE;
2483 
2484                 ASSERT(state != SPA_LOAD_TRYIMPORT);
2485 
2486                 /*
2487                  * Claim log blocks that haven't been committed yet.
2488                  * This must all happen in a single txg.
2489                  * Note: spa_claim_max_txg is updated by spa_claim_notify(),


2558 
2559         return (0);
2560 }
2561 
2562 static int
2563 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2564 {
2565         int mode = spa->spa_mode;
2566 
2567         spa_unload(spa);
2568         spa_deactivate(spa);
2569 
2570         spa->spa_load_max_txg--;
2571 
2572         spa_activate(spa, mode);
2573         spa_async_suspend(spa);
2574 
2575         return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2576 }
2577 
2578 /*
2579  * If spa_load() fails this function will try loading prior txg's. If
2580  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2581  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2582  * function will not rewind the pool and will return the same error as
2583  * spa_load().
2584  */
2585 static int
2586 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2587     uint64_t max_request, int rewind_flags)
2588 {
2589         nvlist_t *loadinfo = NULL;
2590         nvlist_t *config = NULL;
2591         int load_error, rewind_error;
2592         uint64_t safe_rewind_txg;
2593         uint64_t min_txg;
2594 
2595         if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2596                 spa->spa_load_max_txg = spa->spa_load_txg;
2597                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2598         } else {
2599                 spa->spa_load_max_txg = max_request;
2600         }
2601 
2602         load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2603             mosconfig);
2604         if (load_error == 0)
2605                 return (0);
2606 
2607         if (spa->spa_root_vdev != NULL)
2608                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2609 
2610         spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2611         spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2612 
2613         if (rewind_flags & ZPOOL_NEVER_REWIND) {
2614                 nvlist_free(config);
2615                 return (load_error);
2616         }
2617 
2618         if (state == SPA_LOAD_RECOVER) {
2619                 /* Price of rolling back is discarding txgs, including log */

2620                 spa_set_log_state(spa, SPA_LOG_CLEAR);
2621         } else {
2622                 /*
2623                  * If we aren't rolling back, save the load info from our first
2624                  * import attempt so that we can restore it after attempting
2625                  * to rewind.
2626                  */
2627                 loadinfo = spa->spa_load_info;
2628                 spa->spa_load_info = fnvlist_alloc();
2629         }
2630 
2631         spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2632         safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2633         min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2634             TXG_INITIAL : safe_rewind_txg;
2635 
2636         /*
2637          * Continue as long as we're finding errors, we're still within
2638          * the acceptable rewind range, and we're still finding uberblocks
2639          */
2640         while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2641             spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2642                 if (spa->spa_load_max_txg < safe_rewind_txg)
2643                         spa->spa_extreme_rewind = B_TRUE;
2644                 rewind_error = spa_load_retry(spa, state, mosconfig);
2645         }
2646 
2647         spa->spa_extreme_rewind = B_FALSE;
2648         spa->spa_load_max_txg = UINT64_MAX;
2649 
2650         if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2651                 spa_config_set(spa, config);
2652 
2653         if (state == SPA_LOAD_RECOVER) {
2654                 ASSERT3P(loadinfo, ==, NULL);
2655                 return (rewind_error);
2656         } else {
2657                 /* Store the rewind info as part of the initial load info */
2658                 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2659                     spa->spa_load_info);
2660 
2661                 /* Restore the initial load info */
2662                 fnvlist_free(spa->spa_load_info);
2663                 spa->spa_load_info = loadinfo;
2664 
2665                 return (load_error);
2666         }
2667 }
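
/*
 * Review note (illustrative numbers): suppose the failed load left
 * spa_last_ubsync_txg at 1000 and TXG_DEFER_SIZE is 2, so
 * safe_rewind_txg is 998.  Each spa_load_retry() lowers
 * spa_load_max_txg by one, so a plain SPA_LOAD_RECOVER tries
 * uberblocks capped at txg 999, then 998, and stops there; only with
 * ZPOOL_EXTREME_REWIND (min_txg == TXG_INITIAL) does the loop keep
 * going, setting spa_extreme_rewind once it rewinds past 998.
 */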
2668 
2669 /*
2670  * Pool Open/Import
2671  *
2672  * The import case is identical to an open except that the configuration is sent
2673  * down from userland, instead of grabbed from the configuration cache.  For the
2674  * case of an open, the pool configuration will exist in the
2675  * POOL_STATE_UNINITIALIZED state.
2676  *
2677  * The stats information (gen/count/ustats) is used to gather vdev statistics at
2678  * the same time we open the pool, without having to keep around the spa_t in some
2679  * ambiguous state.
2680  */
2681 static int
2682 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2683     nvlist_t **config)
2684 {
2685         spa_t *spa;
2686         spa_load_state_t state = SPA_LOAD_OPEN;


2916                             ZPOOL_CONFIG_GUID, &guid) == 0);
2917 
2918                         vd = NULL;
2919                         for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2920                                 if (guid ==
2921                                     spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2922                                         vd = spa->spa_l2cache.sav_vdevs[j];
2923                                         break;
2924                                 }
2925                         }
2926                         ASSERT(vd != NULL);
2927 
2928                         VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2929                             ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2930                             == 0);
2931                         vdev_get_stats(vd, vs);
2932                 }
2933         }
2934 }
2935 
2936 static void
2937 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
2938 {
2939         nvlist_t *features;
2940         zap_cursor_t zc;
2941         zap_attribute_t za;
2942 
2943         ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2944         VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2945 
2946         if (spa->spa_feat_for_read_obj != 0) {
2947                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
2948                     spa->spa_feat_for_read_obj);
2949                     zap_cursor_retrieve(&zc, &za) == 0;
2950                     zap_cursor_advance(&zc)) {
2951                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
2952                             za.za_num_integers == 1);
2953                         VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
2954                             za.za_first_integer));
2955                 }
2956                 zap_cursor_fini(&zc);
2957         }
2958 
2959         if (spa->spa_feat_for_write_obj != 0) {
2960                 for (zap_cursor_init(&zc, spa->spa_meta_objset,
2961                     spa->spa_feat_for_write_obj);
2962                     zap_cursor_retrieve(&zc, &za) == 0;
2963                     zap_cursor_advance(&zc)) {
2964                         ASSERT(za.za_integer_length == sizeof (uint64_t) &&
2965                             za.za_num_integers == 1);
2966                         VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
2967                             za.za_first_integer));
2968                 }
2969                 zap_cursor_fini(&zc);
2970         }
2971 
2972         VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
2973             features) == 0);
2974         nvlist_free(features);
2975 }
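
/*
 * Review sketch (not part of spa.c): how a config consumer might read
 * the refcounts published above.  ZPOOL_CONFIG_FEATURE_STATS maps each
 * feature guid string to a uint64 refcount; the helper is hypothetical.
 */
static uint64_t
example_feature_refcount(nvlist_t *config, const char *feat_guid)
{
        nvlist_t *features;
        uint64_t refcount = 0;

        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
            &features) == 0)
                (void) nvlist_lookup_uint64(features, feat_guid, &refcount);
        return (refcount);
}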
2976 
2977 int
2978 spa_get_stats(const char *name, nvlist_t **config,
2979     char *altroot, size_t buflen)
2980 {
2981         int error;
2982         spa_t *spa;
2983 
2984         *config = NULL;
2985         error = spa_open_common(name, &spa, FTAG, NULL, config);
2986 
2987         if (spa != NULL) {
2988                 /*
2989                  * This still leaves a window of inconsistency where the spares
2990                  * or l2cache devices could change and the config would be
2991                  * self-inconsistent.
2992                  */
2993                 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2994 
2995                 if (*config != NULL) {
2996                         uint64_t loadtimes[2];
2997 
2998                         loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2999                         loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3000                         VERIFY(nvlist_add_uint64_array(*config,
3001                             ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3002 
3003                         VERIFY(nvlist_add_uint64(*config,
3004                             ZPOOL_CONFIG_ERRCOUNT,
3005                             spa_get_errlog_size(spa)) == 0);
3006 
3007                         if (spa_suspended(spa))
3008                                 VERIFY(nvlist_add_uint64(*config,
3009                                     ZPOOL_CONFIG_SUSPENDED,
3010                                     spa->spa_failmode) == 0);
3011 
3012                         spa_add_spares(spa, *config);
3013                         spa_add_l2cache(spa, *config);
3014                         spa_add_feature_stats(spa, *config);
3015                 }
3016         }
3017 
3018         /*
3019          * We want to get the alternate root even for faulted pools, so we cheat
3020          * and call spa_lookup() directly.
3021          */
3022         if (altroot) {
3023                 if (spa == NULL) {
3024                         mutex_enter(&spa_namespace_lock);
3025                         spa = spa_lookup(name);
3026                         if (spa)
3027                                 spa_altroot(spa, altroot, buflen);
3028                         else
3029                                 altroot[0] = '\0';
3030                         spa = NULL;
3031                         mutex_exit(&spa_namespace_lock);
3032                 } else {
3033                         spa_altroot(spa, altroot, buflen);
3034                 }


3215         }
3216 }
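
/*
 * Review sketch (not part of spa.c): the typical spa_get_stats() call
 * pattern, roughly what the ioctl path does.  Note that altroot is
 * filled in even when the pool is faulted; the helper is hypothetical.
 */
static int
example_pool_stats(const char *name)
{
        nvlist_t *config;
        char altroot[MAXPATHLEN];
        int error;

        error = spa_get_stats(name, &config, altroot, sizeof (altroot));
        if (config != NULL)
                nvlist_free(config);
        return (error);
}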
3217 
3218 /*
3219  * Pool Creation
3220  */
3221 int
3222 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3223     const char *history_str, nvlist_t *zplprops)
3224 {
3225         spa_t *spa;
3226         char *altroot = NULL;
3227         vdev_t *rvd;
3228         dsl_pool_t *dp;
3229         dmu_tx_t *tx;
3230         int error = 0;
3231         uint64_t txg = TXG_INITIAL;
3232         nvlist_t **spares, **l2cache;
3233         uint_t nspares, nl2cache;
3234         uint64_t version, obj;
3235         boolean_t has_features;
3236 
3237         /*
3238          * If this pool already exists, return failure.
3239          */
3240         mutex_enter(&spa_namespace_lock);
3241         if (spa_lookup(pool) != NULL) {
3242                 mutex_exit(&spa_namespace_lock);
3243                 return (EEXIST);
3244         }
3245 
3246         /*
3247          * Allocate a new spa_t structure.
3248          */
3249         (void) nvlist_lookup_string(props,
3250             zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3251         spa = spa_add(pool, NULL, altroot);
3252         spa_activate(spa, spa_mode_global);
3253 
3254         if (props && (error = spa_prop_validate(spa, props))) {
3255                 spa_deactivate(spa);
3256                 spa_remove(spa);
3257                 mutex_exit(&spa_namespace_lock);
3258                 return (error);
3259         }
3260 
3261         has_features = B_FALSE;
3262         for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3263             elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3264                 if (zpool_prop_feature(nvpair_name(elem)))
3265                         has_features = B_TRUE;
3266         }
3267 
3268         if (has_features || nvlist_lookup_uint64(props,
3269             zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3270                 version = SPA_VERSION;
3271         }
3272         ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3273 
3274         spa->spa_first_txg = txg;
3275         spa->spa_uberblock.ub_txg = txg - 1;
3276         spa->spa_uberblock.ub_version = version;
3277         spa->spa_ubsync = spa->spa_uberblock;
3278 
3279         /*
3280          * Create "The Godfather" zio to hold all async IOs
3281          */
3282         spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3283             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3284 
3285         /*
3286          * Create the root vdev.
3287          */
3288         spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3289 
3290         error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3291 
3292         ASSERT(error != 0 || rvd != NULL);


3328                 spa_load_spares(spa);
3329                 spa_config_exit(spa, SCL_ALL, FTAG);
3330                 spa->spa_spares.sav_sync = B_TRUE;
3331         }
3332 
3333         /*
3334          * Get the list of level 2 cache devices, if specified.
3335          */
3336         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3337             &l2cache, &nl2cache) == 0) {
3338                 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3339                     NV_UNIQUE_NAME, KM_SLEEP) == 0);
3340                 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3341                     ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3342                 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3343                 spa_load_l2cache(spa);
3344                 spa_config_exit(spa, SCL_ALL, FTAG);
3345                 spa->spa_l2cache.sav_sync = B_TRUE;
3346         }
3347 
3348         spa->spa_is_initializing = B_TRUE;
3349         spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3350         spa->spa_meta_objset = dp->dp_meta_objset;
3351         spa->spa_is_initializing = B_FALSE;
3352 
3353         /*
3354          * Create DDTs (dedup tables).
3355          */
3356         ddt_create(spa);
3357 
3358         spa_update_dspace(spa);
3359 
3360         tx = dmu_tx_create_assigned(dp, txg);
3361 
3362         /*
3363          * Create the pool config object.
3364          */
3365         spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3366             DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3367             DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3368 
3369         if (zap_add(spa->spa_meta_objset,
3370             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3371             sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3372                 cmn_err(CE_PANIC, "failed to add pool config");
3373         }
3374 
3375         if (spa_version(spa) >= SPA_VERSION_FEATURES)
3376                 spa_feature_create_zap_objects(spa, tx);
3377 
3378         if (zap_add(spa->spa_meta_objset,
3379             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3380             sizeof (uint64_t), 1, &version, tx) != 0) {
3381                 cmn_err(CE_PANIC, "failed to add pool version");
3382         }
3383 
3384         /* Newly created pools with the right version are always deflated. */
3385         if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3386                 spa->spa_deflate = TRUE;
3387                 if (zap_add(spa->spa_meta_objset,
3388                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3389                     sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3390                         cmn_err(CE_PANIC, "failed to add deflate");
3391                 }
3392         }
3393 
3394         /*
3395          * Create the deferred-free bpobj.  Turn off compression
3396          * because sync-to-convergence takes longer if the blocksize
3397          * keeps changing.


3548         vdev_t *rvd, *bvd, *avd = NULL;
3549         nvlist_t *config, *nvtop;
3550         uint64_t guid, txg;
3551         char *pname;
3552         int error;
3553 
3554         /*
3555          * Read the label from the boot device and generate a configuration.
3556          */
3557         config = spa_generate_rootconf(devpath, devid, &guid);
3558 #if defined(_OBP) && defined(_KERNEL)
3559         if (config == NULL) {
3560                 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3561                         /* iscsi boot */
3562                         get_iscsi_bootpath_phy(devpath);
3563                         config = spa_generate_rootconf(devpath, devid, &guid);
3564                 }
3565         }
3566 #endif
3567         if (config == NULL) {
3568                 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3569                     devpath);
3570                 return (EIO);
3571         }
3572 
3573         VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3574             &pname) == 0);
3575         VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3576 
3577         mutex_enter(&spa_namespace_lock);
3578         if ((spa = spa_lookup(pname)) != NULL) {
3579                 /*
3580                  * Remove the existing root pool from the namespace so that we
3581                  * can replace it with the correct config we just read in.
3582                  */
3583                 spa_remove(spa);
3584         }
3585 
3586         spa = spa_add(pname, config, NULL);
3587         spa->spa_is_root = B_TRUE;
3588         spa->spa_import_flags = ZFS_IMPORT_VERBATIM;


3862         spa_activate(spa, FREAD);
3863 
3864         /*
3865          * Pass off the heavy lifting to spa_load().
3866          * Pass B_TRUE for mosconfig because the user-supplied config
3867          * is actually the one to trust when doing an import.
3868          */
3869         error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3870 
3871         /*
3872          * If 'tryconfig' was at least parsable, return the current config.
3873          */
3874         if (spa->spa_root_vdev != NULL) {
3875                 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3876                 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3877                     poolname) == 0);
3878                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3879                     state) == 0);
3880                 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3881                     spa->spa_uberblock.ub_timestamp) == 0);
3882                 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3883                     spa->spa_load_info) == 0);
3884 
3885                 /*
3886                  * If the bootfs property exists on this pool then we
3887                  * copy it out so that external consumers can tell which
3888                  * pools are bootable.
3889                  */
3890                 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3891                         char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3892 
3893                         /*
3894                          * We have to play games with the name since the
3895                          * pool was opened as TRYIMPORT_NAME.
3896                          */
3897                         if (dsl_dsobj_to_dsname(spa_name(spa),
3898                             spa->spa_bootfs, tmpname) == 0) {
3899                                 char *cp;
3900                                 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3901 
3902                                 cp = strchr(tmpname, '/');
3903                                 if (cp == NULL) {


5581         zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5582             zio->io_flags));
5583         return (0);
5584 }
5585 
5586 static void
5587 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5588 {
5589         char *packed = NULL;
5590         size_t bufsize;
5591         size_t nvsize = 0;
5592         dmu_buf_t *db;
5593 
5594         VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5595 
5596         /*
5597          * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5598          * information.  This avoids the dbuf_will_dirty() path and
5599          * saves us a pre-read to get data we don't actually care about.
5600          */
5601         bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5602         packed = kmem_alloc(bufsize, KM_SLEEP);
5603 
5604         VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5605             KM_SLEEP) == 0);
5606         bzero(packed + nvsize, bufsize - nvsize);
5607 
5608         dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5609 
5610         kmem_free(packed, bufsize);
5611 
5612         VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5613         dmu_buf_will_dirty(db, tx);
5614         *(uint64_t *)db->db_data = nvsize;
5615         dmu_buf_rele(db, FTAG);
5616 }
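
/*
 * Review note (illustrative numbers, assuming SPA_CONFIG_BLOCKSIZE is
 * 16K): an nvlist that packs to nvsize == 20000 bytes is written as a
 * bufsize == P2ROUNDUP(20000, 16384) == 32768 byte buffer, with the
 * 12768-byte tail zeroed by the bzero() above, while the true nvsize
 * is recorded separately in the object's bonus buffer.
 */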
5617 
5618 static void
5619 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5620     const char *config, const char *entry)
5621 {


5666 {
5667         nvlist_t *config;
5668 
5669         if (list_is_empty(&spa->spa_config_dirty_list))
5670                 return;
5671 
5672         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5673 
5674         config = spa_config_generate(spa, spa->spa_root_vdev,
5675             dmu_tx_get_txg(tx), B_FALSE);
5676 
5677         spa_config_exit(spa, SCL_STATE, FTAG);
5678 
5679         if (spa->spa_config_syncing)
5680                 nvlist_free(spa->spa_config_syncing);
5681         spa->spa_config_syncing = config;
5682 
5683         spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5684 }
5685 
5686 static void
5687 spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
5688 {
5689         spa_t *spa = arg1;
5690         uint64_t version = *(uint64_t *)arg2;
5691 
5692         /*
5693          * Setting the version is special-cased when first creating the pool.
5694          */
5695         ASSERT(tx->tx_txg != TXG_INITIAL);
5696 
5697         ASSERT(version <= SPA_VERSION);
5698         ASSERT(version >= spa_version(spa));
5699 
5700         spa->spa_uberblock.ub_version = version;
5701         vdev_config_dirty(spa->spa_root_vdev);
5702 }
5703 
5704 /*
5705  * Set zpool properties.
5706  */
5707 static void
5708 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5709 {
5710         spa_t *spa = arg1;
5711         objset_t *mos = spa->spa_meta_objset;
5712         nvlist_t *nvp = arg2;
5713         nvpair_t *elem = NULL;
5714 
5715         mutex_enter(&spa->spa_props_lock);
5716 
5717         while ((elem = nvlist_next_nvpair(nvp, elem))) {
5718                 uint64_t intval;
5719                 char *strval, *fname;
5720                 zpool_prop_t prop;
5721                 const char *propname;
5722                 zprop_type_t proptype;
5723                 zfeature_info_t *feature;
5724 




5725                 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5726                 case ZPROP_INVAL:
5727                         /*
5728                          * We checked this earlier in spa_prop_validate().
5729                          */
5730                         ASSERT(zpool_prop_feature(nvpair_name(elem)));
5731 
5732                         fname = strchr(nvpair_name(elem), '@') + 1;
5733                         VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
5734 
5735                         spa_feature_enable(spa, feature, tx);
5736                         break;
5737 
5738                 case ZPOOL_PROP_VERSION:
5739                         VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5740                         /*
5741                          * The version is synced separately before other
5742                          * properties and should be correct by now.

5743                          */
5744                         ASSERT3U(spa_version(spa), >=, intval);







5745                         break;
5746 
5747                 case ZPOOL_PROP_ALTROOT:
5748                         /*
5749                          * 'altroot' is a non-persistent property. It should
5750                          * have been set temporarily at creation or import time.
5751                          */
5752                         ASSERT(spa->spa_root != NULL);
5753                         break;
5754 
5755                 case ZPOOL_PROP_READONLY:
5756                 case ZPOOL_PROP_CACHEFILE:
5757                         /*
5758                          * 'readonly' and 'cachefile' are also non-persistent
5759                          * properties.
5760                          */
5761                         break;
5762                 case ZPOOL_PROP_COMMENT:
5763                         VERIFY(nvpair_value_string(elem, &strval) == 0);
5764                         if (spa->spa_comment != NULL)
5765                                 spa_strfree(spa->spa_comment);
5766                         spa->spa_comment = spa_strdup(strval);
5767                         /*
5768                          * We need to dirty the configuration on all the vdevs
5769                          * so that their labels get updated.  It's unnecessary
5770                          * to do this for pool creation since the vdev's
5771                          * configuration has already been dirtied.
5772                          */
5773                         if (tx->tx_txg != TXG_INITIAL)
5774                                 vdev_config_dirty(spa->spa_root_vdev);
5775                         break;
5776                 default:
5777                         /*
5778                          * Set pool property values in the poolprops mos object.
5779                          */
5780                         if (spa->spa_pool_props_object == 0) {
5781                                 spa->spa_pool_props_object =
5782                                     zap_create_link(mos, DMU_OT_POOL_PROPS,



5783                                     DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5784                                     tx);

5785                         }
5786 
5787                         /* normalize the property name */
5788                         propname = zpool_prop_to_name(prop);
5789                         proptype = zpool_prop_get_type(prop);
5790 
5791                         if (nvpair_type(elem) == DATA_TYPE_STRING) {
5792                                 ASSERT(proptype == PROP_TYPE_STRING);
5793                                 VERIFY(nvpair_value_string(elem, &strval) == 0);
5794                                 VERIFY(zap_update(mos,
5795                                     spa->spa_pool_props_object, propname,
5796                                     1, strlen(strval) + 1, strval, tx) == 0);
5797 
5798                         } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5799                                 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5800 
5801                                 if (proptype == PROP_TYPE_INDEX) {
5802                                         const char *unused;
5803                                         VERIFY(zpool_prop_index_to_string(
5804                                             prop, intval, &unused) == 0);


5863         if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5864             spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5865                 dsl_pool_create_origin(dp, tx);
5866 
5867                 /* Keeping the origin open increases spa_minref */
5868                 spa->spa_minref += 3;
5869         }
5870 
5871         if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5872             spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5873                 dsl_pool_upgrade_clones(dp, tx);
5874         }
5875 
5876         if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5877             spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5878                 dsl_pool_upgrade_dir_clones(dp, tx);
5879 
5880                 /* Keeping the freedir open increases spa_minref */
5881                 spa->spa_minref += 3;
5882         }
5883 
5884         if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
5885             spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
5886                 spa_feature_create_zap_objects(spa, tx);
5887         }
5888 }
5889 
5890 /*
5891  * Sync the specified transaction group.  New blocks may be dirtied as
5892  * part of the process, so we iterate until it converges.
5893  */
5894 void
5895 spa_sync(spa_t *spa, uint64_t txg)
5896 {
5897         dsl_pool_t *dp = spa->spa_dsl_pool;
5898         objset_t *mos = spa->spa_meta_objset;
5899         bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5900         bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5901         vdev_t *rvd = spa->spa_root_vdev;
5902         vdev_t *vd;
5903         dmu_tx_t *tx;
5904         int error;
5905 
5906         VERIFY(spa_writeable(spa));
5907