45 #include <sys/metaslab.h>
46 #include <sys/metaslab_impl.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/spa_boot.h>
63 #include <sys/zfs_ioctl.h>
64 #include <sys/dsl_scan.h>
65
66 #ifdef _KERNEL
67 #include <sys/bootprops.h>
68 #include <sys/callb.h>
69 #include <sys/cpupart.h>
70 #include <sys/pool.h>
71 #include <sys/sysdc.h>
72 #include <sys/zone.h>
73 #endif /* _KERNEL */
74
75 #include "zfs_prop.h"
76 #include "zfs_comutil.h"
77
78 typedef enum zti_modes {
79 zti_mode_fixed, /* value is # of threads (min 1) */
80 zti_mode_online_percent, /* value is % of online CPUs */
81 zti_mode_batch, /* cpu-intensive; value is ignored */
82 zti_mode_null, /* don't create a taskq */
83 zti_nmodes
84 } zti_modes_t;
96 } zio_taskq_info_t;
97
98 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
99 "issue", "issue_high", "intr", "intr_high"
100 };
101
102 /*
103 * Define the taskq threads for the following I/O types:
104 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
105 */
106 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
107 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
108 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
109 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
110 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
111 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
112 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
113 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
114 };
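/*
 * The rows above follow the order given in the comment: NULL, READ,
 * WRITE, FREE, CLAIM, IOCTL. For example, the WRITE row issues through
 * a batch (CPU-intensive) taskq and uses fixed-size taskqs of 5, 8 and
 * 5 threads for its high-priority issue, interrupt, and high-priority
 * interrupt paths.
 */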
115
116 static dsl_syncfunc_t spa_sync_props;
117 static boolean_t spa_has_active_shared_spare(spa_t *spa);
118 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
119 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
120 char **ereport);
121 static void spa_vdev_resilver_done(spa_t *spa);
122
123 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
124 id_t zio_taskq_psrset_bind = PS_NONE;
125 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
126 uint_t zio_taskq_basedc = 80; /* base duty cycle */
127
128 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
129
130 /*
131 * This (illegal) pool name is used when temporarily importing a spa_t in order
132 * to get the vdev stats associated with the imported devices.
133 */
134 #define TRYIMPORT_NAME "$import"
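/*
 * '$' is not a legal character in a user-supplied pool name, so a
 * spa_t created under this name can never collide with a real pool.
 */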
135
151
152 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
153 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
154
155 if (strval != NULL)
156 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
157 else
158 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
159
160 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
161 nvlist_free(propval);
162 }
163
164 /*
165 * Get property values from the spa configuration.
166 */
167 static void
168 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
169 {
170 vdev_t *rvd = spa->spa_root_vdev;
171 uint64_t size;
172 uint64_t alloc;
173 uint64_t space;
174 uint64_t cap, version;
175 zprop_source_t src = ZPROP_SRC_NONE;
176 spa_config_dirent_t *dp;
177
178 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
179
180 if (rvd != NULL) {
181 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
182 size = metaslab_class_get_space(spa_normal_class(spa));
183 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
184 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
185 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
186 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
187 size - alloc, src);
188
189 space = 0;
190 for (int c = 0; c < rvd->vdev_children; c++) {
197 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
198 (spa_mode(spa) == FREAD), src);
199
200 cap = (size == 0) ? 0 : (alloc * 100 / size);
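/* Integer percent, rounded down: e.g. alloc = 333, size = 1000 gives cap = 33. */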
201 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
202
203 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
204 ddt_get_pool_dedup_ratio(spa), src);
205
206 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
207 rvd->vdev_state, src);
208
209 version = spa_version(spa);
210 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
211 src = ZPROP_SRC_DEFAULT;
212 else
213 src = ZPROP_SRC_LOCAL;
214 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
215 }
216
217 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
218
219 if (spa->spa_comment != NULL) {
220 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
221 0, ZPROP_SRC_LOCAL);
222 }
223
224 if (spa->spa_root != NULL)
225 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
226 0, ZPROP_SRC_LOCAL);
227
228 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
229 if (dp->scd_path == NULL) {
230 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
231 "none", 0, ZPROP_SRC_LOCAL);
232 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
233 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
234 dp->scd_path, 0, ZPROP_SRC_LOCAL);
235 }
236 }
336 out:
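/* ENOENT is tolerated (e.g. a pool with no props object yet); anything else is fatal. */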
337 if (err && err != ENOENT) {
338 nvlist_free(*nvp);
339 *nvp = NULL;
340 return (err);
341 }
342
343 return (0);
344 }
345
346 /*
347 * Validate the given pool properties nvlist and modify the list
348 * for the property values to be set.
349 */
350 static int
351 spa_prop_validate(spa_t *spa, nvlist_t *props)
352 {
353 nvpair_t *elem;
354 int error = 0, reset_bootfs = 0;
355 uint64_t objnum;
356
357 elem = NULL;
358 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
359 zpool_prop_t prop;
360 char *propname, *strval;
361 uint64_t intval;
362 objset_t *os;
363 char *slash, *check;
364
365 propname = nvpair_name(elem);
366
367 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
368 return (EINVAL);
369
370 switch (prop) {
371 case ZPOOL_PROP_VERSION:
372 error = nvpair_value_uint64(elem, &intval);
373 if (!error &&
374 (intval < spa_version(spa) || intval > SPA_VERSION))
375 error = EINVAL;
376 break;
377
378 case ZPOOL_PROP_DELEGATION:
379 case ZPOOL_PROP_AUTOREPLACE:
380 case ZPOOL_PROP_LISTSNAPS:
381 case ZPOOL_PROP_AUTOEXPAND:
382 error = nvpair_value_uint64(elem, &intval);
383 if (!error && intval > 1)
384 error = EINVAL;
385 break;
386
387 case ZPOOL_PROP_BOOTFS:
388 /*
389 * If the pool version is less than SPA_VERSION_BOOTFS,
390 * or the pool is still being created (version == 0),
391 * the bootfs property cannot be set.
392 */
393 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
394 error = ENOTSUP;
395 break;
396 }
397
398 /*
399 * Make sure the vdev config is bootable
400 */
401 if (!vdev_is_bootable(spa->spa_root_vdev)) {
402 error = ENOTSUP;
403 break;
404 }
405
406 reset_bootfs = 1;
407
408 error = nvpair_value_string(elem, &strval);
409
410 if (!error) {
411 uint64_t compress;
412
413 if (strval == NULL || strval[0] == '\0') {
414 objnum = zpool_prop_default_numeric(
415 ZPOOL_PROP_BOOTFS);
416 break;
417 }
418
419 if (error = dmu_objset_hold(strval, FTAG, &os))
420 break;
421
422 /* Must be ZPL and not gzip compressed. */
423
424 if (dmu_objset_type(os) != DMU_OST_ZFS) {
425 error = ENOTSUP;
426 } else if ((error = dsl_prop_get_integer(strval,
427 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
428 &compress, NULL)) == 0 &&
429 !BOOTFS_COMPRESS_VALID(compress)) {
430 error = ENOTSUP;
540
541 dp = kmem_alloc(sizeof (spa_config_dirent_t),
542 KM_SLEEP);
543
544 if (cachefile[0] == '\0')
545 dp->scd_path = spa_strdup(spa_config_path);
546 else if (strcmp(cachefile, "none") == 0)
547 dp->scd_path = NULL;
548 else
549 dp->scd_path = spa_strdup(cachefile);
550
551 list_insert_head(&spa->spa_config_list, dp);
552 if (need_sync)
553 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
554 }
555
556 int
557 spa_prop_set(spa_t *spa, nvlist_t *nvp)
558 {
559 int error;
560 nvpair_t *elem;
561 boolean_t need_sync = B_FALSE;
562 zpool_prop_t prop;
563
564 if ((error = spa_prop_validate(spa, nvp)) != 0)
565 return (error);
566
567 elem = NULL;
568 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
569 if ((prop = zpool_name_to_prop(
570 nvpair_name(elem))) == ZPROP_INVAL)
571 return (EINVAL);
572
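/*
 * cachefile, altroot and readonly are non-persistent, in-core
 * settings (see spa_sync_props()), so setting them never requires
 * a sync task.
 */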
573 if (prop == ZPOOL_PROP_CACHEFILE ||
574 prop == ZPOOL_PROP_ALTROOT ||
575 prop == ZPOOL_PROP_READONLY)
576 continue;
577
578 need_sync = B_TRUE;
579 break;
580 }
581
582 if (need_sync)
583 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
584 spa, nvp, 3));
585 else
586 return (0);
587 }
588
589 /*
590 * If the bootfs property value is dsobj, clear it.
591 */
592 void
593 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
594 {
595 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
596 VERIFY(zap_remove(spa->spa_meta_objset,
597 spa->spa_pool_props_object,
598 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
599 spa->spa_bootfs = 0;
600 }
601 }
602
603 /*
604 * Change the GUID for the pool. This is done so that we can later
605 * re-import a pool built from a clone of our own vdevs. We will modify
1590 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1591 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1592 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1593 mutex_exit(&spa->spa_props_lock);
1594 }
1595
1596 typedef struct spa_load_error {
1597 uint64_t sle_meta_count;
1598 uint64_t sle_data_count;
1599 } spa_load_error_t;
1600
1601 static void
1602 spa_load_verify_done(zio_t *zio)
1603 {
1604 blkptr_t *bp = zio->io_bp;
1605 spa_load_error_t *sle = zio->io_private;
1606 dmu_object_type_t type = BP_GET_TYPE(bp);
1607 int error = zio->io_error;
1608
1609 if (error) {
1610 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1611 type != DMU_OT_INTENT_LOG)
1612 atomic_add_64(&sle->sle_meta_count, 1);
1613 else
1614 atomic_add_64(&sle->sle_data_count, 1);
1615 }
1616 zio_data_buf_free(zio->io_data, zio->io_size);
1617 }
1618
1619 /*ARGSUSED*/
1620 static int
1621 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1622 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1623 {
1624 if (bp != NULL) {
1625 zio_t *rio = arg;
1626 size_t size = BP_GET_PSIZE(bp);
1627 void *data = zio_data_buf_alloc(size);
1628
1629 zio_nowait(zio_read(rio, spa, bp, data, size,
1630 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1820 */
1821 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1822 &spa->spa_ubsync.ub_version) != 0)
1823 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1824
1825 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1826 &spa->spa_config_txg);
1827
1828 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1829 spa_guid_exists(pool_guid, 0)) {
1830 error = EEXIST;
1831 } else {
1832 spa->spa_config_guid = pool_guid;
1833
1834 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1835 &nvl) == 0) {
1836 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1837 KM_SLEEP) == 0);
1838 }
1839
1840 gethrestime(&spa->spa_loaded_ts);
1841 error = spa_load_impl(spa, pool_guid, config, state, type,
1842 mosconfig, &ereport);
1843 }
1844
1845 spa->spa_minref = refcount_count(&spa->spa_refcount);
1846 if (error) {
1847 if (error != EEXIST) {
1848 spa->spa_loaded_ts.tv_sec = 0;
1849 spa->spa_loaded_ts.tv_nsec = 0;
1850 }
1851 if (error != EBADF) {
1852 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1853 }
1854 }
1855 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1856 spa->spa_ena = 0;
1857
1858 return (error);
1859 }
1860
1861 /*
1862 * Load an existing storage pool, using the pool's builtin spa_config as a
1863 * source of configuration information.
1864 */
1865 static int
1866 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1867 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1868 char **ereport)
1869 {
1870 int error = 0;
1871 nvlist_t *nvroot = NULL;
1872 vdev_t *rvd;
1873 uberblock_t *ub = &spa->spa_uberblock;
1874 uint64_t children, config_cache_txg = spa->spa_config_txg;
1875 int orig_mode = spa->spa_mode;
1876 int parse;
1877 uint64_t obj;
1878
1879 /*
1880 * If this is an untrusted config, access the pool in read-only mode.
1881 * This prevents things like resilvering recently removed devices.
1882 */
1883 if (!mosconfig)
1884 spa->spa_mode = FREAD;
1885
1886 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1887
1888 spa->spa_load_state = state;
1889
1890 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1891 return (EINVAL);
1892
1893 parse = (type == SPA_IMPORT_EXISTING ?
1894 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1895
1896 /*
1897 * Create "The Godfather" zio to hold all async IOs
1937 *
1938 * If we're assembling a new pool that's been split off from an
1939 * existing pool, the labels haven't yet been updated so we skip
1940 * validation for now.
1941 */
1942 if (type != SPA_IMPORT_ASSEMBLE) {
1943 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1944 error = vdev_validate(rvd, mosconfig);
1945 spa_config_exit(spa, SCL_ALL, FTAG);
1946
1947 if (error != 0)
1948 return (error);
1949
1950 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1951 return (ENXIO);
1952 }
1953
1954 /*
1955 * Find the best uberblock.
1956 */
1957 vdev_uberblock_load(NULL, rvd, ub);
1958
1959 /*
1960 * If we weren't able to find a single valid uberblock, return failure.
1961 */
1962 if (ub->ub_txg == 0)
1963 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1964
1965 /*
1966 * If the pool is newer than the code, we can't open it.
1967 */
1968 if (ub->ub_version > SPA_VERSION)
1969 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1970
1971 /*
1972 * If the vdev guid sum doesn't match the uberblock, we have an
1973 * incomplete configuration. We first check to see if the pool
1974 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
1975 * If it is, defer the vdev_guid_sum check till later so we
1976 * can handle missing vdevs.
1977 */
1978 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1979 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1980 rvd->vdev_guid_sum != ub->ub_guid_sum)
1981 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1982
1983 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1984 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1985 spa_try_repair(spa, config);
1986 spa_config_exit(spa, SCL_ALL, FTAG);
1987 nvlist_free(spa->spa_config_splitting);
1988 spa->spa_config_splitting = NULL;
1989 }
1990
1991 /*
1992 * Initialize internal SPA structures.
1993 */
1994 spa->spa_state = POOL_STATE_ACTIVE;
1995 spa->spa_ubsync = spa->spa_uberblock;
1996 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1997 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1998 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1999 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2000 spa->spa_claim_max_txg = spa->spa_first_txg;
2001 spa->spa_prev_software_version = ub->ub_software_version;
2002
2003 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2004 if (error)
2005 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2006 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2007
2008 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2009 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2010
2011 if (!mosconfig) {
2012 uint64_t hostid;
2013 nvlist_t *policy = NULL, *nvconfig;
2014
2015 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2016 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2017
2018 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2019 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2020 char *hostname;
2021 unsigned long myhostid = 0;
2022
2023 VERIFY(nvlist_lookup_string(nvconfig,
2024 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2025
2026 #ifdef _KERNEL
2027 myhostid = zone_get_hostid(NULL);
2028 #else /* _KERNEL */
2029 /*
2030 * We're emulating the system's hostid in userland, so
2208 * Validate the config, using the MOS config to fill in any
2209 * information which might be missing. If we fail to validate
2210 * the config then declare the pool unfit for use. If we're
2211 * assembling a pool from a split, the log is not transferred
2212 * over.
2213 */
2214 if (type != SPA_IMPORT_ASSEMBLE) {
2215 nvlist_t *nvconfig;
2216
2217 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2218 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2219
2220 if (!spa_config_valid(spa, nvconfig)) {
2221 nvlist_free(nvconfig);
2222 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2223 ENXIO));
2224 }
2225 nvlist_free(nvconfig);
2226
2227 /*
2228 * Now that we've validated the config, check the state of the
2229 * root vdev. If it can't be opened, it indicates one or
2230 * more toplevel vdevs are faulted.
2231 */
2232 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2233 return (ENXIO);
2234
2235 if (spa_check_logs(spa)) {
2236 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2237 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2238 }
2239 }
2240
2241 /*
2242 * We've successfully opened the pool, verify that we're ready
2243 * to start pushing transactions.
2244 */
2245 if (state != SPA_LOAD_TRYIMPORT) {
2246 if (error = spa_load_verify(spa))
2247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2248 error));
2249 }
2250
2251 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2252 spa->spa_load_max_txg == UINT64_MAX)) {
2253 dmu_tx_t *tx;
2254 int need_update = B_FALSE;
2255
2256 ASSERT(state != SPA_LOAD_TRYIMPORT);
2257
2258 /*
2259 * Claim log blocks that haven't been committed yet.
2260 * This must all happen in a single txg.
2261 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2330
2331 return (0);
2332 }
2333
2334 static int
2335 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2336 {
2337 int mode = spa->spa_mode;
2338
2339 spa_unload(spa);
2340 spa_deactivate(spa);
2341
2342 spa->spa_load_max_txg--;
2343
2344 spa_activate(spa, mode);
2345 spa_async_suspend(spa);
2346
2347 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2348 }
2349
2350 static int
2351 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2352 uint64_t max_request, int rewind_flags)
2353 {
2354 nvlist_t *config = NULL;
2355 int load_error, rewind_error;
2356 uint64_t safe_rewind_txg;
2357 uint64_t min_txg;
2358
2359 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2360 spa->spa_load_max_txg = spa->spa_load_txg;
2361 spa_set_log_state(spa, SPA_LOG_CLEAR);
2362 } else {
2363 spa->spa_load_max_txg = max_request;
2364 }
2365
2366 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2367 mosconfig);
2368 if (load_error == 0)
2369 return (0);
2370
2371 if (spa->spa_root_vdev != NULL)
2372 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2373
2374 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2375 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2376
2377 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2378 nvlist_free(config);
2379 return (load_error);
2380 }
2381
2382 /* Price of rolling back is discarding txgs, including log */
2383 if (state == SPA_LOAD_RECOVER)
2384 spa_set_log_state(spa, SPA_LOG_CLEAR);
2385
2386 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2387 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2388 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2389 TXG_INITIAL : safe_rewind_txg;
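/*
 * Worked example (assuming TXG_DEFER_SIZE == 2): if the newest
 * uberblock is txg 100, a normal rewind may retry txgs 100 down to
 * 98, while ZPOOL_EXTREME_REWIND allows retries back to TXG_INITIAL.
 */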
2390
2391 /*
2392 * Continue as long as we're finding errors, we're still within
2393 * the acceptable rewind range, and we're still finding uberblocks
2394 */
2395 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2396 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2397 if (spa->spa_load_max_txg < safe_rewind_txg)
2398 spa->spa_extreme_rewind = B_TRUE;
2399 rewind_error = spa_load_retry(spa, state, mosconfig);
2400 }
2401
2402 spa->spa_extreme_rewind = B_FALSE;
2403 spa->spa_load_max_txg = UINT64_MAX;
2404
2405 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2406 spa_config_set(spa, config);
2407
2408 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
2409 }
2410
2411 /*
2412 * Pool Open/Import
2413 *
2414 * The import case is identical to an open except that the configuration is sent
2415 * down from userland, instead of being grabbed from the configuration cache. For
2416 * the case of an open, the pool configuration will exist in the
2417 * POOL_STATE_UNINITIALIZED state.
2418 *
2419 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2420 * the same time we open the pool, without having to keep the spa_t around in
2421 * some ambiguous state.
2422 */
2423 static int
2424 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2425 nvlist_t **config)
2426 {
2427 spa_t *spa;
2428 spa_load_state_t state = SPA_LOAD_OPEN;
2658 ZPOOL_CONFIG_GUID, &guid) == 0);
2659
2660 vd = NULL;
2661 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2662 if (guid ==
2663 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2664 vd = spa->spa_l2cache.sav_vdevs[j];
2665 break;
2666 }
2667 }
2668 ASSERT(vd != NULL);
2669
2670 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2671 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2672 == 0);
2673 vdev_get_stats(vd, vs);
2674 }
2675 }
2676 }
2677
2678 int
2679 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
2680 {
2681 int error;
2682 spa_t *spa;
2683
2684 *config = NULL;
2685 error = spa_open_common(name, &spa, FTAG, NULL, config);
2686
2687 if (spa != NULL) {
2688 /*
2689 * This still leaves a window of inconsistency where the spares
2690 * or l2cache devices could change and the config would be
2691 * self-inconsistent.
2692 */
2693 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2694
2695 if (*config != NULL) {
2696 uint64_t loadtimes[2];
2697
2698 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2699 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2700 VERIFY(nvlist_add_uint64_array(*config,
2701 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2702
2703 VERIFY(nvlist_add_uint64(*config,
2704 ZPOOL_CONFIG_ERRCOUNT,
2705 spa_get_errlog_size(spa)) == 0);
2706
2707 if (spa_suspended(spa))
2708 VERIFY(nvlist_add_uint64(*config,
2709 ZPOOL_CONFIG_SUSPENDED,
2710 spa->spa_failmode) == 0);
2711
2712 spa_add_spares(spa, *config);
2713 spa_add_l2cache(spa, *config);
2714 }
2715 }
2716
2717 /*
2718 * We want to get the alternate root even for faulted pools, so we cheat
2719 * and call spa_lookup() directly.
2720 */
2721 if (altroot) {
2722 if (spa == NULL) {
2723 mutex_enter(&spa_namespace_lock);
2724 spa = spa_lookup(name);
2725 if (spa)
2726 spa_altroot(spa, altroot, buflen);
2727 else
2728 altroot[0] = '\0';
2729 spa = NULL;
2730 mutex_exit(&spa_namespace_lock);
2731 } else {
2732 spa_altroot(spa, altroot, buflen);
2733 }
2914 }
2915 }
2916
2917 /*
2918 * Pool Creation
2919 */
2920 int
2921 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2922 const char *history_str, nvlist_t *zplprops)
2923 {
2924 spa_t *spa;
2925 char *altroot = NULL;
2926 vdev_t *rvd;
2927 dsl_pool_t *dp;
2928 dmu_tx_t *tx;
2929 int error = 0;
2930 uint64_t txg = TXG_INITIAL;
2931 nvlist_t **spares, **l2cache;
2932 uint_t nspares, nl2cache;
2933 uint64_t version, obj;
2934
2935 /*
2936 * If this pool already exists, return failure.
2937 */
2938 mutex_enter(&spa_namespace_lock);
2939 if (spa_lookup(pool) != NULL) {
2940 mutex_exit(&spa_namespace_lock);
2941 return (EEXIST);
2942 }
2943
2944 /*
2945 * Allocate a new spa_t structure.
2946 */
2947 (void) nvlist_lookup_string(props,
2948 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2949 spa = spa_add(pool, NULL, altroot);
2950 spa_activate(spa, spa_mode_global);
2951
2952 if (props && (error = spa_prop_validate(spa, props))) {
2953 spa_deactivate(spa);
2954 spa_remove(spa);
2955 mutex_exit(&spa_namespace_lock);
2956 return (error);
2957 }
2958
2959 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2960 &version) != 0)
2961 version = SPA_VERSION;
2962 ASSERT(version <= SPA_VERSION);
2963
2964 spa->spa_first_txg = txg;
2965 spa->spa_uberblock.ub_txg = txg - 1;
2966 spa->spa_uberblock.ub_version = version;
2967 spa->spa_ubsync = spa->spa_uberblock;
2968
2969 /*
2970 * Create "The Godfather" zio to hold all async IOs
2971 */
2972 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2973 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2974
2975 /*
2976 * Create the root vdev.
2977 */
2978 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2979
2980 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2981
2982 ASSERT(error != 0 || rvd != NULL);
3018 spa_load_spares(spa);
3019 spa_config_exit(spa, SCL_ALL, FTAG);
3020 spa->spa_spares.sav_sync = B_TRUE;
3021 }
3022
3023 /*
3024 * Get the list of level 2 cache devices, if specified.
3025 */
3026 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3027 &l2cache, &nl2cache) == 0) {
3028 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3029 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3030 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3031 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3032 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3033 spa_load_l2cache(spa);
3034 spa_config_exit(spa, SCL_ALL, FTAG);
3035 spa->spa_l2cache.sav_sync = B_TRUE;
3036 }
3037
3038 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3039 spa->spa_meta_objset = dp->dp_meta_objset;
3040
3041 /*
3042 * Create DDTs (dedup tables).
3043 */
3044 ddt_create(spa);
3045
3046 spa_update_dspace(spa);
3047
3048 tx = dmu_tx_create_assigned(dp, txg);
3049
3050 /*
3051 * Create the pool config object.
3052 */
3053 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3054 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3055 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3056
3057 if (zap_add(spa->spa_meta_objset,
3058 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3059 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3060 cmn_err(CE_PANIC, "failed to add pool config");
3061 }
3062
3063 if (zap_add(spa->spa_meta_objset,
3064 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3065 sizeof (uint64_t), 1, &version, tx) != 0) {
3066 cmn_err(CE_PANIC, "failed to add pool version");
3067 }
3068
3069 /* Newly created pools with the right version are always deflated. */
3070 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3071 spa->spa_deflate = TRUE;
3072 if (zap_add(spa->spa_meta_objset,
3073 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3074 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3075 cmn_err(CE_PANIC, "failed to add deflate");
3076 }
3077 }
3078
3079 /*
3080 * Create the deferred-free bpobj. Turn off compression
3081 * because sync-to-convergence takes longer if the blocksize
3082 * keeps changing.
3233 vdev_t *rvd, *bvd, *avd = NULL;
3234 nvlist_t *config, *nvtop;
3235 uint64_t guid, txg;
3236 char *pname;
3237 int error;
3238
3239 /*
3240 * Read the label from the boot device and generate a configuration.
3241 */
3242 config = spa_generate_rootconf(devpath, devid, &guid);
3243 #if defined(_OBP) && defined(_KERNEL)
3244 if (config == NULL) {
3245 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3246 /* iscsi boot */
3247 get_iscsi_bootpath_phy(devpath);
3248 config = spa_generate_rootconf(devpath, devid, &guid);
3249 }
3250 }
3251 #endif
3252 if (config == NULL) {
3253 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3254 devpath);
3255 return (EIO);
3256 }
3257
3258 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3259 &pname) == 0);
3260 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3261
3262 mutex_enter(&spa_namespace_lock);
3263 if ((spa = spa_lookup(pname)) != NULL) {
3264 /*
3265 * Remove the existing root pool from the namespace so that we
3266 * can replace it with the correct config we just read in.
3267 */
3268 spa_remove(spa);
3269 }
3270
3271 spa = spa_add(pname, config, NULL);
3272 spa->spa_is_root = B_TRUE;
3273 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3547 spa_activate(spa, FREAD);
3548
3549 /*
3550 * Pass off the heavy lifting to spa_load().
3551 * Pass TRUE for mosconfig because the user-supplied config
3552 * is actually the one to trust when doing an import.
3553 */
3554 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3555
3556 /*
3557 * If 'tryconfig' was at least parsable, return the current config.
3558 */
3559 if (spa->spa_root_vdev != NULL) {
3560 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3561 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3562 poolname) == 0);
3563 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3564 state) == 0);
3565 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3566 spa->spa_uberblock.ub_timestamp) == 0);
3567
3568 /*
3569 * If the bootfs property exists on this pool then we
3570 * copy it out so that external consumers can tell which
3571 * pools are bootable.
3572 */
3573 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3574 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3575
3576 /*
3577 * We have to play games with the name since the
3578 * pool was opened as TRYIMPORT_NAME.
3579 */
3580 if (dsl_dsobj_to_dsname(spa_name(spa),
3581 spa->spa_bootfs, tmpname) == 0) {
3582 char *cp;
3583 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3584
3585 cp = strchr(tmpname, '/');
3586 if (cp == NULL) {
5264 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5265 zio->io_flags));
5266 return (0);
5267 }
5268
5269 static void
5270 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5271 {
5272 char *packed = NULL;
5273 size_t bufsize;
5274 size_t nvsize = 0;
5275 dmu_buf_t *db;
5276
5277 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5278
5279 /*
5280 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5281 * information. This avoids the dbuf_will_dirty() path and
5282 * saves us a pre-read to get data we don't actually care about.
5283 */
5284 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
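/*
 * e.g. assuming SPA_CONFIG_BLOCKSIZE is 16K, a 20000-byte packed
 * nvlist is written as a single 32K buffer, zero-padded past nvsize.
 */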
5285 packed = kmem_alloc(bufsize, KM_SLEEP);
5286
5287 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5288 KM_SLEEP) == 0);
5289 bzero(packed + nvsize, bufsize - nvsize);
5290
5291 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5292
5293 kmem_free(packed, bufsize);
5294
5295 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5296 dmu_buf_will_dirty(db, tx);
5297 *(uint64_t *)db->db_data = nvsize;
5298 dmu_buf_rele(db, FTAG);
5299 }
5300
5301 static void
5302 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5303 const char *config, const char *entry)
5304 {
5349 {
5350 nvlist_t *config;
5351
5352 if (list_is_empty(&spa->spa_config_dirty_list))
5353 return;
5354
5355 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5356
5357 config = spa_config_generate(spa, spa->spa_root_vdev,
5358 dmu_tx_get_txg(tx), B_FALSE);
5359
5360 spa_config_exit(spa, SCL_STATE, FTAG);
5361
5362 if (spa->spa_config_syncing)
5363 nvlist_free(spa->spa_config_syncing);
5364 spa->spa_config_syncing = config;
5365
5366 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5367 }
5368
5369 /*
5370 * Set zpool properties.
5371 */
5372 static void
5373 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5374 {
5375 spa_t *spa = arg1;
5376 objset_t *mos = spa->spa_meta_objset;
5377 nvlist_t *nvp = arg2;
5378 nvpair_t *elem;
5379 uint64_t intval;
5380 char *strval;
5381 zpool_prop_t prop;
5382 const char *propname;
5383 zprop_type_t proptype;
5384
5385 mutex_enter(&spa->spa_props_lock);
5386
5387 elem = NULL;
5388 while ((elem = nvlist_next_nvpair(nvp, elem))) {
5389 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5390 case ZPOOL_PROP_VERSION:
5391 /*
5392 * Only set version for non-zpool-creation cases
5393 * (set/import). spa_create() needs special care
5394 * for version setting.
5395 */
5396 if (tx->tx_txg != TXG_INITIAL) {
5397 VERIFY(nvpair_value_uint64(elem,
5398 &intval) == 0);
5399 ASSERT(intval <= SPA_VERSION);
5400 ASSERT(intval >= spa_version(spa));
5401 spa->spa_uberblock.ub_version = intval;
5402 vdev_config_dirty(spa->spa_root_vdev);
5403 }
5404 break;
5405
5406 case ZPOOL_PROP_ALTROOT:
5407 /*
5408 * 'altroot' is a non-persistent property. It should
5409 * have been set temporarily at creation or import time.
5410 */
5411 ASSERT(spa->spa_root != NULL);
5412 break;
5413
5414 case ZPOOL_PROP_READONLY:
5415 case ZPOOL_PROP_CACHEFILE:
5416 /*
5417 * 'readonly' and 'cachefile' are also non-persistent
5418 * properties.
5419 */
5420 break;
5421 case ZPOOL_PROP_COMMENT:
5422 VERIFY(nvpair_value_string(elem, &strval) == 0);
5423 if (spa->spa_comment != NULL)
5424 spa_strfree(spa->spa_comment);
5425 spa->spa_comment = spa_strdup(strval);
5426 /*
5427 * We need to dirty the configuration on all the vdevs
5428 * so that their labels get updated. It's unnecessary
5429 * to do this for pool creation since the vdev's
5430 * configuration has already been dirtied.
5431 */
5432 if (tx->tx_txg != TXG_INITIAL)
5433 vdev_config_dirty(spa->spa_root_vdev);
5434 break;
5435 default:
5436 /*
5437 * Set pool property values in the poolprops mos object.
5438 */
5439 if (spa->spa_pool_props_object == 0) {
5440 VERIFY((spa->spa_pool_props_object =
5441 zap_create(mos, DMU_OT_POOL_PROPS,
5442 DMU_OT_NONE, 0, tx)) > 0);
5443
5444 VERIFY(zap_update(mos,
5445 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5446 8, 1, &spa->spa_pool_props_object, tx)
5447 == 0);
5448 }
5449
5450 /* normalize the property name */
5451 propname = zpool_prop_to_name(prop);
5452 proptype = zpool_prop_get_type(prop);
5453
5454 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5455 ASSERT(proptype == PROP_TYPE_STRING);
5456 VERIFY(nvpair_value_string(elem, &strval) == 0);
5457 VERIFY(zap_update(mos,
5458 spa->spa_pool_props_object, propname,
5459 1, strlen(strval) + 1, strval, tx) == 0);
5460
5461 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5462 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5463
5464 if (proptype == PROP_TYPE_INDEX) {
5465 const char *unused;
5466 VERIFY(zpool_prop_index_to_string(
5467 prop, intval, &unused) == 0);
5526 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5527 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5528 dsl_pool_create_origin(dp, tx);
5529
5530 /* Keeping the origin open increases spa_minref */
5531 spa->spa_minref += 3;
5532 }
5533
5534 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5535 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5536 dsl_pool_upgrade_clones(dp, tx);
5537 }
5538
5539 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5540 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5541 dsl_pool_upgrade_dir_clones(dp, tx);
5542
5543 /* Keeping the freedir open increases spa_minref */
5544 spa->spa_minref += 3;
5545 }
5546 }
5547
5548 /*
5549 * Sync the specified transaction group. New blocks may be dirtied as
5550 * part of the process, so we iterate until it converges.
5551 */
5552 void
5553 spa_sync(spa_t *spa, uint64_t txg)
5554 {
5555 dsl_pool_t *dp = spa->spa_dsl_pool;
5556 objset_t *mos = spa->spa_meta_objset;
5557 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5558 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5559 vdev_t *rvd = spa->spa_root_vdev;
5560 vdev_t *vd;
5561 dmu_tx_t *tx;
5562 int error;
5563
5564 VERIFY(spa_writeable(spa));
5565
45 #include <sys/metaslab.h>
46 #include <sys/metaslab_impl.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/spa_boot.h>
63 #include <sys/zfs_ioctl.h>
64 #include <sys/dsl_scan.h>
65 #include <sys/zfeature.h>
66
67 #ifdef _KERNEL
68 #include <sys/bootprops.h>
69 #include <sys/callb.h>
70 #include <sys/cpupart.h>
71 #include <sys/pool.h>
72 #include <sys/sysdc.h>
73 #include <sys/zone.h>
74 #endif /* _KERNEL */
75
76 #include "zfs_prop.h"
77 #include "zfs_comutil.h"
78
79 typedef enum zti_modes {
80 zti_mode_fixed, /* value is # of threads (min 1) */
81 zti_mode_online_percent, /* value is % of online CPUs */
82 zti_mode_batch, /* cpu-intensive; value is ignored */
83 zti_mode_null, /* don't create a taskq */
84 zti_nmodes
85 } zti_modes_t;
97 } zio_taskq_info_t;
98
99 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
100 "issue", "issue_high", "intr", "intr_high"
101 };
102
103 /*
104 * Define the taskq threads for the following I/O types:
105 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
106 */
107 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
108 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
109 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
110 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
111 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
112 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
113 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
114 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
115 };
116
117 static dsl_syncfunc_t spa_sync_version;
118 static dsl_syncfunc_t spa_sync_props;
119 static boolean_t spa_has_active_shared_spare(spa_t *spa);
120 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
121 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
122 char **ereport);
123 static void spa_vdev_resilver_done(spa_t *spa);
124
125 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
126 id_t zio_taskq_psrset_bind = PS_NONE;
127 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
128 uint_t zio_taskq_basedc = 80; /* base duty cycle */
129
130 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
131
132 /*
133 * This (illegal) pool name is used when temporarily importing a spa_t in order
134 * to get the vdev stats associated with the imported devices.
135 */
136 #define TRYIMPORT_NAME "$import"
137
153
154 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
155 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
156
157 if (strval != NULL)
158 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
159 else
160 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
161
162 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
163 nvlist_free(propval);
164 }
165
166 /*
167 * Get property values from the spa configuration.
168 */
169 static void
170 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
171 {
172 vdev_t *rvd = spa->spa_root_vdev;
173 dsl_pool_t *pool = spa->spa_dsl_pool;
174 uint64_t size;
175 uint64_t alloc;
176 uint64_t space;
177 uint64_t cap, version;
178 zprop_source_t src = ZPROP_SRC_NONE;
179 spa_config_dirent_t *dp;
180
181 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
182
183 if (rvd != NULL) {
184 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
185 size = metaslab_class_get_space(spa_normal_class(spa));
186 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
187 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
188 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
189 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
190 size - alloc, src);
191
192 space = 0;
193 for (int c = 0; c < rvd->vdev_children; c++) {
200 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
201 (spa_mode(spa) == FREAD), src);
202
203 cap = (size == 0) ? 0 : (alloc * 100 / size);
204 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
205
206 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
207 ddt_get_pool_dedup_ratio(spa), src);
208
209 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
210 rvd->vdev_state, src);
211
212 version = spa_version(spa);
213 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
214 src = ZPROP_SRC_DEFAULT;
215 else
216 src = ZPROP_SRC_LOCAL;
217 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
218 }
219
220 if (pool != NULL) {
221 dsl_dir_t *freedir = pool->dp_free_dir;
222
223 /*
224 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
225 * when opening a pool created before that version, freedir is NULL.
226 */
227 if (freedir != NULL) {
228 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
229 freedir->dd_phys->dd_used_bytes, src);
230 } else {
231 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
232 NULL, 0, src);
233 }
234 }
235
236 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
237
238 if (spa->spa_comment != NULL) {
239 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
240 0, ZPROP_SRC_LOCAL);
241 }
242
243 if (spa->spa_root != NULL)
244 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
245 0, ZPROP_SRC_LOCAL);
246
247 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
248 if (dp->scd_path == NULL) {
249 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
250 "none", 0, ZPROP_SRC_LOCAL);
251 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
252 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
253 dp->scd_path, 0, ZPROP_SRC_LOCAL);
254 }
255 }
355 out:
356 if (err && err != ENOENT) {
357 nvlist_free(*nvp);
358 *nvp = NULL;
359 return (err);
360 }
361
362 return (0);
363 }
364
365 /*
366 * Validate the given pool properties nvlist and modify the list
367 * for the property values to be set.
368 */
369 static int
370 spa_prop_validate(spa_t *spa, nvlist_t *props)
371 {
372 nvpair_t *elem;
373 int error = 0, reset_bootfs = 0;
374 uint64_t objnum;
375 boolean_t has_feature = B_FALSE;
376
377 elem = NULL;
378 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
379 uint64_t intval;
380 char *strval, *slash, *check, *fname;
381 const char *propname = nvpair_name(elem);
382 zpool_prop_t prop = zpool_name_to_prop(propname);
383
384 switch (prop) {
385 case ZPROP_INVAL:
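/*
 * An unrecognized name may be a feature property, e.g. a
 * (hypothetical) "feature@async_destroy", whose only accepted
 * payload is a uint64 of 0, meaning "enabled".
 */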
386 if (!zpool_prop_feature(propname)) {
387 error = EINVAL;
388 break;
389 }
390
391 /*
392 * Sanitize the input.
393 */
394 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
395 error = EINVAL;
396 break;
397 }
398
399 if (nvpair_value_uint64(elem, &intval) != 0) {
400 error = EINVAL;
401 break;
402 }
403
404 if (intval != 0) {
405 error = EINVAL;
406 break;
407 }
408
409 fname = strchr(propname, '@') + 1;
410 if (zfeature_lookup_name(fname, NULL) != 0) {
411 error = EINVAL;
412 break;
413 }
414
415 has_feature = B_TRUE;
416 break;
417
418 case ZPOOL_PROP_VERSION:
419 error = nvpair_value_uint64(elem, &intval);
420 if (!error &&
421 (intval < spa_version(spa) ||
422 intval > SPA_VERSION_BEFORE_FEATURES ||
423 has_feature))
424 error = EINVAL;
425 break;
426
427 case ZPOOL_PROP_DELEGATION:
428 case ZPOOL_PROP_AUTOREPLACE:
429 case ZPOOL_PROP_LISTSNAPS:
430 case ZPOOL_PROP_AUTOEXPAND:
431 error = nvpair_value_uint64(elem, &intval);
432 if (!error && intval > 1)
433 error = EINVAL;
434 break;
435
436 case ZPOOL_PROP_BOOTFS:
437 /*
438 * If the pool version is less than SPA_VERSION_BOOTFS,
439 * or the pool is still being created (version == 0),
440 * the bootfs property cannot be set.
441 */
442 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
443 error = ENOTSUP;
444 break;
445 }
446
447 /*
448 * Make sure the vdev config is bootable
449 */
450 if (!vdev_is_bootable(spa->spa_root_vdev)) {
451 error = ENOTSUP;
452 break;
453 }
454
455 reset_bootfs = 1;
456
457 error = nvpair_value_string(elem, &strval);
458
459 if (!error) {
460 objset_t *os;
461 uint64_t compress;
462
463 if (strval == NULL || strval[0] == '\0') {
464 objnum = zpool_prop_default_numeric(
465 ZPOOL_PROP_BOOTFS);
466 break;
467 }
468
469 if (error = dmu_objset_hold(strval, FTAG, &os))
470 break;
471
472 /* Must be ZPL and not gzip compressed. */
473
474 if (dmu_objset_type(os) != DMU_OST_ZFS) {
475 error = ENOTSUP;
476 } else if ((error = dsl_prop_get_integer(strval,
477 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
478 &compress, NULL)) == 0 &&
479 !BOOTFS_COMPRESS_VALID(compress)) {
480 error = ENOTSUP;
590
591 dp = kmem_alloc(sizeof (spa_config_dirent_t),
592 KM_SLEEP);
593
594 if (cachefile[0] == '\0')
595 dp->scd_path = spa_strdup(spa_config_path);
596 else if (strcmp(cachefile, "none") == 0)
597 dp->scd_path = NULL;
598 else
599 dp->scd_path = spa_strdup(cachefile);
600
601 list_insert_head(&spa->spa_config_list, dp);
602 if (need_sync)
603 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
604 }
605
606 int
607 spa_prop_set(spa_t *spa, nvlist_t *nvp)
608 {
609 int error;
610 nvpair_t *elem = NULL;
611 boolean_t need_sync = B_FALSE;
612
613 if ((error = spa_prop_validate(spa, nvp)) != 0)
614 return (error);
615
616 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
617 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
618
619 if (prop == ZPOOL_PROP_CACHEFILE ||
620 prop == ZPOOL_PROP_ALTROOT ||
621 prop == ZPOOL_PROP_READONLY)
622 continue;
623
624 if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
625 uint64_t ver;
626
627 if (prop == ZPOOL_PROP_VERSION) {
628 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
629 } else {
630 ASSERT(zpool_prop_feature(nvpair_name(elem)));
631 ver = SPA_VERSION_FEATURES;
632 need_sync = B_TRUE;
633 }
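/*
 * Enabling any feature implicitly upgrades the pool to
 * SPA_VERSION_FEATURES, so it goes through the same version
 * sync task as an explicit "version" property set.
 */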
634
635 /* Save time if the version is already set. */
636 if (ver == spa_version(spa))
637 continue;
638
639 /*
640 * In addition to the pool directory object, we might
641 * create the pool properties object, the features for
642 * read object, the features for write object, or the
643 * feature descriptions object.
644 */
645 error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
646 spa_sync_version, spa, &ver, 6);
647 if (error)
648 return (error);
649 continue;
650 }
651
652 need_sync = B_TRUE;
653 break;
654 }
655
656 if (need_sync) {
657 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
658 spa, nvp, 6));
659 }
660
661 return (0);
662 }
663
664 /*
665 * If the bootfs property value is dsobj, clear it.
666 */
667 void
668 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
669 {
670 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
671 VERIFY(zap_remove(spa->spa_meta_objset,
672 spa->spa_pool_props_object,
673 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
674 spa->spa_bootfs = 0;
675 }
676 }
677
678 /*
679 * Change the GUID for the pool. This is done so that we can later
680 * re-import a pool built from a clone of our own vdevs. We will modify
1665 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1666 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1667 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1668 mutex_exit(&spa->spa_props_lock);
1669 }
1670
1671 typedef struct spa_load_error {
1672 uint64_t sle_meta_count;
1673 uint64_t sle_data_count;
1674 } spa_load_error_t;
1675
1676 static void
1677 spa_load_verify_done(zio_t *zio)
1678 {
1679 blkptr_t *bp = zio->io_bp;
1680 spa_load_error_t *sle = zio->io_private;
1681 dmu_object_type_t type = BP_GET_TYPE(bp);
1682 int error = zio->io_error;
1683
1684 if (error) {
1685 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1686 type != DMU_OT_INTENT_LOG)
1687 atomic_add_64(&sle->sle_meta_count, 1);
1688 else
1689 atomic_add_64(&sle->sle_data_count, 1);
1690 }
1691 zio_data_buf_free(zio->io_data, zio->io_size);
1692 }
1693
1694 /*ARGSUSED*/
1695 static int
1696 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1697 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1698 {
1699 if (bp != NULL) {
1700 zio_t *rio = arg;
1701 size_t size = BP_GET_PSIZE(bp);
1702 void *data = zio_data_buf_alloc(size);
1703
1704 zio_nowait(zio_read(rio, spa, bp, data, size,
1705 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1895 */
1896 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1897 &spa->spa_ubsync.ub_version) != 0)
1898 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1899
1900 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1901 &spa->spa_config_txg);
1902
1903 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1904 spa_guid_exists(pool_guid, 0)) {
1905 error = EEXIST;
1906 } else {
1907 spa->spa_config_guid = pool_guid;
1908
1909 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1910 &nvl) == 0) {
1911 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1912 KM_SLEEP) == 0);
1913 }
1914
1915 nvlist_free(spa->spa_load_info);
1916 spa->spa_load_info = fnvlist_alloc();
1917
1918 gethrestime(&spa->spa_loaded_ts);
1919 error = spa_load_impl(spa, pool_guid, config, state, type,
1920 mosconfig, &ereport);
1921 }
1922
1923 spa->spa_minref = refcount_count(&spa->spa_refcount);
1924 if (error) {
1925 if (error != EEXIST) {
1926 spa->spa_loaded_ts.tv_sec = 0;
1927 spa->spa_loaded_ts.tv_nsec = 0;
1928 }
1929 if (error != EBADF) {
1930 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1931 }
1932 }
1933 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1934 spa->spa_ena = 0;
1935
1936 return (error);
1937 }
1938
1939 /*
1940 * Load an existing storage pool, using the pool's builtin spa_config as a
1941 * source of configuration information.
1942 */
1943 static int
1944 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1945 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1946 char **ereport)
1947 {
1948 int error = 0;
1949 nvlist_t *nvroot = NULL;
1950 nvlist_t *label;
1951 vdev_t *rvd;
1952 uberblock_t *ub = &spa->spa_uberblock;
1953 uint64_t children, config_cache_txg = spa->spa_config_txg;
1954 int orig_mode = spa->spa_mode;
1955 int parse;
1956 uint64_t obj;
1957 boolean_t missing_feat_write = B_FALSE;
1958
1959 /*
1960 * If this is an untrusted config, access the pool in read-only mode.
1961 * This prevents things like resilvering recently removed devices.
1962 */
1963 if (!mosconfig)
1964 spa->spa_mode = FREAD;
1965
1966 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1967
1968 spa->spa_load_state = state;
1969
1970 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1971 return (EINVAL);
1972
1973 parse = (type == SPA_IMPORT_EXISTING ?
1974 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1975
1976 /*
1977 * Create "The Godfather" zio to hold all async IOs
2017 *
2018 * If we're assembling a new pool that's been split off from an
2019 * existing pool, the labels haven't yet been updated so we skip
2020 * validation for now.
2021 */
2022 if (type != SPA_IMPORT_ASSEMBLE) {
2023 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2024 error = vdev_validate(rvd, mosconfig);
2025 spa_config_exit(spa, SCL_ALL, FTAG);
2026
2027 if (error != 0)
2028 return (error);
2029
2030 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2031 return (ENXIO);
2032 }
2033
2034 /*
2035 * Find the best uberblock.
2036 */
2037 vdev_uberblock_load(rvd, ub, &label);
2038
2039 /*
2040 * If we weren't able to find a single valid uberblock, return failure.
2041 */
2042 if (ub->ub_txg == 0) {
2043 nvlist_free(label);
2044 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2045 }
2046
2047 /*
2048 * If the pool has an unsupported version we can't open it.
2049 */
2050 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2051 nvlist_free(label);
2052 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2053 }
2054
2055 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2056 nvlist_t *features;
2057
2058 /*
2059 * If we weren't able to find what's necessary for reading the
2060 * MOS in the label, return failure.
2061 */
2062 if (label == NULL || nvlist_lookup_nvlist(label,
2063 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
2064 nvlist_free(label);
2065 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2066 ENXIO));
2067 }
2068
2069 /*
2070 * Update our in-core representation with the definitive values
2071 * from the label.
2072 */
2073 nvlist_free(spa->spa_label_features);
2074 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2075 }
2076
2077 nvlist_free(label);
2078
2079 /*
2080 * Look through entries in the label nvlist's features_for_read. If
2081 * there is a feature listed there which we don't understand then we
2082 * cannot open a pool.
2083 */
2084 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2085 nvlist_t *unsup_feat;
2086
2087 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2088 0);
2089
2090 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2091 NULL); nvp != NULL;
2092 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2093 if (!zfeature_is_supported(nvpair_name(nvp))) {
2094 VERIFY(nvlist_add_string(unsup_feat,
2095 nvpair_name(nvp), "") == 0);
2096 }
2097 }
2098
2099 if (!nvlist_empty(unsup_feat)) {
2100 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2101 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2102 nvlist_free(unsup_feat);
2103 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2104 ENOTSUP));
2105 }
2106
2107 nvlist_free(unsup_feat);
2108 }
2109
2110 /*
2111 * If the vdev guid sum doesn't match the uberblock, we have an
2112 * incomplete configuration. We first check to see if the pool
2113 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2114 * If it is, defer the vdev_guid_sum check till later so we
2115 * can handle missing vdevs.
2116 */
2117 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2118 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2119 rvd->vdev_guid_sum != ub->ub_guid_sum)
2120 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2121
2122 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2123 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2124 spa_try_repair(spa, config);
2125 spa_config_exit(spa, SCL_ALL, FTAG);
2126 nvlist_free(spa->spa_config_splitting);
2127 spa->spa_config_splitting = NULL;
2128 }
2129
2130 /*
2131 * Initialize internal SPA structures.
2132 */
2133 spa->spa_state = POOL_STATE_ACTIVE;
2134 spa->spa_ubsync = spa->spa_uberblock;
2135 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2136 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2137 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2138 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2139 spa->spa_claim_max_txg = spa->spa_first_txg;
2140 spa->spa_prev_software_version = ub->ub_software_version;
2141
2142 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2143 if (error)
2144 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2145 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2146
2147 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2148 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2149
2150 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2151 boolean_t missing_feat_read = B_FALSE;
2152 nvlist_t *unsup_feat;
2153
2154 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2155 &spa->spa_feat_for_read_obj) != 0) {
2156 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2157 }
2158
2159 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2160 &spa->spa_feat_for_write_obj) != 0) {
2161 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2162 }
2163
2164 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2165 &spa->spa_feat_desc_obj) != 0) {
2166 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2167 }
2168
2169 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2170 0);
2171
2172 if (!feature_is_supported(spa->spa_meta_objset,
2173 spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
2174 unsup_feat))
2175 missing_feat_read = B_TRUE;
2176
2177 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
2178 if (!feature_is_supported(spa->spa_meta_objset,
2179 spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
2180 unsup_feat))
2181 missing_feat_write = B_TRUE;
2182 }
2183
2184 if (!nvlist_empty(unsup_feat)) {
2185 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2186 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2187 }
2188
2189 nvlist_free(unsup_feat);
2190
2191 if (!missing_feat_read) {
2192 fnvlist_add_boolean(spa->spa_load_info,
2193 ZPOOL_CONFIG_CAN_RDONLY);
2194 }
2195
2196 /*
2197 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2198 * twofold: to determine whether the pool is available for
2199 * import in read-write mode and (if it is not) whether the
2200 * pool is available for import in read-only mode. If the pool
2201 * is available for import in read-write mode, it is displayed
2202 * as available in userland; if it is not available for import
2203 * in read-only mode, it is displayed as unavailable in
2204 * userland. If the pool is available for import in read-only
2205 * mode but not read-write mode, it is displayed as unavailable
2206 * in userland with a special note that the pool is actually
2207 * available for open in read-only mode.
2208 *
2209 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2210 * missing a feature for write, we must first determine whether
2211 * the pool can be opened read-only before returning to
2212 * userland in order to know whether to display the
2213 * abovementioned note.
2214 */
2215 if (missing_feat_read || (missing_feat_write &&
2216 spa_writeable(spa))) {
2217 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2218 ENOTSUP));
2219 }
2220 }
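
/*
 * Editorial sketch, not part of the original flow: a userland consumer
 * of a failed tryimport can inspect the load info populated above.
 * Assuming 'config' is the nvlist handed back to userland, roughly:
 *
 *	nvlist_t *info, *unsup;
 *
 *	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 *	    &info) == 0) {
 *		if (nvlist_exists(info, ZPOOL_CONFIG_CAN_RDONLY))
 *			(void) printf("pool is importable read-only\n");
 *		if (nvlist_lookup_nvlist(info,
 *		    ZPOOL_CONFIG_UNSUP_FEAT, &unsup) == 0)
 *			dump_nvlist(unsup, 8);
 *	}
 *
 * where dump_nvlist() is the libnvpair debugging helper.
 */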
2221
2222 spa->spa_is_initializing = B_TRUE;
2223 error = dsl_pool_open(spa->spa_dsl_pool);
2224 spa->spa_is_initializing = B_FALSE;
2225 if (error != 0)
2226 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2227
2228 if (!mosconfig) {
2229 uint64_t hostid;
2230 nvlist_t *policy = NULL, *nvconfig;
2231
2232 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2233 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2234
2235 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2236 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2237 char *hostname;
2238 unsigned long myhostid = 0;
2239
2240 VERIFY(nvlist_lookup_string(nvconfig,
2241 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2242
2243 #ifdef _KERNEL
2244 myhostid = zone_get_hostid(NULL);
2245 #else /* _KERNEL */
2246 /*
2247 * We're emulating the system's hostid in userland, so
2425 * Validate the config, using the MOS config to fill in any
2426 * information which might be missing. If we fail to validate
2427 * the config then declare the pool unfit for use. If we're
2428 * assembling a pool from a split, the log is not transferred
2429 * over.
2430 */
2431 if (type != SPA_IMPORT_ASSEMBLE) {
2432 nvlist_t *nvconfig;
2433
2434 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2435 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2436
2437 if (!spa_config_valid(spa, nvconfig)) {
2438 nvlist_free(nvconfig);
2439 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2440 ENXIO));
2441 }
2442 nvlist_free(nvconfig);
2443
2444 /*
2445 * Now that we've validated the config, check the state of the
2446 * root vdev. If it can't be opened, it indicates one or
2447 * more toplevel vdevs are faulted.
2448 */
2449 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2450 return (ENXIO);
2451
2452 if (spa_check_logs(spa)) {
2453 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2454 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2455 }
2456 }
2457
2458 if (missing_feat_write) {
2459 ASSERT(state == SPA_LOAD_TRYIMPORT);
2460
2461 /*
2462 * At this point, we know that we can open the pool in
2463 * read-only mode but not read-write mode. We now have enough
2464 * information and can return to userland.
2465 */
2466 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2467 }
2468
2469 /*
2470 * We've successfully opened the pool; verify that we're ready
2471 * to start pushing transactions.
2472 */
2473 if (state != SPA_LOAD_TRYIMPORT) {
2474 if ((error = spa_load_verify(spa)) != 0)
2475 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2476 error));
2477 }
2478
2479 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2480 spa->spa_load_max_txg == UINT64_MAX)) {
2481 dmu_tx_t *tx;
2482 int need_update = B_FALSE;
2483
2484 ASSERT(state != SPA_LOAD_TRYIMPORT);
2485
2486 /*
2487 * Claim log blocks that haven't been committed yet.
2488 * This must all happen in a single txg.
2489 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2558
2559 return (0);
2560 }
2561
2562 static int
2563 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2564 {
2565 int mode = spa->spa_mode;
2566
2567 spa_unload(spa);
2568 spa_deactivate(spa);
2569
2570 spa->spa_load_max_txg--;
2571
2572 spa_activate(spa, mode);
2573 spa_async_suspend(spa);
2574
2575 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2576 }
2577
2578 /*
2579 * If spa_load() fails, this function will try loading prior txgs. If
2580 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
2581 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
2582 * function will not rewind the pool and will return the same error as
2583 * spa_load().
2584 */
2585 static int
2586 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2587 uint64_t max_request, int rewind_flags)
2588 {
2589 nvlist_t *loadinfo = NULL;
2590 nvlist_t *config = NULL;
2591 int load_error, rewind_error;
2592 uint64_t safe_rewind_txg;
2593 uint64_t min_txg;
2594
2595 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2596 spa->spa_load_max_txg = spa->spa_load_txg;
2597 spa_set_log_state(spa, SPA_LOG_CLEAR);
2598 } else {
2599 spa->spa_load_max_txg = max_request;
2600 }
2601
2602 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2603 mosconfig);
2604 if (load_error == 0)
2605 return (0);
2606
2607 if (spa->spa_root_vdev != NULL)
2608 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2609
2610 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2611 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2612
2613 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2614 nvlist_free(config);
2615 return (load_error);
2616 }
2617
2618 if (state == SPA_LOAD_RECOVER) {
2619 /* Price of rolling back is discarding txgs, including log */
2620 spa_set_log_state(spa, SPA_LOG_CLEAR);
2621 } else {
2622 /*
2623 * If we aren't rolling back, save the load info from our first
2624 * import attempt so that we can restore it after attempting
2625 * to rewind.
2626 */
2627 loadinfo = spa->spa_load_info;
2628 spa->spa_load_info = fnvlist_alloc();
2629 }
2630
2631 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2632 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2633 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2634 TXG_INITIAL : safe_rewind_txg;
2635
2636 /*
2637 * Continue as long as we're finding errors, we're still within
2638 * the acceptable rewind range, and we're still finding uberblocks
2639 */
2640 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2641 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2642 if (spa->spa_load_max_txg < safe_rewind_txg)
2643 spa->spa_extreme_rewind = B_TRUE;
2644 rewind_error = spa_load_retry(spa, state, mosconfig);
2645 }
2646
2647 spa->spa_extreme_rewind = B_FALSE;
2648 spa->spa_load_max_txg = UINT64_MAX;
2649
2650 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2651 spa_config_set(spa, config);
2652
2653 if (state == SPA_LOAD_RECOVER) {
2654 ASSERT3P(loadinfo, ==, NULL);
2655 return (rewind_error);
2656 } else {
2657 /* Store the rewind info as part of the initial load info */
2658 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2659 spa->spa_load_info);
2660
2661 /* Restore the initial load info */
2662 fnvlist_free(spa->spa_load_info);
2663 spa->spa_load_info = loadinfo;
2664
2665 return (load_error);
2666 }
2667 }
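
/*
 * Illustrative sketch: a recovery-mode import drives the rewind logic
 * above roughly as follows, 'policy' being the zpool_rewind_policy_t
 * supplied by userland (see zpool_get_rewind_policy() in zfs_comutil.c):
 *
 *	zpool_get_rewind_policy(config, &policy);
 *	if (policy.zrp_request & ZPOOL_DO_REWIND)
 *		state = SPA_LOAD_RECOVER;
 *	error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
 *	    policy.zrp_request);
 */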
2668
2669 /*
2670 * Pool Open/Import
2671 *
2672 * The import case is identical to an open except that the configuration is
2673 * sent down from userland, instead of grabbed from the configuration cache.
2674 * For the case of an open, the pool configuration will exist in the
2675 * POOL_STATE_UNINITIALIZED state.
2676 *
2677 * The stats information (gen/count/ustats) is used to gather vdev statistics
2678 * at the same time we open the pool, without having to keep around the spa_t
2679 * in some ambiguous state.
2680 */
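/*
 * Minimal usage sketch, assuming the usual tag discipline; spa_open()
 * is a thin wrapper around this function:
 *
 *	spa_t *spa;
 *	int error;
 *
 *	error = spa_open_common("tank", &spa, FTAG, NULL, NULL);
 *	if (error == 0) {
 *		... operate on the pool ...
 *		spa_close(spa, FTAG);
 *	}
 */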
2681 static int
2682 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2683 nvlist_t **config)
2684 {
2685 spa_t *spa;
2686 spa_load_state_t state = SPA_LOAD_OPEN;
2916 ZPOOL_CONFIG_GUID, &guid) == 0);
2917
2918 vd = NULL;
2919 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2920 if (guid ==
2921 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2922 vd = spa->spa_l2cache.sav_vdevs[j];
2923 break;
2924 }
2925 }
2926 ASSERT(vd != NULL);
2927
2928 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2929 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2930 == 0);
2931 vdev_get_stats(vd, vs);
2932 }
2933 }
2934 }
2935
2936 static void
2937 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
2938 {
2939 nvlist_t *features;
2940 zap_cursor_t zc;
2941 zap_attribute_t za;
2942
2943 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2944 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2945
2946 if (spa->spa_feat_for_read_obj != 0) {
2947 for (zap_cursor_init(&zc, spa->spa_meta_objset,
2948 spa->spa_feat_for_read_obj);
2949 zap_cursor_retrieve(&zc, &za) == 0;
2950 zap_cursor_advance(&zc)) {
2951 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
2952 za.za_num_integers == 1);
2953 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
2954 za.za_first_integer));
2955 }
2956 zap_cursor_fini(&zc);
2957 }
2958
2959 if (spa->spa_feat_for_write_obj != 0) {
2960 for (zap_cursor_init(&zc, spa->spa_meta_objset,
2961 spa->spa_feat_for_write_obj);
2962 zap_cursor_retrieve(&zc, &za) == 0;
2963 zap_cursor_advance(&zc)) {
2964 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
2965 za.za_num_integers == 1);
2966 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
2967 za.za_first_integer));
2968 }
2969 zap_cursor_fini(&zc);
2970 }
2971
2972 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
2973 features) == 0);
2974 nvlist_free(features);
2975 }
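
/*
 * For illustration, the feature_stats nvlist attached above maps feature
 * GUIDs to reference counts; a refcount of zero means the feature is
 * enabled but not yet in use (the feature names below are hypothetical):
 *
 *	feature_stats:
 *		com.example:async_destroy = 0
 *		com.example:fancy_checksum = 12
 */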
2976
2977 int
2978 spa_get_stats(const char *name, nvlist_t **config,
2979 char *altroot, size_t buflen)
2980 {
2981 int error;
2982 spa_t *spa;
2983
2984 *config = NULL;
2985 error = spa_open_common(name, &spa, FTAG, NULL, config);
2986
2987 if (spa != NULL) {
2988 /*
2989 * This still leaves a window of inconsistency where the spares
2990 * or l2cache devices could change and the config would be
2991 * self-inconsistent.
2992 */
2993 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2994
2995 if (*config != NULL) {
2996 uint64_t loadtimes[2];
2997
2998 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2999 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3000 VERIFY(nvlist_add_uint64_array(*config,
3001 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3002
3003 VERIFY(nvlist_add_uint64(*config,
3004 ZPOOL_CONFIG_ERRCOUNT,
3005 spa_get_errlog_size(spa)) == 0);
3006
3007 if (spa_suspended(spa))
3008 VERIFY(nvlist_add_uint64(*config,
3009 ZPOOL_CONFIG_SUSPENDED,
3010 spa->spa_failmode) == 0);
3011
3012 spa_add_spares(spa, *config);
3013 spa_add_l2cache(spa, *config);
3014 spa_add_feature_stats(spa, *config);
3015 }
3016 }
3017
3018 /*
3019 * We want to get the alternate root even for faulted pools, so we cheat
3020 * and call spa_lookup() directly.
3021 */
3022 if (altroot) {
3023 if (spa == NULL) {
3024 mutex_enter(&spa_namespace_lock);
3025 spa = spa_lookup(name);
3026 if (spa)
3027 spa_altroot(spa, altroot, buflen);
3028 else
3029 altroot[0] = '\0';
3030 spa = NULL;
3031 mutex_exit(&spa_namespace_lock);
3032 } else {
3033 spa_altroot(spa, altroot, buflen);
3034 }
3215 }
3216 }
3217
3218 /*
3219 * Pool Creation
3220 */
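/*
 * For illustration only, the 'nvroot' tree passed in has roughly the
 * following shape; the device paths are hypothetical, and userland
 * assembles the real tree before issuing the create ioctl:
 *
 *	type = "root"
 *	children[0]:
 *		type = "mirror"
 *		children[0]: { type = "disk", path = "/dev/dsk/c0t0d0s0" }
 *		children[1]: { type = "disk", path = "/dev/dsk/c0t1d0s0" }
 *	spares[], l2cache[]		(optional aux vdev arrays)
 */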
3221 int
3222 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
3223 const char *history_str, nvlist_t *zplprops)
3224 {
3225 spa_t *spa;
3226 char *altroot = NULL;
3227 vdev_t *rvd;
3228 dsl_pool_t *dp;
3229 dmu_tx_t *tx;
3230 int error = 0;
3231 uint64_t txg = TXG_INITIAL;
3232 nvlist_t **spares, **l2cache;
3233 uint_t nspares, nl2cache;
3234 uint64_t version, obj;
3235 boolean_t has_features;
3236
3237 /*
3238 * If this pool already exists, return failure.
3239 */
3240 mutex_enter(&spa_namespace_lock);
3241 if (spa_lookup(pool) != NULL) {
3242 mutex_exit(&spa_namespace_lock);
3243 return (EEXIST);
3244 }
3245
3246 /*
3247 * Allocate a new spa_t structure.
3248 */
3249 (void) nvlist_lookup_string(props,
3250 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3251 spa = spa_add(pool, NULL, altroot);
3252 spa_activate(spa, spa_mode_global);
3253
3254 if (props && (error = spa_prop_validate(spa, props))) {
3255 spa_deactivate(spa);
3256 spa_remove(spa);
3257 mutex_exit(&spa_namespace_lock);
3258 return (error);
3259 }
3260
3261 has_features = B_FALSE;
3262 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
3263 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
3264 if (zpool_prop_feature(nvpair_name(elem)))
3265 has_features = B_TRUE;
3266 }
3267
3268 if (has_features || nvlist_lookup_uint64(props,
3269 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
3270 version = SPA_VERSION;
3271 }
3272 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
3273
3274 spa->spa_first_txg = txg;
3275 spa->spa_uberblock.ub_txg = txg - 1;
3276 spa->spa_uberblock.ub_version = version;
3277 spa->spa_ubsync = spa->spa_uberblock;
3278
3279 /*
3280 * Create "The Godfather" zio to hold all async IOs
3281 */
3282 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
3283 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
3284
3285 /*
3286 * Create the root vdev.
3287 */
3288 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3289
3290 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
3291
3292 ASSERT(error != 0 || rvd != NULL);
3328 spa_load_spares(spa);
3329 spa_config_exit(spa, SCL_ALL, FTAG);
3330 spa->spa_spares.sav_sync = B_TRUE;
3331 }
3332
3333 /*
3334 * Get the list of level 2 cache devices, if specified.
3335 */
3336 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3337 &l2cache, &nl2cache) == 0) {
3338 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3339 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3340 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3341 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3342 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3343 spa_load_l2cache(spa);
3344 spa_config_exit(spa, SCL_ALL, FTAG);
3345 spa->spa_l2cache.sav_sync = B_TRUE;
3346 }
3347
3348 spa->spa_is_initializing = B_TRUE;
3349 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3350 spa->spa_meta_objset = dp->dp_meta_objset;
3351 spa->spa_is_initializing = B_FALSE;
3352
3353 /*
3354 * Create DDTs (dedup tables).
3355 */
3356 ddt_create(spa);
3357
3358 spa_update_dspace(spa);
3359
3360 tx = dmu_tx_create_assigned(dp, txg);
3361
3362 /*
3363 * Create the pool config object.
3364 */
3365 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3366 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3367 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3368
3369 if (zap_add(spa->spa_meta_objset,
3370 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3371 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3372 cmn_err(CE_PANIC, "failed to add pool config");
3373 }
3374
3375 if (spa_version(spa) >= SPA_VERSION_FEATURES)
3376 spa_feature_create_zap_objects(spa, tx);
3377
3378 if (zap_add(spa->spa_meta_objset,
3379 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3380 sizeof (uint64_t), 1, &version, tx) != 0) {
3381 cmn_err(CE_PANIC, "failed to add pool version");
3382 }
3383
3384 /* Newly created pools with the right version are always deflated. */
3385 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3386 spa->spa_deflate = TRUE;
3387 if (zap_add(spa->spa_meta_objset,
3388 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3389 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3390 cmn_err(CE_PANIC, "failed to add deflate");
3391 }
3392 }
3393
3394 /*
3395 * Create the deferred-free bpobj. Turn off compression
3396 * because sync-to-convergence takes longer if the blocksize
3397 * keeps changing.
3548 vdev_t *rvd, *bvd, *avd = NULL;
3549 nvlist_t *config, *nvtop;
3550 uint64_t guid, txg;
3551 char *pname;
3552 int error;
3553
3554 /*
3555 * Read the label from the boot device and generate a configuration.
3556 */
3557 config = spa_generate_rootconf(devpath, devid, &guid);
3558 #if defined(_OBP) && defined(_KERNEL)
3559 if (config == NULL) {
3560 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3561 /* iscsi boot */
3562 get_iscsi_bootpath_phy(devpath);
3563 config = spa_generate_rootconf(devpath, devid, &guid);
3564 }
3565 }
3566 #endif
3567 if (config == NULL) {
3568 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3569 devpath);
3570 return (EIO);
3571 }
3572
3573 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3574 &pname) == 0);
3575 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3576
3577 mutex_enter(&spa_namespace_lock);
3578 if ((spa = spa_lookup(pname)) != NULL) {
3579 /*
3580 * Remove the existing root pool from the namespace so that we
3581 * can replace it with the correct config we just read in.
3582 */
3583 spa_remove(spa);
3584 }
3585
3586 spa = spa_add(pname, config, NULL);
3587 spa->spa_is_root = B_TRUE;
3588 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3862 spa_activate(spa, FREAD);
3863
3864 /*
3865 * Pass off the heavy lifting to spa_load().
3866 * Pass B_TRUE for mosconfig because the user-supplied config
3867 * is actually the one to trust when doing an import.
3868 */
3869 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3870
3871 /*
3872 * If 'tryconfig' was at least parsable, return the current config.
3873 */
3874 if (spa->spa_root_vdev != NULL) {
3875 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3876 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3877 poolname) == 0);
3878 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3879 state) == 0);
3880 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3881 spa->spa_uberblock.ub_timestamp) == 0);
3882 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3883 spa->spa_load_info) == 0);
3884
3885 /*
3886 * If the bootfs property exists on this pool then we
3887 * copy it out so that external consumers can tell which
3888 * pools are bootable.
3889 */
3890 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3891 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3892
3893 /*
3894 * We have to play games with the name since the
3895 * pool was opened as TRYIMPORT_NAME.
3896 */
3897 if (dsl_dsobj_to_dsname(spa_name(spa),
3898 spa->spa_bootfs, tmpname) == 0) {
3899 char *cp;
3900 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3901
3902 cp = strchr(tmpname, '/');
3903 if (cp == NULL) {
5581 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5582 zio->io_flags));
5583 return (0);
5584 }
5585
5586 static void
5587 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5588 {
5589 char *packed = NULL;
5590 size_t bufsize;
5591 size_t nvsize = 0;
5592 dmu_buf_t *db;
5593
5594 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5595
5596 /*
5597 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5598 * information. This avoids the dbuf_will_dirty() path and
5599 * saves us a pre-read to get data we don't actually care about.
5600 */
5601 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5602 packed = kmem_alloc(bufsize, KM_SLEEP);
5603
5604 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5605 KM_SLEEP) == 0);
5606 bzero(packed + nvsize, bufsize - nvsize);
5607
5608 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5609
5610 kmem_free(packed, bufsize);
5611
5612 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5613 dmu_buf_will_dirty(db, tx);
5614 *(uint64_t *)db->db_data = nvsize;
5615 dmu_buf_rele(db, FTAG);
5616 }
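
/*
 * Sketch of the matching read path, mirroring load_nvlist() earlier in
 * this file: the bonus buffer carries the packed size and the object
 * body carries the XDR-encoded nvlist:
 *
 *	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 *	nvsize = *(uint64_t *)db->db_data;
 *	dmu_buf_rele(db, FTAG);
 *
 *	packed = kmem_alloc(nvsize, KM_SLEEP);
 *	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
 *	    DMU_READ_PREFETCH);
 *	if (error == 0)
 *		error = nvlist_unpack(packed, nvsize, &nv, 0);
 *	kmem_free(packed, nvsize);
 */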
5617
5618 static void
5619 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5620 const char *config, const char *entry)
5621 {
5666 {
5667 nvlist_t *config;
5668
5669 if (list_is_empty(&spa->spa_config_dirty_list))
5670 return;
5671
5672 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5673
5674 config = spa_config_generate(spa, spa->spa_root_vdev,
5675 dmu_tx_get_txg(tx), B_FALSE);
5676
5677 spa_config_exit(spa, SCL_STATE, FTAG);
5678
5679 if (spa->spa_config_syncing)
5680 nvlist_free(spa->spa_config_syncing);
5681 spa->spa_config_syncing = config;
5682
5683 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5684 }
5685
5686 static void
5687 spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
5688 {
5689 spa_t *spa = arg1;
5690 uint64_t version = *(uint64_t *)arg2;
5691
5692 /*
5693 * Setting the version is special-cased when first creating the pool.
5694 */
5695 ASSERT(tx->tx_txg != TXG_INITIAL);
5696
5697 ASSERT(version <= SPA_VERSION);
5698 ASSERT(version >= spa_version(spa));
5699
5700 spa->spa_uberblock.ub_version = version;
5701 vdev_config_dirty(spa->spa_root_vdev);
5702 }
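
/*
 * Illustrative dispatch, with the blocks-modified count a rough guess:
 * spa_prop_set() runs this as a synctask when the 'version' property is
 * raised, roughly:
 *
 *	(void) dsl_sync_task_do(spa_get_dsl(spa), NULL,
 *	    spa_sync_version, spa, &version, 6);
 */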
5703
5704 /*
5705 * Set zpool properties.
5706 */
5707 static void
5708 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5709 {
5710 spa_t *spa = arg1;
5711 objset_t *mos = spa->spa_meta_objset;
5712 nvlist_t *nvp = arg2;
5713 nvpair_t *elem = NULL;
5714
5715 mutex_enter(&spa->spa_props_lock);
5716
5717 while ((elem = nvlist_next_nvpair(nvp, elem))) {
5718 uint64_t intval;
5719 char *strval, *fname;
5720 zpool_prop_t prop;
5721 const char *propname;
5722 zprop_type_t proptype;
5723 zfeature_info_t *feature;
5724
5725 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5726 case ZPROP_INVAL:
5727 /*
5728 * We checked this earlier in spa_prop_validate().
5729 */
5730 ASSERT(zpool_prop_feature(nvpair_name(elem)));
5731
5732 fname = strchr(nvpair_name(elem), '@') + 1;
5733 VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
5734
5735 spa_feature_enable(spa, feature, tx);
5736 break;
5737
5738 case ZPOOL_PROP_VERSION:
5739 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5740 /*
5741 * The version is synced separately before other
5742 * properties and should be correct by now.
5743 */
5744 ASSERT3U(spa_version(spa), >=, intval);
5745 break;
5746
5747 case ZPOOL_PROP_ALTROOT:
5748 /*
5749 * 'altroot' is a non-persistent property. It should
5750 * have been set temporarily at creation or import time.
5751 */
5752 ASSERT(spa->spa_root != NULL);
5753 break;
5754
5755 case ZPOOL_PROP_READONLY:
5756 case ZPOOL_PROP_CACHEFILE:
5757 /*
5758 * 'readonly' and 'cachefile' are also non-persistent
5759 * properties.
5760 */
5761 break;
5762 case ZPOOL_PROP_COMMENT:
5763 VERIFY(nvpair_value_string(elem, &strval) == 0);
5764 if (spa->spa_comment != NULL)
5765 spa_strfree(spa->spa_comment);
5766 spa->spa_comment = spa_strdup(strval);
5767 /*
5768 * We need to dirty the configuration on all the vdevs
5769 * so that their labels get updated. It's unnecessary
5770 * to do this for pool creation since the vdev's
5771 * configuration has already been dirtied.
5772 */
5773 if (tx->tx_txg != TXG_INITIAL)
5774 vdev_config_dirty(spa->spa_root_vdev);
5775 break;
5776 default:
5777 /*
5778 * Set pool property values in the poolprops mos object.
5779 */
5780 if (spa->spa_pool_props_object == 0) {
5781 spa->spa_pool_props_object =
5782 zap_create_link(mos, DMU_OT_POOL_PROPS,
5783 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5784 tx);
5785 }
5786
5787 /* normalize the property name */
5788 propname = zpool_prop_to_name(prop);
5789 proptype = zpool_prop_get_type(prop);
5790
5791 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5792 ASSERT(proptype == PROP_TYPE_STRING);
5793 VERIFY(nvpair_value_string(elem, &strval) == 0);
5794 VERIFY(zap_update(mos,
5795 spa->spa_pool_props_object, propname,
5796 1, strlen(strval) + 1, strval, tx) == 0);
5797
5798 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5799 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5800
5801 if (proptype == PROP_TYPE_INDEX) {
5802 const char *unused;
5803 VERIFY(zpool_prop_index_to_string(
5804 prop, intval, &unused) == 0);
5863 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5864 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5865 dsl_pool_create_origin(dp, tx);
5866
5867 /* Keeping the origin open increases spa_minref */
5868 spa->spa_minref += 3;
5869 }
5870
5871 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5872 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5873 dsl_pool_upgrade_clones(dp, tx);
5874 }
5875
5876 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5877 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5878 dsl_pool_upgrade_dir_clones(dp, tx);
5879
5880 /* Keeping the freedir open increases spa_minref */
5881 spa->spa_minref += 3;
5882 }
5883
5884 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
5885 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
5886 spa_feature_create_zap_objects(spa, tx);
5887 }
5888 }
5889
5890 /*
5891 * Sync the specified transaction group. New blocks may be dirtied as
5892 * part of the process, so we iterate until it converges.
5893 */
5894 void
5895 spa_sync(spa_t *spa, uint64_t txg)
5896 {
5897 dsl_pool_t *dp = spa->spa_dsl_pool;
5898 objset_t *mos = spa->spa_meta_objset;
5899 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5900 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5901 vdev_t *rvd = spa->spa_root_vdev;
5902 vdev_t *vd;
5903 dmu_tx_t *tx;
5904 int error;
5905
5906 VERIFY(spa_writeable(spa));
5907