5882 Temporary pool names
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Approved by: Dan McDonald <danmcd@joyent.com>
--- old/usr/src/uts/common/fs/zfs/spa.c
+++ new/usr/src/uts/common/fs/zfs/spa.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 25 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
26 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27 27 * Copyright 2013 Saso Kiselkov. All rights reserved.
28 28 * Copyright (c) 2014 Integros [integros.com]
29 29 * Copyright 2016 Toomas Soome <tsoome@me.com>
30 30 * Copyright 2018 Joyent, Inc.
31 31 * Copyright (c) 2017 Datto Inc.
32 32 * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
33 33 */
34 34
35 35 /*
36 36 * SPA: Storage Pool Allocator
37 37 *
38 38 * This file contains all the routines used when modifying on-disk SPA state.
39 39 * This includes opening, importing, destroying, exporting a pool, and syncing a
40 40 * pool.
41 41 */
42 42
43 43 #include <sys/zfs_context.h>
44 44 #include <sys/fm/fs/zfs.h>
45 45 #include <sys/spa_impl.h>
46 46 #include <sys/zio.h>
47 47 #include <sys/zio_checksum.h>
48 48 #include <sys/dmu.h>
49 49 #include <sys/dmu_tx.h>
50 50 #include <sys/zap.h>
51 51 #include <sys/zil.h>
52 52 #include <sys/ddt.h>
53 53 #include <sys/vdev_impl.h>
54 54 #include <sys/vdev_removal.h>
55 55 #include <sys/vdev_indirect_mapping.h>
56 56 #include <sys/vdev_indirect_births.h>
57 57 #include <sys/vdev_initialize.h>
58 58 #include <sys/metaslab.h>
59 59 #include <sys/metaslab_impl.h>
60 60 #include <sys/uberblock_impl.h>
61 61 #include <sys/txg.h>
62 62 #include <sys/avl.h>
63 63 #include <sys/bpobj.h>
64 64 #include <sys/dmu_traverse.h>
65 65 #include <sys/dmu_objset.h>
66 66 #include <sys/unique.h>
67 67 #include <sys/dsl_pool.h>
68 68 #include <sys/dsl_dataset.h>
69 69 #include <sys/dsl_dir.h>
70 70 #include <sys/dsl_prop.h>
71 71 #include <sys/dsl_synctask.h>
72 72 #include <sys/fs/zfs.h>
73 73 #include <sys/arc.h>
74 74 #include <sys/callb.h>
75 75 #include <sys/systeminfo.h>
76 76 #include <sys/spa_boot.h>
77 77 #include <sys/zfs_ioctl.h>
78 78 #include <sys/dsl_scan.h>
79 79 #include <sys/zfeature.h>
80 80 #include <sys/dsl_destroy.h>
81 81 #include <sys/abd.h>
82 82
83 83 #ifdef _KERNEL
84 84 #include <sys/bootprops.h>
85 85 #include <sys/callb.h>
86 86 #include <sys/cpupart.h>
87 87 #include <sys/pool.h>
88 88 #include <sys/sysdc.h>
89 89 #include <sys/zone.h>
90 90 #endif /* _KERNEL */
91 91
92 92 #include "zfs_prop.h"
93 93 #include "zfs_comutil.h"
94 94
95 95 /*
96 96 * The interval, in seconds, at which failed configuration cache file writes
97 97 * should be retried.
98 98 */
99 99 int zfs_ccw_retry_interval = 300;
100 100
101 101 typedef enum zti_modes {
102 102 ZTI_MODE_FIXED, /* value is # of threads (min 1) */
103 103 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
104 104 ZTI_MODE_NULL, /* don't create a taskq */
105 105 ZTI_NMODES
106 106 } zti_modes_t;
107 107
108 108 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
109 109 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
110 110 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
111 111
112 112 #define ZTI_N(n) ZTI_P(n, 1)
113 113 #define ZTI_ONE ZTI_N(1)
114 114
115 115 typedef struct zio_taskq_info {
116 116 zti_modes_t zti_mode;
117 117 uint_t zti_value;
118 118 uint_t zti_count;
119 119 } zio_taskq_info_t;
120 120
121 121 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
122 122 "issue", "issue_high", "intr", "intr_high"
123 123 };
124 124
125 125 /*
126 126 * This table defines the taskq settings for each ZFS I/O type. When
127 127 * initializing a pool, we use this table to create an appropriately sized
128 128 * taskq. Some operations are low volume and therefore have a small, static
129 129 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
130 130 * macros. Other operations process a large amount of data; the ZTI_BATCH
131 131 * macro causes us to create a taskq oriented for throughput. Some operations
132 132  * are so high frequency and short-lived that the taskq itself can become a
133 133 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
134 134 * additional degree of parallelism specified by the number of threads per-
135 135 * taskq and the number of taskqs; when dispatching an event in this case, the
136 136 * particular taskq is chosen at random.
137 137 *
138 138 * The different taskq priorities are to handle the different contexts (issue
139 139 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
140 140 * need to be handled with minimum delay.
141 141 */
142 142 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
143 143 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
144 144 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
145 145 { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
146 146 { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
147 147 { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
148 148 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
149 149 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
150 150 };
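
Reviewer note: a minimal illustration of how the ZTI_* macros above expand, derived only from the definitions earlier in this hunk; the variable names below are placeholders, and the values come from the READ, WRITE, and NULL rows of the table.

/* ZTI_P(12, 8) -> { ZTI_MODE_FIXED, 12, 8 }: 8 taskqs of 12 threads each */
const zio_taskq_info_t read_intr = ZTI_P(12, 8);

/* ZTI_BATCH -> { ZTI_MODE_BATCH, 0, 1 }: one throughput-oriented taskq */
const zio_taskq_info_t write_issue = ZTI_BATCH;

/* ZTI_ONE -> ZTI_N(1) -> { ZTI_MODE_FIXED, 1, 1 }: one single-threaded taskq */
const zio_taskq_info_t null_issue = ZTI_ONE;
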
151 151
152 152 static void spa_sync_version(void *arg, dmu_tx_t *tx);
153 153 static void spa_sync_props(void *arg, dmu_tx_t *tx);
154 154 static boolean_t spa_has_active_shared_spare(spa_t *spa);
155 155 static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
156 156 static void spa_vdev_resilver_done(spa_t *spa);
157 157
158 158 uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
159 159 id_t zio_taskq_psrset_bind = PS_NONE;
160 160 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
161 161 uint_t zio_taskq_basedc = 80; /* base duty cycle */
162 162
163 163 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
164 164 extern int zfs_sync_pass_deferred_free;
165 165
166 166 /*
167 167 * Report any spa_load_verify errors found, but do not fail spa_load.
168 168 * This is used by zdb to analyze non-idle pools.
169 169 */
170 170 boolean_t spa_load_verify_dryrun = B_FALSE;
171 171
172 172 /*
173 173 * This (illegal) pool name is used when temporarily importing a spa_t in order
174 174 * to get the vdev stats associated with the imported devices.
175 175 */
176 176 #define TRYIMPORT_NAME "$import"
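
Reviewer note: a brief sketch of how this name is used; the literal string comparison is the one that appears in spa_activate() later in this hunk, and the body of the branch is only an illustration.

	if (strcmp(spa->spa_name, TRYIMPORT_NAME) == 0) {
		/*
		 * Temporary import: this spa_t exists only to collect vdev
		 * stats, so long-lived setup such as the covering process
		 * is skipped.
		 */
	}
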
177 177
178 178 /*
179 179 * For debugging purposes: print out vdev tree during pool import.
180 180 */
181 181 boolean_t spa_load_print_vdev_tree = B_FALSE;
182 182
183 183 /*
184 184 * A non-zero value for zfs_max_missing_tvds means that we allow importing
185 185 * pools with missing top-level vdevs. This is strictly intended for advanced
186 186 * pool recovery cases since missing data is almost inevitable. Pools with
187 187 * missing devices can only be imported read-only for safety reasons, and their
188 188 * fail-mode will be automatically set to "continue".
189 189 *
190 190 * With 1 missing vdev we should be able to import the pool and mount all
191 191 * datasets. User data that was not modified after the missing device has been
192 192 * added should be recoverable. This means that snapshots created prior to the
193 193 * addition of that device should be completely intact.
194 194 *
195 195 * With 2 missing vdevs, some datasets may fail to mount since there are
196 196 * dataset statistics that are stored as regular metadata. Some data might be
197 197 * recoverable if those vdevs were added recently.
198 198 *
199 199 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
200 200 * may be missing entirely. Chances of data recovery are very low. Note that
201 201 * there are also risks of performing an inadvertent rewind as we might be
202 202 * missing all the vdevs with the latest uberblocks.
203 203 */
204 204 uint64_t zfs_max_missing_tvds = 0;
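
Reviewer note: the code that consumes this tunable lives in the spa_load path outside this hunk; the sketch below only illustrates the threshold semantics described in the comment, and missing_tvds as well as the ENXIO choice are assumptions, not the actual implementation.

	if (missing_tvds > zfs_max_missing_tvds)
		return (SET_ERROR(ENXIO));	/* refuse the import */
	/* otherwise the import proceeds read-only with failmode=continue */
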
205 205
206 206 /*
207 207 * The parameters below are similar to zfs_max_missing_tvds but are only
208 208 * intended for a preliminary open of the pool with an untrusted config which
209 209 * might be incomplete or out-dated.
210 210 *
211 211 * We are more tolerant for pools opened from a cachefile since we could have
212 212 * an out-dated cachefile where a device removal was not registered.
213 213 * We could have set the limit arbitrarily high but in the case where devices
214 214 * are really missing we would want to return the proper error codes; we chose
215 215 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
216 216 * and we get a chance to retrieve the trusted config.
217 217 */
218 218 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
219 219
220 220 /*
221 221 * In the case where config was assembled by scanning device paths (/dev/dsks
222 222 * by default) we are less tolerant since all the existing devices should have
223 223 * been detected and we want spa_load to return the right error codes.
224 224 */
225 225 uint64_t zfs_max_missing_tvds_scan = 0;
226 226
227 227 /*
228 228 * Debugging aid that pauses spa_sync() towards the end.
229 229 */
230 230 boolean_t zfs_pause_spa_sync = B_FALSE;
231 231
232 232 /*
233 233 * ==========================================================================
234 234 * SPA properties routines
235 235 * ==========================================================================
236 236 */
237 237
238 238 /*
239 239 * Add a (source=src, propname=propval) list to an nvlist.
240 240 */
241 241 static void
242 242 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
243 243 uint64_t intval, zprop_source_t src)
244 244 {
245 245 const char *propname = zpool_prop_to_name(prop);
246 246 nvlist_t *propval;
247 247
248 248 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
249 249 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
250 250
251 251 if (strval != NULL)
252 252 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
253 253 else
254 254 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
255 255
256 256 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
257 257 nvlist_free(propval);
258 258 }
259 259
260 260 /*
261 261 * Get property values from the spa configuration.
262 262 */
263 263 static void
264 264 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
265 265 {
266 266 vdev_t *rvd = spa->spa_root_vdev;
267 267 dsl_pool_t *pool = spa->spa_dsl_pool;
268 268 uint64_t size, alloc, cap, version;
269 269 zprop_source_t src = ZPROP_SRC_NONE;
270 270 spa_config_dirent_t *dp;
271 271 metaslab_class_t *mc = spa_normal_class(spa);
272 272
273 273 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
274 274
275 275 if (rvd != NULL) {
276 276 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
277 277 size = metaslab_class_get_space(spa_normal_class(spa));
278 278 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
279 279 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
280 280 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
281 281 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
282 282 size - alloc, src);
283 283 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
284 284 spa->spa_checkpoint_info.sci_dspace, src);
285 285
286 286 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
287 287 metaslab_class_fragmentation(mc), src);
288 288 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
289 289 metaslab_class_expandable_space(mc), src);
290 290 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
291 291 (spa_mode(spa) == FREAD), src);
292 292
293 293 cap = (size == 0) ? 0 : (alloc * 100 / size);
294 294 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
295 295
296 296 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
297 297 ddt_get_pool_dedup_ratio(spa), src);
298 298
299 299 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
300 300 rvd->vdev_state, src);
301 301
302 302 version = spa_version(spa);
303 303 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
304 304 src = ZPROP_SRC_DEFAULT;
305 305 else
306 306 src = ZPROP_SRC_LOCAL;
307 307 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
308 308 }
309 309
310 310 if (pool != NULL) {
311 311 /*
312 312 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
313 313 		 * when opening pools before this version, freedir will be NULL.
314 314 */
315 315 if (pool->dp_free_dir != NULL) {
316 316 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
317 317 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
318 318 src);
319 319 } else {
320 320 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
321 321 NULL, 0, src);
322 322 }
323 323
324 324 if (pool->dp_leak_dir != NULL) {
325 325 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
326 326 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
327 327 src);
328 328 } else {
329 329 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
330 330 NULL, 0, src);
331 331 }
332 332 }
333 333
334 334 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
335 335
336 336 if (spa->spa_comment != NULL) {
337 337 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
338 338 0, ZPROP_SRC_LOCAL);
339 339 }
340 340
341 341 if (spa->spa_root != NULL)
342 342 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
343 343 0, ZPROP_SRC_LOCAL);
344 344
345 345 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
346 346 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
347 347 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
348 348 } else {
349 349 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
350 350 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
351 351 }
352 352
353 353 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
354 354 if (dp->scd_path == NULL) {
355 355 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
356 356 "none", 0, ZPROP_SRC_LOCAL);
357 357 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
358 358 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
359 359 dp->scd_path, 0, ZPROP_SRC_LOCAL);
360 360 }
361 361 }
362 362 }
363 363
364 364 /*
365 365 * Get zpool property values.
366 366 */
367 367 int
368 368 spa_prop_get(spa_t *spa, nvlist_t **nvp)
369 369 {
370 370 objset_t *mos = spa->spa_meta_objset;
371 371 zap_cursor_t zc;
372 372 zap_attribute_t za;
373 373 int err;
374 374
375 375 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
376 376
377 377 mutex_enter(&spa->spa_props_lock);
378 378
379 379 /*
380 380 * Get properties from the spa config.
381 381 */
382 382 spa_prop_get_config(spa, nvp);
383 383
384 384 	/* If no pool property object, there are no more props to get. */
385 385 if (mos == NULL || spa->spa_pool_props_object == 0) {
386 386 mutex_exit(&spa->spa_props_lock);
387 387 return (0);
388 388 }
389 389
390 390 /*
391 391 * Get properties from the MOS pool property object.
392 392 */
393 393 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
394 394 (err = zap_cursor_retrieve(&zc, &za)) == 0;
395 395 zap_cursor_advance(&zc)) {
396 396 uint64_t intval = 0;
397 397 char *strval = NULL;
398 398 zprop_source_t src = ZPROP_SRC_DEFAULT;
399 399 zpool_prop_t prop;
400 400
401 401 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
402 402 continue;
403 403
404 404 switch (za.za_integer_length) {
405 405 case 8:
406 406 /* integer property */
407 407 if (za.za_first_integer !=
408 408 zpool_prop_default_numeric(prop))
409 409 src = ZPROP_SRC_LOCAL;
410 410
411 411 if (prop == ZPOOL_PROP_BOOTFS) {
412 412 dsl_pool_t *dp;
413 413 dsl_dataset_t *ds = NULL;
414 414
415 415 dp = spa_get_dsl(spa);
416 416 dsl_pool_config_enter(dp, FTAG);
417 417 err = dsl_dataset_hold_obj(dp,
418 418 za.za_first_integer, FTAG, &ds);
419 419 if (err != 0) {
420 420 dsl_pool_config_exit(dp, FTAG);
421 421 break;
422 422 }
423 423
424 424 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
425 425 KM_SLEEP);
426 426 dsl_dataset_name(ds, strval);
427 427 dsl_dataset_rele(ds, FTAG);
428 428 dsl_pool_config_exit(dp, FTAG);
429 429 } else {
430 430 strval = NULL;
431 431 intval = za.za_first_integer;
432 432 }
433 433
434 434 spa_prop_add_list(*nvp, prop, strval, intval, src);
435 435
436 436 if (strval != NULL)
437 437 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
438 438
439 439 break;
440 440
441 441 case 1:
442 442 /* string property */
443 443 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
444 444 err = zap_lookup(mos, spa->spa_pool_props_object,
445 445 za.za_name, 1, za.za_num_integers, strval);
446 446 if (err) {
447 447 kmem_free(strval, za.za_num_integers);
448 448 break;
449 449 }
450 450 spa_prop_add_list(*nvp, prop, strval, 0, src);
451 451 kmem_free(strval, za.za_num_integers);
452 452 break;
453 453
454 454 default:
455 455 break;
456 456 }
457 457 }
458 458 zap_cursor_fini(&zc);
459 459 mutex_exit(&spa->spa_props_lock);
460 460 out:
461 461 if (err && err != ENOENT) {
462 462 nvlist_free(*nvp);
463 463 *nvp = NULL;
464 464 return (err);
465 465 }
466 466
467 467 return (0);
468 468 }
469 469
470 470 /*
471 471 * Validate the given pool properties nvlist and modify the list
472 472 * for the property values to be set.
473 473 */
474 474 static int
475 475 spa_prop_validate(spa_t *spa, nvlist_t *props)
476 476 {
477 477 nvpair_t *elem;
478 478 int error = 0, reset_bootfs = 0;
479 479 uint64_t objnum = 0;
480 480 boolean_t has_feature = B_FALSE;
481 481
482 482 elem = NULL;
483 483 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
484 484 uint64_t intval;
485 485 char *strval, *slash, *check, *fname;
486 486 const char *propname = nvpair_name(elem);
487 487 zpool_prop_t prop = zpool_name_to_prop(propname);
488 488
489 489 switch (prop) {
490 490 case ZPOOL_PROP_INVAL:
491 491 if (!zpool_prop_feature(propname)) {
492 492 error = SET_ERROR(EINVAL);
493 493 break;
494 494 }
495 495
496 496 /*
497 497 * Sanitize the input.
498 498 */
499 499 if (nvpair_type(elem) != DATA_TYPE_UINT64) {
500 500 error = SET_ERROR(EINVAL);
501 501 break;
502 502 }
503 503
504 504 if (nvpair_value_uint64(elem, &intval) != 0) {
505 505 error = SET_ERROR(EINVAL);
506 506 break;
507 507 }
508 508
509 509 if (intval != 0) {
510 510 error = SET_ERROR(EINVAL);
511 511 break;
512 512 }
513 513
514 514 fname = strchr(propname, '@') + 1;
515 515 if (zfeature_lookup_name(fname, NULL) != 0) {
516 516 error = SET_ERROR(EINVAL);
517 517 break;
518 518 }
519 519
520 520 has_feature = B_TRUE;
521 521 break;
522 522
523 523 case ZPOOL_PROP_VERSION:
524 524 error = nvpair_value_uint64(elem, &intval);
525 525 if (!error &&
526 526 (intval < spa_version(spa) ||
527 527 intval > SPA_VERSION_BEFORE_FEATURES ||
528 528 has_feature))
529 529 error = SET_ERROR(EINVAL);
530 530 break;
531 531
532 532 case ZPOOL_PROP_DELEGATION:
533 533 case ZPOOL_PROP_AUTOREPLACE:
534 534 case ZPOOL_PROP_LISTSNAPS:
535 535 case ZPOOL_PROP_AUTOEXPAND:
536 536 error = nvpair_value_uint64(elem, &intval);
537 537 if (!error && intval > 1)
538 538 error = SET_ERROR(EINVAL);
539 539 break;
540 540
541 541 case ZPOOL_PROP_BOOTFS:
542 542 /*
543 543 * If the pool version is less than SPA_VERSION_BOOTFS,
544 544 * or the pool is still being created (version == 0),
545 545 * the bootfs property cannot be set.
546 546 */
547 547 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
548 548 error = SET_ERROR(ENOTSUP);
549 549 break;
550 550 }
551 551
552 552 /*
553 553 * Make sure the vdev config is bootable
554 554 */
555 555 if (!vdev_is_bootable(spa->spa_root_vdev)) {
556 556 error = SET_ERROR(ENOTSUP);
557 557 break;
558 558 }
559 559
560 560 reset_bootfs = 1;
561 561
562 562 error = nvpair_value_string(elem, &strval);
563 563
564 564 if (!error) {
565 565 objset_t *os;
566 566 uint64_t propval;
567 567
568 568 if (strval == NULL || strval[0] == '\0') {
569 569 objnum = zpool_prop_default_numeric(
570 570 ZPOOL_PROP_BOOTFS);
571 571 break;
572 572 }
573 573
574 574 error = dmu_objset_hold(strval, FTAG, &os);
575 575 if (error != 0)
576 576 break;
577 577
578 578 /*
579 579 * Must be ZPL, and its property settings
580 580 * must be supported by GRUB (compression
581 581 * is not gzip, and large blocks are not used).
582 582 */
583 583
584 584 if (dmu_objset_type(os) != DMU_OST_ZFS) {
585 585 error = SET_ERROR(ENOTSUP);
586 586 } else if ((error =
587 587 dsl_prop_get_int_ds(dmu_objset_ds(os),
588 588 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
589 589 &propval)) == 0 &&
590 590 !BOOTFS_COMPRESS_VALID(propval)) {
591 591 error = SET_ERROR(ENOTSUP);
592 592 } else {
593 593 objnum = dmu_objset_id(os);
594 594 }
595 595 dmu_objset_rele(os, FTAG);
596 596 }
597 597 break;
598 598
599 599 case ZPOOL_PROP_FAILUREMODE:
600 600 error = nvpair_value_uint64(elem, &intval);
601 601 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
602 602 intval > ZIO_FAILURE_MODE_PANIC))
603 603 error = SET_ERROR(EINVAL);
604 604
605 605 /*
606 606 * This is a special case which only occurs when
607 607 * the pool has completely failed. This allows
608 608 * the user to change the in-core failmode property
609 609 * without syncing it out to disk (I/Os might
610 610 * currently be blocked). We do this by returning
611 611 * EIO to the caller (spa_prop_set) to trick it
612 612 * into thinking we encountered a property validation
613 613 * error.
614 614 */
615 615 if (!error && spa_suspended(spa)) {
616 616 spa->spa_failmode = intval;
617 617 error = SET_ERROR(EIO);
618 618 }
619 619 break;
620 620
621 621 case ZPOOL_PROP_CACHEFILE:
622 622 if ((error = nvpair_value_string(elem, &strval)) != 0)
623 623 break;
624 624
625 625 if (strval[0] == '\0')
626 626 break;
627 627
628 628 if (strcmp(strval, "none") == 0)
629 629 break;
630 630
631 631 if (strval[0] != '/') {
632 632 error = SET_ERROR(EINVAL);
633 633 break;
634 634 }
635 635
636 636 slash = strrchr(strval, '/');
637 637 ASSERT(slash != NULL);
638 638
639 639 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
640 640 strcmp(slash, "/..") == 0)
641 641 error = SET_ERROR(EINVAL);
642 642 break;
643 643
644 644 case ZPOOL_PROP_COMMENT:
645 645 if ((error = nvpair_value_string(elem, &strval)) != 0)
646 646 break;
647 647 for (check = strval; *check != '\0'; check++) {
648 648 /*
649 649 * The kernel doesn't have an easy isprint()
650 650 * check. For this kernel check, we merely
651 651 * check ASCII apart from DEL. Fix this if
652 652 * there is an easy-to-use kernel isprint().
653 653 */
654 654 if (*check >= 0x7f) {
655 655 error = SET_ERROR(EINVAL);
656 656 break;
657 657 }
658 658 }
659 659 if (strlen(strval) > ZPROP_MAX_COMMENT)
660 660 error = E2BIG;
661 661 break;
662 662
663 663 case ZPOOL_PROP_DEDUPDITTO:
664 664 if (spa_version(spa) < SPA_VERSION_DEDUP)
665 665 error = SET_ERROR(ENOTSUP);
666 666 else
667 667 error = nvpair_value_uint64(elem, &intval);
668 668 if (error == 0 &&
669 669 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
670 670 error = SET_ERROR(EINVAL);
671 671 break;
672 672 }
673 673
674 674 if (error)
675 675 break;
676 676 }
677 677
678 678 if (!error && reset_bootfs) {
679 679 error = nvlist_remove(props,
680 680 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
681 681
682 682 if (!error) {
683 683 error = nvlist_add_uint64(props,
684 684 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
685 685 }
686 686 }
687 687
688 688 return (error);
689 689 }
690 690
691 691 void
692 692 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
693 693 {
694 694 char *cachefile;
695 695 spa_config_dirent_t *dp;
696 696
697 697 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
698 698 &cachefile) != 0)
699 699 return;
700 700
701 701 dp = kmem_alloc(sizeof (spa_config_dirent_t),
702 702 KM_SLEEP);
703 703
704 704 if (cachefile[0] == '\0')
705 705 dp->scd_path = spa_strdup(spa_config_path);
706 706 else if (strcmp(cachefile, "none") == 0)
707 707 dp->scd_path = NULL;
708 708 else
709 709 dp->scd_path = spa_strdup(cachefile);
710 710
711 711 list_insert_head(&spa->spa_config_list, dp);
712 712 if (need_sync)
713 713 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
714 714 }
715 715
716 716 int
717 717 spa_prop_set(spa_t *spa, nvlist_t *nvp)
718 718 {
719 719 int error;
720 720 nvpair_t *elem = NULL;
721 721 boolean_t need_sync = B_FALSE;
722 722
723 723 if ((error = spa_prop_validate(spa, nvp)) != 0)
724 724 return (error);
725 725
726 726 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
727 727 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
728 728
729 729 if (prop == ZPOOL_PROP_CACHEFILE ||
730 730 prop == ZPOOL_PROP_ALTROOT ||
731 731 prop == ZPOOL_PROP_READONLY)
732 732 continue;
733 733
734 734 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
735 735 uint64_t ver;
736 736
737 737 if (prop == ZPOOL_PROP_VERSION) {
738 738 VERIFY(nvpair_value_uint64(elem, &ver) == 0);
739 739 } else {
740 740 ASSERT(zpool_prop_feature(nvpair_name(elem)));
741 741 ver = SPA_VERSION_FEATURES;
742 742 need_sync = B_TRUE;
743 743 }
744 744
745 745 /* Save time if the version is already set. */
746 746 if (ver == spa_version(spa))
747 747 continue;
748 748
749 749 /*
750 750 * In addition to the pool directory object, we might
751 751 * create the pool properties object, the features for
752 752 * read object, the features for write object, or the
753 753 * feature descriptions object.
754 754 */
755 755 error = dsl_sync_task(spa->spa_name, NULL,
756 756 spa_sync_version, &ver,
757 757 6, ZFS_SPACE_CHECK_RESERVED);
758 758 if (error)
759 759 return (error);
760 760 continue;
761 761 }
762 762
763 763 need_sync = B_TRUE;
764 764 break;
765 765 }
766 766
767 767 if (need_sync) {
768 768 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
769 769 nvp, 6, ZFS_SPACE_CHECK_RESERVED));
770 770 }
771 771
772 772 return (0);
773 773 }
774 774
775 775 /*
776 776 * If the bootfs property value is dsobj, clear it.
777 777 */
778 778 void
779 779 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
780 780 {
781 781 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
782 782 VERIFY(zap_remove(spa->spa_meta_objset,
783 783 spa->spa_pool_props_object,
784 784 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
785 785 spa->spa_bootfs = 0;
786 786 }
787 787 }
788 788
789 789 /*ARGSUSED*/
790 790 static int
791 791 spa_change_guid_check(void *arg, dmu_tx_t *tx)
792 792 {
793 793 uint64_t *newguid = arg;
794 794 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
795 795 vdev_t *rvd = spa->spa_root_vdev;
796 796 uint64_t vdev_state;
797 797
798 798 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
799 799 int error = (spa_has_checkpoint(spa)) ?
800 800 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
801 801 return (SET_ERROR(error));
802 802 }
803 803
804 804 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
805 805 vdev_state = rvd->vdev_state;
806 806 spa_config_exit(spa, SCL_STATE, FTAG);
807 807
808 808 if (vdev_state != VDEV_STATE_HEALTHY)
809 809 return (SET_ERROR(ENXIO));
810 810
811 811 ASSERT3U(spa_guid(spa), !=, *newguid);
812 812
813 813 return (0);
814 814 }
815 815
816 816 static void
817 817 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
818 818 {
819 819 uint64_t *newguid = arg;
820 820 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
821 821 uint64_t oldguid;
822 822 vdev_t *rvd = spa->spa_root_vdev;
823 823
824 824 oldguid = spa_guid(spa);
825 825
826 826 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
827 827 rvd->vdev_guid = *newguid;
828 828 rvd->vdev_guid_sum += (*newguid - oldguid);
829 829 vdev_config_dirty(rvd);
830 830 spa_config_exit(spa, SCL_STATE, FTAG);
831 831
832 832 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
833 833 oldguid, *newguid);
834 834 }
835 835
836 836 /*
837 837 * Change the GUID for the pool. This is done so that we can later
838 838 * re-import a pool built from a clone of our own vdevs. We will modify
839 839 * the root vdev's guid, our own pool guid, and then mark all of our
840 840 * vdevs dirty. Note that we must make sure that all our vdevs are
841 841 * online when we do this, or else any vdevs that weren't present
842 842 * would be orphaned from our pool. We are also going to issue a
843 843 * sysevent to update any watchers.
844 844 */
845 845 int
846 846 spa_change_guid(spa_t *spa)
847 847 {
848 848 int error;
849 849 uint64_t guid;
850 850
851 851 mutex_enter(&spa->spa_vdev_top_lock);
852 852 mutex_enter(&spa_namespace_lock);
853 853 guid = spa_generate_guid(NULL);
854 854
855 855 error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
856 856 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
857 857
858 858 if (error == 0) {
859 859 spa_write_cachefile(spa, B_FALSE, B_TRUE);
860 860 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
861 861 }
862 862
863 863 mutex_exit(&spa_namespace_lock);
864 864 mutex_exit(&spa->spa_vdev_top_lock);
865 865
866 866 return (error);
867 867 }
868 868
869 869 /*
870 870 * ==========================================================================
871 871 * SPA state manipulation (open/create/destroy/import/export)
872 872 * ==========================================================================
873 873 */
874 874
875 875 static int
876 876 spa_error_entry_compare(const void *a, const void *b)
877 877 {
878 878 spa_error_entry_t *sa = (spa_error_entry_t *)a;
879 879 spa_error_entry_t *sb = (spa_error_entry_t *)b;
880 880 int ret;
881 881
882 882 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
883 883 sizeof (zbookmark_phys_t));
884 884
885 885 if (ret < 0)
886 886 return (-1);
887 887 else if (ret > 0)
888 888 return (1);
889 889 else
890 890 return (0);
891 891 }
892 892
893 893 /*
894 894 * Utility function which retrieves copies of the current logs and
895 895 * re-initializes them in the process.
896 896 */
897 897 void
898 898 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
899 899 {
900 900 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
901 901
902 902 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
903 903 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
904 904
905 905 avl_create(&spa->spa_errlist_scrub,
906 906 spa_error_entry_compare, sizeof (spa_error_entry_t),
907 907 offsetof(spa_error_entry_t, se_avl));
908 908 avl_create(&spa->spa_errlist_last,
909 909 spa_error_entry_compare, sizeof (spa_error_entry_t),
910 910 offsetof(spa_error_entry_t, se_avl));
911 911 }
912 912
913 913 static void
914 914 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
915 915 {
916 916 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
917 917 enum zti_modes mode = ztip->zti_mode;
918 918 uint_t value = ztip->zti_value;
919 919 uint_t count = ztip->zti_count;
920 920 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
921 921 char name[32];
922 922 uint_t flags = 0;
923 923 boolean_t batch = B_FALSE;
924 924
925 925 if (mode == ZTI_MODE_NULL) {
926 926 tqs->stqs_count = 0;
927 927 tqs->stqs_taskq = NULL;
928 928 return;
929 929 }
930 930
931 931 ASSERT3U(count, >, 0);
932 932
933 933 tqs->stqs_count = count;
934 934 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
935 935
936 936 switch (mode) {
937 937 case ZTI_MODE_FIXED:
938 938 ASSERT3U(value, >=, 1);
939 939 value = MAX(value, 1);
940 940 break;
941 941
942 942 case ZTI_MODE_BATCH:
943 943 batch = B_TRUE;
944 944 flags |= TASKQ_THREADS_CPU_PCT;
945 945 value = zio_taskq_batch_pct;
946 946 break;
947 947
948 948 default:
949 949 panic("unrecognized mode for %s_%s taskq (%u:%u) in "
950 950 "spa_activate()",
951 951 zio_type_name[t], zio_taskq_types[q], mode, value);
952 952 break;
953 953 }
954 954
955 955 for (uint_t i = 0; i < count; i++) {
956 956 taskq_t *tq;
957 957
958 958 if (count > 1) {
959 959 (void) snprintf(name, sizeof (name), "%s_%s_%u",
960 960 zio_type_name[t], zio_taskq_types[q], i);
961 961 } else {
962 962 (void) snprintf(name, sizeof (name), "%s_%s",
963 963 zio_type_name[t], zio_taskq_types[q]);
964 964 }
965 965
966 966 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
967 967 if (batch)
968 968 flags |= TASKQ_DC_BATCH;
969 969
970 970 tq = taskq_create_sysdc(name, value, 50, INT_MAX,
971 971 spa->spa_proc, zio_taskq_basedc, flags);
972 972 } else {
973 973 pri_t pri = maxclsyspri;
974 974 /*
975 975 * The write issue taskq can be extremely CPU
976 976 * intensive. Run it at slightly lower priority
977 977 * than the other taskqs.
978 978 */
979 979 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
980 980 pri--;
981 981
982 982 tq = taskq_create_proc(name, value, pri, 50,
983 983 INT_MAX, spa->spa_proc, flags);
984 984 }
985 985
986 986 tqs->stqs_taskq[i] = tq;
987 987 }
988 988 }
989 989
990 990 static void
991 991 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
992 992 {
993 993 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
994 994
995 995 if (tqs->stqs_taskq == NULL) {
996 996 ASSERT0(tqs->stqs_count);
997 997 return;
998 998 }
999 999
1000 1000 for (uint_t i = 0; i < tqs->stqs_count; i++) {
1001 1001 ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1002 1002 taskq_destroy(tqs->stqs_taskq[i]);
1003 1003 }
1004 1004
1005 1005 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1006 1006 tqs->stqs_taskq = NULL;
1007 1007 }
1008 1008
1009 1009 /*
1010 1010 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1011 1011 * Note that a type may have multiple discrete taskqs to avoid lock contention
1012 1012  * on the taskq itself. In that case we choose which taskq to use at random
1013 1013  * by using the low bits of gethrtime().
1014 1014 */
1015 1015 void
1016 1016 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1017 1017 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1018 1018 {
1019 1019 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1020 1020 taskq_t *tq;
1021 1021
1022 1022 ASSERT3P(tqs->stqs_taskq, !=, NULL);
1023 1023 ASSERT3U(tqs->stqs_count, !=, 0);
1024 1024
1025 1025 if (tqs->stqs_count == 1) {
1026 1026 tq = tqs->stqs_taskq[0];
1027 1027 } else {
1028 1028 tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1029 1029 }
1030 1030
1031 1031 taskq_dispatch_ent(tq, func, arg, flags, ent);
1032 1032 }
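
Reviewer note: a usage sketch built from the signature above; my_func, my_arg, and my_ent are placeholders, while the type and queue constants are ones already used elsewhere in this hunk.

	taskq_ent_t my_ent;	/* typically embedded in the object being queued */

	spa_taskq_dispatch_ent(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
	    my_func, my_arg, TQ_SLEEP, &my_ent);
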
1033 1033
1034 1034 static void
1035 1035 spa_create_zio_taskqs(spa_t *spa)
1036 1036 {
1037 1037 for (int t = 0; t < ZIO_TYPES; t++) {
1038 1038 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1039 1039 spa_taskqs_init(spa, t, q);
1040 1040 }
1041 1041 }
1042 1042 }
1043 1043
1044 1044 #ifdef _KERNEL
1045 1045 static void
1046 1046 spa_thread(void *arg)
1047 1047 {
1048 1048 callb_cpr_t cprinfo;
1049 1049
1050 1050 spa_t *spa = arg;
1051 1051 user_t *pu = PTOU(curproc);
1052 1052
1053 1053 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1054 1054 spa->spa_name);
1055 1055
1056 1056 ASSERT(curproc != &p0);
1057 1057 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1058 1058 "zpool-%s", spa->spa_name);
1059 1059 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1060 1060
1061 1061 /* bind this thread to the requested psrset */
1062 1062 if (zio_taskq_psrset_bind != PS_NONE) {
1063 1063 pool_lock();
1064 1064 mutex_enter(&cpu_lock);
1065 1065 mutex_enter(&pidlock);
1066 1066 mutex_enter(&curproc->p_lock);
1067 1067
1068 1068 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1069 1069 0, NULL, NULL) == 0) {
1070 1070 curthread->t_bind_pset = zio_taskq_psrset_bind;
1071 1071 } else {
1072 1072 cmn_err(CE_WARN,
1073 1073 "Couldn't bind process for zfs pool \"%s\" to "
1074 1074 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1075 1075 }
1076 1076
1077 1077 mutex_exit(&curproc->p_lock);
1078 1078 mutex_exit(&pidlock);
1079 1079 mutex_exit(&cpu_lock);
1080 1080 pool_unlock();
1081 1081 }
1082 1082
1083 1083 if (zio_taskq_sysdc) {
1084 1084 sysdc_thread_enter(curthread, 100, 0);
1085 1085 }
1086 1086
1087 1087 spa->spa_proc = curproc;
1088 1088 spa->spa_did = curthread->t_did;
1089 1089
1090 1090 spa_create_zio_taskqs(spa);
1091 1091
1092 1092 mutex_enter(&spa->spa_proc_lock);
1093 1093 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1094 1094
1095 1095 spa->spa_proc_state = SPA_PROC_ACTIVE;
1096 1096 cv_broadcast(&spa->spa_proc_cv);
1097 1097
1098 1098 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1099 1099 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1100 1100 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1101 1101 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1102 1102
1103 1103 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1104 1104 spa->spa_proc_state = SPA_PROC_GONE;
1105 1105 spa->spa_proc = &p0;
1106 1106 cv_broadcast(&spa->spa_proc_cv);
1107 1107 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
1108 1108
1109 1109 mutex_enter(&curproc->p_lock);
1110 1110 lwp_exit();
1111 1111 }
1112 1112 #endif
1113 1113
1114 1114 /*
1115 1115 * Activate an uninitialized pool.
1116 1116 */
1117 1117 static void
1118 1118 spa_activate(spa_t *spa, int mode)
1119 1119 {
1120 1120 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1121 1121
1122 1122 spa->spa_state = POOL_STATE_ACTIVE;
1123 1123 spa->spa_mode = mode;
1124 1124
1125 1125 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1126 1126 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1127 1127
1128 1128 /* Try to create a covering process */
1129 1129 mutex_enter(&spa->spa_proc_lock);
1130 1130 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1131 1131 ASSERT(spa->spa_proc == &p0);
1132 1132 spa->spa_did = 0;
1133 1133
1134 1134 /* Only create a process if we're going to be around a while. */
1135 1135 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1136 1136 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1137 1137 NULL, 0) == 0) {
1138 1138 spa->spa_proc_state = SPA_PROC_CREATED;
1139 1139 while (spa->spa_proc_state == SPA_PROC_CREATED) {
1140 1140 cv_wait(&spa->spa_proc_cv,
1141 1141 &spa->spa_proc_lock);
1142 1142 }
1143 1143 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1144 1144 ASSERT(spa->spa_proc != &p0);
1145 1145 ASSERT(spa->spa_did != 0);
1146 1146 } else {
1147 1147 #ifdef _KERNEL
1148 1148 cmn_err(CE_WARN,
1149 1149 "Couldn't create process for zfs pool \"%s\"\n",
1150 1150 spa->spa_name);
1151 1151 #endif
1152 1152 }
1153 1153 }
1154 1154 mutex_exit(&spa->spa_proc_lock);
1155 1155
1156 1156 /* If we didn't create a process, we need to create our taskqs. */
1157 1157 if (spa->spa_proc == &p0) {
1158 1158 spa_create_zio_taskqs(spa);
1159 1159 }
1160 1160
1161 1161 for (size_t i = 0; i < TXG_SIZE; i++) {
1162 1162 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1163 1163 ZIO_FLAG_CANFAIL);
1164 1164 }
1165 1165
1166 1166 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1167 1167 offsetof(vdev_t, vdev_config_dirty_node));
1168 1168 list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1169 1169 offsetof(objset_t, os_evicting_node));
1170 1170 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1171 1171 offsetof(vdev_t, vdev_state_dirty_node));
1172 1172
1173 1173 txg_list_create(&spa->spa_vdev_txg_list, spa,
1174 1174 offsetof(struct vdev, vdev_txg_node));
1175 1175
1176 1176 avl_create(&spa->spa_errlist_scrub,
1177 1177 spa_error_entry_compare, sizeof (spa_error_entry_t),
1178 1178 offsetof(spa_error_entry_t, se_avl));
1179 1179 avl_create(&spa->spa_errlist_last,
1180 1180 spa_error_entry_compare, sizeof (spa_error_entry_t),
1181 1181 offsetof(spa_error_entry_t, se_avl));
1182 1182 }
1183 1183
1184 1184 /*
1185 1185 * Opposite of spa_activate().
1186 1186 */
1187 1187 static void
1188 1188 spa_deactivate(spa_t *spa)
1189 1189 {
1190 1190 ASSERT(spa->spa_sync_on == B_FALSE);
1191 1191 ASSERT(spa->spa_dsl_pool == NULL);
1192 1192 ASSERT(spa->spa_root_vdev == NULL);
1193 1193 ASSERT(spa->spa_async_zio_root == NULL);
1194 1194 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1195 1195
1196 1196 spa_evicting_os_wait(spa);
1197 1197
1198 1198 txg_list_destroy(&spa->spa_vdev_txg_list);
1199 1199
1200 1200 list_destroy(&spa->spa_config_dirty_list);
1201 1201 list_destroy(&spa->spa_evicting_os_list);
1202 1202 list_destroy(&spa->spa_state_dirty_list);
1203 1203
1204 1204 for (int t = 0; t < ZIO_TYPES; t++) {
1205 1205 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1206 1206 spa_taskqs_fini(spa, t, q);
1207 1207 }
1208 1208 }
1209 1209
1210 1210 for (size_t i = 0; i < TXG_SIZE; i++) {
1211 1211 ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1212 1212 VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1213 1213 spa->spa_txg_zio[i] = NULL;
1214 1214 }
1215 1215
1216 1216 metaslab_class_destroy(spa->spa_normal_class);
1217 1217 spa->spa_normal_class = NULL;
1218 1218
1219 1219 metaslab_class_destroy(spa->spa_log_class);
1220 1220 spa->spa_log_class = NULL;
1221 1221
1222 1222 /*
1223 1223 * If this was part of an import or the open otherwise failed, we may
1224 1224 * still have errors left in the queues. Empty them just in case.
1225 1225 */
1226 1226 spa_errlog_drain(spa);
1227 1227
1228 1228 avl_destroy(&spa->spa_errlist_scrub);
1229 1229 avl_destroy(&spa->spa_errlist_last);
1230 1230
1231 1231 spa->spa_state = POOL_STATE_UNINITIALIZED;
1232 1232
1233 1233 mutex_enter(&spa->spa_proc_lock);
1234 1234 if (spa->spa_proc_state != SPA_PROC_NONE) {
1235 1235 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1236 1236 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1237 1237 cv_broadcast(&spa->spa_proc_cv);
1238 1238 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1239 1239 ASSERT(spa->spa_proc != &p0);
1240 1240 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1241 1241 }
1242 1242 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1243 1243 spa->spa_proc_state = SPA_PROC_NONE;
1244 1244 }
1245 1245 ASSERT(spa->spa_proc == &p0);
1246 1246 mutex_exit(&spa->spa_proc_lock);
1247 1247
1248 1248 /*
1249 1249 * We want to make sure spa_thread() has actually exited the ZFS
1250 1250 * module, so that the module can't be unloaded out from underneath
1251 1251 * it.
1252 1252 */
1253 1253 if (spa->spa_did != 0) {
1254 1254 thread_join(spa->spa_did);
1255 1255 spa->spa_did = 0;
1256 1256 }
1257 1257 }
1258 1258
1259 1259 /*
1260 1260 * Verify a pool configuration, and construct the vdev tree appropriately. This
1261 1261 * will create all the necessary vdevs in the appropriate layout, with each vdev
1262 1262 * in the CLOSED state. This will prep the pool before open/creation/import.
1263 1263 * All vdev validation is done by the vdev_alloc() routine.
1264 1264 */
1265 1265 static int
1266 1266 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1267 1267 uint_t id, int atype)
1268 1268 {
1269 1269 nvlist_t **child;
1270 1270 uint_t children;
1271 1271 int error;
1272 1272
1273 1273 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1274 1274 return (error);
1275 1275
1276 1276 if ((*vdp)->vdev_ops->vdev_op_leaf)
1277 1277 return (0);
1278 1278
1279 1279 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1280 1280 &child, &children);
1281 1281
1282 1282 if (error == ENOENT)
1283 1283 return (0);
1284 1284
1285 1285 if (error) {
1286 1286 vdev_free(*vdp);
1287 1287 *vdp = NULL;
1288 1288 return (SET_ERROR(EINVAL));
1289 1289 }
1290 1290
1291 1291 for (int c = 0; c < children; c++) {
1292 1292 vdev_t *vd;
1293 1293 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1294 1294 atype)) != 0) {
1295 1295 vdev_free(*vdp);
1296 1296 *vdp = NULL;
1297 1297 return (error);
1298 1298 }
1299 1299 }
1300 1300
1301 1301 ASSERT(*vdp != NULL);
1302 1302
1303 1303 return (0);
1304 1304 }
1305 1305
1306 1306 /*
1307 1307 * Opposite of spa_load().
1308 1308 */
1309 1309 static void
1310 1310 spa_unload(spa_t *spa)
1311 1311 {
1312 1312 int i;
1313 1313
1314 1314 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1315 1315
1316 1316 spa_load_note(spa, "UNLOADING");
1317 1317
1318 1318 /*
1319 1319 * Stop async tasks.
1320 1320 */
1321 1321 spa_async_suspend(spa);
1322 1322
1323 1323 if (spa->spa_root_vdev) {
1324 1324 vdev_initialize_stop_all(spa->spa_root_vdev,
1325 1325 VDEV_INITIALIZE_ACTIVE);
1326 1326 }
1327 1327
1328 1328 /*
1329 1329 * Stop syncing.
1330 1330 */
1331 1331 if (spa->spa_sync_on) {
1332 1332 txg_sync_stop(spa->spa_dsl_pool);
1333 1333 spa->spa_sync_on = B_FALSE;
1334 1334 }
1335 1335
1336 1336 /*
1337 1337 * Even though vdev_free() also calls vdev_metaslab_fini, we need
1338 1338 * to call it earlier, before we wait for async i/o to complete.
1339 1339 * This ensures that there is no async metaslab prefetching, by
1340 1340 * calling taskq_wait(mg_taskq).
1341 1341 */
1342 1342 if (spa->spa_root_vdev != NULL) {
1343 1343 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1344 1344 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
1345 1345 vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
1346 1346 spa_config_exit(spa, SCL_ALL, spa);
1347 1347 }
1348 1348
1349 1349 /*
1350 1350 * Wait for any outstanding async I/O to complete.
1351 1351 */
1352 1352 if (spa->spa_async_zio_root != NULL) {
1353 1353 for (int i = 0; i < max_ncpus; i++)
1354 1354 (void) zio_wait(spa->spa_async_zio_root[i]);
1355 1355 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1356 1356 spa->spa_async_zio_root = NULL;
1357 1357 }
1358 1358
1359 1359 if (spa->spa_vdev_removal != NULL) {
1360 1360 spa_vdev_removal_destroy(spa->spa_vdev_removal);
1361 1361 spa->spa_vdev_removal = NULL;
1362 1362 }
1363 1363
1364 1364 if (spa->spa_condense_zthr != NULL) {
1365 1365 ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
1366 1366 zthr_destroy(spa->spa_condense_zthr);
1367 1367 spa->spa_condense_zthr = NULL;
1368 1368 }
1369 1369
1370 1370 if (spa->spa_checkpoint_discard_zthr != NULL) {
1371 1371 ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
1372 1372 zthr_destroy(spa->spa_checkpoint_discard_zthr);
1373 1373 spa->spa_checkpoint_discard_zthr = NULL;
1374 1374 }
1375 1375
1376 1376 spa_condense_fini(spa);
1377 1377
1378 1378 bpobj_close(&spa->spa_deferred_bpobj);
1379 1379
1380 1380 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1381 1381
1382 1382 /*
1383 1383 * Close all vdevs.
1384 1384 */
1385 1385 if (spa->spa_root_vdev)
1386 1386 vdev_free(spa->spa_root_vdev);
1387 1387 ASSERT(spa->spa_root_vdev == NULL);
1388 1388
1389 1389 /*
1390 1390 * Close the dsl pool.
1391 1391 */
1392 1392 if (spa->spa_dsl_pool) {
1393 1393 dsl_pool_close(spa->spa_dsl_pool);
1394 1394 spa->spa_dsl_pool = NULL;
1395 1395 spa->spa_meta_objset = NULL;
1396 1396 }
1397 1397
1398 1398 ddt_unload(spa);
1399 1399
1400 1400 /*
1401 1401 * Drop and purge level 2 cache
1402 1402 */
1403 1403 spa_l2cache_drop(spa);
1404 1404
1405 1405 for (i = 0; i < spa->spa_spares.sav_count; i++)
1406 1406 vdev_free(spa->spa_spares.sav_vdevs[i]);
1407 1407 if (spa->spa_spares.sav_vdevs) {
1408 1408 kmem_free(spa->spa_spares.sav_vdevs,
1409 1409 spa->spa_spares.sav_count * sizeof (void *));
1410 1410 spa->spa_spares.sav_vdevs = NULL;
1411 1411 }
1412 1412 if (spa->spa_spares.sav_config) {
1413 1413 nvlist_free(spa->spa_spares.sav_config);
1414 1414 spa->spa_spares.sav_config = NULL;
1415 1415 }
1416 1416 spa->spa_spares.sav_count = 0;
1417 1417
1418 1418 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1419 1419 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1420 1420 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1421 1421 }
1422 1422 if (spa->spa_l2cache.sav_vdevs) {
1423 1423 kmem_free(spa->spa_l2cache.sav_vdevs,
1424 1424 spa->spa_l2cache.sav_count * sizeof (void *));
1425 1425 spa->spa_l2cache.sav_vdevs = NULL;
1426 1426 }
1427 1427 if (spa->spa_l2cache.sav_config) {
1428 1428 nvlist_free(spa->spa_l2cache.sav_config);
1429 1429 spa->spa_l2cache.sav_config = NULL;
1430 1430 }
1431 1431 spa->spa_l2cache.sav_count = 0;
1432 1432
1433 1433 spa->spa_async_suspended = 0;
1434 1434
1435 1435 spa->spa_indirect_vdevs_loaded = B_FALSE;
1436 1436
1437 1437 if (spa->spa_comment != NULL) {
1438 1438 spa_strfree(spa->spa_comment);
1439 1439 spa->spa_comment = NULL;
1440 1440 }
1441 1441
1442 1442 spa_config_exit(spa, SCL_ALL, spa);
1443 1443 }
1444 1444
1445 1445 /*
1446 1446 * Load (or re-load) the current list of vdevs describing the active spares for
1447 1447 * this pool. When this is called, we have some form of basic information in
1448 1448 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1449 1449 * then re-generate a more complete list including status information.
1450 1450 */
1451 1451 void
1452 1452 spa_load_spares(spa_t *spa)
1453 1453 {
1454 1454 nvlist_t **spares;
1455 1455 uint_t nspares;
1456 1456 int i;
1457 1457 vdev_t *vd, *tvd;
1458 1458
1459 1459 #ifndef _KERNEL
1460 1460 /*
1461 1461 * zdb opens both the current state of the pool and the
1462 1462 * checkpointed state (if present), with a different spa_t.
1463 1463 *
1464 1464 * As spare vdevs are shared among open pools, we skip loading
1465 1465 * them when we load the checkpointed state of the pool.
1466 1466 */
1467 1467 if (!spa_writeable(spa))
1468 1468 return;
1469 1469 #endif
1470 1470
1471 1471 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1472 1472
1473 1473 /*
1474 1474 * First, close and free any existing spare vdevs.
1475 1475 */
1476 1476 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1477 1477 vd = spa->spa_spares.sav_vdevs[i];
1478 1478
1479 1479 /* Undo the call to spa_activate() below */
1480 1480 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1481 1481 B_FALSE)) != NULL && tvd->vdev_isspare)
1482 1482 spa_spare_remove(tvd);
1483 1483 vdev_close(vd);
1484 1484 vdev_free(vd);
1485 1485 }
1486 1486
1487 1487 if (spa->spa_spares.sav_vdevs)
1488 1488 kmem_free(spa->spa_spares.sav_vdevs,
1489 1489 spa->spa_spares.sav_count * sizeof (void *));
1490 1490
1491 1491 if (spa->spa_spares.sav_config == NULL)
1492 1492 nspares = 0;
1493 1493 else
1494 1494 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1495 1495 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1496 1496
1497 1497 spa->spa_spares.sav_count = (int)nspares;
1498 1498 spa->spa_spares.sav_vdevs = NULL;
1499 1499
1500 1500 if (nspares == 0)
1501 1501 return;
1502 1502
1503 1503 /*
1504 1504 * Construct the array of vdevs, opening them to get status in the
1505 1505 	 * process. For each spare, there are potentially two different vdev_t
1506 1506 * structures associated with it: one in the list of spares (used only
1507 1507 * for basic validation purposes) and one in the active vdev
1508 1508 * configuration (if it's spared in). During this phase we open and
1509 1509 * validate each vdev on the spare list. If the vdev also exists in the
1510 1510 * active configuration, then we also mark this vdev as an active spare.
1511 1511 */
1512 1512 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1513 1513 KM_SLEEP);
1514 1514 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1515 1515 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1516 1516 VDEV_ALLOC_SPARE) == 0);
1517 1517 ASSERT(vd != NULL);
1518 1518
1519 1519 spa->spa_spares.sav_vdevs[i] = vd;
1520 1520
1521 1521 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1522 1522 B_FALSE)) != NULL) {
1523 1523 if (!tvd->vdev_isspare)
1524 1524 spa_spare_add(tvd);
1525 1525
1526 1526 /*
1527 1527 * We only mark the spare active if we were successfully
1528 1528 * able to load the vdev. Otherwise, importing a pool
1529 1529 * with a bad active spare would result in strange
1530 1530 			 * behavior, because multiple pools would think the spare
1531 1531 * is actively in use.
1532 1532 *
1533 1533 * There is a vulnerability here to an equally bizarre
1534 1534 * circumstance, where a dead active spare is later
1535 1535 * brought back to life (onlined or otherwise). Given
1536 1536 * the rarity of this scenario, and the extra complexity
1537 1537 * it adds, we ignore the possibility.
1538 1538 */
1539 1539 if (!vdev_is_dead(tvd))
1540 1540 spa_spare_activate(tvd);
1541 1541 }
1542 1542
1543 1543 vd->vdev_top = vd;
1544 1544 vd->vdev_aux = &spa->spa_spares;
1545 1545
1546 1546 if (vdev_open(vd) != 0)
1547 1547 continue;
1548 1548
1549 1549 if (vdev_validate_aux(vd) == 0)
1550 1550 spa_spare_add(vd);
1551 1551 }
1552 1552
1553 1553 /*
1554 1554 * Recompute the stashed list of spares, with status information
1555 1555 * this time.
1556 1556 */
1557 1557 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1558 1558 DATA_TYPE_NVLIST_ARRAY) == 0);
1559 1559
1560 1560 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1561 1561 KM_SLEEP);
1562 1562 for (i = 0; i < spa->spa_spares.sav_count; i++)
1563 1563 spares[i] = vdev_config_generate(spa,
1564 1564 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1565 1565 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1566 1566 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1567 1567 for (i = 0; i < spa->spa_spares.sav_count; i++)
1568 1568 nvlist_free(spares[i]);
1569 1569 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1570 1570 }
1571 1571
1572 1572 /*
1573 1573 * Load (or re-load) the current list of vdevs describing the active l2cache for
1574 1574 * this pool. When this is called, we have some form of basic information in
1575 1575 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1576 1576 * then re-generate a more complete list including status information.
1577 1577 * Devices which are already active have their details maintained, and are
1578 1578 * not re-opened.
1579 1579 */
1580 1580 void
1581 1581 spa_load_l2cache(spa_t *spa)
1582 1582 {
1583 1583 nvlist_t **l2cache;
1584 1584 uint_t nl2cache;
1585 1585 int i, j, oldnvdevs;
1586 1586 uint64_t guid;
1587 1587 vdev_t *vd, **oldvdevs, **newvdevs;
1588 1588 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1589 1589
1590 1590 #ifndef _KERNEL
1591 1591 /*
1592 1592 * zdb opens both the current state of the pool and the
1593 1593 * checkpointed state (if present), with a different spa_t.
1594 1594 *
1595 1595 * As L2 caches are part of the ARC which is shared among open
1596 1596 * pools, we skip loading them when we load the checkpointed
1597 1597 * state of the pool.
1598 1598 */
1599 1599 if (!spa_writeable(spa))
1600 1600 return;
1601 1601 #endif
1602 1602
1603 1603 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1604 1604
1605 1605 if (sav->sav_config != NULL) {
1606 1606 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1607 1607 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1608 1608 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1609 1609 } else {
1610 1610 nl2cache = 0;
1611 1611 newvdevs = NULL;
1612 1612 }
1613 1613
1614 1614 oldvdevs = sav->sav_vdevs;
1615 1615 oldnvdevs = sav->sav_count;
1616 1616 sav->sav_vdevs = NULL;
1617 1617 sav->sav_count = 0;
1618 1618
1619 1619 /*
1620 1620 * Process new nvlist of vdevs.
1621 1621 */
1622 1622 for (i = 0; i < nl2cache; i++) {
1623 1623 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1624 1624 &guid) == 0);
1625 1625
1626 1626 newvdevs[i] = NULL;
1627 1627 for (j = 0; j < oldnvdevs; j++) {
1628 1628 vd = oldvdevs[j];
1629 1629 if (vd != NULL && guid == vd->vdev_guid) {
1630 1630 /*
1631 1631 * Retain previous vdev for add/remove ops.
1632 1632 */
1633 1633 newvdevs[i] = vd;
1634 1634 oldvdevs[j] = NULL;
1635 1635 break;
1636 1636 }
1637 1637 }
1638 1638
1639 1639 if (newvdevs[i] == NULL) {
1640 1640 /*
1641 1641 * Create new vdev
1642 1642 */
1643 1643 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1644 1644 VDEV_ALLOC_L2CACHE) == 0);
1645 1645 ASSERT(vd != NULL);
1646 1646 newvdevs[i] = vd;
1647 1647
1648 1648 /*
1649 1649 * Commit this vdev as an l2cache device,
1650 1650 * even if it fails to open.
1651 1651 */
1652 1652 spa_l2cache_add(vd);
1653 1653
1654 1654 vd->vdev_top = vd;
1655 1655 vd->vdev_aux = sav;
1656 1656
1657 1657 spa_l2cache_activate(vd);
1658 1658
1659 1659 if (vdev_open(vd) != 0)
1660 1660 continue;
1661 1661
1662 1662 (void) vdev_validate_aux(vd);
1663 1663
1664 1664 if (!vdev_is_dead(vd))
1665 1665 l2arc_add_vdev(spa, vd);
1666 1666 }
1667 1667 }
1668 1668
1669 1669 /*
1670 1670 * Purge vdevs that were dropped
1671 1671 */
1672 1672 for (i = 0; i < oldnvdevs; i++) {
1673 1673 uint64_t pool;
1674 1674
1675 1675 vd = oldvdevs[i];
1676 1676 if (vd != NULL) {
1677 1677 ASSERT(vd->vdev_isl2cache);
1678 1678
1679 1679 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1680 1680 pool != 0ULL && l2arc_vdev_present(vd))
1681 1681 l2arc_remove_vdev(vd);
1682 1682 vdev_clear_stats(vd);
1683 1683 vdev_free(vd);
1684 1684 }
1685 1685 }
1686 1686
1687 1687 if (oldvdevs)
1688 1688 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1689 1689
1690 1690 if (sav->sav_config == NULL)
1691 1691 goto out;
1692 1692
1693 1693 sav->sav_vdevs = newvdevs;
1694 1694 sav->sav_count = (int)nl2cache;
1695 1695
1696 1696 /*
1697 1697 * Recompute the stashed list of l2cache devices, with status
1698 1698 * information this time.
1699 1699 */
1700 1700 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1701 1701 DATA_TYPE_NVLIST_ARRAY) == 0);
1702 1702
1703 1703 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1704 1704 for (i = 0; i < sav->sav_count; i++)
1705 1705 l2cache[i] = vdev_config_generate(spa,
1706 1706 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1707 1707 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1708 1708 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1709 1709 out:
1710 1710 for (i = 0; i < sav->sav_count; i++)
1711 1711 nvlist_free(l2cache[i]);
1712 1712 if (sav->sav_count)
1713 1713 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1714 1714 }
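
The loop above reconciles the freshly parsed l2cache list against the devices that were already loaded, keyed by GUID, so that active devices are retained rather than re-opened. A standalone sketch of that retain-or-create pattern, using a hypothetical aux_dev_t rather than the kernel's vdev_t, might look like this:

/*
 * Illustrative only: a userland-style restatement of the
 * retain-or-create matching done in spa_load_l2cache().
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct aux_dev {
	uint64_t ad_guid;
} aux_dev_t;

static void
reconcile_by_guid(uint64_t *newguids, unsigned nnew,
    aux_dev_t **old, unsigned nold, aux_dev_t **out)
{
	for (unsigned i = 0; i < nnew; i++) {
		out[i] = NULL;
		/* Retain a previously loaded device with the same GUID. */
		for (unsigned j = 0; j < nold; j++) {
			if (old[j] != NULL &&
			    old[j]->ad_guid == newguids[i]) {
				out[i] = old[j];
				old[j] = NULL;
				break;
			}
		}
		/* Otherwise create a new entry (allocation check omitted). */
		if (out[i] == NULL) {
			out[i] = calloc(1, sizeof (aux_dev_t));
			out[i]->ad_guid = newguids[i];
		}
	}
	/* Anything still left in old[] was dropped and can be freed. */
}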
1715 1715
1716 1716 static int
1717 1717 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1718 1718 {
1719 1719 dmu_buf_t *db;
1720 1720 char *packed = NULL;
1721 1721 size_t nvsize = 0;
1722 1722 int error;
1723 1723 *value = NULL;
1724 1724
1725 1725 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1726 1726 if (error != 0)
1727 1727 return (error);
1728 1728
1729 1729 nvsize = *(uint64_t *)db->db_data;
1730 1730 dmu_buf_rele(db, FTAG);
1731 1731
1732 1732 packed = kmem_alloc(nvsize, KM_SLEEP);
1733 1733 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1734 1734 DMU_READ_PREFETCH);
1735 1735 if (error == 0)
1736 1736 error = nvlist_unpack(packed, nvsize, value, 0);
1737 1737 kmem_free(packed, nvsize);
1738 1738
1739 1739 return (error);
1740 1740 }
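
load_nvlist() reads an nvlist that was packed into a DMU object: the bonus buffer carries the packed size and the object body carries the encoded bytes, which nvlist_unpack() turns back into an nvlist. A minimal userland round trip with libnvpair shows the pack/unpack half of that scheme (illustrative sketch; only documented libnvpair calls are assumed):

/* cc example.c -lnvpair */
#include <libnvpair.h>
#include <stdlib.h>
#include <assert.h>

int
main(void)
{
	nvlist_t *nvl, *copy;
	char *buf = NULL;
	size_t buflen = 0;

	/* Build a small nvlist. */
	assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_add_uint64(nvl, "example", 42) == 0);

	/* Pack it into a contiguous XDR buffer (libnvpair allocates it). */
	assert(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);

	/*
	 * Unpack it again, mirroring what load_nvlist() does with the
	 * buffer it dmu_read()s out of the object.
	 */
	assert(nvlist_unpack(buf, buflen, &copy, 0) == 0);

	free(buf);
	nvlist_free(nvl);
	nvlist_free(copy);
	return (0);
}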
1741 1741
1742 1742 /*
1743 1743 * Concrete top-level vdevs that are not missing and are not logs. At every
1744 1744 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
1745 1745 */
1746 1746 static uint64_t
1747 1747 spa_healthy_core_tvds(spa_t *spa)
1748 1748 {
1749 1749 vdev_t *rvd = spa->spa_root_vdev;
1750 1750 uint64_t tvds = 0;
1751 1751
1752 1752 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1753 1753 vdev_t *vd = rvd->vdev_child[i];
1754 1754 if (vd->vdev_islog)
1755 1755 continue;
1756 1756 if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1757 1757 tvds++;
1758 1758 }
1759 1759
1760 1760 return (tvds);
1761 1761 }
1762 1762
1763 1763 /*
1764 1764 * Checks to see if the given vdev could not be opened, in which case we post a
1765 1765 * sysevent to notify the autoreplace code that the device has been removed.
1766 1766 */
1767 1767 static void
1768 1768 spa_check_removed(vdev_t *vd)
1769 1769 {
1770 1770 for (uint64_t c = 0; c < vd->vdev_children; c++)
1771 1771 spa_check_removed(vd->vdev_child[c]);
1772 1772
1773 1773 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1774 1774 vdev_is_concrete(vd)) {
1775 1775 zfs_post_autoreplace(vd->vdev_spa, vd);
1776 1776 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1777 1777 }
1778 1778 }
1779 1779
1780 1780 static int
1781 1781 spa_check_for_missing_logs(spa_t *spa)
1782 1782 {
1783 1783 vdev_t *rvd = spa->spa_root_vdev;
1784 1784
1785 1785 /*
1786 1786 * If we're doing a normal import, then build up any additional
1787 1787 * diagnostic information about missing log devices.
1788 1788 * We'll pass this up to the user for further processing.
1789 1789 */
1790 1790 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1791 1791 nvlist_t **child, *nv;
1792 1792 uint64_t idx = 0;
1793 1793
1794 1794 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1795 1795 KM_SLEEP);
1796 1796 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1797 1797
1798 1798 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1799 1799 vdev_t *tvd = rvd->vdev_child[c];
1800 1800
1801 1801 /*
1802 1802 * We consider a device as missing only if it failed
1803 1803  * to open (i.e. offline or faulted devices are not
1804 1804  * considered missing).
1805 1805 */
1806 1806 if (tvd->vdev_islog &&
1807 1807 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1808 1808 child[idx++] = vdev_config_generate(spa, tvd,
1809 1809 B_FALSE, VDEV_CONFIG_MISSING);
1810 1810 }
1811 1811 }
1812 1812
1813 1813 if (idx > 0) {
1814 1814 fnvlist_add_nvlist_array(nv,
1815 1815 ZPOOL_CONFIG_CHILDREN, child, idx);
1816 1816 fnvlist_add_nvlist(spa->spa_load_info,
1817 1817 ZPOOL_CONFIG_MISSING_DEVICES, nv);
1818 1818
1819 1819 for (uint64_t i = 0; i < idx; i++)
1820 1820 nvlist_free(child[i]);
1821 1821 }
1822 1822 nvlist_free(nv);
1823 1823 kmem_free(child, rvd->vdev_children * sizeof (char **));
1824 1824
1825 1825 if (idx > 0) {
1826 1826 spa_load_failed(spa, "some log devices are missing");
1827 1827 vdev_dbgmsg_print_tree(rvd, 2);
1828 1828 return (SET_ERROR(ENXIO));
1829 1829 }
1830 1830 } else {
1831 1831 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1832 1832 vdev_t *tvd = rvd->vdev_child[c];
1833 1833
1834 1834 if (tvd->vdev_islog &&
1835 1835 tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1836 1836 spa_set_log_state(spa, SPA_LOG_CLEAR);
1837 1837 spa_load_note(spa, "some log devices are "
1838 1838 "missing, ZIL is dropped.");
1839 1839 vdev_dbgmsg_print_tree(rvd, 2);
1840 1840 break;
1841 1841 }
1842 1842 }
1843 1843 }
1844 1844
1845 1845 return (0);
1846 1846 }
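
On a normal import, the branch above records every unopenable log top-level vdev under ZPOOL_CONFIG_MISSING_DEVICES in spa_load_info so userland can report them. A hedged sketch of how a hypothetical consumer might walk that nvlist (print_missing_logs and its argument are made up for illustration):

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/*
 * Hypothetical consumer of the load-info nvlist produced above;
 * 'loadinfo' would come from an import/tryimport reply.
 */
static void
print_missing_logs(nvlist_t *loadinfo)
{
	nvlist_t *missing, **child;
	uint_t nchild;

	if (nvlist_lookup_nvlist(loadinfo, ZPOOL_CONFIG_MISSING_DEVICES,
	    &missing) != 0)
		return;		/* nothing was reported missing */

	if (nvlist_lookup_nvlist_array(missing, ZPOOL_CONFIG_CHILDREN,
	    &child, &nchild) != 0)
		return;

	for (uint_t i = 0; i < nchild; i++) {
		char *path;

		if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_PATH,
		    &path) == 0)
			(void) printf("missing log device: %s\n", path);
	}
}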
1847 1847
1848 1848 /*
1849 1849 * Check for missing log devices
1850 1850 */
1851 1851 static boolean_t
1852 1852 spa_check_logs(spa_t *spa)
1853 1853 {
1854 1854 boolean_t rv = B_FALSE;
1855 1855 dsl_pool_t *dp = spa_get_dsl(spa);
1856 1856
1857 1857 switch (spa->spa_log_state) {
1858 1858 case SPA_LOG_MISSING:
1859 1859 /* need to recheck in case slog has been restored */
1860 1860 case SPA_LOG_UNKNOWN:
1861 1861 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1862 1862 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1863 1863 if (rv)
1864 1864 spa_set_log_state(spa, SPA_LOG_MISSING);
1865 1865 break;
1866 1866 }
1867 1867 return (rv);
1868 1868 }
1869 1869
1870 1870 static boolean_t
1871 1871 spa_passivate_log(spa_t *spa)
1872 1872 {
1873 1873 vdev_t *rvd = spa->spa_root_vdev;
1874 1874 boolean_t slog_found = B_FALSE;
1875 1875
1876 1876 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1877 1877
1878 1878 if (!spa_has_slogs(spa))
1879 1879 return (B_FALSE);
1880 1880
1881 1881 for (int c = 0; c < rvd->vdev_children; c++) {
1882 1882 vdev_t *tvd = rvd->vdev_child[c];
1883 1883 metaslab_group_t *mg = tvd->vdev_mg;
1884 1884
1885 1885 if (tvd->vdev_islog) {
1886 1886 metaslab_group_passivate(mg);
1887 1887 slog_found = B_TRUE;
1888 1888 }
1889 1889 }
1890 1890
1891 1891 return (slog_found);
1892 1892 }
1893 1893
1894 1894 static void
1895 1895 spa_activate_log(spa_t *spa)
1896 1896 {
1897 1897 vdev_t *rvd = spa->spa_root_vdev;
1898 1898
1899 1899 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1900 1900
1901 1901 for (int c = 0; c < rvd->vdev_children; c++) {
1902 1902 vdev_t *tvd = rvd->vdev_child[c];
1903 1903 metaslab_group_t *mg = tvd->vdev_mg;
1904 1904
1905 1905 if (tvd->vdev_islog)
1906 1906 metaslab_group_activate(mg);
1907 1907 }
1908 1908 }
1909 1909
1910 1910 int
1911 1911 spa_reset_logs(spa_t *spa)
1912 1912 {
1913 1913 int error;
1914 1914
1915 1915 error = dmu_objset_find(spa_name(spa), zil_reset,
1916 1916 NULL, DS_FIND_CHILDREN);
1917 1917 if (error == 0) {
1918 1918 /*
1919 1919  * We successfully offlined the log device; sync out the
1920 1920 * current txg so that the "stubby" block can be removed
1921 1921 * by zil_sync().
1922 1922 */
1923 1923 txg_wait_synced(spa->spa_dsl_pool, 0);
1924 1924 }
1925 1925 return (error);
1926 1926 }
1927 1927
1928 1928 static void
1929 1929 spa_aux_check_removed(spa_aux_vdev_t *sav)
1930 1930 {
1931 1931 for (int i = 0; i < sav->sav_count; i++)
1932 1932 spa_check_removed(sav->sav_vdevs[i]);
1933 1933 }
1934 1934
1935 1935 void
1936 1936 spa_claim_notify(zio_t *zio)
1937 1937 {
1938 1938 spa_t *spa = zio->io_spa;
1939 1939
1940 1940 if (zio->io_error)
1941 1941 return;
1942 1942
1943 1943 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1944 1944 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1945 1945 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1946 1946 mutex_exit(&spa->spa_props_lock);
1947 1947 }
1948 1948
1949 1949 typedef struct spa_load_error {
1950 1950 uint64_t sle_meta_count;
1951 1951 uint64_t sle_data_count;
1952 1952 } spa_load_error_t;
1953 1953
1954 1954 static void
1955 1955 spa_load_verify_done(zio_t *zio)
1956 1956 {
1957 1957 blkptr_t *bp = zio->io_bp;
1958 1958 spa_load_error_t *sle = zio->io_private;
1959 1959 dmu_object_type_t type = BP_GET_TYPE(bp);
1960 1960 int error = zio->io_error;
1961 1961 spa_t *spa = zio->io_spa;
1962 1962
1963 1963 abd_free(zio->io_abd);
1964 1964 if (error) {
1965 1965 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1966 1966 type != DMU_OT_INTENT_LOG)
1967 1967 atomic_inc_64(&sle->sle_meta_count);
1968 1968 else
1969 1969 atomic_inc_64(&sle->sle_data_count);
1970 1970 }
1971 1971
1972 1972 mutex_enter(&spa->spa_scrub_lock);
1973 1973 spa->spa_scrub_inflight--;
1974 1974 cv_broadcast(&spa->spa_scrub_io_cv);
1975 1975 mutex_exit(&spa->spa_scrub_lock);
1976 1976 }
1977 1977
1978 1978 /*
1979 1979 * Maximum number of concurrent scrub i/os to create while verifying
1980 1980  * a pool during import.
1981 1981 */
1982 1982 int spa_load_verify_maxinflight = 10000;
1983 1983 boolean_t spa_load_verify_metadata = B_TRUE;
1984 1984 boolean_t spa_load_verify_data = B_TRUE;
1985 1985
1986 1986 /*ARGSUSED*/
1987 1987 static int
1988 1988 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1989 1989 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1990 1990 {
1991 1991 if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
1992 1992 return (0);
1993 1993 /*
1994 1994 * Note: normally this routine will not be called if
1995 1995 * spa_load_verify_metadata is not set. However, it may be useful
1996 1996  * to manually clear the flag after the traversal has begun.
1997 1997 */
1998 1998 if (!spa_load_verify_metadata)
1999 1999 return (0);
2000 2000 if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
2001 2001 return (0);
2002 2002
2003 2003 zio_t *rio = arg;
2004 2004 size_t size = BP_GET_PSIZE(bp);
2005 2005
2006 2006 mutex_enter(&spa->spa_scrub_lock);
2007 2007 while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
2008 2008 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2009 2009 spa->spa_scrub_inflight++;
2010 2010 mutex_exit(&spa->spa_scrub_lock);
2011 2011
2012 2012 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2013 2013 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2014 2014 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2015 2015 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2016 2016 return (0);
2017 2017 }
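
spa_load_verify_cb() caps the number of outstanding scrub reads at spa_load_verify_maxinflight using a counter guarded by spa_scrub_lock and woken via spa_scrub_io_cv; the completion callback decrements the counter and broadcasts. The same throttle pattern, restated as a self-contained userland sketch with POSIX threads (names are made up):

#include <pthread.h>

/* Illustrative userland version of the inflight throttle. */
typedef struct throttle {
	pthread_mutex_t	t_lock;
	pthread_cond_t	t_cv;
	int		t_inflight;
	int		t_max;
} throttle_t;

/* Called before issuing an async operation. */
static void
throttle_enter(throttle_t *t)
{
	(void) pthread_mutex_lock(&t->t_lock);
	while (t->t_inflight >= t->t_max)
		(void) pthread_cond_wait(&t->t_cv, &t->t_lock);
	t->t_inflight++;
	(void) pthread_mutex_unlock(&t->t_lock);
}

/* Called from the operation's completion callback. */
static void
throttle_exit(throttle_t *t)
{
	(void) pthread_mutex_lock(&t->t_lock);
	t->t_inflight--;
	(void) pthread_cond_broadcast(&t->t_cv);
	(void) pthread_mutex_unlock(&t->t_lock);
}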
2018 2018
2019 2019 /* ARGSUSED */
2020 2020 int
2021 2021 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2022 2022 {
2023 2023 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2024 2024 return (SET_ERROR(ENAMETOOLONG));
2025 2025
2026 2026 return (0);
2027 2027 }
2028 2028
2029 2029 static int
2030 2030 spa_load_verify(spa_t *spa)
2031 2031 {
2032 2032 zio_t *rio;
2033 2033 spa_load_error_t sle = { 0 };
2034 2034 zpool_load_policy_t policy;
2035 2035 boolean_t verify_ok = B_FALSE;
2036 2036 int error = 0;
2037 2037
2038 2038 zpool_get_load_policy(spa->spa_config, &policy);
2039 2039
2040 2040 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
2041 2041 return (0);
2042 2042
2043 2043 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2044 2044 error = dmu_objset_find_dp(spa->spa_dsl_pool,
2045 2045 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2046 2046 DS_FIND_CHILDREN);
2047 2047 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2048 2048 if (error != 0)
2049 2049 return (error);
2050 2050
2051 2051 rio = zio_root(spa, NULL, &sle,
2052 2052 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2053 2053
2054 2054 if (spa_load_verify_metadata) {
2055 2055 if (spa->spa_extreme_rewind) {
2056 2056 spa_load_note(spa, "performing a complete scan of the "
2057 2057 "pool since extreme rewind is on. This may take "
2058 2058 "a very long time.\n (spa_load_verify_data=%u, "
2059 2059 "spa_load_verify_metadata=%u)",
2060 2060 spa_load_verify_data, spa_load_verify_metadata);
2061 2061 }
2062 2062 error = traverse_pool(spa, spa->spa_verify_min_txg,
2063 2063 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2064 2064 spa_load_verify_cb, rio);
2065 2065 }
2066 2066
2067 2067 (void) zio_wait(rio);
2068 2068
2069 2069 spa->spa_load_meta_errors = sle.sle_meta_count;
2070 2070 spa->spa_load_data_errors = sle.sle_data_count;
2071 2071
2072 2072 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2073 2073 spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2074 2074 "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2075 2075 (u_longlong_t)sle.sle_data_count);
2076 2076 }
2077 2077
2078 2078 if (spa_load_verify_dryrun ||
2079 2079 (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2080 2080 sle.sle_data_count <= policy.zlp_maxdata)) {
2081 2081 int64_t loss = 0;
2082 2082
2083 2083 verify_ok = B_TRUE;
2084 2084 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2085 2085 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2086 2086
2087 2087 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2088 2088 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2089 2089 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2090 2090 VERIFY(nvlist_add_int64(spa->spa_load_info,
2091 2091 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2092 2092 VERIFY(nvlist_add_uint64(spa->spa_load_info,
2093 2093 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2094 2094 } else {
2095 2095 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2096 2096 }
2097 2097
2098 2098 if (spa_load_verify_dryrun)
2099 2099 return (0);
2100 2100
2101 2101 if (error) {
2102 2102 if (error != ENXIO && error != EIO)
2103 2103 error = SET_ERROR(EIO);
2104 2104 return (error);
2105 2105 }
2106 2106
2107 2107 return (verify_ok ? 0 : EIO);
2108 2108 }
2109 2109
2110 2110 /*
2111 2111 * Find a value in the pool props object.
2112 2112 */
2113 2113 static void
2114 2114 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2115 2115 {
2116 2116 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2117 2117 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2118 2118 }
2119 2119
2120 2120 /*
2121 2121 * Find a value in the pool directory object.
2122 2122 */
2123 2123 static int
2124 2124 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2125 2125 {
2126 2126 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2127 2127 name, sizeof (uint64_t), 1, val);
2128 2128
2129 2129 if (error != 0 && (error != ENOENT || log_enoent)) {
2130 2130 spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2131 2131 "[error=%d]", name, error);
2132 2132 }
2133 2133
2134 2134 return (error);
2135 2135 }
2136 2136
2137 2137 static int
2138 2138 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2139 2139 {
2140 2140 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2141 2141 return (SET_ERROR(err));
2142 2142 }
2143 2143
2144 2144 static void
2145 2145 spa_spawn_aux_threads(spa_t *spa)
2146 2146 {
2147 2147 ASSERT(spa_writeable(spa));
2148 2148
2149 2149 ASSERT(MUTEX_HELD(&spa_namespace_lock));
2150 2150
2151 2151 spa_start_indirect_condensing_thread(spa);
2152 2152
2153 2153 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2154 2154 spa->spa_checkpoint_discard_zthr =
2155 2155 zthr_create(spa_checkpoint_discard_thread_check,
2156 2156 spa_checkpoint_discard_thread, spa);
2157 2157 }
2158 2158
2159 2159 /*
2160 2160 * Fix up config after a partly-completed split. This is done with the
2161 2161 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
2162 2162 * pool have that entry in their config, but only the splitting one contains
2163 2163 * a list of all the guids of the vdevs that are being split off.
2164 2164 *
2165 2165 * This function determines what to do with that list: either rejoin
2166 2166 * all the disks to the pool, or complete the splitting process. To attempt
2167 2167 * the rejoin, each disk that is offlined is marked online again, and
2168 2168 * we do a reopen() call. If the vdev label for every disk that was
2169 2169 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2170 2170 * then we call vdev_split() on each disk, and complete the split.
2171 2171 *
2172 2172 * Otherwise we leave the config alone, with all the vdevs in place in
2173 2173 * the original pool.
2174 2174 */
2175 2175 static void
2176 2176 spa_try_repair(spa_t *spa, nvlist_t *config)
2177 2177 {
2178 2178 uint_t extracted;
2179 2179 uint64_t *glist;
2180 2180 uint_t i, gcount;
2181 2181 nvlist_t *nvl;
2182 2182 vdev_t **vd;
2183 2183 boolean_t attempt_reopen;
2184 2184
2185 2185 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2186 2186 return;
2187 2187
2188 2188 /* check that the config is complete */
2189 2189 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2190 2190 &glist, &gcount) != 0)
2191 2191 return;
2192 2192
2193 2193 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2194 2194
2195 2195 /* attempt to online all the vdevs & validate */
2196 2196 attempt_reopen = B_TRUE;
2197 2197 for (i = 0; i < gcount; i++) {
2198 2198 if (glist[i] == 0) /* vdev is hole */
2199 2199 continue;
2200 2200
2201 2201 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2202 2202 if (vd[i] == NULL) {
2203 2203 /*
2204 2204 * Don't bother attempting to reopen the disks;
2205 2205 * just do the split.
2206 2206 */
2207 2207 attempt_reopen = B_FALSE;
2208 2208 } else {
2209 2209 /* attempt to re-online it */
2210 2210 vd[i]->vdev_offline = B_FALSE;
2211 2211 }
2212 2212 }
2213 2213
2214 2214 if (attempt_reopen) {
2215 2215 vdev_reopen(spa->spa_root_vdev);
2216 2216
2217 2217 /* check each device to see what state it's in */
2218 2218 for (extracted = 0, i = 0; i < gcount; i++) {
2219 2219 if (vd[i] != NULL &&
2220 2220 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2221 2221 break;
2222 2222 ++extracted;
2223 2223 }
2224 2224 }
2225 2225
2226 2226 /*
2227 2227 * If every disk has been moved to the new pool, or if we never
2228 2228 * even attempted to look at them, then we split them off for
2229 2229 * good.
2230 2230 */
2231 2231 if (!attempt_reopen || gcount == extracted) {
2232 2232 for (i = 0; i < gcount; i++)
2233 2233 if (vd[i] != NULL)
2234 2234 vdev_split(vd[i]);
2235 2235 vdev_reopen(spa->spa_root_vdev);
2236 2236 }
2237 2237
2238 2238 kmem_free(vd, gcount * sizeof (vdev_t *));
2239 2239 }
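
spa_try_repair() is driven by the ZPOOL_CONFIG_SPLIT entry and its ZPOOL_CONFIG_SPLIT_LIST array of GUIDs. As a rough illustration of the shape of that data (the real producer is the vdev split code path, not shown in this file), such an entry could be assembled with the fnvlist helpers like so:

#include <libnvpair.h>
#include <sys/fs/zfs.h>

/*
 * Illustrative only: attach a split record listing the GUIDs of the
 * vdevs being split off to an existing pool config nvlist.
 */
static void
add_split_record(nvlist_t *config, uint64_t *guids, uint_t nguids)
{
	nvlist_t *split = fnvlist_alloc();

	fnvlist_add_uint64_array(split, ZPOOL_CONFIG_SPLIT_LIST,
	    guids, nguids);
	fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, split);
	fnvlist_free(split);
}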
2240 2240
2241 2241 static int
2242 2242 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2243 2243 {
2244 2244 char *ereport = FM_EREPORT_ZFS_POOL;
2245 2245 int error;
2246 2246
2247 2247 spa->spa_load_state = state;
2248 2248
2249 2249 gethrestime(&spa->spa_loaded_ts);
2250 2250 error = spa_load_impl(spa, type, &ereport);
2251 2251
2252 2252 /*
2253 2253 * Don't count references from objsets that are already closed
2254 2254 * and are making their way through the eviction process.
2255 2255 */
2256 2256 spa_evicting_os_wait(spa);
2257 2257 spa->spa_minref = refcount_count(&spa->spa_refcount);
2258 2258 if (error) {
2259 2259 if (error != EEXIST) {
2260 2260 spa->spa_loaded_ts.tv_sec = 0;
2261 2261 spa->spa_loaded_ts.tv_nsec = 0;
2262 2262 }
2263 2263 if (error != EBADF) {
2264 2264 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2265 2265 }
2266 2266 }
2267 2267 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2268 2268 spa->spa_ena = 0;
2269 2269
2270 2270 return (error);
2271 2271 }
2272 2272
2273 2273 /*
2274 2274 * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2275 2275 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2276 2276 * spa's per-vdev ZAP list.
2277 2277 */
2278 2278 static uint64_t
2279 2279 vdev_count_verify_zaps(vdev_t *vd)
2280 2280 {
2281 2281 spa_t *spa = vd->vdev_spa;
2282 2282 uint64_t total = 0;
2283 2283 if (vd->vdev_top_zap != 0) {
2284 2284 total++;
2285 2285 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2286 2286 spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2287 2287 }
2288 2288 if (vd->vdev_leaf_zap != 0) {
2289 2289 total++;
2290 2290 ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2291 2291 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2292 2292 }
2293 2293
2294 2294 for (uint64_t i = 0; i < vd->vdev_children; i++) {
2295 2295 total += vdev_count_verify_zaps(vd->vdev_child[i]);
2296 2296 }
2297 2297
2298 2298 return (total);
2299 2299 }
2300 2300
2301 2301 static int
2302 2302 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2303 2303 {
2304 2304 uint64_t hostid;
2305 2305 char *hostname;
2306 2306 uint64_t myhostid = 0;
2307 2307
2308 2308 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2309 2309 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2310 2310 hostname = fnvlist_lookup_string(mos_config,
2311 2311 ZPOOL_CONFIG_HOSTNAME);
2312 2312
2313 2313 myhostid = zone_get_hostid(NULL);
2314 2314
2315 2315 if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2316 2316 cmn_err(CE_WARN, "pool '%s' could not be "
2317 2317 "loaded as it was last accessed by "
2318 2318 "another system (host: %s hostid: 0x%llx). "
2319 2319 "See: http://illumos.org/msg/ZFS-8000-EY",
2320 2320 spa_name(spa), hostname, (u_longlong_t)hostid);
2321 2321 spa_load_failed(spa, "hostid verification failed: pool "
2322 2322 "last accessed by host: %s (hostid: 0x%llx)",
2323 2323 hostname, (u_longlong_t)hostid);
2324 2324 return (SET_ERROR(EBADF));
2325 2325 }
2326 2326 }
2327 2327
2328 2328 return (0);
2329 2329 }
2330 2330
2331 2331 static int
2332 2332 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2333 2333 {
2334 2334 int error = 0;
2335 2335 nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2336 2336 int parse;
2337 2337 vdev_t *rvd;
2338 2338 uint64_t pool_guid;
2339 2339 char *comment;
2340 2340
2341 2341 /*
2342 2342 * Versioning wasn't explicitly added to the label until later, so if
2343 2343 * it's not present treat it as the initial version.
2344 2344 */
2345 2345 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2346 2346 &spa->spa_ubsync.ub_version) != 0)
2347 2347 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2348 2348
2349 2349 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2350 2350 spa_load_failed(spa, "invalid config provided: '%s' missing",
2351 2351 ZPOOL_CONFIG_POOL_GUID);
2352 2352 return (SET_ERROR(EINVAL));
2353 2353 }
2354 2354
2355 2355 /*
2356 2356 * If we are doing an import, ensure that the pool is not already
2357 2357 * imported by checking if its pool guid already exists in the
2358 2358 * spa namespace.
2359 2359 *
2360 2360 * The only case that we allow an already imported pool to be
2361 2361 * imported again, is when the pool is checkpointed and we want to
2362 2362 * look at its checkpointed state from userland tools like zdb.
2363 2363 */
2364 2364 #ifdef _KERNEL
2365 2365 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2366 2366 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2367 2367 spa_guid_exists(pool_guid, 0)) {
2368 2368 #else
2369 2369 if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2370 2370 spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2371 2371 spa_guid_exists(pool_guid, 0) &&
2372 2372 !spa_importing_readonly_checkpoint(spa)) {
2373 2373 #endif
2374 2374 spa_load_failed(spa, "a pool with guid %llu is already open",
2375 2375 (u_longlong_t)pool_guid);
2376 2376 return (SET_ERROR(EEXIST));
2377 2377 }
2378 2378
2379 2379 spa->spa_config_guid = pool_guid;
2380 2380
2381 2381 nvlist_free(spa->spa_load_info);
2382 2382 spa->spa_load_info = fnvlist_alloc();
2383 2383
2384 2384 ASSERT(spa->spa_comment == NULL);
2385 2385 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2386 2386 spa->spa_comment = spa_strdup(comment);
2387 2387
2388 2388 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2389 2389 &spa->spa_config_txg);
2390 2390
2391 2391 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2392 2392 spa->spa_config_splitting = fnvlist_dup(nvl);
2393 2393
2394 2394 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2395 2395 spa_load_failed(spa, "invalid config provided: '%s' missing",
2396 2396 ZPOOL_CONFIG_VDEV_TREE);
2397 2397 return (SET_ERROR(EINVAL));
2398 2398 }
2399 2399
2400 2400 /*
2401 2401 * Create "The Godfather" zio to hold all async IOs
2402 2402 */
2403 2403 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2404 2404 KM_SLEEP);
2405 2405 for (int i = 0; i < max_ncpus; i++) {
2406 2406 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2407 2407 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2408 2408 ZIO_FLAG_GODFATHER);
2409 2409 }
2410 2410
2411 2411 /*
2412 2412 * Parse the configuration into a vdev tree. We explicitly set the
2413 2413 * value that will be returned by spa_version() since parsing the
2414 2414 * configuration requires knowing the version number.
2415 2415 */
2416 2416 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2417 2417 parse = (type == SPA_IMPORT_EXISTING ?
2418 2418 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2419 2419 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2420 2420 spa_config_exit(spa, SCL_ALL, FTAG);
2421 2421
2422 2422 if (error != 0) {
2423 2423 spa_load_failed(spa, "unable to parse config [error=%d]",
2424 2424 error);
2425 2425 return (error);
2426 2426 }
2427 2427
2428 2428 ASSERT(spa->spa_root_vdev == rvd);
2429 2429 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2430 2430 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2431 2431
2432 2432 if (type != SPA_IMPORT_ASSEMBLE) {
2433 2433 ASSERT(spa_guid(spa) == pool_guid);
2434 2434 }
2435 2435
2436 2436 return (0);
2437 2437 }
2438 2438
2439 2439 /*
2440 2440 * Recursively open all vdevs in the vdev tree. This function is called twice:
2441 2441 * first with the untrusted config, then with the trusted config.
2442 2442 */
2443 2443 static int
2444 2444 spa_ld_open_vdevs(spa_t *spa)
2445 2445 {
2446 2446 int error = 0;
2447 2447
2448 2448 /*
2449 2449 * spa_missing_tvds_allowed defines how many top-level vdevs can be
2450 2450  * missing/unopenable for the root vdev to still be considered openable.
2451 2451 */
2452 2452 if (spa->spa_trust_config) {
2453 2453 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2454 2454 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2455 2455 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2456 2456 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2457 2457 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2458 2458 } else {
2459 2459 spa->spa_missing_tvds_allowed = 0;
2460 2460 }
2461 2461
2462 2462 spa->spa_missing_tvds_allowed =
2463 2463 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2464 2464
2465 2465 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2466 2466 error = vdev_open(spa->spa_root_vdev);
2467 2467 spa_config_exit(spa, SCL_ALL, FTAG);
2468 2468
2469 2469 if (spa->spa_missing_tvds != 0) {
2470 2470 spa_load_note(spa, "vdev tree has %lld missing top-level "
2471 2471 "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2472 2472 if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2473 2473 /*
2474 2474 * Although theoretically we could allow users to open
2475 2475 * incomplete pools in RW mode, we'd need to add a lot
2476 2476 * of extra logic (e.g. adjust pool space to account
2477 2477 * for missing vdevs).
2478 2478 * This limitation also prevents users from accidentally
2479 2479 * opening the pool in RW mode during data recovery and
2480 2480 * damaging it further.
2481 2481 */
2482 2482 spa_load_note(spa, "pools with missing top-level "
2483 2483 "vdevs can only be opened in read-only mode.");
2484 2484 error = SET_ERROR(ENXIO);
2485 2485 } else {
2486 2486 spa_load_note(spa, "current settings allow for maximum "
2487 2487 "%lld missing top-level vdevs at this stage.",
2488 2488 (u_longlong_t)spa->spa_missing_tvds_allowed);
2489 2489 }
2490 2490 }
2491 2491 if (error != 0) {
2492 2492 spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2493 2493 error);
2494 2494 }
2495 2495 if (spa->spa_missing_tvds != 0 || error != 0)
2496 2496 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2497 2497
2498 2498 return (error);
2499 2499 }
2500 2500
2501 2501 /*
2502 2502 * We need to validate the vdev labels against the configuration that
2503 2503 * we have in hand. This function is called twice: first with an untrusted
2504 2504 * config, then with a trusted config. The validation is more strict when the
2505 2505 * config is trusted.
2506 2506 */
2507 2507 static int
2508 2508 spa_ld_validate_vdevs(spa_t *spa)
2509 2509 {
2510 2510 int error = 0;
2511 2511 vdev_t *rvd = spa->spa_root_vdev;
2512 2512
2513 2513 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2514 2514 error = vdev_validate(rvd);
2515 2515 spa_config_exit(spa, SCL_ALL, FTAG);
2516 2516
2517 2517 if (error != 0) {
2518 2518 spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2519 2519 return (error);
2520 2520 }
2521 2521
2522 2522 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2523 2523 spa_load_failed(spa, "cannot open vdev tree after invalidating "
2524 2524 "some vdevs");
2525 2525 vdev_dbgmsg_print_tree(rvd, 2);
2526 2526 return (SET_ERROR(ENXIO));
2527 2527 }
2528 2528
2529 2529 return (0);
2530 2530 }
2531 2531
2532 2532 static void
2533 2533 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
2534 2534 {
2535 2535 spa->spa_state = POOL_STATE_ACTIVE;
2536 2536 spa->spa_ubsync = spa->spa_uberblock;
2537 2537 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2538 2538 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2539 2539 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2540 2540 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2541 2541 spa->spa_claim_max_txg = spa->spa_first_txg;
2542 2542 spa->spa_prev_software_version = ub->ub_software_version;
2543 2543 }
2544 2544
2545 2545 static int
2546 2546 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2547 2547 {
2548 2548 vdev_t *rvd = spa->spa_root_vdev;
2549 2549 nvlist_t *label;
2550 2550 uberblock_t *ub = &spa->spa_uberblock;
2551 2551
2552 2552 /*
2553 2553 * If we are opening the checkpointed state of the pool by
2554 2554 * rewinding to it, at this point we will have written the
2555 2555 * checkpointed uberblock to the vdev labels, so searching
2556 2556 * the labels will find the right uberblock. However, if
2557 2557 * we are opening the checkpointed state read-only, we have
2558 2558 * not modified the labels. Therefore, we must ignore the
2559 2559 * labels and continue using the spa_uberblock that was set
2560 2560 * by spa_ld_checkpoint_rewind.
2561 2561 *
2562 2562 * Note that it would be fine to ignore the labels when
2563 2563 * rewinding (opening writeable) as well. However, if we
2564 2564 * crash just after writing the labels, we will end up
2565 2565 * searching the labels. Doing so in the common case means
2566 2566 * that this code path gets exercised normally, rather than
2567 2567 * just in the edge case.
2568 2568 */
2569 2569 if (ub->ub_checkpoint_txg != 0 &&
2570 2570 spa_importing_readonly_checkpoint(spa)) {
2571 2571 spa_ld_select_uberblock_done(spa, ub);
2572 2572 return (0);
2573 2573 }
2574 2574
2575 2575 /*
2576 2576 * Find the best uberblock.
2577 2577 */
2578 2578 vdev_uberblock_load(rvd, ub, &label);
2579 2579
2580 2580 /*
2581 2581 * If we weren't able to find a single valid uberblock, return failure.
2582 2582 */
2583 2583 if (ub->ub_txg == 0) {
2584 2584 nvlist_free(label);
2585 2585 spa_load_failed(spa, "no valid uberblock found");
2586 2586 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2587 2587 }
2588 2588
2589 2589 spa_load_note(spa, "using uberblock with txg=%llu",
2590 2590 (u_longlong_t)ub->ub_txg);
2591 2591
2592 2592 /*
2593 2593 * If the pool has an unsupported version we can't open it.
2594 2594 */
2595 2595 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2596 2596 nvlist_free(label);
2597 2597 spa_load_failed(spa, "version %llu is not supported",
2598 2598 (u_longlong_t)ub->ub_version);
2599 2599 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2600 2600 }
2601 2601
2602 2602 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2603 2603 nvlist_t *features;
2604 2604
2605 2605 /*
2606 2606 * If we weren't able to find what's necessary for reading the
2607 2607 * MOS in the label, return failure.
2608 2608 */
2609 2609 if (label == NULL) {
2610 2610 spa_load_failed(spa, "label config unavailable");
2611 2611 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2612 2612 ENXIO));
2613 2613 }
2614 2614
2615 2615 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
2616 2616 &features) != 0) {
2617 2617 nvlist_free(label);
2618 2618 spa_load_failed(spa, "invalid label: '%s' missing",
2619 2619 ZPOOL_CONFIG_FEATURES_FOR_READ);
2620 2620 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2621 2621 ENXIO));
2622 2622 }
2623 2623
2624 2624 /*
2625 2625 * Update our in-core representation with the definitive values
2626 2626 * from the label.
2627 2627 */
2628 2628 nvlist_free(spa->spa_label_features);
2629 2629 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2630 2630 }
2631 2631
2632 2632 nvlist_free(label);
2633 2633
2634 2634 /*
2635 2635 * Look through entries in the label nvlist's features_for_read. If
2636 2636 * there is a feature listed there which we don't understand then we
2637 2637 * cannot open a pool.
2638 2638 */
2639 2639 if (ub->ub_version >= SPA_VERSION_FEATURES) {
2640 2640 nvlist_t *unsup_feat;
2641 2641
2642 2642 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2643 2643 0);
2644 2644
2645 2645 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2646 2646 NULL); nvp != NULL;
2647 2647 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2648 2648 if (!zfeature_is_supported(nvpair_name(nvp))) {
2649 2649 VERIFY(nvlist_add_string(unsup_feat,
2650 2650 nvpair_name(nvp), "") == 0);
2651 2651 }
2652 2652 }
2653 2653
2654 2654 if (!nvlist_empty(unsup_feat)) {
2655 2655 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2656 2656 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2657 2657 nvlist_free(unsup_feat);
2658 2658 spa_load_failed(spa, "some features are unsupported");
2659 2659 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2660 2660 ENOTSUP));
2661 2661 }
2662 2662
2663 2663 nvlist_free(unsup_feat);
2664 2664 }
2665 2665
2666 2666 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2667 2667 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2668 2668 spa_try_repair(spa, spa->spa_config);
2669 2669 spa_config_exit(spa, SCL_ALL, FTAG);
2670 2670 nvlist_free(spa->spa_config_splitting);
2671 2671 spa->spa_config_splitting = NULL;
2672 2672 }
2673 2673
2674 2674 /*
2675 2675 * Initialize internal SPA structures.
2676 2676 */
2677 2677 spa_ld_select_uberblock_done(spa, ub);
2678 2678
2679 2679 return (0);
2680 2680 }
2681 2681
2682 2682 static int
2683 2683 spa_ld_open_rootbp(spa_t *spa)
2684 2684 {
2685 2685 int error = 0;
2686 2686 vdev_t *rvd = spa->spa_root_vdev;
2687 2687
2688 2688 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2689 2689 if (error != 0) {
2690 2690 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
2691 2691 "[error=%d]", error);
2692 2692 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2693 2693 }
2694 2694 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2695 2695
2696 2696 return (0);
2697 2697 }
2698 2698
2699 2699 static int
2700 2700 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
2701 2701 boolean_t reloading)
2702 2702 {
2703 2703 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
2704 2704 nvlist_t *nv, *mos_config, *policy;
2705 2705 int error = 0, copy_error;
2706 2706 uint64_t healthy_tvds, healthy_tvds_mos;
2707 2707 uint64_t mos_config_txg;
2708 2708
2709 2709 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
2710 2710 != 0)
2711 2711 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2712 2712
2713 2713 /*
2714 2714 * If we're assembling a pool from a split, the config provided is
2715 2715 * already trusted so there is nothing to do.
2716 2716 */
2717 2717 if (type == SPA_IMPORT_ASSEMBLE)
2718 2718 return (0);
2719 2719
2720 2720 healthy_tvds = spa_healthy_core_tvds(spa);
2721 2721
2722 2722 if (load_nvlist(spa, spa->spa_config_object, &mos_config)
2723 2723 != 0) {
2724 2724 spa_load_failed(spa, "unable to retrieve MOS config");
2725 2725 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2726 2726 }
2727 2727
2728 2728 /*
2729 2729  * If we are doing an open, the pool owner wasn't verified yet, so
2730 2730  * do the verification here.
2731 2731 */
2732 2732 if (spa->spa_load_state == SPA_LOAD_OPEN) {
2733 2733 error = spa_verify_host(spa, mos_config);
2734 2734 if (error != 0) {
2735 2735 nvlist_free(mos_config);
2736 2736 return (error);
2737 2737 }
2738 2738 }
2739 2739
2740 2740 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
2741 2741
2742 2742 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2743 2743
2744 2744 /*
2745 2745 * Build a new vdev tree from the trusted config
2746 2746 */
2747 2747 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
2748 2748
2749 2749 /*
2750 2750 * Vdev paths in the MOS may be obsolete. If the untrusted config was
2751 2751 * obtained by scanning /dev/dsk, then it will have the right vdev
2752 2752 * paths. We update the trusted MOS config with this information.
2753 2753 * We first try to copy the paths with vdev_copy_path_strict, which
2754 2754 * succeeds only when both configs have exactly the same vdev tree.
2755 2755 * If that fails, we fall back to a more flexible method that has a
2756 2756 * best effort policy.
2757 2757 */
2758 2758 copy_error = vdev_copy_path_strict(rvd, mrvd);
2759 2759 if (copy_error != 0 || spa_load_print_vdev_tree) {
2760 2760 spa_load_note(spa, "provided vdev tree:");
2761 2761 vdev_dbgmsg_print_tree(rvd, 2);
2762 2762 spa_load_note(spa, "MOS vdev tree:");
2763 2763 vdev_dbgmsg_print_tree(mrvd, 2);
2764 2764 }
2765 2765 if (copy_error != 0) {
2766 2766 spa_load_note(spa, "vdev_copy_path_strict failed, falling "
2767 2767 "back to vdev_copy_path_relaxed");
2768 2768 vdev_copy_path_relaxed(rvd, mrvd);
2769 2769 }
2770 2770
2771 2771 vdev_close(rvd);
2772 2772 vdev_free(rvd);
2773 2773 spa->spa_root_vdev = mrvd;
2774 2774 rvd = mrvd;
2775 2775 spa_config_exit(spa, SCL_ALL, FTAG);
2776 2776
2777 2777 /*
2778 2778 * We will use spa_config if we decide to reload the spa or if spa_load
2779 2779 * fails and we rewind. We must thus regenerate the config using the
2780 2780 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
2781 2781 * pass settings on how to load the pool and is not stored in the MOS.
2782 2782 * We copy it over to our new, trusted config.
2783 2783 */
2784 2784 mos_config_txg = fnvlist_lookup_uint64(mos_config,
2785 2785 ZPOOL_CONFIG_POOL_TXG);
2786 2786 nvlist_free(mos_config);
2787 2787 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
2788 2788 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
2789 2789 &policy) == 0)
2790 2790 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
2791 2791 spa_config_set(spa, mos_config);
2792 2792 spa->spa_config_source = SPA_CONFIG_SRC_MOS;
2793 2793
2794 2794 /*
2795 2795 * Now that we got the config from the MOS, we should be more strict
2796 2796 * in checking blkptrs and can make assumptions about the consistency
2797 2797 * of the vdev tree. spa_trust_config must be set to true before opening
2798 2798 * vdevs in order for them to be writeable.
2799 2799 */
2800 2800 spa->spa_trust_config = B_TRUE;
2801 2801
2802 2802 /*
2803 2803 * Open and validate the new vdev tree
2804 2804 */
2805 2805 error = spa_ld_open_vdevs(spa);
2806 2806 if (error != 0)
2807 2807 return (error);
2808 2808
2809 2809 error = spa_ld_validate_vdevs(spa);
2810 2810 if (error != 0)
2811 2811 return (error);
2812 2812
2813 2813 if (copy_error != 0 || spa_load_print_vdev_tree) {
2814 2814 spa_load_note(spa, "final vdev tree:");
2815 2815 vdev_dbgmsg_print_tree(rvd, 2);
2816 2816 }
2817 2817
2818 2818 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
2819 2819 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
2820 2820 /*
2821 2821 * Sanity check to make sure that we are indeed loading the
2822 2822 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
2823 2823 * in the config provided and they happened to be the only ones
2824 2824 * to have the latest uberblock, we could involuntarily perform
2825 2825 * an extreme rewind.
2826 2826 */
2827 2827 healthy_tvds_mos = spa_healthy_core_tvds(spa);
2828 2828 if (healthy_tvds_mos - healthy_tvds >=
2829 2829 SPA_SYNC_MIN_VDEVS) {
2830 2830 spa_load_note(spa, "config provided misses too many "
2831 2831 "top-level vdevs compared to MOS (%lld vs %lld). ",
2832 2832 (u_longlong_t)healthy_tvds,
2833 2833 (u_longlong_t)healthy_tvds_mos);
2834 2834 spa_load_note(spa, "vdev tree:");
2835 2835 vdev_dbgmsg_print_tree(rvd, 2);
2836 2836 if (reloading) {
2837 2837 spa_load_failed(spa, "config was already "
2838 2838 "provided from MOS. Aborting.");
2839 2839 return (spa_vdev_err(rvd,
2840 2840 VDEV_AUX_CORRUPT_DATA, EIO));
2841 2841 }
2842 2842 spa_load_note(spa, "spa must be reloaded using MOS "
2843 2843 "config");
2844 2844 return (SET_ERROR(EAGAIN));
2845 2845 }
2846 2846 }
2847 2847
2848 2848 error = spa_check_for_missing_logs(spa);
2849 2849 if (error != 0)
2850 2850 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2851 2851
2852 2852 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
2853 2853 spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
2854 2854 "guid sum (%llu != %llu)",
2855 2855 (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
2856 2856 (u_longlong_t)rvd->vdev_guid_sum);
2857 2857 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2858 2858 ENXIO));
2859 2859 }
2860 2860
2861 2861 return (0);
2862 2862 }
2863 2863
2864 2864 static int
2865 2865 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
2866 2866 {
2867 2867 int error = 0;
2868 2868 vdev_t *rvd = spa->spa_root_vdev;
2869 2869
2870 2870 /*
2871 2871 * Everything that we read before spa_remove_init() must be stored
2872 2872  * on concrete vdevs. Therefore we do this as early as possible.
2873 2873 */
2874 2874 error = spa_remove_init(spa);
2875 2875 if (error != 0) {
2876 2876 spa_load_failed(spa, "spa_remove_init failed [error=%d]",
2877 2877 error);
2878 2878 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2879 2879 }
2880 2880
2881 2881 /*
2882 2882 * Retrieve information needed to condense indirect vdev mappings.
2883 2883 */
2884 2884 error = spa_condense_init(spa);
2885 2885 if (error != 0) {
2886 2886 spa_load_failed(spa, "spa_condense_init failed [error=%d]",
2887 2887 error);
2888 2888 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
2889 2889 }
2890 2890
2891 2891 return (0);
2892 2892 }
2893 2893
2894 2894 static int
2895 2895 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
2896 2896 {
2897 2897 int error = 0;
2898 2898 vdev_t *rvd = spa->spa_root_vdev;
2899 2899
2900 2900 if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2901 2901 boolean_t missing_feat_read = B_FALSE;
2902 2902 nvlist_t *unsup_feat, *enabled_feat;
2903 2903
2904 2904 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2905 2905 &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
2906 2906 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2907 2907 }
2908 2908
2909 2909 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2910 2910 &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
2911 2911 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2912 2912 }
2913 2913
2914 2914 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2915 2915 &spa->spa_feat_desc_obj, B_TRUE) != 0) {
2916 2916 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2917 2917 }
2918 2918
2919 2919 enabled_feat = fnvlist_alloc();
2920 2920 unsup_feat = fnvlist_alloc();
2921 2921
2922 2922 if (!spa_features_check(spa, B_FALSE,
2923 2923 unsup_feat, enabled_feat))
2924 2924 missing_feat_read = B_TRUE;
2925 2925
2926 2926 if (spa_writeable(spa) ||
2927 2927 spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
2928 2928 if (!spa_features_check(spa, B_TRUE,
2929 2929 unsup_feat, enabled_feat)) {
2930 2930 *missing_feat_writep = B_TRUE;
2931 2931 }
2932 2932 }
2933 2933
2934 2934 fnvlist_add_nvlist(spa->spa_load_info,
2935 2935 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2936 2936
2937 2937 if (!nvlist_empty(unsup_feat)) {
2938 2938 fnvlist_add_nvlist(spa->spa_load_info,
2939 2939 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2940 2940 }
2941 2941
2942 2942 fnvlist_free(enabled_feat);
2943 2943 fnvlist_free(unsup_feat);
2944 2944
2945 2945 if (!missing_feat_read) {
2946 2946 fnvlist_add_boolean(spa->spa_load_info,
2947 2947 ZPOOL_CONFIG_CAN_RDONLY);
2948 2948 }
2949 2949
2950 2950 /*
2951 2951 * If the state is SPA_LOAD_TRYIMPORT, our objective is
2952 2952 * twofold: to determine whether the pool is available for
2953 2953 * import in read-write mode and (if it is not) whether the
2954 2954 * pool is available for import in read-only mode. If the pool
2955 2955 * is available for import in read-write mode, it is displayed
2956 2956 * as available in userland; if it is not available for import
2957 2957 * in read-only mode, it is displayed as unavailable in
2958 2958 * userland. If the pool is available for import in read-only
2959 2959 * mode but not read-write mode, it is displayed as unavailable
2960 2960 * in userland with a special note that the pool is actually
2961 2961 * available for open in read-only mode.
2962 2962 *
2963 2963 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2964 2964 * missing a feature for write, we must first determine whether
2965 2965 * the pool can be opened read-only before returning to
2966 2966 * userland in order to know whether to display the
2967 2967 * abovementioned note.
2968 2968 */
2969 2969 if (missing_feat_read || (*missing_feat_writep &&
2970 2970 spa_writeable(spa))) {
2971 2971 spa_load_failed(spa, "pool uses unsupported features");
2972 2972 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2973 2973 ENOTSUP));
2974 2974 }
2975 2975
2976 2976 /*
2977 2977 * Load refcounts for ZFS features from disk into an in-memory
2978 2978 * cache during SPA initialization.
2979 2979 */
2980 2980 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
2981 2981 uint64_t refcount;
2982 2982
2983 2983 error = feature_get_refcount_from_disk(spa,
2984 2984 &spa_feature_table[i], &refcount);
2985 2985 if (error == 0) {
2986 2986 spa->spa_feat_refcount_cache[i] = refcount;
2987 2987 } else if (error == ENOTSUP) {
2988 2988 spa->spa_feat_refcount_cache[i] =
2989 2989 SPA_FEATURE_DISABLED;
2990 2990 } else {
2991 2991 spa_load_failed(spa, "error getting refcount "
2992 2992 "for feature %s [error=%d]",
2993 2993 spa_feature_table[i].fi_guid, error);
2994 2994 return (spa_vdev_err(rvd,
2995 2995 VDEV_AUX_CORRUPT_DATA, EIO));
2996 2996 }
2997 2997 }
2998 2998 }
2999 2999
3000 3000 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
3001 3001 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
3002 3002 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
3003 3003 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3004 3004 }
3005 3005
3006 3006 return (0);
3007 3007 }
3008 3008
3009 3009 static int
3010 3010 spa_ld_load_special_directories(spa_t *spa)
3011 3011 {
3012 3012 int error = 0;
3013 3013 vdev_t *rvd = spa->spa_root_vdev;
3014 3014
3015 3015 spa->spa_is_initializing = B_TRUE;
3016 3016 error = dsl_pool_open(spa->spa_dsl_pool);
3017 3017 spa->spa_is_initializing = B_FALSE;
3018 3018 if (error != 0) {
3019 3019 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
3020 3020 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3021 3021 }
3022 3022
3023 3023 return (0);
3024 3024 }
3025 3025
3026 3026 static int
3027 3027 spa_ld_get_props(spa_t *spa)
3028 3028 {
3029 3029 int error = 0;
3030 3030 uint64_t obj;
3031 3031 vdev_t *rvd = spa->spa_root_vdev;
3032 3032
3033 3033 /* Grab the secret checksum salt from the MOS. */
3034 3034 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3035 3035 DMU_POOL_CHECKSUM_SALT, 1,
3036 3036 sizeof (spa->spa_cksum_salt.zcs_bytes),
3037 3037 spa->spa_cksum_salt.zcs_bytes);
3038 3038 if (error == ENOENT) {
3039 3039 /* Generate a new salt for subsequent use */
3040 3040 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3041 3041 sizeof (spa->spa_cksum_salt.zcs_bytes));
3042 3042 } else if (error != 0) {
3043 3043 spa_load_failed(spa, "unable to retrieve checksum salt from "
3044 3044 "MOS [error=%d]", error);
3045 3045 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3046 3046 }
3047 3047
3048 3048 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
3049 3049 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3050 3050 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
3051 3051 if (error != 0) {
3052 3052 spa_load_failed(spa, "error opening deferred-frees bpobj "
3053 3053 "[error=%d]", error);
3054 3054 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3055 3055 }
3056 3056
3057 3057 /*
3058 3058 * Load the bit that tells us to use the new accounting function
3059 3059 * (raid-z deflation). If we have an older pool, this will not
3060 3060 * be present.
3061 3061 */
3062 3062 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
3063 3063 if (error != 0 && error != ENOENT)
3064 3064 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3065 3065
3066 3066 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
3067 3067 &spa->spa_creation_version, B_FALSE);
3068 3068 if (error != 0 && error != ENOENT)
3069 3069 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3070 3070
3071 3071 /*
3072 3072 * Load the persistent error log. If we have an older pool, this will
3073 3073 * not be present.
3074 3074 */
3075 3075 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
3076 3076 B_FALSE);
3077 3077 if (error != 0 && error != ENOENT)
3078 3078 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3079 3079
3080 3080 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
3081 3081 &spa->spa_errlog_scrub, B_FALSE);
3082 3082 if (error != 0 && error != ENOENT)
3083 3083 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3084 3084
3085 3085 /*
3086 3086 * Load the history object. If we have an older pool, this
3087 3087 * will not be present.
3088 3088 */
3089 3089 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
3090 3090 if (error != 0 && error != ENOENT)
3091 3091 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3092 3092
3093 3093 /*
3094 3094 * Load the per-vdev ZAP map. If we have an older pool, this will not
3095 3095 * be present; in this case, defer its creation to a later time to
3096 3096 * avoid dirtying the MOS this early / out of sync context. See
3097 3097 * spa_sync_config_object.
3098 3098 */
3099 3099
3100 3100 /* The sentinel is only available in the MOS config. */
3101 3101 nvlist_t *mos_config;
3102 3102 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
3103 3103 spa_load_failed(spa, "unable to retrieve MOS config");
3104 3104 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3105 3105 }
3106 3106
3107 3107 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
3108 3108 &spa->spa_all_vdev_zaps, B_FALSE);
3109 3109
3110 3110 if (error == ENOENT) {
3111 3111 VERIFY(!nvlist_exists(mos_config,
3112 3112 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
3113 3113 spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
3114 3114 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3115 3115 } else if (error != 0) {
3116 3116 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3117 3117 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
3118 3118 /*
3119 3119 * An older version of ZFS overwrote the sentinel value, so
3120 3120 * we have orphaned per-vdev ZAPs in the MOS. Defer their
3121 3121 * destruction to later; see spa_sync_config_object.
3122 3122 */
3123 3123 spa->spa_avz_action = AVZ_ACTION_DESTROY;
3124 3124 /*
3125 3125 * We're assuming that no vdevs have had their ZAPs created
3126 3126 * before this. Better be sure of it.
3127 3127 */
3128 3128 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3129 3129 }
3130 3130 nvlist_free(mos_config);
3131 3131
3132 3132 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3133 3133
3134 3134 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3135 3135 B_FALSE);
3136 3136 if (error && error != ENOENT)
3137 3137 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3138 3138
3139 3139 if (error == 0) {
3140 3140 uint64_t autoreplace;
3141 3141
3142 3142 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3143 3143 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3144 3144 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3145 3145 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3146 3146 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3147 3147 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3148 3148 &spa->spa_dedup_ditto);
3149 3149
3150 3150 spa->spa_autoreplace = (autoreplace != 0);
3151 3151 }
3152 3152
3153 3153 /*
3154 3154 * If we are importing a pool with missing top-level vdevs,
3155 3155 * we enforce that the pool doesn't panic or get suspended on
3156 3156 * error since the likelihood of missing data is extremely high.
3157 3157 */
3158 3158 if (spa->spa_missing_tvds > 0 &&
3159 3159 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3160 3160 spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3161 3161 spa_load_note(spa, "forcing failmode to 'continue' "
3162 3162 "as some top level vdevs are missing");
3163 3163 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3164 3164 }
3165 3165
3166 3166 return (0);
3167 3167 }
3168 3168
3169 3169 static int
3170 3170 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3171 3171 {
3172 3172 int error = 0;
3173 3173 vdev_t *rvd = spa->spa_root_vdev;
3174 3174
3175 3175 /*
3176 3176 * If we're assembling the pool from the split-off vdevs of
3177 3177 * an existing pool, we don't want to attach the spares & cache
3178 3178 * devices.
3179 3179 */
3180 3180
3181 3181 /*
3182 3182 * Load any hot spares for this pool.
3183 3183 */
3184 3184 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3185 3185 B_FALSE);
3186 3186 if (error != 0 && error != ENOENT)
3187 3187 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3188 3188 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3189 3189 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3190 3190 if (load_nvlist(spa, spa->spa_spares.sav_object,
3191 3191 &spa->spa_spares.sav_config) != 0) {
3192 3192 spa_load_failed(spa, "error loading spares nvlist");
3193 3193 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3194 3194 }
3195 3195
3196 3196 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3197 3197 spa_load_spares(spa);
3198 3198 spa_config_exit(spa, SCL_ALL, FTAG);
3199 3199 } else if (error == 0) {
3200 3200 spa->spa_spares.sav_sync = B_TRUE;
3201 3201 }
3202 3202
3203 3203 /*
3204 3204 * Load any level 2 ARC devices for this pool.
3205 3205 */
3206 3206 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3207 3207 &spa->spa_l2cache.sav_object, B_FALSE);
3208 3208 if (error != 0 && error != ENOENT)
3209 3209 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3210 3210 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3211 3211 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3212 3212 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3213 3213 &spa->spa_l2cache.sav_config) != 0) {
3214 3214 spa_load_failed(spa, "error loading l2cache nvlist");
3215 3215 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3216 3216 }
3217 3217
3218 3218 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3219 3219 spa_load_l2cache(spa);
3220 3220 spa_config_exit(spa, SCL_ALL, FTAG);
3221 3221 } else if (error == 0) {
3222 3222 spa->spa_l2cache.sav_sync = B_TRUE;
3223 3223 }
3224 3224
3225 3225 return (0);
3226 3226 }
3227 3227
3228 3228 static int
3229 3229 spa_ld_load_vdev_metadata(spa_t *spa)
3230 3230 {
3231 3231 int error = 0;
3232 3232 vdev_t *rvd = spa->spa_root_vdev;
3233 3233
3234 3234 /*
3235 3235 * If the 'autoreplace' property is set, then post a resource notifying
3236 3236 * the ZFS DE that it should not issue any faults for unopenable
3237 3237 * devices. We also iterate over the vdevs, and post a sysevent for any
3238 3238 * unopenable vdevs so that the normal autoreplace handler can take
3239 3239 * over.
3240 3240 */
3241 3241 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3242 3242 spa_check_removed(spa->spa_root_vdev);
3243 3243 /*
3244 3244 * For the import case, this is done in spa_import(), because
3245 3245 * at this point we're using the spare definitions from
3246 3246 * the MOS config, not necessarily from the userland config.
3247 3247 */
3248 3248 if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3249 3249 spa_aux_check_removed(&spa->spa_spares);
3250 3250 spa_aux_check_removed(&spa->spa_l2cache);
3251 3251 }
3252 3252 }
3253 3253
3254 3254 /*
3255 3255 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3256 3256 */
3257 3257 error = vdev_load(rvd);
3258 3258 if (error != 0) {
3259 3259 spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3260 3260 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3261 3261 }
3262 3262
3263 3263 /*
3264 3264 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3265 3265 */
3266 3266 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3267 3267 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3268 3268 spa_config_exit(spa, SCL_ALL, FTAG);
3269 3269
3270 3270 return (0);
3271 3271 }
3272 3272
3273 3273 static int
3274 3274 spa_ld_load_dedup_tables(spa_t *spa)
3275 3275 {
3276 3276 int error = 0;
3277 3277 vdev_t *rvd = spa->spa_root_vdev;
3278 3278
3279 3279 error = ddt_load(spa);
3280 3280 if (error != 0) {
3281 3281 spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3282 3282 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3283 3283 }
3284 3284
3285 3285 return (0);
3286 3286 }
3287 3287
3288 3288 static int
3289 3289 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3290 3290 {
3291 3291 vdev_t *rvd = spa->spa_root_vdev;
3292 3292
3293 3293 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3294 3294 boolean_t missing = spa_check_logs(spa);
3295 3295 if (missing) {
3296 3296 if (spa->spa_missing_tvds != 0) {
3297 3297 spa_load_note(spa, "spa_check_logs failed "
3298 3298 "so dropping the logs");
3299 3299 } else {
3300 3300 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3301 3301 spa_load_failed(spa, "spa_check_logs failed");
3302 3302 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3303 3303 ENXIO));
3304 3304 }
3305 3305 }
3306 3306 }
3307 3307
3308 3308 return (0);
3309 3309 }
3310 3310
3311 3311 static int
3312 3312 spa_ld_verify_pool_data(spa_t *spa)
3313 3313 {
3314 3314 int error = 0;
3315 3315 vdev_t *rvd = spa->spa_root_vdev;
3316 3316
3317 3317 /*
3318 3318 * We've successfully opened the pool, verify that we're ready
3319 3319 * to start pushing transactions.
3320 3320 */
3321 3321 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3322 3322 error = spa_load_verify(spa);
3323 3323 if (error != 0) {
3324 3324 spa_load_failed(spa, "spa_load_verify failed "
3325 3325 "[error=%d]", error);
3326 3326 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3327 3327 error));
3328 3328 }
3329 3329 }
3330 3330
3331 3331 return (0);
3332 3332 }
3333 3333
3334 3334 static void
3335 3335 spa_ld_claim_log_blocks(spa_t *spa)
3336 3336 {
3337 3337 dmu_tx_t *tx;
3338 3338 dsl_pool_t *dp = spa_get_dsl(spa);
3339 3339
3340 3340 /*
3341 3341 * Claim log blocks that haven't been committed yet.
3342 3342 * This must all happen in a single txg.
3343 3343 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3344 3344 * invoked from zil_claim_log_block()'s i/o done callback.
3345 3345 * Price of rollback is that we abandon the log.
3346 3346 */
3347 3347 spa->spa_claiming = B_TRUE;
3348 3348
3349 3349 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3350 3350 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3351 3351 zil_claim, tx, DS_FIND_CHILDREN);
3352 3352 dmu_tx_commit(tx);
3353 3353
3354 3354 spa->spa_claiming = B_FALSE;
3355 3355
3356 3356 spa_set_log_state(spa, SPA_LOG_GOOD);
3357 3357 }
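The comment above relies on spa_claim_max_txg being raised from the claim I/O path. As a rough sketch (the real notification lives in spa_misc.c; the helper name and lock choice here are illustrative only), the idea is simply to remember the highest birth txg seen among successfully claimed log blocks so the later txg_wait_synced() covers every claim:

/* Sketch: track the highest birth txg among claimed log blocks. */
static void
spa_claim_notify_sketch(zio_t *zio)
{
        spa_t *spa = zio->io_spa;

        if (zio->io_error != 0)
                return;

        mutex_enter(&spa->spa_props_lock);      /* any pool-wide lock will do */
        if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
                spa->spa_claim_max_txg = zio->io_bp->blk_birth;
        mutex_exit(&spa->spa_props_lock);
}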
3358 3358
3359 3359 static void
3360 3360 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3361 3361 boolean_t update_config_cache)
3362 3362 {
3363 3363 vdev_t *rvd = spa->spa_root_vdev;
3364 3364 int need_update = B_FALSE;
3365 3365
3366 3366 /*
3367 3367 * If the config cache is stale, or we have uninitialized
3368 3368 * metaslabs (see spa_vdev_add()), then update the config.
3369 3369 *
3370 3370 * If this is a verbatim import, trust the current
3371 3371 * in-core spa_config and update the disk labels.
3372 3372 */
3373 3373 if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
3374 3374 spa->spa_load_state == SPA_LOAD_IMPORT ||
3375 3375 spa->spa_load_state == SPA_LOAD_RECOVER ||
3376 3376 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3377 3377 need_update = B_TRUE;
3378 3378
3379 3379 for (int c = 0; c < rvd->vdev_children; c++)
3380 3380 if (rvd->vdev_child[c]->vdev_ms_array == 0)
3381 3381 need_update = B_TRUE;
3382 3382
3383 3383 /*
3384 3384	 * Update the config cache asynchronously in case we're the
3385 3385 * root pool, in which case the config cache isn't writable yet.
3386 3386 */
3387 3387 if (need_update)
3388 3388 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3389 3389 }
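For context, SPA_ASYNC_CONFIG_UPDATE is only a deferred request; a minimal sketch of what queuing an async task amounts to (the async thread services the bit later; the helper name here is hypothetical):

/* Sketch: record a task bit for the pool's async thread to pick up. */
static void
spa_async_request_sketch(spa_t *spa, int task)
{
        mutex_enter(&spa->spa_async_lock);
        spa->spa_async_tasks |= task;   /* e.g. SPA_ASYNC_CONFIG_UPDATE */
        mutex_exit(&spa->spa_async_lock);
}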
3390 3390
3391 3391 static void
3392 3392 spa_ld_prepare_for_reload(spa_t *spa)
3393 3393 {
3394 3394 int mode = spa->spa_mode;
3395 3395 int async_suspended = spa->spa_async_suspended;
3396 3396
3397 3397 spa_unload(spa);
3398 3398 spa_deactivate(spa);
3399 3399 spa_activate(spa, mode);
3400 3400
3401 3401 /*
3402 3402 * We save the value of spa_async_suspended as it gets reset to 0 by
3403 3403 * spa_unload(). We want to restore it back to the original value before
3404 3404 * returning as we might be calling spa_async_resume() later.
3405 3405 */
3406 3406 spa->spa_async_suspended = async_suspended;
3407 3407 }
3408 3408
3409 3409 static int
3410 3410 spa_ld_read_checkpoint_txg(spa_t *spa)
3411 3411 {
3412 3412 uberblock_t checkpoint;
3413 3413 int error = 0;
3414 3414
3415 3415 ASSERT0(spa->spa_checkpoint_txg);
3416 3416 ASSERT(MUTEX_HELD(&spa_namespace_lock));
3417 3417
3418 3418 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3419 3419 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3420 3420 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3421 3421
3422 3422 if (error == ENOENT)
3423 3423 return (0);
3424 3424
3425 3425 if (error != 0)
3426 3426 return (error);
3427 3427
3428 3428 ASSERT3U(checkpoint.ub_txg, !=, 0);
3429 3429 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
3430 3430 ASSERT3U(checkpoint.ub_timestamp, !=, 0);
3431 3431 spa->spa_checkpoint_txg = checkpoint.ub_txg;
3432 3432 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
3433 3433
3434 3434 return (0);
3435 3435 }
3436 3436
3437 3437 static int
3438 3438 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
3439 3439 {
3440 3440 int error = 0;
3441 3441
3442 3442 ASSERT(MUTEX_HELD(&spa_namespace_lock));
3443 3443 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3444 3444
3445 3445 /*
3446 3446 * Never trust the config that is provided unless we are assembling
3447 3447 * a pool following a split.
3448 3448 * This means don't trust blkptrs and the vdev tree in general. This
3449 3449 * also effectively puts the spa in read-only mode since
3450 3450 * spa_writeable() checks for spa_trust_config to be true.
3451 3451 * We will later load a trusted config from the MOS.
3452 3452 */
3453 3453 if (type != SPA_IMPORT_ASSEMBLE)
3454 3454 spa->spa_trust_config = B_FALSE;
3455 3455
3456 3456 /*
3457 3457 * Parse the config provided to create a vdev tree.
3458 3458 */
3459 3459 error = spa_ld_parse_config(spa, type);
3460 3460 if (error != 0)
3461 3461 return (error);
3462 3462
3463 3463 /*
3464 3464 * Now that we have the vdev tree, try to open each vdev. This involves
3465 3465 * opening the underlying physical device, retrieving its geometry and
3466 3466 * probing the vdev with a dummy I/O. The state of each vdev will be set
3467 3467 * based on the success of those operations. After this we'll be ready
3468 3468 * to read from the vdevs.
3469 3469 */
3470 3470 error = spa_ld_open_vdevs(spa);
3471 3471 if (error != 0)
3472 3472 return (error);
3473 3473
3474 3474 /*
3475 3475 * Read the label of each vdev and make sure that the GUIDs stored
3476 3476 * there match the GUIDs in the config provided.
3477 3477 * If we're assembling a new pool that's been split off from an
3478 3478 * existing pool, the labels haven't yet been updated so we skip
3479 3479 * validation for now.
3480 3480 */
3481 3481 if (type != SPA_IMPORT_ASSEMBLE) {
3482 3482 error = spa_ld_validate_vdevs(spa);
3483 3483 if (error != 0)
3484 3484 return (error);
3485 3485 }
3486 3486
3487 3487 /*
3488 3488 * Read all vdev labels to find the best uberblock (i.e. latest,
3489 3489 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
3490 3490 * get the list of features required to read blkptrs in the MOS from
3491 3491 * the vdev label with the best uberblock and verify that our version
3492 3492 * of zfs supports them all.
3493 3493 */
3494 3494 error = spa_ld_select_uberblock(spa, type);
3495 3495 if (error != 0)
3496 3496 return (error);
3497 3497
3498 3498 /*
3499 3499 * Pass that uberblock to the dsl_pool layer which will open the root
3500 3500 * blkptr. This blkptr points to the latest version of the MOS and will
3501 3501 * allow us to read its contents.
3502 3502 */
3503 3503 error = spa_ld_open_rootbp(spa);
3504 3504 if (error != 0)
3505 3505 return (error);
3506 3506
3507 3507 return (0);
3508 3508 }
3509 3509
3510 3510 static int
3511 3511 spa_ld_checkpoint_rewind(spa_t *spa)
3512 3512 {
3513 3513 uberblock_t checkpoint;
3514 3514 int error = 0;
3515 3515
3516 3516 ASSERT(MUTEX_HELD(&spa_namespace_lock));
3517 3517 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3518 3518
3519 3519 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3520 3520 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3521 3521 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3522 3522
3523 3523 if (error != 0) {
3524 3524 spa_load_failed(spa, "unable to retrieve checkpointed "
3525 3525 "uberblock from the MOS config [error=%d]", error);
3526 3526
3527 3527 if (error == ENOENT)
3528 3528 error = ZFS_ERR_NO_CHECKPOINT;
3529 3529
3530 3530 return (error);
3531 3531 }
3532 3532
3533 3533 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
3534 3534 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
3535 3535
3536 3536 /*
3537 3537 * We need to update the txg and timestamp of the checkpointed
3538 3538 * uberblock to be higher than the latest one. This ensures that
3539 3539 * the checkpointed uberblock is selected if we were to close and
3540 3540 * reopen the pool right after we've written it in the vdev labels.
3541 3541 * (also see block comment in vdev_uberblock_compare)
3542 3542 */
3543 3543 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
3544 3544 checkpoint.ub_timestamp = gethrestime_sec();
3545 3545
3546 3546 /*
3547 3547 * Set current uberblock to be the checkpointed uberblock.
3548 3548 */
3549 3549 spa->spa_uberblock = checkpoint;
3550 3550
3551 3551 /*
3552 3552 * If we are doing a normal rewind, then the pool is open for
3553 3553 * writing and we sync the "updated" checkpointed uberblock to
3554 3554 * disk. Once this is done, we've basically rewound the whole
3555 3555 * pool and there is no way back.
3556 3556 *
3557 3557	 * There are cases when we don't want to attempt to sync the
3558 3558 * checkpointed uberblock to disk because we are opening a
3559 3559 * pool as read-only. Specifically, verifying the checkpointed
3560 3560 * state with zdb, and importing the checkpointed state to get
3561 3561 * a "preview" of its content.
3562 3562 */
3563 3563 if (spa_writeable(spa)) {
3564 3564 vdev_t *rvd = spa->spa_root_vdev;
3565 3565
3566 3566 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3567 3567 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
3568 3568 int svdcount = 0;
3569 3569 int children = rvd->vdev_children;
3570 3570 int c0 = spa_get_random(children);
3571 3571
3572 3572 for (int c = 0; c < children; c++) {
3573 3573 vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
3574 3574
3575 3575 /* Stop when revisiting the first vdev */
3576 3576 if (c > 0 && svd[0] == vd)
3577 3577 break;
3578 3578
3579 3579 if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
3580 3580 !vdev_is_concrete(vd))
3581 3581 continue;
3582 3582
3583 3583 svd[svdcount++] = vd;
3584 3584 if (svdcount == SPA_SYNC_MIN_VDEVS)
3585 3585 break;
3586 3586 }
3587 3587 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
3588 3588 if (error == 0)
3589 3589 spa->spa_last_synced_guid = rvd->vdev_guid;
3590 3590 spa_config_exit(spa, SCL_ALL, FTAG);
3591 3591
3592 3592 if (error != 0) {
3593 3593 spa_load_failed(spa, "failed to write checkpointed "
3594 3594 "uberblock to the vdev labels [error=%d]", error);
3595 3595 return (error);
3596 3596 }
3597 3597 }
3598 3598
3599 3599 return (0);
3600 3600 }
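The txg/timestamp bump above matters because of how competing uberblocks are ordered at import time. A simplified sketch of that ordering, assuming only the txg and timestamp rules mentioned in the comment (see vdev_uberblock_compare() for the authoritative version):

/* Sketch: higher txg wins; on a txg tie, the newer timestamp wins. */
static int
uberblock_compare_sketch(const uberblock_t *u1, const uberblock_t *u2)
{
        if (u1->ub_txg != u2->ub_txg)
                return (u1->ub_txg < u2->ub_txg ? -1 : 1);
        if (u1->ub_timestamp != u2->ub_timestamp)
                return (u1->ub_timestamp < u2->ub_timestamp ? -1 : 1);
        return (0);
}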
3601 3601
3602 3602 static int
3603 3603 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
3604 3604 boolean_t *update_config_cache)
3605 3605 {
3606 3606 int error;
3607 3607
3608 3608 /*
3609 3609 * Parse the config for pool, open and validate vdevs,
3610 3610 * select an uberblock, and use that uberblock to open
3611 3611 * the MOS.
3612 3612 */
3613 3613 error = spa_ld_mos_init(spa, type);
3614 3614 if (error != 0)
3615 3615 return (error);
3616 3616
3617 3617 /*
3618 3618 * Retrieve the trusted config stored in the MOS and use it to create
3619 3619 * a new, exact version of the vdev tree, then reopen all vdevs.
3620 3620 */
3621 3621 error = spa_ld_trusted_config(spa, type, B_FALSE);
3622 3622 if (error == EAGAIN) {
3623 3623 if (update_config_cache != NULL)
3624 3624 *update_config_cache = B_TRUE;
3625 3625
3626 3626 /*
3627 3627 * Redo the loading process with the trusted config if it is
3628 3628 * too different from the untrusted config.
3629 3629 */
3630 3630 spa_ld_prepare_for_reload(spa);
3631 3631 spa_load_note(spa, "RELOADING");
3632 3632 error = spa_ld_mos_init(spa, type);
3633 3633 if (error != 0)
3634 3634 return (error);
3635 3635
3636 3636 error = spa_ld_trusted_config(spa, type, B_TRUE);
3637 3637 if (error != 0)
3638 3638 return (error);
3639 3639
3640 3640 } else if (error != 0) {
3641 3641 return (error);
3642 3642 }
3643 3643
3644 3644 return (0);
3645 3645 }
3646 3646
3647 3647 /*
3648 3648 * Load an existing storage pool, using the config provided. This config
3649 3649 * describes which vdevs are part of the pool and is later validated against
3650 3650 * partial configs present in each vdev's label and an entire copy of the
3651 3651 * config stored in the MOS.
3652 3652 */
3653 3653 static int
3654 3654 spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
3655 3655 {
3656 3656 int error = 0;
3657 3657 boolean_t missing_feat_write = B_FALSE;
3658 3658 boolean_t checkpoint_rewind =
3659 3659 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3660 3660 boolean_t update_config_cache = B_FALSE;
3661 3661
3662 3662 ASSERT(MUTEX_HELD(&spa_namespace_lock));
3663 3663 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3664 3664
3665 3665 spa_load_note(spa, "LOADING");
3666 3666
3667 3667 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
3668 3668 if (error != 0)
3669 3669 return (error);
3670 3670
3671 3671 /*
3672 3672 * If we are rewinding to the checkpoint then we need to repeat
3673 3673 * everything we've done so far in this function but this time
3674 3674 * selecting the checkpointed uberblock and using that to open
3675 3675 * the MOS.
3676 3676 */
3677 3677 if (checkpoint_rewind) {
3678 3678 /*
3679 3679	 * If we are rewinding to the checkpoint, update the config
3680 3680	 * cache anyway.
3681 3681 */
3682 3682 update_config_cache = B_TRUE;
3683 3683
3684 3684 /*
3685 3685 * Extract the checkpointed uberblock from the current MOS
3686 3686 * and use this as the pool's uberblock from now on. If the
3687 3687 * pool is imported as writeable we also write the checkpoint
3688 3688 * uberblock to the labels, making the rewind permanent.
3689 3689 */
3690 3690 error = spa_ld_checkpoint_rewind(spa);
3691 3691 if (error != 0)
3692 3692 return (error);
3693 3693
3694 3694 /*
3695 3695	 * Redo the loading process again with the
3696 3696 * checkpointed uberblock.
3697 3697 */
3698 3698 spa_ld_prepare_for_reload(spa);
3699 3699 spa_load_note(spa, "LOADING checkpointed uberblock");
3700 3700 error = spa_ld_mos_with_trusted_config(spa, type, NULL);
3701 3701 if (error != 0)
3702 3702 return (error);
3703 3703 }
3704 3704
3705 3705 /*
3706 3706 * Retrieve the checkpoint txg if the pool has a checkpoint.
3707 3707 */
3708 3708 error = spa_ld_read_checkpoint_txg(spa);
3709 3709 if (error != 0)
3710 3710 return (error);
3711 3711
3712 3712 /*
3713 3713 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
3714 3714 * from the pool and their contents were re-mapped to other vdevs. Note
3715 3715 * that everything that we read before this step must have been
3716 3716 * rewritten on concrete vdevs after the last device removal was
3717 3717 * initiated. Otherwise we could be reading from indirect vdevs before
3718 3718 * we have loaded their mappings.
3719 3719 */
3720 3720 error = spa_ld_open_indirect_vdev_metadata(spa);
3721 3721 if (error != 0)
3722 3722 return (error);
3723 3723
3724 3724 /*
3725 3725 * Retrieve the full list of active features from the MOS and check if
3726 3726 * they are all supported.
3727 3727 */
3728 3728 error = spa_ld_check_features(spa, &missing_feat_write);
3729 3729 if (error != 0)
3730 3730 return (error);
3731 3731
3732 3732 /*
3733 3733 * Load several special directories from the MOS needed by the dsl_pool
3734 3734 * layer.
3735 3735 */
3736 3736 error = spa_ld_load_special_directories(spa);
3737 3737 if (error != 0)
3738 3738 return (error);
3739 3739
3740 3740 /*
3741 3741 * Retrieve pool properties from the MOS.
3742 3742 */
3743 3743 error = spa_ld_get_props(spa);
3744 3744 if (error != 0)
3745 3745 return (error);
3746 3746
3747 3747 /*
3748 3748 * Retrieve the list of auxiliary devices - cache devices and spares -
3749 3749 * and open them.
3750 3750 */
3751 3751 error = spa_ld_open_aux_vdevs(spa, type);
3752 3752 if (error != 0)
3753 3753 return (error);
3754 3754
3755 3755 /*
3756 3756 * Load the metadata for all vdevs. Also check if unopenable devices
3757 3757 * should be autoreplaced.
3758 3758 */
3759 3759 error = spa_ld_load_vdev_metadata(spa);
3760 3760 if (error != 0)
3761 3761 return (error);
3762 3762
3763 3763 error = spa_ld_load_dedup_tables(spa);
3764 3764 if (error != 0)
3765 3765 return (error);
3766 3766
3767 3767 /*
3768 3768 * Verify the logs now to make sure we don't have any unexpected errors
3769 3769 * when we claim log blocks later.
3770 3770 */
3771 3771 error = spa_ld_verify_logs(spa, type, ereport);
3772 3772 if (error != 0)
3773 3773 return (error);
3774 3774
3775 3775 if (missing_feat_write) {
3776 3776 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
3777 3777
3778 3778 /*
3779 3779 * At this point, we know that we can open the pool in
3780 3780 * read-only mode but not read-write mode. We now have enough
3781 3781 * information and can return to userland.
3782 3782 */
3783 3783 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
3784 3784 ENOTSUP));
3785 3785 }
3786 3786
3787 3787 /*
3788 3788 * Traverse the last txgs to make sure the pool was left off in a safe
3789 3789 * state. When performing an extreme rewind, we verify the whole pool,
3790 3790 * which can take a very long time.
3791 3791 */
3792 3792 error = spa_ld_verify_pool_data(spa);
3793 3793 if (error != 0)
3794 3794 return (error);
3795 3795
3796 3796 /*
3797 3797 * Calculate the deflated space for the pool. This must be done before
3798 3798 * we write anything to the pool because we'd need to update the space
3799 3799 * accounting using the deflated sizes.
3800 3800 */
3801 3801 spa_update_dspace(spa);
3802 3802
3803 3803 /*
3804 3804 * We have now retrieved all the information we needed to open the
3805 3805 * pool. If we are importing the pool in read-write mode, a few
3806 3806 * additional steps must be performed to finish the import.
3807 3807 */
3808 3808 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
3809 3809 spa->spa_load_max_txg == UINT64_MAX)) {
3810 3810 uint64_t config_cache_txg = spa->spa_config_txg;
3811 3811
3812 3812 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
3813 3813
3814 3814 /*
3815 3815 * In case of a checkpoint rewind, log the original txg
3816 3816 * of the checkpointed uberblock.
3817 3817 */
3818 3818 if (checkpoint_rewind) {
3819 3819 spa_history_log_internal(spa, "checkpoint rewind",
3820 3820 NULL, "rewound state to txg=%llu",
3821 3821 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
3822 3822 }
3823 3823
3824 3824 /*
3825 3825 * Traverse the ZIL and claim all blocks.
3826 3826 */
3827 3827 spa_ld_claim_log_blocks(spa);
3828 3828
3829 3829 /*
3830 3830 * Kick-off the syncing thread.
3831 3831 */
3832 3832 spa->spa_sync_on = B_TRUE;
3833 3833 txg_sync_start(spa->spa_dsl_pool);
3834 3834
3835 3835 /*
3836 3836 * Wait for all claims to sync. We sync up to the highest
3837 3837 * claimed log block birth time so that claimed log blocks
3838 3838 * don't appear to be from the future. spa_claim_max_txg
3839 3839 * will have been set for us by ZIL traversal operations
3840 3840 * performed above.
3841 3841 */
3842 3842 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
3843 3843
3844 3844 /*
3845 3845 * Check if we need to request an update of the config. On the
3846 3846 * next sync, we would update the config stored in vdev labels
3847 3847 * and the cachefile (by default /etc/zfs/zpool.cache).
3848 3848 */
3849 3849 spa_ld_check_for_config_update(spa, config_cache_txg,
3850 3850 update_config_cache);
3851 3851
3852 3852 /*
3853 3853 * Check all DTLs to see if anything needs resilvering.
3854 3854 */
3855 3855 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
3856 3856 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
3857 3857 spa_async_request(spa, SPA_ASYNC_RESILVER);
3858 3858
3859 3859 /*
3860 3860 * Log the fact that we booted up (so that we can detect if
3861 3861 * we rebooted in the middle of an operation).
3862 3862 */
3863 3863 spa_history_log_version(spa, "open");
3864 3864
3865 3865 spa_restart_removal(spa);
3866 3866 spa_spawn_aux_threads(spa);
3867 3867
3868 3868 /*
3869 3869 * Delete any inconsistent datasets.
3870 3870 *
3871 3871 * Note:
3872 3872 * Since we may be issuing deletes for clones here,
3873 3873 * we make sure to do so after we've spawned all the
3874 3874	 * auxiliary threads above (of which the livelist
3875 3875	 * deletion zthr is one).
3876 3876 */
3877 3877 (void) dmu_objset_find(spa_name(spa),
3878 3878 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
3879 3879
3880 3880 /*
3881 3881 * Clean up any stale temporary dataset userrefs.
3882 3882 */
3883 3883 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
3884 3884
3885 3885 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3886 3886 vdev_initialize_restart(spa->spa_root_vdev);
3887 3887 spa_config_exit(spa, SCL_CONFIG, FTAG);
3888 3888 }
3889 3889
3890 3890 spa_load_note(spa, "LOADED");
3891 3891
3892 3892 return (0);
3893 3893 }
3894 3894
3895 3895 static int
3896 3896 spa_load_retry(spa_t *spa, spa_load_state_t state)
3897 3897 {
3898 3898 int mode = spa->spa_mode;
3899 3899
3900 3900 spa_unload(spa);
3901 3901 spa_deactivate(spa);
3902 3902
3903 3903 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
3904 3904
3905 3905 spa_activate(spa, mode);
3906 3906 spa_async_suspend(spa);
3907 3907
3908 3908 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
3909 3909 (u_longlong_t)spa->spa_load_max_txg);
3910 3910
3911 3911 return (spa_load(spa, state, SPA_IMPORT_EXISTING));
3912 3912 }
3913 3913
3914 3914 /*
3915 3915 * If spa_load() fails this function will try loading prior txg's. If
3916 3916 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3917 3917 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3918 3918 * function will not rewind the pool and will return the same error as
3919 3919 * spa_load().
3920 3920 */
3921 3921 static int
3922 3922 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
3923 3923 int rewind_flags)
3924 3924 {
3925 3925 nvlist_t *loadinfo = NULL;
3926 3926 nvlist_t *config = NULL;
3927 3927 int load_error, rewind_error;
3928 3928 uint64_t safe_rewind_txg;
3929 3929 uint64_t min_txg;
3930 3930
3931 3931 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
3932 3932 spa->spa_load_max_txg = spa->spa_load_txg;
3933 3933 spa_set_log_state(spa, SPA_LOG_CLEAR);
3934 3934 } else {
3935 3935 spa->spa_load_max_txg = max_request;
3936 3936 if (max_request != UINT64_MAX)
3937 3937 spa->spa_extreme_rewind = B_TRUE;
3938 3938 }
3939 3939
3940 3940 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
3941 3941 if (load_error == 0)
3942 3942 return (0);
3943 3943 if (load_error == ZFS_ERR_NO_CHECKPOINT) {
3944 3944 /*
3945 3945 * When attempting checkpoint-rewind on a pool with no
3946 3946 * checkpoint, we should not attempt to load uberblocks
3947 3947 * from previous txgs when spa_load fails.
3948 3948 */
3949 3949 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3950 3950 return (load_error);
3951 3951 }
3952 3952
3953 3953 if (spa->spa_root_vdev != NULL)
3954 3954 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3955 3955
3956 3956 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
3957 3957 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
3958 3958
3959 3959 if (rewind_flags & ZPOOL_NEVER_REWIND) {
3960 3960 nvlist_free(config);
3961 3961 return (load_error);
3962 3962 }
3963 3963
3964 3964 if (state == SPA_LOAD_RECOVER) {
3965 3965 /* Price of rolling back is discarding txgs, including log */
3966 3966 spa_set_log_state(spa, SPA_LOG_CLEAR);
3967 3967 } else {
3968 3968 /*
3969 3969 * If we aren't rolling back save the load info from our first
3970 3970 * import attempt so that we can restore it after attempting
3971 3971 * to rewind.
3972 3972 */
3973 3973 loadinfo = spa->spa_load_info;
3974 3974 spa->spa_load_info = fnvlist_alloc();
3975 3975 }
3976 3976
3977 3977 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
3978 3978 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
3979 3979 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3980 3980 TXG_INITIAL : safe_rewind_txg;
3981 3981
3982 3982 /*
3983 3983 * Continue as long as we're finding errors, we're still within
3984 3984 * the acceptable rewind range, and we're still finding uberblocks
3985 3985 */
3986 3986 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3987 3987 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3988 3988 if (spa->spa_load_max_txg < safe_rewind_txg)
3989 3989 spa->spa_extreme_rewind = B_TRUE;
3990 3990 rewind_error = spa_load_retry(spa, state);
3991 3991 }
3992 3992
3993 3993 spa->spa_extreme_rewind = B_FALSE;
3994 3994 spa->spa_load_max_txg = UINT64_MAX;
3995 3995
3996 3996 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3997 3997 spa_config_set(spa, config);
3998 3998 else
3999 3999 nvlist_free(config);
4000 4000
4001 4001 if (state == SPA_LOAD_RECOVER) {
4002 4002 ASSERT3P(loadinfo, ==, NULL);
4003 4003 return (rewind_error);
4004 4004 } else {
4005 4005 /* Store the rewind info as part of the initial load info */
4006 4006 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
4007 4007 spa->spa_load_info);
4008 4008
4009 4009 /* Restore the initial load info */
4010 4010 fnvlist_free(spa->spa_load_info);
4011 4011 spa->spa_load_info = loadinfo;
4012 4012
4013 4013 return (load_error);
4014 4014 }
4015 4015 }
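To make the rewind window above concrete, here is a small hypothetical helper computing the oldest txg spa_load_best() is willing to step back to: with a last synced txg of 1000 and TXG_DEFER_SIZE of 2, a normal rewind stops at txg 998, while ZPOOL_EXTREME_REWIND allows stepping all the way back to TXG_INITIAL.

/* Sketch (hypothetical helper): lower bound of the rewind search. */
static uint64_t
rewind_floor_sketch(uint64_t last_ubsync_txg, int rewind_flags)
{
        uint64_t safe_rewind_txg = last_ubsync_txg - TXG_DEFER_SIZE;

        return ((rewind_flags & ZPOOL_EXTREME_REWIND) ?
            TXG_INITIAL : safe_rewind_txg);
}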
4016 4016
4017 4017 /*
4018 4018 * Pool Open/Import
4019 4019 *
4020 4020 * The import case is identical to an open except that the configuration is sent
4021 4021 * down from userland, instead of grabbed from the configuration cache. For the
4022 4022 * case of an open, the pool configuration will exist in the
4023 4023 * POOL_STATE_UNINITIALIZED state.
4024 4024 *
4025 4025 * The stats information (gen/count/ustats) is used to gather vdev statistics at
4026 4026 * the same time we open the pool, without having to keep around the spa_t in some
4027 4027 * ambiguous state.
4028 4028 */
4029 4029 static int
4030 4030 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
4031 4031 nvlist_t **config)
4032 4032 {
4033 4033 spa_t *spa;
4034 4034 spa_load_state_t state = SPA_LOAD_OPEN;
4035 4035 int error;
4036 4036 int locked = B_FALSE;
4037 4037
4038 4038 *spapp = NULL;
4039 4039
4040 4040 /*
4041 4041 * As disgusting as this is, we need to support recursive calls to this
4042 4042 * function because dsl_dir_open() is called during spa_load(), and ends
4043 4043 * up calling spa_open() again. The real fix is to figure out how to
4044 4044 * avoid dsl_dir_open() calling this in the first place.
4045 4045 */
4046 4046 if (mutex_owner(&spa_namespace_lock) != curthread) {
4047 4047 mutex_enter(&spa_namespace_lock);
4048 4048 locked = B_TRUE;
4049 4049 }
4050 4050
4051 4051 if ((spa = spa_lookup(pool)) == NULL) {
4052 4052 if (locked)
4053 4053 mutex_exit(&spa_namespace_lock);
4054 4054 return (SET_ERROR(ENOENT));
4055 4055 }
4056 4056
4057 4057 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
4058 4058 zpool_load_policy_t policy;
4059 4059
4060 4060 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
4061 4061 &policy);
4062 4062 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
4063 4063 state = SPA_LOAD_RECOVER;
4064 4064
4065 4065 spa_activate(spa, spa_mode_global);
4066 4066
4067 4067 if (state != SPA_LOAD_RECOVER)
4068 4068 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4069 4069 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4070 4070
4071 4071 zfs_dbgmsg("spa_open_common: opening %s", pool);
4072 4072 error = spa_load_best(spa, state, policy.zlp_txg,
4073 4073 policy.zlp_rewind);
4074 4074
4075 4075 if (error == EBADF) {
4076 4076 /*
4077 4077			 * EBADF), it means that one of the vdevs indicates
4078 4078 * EBADF), it indicates that one of the vdevs indicates
4079 4079 * that the pool has been exported or destroyed. If
4080 4080 * this is the case, the config cache is out of sync and
4081 4081 * we should remove the pool from the namespace.
4082 4082 */
4083 4083 spa_unload(spa);
4084 4084 spa_deactivate(spa);
4085 4085 spa_write_cachefile(spa, B_TRUE, B_TRUE);
4086 4086 spa_remove(spa);
4087 4087 if (locked)
4088 4088 mutex_exit(&spa_namespace_lock);
4089 4089 return (SET_ERROR(ENOENT));
4090 4090 }
4091 4091
4092 4092 if (error) {
4093 4093 /*
4094 4094 * We can't open the pool, but we still have useful
4095 4095 * information: the state of each vdev after the
4096 4096 * attempted vdev_open(). Return this to the user.
4097 4097 */
4098 4098 if (config != NULL && spa->spa_config) {
4099 4099 VERIFY(nvlist_dup(spa->spa_config, config,
4100 4100 KM_SLEEP) == 0);
4101 4101 VERIFY(nvlist_add_nvlist(*config,
4102 4102 ZPOOL_CONFIG_LOAD_INFO,
4103 4103 spa->spa_load_info) == 0);
4104 4104 }
4105 4105 spa_unload(spa);
4106 4106 spa_deactivate(spa);
4107 4107 spa->spa_last_open_failed = error;
4108 4108 if (locked)
4109 4109 mutex_exit(&spa_namespace_lock);
4110 4110 *spapp = NULL;
4111 4111 return (error);
4112 4112 }
4113 4113 }
4114 4114
4115 4115 spa_open_ref(spa, tag);
4116 4116
4117 4117 if (config != NULL)
4118 4118 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4119 4119
4120 4120 /*
4121 4121 * If we've recovered the pool, pass back any information we
4122 4122 * gathered while doing the load.
4123 4123 */
4124 4124 if (state == SPA_LOAD_RECOVER) {
4125 4125 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
4126 4126 spa->spa_load_info) == 0);
4127 4127 }
4128 4128
4129 4129 if (locked) {
4130 4130 spa->spa_last_open_failed = 0;
4131 4131 spa->spa_last_ubsync_txg = 0;
4132 4132 spa->spa_load_txg = 0;
4133 4133 mutex_exit(&spa_namespace_lock);
4134 4134 }
4135 4135
4136 4136 *spapp = spa;
4137 4137
4138 4138 return (0);
4139 4139 }
4140 4140
4141 4141 int
4142 4142 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
4143 4143 nvlist_t **config)
4144 4144 {
4145 4145 return (spa_open_common(name, spapp, tag, policy, config));
4146 4146 }
4147 4147
4148 4148 int
4149 4149 spa_open(const char *name, spa_t **spapp, void *tag)
4150 4150 {
4151 4151 return (spa_open_common(name, spapp, tag, NULL, NULL));
4152 4152 }
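A minimal, hypothetical consumer of these wrappers, showing the open/use/close pairing that in-kernel callers follow (the function name is made up; error handling trimmed):

/* Sketch: look up a pool by name, hold a reference, then drop it. */
static int
example_touch_pool(const char *name)
{
        spa_t *spa;
        int error;

        if ((error = spa_open(name, &spa, FTAG)) != 0)
                return (error);

        /* ... use the pool while the open reference pins it ... */

        spa_close(spa, FTAG);
        return (0);
}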
4153 4153
4154 4154 /*
4155 4155 * Lookup the given spa_t, incrementing the inject count in the process,
4156 4156 * preventing it from being exported or destroyed.
4157 4157 */
4158 4158 spa_t *
4159 4159 spa_inject_addref(char *name)
4160 4160 {
4161 4161 spa_t *spa;
4162 4162
4163 4163 mutex_enter(&spa_namespace_lock);
4164 4164 if ((spa = spa_lookup(name)) == NULL) {
4165 4165 mutex_exit(&spa_namespace_lock);
4166 4166 return (NULL);
4167 4167 }
4168 4168 spa->spa_inject_ref++;
4169 4169 mutex_exit(&spa_namespace_lock);
4170 4170
4171 4171 return (spa);
4172 4172 }
4173 4173
4174 4174 void
4175 4175 spa_inject_delref(spa_t *spa)
4176 4176 {
4177 4177 mutex_enter(&spa_namespace_lock);
4178 4178 spa->spa_inject_ref--;
4179 4179 mutex_exit(&spa_namespace_lock);
4180 4180 }
4181 4181
4182 4182 /*
4183 4183 * Add spares device information to the nvlist.
4184 4184 */
4185 4185 static void
4186 4186 spa_add_spares(spa_t *spa, nvlist_t *config)
4187 4187 {
4188 4188 nvlist_t **spares;
4189 4189 uint_t i, nspares;
4190 4190 nvlist_t *nvroot;
4191 4191 uint64_t guid;
4192 4192 vdev_stat_t *vs;
4193 4193 uint_t vsc;
4194 4194 uint64_t pool;
4195 4195
4196 4196 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4197 4197
4198 4198 if (spa->spa_spares.sav_count == 0)
4199 4199 return;
4200 4200
4201 4201 VERIFY(nvlist_lookup_nvlist(config,
4202 4202 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4203 4203 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4204 4204 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4205 4205 if (nspares != 0) {
4206 4206 VERIFY(nvlist_add_nvlist_array(nvroot,
4207 4207 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4208 4208 VERIFY(nvlist_lookup_nvlist_array(nvroot,
4209 4209 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4210 4210
4211 4211 /*
4212 4212 * Go through and find any spares which have since been
4213 4213 * repurposed as an active spare. If this is the case, update
4214 4214 * their status appropriately.
4215 4215 */
4216 4216 for (i = 0; i < nspares; i++) {
4217 4217 VERIFY(nvlist_lookup_uint64(spares[i],
4218 4218 ZPOOL_CONFIG_GUID, &guid) == 0);
4219 4219 if (spa_spare_exists(guid, &pool, NULL) &&
4220 4220 pool != 0ULL) {
4221 4221 VERIFY(nvlist_lookup_uint64_array(
4222 4222 spares[i], ZPOOL_CONFIG_VDEV_STATS,
4223 4223 (uint64_t **)&vs, &vsc) == 0);
4224 4224 vs->vs_state = VDEV_STATE_CANT_OPEN;
4225 4225 vs->vs_aux = VDEV_AUX_SPARED;
4226 4226 }
4227 4227 }
4228 4228 }
4229 4229 }
4230 4230
4231 4231 /*
4232 4232 * Add l2cache device information to the nvlist, including vdev stats.
4233 4233 */
4234 4234 static void
4235 4235 spa_add_l2cache(spa_t *spa, nvlist_t *config)
4236 4236 {
4237 4237 nvlist_t **l2cache;
4238 4238 uint_t i, j, nl2cache;
4239 4239 nvlist_t *nvroot;
4240 4240 uint64_t guid;
4241 4241 vdev_t *vd;
4242 4242 vdev_stat_t *vs;
4243 4243 uint_t vsc;
4244 4244
4245 4245 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4246 4246
4247 4247 if (spa->spa_l2cache.sav_count == 0)
4248 4248 return;
4249 4249
4250 4250 VERIFY(nvlist_lookup_nvlist(config,
4251 4251 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4252 4252 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4253 4253 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4254 4254 if (nl2cache != 0) {
4255 4255 VERIFY(nvlist_add_nvlist_array(nvroot,
4256 4256 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4257 4257 VERIFY(nvlist_lookup_nvlist_array(nvroot,
4258 4258 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4259 4259
4260 4260 /*
4261 4261 * Update level 2 cache device stats.
4262 4262 */
4263 4263
4264 4264 for (i = 0; i < nl2cache; i++) {
4265 4265 VERIFY(nvlist_lookup_uint64(l2cache[i],
4266 4266 ZPOOL_CONFIG_GUID, &guid) == 0);
4267 4267
4268 4268 vd = NULL;
4269 4269 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
4270 4270 if (guid ==
4271 4271 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
4272 4272 vd = spa->spa_l2cache.sav_vdevs[j];
4273 4273 break;
4274 4274 }
4275 4275 }
4276 4276 ASSERT(vd != NULL);
4277 4277
4278 4278 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
4279 4279 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
4280 4280 == 0);
4281 4281 vdev_get_stats(vd, vs);
4282 4282 }
4283 4283 }
4284 4284 }
4285 4285
4286 4286 static void
4287 4287 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
4288 4288 {
4289 4289 nvlist_t *features;
4290 4290 zap_cursor_t zc;
4291 4291 zap_attribute_t za;
4292 4292
4293 4293 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4294 4294 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4295 4295
4296 4296 if (spa->spa_feat_for_read_obj != 0) {
4297 4297 for (zap_cursor_init(&zc, spa->spa_meta_objset,
4298 4298 spa->spa_feat_for_read_obj);
4299 4299 zap_cursor_retrieve(&zc, &za) == 0;
4300 4300 zap_cursor_advance(&zc)) {
4301 4301 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4302 4302 za.za_num_integers == 1);
4303 4303 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4304 4304 za.za_first_integer));
4305 4305 }
4306 4306 zap_cursor_fini(&zc);
4307 4307 }
4308 4308
4309 4309 if (spa->spa_feat_for_write_obj != 0) {
4310 4310 for (zap_cursor_init(&zc, spa->spa_meta_objset,
4311 4311 spa->spa_feat_for_write_obj);
4312 4312 zap_cursor_retrieve(&zc, &za) == 0;
4313 4313 zap_cursor_advance(&zc)) {
4314 4314 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4315 4315 za.za_num_integers == 1);
4316 4316 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4317 4317 za.za_first_integer));
4318 4318 }
4319 4319 zap_cursor_fini(&zc);
4320 4320 }
4321 4321
4322 4322 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
4323 4323 features) == 0);
4324 4324 nvlist_free(features);
4325 4325 }
4326 4326
4327 4327 int
4328 4328 spa_get_stats(const char *name, nvlist_t **config,
4329 4329 char *altroot, size_t buflen)
4330 4330 {
4331 4331 int error;
4332 4332 spa_t *spa;
4333 4333
4334 4334 *config = NULL;
4335 4335 error = spa_open_common(name, &spa, FTAG, NULL, config);
4336 4336
4337 4337 if (spa != NULL) {
4338 4338 /*
4339 4339 * This still leaves a window of inconsistency where the spares
4340 4340 * or l2cache devices could change and the config would be
4341 4341 * self-inconsistent.
4342 4342 */
4343 4343 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4344 4344
4345 4345 if (*config != NULL) {
4346 4346 uint64_t loadtimes[2];
4347 4347
4348 4348 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4349 4349 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4350 4350 VERIFY(nvlist_add_uint64_array(*config,
4351 4351 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4352 4352
4353 4353 VERIFY(nvlist_add_uint64(*config,
4354 4354 ZPOOL_CONFIG_ERRCOUNT,
4355 4355 spa_get_errlog_size(spa)) == 0);
4356 4356
4357 4357 if (spa_suspended(spa))
4358 4358 VERIFY(nvlist_add_uint64(*config,
4359 4359 ZPOOL_CONFIG_SUSPENDED,
4360 4360 spa->spa_failmode) == 0);
4361 4361
4362 4362 spa_add_spares(spa, *config);
4363 4363 spa_add_l2cache(spa, *config);
4364 4364 spa_add_feature_stats(spa, *config);
4365 4365 }
4366 4366 }
4367 4367
4368 4368 /*
4369 4369 * We want to get the alternate root even for faulted pools, so we cheat
4370 4370 * and call spa_lookup() directly.
4371 4371 */
4372 4372 if (altroot) {
4373 4373 if (spa == NULL) {
4374 4374 mutex_enter(&spa_namespace_lock);
4375 4375 spa = spa_lookup(name);
4376 4376 if (spa)
4377 4377 spa_altroot(spa, altroot, buflen);
4378 4378 else
4379 4379 altroot[0] = '\0';
4380 4380 spa = NULL;
4381 4381 mutex_exit(&spa_namespace_lock);
4382 4382 } else {
4383 4383 spa_altroot(spa, altroot, buflen);
4384 4384 }
4385 4385 }
4386 4386
4387 4387 if (spa != NULL) {
4388 4388 spa_config_exit(spa, SCL_CONFIG, FTAG);
4389 4389 spa_close(spa, FTAG);
4390 4390 }
4391 4391
4392 4392 return (error);
4393 4393 }
4394 4394
4395 4395 /*
4396 4396 * Validate that the auxiliary device array is well formed. We must have an
4397 4397 * array of nvlists, each of which describes a valid leaf vdev. If this is an
4398 4398 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4399 4399 * specified, as long as they are well-formed.
4400 4400 */
4401 4401 static int
4402 4402 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4403 4403 spa_aux_vdev_t *sav, const char *config, uint64_t version,
4404 4404 vdev_labeltype_t label)
4405 4405 {
4406 4406 nvlist_t **dev;
4407 4407 uint_t i, ndev;
4408 4408 vdev_t *vd;
4409 4409 int error;
4410 4410
4411 4411 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4412 4412
4413 4413 /*
4414 4414 * It's acceptable to have no devs specified.
4415 4415 */
4416 4416 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4417 4417 return (0);
4418 4418
4419 4419 if (ndev == 0)
4420 4420 return (SET_ERROR(EINVAL));
4421 4421
4422 4422 /*
4423 4423 * Make sure the pool is formatted with a version that supports this
4424 4424 * device type.
4425 4425 */
4426 4426 if (spa_version(spa) < version)
4427 4427 return (SET_ERROR(ENOTSUP));
4428 4428
4429 4429 /*
4430 4430 * Set the pending device list so we correctly handle device in-use
4431 4431 * checking.
4432 4432 */
4433 4433 sav->sav_pending = dev;
4434 4434 sav->sav_npending = ndev;
4435 4435
4436 4436 for (i = 0; i < ndev; i++) {
4437 4437 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4438 4438 mode)) != 0)
4439 4439 goto out;
4440 4440
4441 4441 if (!vd->vdev_ops->vdev_op_leaf) {
4442 4442 vdev_free(vd);
4443 4443 error = SET_ERROR(EINVAL);
4444 4444 goto out;
4445 4445 }
4446 4446
4447 4447 /*
4448 4448 * The L2ARC currently only supports disk devices in
4449 4449 * kernel context. For user-level testing, we allow it.
4450 4450 */
4451 4451 #ifdef _KERNEL
4452 4452 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
4453 4453 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
4454 4454 error = SET_ERROR(ENOTBLK);
4455 4455 vdev_free(vd);
4456 4456 goto out;
4457 4457 }
4458 4458 #endif
4459 4459 vd->vdev_top = vd;
4460 4460
4461 4461 if ((error = vdev_open(vd)) == 0 &&
4462 4462 (error = vdev_label_init(vd, crtxg, label)) == 0) {
4463 4463 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4464 4464 vd->vdev_guid) == 0);
4465 4465 }
4466 4466
4467 4467 vdev_free(vd);
4468 4468
4469 4469 if (error &&
4470 4470 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4471 4471 goto out;
4472 4472 else
4473 4473 error = 0;
4474 4474 }
4475 4475
4476 4476 out:
4477 4477 sav->sav_pending = NULL;
4478 4478 sav->sav_npending = 0;
4479 4479 return (error);
4480 4480 }
4481 4481
4482 4482 static int
4483 4483 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4484 4484 {
4485 4485 int error;
4486 4486
4487 4487 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4488 4488
4489 4489 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4490 4490 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4491 4491 VDEV_LABEL_SPARE)) != 0) {
4492 4492 return (error);
4493 4493 }
4494 4494
4495 4495 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4496 4496 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4497 4497 VDEV_LABEL_L2CACHE));
4498 4498 }
4499 4499
4500 4500 static void
4501 4501 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4502 4502 const char *config)
4503 4503 {
4504 4504 int i;
4505 4505
4506 4506 if (sav->sav_config != NULL) {
4507 4507 nvlist_t **olddevs;
4508 4508 uint_t oldndevs;
4509 4509 nvlist_t **newdevs;
4510 4510
4511 4511 /*
4512 4512		 * Generate new dev list by concatenating with the
4513 4513 * current dev list.
4514 4514 */
4515 4515 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4516 4516 &olddevs, &oldndevs) == 0);
4517 4517
4518 4518 newdevs = kmem_alloc(sizeof (void *) *
4519 4519 (ndevs + oldndevs), KM_SLEEP);
4520 4520 for (i = 0; i < oldndevs; i++)
4521 4521 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4522 4522 KM_SLEEP) == 0);
4523 4523 for (i = 0; i < ndevs; i++)
4524 4524 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
4525 4525 KM_SLEEP) == 0);
4526 4526
4527 4527 VERIFY(nvlist_remove(sav->sav_config, config,
4528 4528 DATA_TYPE_NVLIST_ARRAY) == 0);
4529 4529
4530 4530 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
4531 4531 config, newdevs, ndevs + oldndevs) == 0);
4532 4532 for (i = 0; i < oldndevs + ndevs; i++)
4533 4533 nvlist_free(newdevs[i]);
4534 4534 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
4535 4535 } else {
4536 4536 /*
4537 4537 * Generate a new dev list.
4538 4538 */
4539 4539 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
4540 4540 KM_SLEEP) == 0);
4541 4541 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
4542 4542 devs, ndevs) == 0);
4543 4543 }
4544 4544 }
4545 4545
4546 4546 /*
4547 4547 * Stop and drop level 2 ARC devices
4548 4548 */
4549 4549 void
4550 4550 spa_l2cache_drop(spa_t *spa)
4551 4551 {
4552 4552 vdev_t *vd;
4553 4553 int i;
4554 4554 spa_aux_vdev_t *sav = &spa->spa_l2cache;
4555 4555
4556 4556 for (i = 0; i < sav->sav_count; i++) {
4557 4557 uint64_t pool;
4558 4558
4559 4559 vd = sav->sav_vdevs[i];
4560 4560 ASSERT(vd != NULL);
4561 4561
4562 4562 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
4563 4563 pool != 0ULL && l2arc_vdev_present(vd))
4564 4564 l2arc_remove_vdev(vd);
4565 4565 }
4566 4566 }
4567 4567
4568 4568 /*
4569 4569 * Pool Creation
4570 4570 */
4571 4571 int
4572 4572 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
4573 4573 nvlist_t *zplprops)
4574 4574 {
4575 4575 spa_t *spa;
4576 4576 char *altroot = NULL;
4577 4577 vdev_t *rvd;
4578 4578 dsl_pool_t *dp;
4579 4579 dmu_tx_t *tx;
4580 4580 int error = 0;
4581 4581 uint64_t txg = TXG_INITIAL;
4582 4582 nvlist_t **spares, **l2cache;
4583 4583 uint_t nspares, nl2cache;
4584 4584 uint64_t version, obj;
4585 4585 boolean_t has_features;
4586 + char *poolname;
4587 + nvlist_t *nvl;
4586 4588
4589 + if (nvlist_lookup_string(props,
4590 + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
4591 + poolname = (char *)pool;
4592 +
4587 4593 /*
4588 4594 * If this pool already exists, return failure.
4589 4595 */
4590 4596 mutex_enter(&spa_namespace_lock);
4591 - if (spa_lookup(pool) != NULL) {
4597 + if (spa_lookup(poolname) != NULL) {
4592 4598 mutex_exit(&spa_namespace_lock);
4593 4599 return (SET_ERROR(EEXIST));
4594 4600 }
4595 4601
4596 4602 /*
4597 4603 * Allocate a new spa_t structure.
4598 4604 */
4605 + nvl = fnvlist_alloc();
4606 + fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
4599 4607 (void) nvlist_lookup_string(props,
4600 4608 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4601 - spa = spa_add(pool, NULL, altroot);
4609 + spa = spa_add(poolname, nvl, altroot);
4610 + fnvlist_free(nvl);
4602 4611 spa_activate(spa, spa_mode_global);
4603 4612
4604 4613 if (props && (error = spa_prop_validate(spa, props))) {
4605 4614 spa_deactivate(spa);
4606 4615 spa_remove(spa);
4607 4616 mutex_exit(&spa_namespace_lock);
4608 4617 return (error);
4609 4618 }
4610 4619
4620 + /*
4621 + * Temporary pool names should never be written to disk.
4622 + */
4623 + if (poolname != pool)
4624 + spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
4625 +
4611 4626 has_features = B_FALSE;
4612 4627 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
4613 4628 elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
4614 4629 if (zpool_prop_feature(nvpair_name(elem)))
4615 4630 has_features = B_TRUE;
4616 4631 }
4617 4632
4618 4633 if (has_features || nvlist_lookup_uint64(props,
4619 4634 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
4620 4635 version = SPA_VERSION;
4621 4636 }
4622 4637 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
4623 4638
4624 4639 spa->spa_first_txg = txg;
4625 4640 spa->spa_uberblock.ub_txg = txg - 1;
4626 4641 spa->spa_uberblock.ub_version = version;
4627 4642 spa->spa_ubsync = spa->spa_uberblock;
4628 4643 spa->spa_load_state = SPA_LOAD_CREATE;
4629 4644 spa->spa_removing_phys.sr_state = DSS_NONE;
4630 4645 spa->spa_removing_phys.sr_removing_vdev = -1;
4631 4646 spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
4632 4647
4633 4648 /*
4634 4649 * Create "The Godfather" zio to hold all async IOs
4635 4650 */
4636 4651 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
4637 4652 KM_SLEEP);
4638 4653 for (int i = 0; i < max_ncpus; i++) {
4639 4654 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
4640 4655 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4641 4656 ZIO_FLAG_GODFATHER);
4642 4657 }
4643 4658
4644 4659 /*
4645 4660 * Create the root vdev.
4646 4661 */
4647 4662 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4648 4663
4649 4664 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
4650 4665
4651 4666 ASSERT(error != 0 || rvd != NULL);
4652 4667 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
4653 4668
4654 4669 if (error == 0 && !zfs_allocatable_devs(nvroot))
4655 4670 error = SET_ERROR(EINVAL);
4656 4671
4657 4672 if (error == 0 &&
4658 4673 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
4659 4674 (error = spa_validate_aux(spa, nvroot, txg,
4660 4675 VDEV_ALLOC_ADD)) == 0) {
4661 4676 for (int c = 0; c < rvd->vdev_children; c++) {
4662 4677 vdev_metaslab_set_size(rvd->vdev_child[c]);
4663 4678 vdev_expand(rvd->vdev_child[c], txg);
4664 4679 }
4665 4680 }
4666 4681
4667 4682 spa_config_exit(spa, SCL_ALL, FTAG);
4668 4683
4669 4684 if (error != 0) {
4670 4685 spa_unload(spa);
4671 4686 spa_deactivate(spa);
4672 4687 spa_remove(spa);
4673 4688 mutex_exit(&spa_namespace_lock);
4674 4689 return (error);
4675 4690 }
4676 4691
4677 4692 /*
4678 4693 * Get the list of spares, if specified.
4679 4694 */
4680 4695 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4681 4696 &spares, &nspares) == 0) {
4682 4697 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
4683 4698 KM_SLEEP) == 0);
4684 4699 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4685 4700 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4686 4701 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4687 4702 spa_load_spares(spa);
4688 4703 spa_config_exit(spa, SCL_ALL, FTAG);
4689 4704 spa->spa_spares.sav_sync = B_TRUE;
4690 4705 }
4691 4706
4692 4707 /*
4693 4708 * Get the list of level 2 cache devices, if specified.
4694 4709 */
4695 4710 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4696 4711 &l2cache, &nl2cache) == 0) {
4697 4712 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4698 4713 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4699 4714 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4700 4715 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4701 4716 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4702 4717 spa_load_l2cache(spa);
4703 4718 spa_config_exit(spa, SCL_ALL, FTAG);
4704 4719 spa->spa_l2cache.sav_sync = B_TRUE;
4705 4720 }
4706 4721
4707 4722 spa->spa_is_initializing = B_TRUE;
4708 4723 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
4709 4724 spa->spa_meta_objset = dp->dp_meta_objset;
4710 4725 spa->spa_is_initializing = B_FALSE;
4711 4726
4712 4727 /*
4713 4728 * Create DDTs (dedup tables).
4714 4729 */
4715 4730 ddt_create(spa);
4716 4731
4717 4732 spa_update_dspace(spa);
4718 4733
4719 4734 tx = dmu_tx_create_assigned(dp, txg);
4720 4735
4721 4736 /*
4722 4737 * Create the pool config object.
4723 4738 */
4724 4739 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
4725 4740 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
4726 4741 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
4727 4742
4728 4743 if (zap_add(spa->spa_meta_objset,
4729 4744 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
4730 4745 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
4731 4746 cmn_err(CE_PANIC, "failed to add pool config");
4732 4747 }
4733 4748
4734 4749 if (spa_version(spa) >= SPA_VERSION_FEATURES)
4735 4750 spa_feature_create_zap_objects(spa, tx);
4736 4751
4737 4752 if (zap_add(spa->spa_meta_objset,
4738 4753 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
4739 4754 sizeof (uint64_t), 1, &version, tx) != 0) {
4740 4755 cmn_err(CE_PANIC, "failed to add pool version");
4741 4756 }
4742 4757
4743 4758 /* Newly created pools with the right version are always deflated. */
4744 4759 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
4745 4760 spa->spa_deflate = TRUE;
4746 4761 if (zap_add(spa->spa_meta_objset,
4747 4762 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4748 4763 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
4749 4764 cmn_err(CE_PANIC, "failed to add deflate");
4750 4765 }
4751 4766 }
4752 4767
4753 4768 /*
4754 4769 * Create the deferred-free bpobj. Turn off compression
4755 4770 * because sync-to-convergence takes longer if the blocksize
4756 4771 * keeps changing.
4757 4772 */
4758 4773 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
4759 4774 dmu_object_set_compress(spa->spa_meta_objset, obj,
4760 4775 ZIO_COMPRESS_OFF, tx);
4761 4776 if (zap_add(spa->spa_meta_objset,
4762 4777 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
4763 4778 sizeof (uint64_t), 1, &obj, tx) != 0) {
4764 4779 cmn_err(CE_PANIC, "failed to add bpobj");
4765 4780 }
4766 4781 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
4767 4782 spa->spa_meta_objset, obj));
4768 4783
4769 4784 /*
4770 4785 * Create the pool's history object.
4771 4786 */
4772 4787 if (version >= SPA_VERSION_ZPOOL_HISTORY)
4773 4788 spa_history_create_obj(spa, tx);
4774 4789
4775 4790 /*
4776 4791 * Generate some random noise for salted checksums to operate on.
4777 4792 */
4778 4793 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4779 4794 sizeof (spa->spa_cksum_salt.zcs_bytes));
4780 4795
4781 4796 /*
4782 4797 * Set pool properties.
4783 4798 */
4784 4799 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
4785 4800 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4786 4801 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
4787 4802 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
4788 4803
4789 4804 if (props != NULL) {
4790 4805 spa_configfile_set(spa, props, B_FALSE);
4791 4806 spa_sync_props(props, tx);
4792 4807 }
4793 4808
4794 4809 dmu_tx_commit(tx);
4795 4810
4796 4811 spa->spa_sync_on = B_TRUE;
4797 4812 txg_sync_start(spa->spa_dsl_pool);
4798 4813
4799 4814 /*
4800 4815 * We explicitly wait for the first transaction to complete so that our
4801 4816 * bean counters are appropriately updated.
4802 4817 */
4803 4818 txg_wait_synced(spa->spa_dsl_pool, txg);
4804 4819
4805 4820 spa_spawn_aux_threads(spa);
4806 4821
4807 4822 spa_write_cachefile(spa, B_FALSE, B_TRUE);
4808 4823 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
4809 4824
4810 4825 spa_history_log_version(spa, "create");
4811 4826
4812 4827 /*
4813 4828 * Don't count references from objsets that are already closed
4814 4829 * and are making their way through the eviction process.
4815 4830 */
4816 4831 spa_evicting_os_wait(spa);
4817 4832 spa->spa_minref = refcount_count(&spa->spa_refcount);
4818 4833 spa->spa_load_state = SPA_LOAD_NONE;
4819 4834
4820 4835 mutex_exit(&spa_namespace_lock);
4821 4836
4822 4837 return (0);
4823 4838 }
4824 4839
4825 4840 #ifdef _KERNEL
4826 4841 /*
4827 4842 * Get the root pool information from the root disk, then import the root pool
4828 4843 * during the system boot up time.
4829 4844 */
4830 4845 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
4831 4846
4832 4847 static nvlist_t *
4833 4848 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
4834 4849 {
4835 4850 nvlist_t *config;
4836 4851 nvlist_t *nvtop, *nvroot;
4837 4852 uint64_t pgid;
4838 4853
4839 4854 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
4840 4855 return (NULL);
4841 4856
4842 4857 /*
4843 4858 * Add this top-level vdev to the child array.
4844 4859 */
4845 4860 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4846 4861 &nvtop) == 0);
4847 4862 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4848 4863 &pgid) == 0);
4849 4864 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
4850 4865
4851 4866 /*
4852 4867 * Put this pool's top-level vdevs into a root vdev.
4853 4868 */
4854 4869 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4855 4870 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
4856 4871 VDEV_TYPE_ROOT) == 0);
4857 4872 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
4858 4873 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
4859 4874 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
4860 4875 &nvtop, 1) == 0);
4861 4876
4862 4877 /*
4863 4878 * Replace the existing vdev_tree with the new root vdev in
4864 4879 * this pool's configuration (remove the old, add the new).
4865 4880 */
4866 4881 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
4867 4882 nvlist_free(nvroot);
4868 4883 return (config);
4869 4884 }
4870 4885
4871 4886 /*
4872 4887 * Walk the vdev tree and see if we can find a device with "better"
4873 4888 * configuration. A configuration is "better" if the label on that
4874 4889 * device has a more recent txg.
4875 4890 */
4876 4891 static void
4877 4892 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
4878 4893 {
4879 4894 for (int c = 0; c < vd->vdev_children; c++)
4880 4895 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
4881 4896
4882 4897 if (vd->vdev_ops->vdev_op_leaf) {
4883 4898 nvlist_t *label;
4884 4899 uint64_t label_txg;
4885 4900
4886 4901 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
4887 4902 &label) != 0)
4888 4903 return;
4889 4904
4890 4905 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
4891 4906 &label_txg) == 0);
4892 4907
4893 4908 /*
4894 4909 * Do we have a better boot device?
4895 4910 */
4896 4911 if (label_txg > *txg) {
4897 4912 *txg = label_txg;
4898 4913 *avd = vd;
4899 4914 }
4900 4915 nvlist_free(label);
4901 4916 }
4902 4917 }
4903 4918
4904 4919 /*
4905 4920 * Import a root pool.
4906 4921 *
4907 4922 * For x86, devpath_list will consist of the devid and/or physpath name of
4908 4923 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
4909 4924 * The GRUB "findroot" command will return the vdev we should boot.
4910 4925 *
4911 4926 * For Sparc, devpath_list consists of the physpath name of the booting device,
4912 4927 * regardless of whether the root pool is a single-device or a mirrored pool.
4913 4928 * e.g.
4914 4929 * "/pci@1f,0/ide@d/disk@0,0:a"
4915 4930 */
4916 4931 int
4917 4932 spa_import_rootpool(char *devpath, char *devid)
4918 4933 {
4919 4934 spa_t *spa;
4920 4935 vdev_t *rvd, *bvd, *avd = NULL;
4921 4936 nvlist_t *config, *nvtop;
4922 4937 uint64_t guid, txg;
4923 4938 char *pname;
4924 4939 int error;
4925 4940
4926 4941 /*
4927 4942 * Read the label from the boot device and generate a configuration.
4928 4943 */
4929 4944 config = spa_generate_rootconf(devpath, devid, &guid);
4930 4945 #if defined(_OBP) && defined(_KERNEL)
4931 4946 if (config == NULL) {
4932 4947 if (strstr(devpath, "/iscsi/ssd") != NULL) {
4933 4948 /* iscsi boot */
4934 4949 get_iscsi_bootpath_phy(devpath);
4935 4950 config = spa_generate_rootconf(devpath, devid, &guid);
4936 4951 }
4937 4952 }
4938 4953 #endif
4939 4954 if (config == NULL) {
4940 4955 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
4941 4956 devpath);
4942 4957 return (SET_ERROR(EIO));
4943 4958 }
4944 4959
4945 4960 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
4946 4961 &pname) == 0);
4947 4962 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
4948 4963
4949 4964 mutex_enter(&spa_namespace_lock);
4950 4965 if ((spa = spa_lookup(pname)) != NULL) {
4951 4966 /*
4952 4967 * Remove the existing root pool from the namespace so that we
4953 4968 * can replace it with the correct config we just read in.
4954 4969 */
4955 4970 spa_remove(spa);
4956 4971 }
4957 4972
4958 4973 spa = spa_add(pname, config, NULL);
4959 4974 spa->spa_is_root = B_TRUE;
4960 4975 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
4961 4976 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
4962 4977 &spa->spa_ubsync.ub_version) != 0)
4963 4978 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
4964 4979
4965 4980 /*
4966 4981 * Build up a vdev tree based on the boot device's label config.
4967 4982 */
4968 4983 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4969 4984 &nvtop) == 0);
4970 4985 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4971 4986 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
4972 4987 VDEV_ALLOC_ROOTPOOL);
4973 4988 spa_config_exit(spa, SCL_ALL, FTAG);
4974 4989 if (error) {
4975 4990 mutex_exit(&spa_namespace_lock);
4976 4991 nvlist_free(config);
4977 4992 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
4978 4993 pname);
4979 4994 return (error);
4980 4995 }
4981 4996
4982 4997 /*
4983 4998 * Get the boot vdev.
4984 4999 */
4985 5000 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
4986 5001 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
4987 5002 (u_longlong_t)guid);
4988 5003 error = SET_ERROR(ENOENT);
4989 5004 goto out;
4990 5005 }
4991 5006
4992 5007 /*
4993 5008 * Determine if there is a better boot device.
4994 5009 */
4995 5010 avd = bvd;
4996 5011 spa_alt_rootvdev(rvd, &avd, &txg);
4997 5012 if (avd != bvd) {
4998 5013 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
4999 5014 "try booting from '%s'", avd->vdev_path);
5000 5015 error = SET_ERROR(EINVAL);
5001 5016 goto out;
5002 5017 }
5003 5018
5004 5019 /*
5005 5020 * If the boot device is part of a spare vdev then ensure that
5006 5021 * we're booting off the active spare.
5007 5022 */
5008 5023 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
5009 5024 !bvd->vdev_isspare) {
5010 5025 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
5011 5026 "try booting from '%s'",
5012 5027 bvd->vdev_parent->
5013 5028 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
5014 5029 error = SET_ERROR(EINVAL);
5015 5030 goto out;
5016 5031 }
5017 5032
5018 5033 error = 0;
5019 5034 out:
5020 5035 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5021 5036 vdev_free(rvd);
5022 5037 spa_config_exit(spa, SCL_ALL, FTAG);
5023 5038 mutex_exit(&spa_namespace_lock);
5024 5039
5025 5040 nvlist_free(config);
5026 5041 return (error);
5027 5042 }
5028 5043
5029 5044 #endif
5030 5045
5031 5046 /*
5032 5047 * Import a non-root pool into the system.
5033 5048 */
5034 5049 int
5035 5050 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
5036 5051 {
5037 5052 spa_t *spa;
5038 5053 char *altroot = NULL;
5039 5054 spa_load_state_t state = SPA_LOAD_IMPORT;
5040 5055 zpool_load_policy_t policy;
5041 5056 uint64_t mode = spa_mode_global;
5042 5057 uint64_t readonly = B_FALSE;
5043 5058 int error;
5044 5059 nvlist_t *nvroot;
5045 5060 nvlist_t **spares, **l2cache;
5046 5061 uint_t nspares, nl2cache;
5047 5062
5048 5063 /*
5049 5064 * If a pool with this name exists, return failure.
5050 5065 */
5051 5066 mutex_enter(&spa_namespace_lock);
5052 5067 if (spa_lookup(pool) != NULL) {
5053 5068 mutex_exit(&spa_namespace_lock);
5054 5069 return (SET_ERROR(EEXIST));
5055 5070 }
5056 5071
5057 5072 /*
5058 5073 * Create and initialize the spa structure.
5059 5074 */
5060 5075 (void) nvlist_lookup_string(props,
5061 5076 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5062 5077 (void) nvlist_lookup_uint64(props,
5063 5078 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
5064 5079 if (readonly)
5065 5080 mode = FREAD;
5066 5081 spa = spa_add(pool, config, altroot);
5067 5082 spa->spa_import_flags = flags;
5068 5083
5069 5084 /*
5070 5085 * Verbatim import - Take a pool and insert it into the namespace
5071 5086 * as if it had been loaded at boot.
5072 5087 */
5073 5088 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
5074 5089 if (props != NULL)
5075 5090 spa_configfile_set(spa, props, B_FALSE);
5076 5091
5077 5092 spa_write_cachefile(spa, B_FALSE, B_TRUE);
5078 5093 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5079 5094 zfs_dbgmsg("spa_import: verbatim import of %s", pool);
5080 5095 mutex_exit(&spa_namespace_lock);
5081 5096 return (0);
5082 5097 }
5083 5098
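	/*
	 * Non-verbatim import: activate the pool (read-only if requested)
	 * and load it from its on-disk state below, honoring any rewind
	 * policy supplied with the config.
	 */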
5084 5099 spa_activate(spa, mode);
5085 5100
5086 5101 /*
5087 5102 * Don't start async tasks until we know everything is healthy.
5088 5103 */
5089 5104 spa_async_suspend(spa);
5090 5105
5091 5106 zpool_get_load_policy(config, &policy);
5092 5107 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5093 5108 state = SPA_LOAD_RECOVER;
5094 5109
5095 5110 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
5096 5111
5097 5112 if (state != SPA_LOAD_RECOVER) {
5098 5113 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5099 5114 zfs_dbgmsg("spa_import: importing %s", pool);
5100 5115 } else {
5101 5116 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
5102 5117 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
5103 5118 }
5104 5119 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
5105 5120
5106 5121 /*
5107 5122 * Propagate anything learned while loading the pool and pass it
5108 5123 * back to caller (i.e. rewind info, missing devices, etc).
5109 5124 */
5110 5125 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5111 5126 spa->spa_load_info) == 0);
5112 5127
5113 5128 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5114 5129 /*
5115 5130 * Toss any existing sparelist, as it doesn't have any validity
5116 5131 * anymore, and conflicts with spa_has_spare().
5117 5132 */
5118 5133 if (spa->spa_spares.sav_config) {
5119 5134 nvlist_free(spa->spa_spares.sav_config);
5120 5135 spa->spa_spares.sav_config = NULL;
5121 5136 spa_load_spares(spa);
5122 5137 }
5123 5138 if (spa->spa_l2cache.sav_config) {
5124 5139 nvlist_free(spa->spa_l2cache.sav_config);
5125 5140 spa->spa_l2cache.sav_config = NULL;
5126 5141 spa_load_l2cache(spa);
5127 5142 }
5128 5143
5129 5144 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5130 5145 &nvroot) == 0);
5131 5146 if (error == 0)
5132 5147 error = spa_validate_aux(spa, nvroot, -1ULL,
5133 5148 VDEV_ALLOC_SPARE);
5134 5149 if (error == 0)
5135 5150 error = spa_validate_aux(spa, nvroot, -1ULL,
5136 5151 VDEV_ALLOC_L2CACHE);
5137 5152 spa_config_exit(spa, SCL_ALL, FTAG);
5138 5153
5139 5154 if (props != NULL)
5140 5155 spa_configfile_set(spa, props, B_FALSE);
5141 5156
5142 5157 if (error != 0 || (props && spa_writeable(spa) &&
5143 5158 (error = spa_prop_set(spa, props)))) {
5144 5159 spa_unload(spa);
5145 5160 spa_deactivate(spa);
5146 5161 spa_remove(spa);
5147 5162 mutex_exit(&spa_namespace_lock);
5148 5163 return (error);
5149 5164 }
5150 5165
5151 5166 spa_async_resume(spa);
5152 5167
5153 5168 /*
5154 5169 * Override any spares and level 2 cache devices as specified by
5155 5170 * the user, as these may have correct device names/devids, etc.
5156 5171 */
5157 5172 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5158 5173 &spares, &nspares) == 0) {
5159 5174 if (spa->spa_spares.sav_config)
5160 5175 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
5161 5176 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
5162 5177 else
5163 5178 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
5164 5179 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5165 5180 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5166 5181 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5167 5182 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5168 5183 spa_load_spares(spa);
5169 5184 spa_config_exit(spa, SCL_ALL, FTAG);
5170 5185 spa->spa_spares.sav_sync = B_TRUE;
5171 5186 }
5172 5187 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5173 5188 &l2cache, &nl2cache) == 0) {
5174 5189 if (spa->spa_l2cache.sav_config)
5175 5190 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
5176 5191 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
5177 5192 else
5178 5193 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5179 5194 NV_UNIQUE_NAME, KM_SLEEP) == 0);
5180 5195 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5181 5196 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5182 5197 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5183 5198 spa_load_l2cache(spa);
5184 5199 spa_config_exit(spa, SCL_ALL, FTAG);
5185 5200 spa->spa_l2cache.sav_sync = B_TRUE;
5186 5201 }
5187 5202
5188 5203 /*
5189 5204 * Check for any removed devices.
5190 5205 */
5191 5206 if (spa->spa_autoreplace) {
5192 5207 spa_aux_check_removed(&spa->spa_spares);
5193 5208 spa_aux_check_removed(&spa->spa_l2cache);
5194 5209 }
5195 5210
5196 5211 if (spa_writeable(spa)) {
5197 5212 /*
5198 5213 * Update the config cache to include the newly-imported pool.
5199 5214 */
5200 5215 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5201 5216 }
5202 5217
5203 5218 /*
5204 5219 * It's possible that the pool was expanded while it was exported.
5205 5220 * We kick off an async task to handle this for us.
5206 5221 */
5207 5222 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5208 5223
5209 5224 spa_history_log_version(spa, "import");
5210 5225
5211 5226 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5212 5227
5213 5228 mutex_exit(&spa_namespace_lock);
5214 5229
5215 5230 return (0);
5216 5231 }
5217 5232
5218 5233 nvlist_t *
5219 5234 spa_tryimport(nvlist_t *tryconfig)
5220 5235 {
5221 5236 nvlist_t *config = NULL;
5222 5237 char *poolname, *cachefile;
5223 5238 spa_t *spa;
5224 5239 uint64_t state;
5225 5240 int error;
5226 5241 zpool_load_policy_t policy;
5227 5242
5228 5243 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
5229 5244 return (NULL);
5230 5245
5231 5246 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
5232 5247 return (NULL);
5233 5248
5234 5249 /*
5235 5250 * Create and initialize the spa structure.
5236 5251 */
5237 5252 mutex_enter(&spa_namespace_lock);
5238 5253 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
5239 5254 spa_activate(spa, FREAD);
5240 5255
5241 5256 /*
5242 5257 * Rewind pool if a max txg was provided.
5243 5258 */
5244 5259 zpool_get_load_policy(spa->spa_config, &policy);
5245 5260 if (policy.zlp_txg != UINT64_MAX) {
5246 5261 spa->spa_load_max_txg = policy.zlp_txg;
5247 5262 spa->spa_extreme_rewind = B_TRUE;
5248 5263 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
5249 5264 poolname, (longlong_t)policy.zlp_txg);
5250 5265 } else {
5251 5266 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
5252 5267 }
5253 5268
5254 5269 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
5255 5270 == 0) {
5256 5271 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
5257 5272 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5258 5273 } else {
5259 5274 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
5260 5275 }
5261 5276
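	/*
	 * Do a read-only trial load. Even if the load fails partway, any
	 * config that can be generated from the assembled vdev tree is
	 * still returned to the caller below.
	 */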
5262 5277 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
5263 5278
5264 5279 /*
5265 5280 * If 'tryconfig' was at least parsable, return the current config.
5266 5281 */
5267 5282 if (spa->spa_root_vdev != NULL) {
5268 5283 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5269 5284 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
5270 5285 poolname) == 0);
5271 5286 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5272 5287 state) == 0);
5273 5288 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
5274 5289 spa->spa_uberblock.ub_timestamp) == 0);
5275 5290 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5276 5291 spa->spa_load_info) == 0);
5277 5292
5278 5293 /*
5279 5294 * If the bootfs property exists on this pool then we
5280 5295 * copy it out so that external consumers can tell which
5281 5296 * pools are bootable.
5282 5297 */
5283 5298 if ((!error || error == EEXIST) && spa->spa_bootfs) {
5284 5299 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5285 5300
5286 5301 /*
5287 5302 * We have to play games with the name since the
5288 5303 * pool was opened as TRYIMPORT_NAME.
5289 5304 */
5290 5305 if (dsl_dsobj_to_dsname(spa_name(spa),
5291 5306 spa->spa_bootfs, tmpname) == 0) {
5292 5307 char *cp;
5293 5308 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5294 5309
5295 5310 cp = strchr(tmpname, '/');
5296 5311 if (cp == NULL) {
5297 5312 (void) strlcpy(dsname, tmpname,
5298 5313 MAXPATHLEN);
5299 5314 } else {
5300 5315 (void) snprintf(dsname, MAXPATHLEN,
5301 5316 "%s/%s", poolname, ++cp);
5302 5317 }
5303 5318 VERIFY(nvlist_add_string(config,
5304 5319 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
5305 5320 kmem_free(dsname, MAXPATHLEN);
5306 5321 }
5307 5322 kmem_free(tmpname, MAXPATHLEN);
5308 5323 }
5309 5324
5310 5325 /*
5311 5326 * Add the list of hot spares and level 2 cache devices.
5312 5327 */
5313 5328 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5314 5329 spa_add_spares(spa, config);
5315 5330 spa_add_l2cache(spa, config);
5316 5331 spa_config_exit(spa, SCL_CONFIG, FTAG);
5317 5332 }
5318 5333
5319 5334 spa_unload(spa);
5320 5335 spa_deactivate(spa);
5321 5336 spa_remove(spa);
5322 5337 mutex_exit(&spa_namespace_lock);
5323 5338
5324 5339 return (config);
5325 5340 }
5326 5341
5327 5342 /*
5328 5343 * Pool export/destroy
5329 5344 *
5330 5345 * The act of destroying or exporting a pool is very simple. We make sure there
5331 5346 * is no more pending I/O and any references to the pool are gone. Then, we
5332 5347 * update the pool state and sync all the labels to disk, removing the
5333 5348 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
5334 5349 * we don't sync the labels or remove the configuration cache.
5335 5350 */
5336 5351 static int
5337 5352 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5338 5353 boolean_t force, boolean_t hardforce)
5339 5354 {
5340 5355 spa_t *spa;
5341 5356
5342 5357 if (oldconfig)
5343 5358 *oldconfig = NULL;
5344 5359
5345 5360 if (!(spa_mode_global & FWRITE))
5346 5361 return (SET_ERROR(EROFS));
5347 5362
5348 5363 mutex_enter(&spa_namespace_lock);
5349 5364 if ((spa = spa_lookup(pool)) == NULL) {
5350 5365 mutex_exit(&spa_namespace_lock);
5351 5366 return (SET_ERROR(ENOENT));
5352 5367 }
5353 5368
5354 5369 /*
5355 5370 * Put a hold on the pool, drop the namespace lock, stop async tasks,
5356 5371 * reacquire the namespace lock, and see if we can export.
5357 5372 */
5358 5373 spa_open_ref(spa, FTAG);
5359 5374 mutex_exit(&spa_namespace_lock);
5360 5375 spa_async_suspend(spa);
5361 5376 mutex_enter(&spa_namespace_lock);
5362 5377 spa_close(spa, FTAG);
5363 5378
5364 5379 /*
5365 5380 * The pool will be in core if it's openable,
5366 5381 * in which case we can modify its state.
5367 5382 */
5368 5383 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5369 5384
5370 5385 /*
5371 5386 * Objsets may be open only because they're dirty, so we
5372 5387 * have to force it to sync before checking spa_refcnt.
5373 5388 */
5374 5389 txg_wait_synced(spa->spa_dsl_pool, 0);
5375 5390 spa_evicting_os_wait(spa);
5376 5391
5377 5392 /*
5378 5393 * A pool cannot be exported or destroyed if there are active
5379 5394 * references. If we are resetting a pool, allow references by
5380 5395 * fault injection handlers.
5381 5396 */
5382 5397 if (!spa_refcount_zero(spa) ||
5383 5398 (spa->spa_inject_ref != 0 &&
5384 5399 new_state != POOL_STATE_UNINITIALIZED)) {
5385 5400 spa_async_resume(spa);
5386 5401 mutex_exit(&spa_namespace_lock);
5387 5402 return (SET_ERROR(EBUSY));
5388 5403 }
5389 5404
5390 5405 /*
5391 5406 * A pool cannot be exported if it has an active shared spare.
5392 5407 * This is to prevent other pools stealing the active spare
5393 5408 * from an exported pool. At the user's discretion, such a pool can
5394 5409 * still be forcibly exported.
5395 5410 */
5396 5411 if (!force && new_state == POOL_STATE_EXPORTED &&
5397 5412 spa_has_active_shared_spare(spa)) {
5398 5413 spa_async_resume(spa);
5399 5414 mutex_exit(&spa_namespace_lock);
5400 5415 return (SET_ERROR(EXDEV));
5401 5416 }
5402 5417
5403 5418 /*
5404 5419 * We're about to export or destroy this pool. Make sure
5405 5420 * we stop all initialization activity here before we
5406 5421 * set the spa_final_txg. This will ensure that all
5407 5422 * dirty data resulting from the initialization is
5408 5423 * committed to disk before we unload the pool.
5409 5424 */
5410 5425 if (spa->spa_root_vdev != NULL) {
5411 5426 vdev_initialize_stop_all(spa->spa_root_vdev,
5412 5427 VDEV_INITIALIZE_ACTIVE);
5413 5428 }
5414 5429
5415 5430 /*
5416 5431 * We want this to be reflected on every label,
5417 5432 * so mark them all dirty. spa_unload() will do the
5418 5433 * final sync that pushes these changes out.
5419 5434 */
5420 5435 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5421 5436 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5422 5437 spa->spa_state = new_state;
5423 5438 spa->spa_final_txg = spa_last_synced_txg(spa) +
5424 5439 TXG_DEFER_SIZE + 1;
5425 5440 vdev_config_dirty(spa->spa_root_vdev);
5426 5441 spa_config_exit(spa, SCL_ALL, FTAG);
5427 5442 }
5428 5443 }
5429 5444
5430 5445 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
5431 5446
5432 5447 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5433 5448 spa_unload(spa);
5434 5449 spa_deactivate(spa);
5435 5450 }
5436 5451
5437 5452 if (oldconfig && spa->spa_config)
5438 5453 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
5439 5454
5440 5455 if (new_state != POOL_STATE_UNINITIALIZED) {
5441 5456 if (!hardforce)
5442 5457 spa_write_cachefile(spa, B_TRUE, B_TRUE);
5443 5458 spa_remove(spa);
5444 5459 }
5445 5460 mutex_exit(&spa_namespace_lock);
5446 5461
5447 5462 return (0);
5448 5463 }
5449 5464
5450 5465 /*
5451 5466 * Destroy a storage pool.
5452 5467 */
5453 5468 int
5454 5469 spa_destroy(char *pool)
5455 5470 {
5456 5471 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
5457 5472 B_FALSE, B_FALSE));
5458 5473 }
5459 5474
5460 5475 /*
5461 5476 * Export a storage pool.
5462 5477 */
5463 5478 int
5464 5479 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
5465 5480 boolean_t hardforce)
5466 5481 {
5467 5482 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
5468 5483 force, hardforce));
5469 5484 }
5470 5485
5471 5486 /*
5472 5487 * Similar to spa_export(), this unloads the spa_t without actually removing it
5473 5488 * from the namespace in any way.
5474 5489 */
5475 5490 int
5476 5491 spa_reset(char *pool)
5477 5492 {
5478 5493 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
5479 5494 B_FALSE, B_FALSE));
5480 5495 }
5481 5496
5482 5497 /*
5483 5498 * ==========================================================================
5484 5499 * Device manipulation
5485 5500 * ==========================================================================
5486 5501 */
5487 5502
5488 5503 /*
5489 5504 * Add a device to a storage pool.
5490 5505 */
5491 5506 int
5492 5507 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
5493 5508 {
5494 5509 uint64_t txg, id;
5495 5510 int error;
5496 5511 vdev_t *rvd = spa->spa_root_vdev;
5497 5512 vdev_t *vd, *tvd;
5498 5513 nvlist_t **spares, **l2cache;
5499 5514 uint_t nspares, nl2cache;
5500 5515
5501 5516 ASSERT(spa_writeable(spa));
5502 5517
5503 5518 txg = spa_vdev_enter(spa);
5504 5519
5505 5520 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
5506 5521 VDEV_ALLOC_ADD)) != 0)
5507 5522 return (spa_vdev_exit(spa, NULL, txg, error));
5508 5523
5509 5524 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
5510 5525
5511 5526 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
5512 5527 &nspares) != 0)
5513 5528 nspares = 0;
5514 5529
5515 5530 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
5516 5531 &nl2cache) != 0)
5517 5532 nl2cache = 0;
5518 5533
5519 5534 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
5520 5535 return (spa_vdev_exit(spa, vd, txg, EINVAL));
5521 5536
5522 5537 if (vd->vdev_children != 0 &&
5523 5538 (error = vdev_create(vd, txg, B_FALSE)) != 0)
5524 5539 return (spa_vdev_exit(spa, vd, txg, error));
5525 5540
5526 5541 /*
5527 5542 * We must validate the spares and l2cache devices after checking the
5528 5543 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
5529 5544 */
5530 5545 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
5531 5546 return (spa_vdev_exit(spa, vd, txg, error));
5532 5547
5533 5548 /*
5534 5549 * If we are in the middle of a device removal, we can only add
5535 5550 * devices which match the existing devices in the pool.
5536 5551 * If we are in the middle of a removal, or have some indirect
5537 5552 * vdevs, we cannot add raidz toplevels.
5538 5553 */
5539 5554 if (spa->spa_vdev_removal != NULL ||
5540 5555 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
5541 5556 for (int c = 0; c < vd->vdev_children; c++) {
5542 5557 tvd = vd->vdev_child[c];
5543 5558 if (spa->spa_vdev_removal != NULL &&
5544 5559 tvd->vdev_ashift != spa->spa_max_ashift) {
5545 5560 return (spa_vdev_exit(spa, vd, txg, EINVAL));
5546 5561 }
5547 5562 /* Fail if top level vdev is raidz */
5548 5563 if (tvd->vdev_ops == &vdev_raidz_ops) {
5549 5564 return (spa_vdev_exit(spa, vd, txg, EINVAL));
5550 5565 }
5551 5566 /*
5552 5567 * Need the top level mirror to be
5553 5568 * a mirror of leaf vdevs only
5554 5569 */
5555 5570 if (tvd->vdev_ops == &vdev_mirror_ops) {
5556 5571 for (uint64_t cid = 0;
5557 5572 cid < tvd->vdev_children; cid++) {
5558 5573 vdev_t *cvd = tvd->vdev_child[cid];
5559 5574 if (!cvd->vdev_ops->vdev_op_leaf) {
5560 5575 return (spa_vdev_exit(spa, vd,
5561 5576 txg, EINVAL));
5562 5577 }
5563 5578 }
5564 5579 }
5565 5580 }
5566 5581 }
5567 5582
5568 5583 for (int c = 0; c < vd->vdev_children; c++) {
5569 5584
5570 5585 /*
5571 5586 * Set the vdev id to the first hole, if one exists.
5572 5587 */
5573 5588 for (id = 0; id < rvd->vdev_children; id++) {
5574 5589 if (rvd->vdev_child[id]->vdev_ishole) {
5575 5590 vdev_free(rvd->vdev_child[id]);
5576 5591 break;
5577 5592 }
5578 5593 }
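		/*
		 * If no hole was found, id equals rvd->vdev_children here,
		 * so the new top-level vdev is appended after the existing
		 * children.
		 */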
5579 5594 tvd = vd->vdev_child[c];
5580 5595 vdev_remove_child(vd, tvd);
5581 5596 tvd->vdev_id = id;
5582 5597 vdev_add_child(rvd, tvd);
5583 5598 vdev_config_dirty(tvd);
5584 5599 }
5585 5600
5586 5601 if (nspares != 0) {
5587 5602 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
5588 5603 ZPOOL_CONFIG_SPARES);
5589 5604 spa_load_spares(spa);
5590 5605 spa->spa_spares.sav_sync = B_TRUE;
5591 5606 }
5592 5607
5593 5608 if (nl2cache != 0) {
5594 5609 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
5595 5610 ZPOOL_CONFIG_L2CACHE);
5596 5611 spa_load_l2cache(spa);
5597 5612 spa->spa_l2cache.sav_sync = B_TRUE;
5598 5613 }
5599 5614
5600 5615 /*
5601 5616 * We have to be careful when adding new vdevs to an existing pool.
5602 5617 * If other threads start allocating from these vdevs before we
5603 5618 * sync the config cache, and we lose power, then upon reboot we may
5604 5619 * fail to open the pool because there are DVAs that the config cache
5605 5620 * can't translate. Therefore, we first add the vdevs without
5606 5621 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
5607 5622 * and then let spa_config_update() initialize the new metaslabs.
5608 5623 *
5609 5624 * spa_load() checks for added-but-not-initialized vdevs, so that
5610 5625 * if we lose power at any point in this sequence, the remaining
5611 5626 * steps will be completed the next time we load the pool.
5612 5627 */
5613 5628 (void) spa_vdev_exit(spa, vd, txg, 0);
5614 5629
5615 5630 mutex_enter(&spa_namespace_lock);
5616 5631 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5617 5632 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
5618 5633 mutex_exit(&spa_namespace_lock);
5619 5634
5620 5635 return (0);
5621 5636 }
5622 5637
5623 5638 /*
5624 5639 * Attach a device to a mirror. The arguments are the path to any device
5625 5640 * in the mirror, and the nvroot for the new device. If the path specifies
5626 5641 * a device that is not mirrored, we automatically insert the mirror vdev.
5627 5642 *
5628 5643 * If 'replacing' is specified, the new device is intended to replace the
5629 5644 * existing device; in this case the two devices are made into their own
5630 5645 * mirror using the 'replacing' vdev, which is functionally identical to
5631 5646 * the mirror vdev (it actually reuses all the same ops) but has a few
5632 5647 * extra rules: you can't attach to it after it's been created, and upon
5633 5648 * completion of resilvering, the first disk (the one being replaced)
5634 5649 * is automatically detached.
5635 5650 */
5636 5651 int
5637 5652 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
5638 5653 {
5639 5654 uint64_t txg, dtl_max_txg;
5640 5655 vdev_t *rvd = spa->spa_root_vdev;
5641 5656 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
5642 5657 vdev_ops_t *pvops;
5643 5658 char *oldvdpath, *newvdpath;
5644 5659 int newvd_isspare;
5645 5660 int error;
5646 5661
5647 5662 ASSERT(spa_writeable(spa));
5648 5663
5649 5664 txg = spa_vdev_enter(spa);
5650 5665
5651 5666 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
5652 5667
5653 5668 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5654 5669 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
5655 5670 error = (spa_has_checkpoint(spa)) ?
5656 5671 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
5657 5672 return (spa_vdev_exit(spa, NULL, txg, error));
5658 5673 }
5659 5674
5660 5675 if (spa->spa_vdev_removal != NULL)
5661 5676 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5662 5677
5663 5678 if (oldvd == NULL)
5664 5679 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5665 5680
5666 5681 if (!oldvd->vdev_ops->vdev_op_leaf)
5667 5682 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5668 5683
5669 5684 pvd = oldvd->vdev_parent;
5670 5685
5671 5686 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
5672 5687 VDEV_ALLOC_ATTACH)) != 0)
5673 5688 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5674 5689
5675 5690 if (newrootvd->vdev_children != 1)
5676 5691 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5677 5692
5678 5693 newvd = newrootvd->vdev_child[0];
5679 5694
5680 5695 if (!newvd->vdev_ops->vdev_op_leaf)
5681 5696 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5682 5697
5683 5698 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
5684 5699 return (spa_vdev_exit(spa, newrootvd, txg, error));
5685 5700
5686 5701 /*
5687 5702 * Spares can't replace logs
5688 5703 */
5689 5704 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
5690 5705 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5691 5706
5692 5707 if (!replacing) {
5693 5708 /*
5694 5709 * For attach, the only allowable parent is a mirror or the root
5695 5710 * vdev.
5696 5711 */
5697 5712 if (pvd->vdev_ops != &vdev_mirror_ops &&
5698 5713 pvd->vdev_ops != &vdev_root_ops)
5699 5714 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5700 5715
5701 5716 pvops = &vdev_mirror_ops;
5702 5717 } else {
5703 5718 /*
5704 5719 * Active hot spares can only be replaced by inactive hot
5705 5720 * spares.
5706 5721 */
5707 5722 if (pvd->vdev_ops == &vdev_spare_ops &&
5708 5723 oldvd->vdev_isspare &&
5709 5724 !spa_has_spare(spa, newvd->vdev_guid))
5710 5725 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5711 5726
5712 5727 /*
5713 5728 * If the source is a hot spare, and the parent isn't already a
5714 5729 * spare, then we want to create a new hot spare. Otherwise, we
5715 5730 * want to create a replacing vdev. The user is not allowed to
5716 5731 * attach to a spared vdev child unless the 'isspare' state is
5717 5732 * the same (spare replaces spare, non-spare replaces
5718 5733 * non-spare).
5719 5734 */
5720 5735 if (pvd->vdev_ops == &vdev_replacing_ops &&
5721 5736 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
5722 5737 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5723 5738 } else if (pvd->vdev_ops == &vdev_spare_ops &&
5724 5739 newvd->vdev_isspare != oldvd->vdev_isspare) {
5725 5740 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5726 5741 }
5727 5742
5728 5743 if (newvd->vdev_isspare)
5729 5744 pvops = &vdev_spare_ops;
5730 5745 else
5731 5746 pvops = &vdev_replacing_ops;
5732 5747 }
5733 5748
5734 5749 /*
5735 5750 * Make sure the new device is big enough.
5736 5751 */
5737 5752 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
5738 5753 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
5739 5754
5740 5755 /*
5741 5756 * The new device cannot have a higher alignment requirement
5742 5757 * than the top-level vdev.
5743 5758 */
5744 5759 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
5745 5760 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
5746 5761
5747 5762 /*
5748 5763 * If this is an in-place replacement, update oldvd's path and devid
5749 5764 * to make it distinguishable from newvd, and unopenable from now on.
5750 5765 */
5751 5766 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
5752 5767 spa_strfree(oldvd->vdev_path);
5753 5768 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
5754 5769 KM_SLEEP);
5755 5770 (void) sprintf(oldvd->vdev_path, "%s/%s",
5756 5771 newvd->vdev_path, "old");
5757 5772 if (oldvd->vdev_devid != NULL) {
5758 5773 spa_strfree(oldvd->vdev_devid);
5759 5774 oldvd->vdev_devid = NULL;
5760 5775 }
5761 5776 }
5762 5777
5763 5778 /* mark the device being resilvered */
5764 5779 newvd->vdev_resilver_txg = txg;
5765 5780
5766 5781 /*
5767 5782 * If the parent is not a mirror, or if we're replacing, insert the new
5768 5783 * mirror/replacing/spare vdev above oldvd.
5769 5784 */
5770 5785 if (pvd->vdev_ops != pvops)
5771 5786 pvd = vdev_add_parent(oldvd, pvops);
5772 5787
5773 5788 ASSERT(pvd->vdev_top->vdev_parent == rvd);
5774 5789 ASSERT(pvd->vdev_ops == pvops);
5775 5790 ASSERT(oldvd->vdev_parent == pvd);
5776 5791
5777 5792 /*
5778 5793 * Extract the new device from its root and add it to pvd.
5779 5794 */
5780 5795 vdev_remove_child(newrootvd, newvd);
5781 5796 newvd->vdev_id = pvd->vdev_children;
5782 5797 newvd->vdev_crtxg = oldvd->vdev_crtxg;
5783 5798 vdev_add_child(pvd, newvd);
5784 5799
5785 5800 tvd = newvd->vdev_top;
5786 5801 ASSERT(pvd->vdev_top == tvd);
5787 5802 ASSERT(tvd->vdev_parent == rvd);
5788 5803
5789 5804 vdev_config_dirty(tvd);
5790 5805
5791 5806 /*
5792 5807 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
5793 5808 * for any dmu_sync-ed blocks. It will propagate upward when
5794 5809 * spa_vdev_exit() calls vdev_dtl_reassess().
5795 5810 */
5796 5811 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
5797 5812
5798 5813 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
5799 5814 dtl_max_txg - TXG_INITIAL);
5800 5815
5801 5816 if (newvd->vdev_isspare) {
5802 5817 spa_spare_activate(newvd);
5803 5818 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
5804 5819 }
5805 5820
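	/*
	 * Stash copies of the vdev paths now; the history entry is logged
	 * after spa_vdev_exit() below, when these vdevs may no longer be
	 * safe to dereference.
	 */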
5806 5821 oldvdpath = spa_strdup(oldvd->vdev_path);
5807 5822 newvdpath = spa_strdup(newvd->vdev_path);
5808 5823 newvd_isspare = newvd->vdev_isspare;
5809 5824
5810 5825 /*
5811 5826 * Mark newvd's DTL dirty in this txg.
5812 5827 */
5813 5828 vdev_dirty(tvd, VDD_DTL, newvd, txg);
5814 5829
5815 5830 /*
5816 5831 * Schedule the resilver to restart in the future. We do this to
5817 5832 * ensure that dmu_sync-ed blocks have been stitched into the
5818 5833 * respective datasets.
5819 5834 */
5820 5835 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
5821 5836
5822 5837 if (spa->spa_bootfs)
5823 5838 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
5824 5839
5825 5840 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
5826 5841
5827 5842 /*
5828 5843 * Commit the config
5829 5844 */
5830 5845 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
5831 5846
5832 5847 spa_history_log_internal(spa, "vdev attach", NULL,
5833 5848 "%s vdev=%s %s vdev=%s",
5834 5849 replacing && newvd_isspare ? "spare in" :
5835 5850 replacing ? "replace" : "attach", newvdpath,
5836 5851 replacing ? "for" : "to", oldvdpath);
5837 5852
5838 5853 spa_strfree(oldvdpath);
5839 5854 spa_strfree(newvdpath);
5840 5855
5841 5856 return (0);
5842 5857 }
5843 5858
5844 5859 /*
5845 5860 * Detach a device from a mirror or replacing vdev.
5846 5861 *
5847 5862 * If 'replace_done' is specified, only detach if the parent
5848 5863 * is a replacing vdev.
5849 5864 */
5850 5865 int
5851 5866 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
5852 5867 {
5853 5868 uint64_t txg;
5854 5869 int error;
5855 5870 vdev_t *rvd = spa->spa_root_vdev;
5856 5871 vdev_t *vd, *pvd, *cvd, *tvd;
5857 5872 boolean_t unspare = B_FALSE;
5858 5873 uint64_t unspare_guid = 0;
5859 5874 char *vdpath;
5860 5875
5861 5876 ASSERT(spa_writeable(spa));
5862 5877
5863 5878 txg = spa_vdev_enter(spa);
5864 5879
5865 5880 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5866 5881
5867 5882 /*
5868 5883 * Besides being called directly from the userland through the
5869 5884 * ioctl interface, spa_vdev_detach() can be potentially called
5870 5885 * at the end of spa_vdev_resilver_done().
5871 5886 *
5872 5887 * In the regular case, when we have a checkpoint this shouldn't
5873 5888 * happen as we never empty the DTLs of a vdev during the scrub
5874 5889 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
5875 5890 * should never get here when we have a checkpoint.
5876 5891 *
5877 5892 * That said, even in a case when we checkpoint the pool exactly
5878 5893 * as spa_vdev_resilver_done() calls this function everything
5879 5894 * should be fine as the resilver will return right away.
5880 5895 */
5881 5896 ASSERT(MUTEX_HELD(&spa_namespace_lock));
5882 5897 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
5883 5898 error = (spa_has_checkpoint(spa)) ?
5884 5899 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
5885 5900 return (spa_vdev_exit(spa, NULL, txg, error));
5886 5901 }
5887 5902
5888 5903 if (vd == NULL)
5889 5904 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5890 5905
5891 5906 if (!vd->vdev_ops->vdev_op_leaf)
5892 5907 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5893 5908
5894 5909 pvd = vd->vdev_parent;
5895 5910
5896 5911 /*
5897 5912 * If the parent/child relationship is not as expected, don't do it.
5898 5913 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
5899 5914 * vdev that's replacing B with C. The user's intent in replacing
5900 5915 * is to go from M(A,B) to M(A,C). If the user decides to cancel
5901 5916 * the replace by detaching C, the expected behavior is to end up
5902 5917 * M(A,B). But suppose that right after deciding to detach C,
5903 5918 * the replacement of B completes. We would have M(A,C), and then
5904 5919 * ask to detach C, which would leave us with just A -- not what
5905 5920 * the user wanted. To prevent this, we make sure that the
5906 5921 * parent/child relationship hasn't changed -- in this example,
5907 5922 * that C's parent is still the replacing vdev R.
5908 5923 */
5909 5924 if (pvd->vdev_guid != pguid && pguid != 0)
5910 5925 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5911 5926
5912 5927 /*
5913 5928 * Only 'replacing' or 'spare' vdevs can be replaced.
5914 5929 */
5915 5930 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
5916 5931 pvd->vdev_ops != &vdev_spare_ops)
5917 5932 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5918 5933
5919 5934 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
5920 5935 spa_version(spa) >= SPA_VERSION_SPARES);
5921 5936
5922 5937 /*
5923 5938 * Only mirror, replacing, and spare vdevs support detach.
5924 5939 */
5925 5940 if (pvd->vdev_ops != &vdev_replacing_ops &&
5926 5941 pvd->vdev_ops != &vdev_mirror_ops &&
5927 5942 pvd->vdev_ops != &vdev_spare_ops)
5928 5943 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5929 5944
5930 5945 /*
5931 5946 * If this device has the only valid copy of some data,
5932 5947 * we cannot safely detach it.
5933 5948 */
5934 5949 if (vdev_dtl_required(vd))
5935 5950 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5936 5951
5937 5952 ASSERT(pvd->vdev_children >= 2);
5938 5953
5939 5954 /*
5940 5955 * If we are detaching the second disk from a replacing vdev, then
5941 5956 * check to see if we changed the original vdev's path to have "/old"
5942 5957 * at the end in spa_vdev_attach(). If so, undo that change now.
5943 5958 */
5944 5959 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
5945 5960 vd->vdev_path != NULL) {
5946 5961 size_t len = strlen(vd->vdev_path);
5947 5962
5948 5963 for (int c = 0; c < pvd->vdev_children; c++) {
5949 5964 cvd = pvd->vdev_child[c];
5950 5965
5951 5966 if (cvd == vd || cvd->vdev_path == NULL)
5952 5967 continue;
5953 5968
5954 5969 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
5955 5970 strcmp(cvd->vdev_path + len, "/old") == 0) {
5956 5971 spa_strfree(cvd->vdev_path);
5957 5972 cvd->vdev_path = spa_strdup(vd->vdev_path);
5958 5973 break;
5959 5974 }
5960 5975 }
5961 5976 }
5962 5977
5963 5978 /*
5964 5979 * If we are detaching the original disk from a spare, then it implies
5965 5980 * that the spare should become a real disk, and be removed from the
5966 5981 * active spare list for the pool.
5967 5982 */
5968 5983 if (pvd->vdev_ops == &vdev_spare_ops &&
5969 5984 vd->vdev_id == 0 &&
5970 5985 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
5971 5986 unspare = B_TRUE;
5972 5987
5973 5988 /*
5974 5989 * Erase the disk labels so the disk can be used for other things.
5975 5990 * This must be done after all other error cases are handled,
5976 5991 * but before we disembowel vd (so we can still do I/O to it).
5977 5992 * But if we can't do it, don't treat the error as fatal --
5978 5993 * it may be that the unwritability of the disk is the reason
5979 5994 * it's being detached!
5980 5995 */
5981 5996 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5982 5997
5983 5998 /*
5984 5999 * Remove vd from its parent and compact the parent's children.
5985 6000 */
5986 6001 vdev_remove_child(pvd, vd);
5987 6002 vdev_compact_children(pvd);
5988 6003
5989 6004 /*
5990 6005 * Remember one of the remaining children so we can get tvd below.
5991 6006 */
5992 6007 cvd = pvd->vdev_child[pvd->vdev_children - 1];
5993 6008
5994 6009 /*
5995 6010 * If we need to remove the remaining child from the list of hot spares,
5996 6011 * do it now, marking the vdev as no longer a spare in the process.
5997 6012 * We must do this before vdev_remove_parent(), because that can
5998 6013 * change the GUID if it creates a new toplevel GUID. For a similar
5999 6014 * reason, we must remove the spare now, in the same txg as the detach;
6000 6015 * otherwise someone could attach a new sibling, change the GUID, and
6001 6016 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
6002 6017 */
6003 6018 if (unspare) {
6004 6019 ASSERT(cvd->vdev_isspare);
6005 6020 spa_spare_remove(cvd);
6006 6021 unspare_guid = cvd->vdev_guid;
6007 6022 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
6008 6023 cvd->vdev_unspare = B_TRUE;
6009 6024 }
6010 6025
6011 6026 /*
6012 6027 * If the parent mirror/replacing vdev only has one child,
6013 6028 * the parent is no longer needed. Remove it from the tree.
6014 6029 */
6015 6030 if (pvd->vdev_children == 1) {
6016 6031 if (pvd->vdev_ops == &vdev_spare_ops)
6017 6032 cvd->vdev_unspare = B_FALSE;
6018 6033 vdev_remove_parent(cvd);
6019 6034 }
6020 6035
6021 6036
6022 6037 /*
6023 6038 * We don't set tvd until now because the parent we just removed
6024 6039 * may have been the previous top-level vdev.
6025 6040 */
6026 6041 tvd = cvd->vdev_top;
6027 6042 ASSERT(tvd->vdev_parent == rvd);
6028 6043
6029 6044 /*
6030 6045 * Reevaluate the parent vdev state.
6031 6046 */
6032 6047 vdev_propagate_state(cvd);
6033 6048
6034 6049 /*
6035 6050 * If the 'autoexpand' property is set on the pool then automatically
6036 6051 * try to expand the size of the pool. For example if the device we
6037 6052 * just detached was smaller than the others, it may be possible to
6038 6053 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
6039 6054 * first so that we can obtain the updated sizes of the leaf vdevs.
6040 6055 */
6041 6056 if (spa->spa_autoexpand) {
6042 6057 vdev_reopen(tvd);
6043 6058 vdev_expand(tvd, txg);
6044 6059 }
6045 6060
6046 6061 vdev_config_dirty(tvd);
6047 6062
6048 6063 /*
6049 6064 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
6050 6065 * vd->vdev_detached is set and free vd's DTL object in syncing context.
6051 6066 * But first make sure we're not on any *other* txg's DTL list, to
6052 6067 * prevent vd from being accessed after it's freed.
6053 6068 */
6054 6069 vdpath = spa_strdup(vd->vdev_path);
6055 6070 for (int t = 0; t < TXG_SIZE; t++)
6056 6071 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
6057 6072 vd->vdev_detached = B_TRUE;
6058 6073 vdev_dirty(tvd, VDD_DTL, vd, txg);
6059 6074
6060 6075 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
6061 6076
6062 6077 /* hang on to the spa before we release the lock */
6063 6078 spa_open_ref(spa, FTAG);
6064 6079
6065 6080 error = spa_vdev_exit(spa, vd, txg, 0);
6066 6081
6067 6082 spa_history_log_internal(spa, "detach", NULL,
6068 6083 "vdev=%s", vdpath);
6069 6084 spa_strfree(vdpath);
6070 6085
6071 6086 /*
6072 6087 * If this was the removal of the original device in a hot spare vdev,
6073 6088 * then we want to go through and remove the device from the hot spare
6074 6089 * list of every other pool.
6075 6090 */
6076 6091 if (unspare) {
6077 6092 spa_t *altspa = NULL;
6078 6093
6079 6094 mutex_enter(&spa_namespace_lock);
6080 6095 while ((altspa = spa_next(altspa)) != NULL) {
6081 6096 if (altspa->spa_state != POOL_STATE_ACTIVE ||
6082 6097 altspa == spa)
6083 6098 continue;
6084 6099
6085 6100 spa_open_ref(altspa, FTAG);
6086 6101 mutex_exit(&spa_namespace_lock);
6087 6102 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
6088 6103 mutex_enter(&spa_namespace_lock);
6089 6104 spa_close(altspa, FTAG);
6090 6105 }
6091 6106 mutex_exit(&spa_namespace_lock);
6092 6107
6093 6108 /* search the rest of the vdevs for spares to remove */
6094 6109 spa_vdev_resilver_done(spa);
6095 6110 }
6096 6111
6097 6112 /* all done with the spa; OK to release */
6098 6113 mutex_enter(&spa_namespace_lock);
6099 6114 spa_close(spa, FTAG);
6100 6115 mutex_exit(&spa_namespace_lock);
6101 6116
6102 6117 return (error);
6103 6118 }
6104 6119
6105 6120 int
6106 6121 spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
6107 6122 {
6108 6123 /*
6109 6124 * We hold the namespace lock through the whole function
6110 6125 * to prevent any changes to the pool while we're starting or
6111 6126 * stopping initialization. The config and state locks are held so that
6112 6127 * we can properly assess the vdev state before we commit to
6113 6128 * the initializing operation.
6114 6129 */
6115 6130 mutex_enter(&spa_namespace_lock);
6116 6131 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6117 6132
6118 6133 /* Look up vdev and ensure it's a leaf. */
6119 6134 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
6120 6135 if (vd == NULL || vd->vdev_detached) {
6121 6136 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6122 6137 mutex_exit(&spa_namespace_lock);
6123 6138 return (SET_ERROR(ENODEV));
6124 6139 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
6125 6140 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6126 6141 mutex_exit(&spa_namespace_lock);
6127 6142 return (SET_ERROR(EINVAL));
6128 6143 } else if (!vdev_writeable(vd)) {
6129 6144 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6130 6145 mutex_exit(&spa_namespace_lock);
6131 6146 return (SET_ERROR(EROFS));
6132 6147 }
6133 6148 mutex_enter(&vd->vdev_initialize_lock);
6134 6149 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6135 6150
6136 6151 /*
6137 6152 * When we activate an initialize action we check to see
6138 6153 * if the vdev_initialize_thread is NULL. We do this instead
6139 6154 * of using the vdev_initialize_state since there might be
6140 6155 * a previous initialization process which has completed but
6141 6156 * whose thread has not yet exited.
6142 6157 */
6143 6158 if (cmd_type == POOL_INITIALIZE_DO &&
6144 6159 (vd->vdev_initialize_thread != NULL ||
6145 6160 vd->vdev_top->vdev_removing)) {
6146 6161 mutex_exit(&vd->vdev_initialize_lock);
6147 6162 mutex_exit(&spa_namespace_lock);
6148 6163 return (SET_ERROR(EBUSY));
6149 6164 } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
6150 6165 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
6151 6166 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
6152 6167 mutex_exit(&vd->vdev_initialize_lock);
6153 6168 mutex_exit(&spa_namespace_lock);
6154 6169 return (SET_ERROR(ESRCH));
6155 6170 } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
6156 6171 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
6157 6172 mutex_exit(&vd->vdev_initialize_lock);
6158 6173 mutex_exit(&spa_namespace_lock);
6159 6174 return (SET_ERROR(ESRCH));
6160 6175 }
6161 6176
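	/*
	 * All preconditions have been checked above; dispatch the requested
	 * action while vdev_initialize_lock is still held.
	 */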
6162 6177 switch (cmd_type) {
6163 6178 case POOL_INITIALIZE_DO:
6164 6179 vdev_initialize(vd);
6165 6180 break;
6166 6181 case POOL_INITIALIZE_CANCEL:
6167 6182 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
6168 6183 break;
6169 6184 case POOL_INITIALIZE_SUSPEND:
6170 6185 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
6171 6186 break;
6172 6187 default:
6173 6188 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
6174 6189 }
6175 6190 mutex_exit(&vd->vdev_initialize_lock);
6176 6191
6177 6192 /* Sync out the initializing state */
6178 6193 txg_wait_synced(spa->spa_dsl_pool, 0);
6179 6194 mutex_exit(&spa_namespace_lock);
6180 6195
6181 6196 return (0);
6182 6197 }
6183 6198
6184 6199
6185 6200 /*
6186 6201 * Split a set of devices from their mirrors, and create a new pool from them.
6187 6202 */
6188 6203 int
6189 6204 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
6190 6205 nvlist_t *props, boolean_t exp)
6191 6206 {
6192 6207 int error = 0;
6193 6208 uint64_t txg, *glist;
6194 6209 spa_t *newspa;
6195 6210 uint_t c, children, lastlog;
6196 6211 nvlist_t **child, *nvl, *tmp;
6197 6212 dmu_tx_t *tx;
6198 6213 char *altroot = NULL;
6199 6214 vdev_t *rvd, **vml = NULL; /* vdev modify list */
6200 6215 boolean_t activate_slog;
6201 6216
6202 6217 ASSERT(spa_writeable(spa));
6203 6218
6204 6219 txg = spa_vdev_enter(spa);
6205 6220
6206 6221 ASSERT(MUTEX_HELD(&spa_namespace_lock));
6207 6222 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6208 6223 error = (spa_has_checkpoint(spa)) ?
6209 6224 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6210 6225 return (spa_vdev_exit(spa, NULL, txg, error));
6211 6226 }
6212 6227
6213 6228 /* clear the log and flush everything up to now */
6214 6229 activate_slog = spa_passivate_log(spa);
6215 6230 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6216 6231 error = spa_reset_logs(spa);
6217 6232 txg = spa_vdev_config_enter(spa);
6218 6233
6219 6234 if (activate_slog)
6220 6235 spa_activate_log(spa);
6221 6236
6222 6237 if (error != 0)
6223 6238 return (spa_vdev_exit(spa, NULL, txg, error));
6224 6239
6225 6240 /* check new spa name before going any further */
6226 6241 if (spa_lookup(newname) != NULL)
6227 6242 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
6228 6243
6229 6244 /*
6230 6245 * scan through all the children to ensure they're all mirrors
6231 6246 */
6232 6247 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
6233 6248 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
6234 6249 &children) != 0)
6235 6250 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6236 6251
6237 6252 /* first, check to ensure we've got the right child count */
6238 6253 rvd = spa->spa_root_vdev;
6239 6254 lastlog = 0;
6240 6255 for (c = 0; c < rvd->vdev_children; c++) {
6241 6256 vdev_t *vd = rvd->vdev_child[c];
6242 6257
6243 6258 /* don't count the holes & logs as children */
6244 6259 if (vd->vdev_islog || !vdev_is_concrete(vd)) {
6245 6260 if (lastlog == 0)
6246 6261 lastlog = c;
6247 6262 continue;
6248 6263 }
6249 6264
6250 6265 lastlog = 0;
6251 6266 }
6252 6267 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
6253 6268 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6254 6269
6255 6270 /* next, ensure no spare or cache devices are part of the split */
6256 6271 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
6257 6272 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
6258 6273 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6259 6274
6260 6275 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
6261 6276 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
6262 6277
6263 6278 /* then, loop over each vdev and validate it */
6264 6279 for (c = 0; c < children; c++) {
6265 6280 uint64_t is_hole = 0;
6266 6281
6267 6282 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
6268 6283 &is_hole);
6269 6284
6270 6285 if (is_hole != 0) {
6271 6286 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
6272 6287 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
6273 6288 continue;
6274 6289 } else {
6275 6290 error = SET_ERROR(EINVAL);
6276 6291 break;
6277 6292 }
6278 6293 }
6279 6294
6280 6295 /* which disk is going to be split? */
6281 6296 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
6282 6297 &glist[c]) != 0) {
6283 6298 error = SET_ERROR(EINVAL);
6284 6299 break;
6285 6300 }
6286 6301
6287 6302 /* look it up in the spa */
6288 6303 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
6289 6304 if (vml[c] == NULL) {
6290 6305 error = SET_ERROR(ENODEV);
6291 6306 break;
6292 6307 }
6293 6308
6294 6309 /* make sure there's nothing stopping the split */
6295 6310 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
6296 6311 vml[c]->vdev_islog ||
6297 6312 !vdev_is_concrete(vml[c]) ||
6298 6313 vml[c]->vdev_isspare ||
6299 6314 vml[c]->vdev_isl2cache ||
6300 6315 !vdev_writeable(vml[c]) ||
6301 6316 vml[c]->vdev_children != 0 ||
6302 6317 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
6303 6318 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
6304 6319 error = SET_ERROR(EINVAL);
6305 6320 break;
6306 6321 }
6307 6322
6308 6323 if (vdev_dtl_required(vml[c])) {
6309 6324 error = SET_ERROR(EBUSY);
6310 6325 break;
6311 6326 }
6312 6327
6313 6328 /* we need certain info from the top level */
6314 6329 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
6315 6330 vml[c]->vdev_top->vdev_ms_array) == 0);
6316 6331 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
6317 6332 vml[c]->vdev_top->vdev_ms_shift) == 0);
6318 6333 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
6319 6334 vml[c]->vdev_top->vdev_asize) == 0);
6320 6335 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
6321 6336 vml[c]->vdev_top->vdev_ashift) == 0);
6322 6337
6323 6338 /* transfer per-vdev ZAPs */
6324 6339 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
6325 6340 VERIFY0(nvlist_add_uint64(child[c],
6326 6341 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
6327 6342
6328 6343 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
6329 6344 VERIFY0(nvlist_add_uint64(child[c],
6330 6345 ZPOOL_CONFIG_VDEV_TOP_ZAP,
6331 6346 vml[c]->vdev_parent->vdev_top_zap));
6332 6347 }
6333 6348
6334 6349 if (error != 0) {
6335 6350 kmem_free(vml, children * sizeof (vdev_t *));
6336 6351 kmem_free(glist, children * sizeof (uint64_t));
6337 6352 return (spa_vdev_exit(spa, NULL, txg, error));
6338 6353 }
6339 6354
6340 6355 /* stop writers from using the disks */
6341 6356 for (c = 0; c < children; c++) {
6342 6357 if (vml[c] != NULL)
6343 6358 vml[c]->vdev_offline = B_TRUE;
6344 6359 }
6345 6360 vdev_reopen(spa->spa_root_vdev);
6346 6361
6347 6362 /*
6348 6363 * Temporarily record the splitting vdevs in the spa config. This
6349 6364 * will disappear once the config is regenerated.
6350 6365 */
6351 6366 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6352 6367 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
6353 6368 glist, children) == 0);
6354 6369 kmem_free(glist, children * sizeof (uint64_t));
6355 6370
6356 6371 mutex_enter(&spa->spa_props_lock);
6357 6372 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
6358 6373 nvl) == 0);
6359 6374 mutex_exit(&spa->spa_props_lock);
6360 6375 spa->spa_config_splitting = nvl;
6361 6376 vdev_config_dirty(spa->spa_root_vdev);
6362 6377
6363 6378 /* configure and create the new pool */
6364 6379 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
6365 6380 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
6366 6381 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
6367 6382 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6368 6383 spa_version(spa)) == 0);
6369 6384 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
6370 6385 spa->spa_config_txg) == 0);
6371 6386 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
6372 6387 spa_generate_guid(NULL)) == 0);
6373 6388 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
6374 6389 (void) nvlist_lookup_string(props,
6375 6390 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
6376 6391
6377 6392 /* add the new pool to the namespace */
6378 6393 newspa = spa_add(newname, config, altroot);
6379 6394 newspa->spa_avz_action = AVZ_ACTION_REBUILD;
6380 6395 newspa->spa_config_txg = spa->spa_config_txg;
6381 6396 spa_set_log_state(newspa, SPA_LOG_CLEAR);
6382 6397
6383 6398 /* release the spa config lock, retaining the namespace lock */
6384 6399 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6385 6400
6386 6401 if (zio_injection_enabled)
6387 6402 zio_handle_panic_injection(spa, FTAG, 1);
6388 6403
6389 6404 spa_activate(newspa, spa_mode_global);
6390 6405 spa_async_suspend(newspa);
6391 6406
6392 6407 for (c = 0; c < children; c++) {
6393 6408 if (vml[c] != NULL) {
6394 6409 /*
6395 6410 * Temporarily stop the initializing activity. We set
6396 6411 * the state to ACTIVE so that we know to resume
6397 6412 * the initializing once the split has completed.
6398 6413 */
6399 6414 mutex_enter(&vml[c]->vdev_initialize_lock);
6400 6415 vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
6401 6416 mutex_exit(&vml[c]->vdev_initialize_lock);
6402 6417 }
6403 6418 }
6404 6419
6405 6420 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
6406 6421
6407 6422 /* create the new pool from the disks of the original pool */
6408 6423 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
6409 6424 if (error)
6410 6425 goto out;
6411 6426
6412 6427 /* if that worked, generate a real config for the new pool */
6413 6428 if (newspa->spa_root_vdev != NULL) {
6414 6429 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
6415 6430 NV_UNIQUE_NAME, KM_SLEEP) == 0);
6416 6431 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
6417 6432 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
6418 6433 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
6419 6434 B_TRUE));
6420 6435 }
6421 6436
6422 6437 /* set the props */
6423 6438 if (props != NULL) {
6424 6439 spa_configfile_set(newspa, props, B_FALSE);
6425 6440 error = spa_prop_set(newspa, props);
6426 6441 if (error)
6427 6442 goto out;
6428 6443 }
6429 6444
6430 6445 /* flush everything */
6431 6446 txg = spa_vdev_config_enter(newspa);
6432 6447 vdev_config_dirty(newspa->spa_root_vdev);
6433 6448 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
6434 6449
6435 6450 if (zio_injection_enabled)
6436 6451 zio_handle_panic_injection(spa, FTAG, 2);
6437 6452
6438 6453 spa_async_resume(newspa);
6439 6454
6440 6455 /* finally, update the original pool's config */
6441 6456 txg = spa_vdev_config_enter(spa);
6442 6457 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
6443 6458 error = dmu_tx_assign(tx, TXG_WAIT);
6444 6459 if (error != 0)
6445 6460 dmu_tx_abort(tx);
6446 6461 for (c = 0; c < children; c++) {
6447 6462 if (vml[c] != NULL) {
6448 6463 vdev_split(vml[c]);
6449 6464 if (error == 0)
6450 6465 spa_history_log_internal(spa, "detach", tx,
6451 6466 "vdev=%s", vml[c]->vdev_path);
6452 6467
6453 6468 vdev_free(vml[c]);
6454 6469 }
6455 6470 }
6456 6471 spa->spa_avz_action = AVZ_ACTION_REBUILD;
6457 6472 vdev_config_dirty(spa->spa_root_vdev);
6458 6473 spa->spa_config_splitting = NULL;
6459 6474 nvlist_free(nvl);
6460 6475 if (error == 0)
6461 6476 dmu_tx_commit(tx);
6462 6477 (void) spa_vdev_exit(spa, NULL, txg, 0);
6463 6478
6464 6479 if (zio_injection_enabled)
6465 6480 zio_handle_panic_injection(spa, FTAG, 3);
6466 6481
6467 6482 /* split is complete; log a history record */
6468 6483 spa_history_log_internal(newspa, "split", NULL,
6469 6484 "from pool %s", spa_name(spa));
6470 6485
6471 6486 kmem_free(vml, children * sizeof (vdev_t *));
6472 6487
6473 6488 /* if we're not going to mount the filesystems in userland, export */
6474 6489 if (exp)
6475 6490 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
6476 6491 B_FALSE, B_FALSE);
6477 6492
6478 6493 return (error);
6479 6494
6480 6495 out:
6481 6496 spa_unload(newspa);
6482 6497 spa_deactivate(newspa);
6483 6498 spa_remove(newspa);
6484 6499
6485 6500 txg = spa_vdev_config_enter(spa);
6486 6501
6487 6502 /* re-online all offlined disks */
6488 6503 for (c = 0; c < children; c++) {
6489 6504 if (vml[c] != NULL)
6490 6505 vml[c]->vdev_offline = B_FALSE;
6491 6506 }
6492 6507
6493 6508 /* restart initializing disks as necessary */
6494 6509 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
6495 6510
6496 6511 vdev_reopen(spa->spa_root_vdev);
6497 6512
6498 6513 nvlist_free(spa->spa_config_splitting);
6499 6514 spa->spa_config_splitting = NULL;
6500 6515 (void) spa_vdev_exit(spa, NULL, txg, error);
6501 6516
6502 6517 kmem_free(vml, children * sizeof (vdev_t *));
6503 6518 return (error);
6504 6519 }
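/*
 * Editor's note -- illustrative sketch, not part of spa.c: the split path
 * above records the leaf guids of the devices being split off in a temporary
 * nvlist under ZPOOL_CONFIG_SPLIT_LIST. A minimal standalone version of that
 * step, using the same nvlist interfaces already used in this file, looks
 * like this (split_guid_list_sketch is a hypothetical name):
 */
static nvlist_t *
split_guid_list_sketch(uint64_t *glist, uint_t children)
{
	nvlist_t *nvl;

	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	return (nvl);	/* the caller stores this in spa_config_splitting */
}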
6505 6520
6506 6521 /*
6507 6522 * Find any device that's done replacing, or a vdev marked 'unspare' that's
6508 6523 * currently spared, so we can detach it.
6509 6524 */
6510 6525 static vdev_t *
6511 6526 spa_vdev_resilver_done_hunt(vdev_t *vd)
6512 6527 {
6513 6528 vdev_t *newvd, *oldvd;
6514 6529
6515 6530 for (int c = 0; c < vd->vdev_children; c++) {
6516 6531 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
6517 6532 if (oldvd != NULL)
6518 6533 return (oldvd);
6519 6534 }
6520 6535
6521 6536 /*
6522 6537 * Check for a completed replacement. We always consider the first
6523 6538 * vdev in the list to be the oldest vdev, and the last one to be
6524 6539 * the newest (see spa_vdev_attach() for how that works). In
6525 6540 * the case where the newest vdev is faulted, we will not automatically
6526 6541 * remove it after a resilver completes. This is OK as it will require
6527 6542 * user intervention to determine which disk the admin wishes to keep.
6528 6543 */
6529 6544 if (vd->vdev_ops == &vdev_replacing_ops) {
6530 6545 ASSERT(vd->vdev_children > 1);
6531 6546
6532 6547 newvd = vd->vdev_child[vd->vdev_children - 1];
6533 6548 oldvd = vd->vdev_child[0];
6534 6549
6535 6550 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
6536 6551 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6537 6552 !vdev_dtl_required(oldvd))
6538 6553 return (oldvd);
6539 6554 }
6540 6555
6541 6556 /*
6542 6557 * Check for a completed resilver with the 'unspare' flag set.
6543 6558 * Also potentially update faulted state.
6544 6559 */
6545 6560 if (vd->vdev_ops == &vdev_spare_ops) {
6546 6561 vdev_t *first = vd->vdev_child[0];
6547 6562 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
6548 6563
6549 6564 if (last->vdev_unspare) {
6550 6565 oldvd = first;
6551 6566 newvd = last;
6552 6567 } else if (first->vdev_unspare) {
6553 6568 oldvd = last;
6554 6569 newvd = first;
6555 6570 } else {
6556 6571 oldvd = NULL;
6557 6572 }
6558 6573
6559 6574 if (oldvd != NULL &&
6560 6575 vdev_dtl_empty(newvd, DTL_MISSING) &&
6561 6576 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6562 6577 !vdev_dtl_required(oldvd))
6563 6578 return (oldvd);
6564 6579
6565 6580 vdev_propagate_state(vd);
6566 6581
6567 6582 /*
6568 6583 * If there are more than two spares attached to a disk,
6569 6584 * and those spares are not required, then we want to
6570 6585 * attempt to free them up now so that they can be used
6571 6586 * by other pools. Once we're back down to a single
6572 6587 * disk+spare, we stop removing them.
6573 6588 */
6574 6589 if (vd->vdev_children > 2) {
6575 6590 newvd = vd->vdev_child[1];
6576 6591
6577 6592 if (newvd->vdev_isspare && last->vdev_isspare &&
6578 6593 vdev_dtl_empty(last, DTL_MISSING) &&
6579 6594 vdev_dtl_empty(last, DTL_OUTAGE) &&
6580 6595 !vdev_dtl_required(newvd))
6581 6596 return (newvd);
6582 6597 }
6583 6598 }
6584 6599
6585 6600 return (NULL);
6586 6601 }
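/*
 * Editor's note -- illustrative sketch, not part of spa.c: the replacing-vdev
 * convention described above ("first child is oldest, last child is newest")
 * reduces to the following selection; pick_replacing_pair_sketch is a
 * hypothetical helper shown only to make the ordering explicit.
 */
static void
pick_replacing_pair_sketch(vdev_t *vd, vdev_t **oldvdp, vdev_t **newvdp)
{
	ASSERT(vd->vdev_children > 1);
	*oldvdp = vd->vdev_child[0];				/* original device */
	*newvdp = vd->vdev_child[vd->vdev_children - 1];	/* latest attach */
}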
6587 6602
6588 6603 static void
6589 6604 spa_vdev_resilver_done(spa_t *spa)
6590 6605 {
6591 6606 vdev_t *vd, *pvd, *ppvd;
6592 6607 uint64_t guid, sguid, pguid, ppguid;
6593 6608
6594 6609 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6595 6610
6596 6611 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
6597 6612 pvd = vd->vdev_parent;
6598 6613 ppvd = pvd->vdev_parent;
6599 6614 guid = vd->vdev_guid;
6600 6615 pguid = pvd->vdev_guid;
6601 6616 ppguid = ppvd->vdev_guid;
6602 6617 sguid = 0;
6603 6618 /*
6604 6619 * If we have just finished replacing a hot spared device, then
6605 6620 * we need to detach the parent's first child (the original hot
6606 6621 * spare) as well.
6607 6622 */
6608 6623 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
6609 6624 ppvd->vdev_children == 2) {
6610 6625 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
6611 6626 sguid = ppvd->vdev_child[1]->vdev_guid;
6612 6627 }
6613 6628 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
6614 6629
6615 6630 spa_config_exit(spa, SCL_ALL, FTAG);
6616 6631 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
6617 6632 return;
6618 6633 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
6619 6634 return;
6620 6635 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6621 6636 }
6622 6637
6623 6638 spa_config_exit(spa, SCL_ALL, FTAG);
6624 6639 }
6625 6640
6626 6641 /*
6627 6642 * Update the stored path or FRU for this vdev.
6628 6643 */
6629 6644 int
6630 6645 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
6631 6646 boolean_t ispath)
6632 6647 {
6633 6648 vdev_t *vd;
6634 6649 boolean_t sync = B_FALSE;
6635 6650
6636 6651 ASSERT(spa_writeable(spa));
6637 6652
6638 6653 spa_vdev_state_enter(spa, SCL_ALL);
6639 6654
6640 6655 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
6641 6656 return (spa_vdev_state_exit(spa, NULL, ENOENT));
6642 6657
6643 6658 if (!vd->vdev_ops->vdev_op_leaf)
6644 6659 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
6645 6660
6646 6661 if (ispath) {
6647 6662 if (strcmp(value, vd->vdev_path) != 0) {
6648 6663 spa_strfree(vd->vdev_path);
6649 6664 vd->vdev_path = spa_strdup(value);
6650 6665 sync = B_TRUE;
6651 6666 }
6652 6667 } else {
6653 6668 if (vd->vdev_fru == NULL) {
6654 6669 vd->vdev_fru = spa_strdup(value);
6655 6670 sync = B_TRUE;
6656 6671 } else if (strcmp(value, vd->vdev_fru) != 0) {
6657 6672 spa_strfree(vd->vdev_fru);
6658 6673 vd->vdev_fru = spa_strdup(value);
6659 6674 sync = B_TRUE;
6660 6675 }
6661 6676 }
6662 6677
6663 6678 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
6664 6679 }
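/*
 * Editor's note -- illustrative sketch, not part of spa.c: spa_vdev_set_common()
 * above uses a simple "update only if changed" idiom for the path/FRU strings.
 * Shown in isolation (update_string_sketch is a hypothetical name); the
 * boolean_t result corresponds to the 'sync' flag that decides whether the
 * vdev label needs to be rewritten.
 */
static boolean_t
update_string_sketch(char **slotp, const char *value)
{
	if (*slotp != NULL && strcmp(value, *slotp) == 0)
		return (B_FALSE);		/* unchanged, nothing to sync */
	if (*slotp != NULL)
		spa_strfree(*slotp);
	*slotp = spa_strdup(value);
	return (B_TRUE);			/* caller should sync the label */
}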
6665 6680
6666 6681 int
6667 6682 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
6668 6683 {
6669 6684 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
6670 6685 }
6671 6686
6672 6687 int
6673 6688 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
6674 6689 {
6675 6690 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
6676 6691 }
6677 6692
6678 6693 /*
6679 6694 * ==========================================================================
6680 6695 * SPA Scanning
6681 6696 * ==========================================================================
6682 6697 */
6683 6698 int
6684 6699 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
6685 6700 {
6686 6701 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6687 6702
6688 6703 if (dsl_scan_resilvering(spa->spa_dsl_pool))
6689 6704 return (SET_ERROR(EBUSY));
6690 6705
6691 6706 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
6692 6707 }
6693 6708
6694 6709 int
6695 6710 spa_scan_stop(spa_t *spa)
6696 6711 {
6697 6712 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6698 6713 if (dsl_scan_resilvering(spa->spa_dsl_pool))
6699 6714 return (SET_ERROR(EBUSY));
6700 6715 return (dsl_scan_cancel(spa->spa_dsl_pool));
6701 6716 }
6702 6717
6703 6718 int
6704 6719 spa_scan(spa_t *spa, pool_scan_func_t func)
6705 6720 {
6706 6721 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6707 6722
6708 6723 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
6709 6724 return (SET_ERROR(ENOTSUP));
6710 6725
6711 6726 /*
6712 6727 * If a resilver was requested, but there is no DTL on a
6713 6728 * writeable leaf device, we have nothing to do.
6714 6729 */
6715 6730 if (func == POOL_SCAN_RESILVER &&
6716 6731 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
6717 6732 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
6718 6733 return (0);
6719 6734 }
6720 6735
6721 6736 return (dsl_scan(spa->spa_dsl_pool, func));
6722 6737 }
6723 6738
6724 6739 /*
6725 6740 * ==========================================================================
6726 6741 * SPA async task processing
6727 6742 * ==========================================================================
6728 6743 */
6729 6744
6730 6745 static void
6731 6746 spa_async_remove(spa_t *spa, vdev_t *vd)
6732 6747 {
6733 6748 if (vd->vdev_remove_wanted) {
6734 6749 vd->vdev_remove_wanted = B_FALSE;
6735 6750 vd->vdev_delayed_close = B_FALSE;
6736 6751 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
6737 6752
6738 6753 /*
6739 6754 * We want to clear the stats, but we don't want to do a full
6740 6755 * vdev_clear() as that will cause us to throw away
6741 6756 * degraded/faulted state as well as attempt to reopen the
6742 6757 * device, all of which is a waste.
6743 6758 */
6744 6759 vd->vdev_stat.vs_read_errors = 0;
6745 6760 vd->vdev_stat.vs_write_errors = 0;
6746 6761 vd->vdev_stat.vs_checksum_errors = 0;
6747 6762
6748 6763 vdev_state_dirty(vd->vdev_top);
6749 6764 }
6750 6765
6751 6766 for (int c = 0; c < vd->vdev_children; c++)
6752 6767 spa_async_remove(spa, vd->vdev_child[c]);
6753 6768 }
6754 6769
6755 6770 static void
6756 6771 spa_async_probe(spa_t *spa, vdev_t *vd)
6757 6772 {
6758 6773 if (vd->vdev_probe_wanted) {
6759 6774 vd->vdev_probe_wanted = B_FALSE;
6760 6775 vdev_reopen(vd); /* vdev_open() does the actual probe */
6761 6776 }
6762 6777
6763 6778 for (int c = 0; c < vd->vdev_children; c++)
6764 6779 spa_async_probe(spa, vd->vdev_child[c]);
6765 6780 }
6766 6781
6767 6782 static void
6768 6783 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
6769 6784 {
6770 6785 sysevent_id_t eid;
6771 6786 nvlist_t *attr;
6772 6787 char *physpath;
6773 6788
6774 6789 if (!spa->spa_autoexpand)
6775 6790 return;
6776 6791
6777 6792 for (int c = 0; c < vd->vdev_children; c++) {
6778 6793 vdev_t *cvd = vd->vdev_child[c];
6779 6794 spa_async_autoexpand(spa, cvd);
6780 6795 }
6781 6796
6782 6797 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
6783 6798 return;
6784 6799
6785 6800 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
6786 6801 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
6787 6802
6788 6803 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6789 6804 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
6790 6805
6791 6806 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
6792 6807 ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
6793 6808
6794 6809 nvlist_free(attr);
6795 6810 kmem_free(physpath, MAXPATHLEN);
6796 6811 }
6797 6812
6798 6813 static void
6799 6814 spa_async_thread(void *arg)
6800 6815 {
6801 6816 spa_t *spa = (spa_t *)arg;
6802 6817 int tasks;
6803 6818
6804 6819 ASSERT(spa->spa_sync_on);
6805 6820
6806 6821 mutex_enter(&spa->spa_async_lock);
6807 6822 tasks = spa->spa_async_tasks;
6808 6823 spa->spa_async_tasks = 0;
6809 6824 mutex_exit(&spa->spa_async_lock);
6810 6825
6811 6826 /*
6812 6827 * See if the config needs to be updated.
6813 6828 */
6814 6829 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
6815 6830 uint64_t old_space, new_space;
6816 6831
6817 6832 mutex_enter(&spa_namespace_lock);
6818 6833 old_space = metaslab_class_get_space(spa_normal_class(spa));
6819 6834 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6820 6835 new_space = metaslab_class_get_space(spa_normal_class(spa));
6821 6836 mutex_exit(&spa_namespace_lock);
6822 6837
6823 6838 /*
6824 6839 * If the pool grew as a result of the config update,
6825 6840 * then log an internal history event.
6826 6841 */
6827 6842 if (new_space != old_space) {
6828 6843 spa_history_log_internal(spa, "vdev online", NULL,
6829 6844 "pool '%s' size: %llu(+%llu)",
6830 6845 spa_name(spa), new_space, new_space - old_space);
6831 6846 }
6832 6847 }
6833 6848
6834 6849 /*
6835 6850 * See if any devices need to be marked REMOVED.
6836 6851 */
6837 6852 if (tasks & SPA_ASYNC_REMOVE) {
6838 6853 spa_vdev_state_enter(spa, SCL_NONE);
6839 6854 spa_async_remove(spa, spa->spa_root_vdev);
6840 6855 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
6841 6856 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
6842 6857 for (int i = 0; i < spa->spa_spares.sav_count; i++)
6843 6858 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
6844 6859 (void) spa_vdev_state_exit(spa, NULL, 0);
6845 6860 }
6846 6861
6847 6862 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
6848 6863 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6849 6864 spa_async_autoexpand(spa, spa->spa_root_vdev);
6850 6865 spa_config_exit(spa, SCL_CONFIG, FTAG);
6851 6866 }
6852 6867
6853 6868 /*
6854 6869 * See if any devices need to be probed.
6855 6870 */
6856 6871 if (tasks & SPA_ASYNC_PROBE) {
6857 6872 spa_vdev_state_enter(spa, SCL_NONE);
6858 6873 spa_async_probe(spa, spa->spa_root_vdev);
6859 6874 (void) spa_vdev_state_exit(spa, NULL, 0);
6860 6875 }
6861 6876
6862 6877 /*
6863 6878 * If any devices are done replacing, detach them.
6864 6879 */
6865 6880 if (tasks & SPA_ASYNC_RESILVER_DONE)
6866 6881 spa_vdev_resilver_done(spa);
6867 6882
6868 6883 /*
6869 6884 * Kick off a resilver.
6870 6885 */
6871 6886 if (tasks & SPA_ASYNC_RESILVER)
6872 6887 dsl_resilver_restart(spa->spa_dsl_pool, 0);
6873 6888
6874 6889 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
6875 6890 mutex_enter(&spa_namespace_lock);
6876 6891 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6877 6892 vdev_initialize_restart(spa->spa_root_vdev);
6878 6893 spa_config_exit(spa, SCL_CONFIG, FTAG);
6879 6894 mutex_exit(&spa_namespace_lock);
6880 6895 }
6881 6896
6882 6897 /*
6883 6898 * Let the world know that we're done.
6884 6899 */
6885 6900 mutex_enter(&spa->spa_async_lock);
6886 6901 spa->spa_async_thread = NULL;
6887 6902 cv_broadcast(&spa->spa_async_cv);
6888 6903 mutex_exit(&spa->spa_async_lock);
6889 6904 thread_exit();
6890 6905 }
6891 6906
6892 6907 void
6893 6908 spa_async_suspend(spa_t *spa)
6894 6909 {
6895 6910 mutex_enter(&spa->spa_async_lock);
6896 6911 spa->spa_async_suspended++;
6897 6912 while (spa->spa_async_thread != NULL)
6898 6913 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6899 6914 mutex_exit(&spa->spa_async_lock);
6900 6915
6901 6916 spa_vdev_remove_suspend(spa);
6902 6917
6903 6918 zthr_t *condense_thread = spa->spa_condense_zthr;
6904 6919 if (condense_thread != NULL && zthr_isrunning(condense_thread))
6905 6920 VERIFY0(zthr_cancel(condense_thread));
6906 6921
6907 6922 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
6908 6923 if (discard_thread != NULL && zthr_isrunning(discard_thread))
6909 6924 VERIFY0(zthr_cancel(discard_thread));
6910 6925 }
6911 6926
6912 6927 void
6913 6928 spa_async_resume(spa_t *spa)
6914 6929 {
6915 6930 mutex_enter(&spa->spa_async_lock);
6916 6931 ASSERT(spa->spa_async_suspended != 0);
6917 6932 spa->spa_async_suspended--;
6918 6933 mutex_exit(&spa->spa_async_lock);
6919 6934 spa_restart_removal(spa);
6920 6935
6921 6936 zthr_t *condense_thread = spa->spa_condense_zthr;
6922 6937 if (condense_thread != NULL && !zthr_isrunning(condense_thread))
6923 6938 zthr_resume(condense_thread);
6924 6939
6925 6940 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
6926 6941 if (discard_thread != NULL && !zthr_isrunning(discard_thread))
6927 6942 zthr_resume(discard_thread);
6928 6943 }
6929 6944
6930 6945 static boolean_t
6931 6946 spa_async_tasks_pending(spa_t *spa)
6932 6947 {
6933 6948 uint_t non_config_tasks;
6934 6949 uint_t config_task;
6935 6950 boolean_t config_task_suspended;
6936 6951
6937 6952 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
6938 6953 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6939 6954 if (spa->spa_ccw_fail_time == 0) {
6940 6955 config_task_suspended = B_FALSE;
6941 6956 } else {
6942 6957 config_task_suspended =
6943 6958 (gethrtime() - spa->spa_ccw_fail_time) <
6944 6959 (zfs_ccw_retry_interval * NANOSEC);
6945 6960 }
6946 6961
6947 6962 return (non_config_tasks || (config_task && !config_task_suspended));
6948 6963 }
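/*
 * Editor's note -- illustrative sketch, not part of spa.c: the config-update
 * throttle above compares "time since the last cache-file write failure"
 * against a retry interval; zfs_ccw_retry_interval appears to be expressed in
 * seconds, hence the NANOSEC scaling. ccw_suppressed_sketch is a hypothetical
 * name for the isolated test.
 */
static boolean_t
ccw_suppressed_sketch(hrtime_t fail_time, int retry_interval_sec)
{
	if (fail_time == 0)
		return (B_FALSE);
	return ((gethrtime() - fail_time) <
	    (hrtime_t)retry_interval_sec * NANOSEC);
}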
6949 6964
6950 6965 static void
6951 6966 spa_async_dispatch(spa_t *spa)
6952 6967 {
6953 6968 mutex_enter(&spa->spa_async_lock);
6954 6969 if (spa_async_tasks_pending(spa) &&
6955 6970 !spa->spa_async_suspended &&
6956 6971 spa->spa_async_thread == NULL &&
6957 6972 rootdir != NULL)
6958 6973 spa->spa_async_thread = thread_create(NULL, 0,
6959 6974 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6960 6975 mutex_exit(&spa->spa_async_lock);
6961 6976 }
6962 6977
6963 6978 void
6964 6979 spa_async_request(spa_t *spa, int task)
6965 6980 {
6966 6981 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6967 6982 mutex_enter(&spa->spa_async_lock);
6968 6983 spa->spa_async_tasks |= task;
6969 6984 mutex_exit(&spa->spa_async_lock);
6970 6985 }
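/*
 * Editor's note -- usage example, not part of spa.c: requesting an async task
 * is just setting a bit; for instance, the failed-split path earlier in this
 * file does
 *
 *	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 *
 * and spa_async_dispatch() later decides whether a worker thread must be
 * created to service the accumulated task mask.
 */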
6971 6986
6972 6987 /*
6973 6988 * ==========================================================================
6974 6989 * SPA syncing routines
6975 6990 * ==========================================================================
6976 6991 */
6977 6992
6978 6993 static int
6979 6994 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6980 6995 {
6981 6996 bpobj_t *bpo = arg;
6982 6997 bpobj_enqueue(bpo, bp, tx);
6983 6998 return (0);
6984 6999 }
6985 7000
6986 7001 static int
6987 7002 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6988 7003 {
6989 7004 zio_t *zio = arg;
6990 7005
6991 7006 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6992 7007 zio->io_flags));
6993 7008 return (0);
6994 7009 }
6995 7010
6996 7011 /*
6997 7012 * Note: this simple function is not inlined to make it easier to dtrace the
6998 7013 * amount of time spent syncing frees.
6999 7014 */
7000 7015 static void
7001 7016 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
7002 7017 {
7003 7018 zio_t *zio = zio_root(spa, NULL, NULL, 0);
7004 7019 bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
7005 7020 VERIFY(zio_wait(zio) == 0);
7006 7021 }
7007 7022
7008 7023 /*
7009 7024 * Note: this simple function is not inlined to make it easier to dtrace the
7010 7025 * amount of time spent syncing deferred frees.
7011 7026 */
7012 7027 static void
7013 7028 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
7014 7029 {
7015 7030 zio_t *zio = zio_root(spa, NULL, NULL, 0);
7016 7031 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
7017 7032 spa_free_sync_cb, zio, tx), ==, 0);
7018 7033 VERIFY0(zio_wait(zio));
7019 7034 }
7020 7035
7021 7036
7022 7037 static void
7023 7038 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
7024 7039 {
7025 7040 char *packed = NULL;
7026 7041 size_t bufsize;
7027 7042 size_t nvsize = 0;
7028 7043 dmu_buf_t *db;
7029 7044
7030 7045 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
7031 7046
7032 7047 /*
7033 7048 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
7034 7049 * information. This avoids the dmu_buf_will_dirty() path and
7035 7050 * saves us a pre-read to get data we don't actually care about.
7036 7051 */
7037 7052 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
7038 7053 packed = kmem_alloc(bufsize, KM_SLEEP);
7039 7054
7040 7055 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
7041 7056 KM_SLEEP) == 0);
7042 7057 bzero(packed + nvsize, bufsize - nvsize);
7043 7058
7044 7059 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
7045 7060
7046 7061 kmem_free(packed, bufsize);
7047 7062
7048 7063 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
7049 7064 dmu_buf_will_dirty(db, tx);
7050 7065 *(uint64_t *)db->db_data = nvsize;
7051 7066 dmu_buf_rele(db, FTAG);
7052 7067 }
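/*
 * Editor's note -- illustrative sketch, not part of spa.c: the P2ROUNDUP()
 * above pads the packed nvlist out to a whole number of config blocks;
 * assuming SPA_CONFIG_BLOCKSIZE is a power of two, the rounding is equivalent
 * to the mask arithmetic below (round_up_to_block_sketch is a hypothetical
 * name). For example, nvsize = 5000 with a 16K block size gives
 * bufsize = 16384.
 */
static size_t
round_up_to_block_sketch(size_t nvsize, size_t blocksize)
{
	return ((nvsize + blocksize - 1) & ~(blocksize - 1));
}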
7053 7068
7054 7069 static void
7055 7070 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
7056 7071 const char *config, const char *entry)
7057 7072 {
7058 7073 nvlist_t *nvroot;
7059 7074 nvlist_t **list;
7060 7075 int i;
7061 7076
7062 7077 if (!sav->sav_sync)
7063 7078 return;
7064 7079
7065 7080 /*
7066 7081 * Update the MOS nvlist describing the list of available devices.
7067 7082 * spa_validate_aux() will have already made sure this nvlist is
7068 7083 * valid and the vdevs are labeled appropriately.
7069 7084 */
7070 7085 if (sav->sav_object == 0) {
7071 7086 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
7072 7087 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
7073 7088 sizeof (uint64_t), tx);
7074 7089 VERIFY(zap_update(spa->spa_meta_objset,
7075 7090 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
7076 7091 &sav->sav_object, tx) == 0);
7077 7092 }
7078 7093
7079 7094 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
7080 7095 if (sav->sav_count == 0) {
7081 7096 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
7082 7097 } else {
7083 7098 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
7084 7099 for (i = 0; i < sav->sav_count; i++)
7085 7100 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
7086 7101 B_FALSE, VDEV_CONFIG_L2CACHE);
7087 7102 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
7088 7103 sav->sav_count) == 0);
7089 7104 for (i = 0; i < sav->sav_count; i++)
7090 7105 nvlist_free(list[i]);
7091 7106 kmem_free(list, sav->sav_count * sizeof (void *));
7092 7107 }
7093 7108
7094 7109 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
7095 7110 nvlist_free(nvroot);
7096 7111
7097 7112 sav->sav_sync = B_FALSE;
7098 7113 }
7099 7114
7100 7115 /*
7101 7116 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
7102 7117 * The all-vdev ZAP must be empty.
7103 7118 */
7104 7119 static void
7105 7120 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
7106 7121 {
7107 7122 spa_t *spa = vd->vdev_spa;
7108 7123 if (vd->vdev_top_zap != 0) {
7109 7124 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7110 7125 vd->vdev_top_zap, tx));
7111 7126 }
7112 7127 if (vd->vdev_leaf_zap != 0) {
7113 7128 VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7114 7129 vd->vdev_leaf_zap, tx));
7115 7130 }
7116 7131 for (uint64_t i = 0; i < vd->vdev_children; i++) {
7117 7132 spa_avz_build(vd->vdev_child[i], avz, tx);
7118 7133 }
7119 7134 }
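/*
 * Editor's note, not part of spa.c: the AVZ_ACTION_REBUILD branch in
 * spa_sync_config_object() below is effectively a set difference -- build a
 * new all-vdev ZAP from the live vdev tree, then walk the old AVZ and destroy
 * any per-vdev ZAP that zap_lookup_int() reports (via ENOENT) as absent from
 * the new one, before swapping the new AVZ into the pool directory object.
 */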
7120 7135
7121 7136 static void
7122 7137 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
7123 7138 {
7124 7139 nvlist_t *config;
7125 7140
7126 7141 /*
7127 7142 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
7128 7143 * its config may not be dirty but we still need to build per-vdev ZAPs.
7129 7144 * Similarly, if the pool is being assembled (e.g. after a split), we
7130 7145 * need to rebuild the AVZ although the config may not be dirty.
7131 7146 */
7132 7147 if (list_is_empty(&spa->spa_config_dirty_list) &&
7133 7148 spa->spa_avz_action == AVZ_ACTION_NONE)
7134 7149 return;
7135 7150
7136 7151 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7137 7152
7138 7153 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
7139 7154 spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
7140 7155 spa->spa_all_vdev_zaps != 0);
7141 7156
7142 7157 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
7143 7158 /* Make and build the new AVZ */
7144 7159 uint64_t new_avz = zap_create(spa->spa_meta_objset,
7145 7160 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
7146 7161 spa_avz_build(spa->spa_root_vdev, new_avz, tx);
7147 7162
7148 7163 /* Diff old AVZ with new one */
7149 7164 zap_cursor_t zc;
7150 7165 zap_attribute_t za;
7151 7166
7152 7167 for (zap_cursor_init(&zc, spa->spa_meta_objset,
7153 7168 spa->spa_all_vdev_zaps);
7154 7169 zap_cursor_retrieve(&zc, &za) == 0;
7155 7170 zap_cursor_advance(&zc)) {
7156 7171 uint64_t vdzap = za.za_first_integer;
7157 7172 if (zap_lookup_int(spa->spa_meta_objset, new_avz,
7158 7173 vdzap) == ENOENT) {
7159 7174 /*
7160 7175 * ZAP is listed in old AVZ but not in new one;
7161 7176 * destroy it
7162 7177 */
7163 7178 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
7164 7179 tx));
7165 7180 }
7166 7181 }
7167 7182
7168 7183 zap_cursor_fini(&zc);
7169 7184
7170 7185 /* Destroy the old AVZ */
7171 7186 VERIFY0(zap_destroy(spa->spa_meta_objset,
7172 7187 spa->spa_all_vdev_zaps, tx));
7173 7188
7174 7189 /* Replace the old AVZ in the dir obj with the new one */
7175 7190 VERIFY0(zap_update(spa->spa_meta_objset,
7176 7191 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
7177 7192 sizeof (new_avz), 1, &new_avz, tx));
7178 7193
7179 7194 spa->spa_all_vdev_zaps = new_avz;
7180 7195 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
7181 7196 zap_cursor_t zc;
7182 7197 zap_attribute_t za;
7183 7198
7184 7199 /* Walk through the AVZ and destroy all listed ZAPs */
7185 7200 for (zap_cursor_init(&zc, spa->spa_meta_objset,
7186 7201 spa->spa_all_vdev_zaps);
7187 7202 zap_cursor_retrieve(&zc, &za) == 0;
7188 7203 zap_cursor_advance(&zc)) {
7189 7204 uint64_t zap = za.za_first_integer;
7190 7205 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
7191 7206 }
7192 7207
7193 7208 zap_cursor_fini(&zc);
7194 7209
7195 7210 /* Destroy and unlink the AVZ itself */
7196 7211 VERIFY0(zap_destroy(spa->spa_meta_objset,
7197 7212 spa->spa_all_vdev_zaps, tx));
7198 7213 VERIFY0(zap_remove(spa->spa_meta_objset,
7199 7214 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
7200 7215 spa->spa_all_vdev_zaps = 0;
7201 7216 }
7202 7217
7203 7218 if (spa->spa_all_vdev_zaps == 0) {
7204 7219 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
7205 7220 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
7206 7221 DMU_POOL_VDEV_ZAP_MAP, tx);
7207 7222 }
7208 7223 spa->spa_avz_action = AVZ_ACTION_NONE;
7209 7224
7210 7225 /* Create ZAPs for vdevs that don't have them. */
7211 7226 vdev_construct_zaps(spa->spa_root_vdev, tx);
7212 7227
7213 7228 config = spa_config_generate(spa, spa->spa_root_vdev,
7214 7229 dmu_tx_get_txg(tx), B_FALSE);
7215 7230
7216 7231 /*
7217 7232 * If we're upgrading the spa version then make sure that
7218 7233 * the config object gets updated with the correct version.
7219 7234 */
7220 7235 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
7221 7236 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
7222 7237 spa->spa_uberblock.ub_version);
7223 7238
7224 7239 spa_config_exit(spa, SCL_STATE, FTAG);
7225 7240
7226 7241 nvlist_free(spa->spa_config_syncing);
7227 7242 spa->spa_config_syncing = config;
7228 7243
7229 7244 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
7230 7245 }
7231 7246
7232 7247 static void
7233 7248 spa_sync_version(void *arg, dmu_tx_t *tx)
7234 7249 {
7235 7250 uint64_t *versionp = arg;
7236 7251 uint64_t version = *versionp;
7237 7252 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7238 7253
7239 7254 /*
7240 7255 * Setting the version is special cased when first creating the pool.
7241 7256 */
7242 7257 ASSERT(tx->tx_txg != TXG_INITIAL);
7243 7258
7244 7259 ASSERT(SPA_VERSION_IS_SUPPORTED(version));
7245 7260 ASSERT(version >= spa_version(spa));
7246 7261
7247 7262 spa->spa_uberblock.ub_version = version;
7248 7263 vdev_config_dirty(spa->spa_root_vdev);
7249 7264 spa_history_log_internal(spa, "set", tx, "version=%lld", version);
7250 7265 }
7251 7266
7252 7267 /*
7253 7268 * Set zpool properties.
7254 7269 */
7255 7270 static void
7256 7271 spa_sync_props(void *arg, dmu_tx_t *tx)
7257 7272 {
7258 7273 nvlist_t *nvp = arg;
7259 7274 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7260 7275 objset_t *mos = spa->spa_meta_objset;
7261 7276 nvpair_t *elem = NULL;
7262 7277
7263 7278 mutex_enter(&spa->spa_props_lock);
7264 7279
7265 7280 while ((elem = nvlist_next_nvpair(nvp, elem))) {
7266 7281 uint64_t intval;
7267 7282 char *strval, *fname;
7268 7283 zpool_prop_t prop;
7269 7284 const char *propname;
7270 7285 zprop_type_t proptype;
7271 7286 spa_feature_t fid;
7272 7287
7273 7288 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
7274 7289 case ZPOOL_PROP_INVAL:
7275 7290 /*
7276 7291 * We checked this earlier in spa_prop_validate().
7277 7292 */
7278 7293 ASSERT(zpool_prop_feature(nvpair_name(elem)));
7279 7294
7280 7295 fname = strchr(nvpair_name(elem), '@') + 1;
7281 7296 VERIFY0(zfeature_lookup_name(fname, &fid));
7282 7297
7283 7298 spa_feature_enable(spa, fid, tx);
7284 7299 spa_history_log_internal(spa, "set", tx,
7285 7300 "%s=enabled", nvpair_name(elem));
7286 7301 break;
7287 7302
7288 7303 case ZPOOL_PROP_VERSION:
7289 7304 intval = fnvpair_value_uint64(elem);
7290 7305 /*
7291 7306 * The version is synced separately before other
7292 7307 * properties and should be correct by now.
7293 7308 */
7294 7309 ASSERT3U(spa_version(spa), >=, intval);
7295 7310 break;
7296 7311
7297 7312 case ZPOOL_PROP_ALTROOT:
7298 7313 /*
7299 7314 * 'altroot' is a non-persistent property. It should
7300 7315 * have been set temporarily at creation or import time.
7301 7316 */
7302 7317 ASSERT(spa->spa_root != NULL);
7303 7318 break;
7304 7319
7305 7320 case ZPOOL_PROP_READONLY:
7306 7321 case ZPOOL_PROP_CACHEFILE:
7307 7322 /*
7308 7323 * 'readonly' and 'cachefile' are also non-persistent
7309 7324 * properties.
7310 7325 */
7311 7326 break;
7312 7327 case ZPOOL_PROP_COMMENT:
7313 7328 strval = fnvpair_value_string(elem);
7314 7329 if (spa->spa_comment != NULL)
7315 7330 spa_strfree(spa->spa_comment);
7316 7331 spa->spa_comment = spa_strdup(strval);
7317 7332 /*
7318 7333 * We need to dirty the configuration on all the vdevs
7319 7334 * so that their labels get updated. It's unnecessary
7320 7335 * to do this for pool creation since the vdev's
7321 7336 * configuration has already been dirtied.
7322 7337 */
7323 7338 if (tx->tx_txg != TXG_INITIAL)
7324 7339 vdev_config_dirty(spa->spa_root_vdev);
7325 7340 spa_history_log_internal(spa, "set", tx,
7326 7341 "%s=%s", nvpair_name(elem), strval);
7327 7342 break;
7328 7343 default:
7329 7344 /*
7330 7345 * Set pool property values in the poolprops mos object.
7331 7346 */
7332 7347 if (spa->spa_pool_props_object == 0) {
7333 7348 spa->spa_pool_props_object =
7334 7349 zap_create_link(mos, DMU_OT_POOL_PROPS,
7335 7350 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
7336 7351 tx);
7337 7352 }
7338 7353
7339 7354 /* normalize the property name */
7340 7355 propname = zpool_prop_to_name(prop);
7341 7356 proptype = zpool_prop_get_type(prop);
7342 7357
7343 7358 if (nvpair_type(elem) == DATA_TYPE_STRING) {
7344 7359 ASSERT(proptype == PROP_TYPE_STRING);
7345 7360 strval = fnvpair_value_string(elem);
7346 7361 VERIFY0(zap_update(mos,
7347 7362 spa->spa_pool_props_object, propname,
7348 7363 1, strlen(strval) + 1, strval, tx));
7349 7364 spa_history_log_internal(spa, "set", tx,
7350 7365 "%s=%s", nvpair_name(elem), strval);
7351 7366 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
7352 7367 intval = fnvpair_value_uint64(elem);
7353 7368
7354 7369 if (proptype == PROP_TYPE_INDEX) {
7355 7370 const char *unused;
7356 7371 VERIFY0(zpool_prop_index_to_string(
7357 7372 prop, intval, &unused));
7358 7373 }
7359 7374 VERIFY0(zap_update(mos,
7360 7375 spa->spa_pool_props_object, propname,
7361 7376 8, 1, &intval, tx));
7362 7377 spa_history_log_internal(spa, "set", tx,
7363 7378 "%s=%lld", nvpair_name(elem), intval);
7364 7379 } else {
7365 7380 ASSERT(0); /* not allowed */
7366 7381 }
7367 7382
7368 7383 switch (prop) {
7369 7384 case ZPOOL_PROP_DELEGATION:
7370 7385 spa->spa_delegation = intval;
7371 7386 break;
7372 7387 case ZPOOL_PROP_BOOTFS:
7373 7388 spa->spa_bootfs = intval;
7374 7389 break;
7375 7390 case ZPOOL_PROP_FAILUREMODE:
7376 7391 spa->spa_failmode = intval;
7377 7392 break;
7378 7393 case ZPOOL_PROP_AUTOEXPAND:
7379 7394 spa->spa_autoexpand = intval;
7380 7395 if (tx->tx_txg != TXG_INITIAL)
7381 7396 spa_async_request(spa,
7382 7397 SPA_ASYNC_AUTOEXPAND);
7383 7398 break;
7384 7399 case ZPOOL_PROP_DEDUPDITTO:
7385 7400 spa->spa_dedup_ditto = intval;
7386 7401 break;
7387 7402 default:
7388 7403 break;
7389 7404 }
7390 7405 }
7391 7406
7392 7407 }
7393 7408
7394 7409 mutex_exit(&spa->spa_props_lock);
7395 7410 }
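/*
 * Editor's note -- illustrative sketch, not part of spa.c: the property loop
 * above is a standard nvpair walk. Reduced to the two value types the
 * function persists (walk_props_sketch is a hypothetical name):
 */
static void
walk_props_sketch(nvlist_t *nvp)
{
	nvpair_t *elem = NULL;

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if (nvpair_type(elem) == DATA_TYPE_STRING) {
			zfs_dbgmsg("prop %s=%s", nvpair_name(elem),
			    fnvpair_value_string(elem));
		} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
			zfs_dbgmsg("prop %s=%llu", nvpair_name(elem),
			    (u_longlong_t)fnvpair_value_uint64(elem));
		}
	}
}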
7396 7411
7397 7412 /*
7398 7413 * Perform one-time upgrade on-disk changes. spa_version() does not
7399 7414 * reflect the new version this txg, so there must be no changes this
7400 7415 * txg to anything that the upgrade code depends on after it executes.
7401 7416 * Therefore this must be called after dsl_pool_sync() does the sync
7402 7417 * tasks.
7403 7418 */
7404 7419 static void
7405 7420 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
7406 7421 {
7407 7422 dsl_pool_t *dp = spa->spa_dsl_pool;
7408 7423
7409 7424 ASSERT(spa->spa_sync_pass == 1);
7410 7425
7411 7426 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
7412 7427
7413 7428 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
7414 7429 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
7415 7430 dsl_pool_create_origin(dp, tx);
7416 7431
7417 7432 /* Keeping the origin open increases spa_minref */
7418 7433 spa->spa_minref += 3;
7419 7434 }
7420 7435
7421 7436 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
7422 7437 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
7423 7438 dsl_pool_upgrade_clones(dp, tx);
7424 7439 }
7425 7440
7426 7441 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
7427 7442 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
7428 7443 dsl_pool_upgrade_dir_clones(dp, tx);
7429 7444
7430 7445 /* Keeping the freedir open increases spa_minref */
7431 7446 spa->spa_minref += 3;
7432 7447 }
7433 7448
7434 7449 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
7435 7450 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7436 7451 spa_feature_create_zap_objects(spa, tx);
7437 7452 }
7438 7453
7439 7454 /*
7440 7455 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
7441 7456 * when the ability to use lz4 compression for metadata was added.
7442 7457 * Old pools that have this feature enabled must be upgraded to have
7443 7458 * this feature active.
7444 7459 */
7445 7460 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7446 7461 boolean_t lz4_en = spa_feature_is_enabled(spa,
7447 7462 SPA_FEATURE_LZ4_COMPRESS);
7448 7463 boolean_t lz4_ac = spa_feature_is_active(spa,
7449 7464 SPA_FEATURE_LZ4_COMPRESS);
7450 7465
7451 7466 if (lz4_en && !lz4_ac)
7452 7467 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
7453 7468 }
7454 7469
7455 7470 /*
7456 7471 * If we haven't written the salt, do so now. Note that the
7457 7472 * feature may not be activated yet, but that's fine since
7458 7473 * the presence of this ZAP entry is backwards compatible.
7459 7474 */
7460 7475 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7461 7476 DMU_POOL_CHECKSUM_SALT) == ENOENT) {
7462 7477 VERIFY0(zap_add(spa->spa_meta_objset,
7463 7478 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
7464 7479 sizeof (spa->spa_cksum_salt.zcs_bytes),
7465 7480 spa->spa_cksum_salt.zcs_bytes, tx));
7466 7481 }
7467 7482
7468 7483 rrw_exit(&dp->dp_config_rwlock, FTAG);
7469 7484 }
7470 7485
7471 7486 static void
7472 7487 vdev_indirect_state_sync_verify(vdev_t *vd)
7473 7488 {
7474 7489 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
7475 7490 vdev_indirect_births_t *vib = vd->vdev_indirect_births;
7476 7491
7477 7492 if (vd->vdev_ops == &vdev_indirect_ops) {
7478 7493 ASSERT(vim != NULL);
7479 7494 ASSERT(vib != NULL);
7480 7495 }
7481 7496
7482 7497 if (vdev_obsolete_sm_object(vd) != 0) {
7483 7498 ASSERT(vd->vdev_obsolete_sm != NULL);
7484 7499 ASSERT(vd->vdev_removing ||
7485 7500 vd->vdev_ops == &vdev_indirect_ops);
7486 7501 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
7487 7502 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
7488 7503
7489 7504 ASSERT3U(vdev_obsolete_sm_object(vd), ==,
7490 7505 space_map_object(vd->vdev_obsolete_sm));
7491 7506 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
7492 7507 space_map_allocated(vd->vdev_obsolete_sm));
7493 7508 }
7494 7509 ASSERT(vd->vdev_obsolete_segments != NULL);
7495 7510
7496 7511 /*
7497 7512 * Since frees / remaps to an indirect vdev can only
7498 7513 * happen in syncing context, the obsolete segments
7499 7514 * tree must be empty when we start syncing.
7500 7515 */
7501 7516 ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
7502 7517 }
7503 7518
7504 7519 /*
7505 7520 * Sync the specified transaction group. New blocks may be dirtied as
7506 7521 * part of the process, so we iterate until it converges.
7507 7522 */
7508 7523 void
7509 7524 spa_sync(spa_t *spa, uint64_t txg)
7510 7525 {
7511 7526 dsl_pool_t *dp = spa->spa_dsl_pool;
7512 7527 objset_t *mos = spa->spa_meta_objset;
7513 7528 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
7514 7529 vdev_t *rvd = spa->spa_root_vdev;
7515 7530 vdev_t *vd;
7516 7531 dmu_tx_t *tx;
7517 7532 int error;
7518 7533 uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
7519 7534 zfs_vdev_queue_depth_pct / 100;
7520 7535
7521 7536 VERIFY(spa_writeable(spa));
7522 7537
7523 7538 /*
7524 7539 * Wait for i/os issued in open context that need to complete
7525 7540 * before this txg syncs.
7526 7541 */
7527 7542 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
7528 7543 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
7529 7544 ZIO_FLAG_CANFAIL);
7530 7545
7531 7546 /*
7532 7547 * Lock out configuration changes.
7533 7548 */
7534 7549 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7535 7550
7536 7551 spa->spa_syncing_txg = txg;
7537 7552 spa->spa_sync_pass = 0;
7538 7553
7539 7554 for (int i = 0; i < spa->spa_alloc_count; i++) {
7540 7555 mutex_enter(&spa->spa_alloc_locks[i]);
7541 7556 VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
7542 7557 mutex_exit(&spa->spa_alloc_locks[i]);
7543 7558 }
7544 7559
7545 7560 /*
7546 7561 * If there are any pending vdev state changes, convert them
7547 7562 * into config changes that go out with this transaction group.
7548 7563 */
7549 7564 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7550 7565 while (list_head(&spa->spa_state_dirty_list) != NULL) {
7551 7566 /*
7552 7567 * We need the write lock here because, for aux vdevs,
7553 7568 * calling vdev_config_dirty() modifies sav_config.
7554 7569 * This is ugly and will become unnecessary when we
7555 7570 * eliminate the aux vdev wart by integrating all vdevs
7556 7571 * into the root vdev tree.
7557 7572 */
7558 7573 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7559 7574 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
7560 7575 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
7561 7576 vdev_state_clean(vd);
7562 7577 vdev_config_dirty(vd);
7563 7578 }
7564 7579 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7565 7580 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7566 7581 }
7567 7582 spa_config_exit(spa, SCL_STATE, FTAG);
7568 7583
7569 7584 tx = dmu_tx_create_assigned(dp, txg);
7570 7585
7571 7586 spa->spa_sync_starttime = gethrtime();
7572 7587 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
7573 7588 spa->spa_sync_starttime + spa->spa_deadman_synctime));
7574 7589
7575 7590 /*
7576 7591 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
7577 7592 * set spa_deflate if we have no raid-z vdevs.
7578 7593 */
7579 7594 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
7580 7595 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
7581 7596 int i;
7582 7597
7583 7598 for (i = 0; i < rvd->vdev_children; i++) {
7584 7599 vd = rvd->vdev_child[i];
7585 7600 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
7586 7601 break;
7587 7602 }
7588 7603 if (i == rvd->vdev_children) {
7589 7604 spa->spa_deflate = TRUE;
7590 7605 VERIFY(0 == zap_add(spa->spa_meta_objset,
7591 7606 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
7592 7607 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
7593 7608 }
7594 7609 }
7595 7610
7596 7611 /*
7597 7612 * Set the top-level vdev's max queue depth. Evaluate each
7598 7613 * top-level's async write queue depth in case it changed.
7599 7614 * The max queue depth will not change in the middle of syncing
7600 7615 * out this txg.
7601 7616 */
7602 7617 uint64_t slots_per_allocator = 0;
7603 7618 for (int c = 0; c < rvd->vdev_children; c++) {
7604 7619 vdev_t *tvd = rvd->vdev_child[c];
7605 7620 metaslab_group_t *mg = tvd->vdev_mg;
7606 7621
7607 7622 if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
7608 7623 !metaslab_group_initialized(mg))
7609 7624 continue;
7610 7625
7611 7626 /*
7612 7627 * It is safe to do a lock-free check here because only async
7613 7628 * allocations look at mg_max_alloc_queue_depth, and async
7614 7629 * allocations all happen from spa_sync().
7615 7630 */
7616 7631 for (int i = 0; i < spa->spa_alloc_count; i++)
7617 7632 ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
7618 7633 mg->mg_max_alloc_queue_depth = max_queue_depth;
7619 7634
7620 7635 for (int i = 0; i < spa->spa_alloc_count; i++) {
7621 7636 mg->mg_cur_max_alloc_queue_depth[i] =
7622 7637 zfs_vdev_def_queue_depth;
7623 7638 }
7624 7639 slots_per_allocator += zfs_vdev_def_queue_depth;
7625 7640 }
7626 7641 metaslab_class_t *mc = spa_normal_class(spa);
7627 7642 for (int i = 0; i < spa->spa_alloc_count; i++) {
7628 7643 ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
7629 7644 mc->mc_alloc_max_slots[i] = slots_per_allocator;
7630 7645 }
7631 7646 mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
7632 7647
7633 7648 for (int c = 0; c < rvd->vdev_children; c++) {
7634 7649 vdev_t *vd = rvd->vdev_child[c];
7635 7650 vdev_indirect_state_sync_verify(vd);
7636 7651
7637 7652 if (vdev_indirect_should_condense(vd)) {
7638 7653 spa_condense_indirect_start_sync(vd, tx);
7639 7654 break;
7640 7655 }
7641 7656 }
7642 7657
7643 7658 /*
7644 7659 * Iterate to convergence.
7645 7660 */
7646 7661 do {
7647 7662 int pass = ++spa->spa_sync_pass;
7648 7663
7649 7664 spa_sync_config_object(spa, tx);
7650 7665 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
7651 7666 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
7652 7667 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
7653 7668 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
7654 7669 spa_errlog_sync(spa, txg);
7655 7670 dsl_pool_sync(dp, txg);
7656 7671
7657 7672 if (pass < zfs_sync_pass_deferred_free) {
7658 7673 spa_sync_frees(spa, free_bpl, tx);
7659 7674 } else {
7660 7675 /*
7661 7676 * We can not defer frees in pass 1, because
7662 7677 * we sync the deferred frees later in pass 1.
7663 7678 */
7664 7679 ASSERT3U(pass, >, 1);
7665 7680 bplist_iterate(free_bpl, bpobj_enqueue_cb,
7666 7681 &spa->spa_deferred_bpobj, tx);
7667 7682 }
7668 7683
7669 7684 ddt_sync(spa, txg);
7670 7685 dsl_scan_sync(dp, tx);
7671 7686
7672 7687 if (spa->spa_vdev_removal != NULL)
7673 7688 svr_sync(spa, tx);
7674 7689
7675 7690 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
7676 7691 != NULL)
7677 7692 vdev_sync(vd, txg);
7678 7693
7679 7694 if (pass == 1) {
7680 7695 spa_sync_upgrades(spa, tx);
7681 7696 ASSERT3U(txg, >=,
7682 7697 spa->spa_uberblock.ub_rootbp.blk_birth);
7683 7698 /*
7684 7699 * Note: We need to check if the MOS is dirty
7685 7700 * because we could have marked the MOS dirty
7686 7701 * without updating the uberblock (e.g. if we
7687 7702 * have sync tasks but no dirty user data). We
7688 7703 * need to check the uberblock's rootbp because
7689 7704 * it is updated if we have synced out dirty
7690 7705 * data (though in this case the MOS will most
7691 7706 * likely also be dirty due to second order
7692 7707 * effects, we don't want to rely on that here).
7693 7708 */
7694 7709 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
7695 7710 !dmu_objset_is_dirty(mos, txg)) {
7696 7711 /*
7697 7712 * Nothing changed on the first pass,
7698 7713 * therefore this TXG is a no-op. Avoid
7699 7714 * syncing deferred frees, so that we
7700 7715 * can keep this TXG as a no-op.
7701 7716 */
7702 7717 ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
7703 7718 txg));
7704 7719 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7705 7720 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
7706 7721 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
7707 7722 txg));
7708 7723 break;
7709 7724 }
7710 7725 spa_sync_deferred_frees(spa, tx);
7711 7726 }
7712 7727
7713 7728 } while (dmu_objset_is_dirty(mos, txg));
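	/*
	 * Editor's note: "convergence" here means that a sync pass can itself
	 * dirty more MOS state (for example, space map updates caused by the
	 * frees it just issued), so passes repeat until a pass completes with
	 * the MOS no longer dirty in this txg.
	 */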
7714 7729
7715 7730 if (!list_is_empty(&spa->spa_config_dirty_list)) {
7716 7731 /*
7717 7732 * Make sure that the number of ZAPs for all the vdevs matches
7718 7733 * the number of ZAPs in the per-vdev ZAP list. This only gets
7719 7734 * called if the config is dirty; otherwise there may be
7720 7735 * outstanding AVZ operations that weren't completed in
7721 7736 * spa_sync_config_object.
7722 7737 */
7723 7738 uint64_t all_vdev_zap_entry_count;
7724 7739 ASSERT0(zap_count(spa->spa_meta_objset,
7725 7740 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
7726 7741 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
7727 7742 all_vdev_zap_entry_count);
7728 7743 }
7729 7744
7730 7745 if (spa->spa_vdev_removal != NULL) {
7731 7746 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
7732 7747 }
7733 7748
7734 7749 /*
7735 7750 * Rewrite the vdev configuration (which includes the uberblock)
7736 7751 * to commit the transaction group.
7737 7752 *
7738 7753 * If there are no dirty vdevs, we sync the uberblock to a few
7739 7754 * random top-level vdevs that are known to be visible in the
7740 7755 * config cache (see spa_vdev_add() for a complete description).
7741 7756 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
7742 7757 */
7743 7758 for (;;) {
7744 7759 /*
7745 7760 * We hold SCL_STATE to prevent vdev open/close/etc.
7746 7761 * while we're attempting to write the vdev labels.
7747 7762 */
7748 7763 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7749 7764
7750 7765 if (list_is_empty(&spa->spa_config_dirty_list)) {
7751 7766 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
7752 7767 int svdcount = 0;
7753 7768 int children = rvd->vdev_children;
7754 7769 int c0 = spa_get_random(children);
7755 7770
7756 7771 for (int c = 0; c < children; c++) {
7757 7772 vd = rvd->vdev_child[(c0 + c) % children];
7758 7773
7759 7774 /* Stop when revisiting the first vdev */
7760 7775 if (c > 0 && svd[0] == vd)
7761 7776 break;
7762 7777
7763 7778 if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
7764 7779 !vdev_is_concrete(vd))
7765 7780 continue;
7766 7781
7767 7782 svd[svdcount++] = vd;
7768 7783 if (svdcount == SPA_SYNC_MIN_VDEVS)
7769 7784 break;
7770 7785 }
7771 7786 error = vdev_config_sync(svd, svdcount, txg);
7772 7787 } else {
7773 7788 error = vdev_config_sync(rvd->vdev_child,
7774 7789 rvd->vdev_children, txg);
7775 7790 }
7776 7791
7777 7792 if (error == 0)
7778 7793 spa->spa_last_synced_guid = rvd->vdev_guid;
7779 7794
7780 7795 spa_config_exit(spa, SCL_STATE, FTAG);
7781 7796
7782 7797 if (error == 0)
7783 7798 break;
7784 7799 zio_suspend(spa, NULL);
7785 7800 zio_resume_wait(spa);
7786 7801 }
7787 7802 dmu_tx_commit(tx);
7788 7803
7789 7804 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
7790 7805
7791 7806 /*
7792 7807 * Clear the dirty config list.
7793 7808 */
7794 7809 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
7795 7810 vdev_config_clean(vd);
7796 7811
7797 7812 /*
7798 7813 * Now that the new config has synced transactionally,
7799 7814 * let it become visible to the config cache.
7800 7815 */
7801 7816 if (spa->spa_config_syncing != NULL) {
7802 7817 spa_config_set(spa, spa->spa_config_syncing);
7803 7818 spa->spa_config_txg = txg;
7804 7819 spa->spa_config_syncing = NULL;
7805 7820 }
7806 7821
7807 7822 dsl_pool_sync_done(dp, txg);
7808 7823
7809 7824 for (int i = 0; i < spa->spa_alloc_count; i++) {
7810 7825 mutex_enter(&spa->spa_alloc_locks[i]);
7811 7826 VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
7812 7827 mutex_exit(&spa->spa_alloc_locks[i]);
7813 7828 }
7814 7829
7815 7830 /*
7816 7831 * Update usable space statistics.
7817 7832 */
7818 7833 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
7819 7834 != NULL)
7820 7835 vdev_sync_done(vd, txg);
7821 7836
7822 7837 spa_update_dspace(spa);
7823 7838
7824 7839 /*
7825 7840 * It had better be the case that we didn't dirty anything
7826 7841 * since vdev_config_sync().
7827 7842 */
7828 7843 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
7829 7844 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7830 7845 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
7831 7846
7832 7847 while (zfs_pause_spa_sync)
7833 7848 delay(1);
7834 7849
7835 7850 spa->spa_sync_pass = 0;
7836 7851
7837 7852 /*
7838 7853 * Update the last synced uberblock here. We want to do this at
7839 7854 * the end of spa_sync() so that consumers of spa_last_synced_txg()
7840 7855 * will be guaranteed that all the processing associated with
7841 7856 * that txg has been completed.
7842 7857 */
7843 7858 spa->spa_ubsync = spa->spa_uberblock;
7844 7859 spa_config_exit(spa, SCL_CONFIG, FTAG);
7845 7860
7846 7861 spa_handle_ignored_writes(spa);
7847 7862
7848 7863 /*
7849 7864 * If any async tasks have been requested, kick them off.
7850 7865 */
7851 7866 spa_async_dispatch(spa);
7852 7867 }
7853 7868
7854 7869 /*
7855 7870 * Sync all pools. We don't want to hold the namespace lock across these
7856 7871 * operations, so we take a reference on the spa_t and drop the lock during the
7857 7872 * sync.
7858 7873 */
7859 7874 void
7860 7875 spa_sync_allpools(void)
7861 7876 {
7862 7877 spa_t *spa = NULL;
7863 7878 mutex_enter(&spa_namespace_lock);
7864 7879 while ((spa = spa_next(spa)) != NULL) {
7865 7880 if (spa_state(spa) != POOL_STATE_ACTIVE ||
7866 7881 !spa_writeable(spa) || spa_suspended(spa))
7867 7882 continue;
7868 7883 spa_open_ref(spa, FTAG);
7869 7884 mutex_exit(&spa_namespace_lock);
7870 7885 txg_wait_synced(spa_get_dsl(spa), 0);
7871 7886 mutex_enter(&spa_namespace_lock);
7872 7887 spa_close(spa, FTAG);
7873 7888 }
7874 7889 mutex_exit(&spa_namespace_lock);
7875 7890 }
7876 7891
7877 7892 /*
7878 7893 * ==========================================================================
7879 7894 * Miscellaneous routines
7880 7895 * ==========================================================================
7881 7896 */
7882 7897
7883 7898 /*
7884 7899 * Remove all pools in the system.
7885 7900 */
7886 7901 void
7887 7902 spa_evict_all(void)
7888 7903 {
7889 7904 spa_t *spa;
7890 7905
7891 7906 /*
7892 7907 * Remove all cached state. All pools should be closed now,
7893 7908 * so every spa in the AVL tree should be unreferenced.
7894 7909 */
7895 7910 mutex_enter(&spa_namespace_lock);
7896 7911 while ((spa = spa_next(NULL)) != NULL) {
7897 7912 /*
7898 7913 * Stop async tasks. The async thread may need to detach
7899 7914 * a device that's been replaced, which requires grabbing
7900 7915 * spa_namespace_lock, so we must drop it here.
7901 7916 */
7902 7917 spa_open_ref(spa, FTAG);
7903 7918 mutex_exit(&spa_namespace_lock);
7904 7919 spa_async_suspend(spa);
7905 7920 mutex_enter(&spa_namespace_lock);
7906 7921 spa_close(spa, FTAG);
7907 7922
7908 7923 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
7909 7924 spa_unload(spa);
7910 7925 spa_deactivate(spa);
7911 7926 }
7912 7927 spa_remove(spa);
7913 7928 }
7914 7929 mutex_exit(&spa_namespace_lock);
7915 7930 }
7916 7931
7917 7932 vdev_t *
7918 7933 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
7919 7934 {
7920 7935 vdev_t *vd;
7921 7936 int i;
7922 7937
7923 7938 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
7924 7939 return (vd);
7925 7940
7926 7941 if (aux) {
7927 7942 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
7928 7943 vd = spa->spa_l2cache.sav_vdevs[i];
7929 7944 if (vd->vdev_guid == guid)
7930 7945 return (vd);
7931 7946 }
7932 7947
7933 7948 for (i = 0; i < spa->spa_spares.sav_count; i++) {
7934 7949 vd = spa->spa_spares.sav_vdevs[i];
7935 7950 if (vd->vdev_guid == guid)
7936 7951 return (vd);
7937 7952 }
7938 7953 }
7939 7954
7940 7955 return (NULL);
7941 7956 }
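A hypothetical caller of spa_lookup_by_guid(), shown with aux == B_TRUE so that l2cache and spare devices are also searched. The lock level here is illustrative only; real callers hold whatever spa_config locks their context already requires.

static boolean_t
example_vdev_exists(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	/* Hold a config lock as reader so the vdev topology stays stable. */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
	spa_config_exit(spa, SCL_STATE, FTAG);

	return (vd != NULL);
}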
7942 7957
7943 7958 void
7944 7959 spa_upgrade(spa_t *spa, uint64_t version)
7945 7960 {
7946 7961 ASSERT(spa_writeable(spa));
7947 7962
7948 7963 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7949 7964
7950 7965 /*
7951 7966 * This should only be called for a non-faulted pool, and since a
7952 7967 * future version would result in an unopenable pool, this shouldn't be
7953 7968 * possible.
7954 7969 */
7955 7970 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
7956 7971 ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
7957 7972
7958 7973 spa->spa_uberblock.ub_version = version;
7959 7974 vdev_config_dirty(spa->spa_root_vdev);
7960 7975
7961 7976 spa_config_exit(spa, SCL_ALL, FTAG);
7962 7977
7963 7978 txg_wait_synced(spa_get_dsl(spa), 0);
7964 7979 }
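A hypothetical caller of spa_upgrade(): bump a writable pool to the newest supported on-disk version. spa_upgrade() itself dirties the root vdev config and waits for the change to sync, so the caller needs nothing further (example_upgrade_to_latest() is illustrative):

static void
example_upgrade_to_latest(spa_t *spa)
{
	/* spa_upgrade() asserts writability, so check it here first. */
	if (spa_writeable(spa) && spa_version(spa) < SPA_VERSION)
		spa_upgrade(spa, SPA_VERSION);
}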
7965 7980
7966 7981 boolean_t
7967 7982 spa_has_spare(spa_t *spa, uint64_t guid)
7968 7983 {
7969 7984 int i;
7970 7985 uint64_t spareguid;
7971 7986 spa_aux_vdev_t *sav = &spa->spa_spares;
7972 7987
7973 7988 for (i = 0; i < sav->sav_count; i++)
7974 7989 if (sav->sav_vdevs[i]->vdev_guid == guid)
7975 7990 return (B_TRUE);
7976 7991
7977 7992 for (i = 0; i < sav->sav_npending; i++) {
7978 7993 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
7979 7994 &spareguid) == 0 && spareguid == guid)
7980 7995 return (B_TRUE);
7981 7996 }
7982 7997
7983 7998 return (B_FALSE);
7984 7999 }
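spa_has_spare() covers both configured and pending spares, which makes it suitable for validation paths. A hypothetical check that refuses to operate on a device already claimed as a hot spare (example_check_not_spare() is illustrative):

static int
example_check_not_spare(spa_t *spa, uint64_t guid)
{
	/* Refuse devices that are already configured or pending as spares. */
	if (spa_has_spare(spa, guid))
		return (SET_ERROR(EBUSY));
	return (0);
}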
7985 8000
7986 8001 /*
7987 8002 * Check if a pool has an active shared spare device.
7988 8003 * Note: reference count of an active spare is 2, as a spare and as a replace
7989 8004  * Note: the reference count of an active spare is 2: once as a spare and once as a replacement.
7990 8005 static boolean_t
7991 8006 spa_has_active_shared_spare(spa_t *spa)
7992 8007 {
7993 8008 int i, refcnt;
7994 8009 uint64_t pool;
7995 8010 spa_aux_vdev_t *sav = &spa->spa_spares;
7996 8011
7997 8012 for (i = 0; i < sav->sav_count; i++) {
7998 8013 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
7999 8014 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
8000 8015 refcnt > 2)
8001 8016 return (B_TRUE);
8002 8017 }
8003 8018
8004 8019 return (B_FALSE);
8005 8020 }
8006 8021
8007 8022 sysevent_t *
8008 8023 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8009 8024 {
8010 8025 sysevent_t *ev = NULL;
8011 8026 #ifdef _KERNEL
8012 8027 sysevent_attr_list_t *attr = NULL;
8013 8028 sysevent_value_t value;
8014 8029
8015 8030 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
8016 8031 SE_SLEEP);
8017 8032 ASSERT(ev != NULL);
8018 8033
8019 8034 value.value_type = SE_DATA_TYPE_STRING;
8020 8035 value.value.sv_string = spa_name(spa);
8021 8036 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
8022 8037 goto done;
8023 8038
8024 8039 value.value_type = SE_DATA_TYPE_UINT64;
8025 8040 value.value.sv_uint64 = spa_guid(spa);
8026 8041 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
8027 8042 goto done;
8028 8043
8029 8044 if (vd) {
8030 8045 value.value_type = SE_DATA_TYPE_UINT64;
8031 8046 value.value.sv_uint64 = vd->vdev_guid;
8032 8047 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
8033 8048 SE_SLEEP) != 0)
8034 8049 goto done;
8035 8050
8036 8051 if (vd->vdev_path) {
8037 8052 value.value_type = SE_DATA_TYPE_STRING;
8038 8053 value.value.sv_string = vd->vdev_path;
8039 8054 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
8040 8055 &value, SE_SLEEP) != 0)
8041 8056 goto done;
8042 8057 }
8043 8058 }
8044 8059
8045 8060 if (hist_nvl != NULL) {
8046 8061 fnvlist_merge((nvlist_t *)attr, hist_nvl);
8047 8062 }
8048 8063
8049 8064 if (sysevent_attach_attributes(ev, attr) != 0)
8050 8065 goto done;
8051 8066 attr = NULL;
8052 8067
8053 8068 done:
8054 8069 if (attr)
8055 8070 sysevent_free_attr(attr);
8056 8071
8057 8072 #endif
8058 8073 return (ev);
8059 8074 }
8060 8075
8061 8076 void
8062 8077 spa_event_post(sysevent_t *ev)
8063 8078 {
8064 8079 #ifdef _KERNEL
8065 8080 sysevent_id_t eid;
8066 8081
8067 8082 (void) log_sysevent(ev, SE_SLEEP, &eid);
8068 8083 sysevent_free(ev);
8069 8084 #endif
8070 8085 }
8071 8086
8072 8087 void
8073 8088 spa_event_discard(sysevent_t *ev)
8074 8089 {
8075 8090 #ifdef _KERNEL
8076 8091 sysevent_free(ev);
8077 8092 #endif
8078 8093 }
8079 8094
8080 8095 /*
8081 8096 * Post a sysevent corresponding to the given event. The 'name' must be one of
8082 8097 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
8083 8098 * filled in from the spa and (optionally) the vdev and history nvl. This
8084 8099 * doesn't do anything in the userland libzpool, as we don't want consumers to
8085 8100 * misinterpret ztest or zdb as real changes.
8086 8101 */
8087 8102 void
8088 8103 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8089 8104 {
8090 8105 spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
8091 8106 }
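A hypothetical sketch of the two notification styles built from the routines above: the one-shot spa_event_notify() for the common case, and the spa_event_create()/spa_event_post() pair for when the event must be built while locks are held and posted later. ESC_ZFS_CONFIG_SYNC and ESC_ZFS_VDEV_REMOVE are existing event names from sys/sysevent/eventdefs.h; example_notify() itself is illustrative.

static void
example_notify(spa_t *spa, vdev_t *vd)
{
	sysevent_t *ev;

	/* Immediate: create and post the sysevent in one call. */
	spa_event_notify(spa, NULL, NULL, ESC_ZFS_CONFIG_SYNC);

	/* Deferred: create now, post (or spa_event_discard()) later. */
	ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
	/* ... drop locks, finish the vdev removal ... */
	spa_event_post(ev);
}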
(3471 lines elided)