1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2011 by Delphix. All rights reserved.
26 */
27
28 /*
29 * This file contains all the routines used when modifying on-disk SPA state.
30 * This includes opening, importing, destroying, exporting a pool, and syncing a
31 * pool.
32 */
33
34 #include <sys/zfs_context.h>
35 #include <sys/fm/fs/zfs.h>
36 #include <sys/spa_impl.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/dmu.h>
40 #include <sys/dmu_tx.h>
41 #include <sys/zap.h>
42 #include <sys/zil.h>
43 #include <sys/ddt.h>
44 #include <sys/vdev_impl.h>
45 #include <sys/metaslab.h>
46 #include <sys/metaslab_impl.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/spa_boot.h>
63 #include <sys/zfs_ioctl.h>
64 #include <sys/dsl_scan.h>
65
66 #ifdef _KERNEL
67 #include <sys/bootprops.h>
68 #include <sys/callb.h>
69 #include <sys/cpupart.h>
70 #include <sys/pool.h>
71 #include <sys/sysdc.h>
72 #include <sys/zone.h>
73 #endif /* _KERNEL */
74
75 #include "zfs_prop.h"
76 #include "zfs_comutil.h"
77
78 typedef enum zti_modes {
79 zti_mode_fixed, /* value is # of threads (min 1) */
80 zti_mode_online_percent, /* value is % of online CPUs */
81 zti_mode_batch, /* cpu-intensive; value is ignored */
82 zti_mode_null, /* don't create a taskq */
83 zti_nmodes
84 } zti_modes_t;
85
86 #define ZTI_FIX(n) { zti_mode_fixed, (n) }
87 #define ZTI_PCT(n) { zti_mode_online_percent, (n) }
88 #define ZTI_BATCH { zti_mode_batch, 0 }
89 #define ZTI_NULL { zti_mode_null, 0 }
90
91 #define ZTI_ONE ZTI_FIX(1)
92
93 typedef struct zio_taskq_info {
94 enum zti_modes zti_mode;
95 uint_t zti_value;
96 } zio_taskq_info_t;
97
98 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
99 "issue", "issue_high", "intr", "intr_high"
100 };
101
102 /*
103 * Define the taskq threads for the following I/O types:
104 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
105 */
106 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
107 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
108 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
109 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
110 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
111 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
112 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
113 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
114 };
115
116 static dsl_syncfunc_t spa_sync_props;
117 static boolean_t spa_has_active_shared_spare(spa_t *spa);
118 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
119 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
120 char **ereport);
121 static void spa_vdev_resilver_done(spa_t *spa);
122
123 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
124 id_t zio_taskq_psrset_bind = PS_NONE;
125 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
126 uint_t zio_taskq_basedc = 80; /* base duty cycle */
127
128 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
129
130 /*
131 * This (illegal) pool name is used when temporarily importing a spa_t in order
132 * to get the vdev stats associated with the imported devices.
133 */
134 #define TRYIMPORT_NAME "$import"
135
136 /*
137 * ==========================================================================
138 * SPA properties routines
139 * ==========================================================================
140 */
141
142 /*
143 * Add a (source=src, propname=propval) list to an nvlist.
144 */
145 static void
146 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
147 uint64_t intval, zprop_source_t src)
148 {
149 const char *propname = zpool_prop_to_name(prop);
150 nvlist_t *propval;
151
152 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
153 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
154
155 if (strval != NULL)
156 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
157 else
158 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
159
160 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
161 nvlist_free(propval);
162 }
163
164 /*
165 * Get property values from the spa configuration.
166 */
167 static void
168 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
169 {
170 uint64_t size;
171 uint64_t alloc;
172 uint64_t cap, version;
173 zprop_source_t src = ZPROP_SRC_NONE;
174 spa_config_dirent_t *dp;
175
176 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
177
178 if (spa->spa_root_vdev != NULL) {
179 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
180 size = metaslab_class_get_space(spa_normal_class(spa));
181 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
182 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
183 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
184 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
185 size - alloc, src);
186 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
187 (spa_mode(spa) == FREAD), src);
188
189 cap = (size == 0) ? 0 : (alloc * 100 / size);
190 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
191
192 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
193 ddt_get_pool_dedup_ratio(spa), src);
194
195 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
196 spa->spa_root_vdev->vdev_state, src);
197
198 version = spa_version(spa);
199 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
200 src = ZPROP_SRC_DEFAULT;
201 else
202 src = ZPROP_SRC_LOCAL;
203 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
204 }
205
206 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
207
208 if (spa->spa_comment != NULL) {
209 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
210 0, ZPROP_SRC_LOCAL);
211 }
212
213 if (spa->spa_root != NULL)
214 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
215 0, ZPROP_SRC_LOCAL);
216
217 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
218 if (dp->scd_path == NULL) {
219 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
220 "none", 0, ZPROP_SRC_LOCAL);
221 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
222 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
223 dp->scd_path, 0, ZPROP_SRC_LOCAL);
224 }
225 }
226 }
227
228 /*
229 * Get zpool property values.
230 */
231 int
232 spa_prop_get(spa_t *spa, nvlist_t **nvp)
233 {
234 objset_t *mos = spa->spa_meta_objset;
235 zap_cursor_t zc;
236 zap_attribute_t za;
237 int err;
238
239 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
240
241 mutex_enter(&spa->spa_props_lock);
242
243 /*
244 * Get properties from the spa config.
245 */
246 spa_prop_get_config(spa, nvp);
247
248 /* If no pool property object, there are no more props to get. */
249 if (mos == NULL || spa->spa_pool_props_object == 0) {
250 mutex_exit(&spa->spa_props_lock);
251 return (0);
252 }
253
254 /*
255 * Get properties from the MOS pool property object.
256 */
257 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
258 (err = zap_cursor_retrieve(&zc, &za)) == 0;
259 zap_cursor_advance(&zc)) {
260 uint64_t intval = 0;
261 char *strval = NULL;
262 zprop_source_t src = ZPROP_SRC_DEFAULT;
263 zpool_prop_t prop;
264
265 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
266 continue;
267
268 switch (za.za_integer_length) {
269 case 8:
270 /* integer property */
271 if (za.za_first_integer !=
272 zpool_prop_default_numeric(prop))
273 src = ZPROP_SRC_LOCAL;
274
275 if (prop == ZPOOL_PROP_BOOTFS) {
276 dsl_pool_t *dp;
277 dsl_dataset_t *ds = NULL;
278
279 dp = spa_get_dsl(spa);
280 rw_enter(&dp->dp_config_rwlock, RW_READER);
281 if (err = dsl_dataset_hold_obj(dp,
282 za.za_first_integer, FTAG, &ds)) {
283 rw_exit(&dp->dp_config_rwlock);
284 break;
285 }
286
287 strval = kmem_alloc(
288 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
289 KM_SLEEP);
290 dsl_dataset_name(ds, strval);
291 dsl_dataset_rele(ds, FTAG);
292 rw_exit(&dp->dp_config_rwlock);
293 } else {
294 strval = NULL;
295 intval = za.za_first_integer;
296 }
297
298 spa_prop_add_list(*nvp, prop, strval, intval, src);
299
300 if (strval != NULL)
301 kmem_free(strval,
302 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
303
304 break;
305
306 case 1:
307 /* string property */
308 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
309 err = zap_lookup(mos, spa->spa_pool_props_object,
310 za.za_name, 1, za.za_num_integers, strval);
311 if (err) {
312 kmem_free(strval, za.za_num_integers);
313 break;
314 }
315 spa_prop_add_list(*nvp, prop, strval, 0, src);
316 kmem_free(strval, za.za_num_integers);
317 break;
318
319 default:
320 break;
321 }
322 }
323 zap_cursor_fini(&zc);
324 mutex_exit(&spa->spa_props_lock);
325 out:
326 if (err && err != ENOENT) {
327 nvlist_free(*nvp);
328 *nvp = NULL;
329 return (err);
330 }
331
332 return (0);
333 }
334
335 /*
336 * Validate the given pool properties nvlist and modify the list
337 * for the property values to be set.
338 */
339 static int
340 spa_prop_validate(spa_t *spa, nvlist_t *props)
341 {
342 nvpair_t *elem;
343 int error = 0, reset_bootfs = 0;
344 uint64_t objnum;
345
346 elem = NULL;
347 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
348 zpool_prop_t prop;
349 char *propname, *strval;
350 uint64_t intval;
351 objset_t *os;
352 char *slash, *check;
353
354 propname = nvpair_name(elem);
355
356 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
357 return (EINVAL);
358
359 switch (prop) {
360 case ZPOOL_PROP_VERSION:
361 error = nvpair_value_uint64(elem, &intval);
362 if (!error &&
363 (intval < spa_version(spa) || intval > SPA_VERSION))
364 error = EINVAL;
365 break;
366
367 case ZPOOL_PROP_DELEGATION:
368 case ZPOOL_PROP_AUTOREPLACE:
369 case ZPOOL_PROP_LISTSNAPS:
370 case ZPOOL_PROP_AUTOEXPAND:
371 error = nvpair_value_uint64(elem, &intval);
372 if (!error && intval > 1)
373 error = EINVAL;
374 break;
375
376 case ZPOOL_PROP_BOOTFS:
377 /*
378 * If the pool version is less than SPA_VERSION_BOOTFS,
379 * or the pool is still being created (version == 0),
380 * the bootfs property cannot be set.
381 */
382 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
383 error = ENOTSUP;
384 break;
385 }
386
387 /*
388 * Make sure the vdev config is bootable
389 */
390 if (!vdev_is_bootable(spa->spa_root_vdev)) {
391 error = ENOTSUP;
392 break;
393 }
394
395 reset_bootfs = 1;
396
397 error = nvpair_value_string(elem, &strval);
398
399 if (!error) {
400 uint64_t compress;
401
402 if (strval == NULL || strval[0] == '\0') {
403 objnum = zpool_prop_default_numeric(
404 ZPOOL_PROP_BOOTFS);
405 break;
406 }
407
408 if (error = dmu_objset_hold(strval, FTAG, &os))
409 break;
410
411 /* Must be ZPL and not gzip compressed. */
412
413 if (dmu_objset_type(os) != DMU_OST_ZFS) {
414 error = ENOTSUP;
415 } else if ((error = dsl_prop_get_integer(strval,
416 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
417 &compress, NULL)) == 0 &&
418 !BOOTFS_COMPRESS_VALID(compress)) {
419 error = ENOTSUP;
420 } else {
421 objnum = dmu_objset_id(os);
422 }
423 dmu_objset_rele(os, FTAG);
424 }
425 break;
426
427 case ZPOOL_PROP_FAILUREMODE:
428 error = nvpair_value_uint64(elem, &intval);
429 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
430 intval > ZIO_FAILURE_MODE_PANIC))
431 error = EINVAL;
432
433 /*
434 * This is a special case which only occurs when
435 * the pool has completely failed. This allows
436 * the user to change the in-core failmode property
437 * without syncing it out to disk (I/Os might
438 * currently be blocked). We do this by returning
439 * EIO to the caller (spa_prop_set) to trick it
440 * into thinking we encountered a property validation
441 * error.
442 */
443 if (!error && spa_suspended(spa)) {
444 spa->spa_failmode = intval;
445 error = EIO;
446 }
447 break;
448
449 case ZPOOL_PROP_CACHEFILE:
450 if ((error = nvpair_value_string(elem, &strval)) != 0)
451 break;
452
453 if (strval[0] == '\0')
454 break;
455
456 if (strcmp(strval, "none") == 0)
457 break;
458
459 if (strval[0] != '/') {
460 error = EINVAL;
461 break;
462 }
463
464 slash = strrchr(strval, '/');
465 ASSERT(slash != NULL);
466
467 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
468 strcmp(slash, "/..") == 0)
469 error = EINVAL;
470 break;
471
472 case ZPOOL_PROP_COMMENT:
473 if ((error = nvpair_value_string(elem, &strval)) != 0)
474 break;
475 for (check = strval; *check != '\0'; check++) {
476 /*
477 * The kernel doesn't have an easy isprint()
478 * check. For this kernel check, we merely
479 * check ASCII apart from DEL. Fix this if
480 * there is an easy-to-use kernel isprint().
481 */
482 if (*check >= 0x7f) {
483 error = EINVAL;
484 break;
485 }
487 }
488 if (strlen(strval) > ZPROP_MAX_COMMENT)
489 error = E2BIG;
490 break;
491
492 case ZPOOL_PROP_DEDUPDITTO:
493 if (spa_version(spa) < SPA_VERSION_DEDUP)
494 error = ENOTSUP;
495 else
496 error = nvpair_value_uint64(elem, &intval);
497 if (error == 0 &&
498 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
499 error = EINVAL;
500 break;
501 }
502
503 if (error)
504 break;
505 }
506
507 if (!error && reset_bootfs) {
508 error = nvlist_remove(props,
509 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
510
511 if (!error) {
512 error = nvlist_add_uint64(props,
513 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
514 }
515 }
516
517 return (error);
518 }
519
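/*
 * Set the pool's cachefile path from the given property nvlist.  An empty
 * string selects the default spa_config_path, "none" disables the cache
 * file, and anything else is used verbatim.  If need_sync is set, an async
 * config update is requested so the change is written out.
 */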
520 void
521 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
522 {
523 char *cachefile;
524 spa_config_dirent_t *dp;
525
526 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
527 &cachefile) != 0)
528 return;
529
530 dp = kmem_alloc(sizeof (spa_config_dirent_t),
531 KM_SLEEP);
532
533 if (cachefile[0] == '\0')
534 dp->scd_path = spa_strdup(spa_config_path);
535 else if (strcmp(cachefile, "none") == 0)
536 dp->scd_path = NULL;
537 else
538 dp->scd_path = spa_strdup(cachefile);
539
540 list_insert_head(&spa->spa_config_list, dp);
541 if (need_sync)
542 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
543 }
544
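/*
 * Set zpool properties.  After validation, properties that only affect
 * in-core state (cachefile, altroot, readonly) are handled elsewhere; if
 * anything else is being set, dispatch a sync task to run spa_sync_props()
 * in syncing context.
 */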
545 int
546 spa_prop_set(spa_t *spa, nvlist_t *nvp)
547 {
548 int error;
549 nvpair_t *elem;
550 boolean_t need_sync = B_FALSE;
551 zpool_prop_t prop;
552
553 if ((error = spa_prop_validate(spa, nvp)) != 0)
554 return (error);
555
556 elem = NULL;
557 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
558 if ((prop = zpool_name_to_prop(
559 nvpair_name(elem))) == ZPROP_INVAL)
560 return (EINVAL);
561
562 if (prop == ZPOOL_PROP_CACHEFILE ||
563 prop == ZPOOL_PROP_ALTROOT ||
564 prop == ZPOOL_PROP_READONLY)
565 continue;
566
567 need_sync = B_TRUE;
568 break;
569 }
570
571 if (need_sync)
572 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
573 spa, nvp, 3));
574 else
575 return (0);
576 }
577
578 /*
579 * If the bootfs property value is dsobj, clear it.
580 */
581 void
582 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
583 {
584 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
585 VERIFY(zap_remove(spa->spa_meta_objset,
586 spa->spa_pool_props_object,
587 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
588 spa->spa_bootfs = 0;
589 }
590 }
591
592 /*
593 * Change the GUID for the pool. This is done so that we can later
594 * re-import a pool built from a clone of our own vdevs. We will modify
595 * the root vdev's guid, our own pool guid, and then mark all of our
596 * vdevs dirty. Note that we must make sure that all our vdevs are
597 * online when we do this, or else any vdevs that weren't present
598 * would be orphaned from our pool. We are also going to issue a
599 * sysevent to update any watchers.
600 */
601 int
602 spa_change_guid(spa_t *spa)
603 {
604 uint64_t oldguid, newguid;
605 uint64_t txg;
606
607 if (!(spa_mode_global & FWRITE))
608 return (EROFS);
609
610 txg = spa_vdev_enter(spa);
611
612 if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
613 return (spa_vdev_exit(spa, NULL, txg, ENXIO));
614
615 oldguid = spa_guid(spa);
616 newguid = spa_generate_guid(NULL);
617 ASSERT3U(oldguid, !=, newguid);
618
619 spa->spa_root_vdev->vdev_guid = newguid;
620 spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
621
622 vdev_config_dirty(spa->spa_root_vdev);
623
624 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
625
626 return (spa_vdev_exit(spa, NULL, txg, 0));
627 }
628
629 /*
630 * ==========================================================================
631 * SPA state manipulation (open/create/destroy/import/export)
632 * ==========================================================================
633 */
634
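/*
 * Comparison routine for the error-list AVL trees; entries are ordered
 * by their zbookmark contents.
 */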
635 static int
636 spa_error_entry_compare(const void *a, const void *b)
637 {
638 spa_error_entry_t *sa = (spa_error_entry_t *)a;
639 spa_error_entry_t *sb = (spa_error_entry_t *)b;
640 int ret;
641
642 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
643 sizeof (zbookmark_t));
644
645 if (ret < 0)
646 return (-1);
647 else if (ret > 0)
648 return (1);
649 else
650 return (0);
651 }
652
653 /*
654 * Utility function which retrieves copies of the current error lists and
655 * re-initializes them in the process.
656 */
657 void
658 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
659 {
660 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
661
662 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
663 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
664
665 avl_create(&spa->spa_errlist_scrub,
666 spa_error_entry_compare, sizeof (spa_error_entry_t),
667 offsetof(spa_error_entry_t, se_avl));
668 avl_create(&spa->spa_errlist_last,
669 spa_error_entry_compare, sizeof (spa_error_entry_t),
670 offsetof(spa_error_entry_t, se_avl));
671 }
672
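/*
 * Create a single zio taskq according to the given zti mode: a fixed
 * number of threads, a percentage of online CPUs, or a batch (CPU
 * percentage) queue.  When SDC scheduling is enabled and the pool has
 * its own process, the taskq is created in that process under SDC.
 */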
673 static taskq_t *
674 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
675 uint_t value)
676 {
677 uint_t flags = 0;
678 boolean_t batch = B_FALSE;
679
680 switch (mode) {
681 case zti_mode_null:
682 return (NULL); /* no taskq needed */
683
684 case zti_mode_fixed:
685 ASSERT3U(value, >=, 1);
686 value = MAX(value, 1);
687 break;
688
689 case zti_mode_batch:
690 batch = B_TRUE;
691 flags |= TASKQ_THREADS_CPU_PCT;
692 value = zio_taskq_batch_pct;
693 break;
694
695 case zti_mode_online_percent:
696 flags |= TASKQ_THREADS_CPU_PCT;
697 break;
698
699 default:
700 panic("unrecognized mode for %s taskq (%u:%u) in "
701 "spa_activate()",
702 name, mode, value);
703 break;
704 }
705
706 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
707 if (batch)
708 flags |= TASKQ_DC_BATCH;
709
710 return (taskq_create_sysdc(name, value, 50, INT_MAX,
711 spa->spa_proc, zio_taskq_basedc, flags));
712 }
713 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
714 spa->spa_proc, flags));
715 }
716
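/*
 * Create all of the zio taskqs for this pool, one per (zio type,
 * taskq type) entry in the zio_taskqs table.
 */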
717 static void
718 spa_create_zio_taskqs(spa_t *spa)
719 {
720 for (int t = 0; t < ZIO_TYPES; t++) {
721 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
722 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
723 enum zti_modes mode = ztip->zti_mode;
724 uint_t value = ztip->zti_value;
725 char name[32];
726
727 (void) snprintf(name, sizeof (name),
728 "%s_%s", zio_type_name[t], zio_taskq_types[q]);
729
730 spa->spa_zio_taskq[t][q] =
731 spa_taskq_create(spa, name, mode, value);
732 }
733 }
734 }
735
736 #ifdef _KERNEL
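/*
 * Main function of the dedicated "zpool-<name>" process created for the
 * pool.  It binds itself to the requested processor set, optionally enters
 * the SDC scheduling class, creates the zio taskqs, and then sleeps until
 * spa_deactivate() asks it to exit.
 */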
737 static void
738 spa_thread(void *arg)
739 {
740 callb_cpr_t cprinfo;
741
742 spa_t *spa = arg;
743 user_t *pu = PTOU(curproc);
744
745 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
746 spa->spa_name);
747
748 ASSERT(curproc != &p0);
749 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
750 "zpool-%s", spa->spa_name);
751 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
752
753 /* bind this thread to the requested psrset */
754 if (zio_taskq_psrset_bind != PS_NONE) {
755 pool_lock();
756 mutex_enter(&cpu_lock);
757 mutex_enter(&pidlock);
758 mutex_enter(&curproc->p_lock);
759
760 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
761 0, NULL, NULL) == 0) {
762 curthread->t_bind_pset = zio_taskq_psrset_bind;
763 } else {
764 cmn_err(CE_WARN,
765 "Couldn't bind process for zfs pool \"%s\" to "
766 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
767 }
768
769 mutex_exit(&curproc->p_lock);
770 mutex_exit(&pidlock);
771 mutex_exit(&cpu_lock);
772 pool_unlock();
773 }
774
775 if (zio_taskq_sysdc) {
776 sysdc_thread_enter(curthread, 100, 0);
777 }
778
779 spa->spa_proc = curproc;
780 spa->spa_did = curthread->t_did;
781
782 spa_create_zio_taskqs(spa);
783
784 mutex_enter(&spa->spa_proc_lock);
785 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
786
787 spa->spa_proc_state = SPA_PROC_ACTIVE;
788 cv_broadcast(&spa->spa_proc_cv);
789
790 CALLB_CPR_SAFE_BEGIN(&cprinfo);
791 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
792 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
793 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
794
795 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
796 spa->spa_proc_state = SPA_PROC_GONE;
797 spa->spa_proc = &p0;
798 cv_broadcast(&spa->spa_proc_cv);
799 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
800
801 mutex_enter(&curproc->p_lock);
802 lwp_exit();
803 }
804 #endif
805
806 /*
807 * Activate an uninitialized pool.
808 */
809 static void
810 spa_activate(spa_t *spa, int mode)
811 {
812 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
813
814 spa->spa_state = POOL_STATE_ACTIVE;
815 spa->spa_mode = mode;
816
817 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
818 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
819
820 /* Try to create a covering process */
821 mutex_enter(&spa->spa_proc_lock);
822 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
823 ASSERT(spa->spa_proc == &p0);
824 spa->spa_did = 0;
825
826 /* Only create a process if we're going to be around a while. */
827 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
828 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
829 NULL, 0) == 0) {
830 spa->spa_proc_state = SPA_PROC_CREATED;
831 while (spa->spa_proc_state == SPA_PROC_CREATED) {
832 cv_wait(&spa->spa_proc_cv,
833 &spa->spa_proc_lock);
834 }
835 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
836 ASSERT(spa->spa_proc != &p0);
837 ASSERT(spa->spa_did != 0);
838 } else {
839 #ifdef _KERNEL
840 cmn_err(CE_WARN,
841 "Couldn't create process for zfs pool \"%s\"\n",
842 spa->spa_name);
843 #endif
844 }
845 }
846 mutex_exit(&spa->spa_proc_lock);
847
848 /* If we didn't create a process, we need to create our taskqs. */
849 if (spa->spa_proc == &p0) {
850 spa_create_zio_taskqs(spa);
851 }
852
853 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
854 offsetof(vdev_t, vdev_config_dirty_node));
855 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
856 offsetof(vdev_t, vdev_state_dirty_node));
857
858 txg_list_create(&spa->spa_vdev_txg_list,
859 offsetof(struct vdev, vdev_txg_node));
860
861 avl_create(&spa->spa_errlist_scrub,
862 spa_error_entry_compare, sizeof (spa_error_entry_t),
863 offsetof(spa_error_entry_t, se_avl));
864 avl_create(&spa->spa_errlist_last,
865 spa_error_entry_compare, sizeof (spa_error_entry_t),
866 offsetof(spa_error_entry_t, se_avl));
867 }
868
869 /*
870 * Opposite of spa_activate().
871 */
872 static void
873 spa_deactivate(spa_t *spa)
874 {
875 ASSERT(spa->spa_sync_on == B_FALSE);
876 ASSERT(spa->spa_dsl_pool == NULL);
877 ASSERT(spa->spa_root_vdev == NULL);
878 ASSERT(spa->spa_async_zio_root == NULL);
879 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
880
881 txg_list_destroy(&spa->spa_vdev_txg_list);
882
883 list_destroy(&spa->spa_config_dirty_list);
884 list_destroy(&spa->spa_state_dirty_list);
885
886 for (int t = 0; t < ZIO_TYPES; t++) {
887 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
888 if (spa->spa_zio_taskq[t][q] != NULL)
889 taskq_destroy(spa->spa_zio_taskq[t][q]);
890 spa->spa_zio_taskq[t][q] = NULL;
891 }
892 }
893
894 metaslab_class_destroy(spa->spa_normal_class);
895 spa->spa_normal_class = NULL;
896
897 metaslab_class_destroy(spa->spa_log_class);
898 spa->spa_log_class = NULL;
899
900 /*
901 * If this was part of an import or the open otherwise failed, we may
902 * still have errors left in the queues. Empty them just in case.
903 */
904 spa_errlog_drain(spa);
905
906 avl_destroy(&spa->spa_errlist_scrub);
907 avl_destroy(&spa->spa_errlist_last);
908
909 spa->spa_state = POOL_STATE_UNINITIALIZED;
910
911 mutex_enter(&spa->spa_proc_lock);
912 if (spa->spa_proc_state != SPA_PROC_NONE) {
913 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
914 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
915 cv_broadcast(&spa->spa_proc_cv);
916 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
917 ASSERT(spa->spa_proc != &p0);
918 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
919 }
920 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
921 spa->spa_proc_state = SPA_PROC_NONE;
922 }
923 ASSERT(spa->spa_proc == &p0);
924 mutex_exit(&spa->spa_proc_lock);
925
926 /*
927 * We want to make sure spa_thread() has actually exited the ZFS
928 * module, so that the module can't be unloaded out from underneath
929 * it.
930 */
931 if (spa->spa_did != 0) {
932 thread_join(spa->spa_did);
933 spa->spa_did = 0;
934 }
935 }
936
937 /*
938 * Verify a pool configuration, and construct the vdev tree appropriately. This
939 * will create all the necessary vdevs in the appropriate layout, with each vdev
940 * in the CLOSED state. This will prep the pool before open/creation/import.
941 * All vdev validation is done by the vdev_alloc() routine.
942 */
943 static int
944 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
945 uint_t id, int atype)
946 {
947 nvlist_t **child;
948 uint_t children;
949 int error;
950
951 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
952 return (error);
953
954 if ((*vdp)->vdev_ops->vdev_op_leaf)
955 return (0);
956
957 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
958 &child, &children);
959
960 if (error == ENOENT)
961 return (0);
962
963 if (error) {
964 vdev_free(*vdp);
965 *vdp = NULL;
966 return (EINVAL);
967 }
968
969 for (int c = 0; c < children; c++) {
970 vdev_t *vd;
971 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
972 atype)) != 0) {
973 vdev_free(*vdp);
974 *vdp = NULL;
975 return (error);
976 }
977 }
978
979 ASSERT(*vdp != NULL);
980
981 return (0);
982 }
983
984 /*
985 * Opposite of spa_load().
986 */
987 static void
988 spa_unload(spa_t *spa)
989 {
990 int i;
991
992 ASSERT(MUTEX_HELD(&spa_namespace_lock));
993
994 /*
995 * Stop async tasks.
996 */
997 spa_async_suspend(spa);
998
999 /*
1000 * Stop syncing.
1001 */
1002 if (spa->spa_sync_on) {
1003 txg_sync_stop(spa->spa_dsl_pool);
1004 spa->spa_sync_on = B_FALSE;
1005 }
1006
1007 /*
1008 * Wait for any outstanding async I/O to complete.
1009 */
1010 if (spa->spa_async_zio_root != NULL) {
1011 (void) zio_wait(spa->spa_async_zio_root);
1012 spa->spa_async_zio_root = NULL;
1013 }
1014
1015 bpobj_close(&spa->spa_deferred_bpobj);
1016
1017 /*
1018 * Close the dsl pool.
1019 */
1020 if (spa->spa_dsl_pool) {
1021 dsl_pool_close(spa->spa_dsl_pool);
1022 spa->spa_dsl_pool = NULL;
1023 spa->spa_meta_objset = NULL;
1024 }
1025
1026 ddt_unload(spa);
1027
1028 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1029
1030 /*
1031 * Drop and purge level 2 cache
1032 */
1033 spa_l2cache_drop(spa);
1034
1035 /*
1036 * Close all vdevs.
1037 */
1038 if (spa->spa_root_vdev)
1039 vdev_free(spa->spa_root_vdev);
1040 ASSERT(spa->spa_root_vdev == NULL);
1041
1042 for (i = 0; i < spa->spa_spares.sav_count; i++)
1043 vdev_free(spa->spa_spares.sav_vdevs[i]);
1044 if (spa->spa_spares.sav_vdevs) {
1045 kmem_free(spa->spa_spares.sav_vdevs,
1046 spa->spa_spares.sav_count * sizeof (void *));
1047 spa->spa_spares.sav_vdevs = NULL;
1048 }
1049 if (spa->spa_spares.sav_config) {
1050 nvlist_free(spa->spa_spares.sav_config);
1051 spa->spa_spares.sav_config = NULL;
1052 }
1053 spa->spa_spares.sav_count = 0;
1054
1055 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
1056 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1057 if (spa->spa_l2cache.sav_vdevs) {
1058 kmem_free(spa->spa_l2cache.sav_vdevs,
1059 spa->spa_l2cache.sav_count * sizeof (void *));
1060 spa->spa_l2cache.sav_vdevs = NULL;
1061 }
1062 if (spa->spa_l2cache.sav_config) {
1063 nvlist_free(spa->spa_l2cache.sav_config);
1064 spa->spa_l2cache.sav_config = NULL;
1065 }
1066 spa->spa_l2cache.sav_count = 0;
1067
1068 spa->spa_async_suspended = 0;
1069
1070 if (spa->spa_comment != NULL) {
1071 spa_strfree(spa->spa_comment);
1072 spa->spa_comment = NULL;
1073 }
1074
1075 spa_config_exit(spa, SCL_ALL, FTAG);
1076 }
1077
1078 /*
1079 * Load (or re-load) the current list of vdevs describing the active spares for
1080 * this pool. When this is called, we have some form of basic information in
1081 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1082 * then re-generate a more complete list including status information.
1083 */
1084 static void
1085 spa_load_spares(spa_t *spa)
1086 {
1087 nvlist_t **spares;
1088 uint_t nspares;
1089 int i;
1090 vdev_t *vd, *tvd;
1091
1092 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1093
1094 /*
1095 * First, close and free any existing spare vdevs.
1096 */
1097 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1098 vd = spa->spa_spares.sav_vdevs[i];
1099
1100 /* Undo the call to spa_spare_add() below */
1101 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1102 B_FALSE)) != NULL && tvd->vdev_isspare)
1103 spa_spare_remove(tvd);
1104 vdev_close(vd);
1105 vdev_free(vd);
1106 }
1107
1108 if (spa->spa_spares.sav_vdevs)
1109 kmem_free(spa->spa_spares.sav_vdevs,
1110 spa->spa_spares.sav_count * sizeof (void *));
1111
1112 if (spa->spa_spares.sav_config == NULL)
1113 nspares = 0;
1114 else
1115 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1116 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1117
1118 spa->spa_spares.sav_count = (int)nspares;
1119 spa->spa_spares.sav_vdevs = NULL;
1120
1121 if (nspares == 0)
1122 return;
1123
1124 /*
1125 * Construct the array of vdevs, opening them to get status in the
1126 * process. For each spare, there are potentially two different vdev_t
1127 * structures associated with it: one in the list of spares (used only
1128 * for basic validation purposes) and one in the active vdev
1129 * configuration (if it's spared in). During this phase we open and
1130 * validate each vdev on the spare list. If the vdev also exists in the
1131 * active configuration, then we also mark this vdev as an active spare.
1132 */
1133 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1134 KM_SLEEP);
1135 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1136 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1137 VDEV_ALLOC_SPARE) == 0);
1138 ASSERT(vd != NULL);
1139
1140 spa->spa_spares.sav_vdevs[i] = vd;
1141
1142 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1143 B_FALSE)) != NULL) {
1144 if (!tvd->vdev_isspare)
1145 spa_spare_add(tvd);
1146
1147 /*
1148 * We only mark the spare active if we were successfully
1149 * able to load the vdev. Otherwise, importing a pool
1150 * with a bad active spare would result in strange
1151 * behavior, because multiple pools would think the spare
1152 * is actively in use.
1153 *
1154 * There is a vulnerability here to an equally bizarre
1155 * circumstance, where a dead active spare is later
1156 * brought back to life (onlined or otherwise). Given
1157 * the rarity of this scenario, and the extra complexity
1158 * it adds, we ignore the possibility.
1159 */
1160 if (!vdev_is_dead(tvd))
1161 spa_spare_activate(tvd);
1162 }
1163
1164 vd->vdev_top = vd;
1165 vd->vdev_aux = &spa->spa_spares;
1166
1167 if (vdev_open(vd) != 0)
1168 continue;
1169
1170 if (vdev_validate_aux(vd) == 0)
1171 spa_spare_add(vd);
1172 }
1173
1174 /*
1175 * Recompute the stashed list of spares, with status information
1176 * this time.
1177 */
1178 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1179 DATA_TYPE_NVLIST_ARRAY) == 0);
1180
1181 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1182 KM_SLEEP);
1183 for (i = 0; i < spa->spa_spares.sav_count; i++)
1184 spares[i] = vdev_config_generate(spa,
1185 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1186 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1187 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1188 for (i = 0; i < spa->spa_spares.sav_count; i++)
1189 nvlist_free(spares[i]);
1190 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1191 }
1192
1193 /*
1194 * Load (or re-load) the current list of vdevs describing the active l2cache for
1195 * this pool. When this is called, we have some form of basic information in
1196 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1197 * then re-generate a more complete list including status information.
1198 * Devices which are already active have their details maintained, and are
1199 * not re-opened.
1200 */
1201 static void
1202 spa_load_l2cache(spa_t *spa)
1203 {
1204 nvlist_t **l2cache;
1205 uint_t nl2cache;
1206 int i, j, oldnvdevs;
1207 uint64_t guid;
1208 vdev_t *vd, **oldvdevs, **newvdevs;
1209 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1210
1211 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1212
1213 if (sav->sav_config != NULL) {
1214 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1215 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1216 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1217 } else {
1218 nl2cache = 0;
1219 }
1220
1221 oldvdevs = sav->sav_vdevs;
1222 oldnvdevs = sav->sav_count;
1223 sav->sav_vdevs = NULL;
1224 sav->sav_count = 0;
1225
1226 /*
1227 * Process new nvlist of vdevs.
1228 */
1229 for (i = 0; i < nl2cache; i++) {
1230 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1231 &guid) == 0);
1232
1233 newvdevs[i] = NULL;
1234 for (j = 0; j < oldnvdevs; j++) {
1235 vd = oldvdevs[j];
1236 if (vd != NULL && guid == vd->vdev_guid) {
1237 /*
1238 * Retain previous vdev for add/remove ops.
1239 */
1240 newvdevs[i] = vd;
1241 oldvdevs[j] = NULL;
1242 break;
1243 }
1244 }
1245
1246 if (newvdevs[i] == NULL) {
1247 /*
1248 * Create new vdev
1249 */
1250 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1251 VDEV_ALLOC_L2CACHE) == 0);
1252 ASSERT(vd != NULL);
1253 newvdevs[i] = vd;
1254
1255 /*
1256 * Commit this vdev as an l2cache device,
1257 * even if it fails to open.
1258 */
1259 spa_l2cache_add(vd);
1260
1261 vd->vdev_top = vd;
1262 vd->vdev_aux = sav;
1263
1264 spa_l2cache_activate(vd);
1265
1266 if (vdev_open(vd) != 0)
1267 continue;
1268
1269 (void) vdev_validate_aux(vd);
1270
1271 if (!vdev_is_dead(vd))
1272 l2arc_add_vdev(spa, vd);
1273 }
1274 }
1275
1276 /*
1277 * Purge vdevs that were dropped
1278 */
1279 for (i = 0; i < oldnvdevs; i++) {
1280 uint64_t pool;
1281
1282 vd = oldvdevs[i];
1283 if (vd != NULL) {
1284 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1285 pool != 0ULL && l2arc_vdev_present(vd))
1286 l2arc_remove_vdev(vd);
1287 (void) vdev_close(vd);
1288 spa_l2cache_remove(vd);
1289 }
1290 }
1291
1292 if (oldvdevs)
1293 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1294
1295 if (sav->sav_config == NULL)
1296 goto out;
1297
1298 sav->sav_vdevs = newvdevs;
1299 sav->sav_count = (int)nl2cache;
1300
1301 /*
1302 * Recompute the stashed list of l2cache devices, with status
1303 * information this time.
1304 */
1305 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1306 DATA_TYPE_NVLIST_ARRAY) == 0);
1307
1308 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1309 for (i = 0; i < sav->sav_count; i++)
1310 l2cache[i] = vdev_config_generate(spa,
1311 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1312 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1313 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1314 out:
1315 for (i = 0; i < sav->sav_count; i++)
1316 nvlist_free(l2cache[i]);
1317 if (sav->sav_count)
1318 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1319 }
1320
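/*
 * Read the packed nvlist stored in MOS object 'obj' (whose size is kept
 * in the bonus buffer) and unpack it into *value.
 */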
1321 static int
1322 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1323 {
1324 dmu_buf_t *db;
1325 char *packed = NULL;
1326 size_t nvsize = 0;
1327 int error;
1328 *value = NULL;
1329
1330 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1331 nvsize = *(uint64_t *)db->db_data;
1332 dmu_buf_rele(db, FTAG);
1333
1334 packed = kmem_alloc(nvsize, KM_SLEEP);
1335 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1336 DMU_READ_PREFETCH);
1337 if (error == 0)
1338 error = nvlist_unpack(packed, nvsize, value, 0);
1339 kmem_free(packed, nvsize);
1340
1341 return (error);
1342 }
1343
1344 /*
1345 * Checks to see if the given vdev could not be opened, in which case we post a
1346 * sysevent to notify the autoreplace code that the device has been removed.
1347 */
1348 static void
1349 spa_check_removed(vdev_t *vd)
1350 {
1351 for (int c = 0; c < vd->vdev_children; c++)
1352 spa_check_removed(vd->vdev_child[c]);
1353
1354 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1355 zfs_post_autoreplace(vd->vdev_spa, vd);
1356 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1357 }
1358 }
1359
1360 /*
1361 * Validate the current config against the MOS config
1362 */
1363 static boolean_t
1364 spa_config_valid(spa_t *spa, nvlist_t *config)
1365 {
1366 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1367 nvlist_t *nv;
1368
1369 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1370
1371 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1372 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1373
1374 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1375
1376 /*
1377 * If we're doing a normal import, then build up any additional
1378 * diagnostic information about missing devices in this config.
1379 * We'll pass this up to the user for further processing.
1380 */
1381 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1382 nvlist_t **child, *nv;
1383 uint64_t idx = 0;
1384
1385 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1386 KM_SLEEP);
1387 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1388
1389 for (int c = 0; c < rvd->vdev_children; c++) {
1390 vdev_t *tvd = rvd->vdev_child[c];
1391 vdev_t *mtvd = mrvd->vdev_child[c];
1392
1393 if (tvd->vdev_ops == &vdev_missing_ops &&
1394 mtvd->vdev_ops != &vdev_missing_ops &&
1395 mtvd->vdev_islog)
1396 child[idx++] = vdev_config_generate(spa, mtvd,
1397 B_FALSE, 0);
1398 }
1399
1400 if (idx) {
1401 VERIFY(nvlist_add_nvlist_array(nv,
1402 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1403 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1404 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1405
1406 for (int i = 0; i < idx; i++)
1407 nvlist_free(child[i]);
1408 }
1409 nvlist_free(nv);
1410 kmem_free(child, rvd->vdev_children * sizeof (char **));
1411 }
1412
1413 /*
1414 * Compare the root vdev tree with the information we have
1415 * from the MOS config (mrvd). Check each top-level vdev
1416 * with the corresponding MOS config top-level (mtvd).
1417 */
1418 for (int c = 0; c < rvd->vdev_children; c++) {
1419 vdev_t *tvd = rvd->vdev_child[c];
1420 vdev_t *mtvd = mrvd->vdev_child[c];
1421
1422 /*
1423 * Resolve any "missing" vdevs in the current configuration.
1424 * If we find that the MOS config has more accurate information
1425 * about the top-level vdev then use that vdev instead.
1426 */
1427 if (tvd->vdev_ops == &vdev_missing_ops &&
1428 mtvd->vdev_ops != &vdev_missing_ops) {
1429
1430 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1431 continue;
1432
1433 /*
1434 * Device specific actions.
1435 */
1436 if (mtvd->vdev_islog) {
1437 spa_set_log_state(spa, SPA_LOG_CLEAR);
1438 } else {
1439 /*
1440 * XXX - once we have 'readonly' pool
1441 * support we should be able to handle
1442 * missing data devices by transitioning
1443 * the pool to readonly.
1444 */
1445 continue;
1446 }
1447
1448 /*
1449 * Swap the missing vdev with the data we were
1450 * able to obtain from the MOS config.
1451 */
1452 vdev_remove_child(rvd, tvd);
1453 vdev_remove_child(mrvd, mtvd);
1454
1455 vdev_add_child(rvd, mtvd);
1456 vdev_add_child(mrvd, tvd);
1457
1458 spa_config_exit(spa, SCL_ALL, FTAG);
1459 vdev_load(mtvd);
1460 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1461
1462 vdev_reopen(rvd);
1463 } else if (mtvd->vdev_islog) {
1464 /*
1465 * Load the slog device's state from the MOS config
1466 * since it's possible that the label does not
1467 * contain the most up-to-date information.
1468 */
1469 vdev_load_log_state(tvd, mtvd);
1470 vdev_reopen(tvd);
1471 }
1472 }
1473 vdev_free(mrvd);
1474 spa_config_exit(spa, SCL_ALL, FTAG);
1475
1476 /*
1477 * Ensure we were able to validate the config.
1478 */
1479 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1480 }
1481
1482 /*
1483 * Check for missing log devices
1484 */
1485 static int
1486 spa_check_logs(spa_t *spa)
1487 {
1488 switch (spa->spa_log_state) {
1489 case SPA_LOG_MISSING:
1490 /* need to recheck in case slog has been restored */
1491 case SPA_LOG_UNKNOWN:
1492 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1493 DS_FIND_CHILDREN)) {
1494 spa_set_log_state(spa, SPA_LOG_MISSING);
1495 return (1);
1496 }
1497 break;
1498 }
1499 return (0);
1500 }
1501
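/*
 * Passivate the metaslab groups of all log (slog) top-level vdevs so that
 * no new allocations go to them.  Returns B_TRUE if at least one slog
 * device was found.
 */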
1502 static boolean_t
1503 spa_passivate_log(spa_t *spa)
1504 {
1505 vdev_t *rvd = spa->spa_root_vdev;
1506 boolean_t slog_found = B_FALSE;
1507
1508 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1509
1510 if (!spa_has_slogs(spa))
1511 return (B_FALSE);
1512
1513 for (int c = 0; c < rvd->vdev_children; c++) {
1514 vdev_t *tvd = rvd->vdev_child[c];
1515 metaslab_group_t *mg = tvd->vdev_mg;
1516
1517 if (tvd->vdev_islog) {
1518 metaslab_group_passivate(mg);
1519 slog_found = B_TRUE;
1520 }
1521 }
1522
1523 return (slog_found);
1524 }
1525
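/*
 * Re-activate the metaslab groups of all log top-level vdevs; the
 * inverse of spa_passivate_log().
 */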
1526 static void
1527 spa_activate_log(spa_t *spa)
1528 {
1529 vdev_t *rvd = spa->spa_root_vdev;
1530
1531 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1532
1533 for (int c = 0; c < rvd->vdev_children; c++) {
1534 vdev_t *tvd = rvd->vdev_child[c];
1535 metaslab_group_t *mg = tvd->vdev_mg;
1536
1537 if (tvd->vdev_islog)
1538 metaslab_group_activate(mg);
1539 }
1540 }
1541
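/*
 * Offline the intent log for every dataset in the pool and, on success,
 * wait for the current txg to sync so the stubby ZIL blocks can be
 * removed by zil_sync().
 */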
1542 int
1543 spa_offline_log(spa_t *spa)
1544 {
1545 int error = 0;
1546
1547 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1548 NULL, DS_FIND_CHILDREN)) == 0) {
1549
1550 /*
1551 * We successfully offlined the log device, sync out the
1552 * current txg so that the "stubby" block can be removed
1553 * by zil_sync().
1554 */
1555 txg_wait_synced(spa->spa_dsl_pool, 0);
1556 }
1557 return (error);
1558 }
1559
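/*
 * Check each auxiliary (spare or l2cache) vdev for removal.
 */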
1560 static void
1561 spa_aux_check_removed(spa_aux_vdev_t *sav)
1562 {
1563 for (int i = 0; i < sav->sav_count; i++)
1564 spa_check_removed(sav->sav_vdevs[i]);
1565 }
1566
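/*
 * Called as each ZIL claim zio completes; records the highest block
 * birth txg seen in spa_claim_max_txg.
 */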
1567 void
1568 spa_claim_notify(zio_t *zio)
1569 {
1570 spa_t *spa = zio->io_spa;
1571
1572 if (zio->io_error)
1573 return;
1574
1575 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1576 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1577 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1578 mutex_exit(&spa->spa_props_lock);
1579 }
1580
1581 typedef struct spa_load_error {
1582 uint64_t sle_meta_count;
1583 uint64_t sle_data_count;
1584 } spa_load_error_t;
1585
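/*
 * Completion callback for spa_load_verify() reads; tallies metadata
 * and data errors separately.
 */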
1586 static void
1587 spa_load_verify_done(zio_t *zio)
1588 {
1589 blkptr_t *bp = zio->io_bp;
1590 spa_load_error_t *sle = zio->io_private;
1591 dmu_object_type_t type = BP_GET_TYPE(bp);
1592 int error = zio->io_error;
1593
1594 if (error) {
1595 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1596 type != DMU_OT_INTENT_LOG)
1597 atomic_add_64(&sle->sle_meta_count, 1);
1598 else
1599 atomic_add_64(&sle->sle_data_count, 1);
1600 }
1601 zio_data_buf_free(zio->io_data, zio->io_size);
1602 }
1603
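/*
 * Pool traversal callback for spa_load_verify(): issue a speculative,
 * scrub-style read of each block pointer and let spa_load_verify_done()
 * record any errors.
 */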
1604 /*ARGSUSED*/
1605 static int
1606 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1607 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1608 {
1609 if (bp != NULL) {
1610 zio_t *rio = arg;
1611 size_t size = BP_GET_PSIZE(bp);
1612 void *data = zio_data_buf_alloc(size);
1613
1614 zio_nowait(zio_read(rio, spa, bp, data, size,
1615 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1616 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1617 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1618 }
1619 return (0);
1620 }
1621
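/*
 * Verify the pool by traversing it from spa_verify_min_txg and reading
 * every block.  The rewind policy determines whether verification runs
 * at all and how many metadata/data errors are tolerated.
 */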
1622 static int
1623 spa_load_verify(spa_t *spa)
1624 {
1625 zio_t *rio;
1626 spa_load_error_t sle = { 0 };
1627 zpool_rewind_policy_t policy;
1628 boolean_t verify_ok = B_FALSE;
1629 int error;
1630
1631 zpool_get_rewind_policy(spa->spa_config, &policy);
1632
1633 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1634 return (0);
1635
1636 rio = zio_root(spa, NULL, &sle,
1637 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1638
1639 error = traverse_pool(spa, spa->spa_verify_min_txg,
1640 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1641
1642 (void) zio_wait(rio);
1643
1644 spa->spa_load_meta_errors = sle.sle_meta_count;
1645 spa->spa_load_data_errors = sle.sle_data_count;
1646
1647 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1648 sle.sle_data_count <= policy.zrp_maxdata) {
1649 int64_t loss = 0;
1650
1651 verify_ok = B_TRUE;
1652 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1653 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1654
1655 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1656 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1657 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1658 VERIFY(nvlist_add_int64(spa->spa_load_info,
1659 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1660 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1661 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1662 } else {
1663 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1664 }
1665
1666 if (error) {
1667 if (error != ENXIO && error != EIO)
1668 error = EIO;
1669 return (error);
1670 }
1671
1672 return (verify_ok ? 0 : EIO);
1673 }
1674
1675 /*
1676 * Find a value in the pool props object.
1677 */
1678 static void
1679 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1680 {
1681 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1682 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1683 }
1684
1685 /*
1686 * Find a value in the pool directory object.
1687 */
1688 static int
1689 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1690 {
1691 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1692 name, sizeof (uint64_t), 1, val));
1693 }
1694
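/*
 * Helper used during load: set the vdev's state to CANT_OPEN with
 * reason 'aux' and return 'err'.
 */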
1695 static int
1696 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1697 {
1698 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1699 return (err);
1700 }
1701
1702 /*
1703 * Fix up config after a partly-completed split. This is done with the
1704 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1705 * pool have that entry in their config, but only the splitting one contains
1706 * a list of all the guids of the vdevs that are being split off.
1707 *
1708 * This function determines what to do with that list: either rejoin
1709 * all the disks to the pool, or complete the splitting process. To attempt
1710 * the rejoin, each disk that is offlined is marked online again, and
1711 * we do a reopen() call. If the vdev label for every disk that was
1712 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1713 * then we call vdev_split() on each disk, and complete the split.
1714 *
1715 * Otherwise we leave the config alone, with all the vdevs in place in
1716 * the original pool.
1717 */
1718 static void
1719 spa_try_repair(spa_t *spa, nvlist_t *config)
1720 {
1721 uint_t extracted;
1722 uint64_t *glist;
1723 uint_t i, gcount;
1724 nvlist_t *nvl;
1725 vdev_t **vd;
1726 boolean_t attempt_reopen;
1727
1728 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1729 return;
1730
1731 /* check that the config is complete */
1732 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1733 &glist, &gcount) != 0)
1734 return;
1735
1736 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1737
1738 /* attempt to online all the vdevs & validate */
1739 attempt_reopen = B_TRUE;
1740 for (i = 0; i < gcount; i++) {
1741 if (glist[i] == 0) /* vdev is hole */
1742 continue;
1743
1744 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1745 if (vd[i] == NULL) {
1746 /*
1747 * Don't bother attempting to reopen the disks;
1748 * just do the split.
1749 */
1750 attempt_reopen = B_FALSE;
1751 } else {
1752 /* attempt to re-online it */
1753 vd[i]->vdev_offline = B_FALSE;
1754 }
1755 }
1756
1757 if (attempt_reopen) {
1758 vdev_reopen(spa->spa_root_vdev);
1759
1760 /* check each device to see what state it's in */
1761 for (extracted = 0, i = 0; i < gcount; i++) {
1762 if (vd[i] != NULL &&
1763 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1764 break;
1765 ++extracted;
1766 }
1767 }
1768
1769 /*
1770 * If every disk has been moved to the new pool, or if we never
1771 * even attempted to look at them, then we split them off for
1772 * good.
1773 */
1774 if (!attempt_reopen || gcount == extracted) {
1775 for (i = 0; i < gcount; i++)
1776 if (vd[i] != NULL)
1777 vdev_split(vd[i]);
1778 vdev_reopen(spa->spa_root_vdev);
1779 }
1780
1781 kmem_free(vd, gcount * sizeof (vdev_t *));
1782 }
1783
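/*
 * Load basic pool-wide state from the given config (GUID, comment,
 * version, txg, split information) and then call spa_load_impl() to do
 * the heavy lifting.  On failure, post an ereport unless the error was
 * EBADF, and clear the load timestamp unless it was EEXIST.
 */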
1784 static int
1785 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1786 boolean_t mosconfig)
1787 {
1788 nvlist_t *config = spa->spa_config;
1789 char *ereport = FM_EREPORT_ZFS_POOL;
1790 char *comment;
1791 int error;
1792 uint64_t pool_guid;
1793 nvlist_t *nvl;
1794
1795 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1796 return (EINVAL);
1797
1798 ASSERT(spa->spa_comment == NULL);
1799 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1800 spa->spa_comment = spa_strdup(comment);
1801
1802 /*
1803 * Versioning wasn't explicitly added to the label until later, so if
1804 * it's not present, treat it as the initial version.
1805 */
1806 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1807 &spa->spa_ubsync.ub_version) != 0)
1808 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1809
1810 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1811 &spa->spa_config_txg);
1812
1813 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1814 spa_guid_exists(pool_guid, 0)) {
1815 error = EEXIST;
1816 } else {
1817 spa->spa_config_guid = pool_guid;
1818
1819 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1820 &nvl) == 0) {
1821 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1822 KM_SLEEP) == 0);
1823 }
1824
1825 gethrestime(&spa->spa_loaded_ts);
1826 error = spa_load_impl(spa, pool_guid, config, state, type,
1827 mosconfig, &ereport);
1828 }
1829
1830 spa->spa_minref = refcount_count(&spa->spa_refcount);
1831 if (error) {
1832 if (error != EEXIST) {
1833 spa->spa_loaded_ts.tv_sec = 0;
1834 spa->spa_loaded_ts.tv_nsec = 0;
1835 }
1836 if (error != EBADF) {
1837 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1838 }
1839 }
1840 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1841 spa->spa_ena = 0;
1842
1843 return (error);
1844 }
1845
1846 /*
1847 * Load an existing storage pool, using the pool's builtin spa_config as a
1848 * source of configuration information.
1849 */
1850 static int
1851 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1852 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1853 char **ereport)
1854 {
1855 int error = 0;
1856 nvlist_t *nvroot = NULL;
1857 vdev_t *rvd;
1858 uberblock_t *ub = &spa->spa_uberblock;
1859 uint64_t children, config_cache_txg = spa->spa_config_txg;
1860 int orig_mode = spa->spa_mode;
1861 int parse;
1862 uint64_t obj;
1863
1864 /*
1865 * If this is an untrusted config, access the pool in read-only mode.
1866 * This prevents things like resilvering recently removed devices.
1867 */
1868 if (!mosconfig)
1869 spa->spa_mode = FREAD;
1870
1871 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1872
1873 spa->spa_load_state = state;
1874
1875 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1876 return (EINVAL);
1877
1878 parse = (type == SPA_IMPORT_EXISTING ?
1879 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1880
1881 /*
1882 * Create "The Godfather" zio to hold all async IOs
1883 */
1884 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1885 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1886
1887 /*
1888 * Parse the configuration into a vdev tree. We explicitly set the
1889 * value that will be returned by spa_version() since parsing the
1890 * configuration requires knowing the version number.
1891 */
1892 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1893 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1894 spa_config_exit(spa, SCL_ALL, FTAG);
1895
1896 if (error != 0)
1897 return (error);
1898
1899 ASSERT(spa->spa_root_vdev == rvd);
1900
1901 if (type != SPA_IMPORT_ASSEMBLE) {
1902 ASSERT(spa_guid(spa) == pool_guid);
1903 }
1904
1905 /*
1906 * Try to open all vdevs, loading each label in the process.
1907 */
1908 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1909 error = vdev_open(rvd);
1910 spa_config_exit(spa, SCL_ALL, FTAG);
1911 if (error != 0)
1912 return (error);
1913
1914 /*
1915 * We need to validate the vdev labels against the configuration that
1916 * we have in hand, which is dependent on the setting of mosconfig. If
1917 * mosconfig is true then we're validating the vdev labels based on
1918 * that config. Otherwise, we're validating against the cached config
1919 * (zpool.cache) that was read when we loaded the zfs module, and then
1920 * later we will recursively call spa_load() and validate against
1921 * the vdev config.
1922 *
1923 * If we're assembling a new pool that's been split off from an
1924 * existing pool, the labels haven't yet been updated so we skip
1925 * validation for now.
1926 */
1927 if (type != SPA_IMPORT_ASSEMBLE) {
1928 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1929 error = vdev_validate(rvd);
1930 spa_config_exit(spa, SCL_ALL, FTAG);
1931
1932 if (error != 0)
1933 return (error);
1934
1935 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1936 return (ENXIO);
1937 }
1938
1939 /*
1940 * Find the best uberblock.
1941 */
1942 vdev_uberblock_load(NULL, rvd, ub);
1943
1944 /*
1945 * If we weren't able to find a single valid uberblock, return failure.
1946 */
1947 if (ub->ub_txg == 0)
1948 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1949
1950 /*
1951 * If the pool is newer than the code, we can't open it.
1952 */
1953 if (ub->ub_version > SPA_VERSION)
1954 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1955
1956 /*
1957 * If the vdev guid sum doesn't match the uberblock, we have an
1958 * incomplete configuration. We first check to see if the pool
1959 * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
1960 * If it is, defer the vdev_guid_sum check until later so we
1961 * can handle missing vdevs.
1962 */
1963 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1964 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1965 rvd->vdev_guid_sum != ub->ub_guid_sum)
1966 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1967
1968 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1969 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1970 spa_try_repair(spa, config);
1971 spa_config_exit(spa, SCL_ALL, FTAG);
1972 nvlist_free(spa->spa_config_splitting);
1973 spa->spa_config_splitting = NULL;
1974 }
1975
1976 /*
1977 * Initialize internal SPA structures.
1978 */
1979 spa->spa_state = POOL_STATE_ACTIVE;
1980 spa->spa_ubsync = spa->spa_uberblock;
1981 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1982 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1983 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1984 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
1985 spa->spa_claim_max_txg = spa->spa_first_txg;
1986 spa->spa_prev_software_version = ub->ub_software_version;
1987
1988 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1989 if (error)
1990 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1991 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1992
1993 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
1994 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1995
1996 if (!mosconfig) {
1997 uint64_t hostid;
1998 nvlist_t *policy = NULL, *nvconfig;
1999
2000 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2001 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2002
2003 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2004 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2005 char *hostname;
2006 unsigned long myhostid = 0;
2007
2008 VERIFY(nvlist_lookup_string(nvconfig,
2009 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2010
2011 #ifdef _KERNEL
2012 myhostid = zone_get_hostid(NULL);
2013 #else /* _KERNEL */
2014 /*
2015 * We're emulating the system's hostid in userland, so
2016 * we can't use zone_get_hostid().
2017 */
2018 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2019 #endif /* _KERNEL */
2020 if (hostid != 0 && myhostid != 0 &&
2021 hostid != myhostid) {
2022 nvlist_free(nvconfig);
2023 cmn_err(CE_WARN, "pool '%s' could not be "
2024 "loaded as it was last accessed by "
2025 "another system (host: %s hostid: 0x%lx). "
2026 "See: http://www.sun.com/msg/ZFS-8000-EY",
2027 spa_name(spa), hostname,
2028 (unsigned long)hostid);
2029 return (EBADF);
2030 }
2031 }
2032 if (nvlist_lookup_nvlist(spa->spa_config,
2033 ZPOOL_REWIND_POLICY, &policy) == 0)
2034 VERIFY(nvlist_add_nvlist(nvconfig,
2035 ZPOOL_REWIND_POLICY, policy) == 0);
2036
2037 spa_config_set(spa, nvconfig);
2038 spa_unload(spa);
2039 spa_deactivate(spa);
2040 spa_activate(spa, orig_mode);
2041
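/*
 * Reload using the config we just read from the MOS; passing B_TRUE
 * for mosconfig ends the recursion, since the second pass validates
 * the labels against this trusted on-disk configuration instead.
 */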
2042 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2043 }
2044
2045 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2046 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2047 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2048 if (error != 0)
2049 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2050
2051 /*
2052 * Load the bit that tells us to use the new accounting function
2053 * (raid-z deflation). If we have an older pool, this will not
2054 * be present.
2055 */
2056 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2057 if (error != 0 && error != ENOENT)
2058 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2059
2060 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2061 &spa->spa_creation_version);
2062 if (error != 0 && error != ENOENT)
2063 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2064
2065 /*
2066 * Load the persistent error log. If we have an older pool, this will
2067 * not be present.
2068 */
2069 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2070 if (error != 0 && error != ENOENT)
2071 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2072
2073 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2074 &spa->spa_errlog_scrub);
2075 if (error != 0 && error != ENOENT)
2076 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2077
2078 /*
2079 * Load the history object. If we have an older pool, this
2080 * will not be present.
2081 */
2082 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2083 if (error != 0 && error != ENOENT)
2084 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2085
2086 /*
2087 * If we're assembling the pool from the split-off vdevs of
2088 * an existing pool, we don't want to attach the spares & cache
2089 * devices.
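* (Hence the type != SPA_IMPORT_ASSEMBLE checks on the spare and
* l2cache loads below.)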
2090 */
2091
2092 /*
2093 * Load any hot spares for this pool.
2094 */
2095 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2096 if (error != 0 && error != ENOENT)
2097 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2098 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2099 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2100 if (load_nvlist(spa, spa->spa_spares.sav_object,
2101 &spa->spa_spares.sav_config) != 0)
2102 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2103
2104 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2105 spa_load_spares(spa);
2106 spa_config_exit(spa, SCL_ALL, FTAG);
2107 } else if (error == 0) {
2108 spa->spa_spares.sav_sync = B_TRUE;
2109 }
2110
2111 /*
2112 * Load any level 2 ARC devices for this pool.
2113 */
2114 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2115 &spa->spa_l2cache.sav_object);
2116 if (error != 0 && error != ENOENT)
2117 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2118 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2119 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2120 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2121 &spa->spa_l2cache.sav_config) != 0)
2122 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2123
2124 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2125 spa_load_l2cache(spa);
2126 spa_config_exit(spa, SCL_ALL, FTAG);
2127 } else if (error == 0) {
2128 spa->spa_l2cache.sav_sync = B_TRUE;
2129 }
2130
2131 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2132
2133 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2134 if (error && error != ENOENT)
2135 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2136
2137 if (error == 0) {
2138 uint64_t autoreplace;
2139
2140 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2141 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2142 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2143 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2144 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2145 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2146 &spa->spa_dedup_ditto);
2147
2148 spa->spa_autoreplace = (autoreplace != 0);
2149 }
2150
2151 /*
2152 * If the 'autoreplace' property is set, then post a resource notifying
2153 * the ZFS DE that it should not issue any faults for unopenable
2154 * devices. We also iterate over the vdevs, and post a sysevent for any
2155 * unopenable vdevs so that the normal autoreplace handler can take
2156 * over.
2157 */
2158 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2159 spa_check_removed(spa->spa_root_vdev);
2160 /*
2161 * For the import case, this is done in spa_import(), because
2162 * at this point we're using the spare definitions from
2163 * the MOS config, not necessarily from the userland config.
2164 */
2165 if (state != SPA_LOAD_IMPORT) {
2166 spa_aux_check_removed(&spa->spa_spares);
2167 spa_aux_check_removed(&spa->spa_l2cache);
2168 }
2169 }
2170
2171 /*
2172 * Load the vdev state for all toplevel vdevs.
2173 */
2174 vdev_load(rvd);
2175
2176 /*
2177 * Propagate the leaf DTLs we just loaded all the way up the tree.
2178 */
2179 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2180 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2181 spa_config_exit(spa, SCL_ALL, FTAG);
2182
2183 /*
2184 * Load the DDTs (dedup tables).
2185 */
2186 error = ddt_load(spa);
2187 if (error != 0)
2188 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2189
2190 spa_update_dspace(spa);
2191
2192 /*
2193 * Validate the config, using the MOS config to fill in any
2194 * information which might be missing. If we fail to validate
2195 * the config then declare the pool unfit for use. If we're
2196 * assembling a pool from a split, the log is not transferred
2197 * over.
2198 */
2199 if (type != SPA_IMPORT_ASSEMBLE) {
2200 nvlist_t *nvconfig;
2201
2202 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2203 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2204
2205 if (!spa_config_valid(spa, nvconfig)) {
2206 nvlist_free(nvconfig);
2207 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2208 ENXIO));
2209 }
2210 nvlist_free(nvconfig);
2211
2212 /*
2213 * Now that we've validated the config, check the state of the
2214 * root vdev. If it can't be opened, it indicates one or
2215 * more toplevel vdevs are faulted.
2216 */
2217 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2218 return (ENXIO);
2219
2220 if (spa_check_logs(spa)) {
2221 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2222 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2223 }
2224 }
2225
2226 /*
2227 * We've successfully opened the pool, verify that we're ready
2228 * to start pushing transactions.
2229 */
2230 if (state != SPA_LOAD_TRYIMPORT) {
2231 if (error = spa_load_verify(spa))
2232 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2233 error));
2234 }
2235
2236 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2237 spa->spa_load_max_txg == UINT64_MAX)) {
2238 dmu_tx_t *tx;
2239 int need_update = B_FALSE;
2240
2241 ASSERT(state != SPA_LOAD_TRYIMPORT);
2242
2243 /*
2244 * Claim log blocks that haven't been committed yet.
2245 * This must all happen in a single txg.
2246 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2247 * invoked from zil_claim_log_block()'s i/o done callback.
2248 * Price of rollback is that we abandon the log.
2249 */
2250 spa->spa_claiming = B_TRUE;
2251
2252 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2253 spa_first_txg(spa));
2254 (void) dmu_objset_find(spa_name(spa),
2255 zil_claim, tx, DS_FIND_CHILDREN);
2256 dmu_tx_commit(tx);
2257
2258 spa->spa_claiming = B_FALSE;
2259
2260 spa_set_log_state(spa, SPA_LOG_GOOD);
2261 spa->spa_sync_on = B_TRUE;
2262 txg_sync_start(spa->spa_dsl_pool);
2263
2264 /*
2265 * Wait for all claims to sync. We sync up to the highest
2266 * claimed log block birth time so that claimed log blocks
2267 * don't appear to be from the future. spa_claim_max_txg
2268 * will have been set for us by either zil_check_log_chain()
2269 * (invoked from spa_check_logs()) or zil_claim() above.
2270 */
2271 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2272
2273 /*
2274 * If the config cache is stale, or we have uninitialized
2275 * metaslabs (see spa_vdev_add()), then update the config.
2276 *
2277 * If this is a verbatim import, trust the current
2278 * in-core spa_config and update the disk labels.
2279 */
2280 if (config_cache_txg != spa->spa_config_txg ||
2281 state == SPA_LOAD_IMPORT ||
2282 state == SPA_LOAD_RECOVER ||
2283 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2284 need_update = B_TRUE;
2285
2286 for (int c = 0; c < rvd->vdev_children; c++)
2287 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2288 need_update = B_TRUE;
2289
2290 /*
2291 * Update the config cache asynchronously in case we're the
2292 * root pool, in which case the config cache isn't writable yet.
2293 */
2294 if (need_update)
2295 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2296
2297 /*
2298 * Check all DTLs to see if anything needs resilvering.
2299 */
2300 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2301 vdev_resilver_needed(rvd, NULL, NULL))
2302 spa_async_request(spa, SPA_ASYNC_RESILVER);
2303
2304 /*
2305 * Delete any inconsistent datasets.
2306 */
2307 (void) dmu_objset_find(spa_name(spa),
2308 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2309
2310 /*
2311 * Clean up any stale temporary dataset userrefs.
2312 */
2313 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2314 }
2315
2316 return (0);
2317 }
2318
2319 static int
2320 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2321 {
2322 int mode = spa->spa_mode;
2323
2324 spa_unload(spa);
2325 spa_deactivate(spa);
2326
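/*
 * Lowering spa_load_max_txg forces the next load attempt to select an
 * older uberblock (one whose txg does not exceed the new maximum).
 */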
2327 spa->spa_load_max_txg--;
2328
2329 spa_activate(spa, mode);
2330 spa_async_suspend(spa);
2331
2332 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2333 }
2334
2335 static int
2336 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2337 uint64_t max_request, int rewind_flags)
2338 {
2339 nvlist_t *config = NULL;
2340 int load_error, rewind_error;
2341 uint64_t safe_rewind_txg;
2342 uint64_t min_txg;
2343
2344 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2345 spa->spa_load_max_txg = spa->spa_load_txg;
2346 spa_set_log_state(spa, SPA_LOG_CLEAR);
2347 } else {
2348 spa->spa_load_max_txg = max_request;
2349 }
2350
2351 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2352 mosconfig);
2353 if (load_error == 0)
2354 return (0);
2355
2356 if (spa->spa_root_vdev != NULL)
2357 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2358
2359 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2360 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2361
2362 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2363 nvlist_free(config);
2364 return (load_error);
2365 }
2366
2367 /* Price of rolling back is discarding txgs, including log */
2368 if (state == SPA_LOAD_RECOVER)
2369 spa_set_log_state(spa, SPA_LOG_CLEAR);
2370
2371 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2372 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2373 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2374 TXG_INITIAL : safe_rewind_txg;
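/*
 * (Illustrative example, assuming TXG_DEFER_SIZE is 2: if the last
 * synced uberblock was txg 100, safe_rewind_txg is 98; a normal rewind
 * will not accept uberblocks below txg 98, while ZPOOL_EXTREME_REWIND
 * allows rewinding all the way back toward TXG_INITIAL.)
 */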
2375
2376 /*
2377 * Continue as long as we're finding errors, we're still within
2378 * the acceptable rewind range, and we're still finding uberblocks
2379 */
2380 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2381 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2382 if (spa->spa_load_max_txg < safe_rewind_txg)
2383 spa->spa_extreme_rewind = B_TRUE;
2384 rewind_error = spa_load_retry(spa, state, mosconfig);
2385 }
2386
2387 spa->spa_extreme_rewind = B_FALSE;
2388 spa->spa_load_max_txg = UINT64_MAX;
2389
2390 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2391 spa_config_set(spa, config);
2392
2393 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
2394 }
2395
2396 /*
2397 * Pool Open/Import
2398 *
2399 * The import case is identical to an open except that the configuration is sent
2400 * down from userland, instead of grabbed from the configuration cache. For the
2401 * case of an open, the pool configuration will exist in the
2402 * POOL_STATE_UNINITIALIZED state.
2403 *
2404 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2405 * the same time we open the pool, without having to keep around the spa_t in some
2406 * ambiguous state.
2407 */
2408 static int
2409 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2410 nvlist_t **config)
2411 {
2412 spa_t *spa;
2413 spa_load_state_t state = SPA_LOAD_OPEN;
2414 int error;
2415 int locked = B_FALSE;
2416
2417 *spapp = NULL;
2418
2419 /*
2420 * As disgusting as this is, we need to support recursive calls to this
2421 * function because dsl_dir_open() is called during spa_load(), and ends
2422 * up calling spa_open() again. The real fix is to figure out how to
2423 * avoid dsl_dir_open() calling this in the first place.
2424 */
2425 if (mutex_owner(&spa_namespace_lock) != curthread) {
2426 mutex_enter(&spa_namespace_lock);
2427 locked = B_TRUE;
2428 }
2429
2430 if ((spa = spa_lookup(pool)) == NULL) {
2431 if (locked)
2432 mutex_exit(&spa_namespace_lock);
2433 return (ENOENT);
2434 }
2435
2436 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2437 zpool_rewind_policy_t policy;
2438
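/*
 * (Note: the rewind policy, if any, arrives as a ZPOOL_REWIND_POLICY
 * nvlist embedded in the config by userland; when it is absent,
 * zpool_get_rewind_policy() leaves zrp_request at ZPOOL_NO_REWIND.)
 */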
2439 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2440 &policy);
2441 if (policy.zrp_request & ZPOOL_DO_REWIND)
2442 state = SPA_LOAD_RECOVER;
2443
2444 spa_activate(spa, spa_mode_global);
2445
2446 if (state != SPA_LOAD_RECOVER)
2447 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2448
2449 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2450 policy.zrp_request);
2451
2452 if (error == EBADF) {
2453 /*
2454 * If vdev_validate() returns failure (indicated by
2455 * EBADF), then one of the vdev labels indicates that
2456 * the pool has been exported or destroyed. If
2457 * this is the case, the config cache is out of sync and
2458 * we should remove the pool from the namespace.
2459 */
2460 spa_unload(spa);
2461 spa_deactivate(spa);
2462 spa_config_sync(spa, B_TRUE, B_TRUE);
2463 spa_remove(spa);
2464 if (locked)
2465 mutex_exit(&spa_namespace_lock);
2466 return (ENOENT);
2467 }
2468
2469 if (error) {
2470 /*
2471 * We can't open the pool, but we still have useful
2472 * information: the state of each vdev after the
2473 * attempted vdev_open(). Return this to the user.
2474 */
2475 if (config != NULL && spa->spa_config) {
2476 VERIFY(nvlist_dup(spa->spa_config, config,
2477 KM_SLEEP) == 0);
2478 VERIFY(nvlist_add_nvlist(*config,
2479 ZPOOL_CONFIG_LOAD_INFO,
2480 spa->spa_load_info) == 0);
2481 }
2482 spa_unload(spa);
2483 spa_deactivate(spa);
2484 spa->spa_last_open_failed = error;
2485 if (locked)
2486 mutex_exit(&spa_namespace_lock);
2487 *spapp = NULL;
2488 return (error);
2489 }
2490 }
2491
2492 spa_open_ref(spa, tag);
2493
2494 if (config != NULL)
2495 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2496
2497 /*
2498 * If we've recovered the pool, pass back any information we
2499 * gathered while doing the load.
2500 */
2501 if (state == SPA_LOAD_RECOVER) {
2502 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2503 spa->spa_load_info) == 0);
2504 }
2505
2506 if (locked) {
2507 spa->spa_last_open_failed = 0;
2508 spa->spa_last_ubsync_txg = 0;
2509 spa->spa_load_txg = 0;
2510 mutex_exit(&spa_namespace_lock);
2511 }
2512
2513 *spapp = spa;
2514
2515 return (0);
2516 }
2517
2518 int
2519 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2520 nvlist_t **config)
2521 {
2522 return (spa_open_common(name, spapp, tag, policy, config));
2523 }
2524
2525 int
2526 spa_open(const char *name, spa_t **spapp, void *tag)
2527 {
2528 return (spa_open_common(name, spapp, tag, NULL, NULL));
2529 }
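/*
 * A minimal usage sketch for spa_open()/spa_close() above (hypothetical
 * caller; the pool name "tank" is illustrative only):
 *
 *	spa_t *spa;
 *	int err;
 *
 *	if ((err = spa_open("tank", &spa, FTAG)) != 0)
 *		return (err);
 *	... operate on the held pool ...
 *	spa_close(spa, FTAG);
 */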
2530
2531 /*
2532 * Look up the given spa_t, incrementing the inject count in the process,
2533 * preventing it from being exported or destroyed.
2534 */
2535 spa_t *
2536 spa_inject_addref(char *name)
2537 {
2538 spa_t *spa;
2539
2540 mutex_enter(&spa_namespace_lock);
2541 if ((spa = spa_lookup(name)) == NULL) {
2542 mutex_exit(&spa_namespace_lock);
2543 return (NULL);
2544 }
2545 spa->spa_inject_ref++;
2546 mutex_exit(&spa_namespace_lock);
2547
2548 return (spa);
2549 }
2550
2551 void
2552 spa_inject_delref(spa_t *spa)
2553 {
2554 mutex_enter(&spa_namespace_lock);
2555 spa->spa_inject_ref--;
2556 mutex_exit(&spa_namespace_lock);
2557 }
2558
2559 /*
2560 * Add spare device information to the nvlist.
2561 */
2562 static void
2563 spa_add_spares(spa_t *spa, nvlist_t *config)
2564 {
2565 nvlist_t **spares;
2566 uint_t i, nspares;
2567 nvlist_t *nvroot;
2568 uint64_t guid;
2569 vdev_stat_t *vs;
2570 uint_t vsc;
2571 uint64_t pool;
2572
2573 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2574
2575 if (spa->spa_spares.sav_count == 0)
2576 return;
2577
2578 VERIFY(nvlist_lookup_nvlist(config,
2579 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2580 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2581 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2582 if (nspares != 0) {
2583 VERIFY(nvlist_add_nvlist_array(nvroot,
2584 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2585 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2586 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2587
2588 /*
2589 * Go through and find any spares which have since been
2590 * repurposed as an active spare. If this is the case, update
2591 * their status appropriately.
2592 */
2593 for (i = 0; i < nspares; i++) {
2594 VERIFY(nvlist_lookup_uint64(spares[i],
2595 ZPOOL_CONFIG_GUID, &guid) == 0);
2596 if (spa_spare_exists(guid, &pool, NULL) &&
2597 pool != 0ULL) {
2598 VERIFY(nvlist_lookup_uint64_array(
2599 spares[i], ZPOOL_CONFIG_VDEV_STATS,
2600 (uint64_t **)&vs, &vsc) == 0);
2601 vs->vs_state = VDEV_STATE_CANT_OPEN;
2602 vs->vs_aux = VDEV_AUX_SPARED;
2603 }
2604 }
2605 }
2606 }
2607
2608 /*
2609 * Add l2cache device information to the nvlist, including vdev stats.
2610 */
2611 static void
2612 spa_add_l2cache(spa_t *spa, nvlist_t *config)
2613 {
2614 nvlist_t **l2cache;
2615 uint_t i, j, nl2cache;
2616 nvlist_t *nvroot;
2617 uint64_t guid;
2618 vdev_t *vd;
2619 vdev_stat_t *vs;
2620 uint_t vsc;
2621
2622 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2623
2624 if (spa->spa_l2cache.sav_count == 0)
2625 return;
2626
2627 VERIFY(nvlist_lookup_nvlist(config,
2628 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2629 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
2630 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2631 if (nl2cache != 0) {
2632 VERIFY(nvlist_add_nvlist_array(nvroot,
2633 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2634 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2635 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2636
2637 /*
2638 * Update level 2 cache device stats.
2639 */
2640
2641 for (i = 0; i < nl2cache; i++) {
2642 VERIFY(nvlist_lookup_uint64(l2cache[i],
2643 ZPOOL_CONFIG_GUID, &guid) == 0);
2644
2645 vd = NULL;
2646 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2647 if (guid ==
2648 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2649 vd = spa->spa_l2cache.sav_vdevs[j];
2650 break;
2651 }
2652 }
2653 ASSERT(vd != NULL);
2654
2655 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2656 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2657 == 0);
2658 vdev_get_stats(vd, vs);
2659 }
2660 }
2661 }
2662
2663 int
2664 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
2665 {
2666 int error;
2667 spa_t *spa;
2668
2669 *config = NULL;
2670 error = spa_open_common(name, &spa, FTAG, NULL, config);
2671
2672 if (spa != NULL) {
2673 /*
2674 * This still leaves a window of inconsistency where the spares
2675 * or l2cache devices could change and the config would be
2676 * self-inconsistent.
2677 */
2678 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2679
2680 if (*config != NULL) {
2681 uint64_t loadtimes[2];
2682
2683 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2684 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2685 VERIFY(nvlist_add_uint64_array(*config,
2686 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2687
2688 VERIFY(nvlist_add_uint64(*config,
2689 ZPOOL_CONFIG_ERRCOUNT,
2690 spa_get_errlog_size(spa)) == 0);
2691
2692 if (spa_suspended(spa))
2693 VERIFY(nvlist_add_uint64(*config,
2694 ZPOOL_CONFIG_SUSPENDED,
2695 spa->spa_failmode) == 0);
2696
2697 spa_add_spares(spa, *config);
2698 spa_add_l2cache(spa, *config);
2699 }
2700 }
2701
2702 /*
2703 * We want to get the alternate root even for faulted pools, so we cheat
2704 * and call spa_lookup() directly.
2705 */
2706 if (altroot) {
2707 if (spa == NULL) {
2708 mutex_enter(&spa_namespace_lock);
2709 spa = spa_lookup(name);
2710 if (spa)
2711 spa_altroot(spa, altroot, buflen);
2712 else
2713 altroot[0] = '\0';
2714 spa = NULL;
2715 mutex_exit(&spa_namespace_lock);
2716 } else {
2717 spa_altroot(spa, altroot, buflen);
2718 }
2719 }
2720
2721 if (spa != NULL) {
2722 spa_config_exit(spa, SCL_CONFIG, FTAG);
2723 spa_close(spa, FTAG);
2724 }
2725
2726 return (error);
2727 }
2728
2729 /*
2730 * Validate that the auxiliary device array is well formed. We must have an
2731 * array of nvlists, each of which describes a valid leaf vdev. If this is an
2732 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
2733 * specified, as long as they are well-formed.
2734 */
2735 static int
2736 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
2737 spa_aux_vdev_t *sav, const char *config, uint64_t version,
2738 vdev_labeltype_t label)
2739 {
2740 nvlist_t **dev;
2741 uint_t i, ndev;
2742 vdev_t *vd;
2743 int error;
2744
2745 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2746
2747 /*
2748 * It's acceptable to have no devs specified.
2749 */
2750 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
2751 return (0);
2752
2753 if (ndev == 0)
2754 return (EINVAL);
2755
2756 /*
2757 * Make sure the pool is formatted with a version that supports this
2758 * device type.
2759 */
2760 if (spa_version(spa) < version)
2761 return (ENOTSUP);
2762
2763 /*
2764 * Set the pending device list so we correctly handle device in-use
2765 * checking.
2766 */
2767 sav->sav_pending = dev;
2768 sav->sav_npending = ndev;
2769
2770 for (i = 0; i < ndev; i++) {
2771 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
2772 mode)) != 0)
2773 goto out;
2774
2775 if (!vd->vdev_ops->vdev_op_leaf) {
2776 vdev_free(vd);
2777 error = EINVAL;
2778 goto out;
2779 }
2780
2781 /*
2782 * The L2ARC currently only supports disk devices in
2783 * kernel context. For user-level testing, we allow it.
2784 */
2785 #ifdef _KERNEL
2786 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
2787 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
2788 error = ENOTBLK;
2789 goto out;
2790 }
2791 #endif
2792 vd->vdev_top = vd;
2793
2794 if ((error = vdev_open(vd)) == 0 &&
2795 (error = vdev_label_init(vd, crtxg, label)) == 0) {
2796 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
2797 vd->vdev_guid) == 0);
2798 }
2799
2800 vdev_free(vd);
2801
2802 if (error &&
2803 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
2804 goto out;
2805 else
2806 error = 0;
2807 }
2808
2809 out:
2810 sav->sav_pending = NULL;
2811 sav->sav_npending = 0;
2812 return (error);
2813 }
2814
2815 static int
2816 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2817 {
2818 int error;
2819
2820 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2821
2822 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2823 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2824 VDEV_LABEL_SPARE)) != 0) {
2825 return (error);
2826 }
2827
2828 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2829 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2830 VDEV_LABEL_L2CACHE));
2831 }
2832
2833 static void
2834 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2835 const char *config)
2836 {
2837 int i;
2838
2839 if (sav->sav_config != NULL) {
2840 nvlist_t **olddevs;
2841 uint_t oldndevs;
2842 nvlist_t **newdevs;
2843
2844 /*
2845 * Generate new dev list by concatenating with the
2846 * current dev list.
2847 */
2848 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2849 &olddevs, &oldndevs) == 0);
2850
2851 newdevs = kmem_alloc(sizeof (void *) *
2852 (ndevs + oldndevs), KM_SLEEP);
2853 for (i = 0; i < oldndevs; i++)
2854 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2855 KM_SLEEP) == 0);
2856 for (i = 0; i < ndevs; i++)
2857 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2858 KM_SLEEP) == 0);
2859
2860 VERIFY(nvlist_remove(sav->sav_config, config,
2861 DATA_TYPE_NVLIST_ARRAY) == 0);
2862
2863 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2864 config, newdevs, ndevs + oldndevs) == 0);
2865 for (i = 0; i < oldndevs + ndevs; i++)
2866 nvlist_free(newdevs[i]);
2867 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2868 } else {
2869 /*
2870 * Generate a new dev list.
2871 */
2872 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2873 KM_SLEEP) == 0);
2874 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2875 devs, ndevs) == 0);
2876 }
2877 }
2878
2879 /*
2880 * Stop and drop level 2 ARC devices
2881 */
2882 void
2883 spa_l2cache_drop(spa_t *spa)
2884 {
2885 vdev_t *vd;
2886 int i;
2887 spa_aux_vdev_t *sav = &spa->spa_l2cache;
2888
2889 for (i = 0; i < sav->sav_count; i++) {
2890 uint64_t pool;
2891
2892 vd = sav->sav_vdevs[i];
2893 ASSERT(vd != NULL);
2894
2895 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2896 pool != 0ULL && l2arc_vdev_present(vd))
2897 l2arc_remove_vdev(vd);
2898 if (vd->vdev_isl2cache)
2899 spa_l2cache_remove(vd);
2900 vdev_clear_stats(vd);
2901 (void) vdev_close(vd);
2902 }
2903 }
2904
2905 /*
2906 * Pool Creation
2907 */
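/*
 * A hedged sketch of the expected nvroot layout (normally assembled by
 * userland/libzfs and shown here only for illustration): a root nvlist
 * with ZPOOL_CONFIG_TYPE set to VDEV_TYPE_ROOT and a ZPOOL_CONFIG_CHILDREN
 * array of top-level/leaf vdev nvlists, each carrying at least
 * ZPOOL_CONFIG_TYPE (e.g. VDEV_TYPE_DISK or VDEV_TYPE_FILE) and
 * ZPOOL_CONFIG_PATH, plus optional ZPOOL_CONFIG_SPARES and
 * ZPOOL_CONFIG_L2CACHE arrays alongside the children.
 */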
2908 int
2909 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2910 const char *history_str, nvlist_t *zplprops)
2911 {
2912 spa_t *spa;
2913 char *altroot = NULL;
2914 vdev_t *rvd;
2915 dsl_pool_t *dp;
2916 dmu_tx_t *tx;
2917 int error = 0;
2918 uint64_t txg = TXG_INITIAL;
2919 nvlist_t **spares, **l2cache;
2920 uint_t nspares, nl2cache;
2921 uint64_t version, obj;
2922
2923 /*
2924 * If this pool already exists, return failure.
2925 */
2926 mutex_enter(&spa_namespace_lock);
2927 if (spa_lookup(pool) != NULL) {
2928 mutex_exit(&spa_namespace_lock);
2929 return (EEXIST);
2930 }
2931
2932 /*
2933 * Allocate a new spa_t structure.
2934 */
2935 (void) nvlist_lookup_string(props,
2936 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2937 spa = spa_add(pool, NULL, altroot);
2938 spa_activate(spa, spa_mode_global);
2939
2940 if (props && (error = spa_prop_validate(spa, props))) {
2941 spa_deactivate(spa);
2942 spa_remove(spa);
2943 mutex_exit(&spa_namespace_lock);
2944 return (error);
2945 }
2946
2947 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2948 &version) != 0)
2949 version = SPA_VERSION;
2950 ASSERT(version <= SPA_VERSION);
2951
2952 spa->spa_first_txg = txg;
2953 spa->spa_uberblock.ub_txg = txg - 1;
2954 spa->spa_uberblock.ub_version = version;
2955 spa->spa_ubsync = spa->spa_uberblock;
2956
2957 /*
2958 * Create "The Godfather" zio to hold all async IOs
2959 */
2960 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2961 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2962
2963 /*
2964 * Create the root vdev.
2965 */
2966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2967
2968 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2969
2970 ASSERT(error != 0 || rvd != NULL);
2971 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2972
2973 if (error == 0 && !zfs_allocatable_devs(nvroot))
2974 error = EINVAL;
2975
2976 if (error == 0 &&
2977 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2978 (error = spa_validate_aux(spa, nvroot, txg,
2979 VDEV_ALLOC_ADD)) == 0) {
2980 for (int c = 0; c < rvd->vdev_children; c++) {
2981 vdev_metaslab_set_size(rvd->vdev_child[c]);
2982 vdev_expand(rvd->vdev_child[c], txg);
2983 }
2984 }
2985
2986 spa_config_exit(spa, SCL_ALL, FTAG);
2987
2988 if (error != 0) {
2989 spa_unload(spa);
2990 spa_deactivate(spa);
2991 spa_remove(spa);
2992 mutex_exit(&spa_namespace_lock);
2993 return (error);
2994 }
2995
2996 /*
2997 * Get the list of spares, if specified.
2998 */
2999 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3000 &spares, &nspares) == 0) {
3001 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3002 KM_SLEEP) == 0);
3003 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3004 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3005 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3006 spa_load_spares(spa);
3007 spa_config_exit(spa, SCL_ALL, FTAG);
3008 spa->spa_spares.sav_sync = B_TRUE;
3009 }
3010
3011 /*
3012 * Get the list of level 2 cache devices, if specified.
3013 */
3014 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3015 &l2cache, &nl2cache) == 0) {
3016 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3017 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3018 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3019 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3020 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3021 spa_load_l2cache(spa);
3022 spa_config_exit(spa, SCL_ALL, FTAG);
3023 spa->spa_l2cache.sav_sync = B_TRUE;
3024 }
3025
3026 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3027 spa->spa_meta_objset = dp->dp_meta_objset;
3028
3029 /*
3030 * Create DDTs (dedup tables).
3031 */
3032 ddt_create(spa);
3033
3034 spa_update_dspace(spa);
3035
3036 tx = dmu_tx_create_assigned(dp, txg);
3037
3038 /*
3039 * Create the pool config object.
3040 */
3041 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3042 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3043 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3044
3045 if (zap_add(spa->spa_meta_objset,
3046 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3047 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3048 cmn_err(CE_PANIC, "failed to add pool config");
3049 }
3050
3051 if (zap_add(spa->spa_meta_objset,
3052 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3053 sizeof (uint64_t), 1, &version, tx) != 0) {
3054 cmn_err(CE_PANIC, "failed to add pool version");
3055 }
3056
3057 /* Newly created pools with the right version are always deflated. */
3058 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3059 spa->spa_deflate = TRUE;
3060 if (zap_add(spa->spa_meta_objset,
3061 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3062 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3063 cmn_err(CE_PANIC, "failed to add deflate");
3064 }
3065 }
3066
3067 /*
3068 * Create the deferred-free bpobj. Turn off compression
3069 * because sync-to-convergence takes longer if the blocksize
3070 * keeps changing.
3071 */
3072 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3073 dmu_object_set_compress(spa->spa_meta_objset, obj,
3074 ZIO_COMPRESS_OFF, tx);
3075 if (zap_add(spa->spa_meta_objset,
3076 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3077 sizeof (uint64_t), 1, &obj, tx) != 0) {
3078 cmn_err(CE_PANIC, "failed to add bpobj");
3079 }
3080 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3081 spa->spa_meta_objset, obj));
3082
3083 /*
3084 * Create the pool's history object.
3085 */
3086 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3087 spa_history_create_obj(spa, tx);
3088
3089 /*
3090 * Set pool properties.
3091 */
3092 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3093 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3094 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3095 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3096
3097 if (props != NULL) {
3098 spa_configfile_set(spa, props, B_FALSE);
3099 spa_sync_props(spa, props, tx);
3100 }
3101
3102 dmu_tx_commit(tx);
3103
3104 spa->spa_sync_on = B_TRUE;
3105 txg_sync_start(spa->spa_dsl_pool);
3106
3107 /*
3108 * We explicitly wait for the first transaction to complete so that our
3109 * bean counters are appropriately updated.
3110 */
3111 txg_wait_synced(spa->spa_dsl_pool, txg);
3112
3113 spa_config_sync(spa, B_FALSE, B_TRUE);
3114
3115 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
3116 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
3117 spa_history_log_version(spa, LOG_POOL_CREATE);
3118
3119 spa->spa_minref = refcount_count(&spa->spa_refcount);
3120
3121 mutex_exit(&spa_namespace_lock);
3122
3123 return (0);
3124 }
3125
3126 #ifdef _KERNEL
3127 /*
3128 * Get the root pool information from the root disk, then import the root pool
3129 * at system boot time.
3130 */
3131 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3132
3133 static nvlist_t *
3134 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3135 {
3136 nvlist_t *config;
3137 nvlist_t *nvtop, *nvroot;
3138 uint64_t pgid;
3139
3140 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3141 return (NULL);
3142
3143 /*
3144 * Add this top-level vdev to the child array.
3145 */
3146 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3147 &nvtop) == 0);
3148 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3149 &pgid) == 0);
3150 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3151
3152 /*
3153 * Put this pool's top-level vdevs into a root vdev.
3154 */
3155 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3156 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3157 VDEV_TYPE_ROOT) == 0);
3158 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3159 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3160 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3161 &nvtop, 1) == 0);
3162
3163 /*
3164 * Replace the existing vdev_tree with the new root vdev in
3165 * this pool's configuration (remove the old, add the new).
3166 */
3167 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3168 nvlist_free(nvroot);
3169 return (config);
3170 }
3171
3172 /*
3173 * Walk the vdev tree and see if we can find a device with "better"
3174 * configuration. A configuration is "better" if the label on that
3175 * device has a more recent txg.
3176 */
3177 static void
3178 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3179 {
3180 for (int c = 0; c < vd->vdev_children; c++)
3181 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3182
3183 if (vd->vdev_ops->vdev_op_leaf) {
3184 nvlist_t *label;
3185 uint64_t label_txg;
3186
3187 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3188 &label) != 0)
3189 return;
3190
3191 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3192 &label_txg) == 0);
3193
3194 /*
3195 * Do we have a better boot device?
3196 */
3197 if (label_txg > *txg) {
3198 *txg = label_txg;
3199 *avd = vd;
3200 }
3201 nvlist_free(label);
3202 }
3203 }
3204
3205 /*
3206 * Import a root pool.
3207 *
3208 * For x86, devpath_list will consist of the devid and/or physpath name of
3209 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3210 * The GRUB "findroot" command will return the vdev we should boot.
3211 *
3212 * For SPARC, devpath_list consists of the physpath name of the booting device,
3213 * whether the root pool is a single-device pool or a mirrored pool.
3214 * e.g.
3215 * "/pci@1f,0/ide@d/disk@0,0:a"
3216 */
3217 int
3218 spa_import_rootpool(char *devpath, char *devid)
3219 {
3220 spa_t *spa;
3221 vdev_t *rvd, *bvd, *avd = NULL;
3222 nvlist_t *config, *nvtop;
3223 uint64_t guid, txg;
3224 char *pname;
3225 int error;
3226
3227 /*
3228 * Read the label from the boot device and generate a configuration.
3229 */
3230 config = spa_generate_rootconf(devpath, devid, &guid);
3231 #if defined(_OBP) && defined(_KERNEL)
3232 if (config == NULL) {
3233 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3234 /* iscsi boot */
3235 get_iscsi_bootpath_phy(devpath);
3236 config = spa_generate_rootconf(devpath, devid, &guid);
3237 }
3238 }
3239 #endif
3240 if (config == NULL) {
3241 cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
3242 devpath);
3243 return (EIO);
3244 }
3245
3246 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3247 &pname) == 0);
3248 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3249
3250 mutex_enter(&spa_namespace_lock);
3251 if ((spa = spa_lookup(pname)) != NULL) {
3252 /*
3253 * Remove the existing root pool from the namespace so that we
3254 * can replace it with the correct config we just read in.
3255 */
3256 spa_remove(spa);
3257 }
3258
3259 spa = spa_add(pname, config, NULL);
3260 spa->spa_is_root = B_TRUE;
3261 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3262
3263 /*
3264 * Build up a vdev tree based on the boot device's label config.
3265 */
3266 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3267 &nvtop) == 0);
3268 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3269 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3270 VDEV_ALLOC_ROOTPOOL);
3271 spa_config_exit(spa, SCL_ALL, FTAG);
3272 if (error) {
3273 mutex_exit(&spa_namespace_lock);
3274 nvlist_free(config);
3275 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3276 pname);
3277 return (error);
3278 }
3279
3280 /*
3281 * Get the boot vdev.
3282 */
3283 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3284 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3285 (u_longlong_t)guid);
3286 error = ENOENT;
3287 goto out;
3288 }
3289
3290 /*
3291 * Determine if there is a better boot device.
3292 */
3293 avd = bvd;
3294 spa_alt_rootvdev(rvd, &avd, &txg);
3295 if (avd != bvd) {
3296 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3297 "try booting from '%s'", avd->vdev_path);
3298 error = EINVAL;
3299 goto out;
3300 }
3301
3302 /*
3303 * If the boot device is part of a spare vdev then ensure that
3304 * we're booting off the active spare.
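* (By convention the active spare is the last child of the spare vdev,
* which is why the message below points at vdev_child[children - 1].)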
3305 */
3306 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3307 !bvd->vdev_isspare) {
3308 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3309 "try booting from '%s'",
3310 bvd->vdev_parent->
3311 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3312 error = EINVAL;
3313 goto out;
3314 }
3315
3316 error = 0;
3317 spa_history_log_version(spa, LOG_POOL_IMPORT);
3318 out:
3319 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3320 vdev_free(rvd);
3321 spa_config_exit(spa, SCL_ALL, FTAG);
3322 mutex_exit(&spa_namespace_lock);
3323
3324 nvlist_free(config);
3325 return (error);
3326 }
3327
3328 #endif
3329
3330 /*
3331 * Import a non-root pool into the system.
3332 */
3333 int
3334 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3335 {
3336 spa_t *spa;
3337 char *altroot = NULL;
3338 spa_load_state_t state = SPA_LOAD_IMPORT;
3339 zpool_rewind_policy_t policy;
3340 uint64_t mode = spa_mode_global;
3341 uint64_t readonly = B_FALSE;
3342 int error;
3343 nvlist_t *nvroot;
3344 nvlist_t **spares, **l2cache;
3345 uint_t nspares, nl2cache;
3346
3347 /*
3348 * If a pool with this name exists, return failure.
3349 */
3350 mutex_enter(&spa_namespace_lock);
3351 if (spa_lookup(pool) != NULL) {
3352 mutex_exit(&spa_namespace_lock);
3353 return (EEXIST);
3354 }
3355
3356 /*
3357 * Create and initialize the spa structure.
3358 */
3359 (void) nvlist_lookup_string(props,
3360 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3361 (void) nvlist_lookup_uint64(props,
3362 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3363 if (readonly)
3364 mode = FREAD;
3365 spa = spa_add(pool, config, altroot);
3366 spa->spa_import_flags = flags;
3367
3368 /*
3369 * Verbatim import - Take a pool and insert it into the namespace
3370 * as if it had been loaded at boot.
3371 */
3372 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3373 if (props != NULL)
3374 spa_configfile_set(spa, props, B_FALSE);
3375
3376 spa_config_sync(spa, B_FALSE, B_TRUE);
3377
3378 mutex_exit(&spa_namespace_lock);
3379 spa_history_log_version(spa, LOG_POOL_IMPORT);
3380
3381 return (0);
3382 }
3383
3384 spa_activate(spa, mode);
3385
3386 /*
3387 * Don't start async tasks until we know everything is healthy.
3388 */
3389 spa_async_suspend(spa);
3390
3391 zpool_get_rewind_policy(config, &policy);
3392 if (policy.zrp_request & ZPOOL_DO_REWIND)
3393 state = SPA_LOAD_RECOVER;
3394
3395 /*
3396 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
3397 * because the user-supplied config is actually the one to trust when
3398 * doing an import.
3399 */
3400 if (state != SPA_LOAD_RECOVER)
3401 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3402
3403 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3404 policy.zrp_request);
3405
3406 /*
3407 * Propagate anything learned while loading the pool and pass it
3408 * back to caller (i.e. rewind info, missing devices, etc).
3409 */
3410 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3411 spa->spa_load_info) == 0);
3412
3413 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3414 /*
3415 * Toss any existing sparelist, as it doesn't have any validity
3416 * anymore, and conflicts with spa_has_spare().
3417 */
3418 if (spa->spa_spares.sav_config) {
3419 nvlist_free(spa->spa_spares.sav_config);
3420 spa->spa_spares.sav_config = NULL;
3421 spa_load_spares(spa);
3422 }
3423 if (spa->spa_l2cache.sav_config) {
3424 nvlist_free(spa->spa_l2cache.sav_config);
3425 spa->spa_l2cache.sav_config = NULL;
3426 spa_load_l2cache(spa);
3427 }
3428
3429 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3430 &nvroot) == 0);
3431 if (error == 0)
3432 error = spa_validate_aux(spa, nvroot, -1ULL,
3433 VDEV_ALLOC_SPARE);
3434 if (error == 0)
3435 error = spa_validate_aux(spa, nvroot, -1ULL,
3436 VDEV_ALLOC_L2CACHE);
3437 spa_config_exit(spa, SCL_ALL, FTAG);
3438
3439 if (props != NULL)
3440 spa_configfile_set(spa, props, B_FALSE);
3441
3442 if (error != 0 || (props && spa_writeable(spa) &&
3443 (error = spa_prop_set(spa, props)))) {
3444 spa_unload(spa);
3445 spa_deactivate(spa);
3446 spa_remove(spa);
3447 mutex_exit(&spa_namespace_lock);
3448 return (error);
3449 }
3450
3451 spa_async_resume(spa);
3452
3453 /*
3454 * Override any spares and level 2 cache devices as specified by
3455 * the user, as these may have correct device names/devids, etc.
3456 */
3457 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3458 &spares, &nspares) == 0) {
3459 if (spa->spa_spares.sav_config)
3460 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3461 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3462 else
3463 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3464 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3465 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3466 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3467 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3468 spa_load_spares(spa);
3469 spa_config_exit(spa, SCL_ALL, FTAG);
3470 spa->spa_spares.sav_sync = B_TRUE;
3471 }
3472 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3473 &l2cache, &nl2cache) == 0) {
3474 if (spa->spa_l2cache.sav_config)
3475 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3476 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3477 else
3478 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3479 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3480 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3481 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3483 spa_load_l2cache(spa);
3484 spa_config_exit(spa, SCL_ALL, FTAG);
3485 spa->spa_l2cache.sav_sync = B_TRUE;
3486 }
3487
3488 /*
3489 * Check for any removed devices.
3490 */
3491 if (spa->spa_autoreplace) {
3492 spa_aux_check_removed(&spa->spa_spares);
3493 spa_aux_check_removed(&spa->spa_l2cache);
3494 }
3495
3496 if (spa_writeable(spa)) {
3497 /*
3498 * Update the config cache to include the newly-imported pool.
3499 */
3500 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3501 }
3502
3503 /*
3504 * It's possible that the pool was expanded while it was exported.
3505 * We kick off an async task to handle this for us.
3506 */
3507 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3508
3509 mutex_exit(&spa_namespace_lock);
3510 spa_history_log_version(spa, LOG_POOL_IMPORT);
3511
3512 return (0);
3513 }
3514
3515 nvlist_t *
3516 spa_tryimport(nvlist_t *tryconfig)
3517 {
3518 nvlist_t *config = NULL;
3519 char *poolname;
3520 spa_t *spa;
3521 uint64_t state;
3522 int error;
3523
3524 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3525 return (NULL);
3526
3527 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3528 return (NULL);
3529
3530 /*
3531 * Create and initialize the spa structure.
3532 */
3533 mutex_enter(&spa_namespace_lock);
3534 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3535 spa_activate(spa, FREAD);
3536
3537 /*
3538 * Pass off the heavy lifting to spa_load().
3539 * Pass TRUE for mosconfig because the user-supplied config
3540 * is actually the one to trust when doing an import.
3541 */
3542 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3543
3544 /*
3545 * If 'tryconfig' was at least parsable, return the current config.
3546 */
3547 if (spa->spa_root_vdev != NULL) {
3548 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3549 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3550 poolname) == 0);
3551 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3552 state) == 0);
3553 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3554 spa->spa_uberblock.ub_timestamp) == 0);
3555
3556 /*
3557 * If the bootfs property exists on this pool then we
3558 * copy it out so that external consumers can tell which
3559 * pools are bootable.
3560 */
3561 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3562 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3563
3564 /*
3565 * We have to play games with the name since the
3566 * pool was opened as TRYIMPORT_NAME.
3567 */
3568 if (dsl_dsobj_to_dsname(spa_name(spa),
3569 spa->spa_bootfs, tmpname) == 0) {
3570 char *cp;
3571 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3572
3573 cp = strchr(tmpname, '/');
3574 if (cp == NULL) {
3575 (void) strlcpy(dsname, tmpname,
3576 MAXPATHLEN);
3577 } else {
3578 (void) snprintf(dsname, MAXPATHLEN,
3579 "%s/%s", poolname, ++cp);
3580 }
3581 VERIFY(nvlist_add_string(config,
3582 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
3583 kmem_free(dsname, MAXPATHLEN);
3584 }
3585 kmem_free(tmpname, MAXPATHLEN);
3586 }
3587
3588 /*
3589 * Add the list of hot spares and level 2 cache devices.
3590 */
3591 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3592 spa_add_spares(spa, config);
3593 spa_add_l2cache(spa, config);
3594 spa_config_exit(spa, SCL_CONFIG, FTAG);
3595 }
3596
3597 spa_unload(spa);
3598 spa_deactivate(spa);
3599 spa_remove(spa);
3600 mutex_exit(&spa_namespace_lock);
3601
3602 return (config);
3603 }
3604
3605 /*
3606 * Pool export/destroy
3607 *
3608 * The act of destroying or exporting a pool is very simple. We make sure there
3609 * is no more pending I/O and any references to the pool are gone. Then, we
3610 * update the pool state and sync all the labels to disk, removing the
3611 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
3612 * we don't sync the labels or remove the configuration cache.
3613 */
3614 static int
3615 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
3616 boolean_t force, boolean_t hardforce)
3617 {
3618 spa_t *spa;
3619
3620 if (oldconfig)
3621 *oldconfig = NULL;
3622
3623 if (!(spa_mode_global & FWRITE))
3624 return (EROFS);
3625
3626 mutex_enter(&spa_namespace_lock);
3627 if ((spa = spa_lookup(pool)) == NULL) {
3628 mutex_exit(&spa_namespace_lock);
3629 return (ENOENT);
3630 }
3631
3632 /*
3633 * Put a hold on the pool, drop the namespace lock, stop async tasks,
3634 * reacquire the namespace lock, and see if we can export.
3635 */
3636 spa_open_ref(spa, FTAG);
3637 mutex_exit(&spa_namespace_lock);
3638 spa_async_suspend(spa);
3639 mutex_enter(&spa_namespace_lock);
3640 spa_close(spa, FTAG);
3641
3642 /*
3643 * The pool will be in core if it's openable,
3644 * in which case we can modify its state.
3645 */
3646 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
3647 /*
3648 * Objsets may be open only because they're dirty, so we
3649 * have to force it to sync before checking spa_refcnt.
3650 */
3651 txg_wait_synced(spa->spa_dsl_pool, 0);
3652
3653 /*
3654 * A pool cannot be exported or destroyed if there are active
3655 * references. If we are resetting a pool, allow references by
3656 * fault injection handlers.
3657 */
3658 if (!spa_refcount_zero(spa) ||
3659 (spa->spa_inject_ref != 0 &&
3660 new_state != POOL_STATE_UNINITIALIZED)) {
3661 spa_async_resume(spa);
3662 mutex_exit(&spa_namespace_lock);
3663 return (EBUSY);
3664 }
3665
3666 /*
3667 * A pool cannot be exported if it has an active shared spare.
3668 * This is to prevent other pools stealing the active spare
3669 * from an exported pool. The user may still force the
3670 * export if desired.
3671 */
3672 if (!force && new_state == POOL_STATE_EXPORTED &&
3673 spa_has_active_shared_spare(spa)) {
3674 spa_async_resume(spa);
3675 mutex_exit(&spa_namespace_lock);
3676 return (EXDEV);
3677 }
3678
3679 /*
3680 * We want this to be reflected on every label,
3681 * so mark them all dirty. spa_unload() will do the
3682 * final sync that pushes these changes out.
3683 */
3684 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
3685 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3686 spa->spa_state = new_state;
3687 spa->spa_final_txg = spa_last_synced_txg(spa) +
3688 TXG_DEFER_SIZE + 1;
3689 vdev_config_dirty(spa->spa_root_vdev);
3690 spa_config_exit(spa, SCL_ALL, FTAG);
3691 }
3692 }
3693
3694 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
3695
3696 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3697 spa_unload(spa);
3698 spa_deactivate(spa);
3699 }
3700
3701 if (oldconfig && spa->spa_config)
3702 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
3703
3704 if (new_state != POOL_STATE_UNINITIALIZED) {
3705 if (!hardforce)
3706 spa_config_sync(spa, B_TRUE, B_TRUE);
3707 spa_remove(spa);
3708 }
3709 mutex_exit(&spa_namespace_lock);
3710
3711 return (0);
3712 }
3713
3714 /*
3715 * Destroy a storage pool.
3716 */
3717 int
3718 spa_destroy(char *pool)
3719 {
3720 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3721 B_FALSE, B_FALSE));
3722 }
3723
3724 /*
3725 * Export a storage pool.
3726 */
3727 int
3728 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3729 boolean_t hardforce)
3730 {
3731 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3732 force, hardforce));
3733 }
3734
3735 /*
3736 * Similar to spa_export(), this unloads the spa_t without actually removing it
3737 * from the namespace in any way.
3738 */
3739 int
3740 spa_reset(char *pool)
3741 {
3742 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3743 B_FALSE, B_FALSE));
3744 }
3745
3746 /*
3747 * ==========================================================================
3748 * Device manipulation
3749 * ==========================================================================
3750 */
3751
3752 /*
3753 * Add a device to a storage pool.
3754 */
3755 int
3756 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
3757 {
3758 uint64_t txg, id;
3759 int error;
3760 vdev_t *rvd = spa->spa_root_vdev;
3761 vdev_t *vd, *tvd;
3762 nvlist_t **spares, **l2cache;
3763 uint_t nspares, nl2cache;
3764
3765 ASSERT(spa_writeable(spa));
3766
3767 txg = spa_vdev_enter(spa);
3768
3769 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
3770 VDEV_ALLOC_ADD)) != 0)
3771 return (spa_vdev_exit(spa, NULL, txg, error));
3772
3773 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
3774
3775 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
3776 &nspares) != 0)
3777 nspares = 0;
3778
3779 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
3780 &nl2cache) != 0)
3781 nl2cache = 0;
3782
3783 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
3784 return (spa_vdev_exit(spa, vd, txg, EINVAL));
3785
3786 if (vd->vdev_children != 0 &&
3787 (error = vdev_create(vd, txg, B_FALSE)) != 0)
3788 return (spa_vdev_exit(spa, vd, txg, error));
3789
3790 /*
3791 * We must validate the spares and l2cache devices after checking the
3792 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
3793 */
3794 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
3795 return (spa_vdev_exit(spa, vd, txg, error));
3796
3797 /*
3798 * Transfer each new top-level vdev from vd to rvd.
3799 */
3800 for (int c = 0; c < vd->vdev_children; c++) {
3801
3802 /*
3803 * Set the vdev id to the first hole, if one exists.
3804 */
3805 for (id = 0; id < rvd->vdev_children; id++) {
3806 if (rvd->vdev_child[id]->vdev_ishole) {
3807 vdev_free(rvd->vdev_child[id]);
3808 break;
3809 }
3810 }
3811 tvd = vd->vdev_child[c];
3812 vdev_remove_child(vd, tvd);
3813 tvd->vdev_id = id;
3814 vdev_add_child(rvd, tvd);
3815 vdev_config_dirty(tvd);
3816 }
3817
3818 if (nspares != 0) {
3819 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
3820 ZPOOL_CONFIG_SPARES);
3821 spa_load_spares(spa);
3822 spa->spa_spares.sav_sync = B_TRUE;
3823 }
3824
3825 if (nl2cache != 0) {
3826 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
3827 ZPOOL_CONFIG_L2CACHE);
3828 spa_load_l2cache(spa);
3829 spa->spa_l2cache.sav_sync = B_TRUE;
3830 }
3831
3832 /*
3833 * We have to be careful when adding new vdevs to an existing pool.
3834 * If other threads start allocating from these vdevs before we
3835 * sync the config cache, and we lose power, then upon reboot we may
3836 * fail to open the pool because there are DVAs that the config cache
3837 * can't translate. Therefore, we first add the vdevs without
3838 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
3839 * and then let spa_config_update() initialize the new metaslabs.
3840 *
3841 * spa_load() checks for added-but-not-initialized vdevs, so that
3842 * if we lose power at any point in this sequence, the remaining
3843 * steps will be completed the next time we load the pool.
3844 */
3845 (void) spa_vdev_exit(spa, vd, txg, 0);
3846
3847 mutex_enter(&spa_namespace_lock);
3848 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3849 mutex_exit(&spa_namespace_lock);
3850
3851 return (0);
3852 }
3853
3854 /*
3855 * Attach a device to a mirror. The arguments are the path to any device
3856 * in the mirror, and the nvroot for the new device. If the path specifies
3857 * a device that is not mirrored, we automatically insert the mirror vdev.
3858 *
3859 * If 'replacing' is specified, the new device is intended to replace the
3860 * existing device; in this case the two devices are made into their own
3861 * mirror using the 'replacing' vdev, which is functionally identical to
3862 * the mirror vdev (it actually reuses all the same ops) but has a few
3863 * extra rules: you can't attach to it after it's been created, and upon
3864 * completion of resilvering, the first disk (the one being replaced)
3865 * is automatically detached.
3866 */
3867 int
3868 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3869 {
3870 uint64_t txg, dtl_max_txg;
3871 vdev_t *rvd = spa->spa_root_vdev;
3872 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3873 vdev_ops_t *pvops;
3874 char *oldvdpath, *newvdpath;
3875 int newvd_isspare;
3876 int error;
3877
3878 ASSERT(spa_writeable(spa));
3879
3880 txg = spa_vdev_enter(spa);
3881
3882 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3883
3884 if (oldvd == NULL)
3885 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3886
3887 if (!oldvd->vdev_ops->vdev_op_leaf)
3888 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3889
3890 pvd = oldvd->vdev_parent;
3891
3892 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3893 VDEV_ALLOC_ADD)) != 0)
3894 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3895
3896 if (newrootvd->vdev_children != 1)
3897 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3898
3899 newvd = newrootvd->vdev_child[0];
3900
3901 if (!newvd->vdev_ops->vdev_op_leaf)
3902 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3903
3904 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3905 return (spa_vdev_exit(spa, newrootvd, txg, error));
3906
3907 /*
3908 * Spares can't replace logs
3909 */
3910 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3911 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3912
3913 if (!replacing) {
3914 /*
3915 * For attach, the only allowable parent is a mirror or the root
3916 * vdev.
3917 */
3918 if (pvd->vdev_ops != &vdev_mirror_ops &&
3919 pvd->vdev_ops != &vdev_root_ops)
3920 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3921
3922 pvops = &vdev_mirror_ops;
3923 } else {
3924 /*
3925 * Active hot spares can only be replaced by inactive hot
3926 * spares.
3927 */
3928 if (pvd->vdev_ops == &vdev_spare_ops &&
3929 oldvd->vdev_isspare &&
3930 !spa_has_spare(spa, newvd->vdev_guid))
3931 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3932
3933 /*
3934 * If the source is a hot spare, and the parent isn't already a
3935 * spare, then we want to create a new hot spare. Otherwise, we
3936 * want to create a replacing vdev. The user is not allowed to
3937 * attach to a spared vdev child unless the 'isspare' state is
3938 * the same (spare replaces spare, non-spare replaces
3939 * non-spare).
3940 */
3941 if (pvd->vdev_ops == &vdev_replacing_ops &&
3942 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
3943 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3944 } else if (pvd->vdev_ops == &vdev_spare_ops &&
3945 newvd->vdev_isspare != oldvd->vdev_isspare) {
3946 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3947 }
3948
3949 if (newvd->vdev_isspare)
3950 pvops = &vdev_spare_ops;
3951 else
3952 pvops = &vdev_replacing_ops;
3953 }
3954
3955 /*
3956 * Make sure the new device is big enough.
3957 */
3958 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3959 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3960
3961 /*
3962 * The new device cannot have a higher alignment requirement
3963 * than the top-level vdev.
3964 */
3965 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3966 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3967
3968 /*
3969 * If this is an in-place replacement, update oldvd's path and devid
3970 * to make it distinguishable from newvd, and unopenable from now on.
3971 */
3972 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3973 spa_strfree(oldvd->vdev_path);
3974 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3975 KM_SLEEP);
3976 (void) sprintf(oldvd->vdev_path, "%s/%s",
3977 newvd->vdev_path, "old");
3978 if (oldvd->vdev_devid != NULL) {
3979 spa_strfree(oldvd->vdev_devid);
3980 oldvd->vdev_devid = NULL;
3981 }
3982 }
3983
3984 /* mark the device being resilvered */
3985 newvd->vdev_resilvering = B_TRUE;
3986
3987 /*
3988 * If the parent is not a mirror, or if we're replacing, insert the new
3989 * mirror/replacing/spare vdev above oldvd.
3990 */
3991 if (pvd->vdev_ops != pvops)
3992 pvd = vdev_add_parent(oldvd, pvops);
3993
3994 ASSERT(pvd->vdev_top->vdev_parent == rvd);
3995 ASSERT(pvd->vdev_ops == pvops);
3996 ASSERT(oldvd->vdev_parent == pvd);
3997
3998 /*
3999 * Extract the new device from its root and add it to pvd.
4000 */
4001 vdev_remove_child(newrootvd, newvd);
4002 newvd->vdev_id = pvd->vdev_children;
4003 newvd->vdev_crtxg = oldvd->vdev_crtxg;
4004 vdev_add_child(pvd, newvd);
4005
4006 tvd = newvd->vdev_top;
4007 ASSERT(pvd->vdev_top == tvd);
4008 ASSERT(tvd->vdev_parent == rvd);
4009
4010 vdev_config_dirty(tvd);
4011
4012 /*
4013 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4014 * for any dmu_sync-ed blocks. It will propagate upward when
4015 * spa_vdev_exit() calls vdev_dtl_reassess().
4016 */
4017 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4018
4019 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4020 dtl_max_txg - TXG_INITIAL);
4021
4022 if (newvd->vdev_isspare) {
4023 spa_spare_activate(newvd);
4024 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4025 }
4026
4027 oldvdpath = spa_strdup(oldvd->vdev_path);
4028 newvdpath = spa_strdup(newvd->vdev_path);
4029 newvd_isspare = newvd->vdev_isspare;
4030
4031 /*
4032 * Mark newvd's DTL dirty in this txg.
4033 */
4034 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4035
4036 /*
4037 * Restart the resilver
4038 */
4039 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4040
4041 /*
4042 * Commit the config
4043 */
4044 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4045
4046 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
4047 "%s vdev=%s %s vdev=%s",
4048 replacing && newvd_isspare ? "spare in" :
4049 replacing ? "replace" : "attach", newvdpath,
4050 replacing ? "for" : "to", oldvdpath);
4051
4052 spa_strfree(oldvdpath);
4053 spa_strfree(newvdpath);
4054
4055 if (spa->spa_bootfs)
4056 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4057
4058 return (0);
4059 }
4060
4061 /*
4062 * Detach a device from a mirror or replacing vdev.
4063 * If 'replace_done' is specified, only detach if the parent
4064  * is a replacing or a spare vdev.
4065 */
4066 int
4067 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4068 {
4069 uint64_t txg;
4070 int error;
4071 vdev_t *rvd = spa->spa_root_vdev;
4072 vdev_t *vd, *pvd, *cvd, *tvd;
4073 boolean_t unspare = B_FALSE;
4074 uint64_t unspare_guid;
4075 char *vdpath;
4076
4077 ASSERT(spa_writeable(spa));
4078
4079 txg = spa_vdev_enter(spa);
4080
4081 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4082
4083 if (vd == NULL)
4084 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4085
4086 if (!vd->vdev_ops->vdev_op_leaf)
4087 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4088
4089 pvd = vd->vdev_parent;
4090
4091 /*
4092 * If the parent/child relationship is not as expected, don't do it.
4093 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4094 * vdev that's replacing B with C. The user's intent in replacing
4095 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4096 * the replace by detaching C, the expected behavior is to end up
4097 * M(A,B). But suppose that right after deciding to detach C,
4098 * the replacement of B completes. We would have M(A,C), and then
4099 * ask to detach C, which would leave us with just A -- not what
4100 * the user wanted. To prevent this, we make sure that the
4101 * parent/child relationship hasn't changed -- in this example,
4102 * that C's parent is still the replacing vdev R.
4103 */
4104 if (pvd->vdev_guid != pguid && pguid != 0)
4105 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4106
4107 /*
4108  * Only 'replacing' or 'spare' parent vdevs allow a 'replace_done' detach.
4109 */
4110 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4111 pvd->vdev_ops != &vdev_spare_ops)
4112 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4113
4114 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4115 spa_version(spa) >= SPA_VERSION_SPARES);
4116
4117 /*
4118 * Only mirror, replacing, and spare vdevs support detach.
4119 */
4120 if (pvd->vdev_ops != &vdev_replacing_ops &&
4121 pvd->vdev_ops != &vdev_mirror_ops &&
4122 pvd->vdev_ops != &vdev_spare_ops)
4123 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4124
4125 /*
4126 * If this device has the only valid copy of some data,
4127 * we cannot safely detach it.
4128 */
4129 if (vdev_dtl_required(vd))
4130 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4131
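/*
 * The parent is now known to be a mirror, replacing, or spare vdev,
 * each of which should have at least two children here.
 */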
4132 ASSERT(pvd->vdev_children >= 2);
4133
4134 /*
4135 * If we are detaching the second disk from a replacing vdev, then
4136 * check to see if we changed the original vdev's path to have "/old"
4137 * at the end in spa_vdev_attach(). If so, undo that change now.
4138 */
4139 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4140 vd->vdev_path != NULL) {
4141 size_t len = strlen(vd->vdev_path);
4142
4143 for (int c = 0; c < pvd->vdev_children; c++) {
4144 cvd = pvd->vdev_child[c];
4145
4146 if (cvd == vd || cvd->vdev_path == NULL)
4147 continue;
4148
4149 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4150 strcmp(cvd->vdev_path + len, "/old") == 0) {
4151 spa_strfree(cvd->vdev_path);
4152 cvd->vdev_path = spa_strdup(vd->vdev_path);
4153 break;
4154 }
4155 }
4156 }
4157
4158 /*
4159 * If we are detaching the original disk from a spare, then it implies
4160 * that the spare should become a real disk, and be removed from the
4161 * active spare list for the pool.
4162 */
4163 if (pvd->vdev_ops == &vdev_spare_ops &&
4164 vd->vdev_id == 0 &&
4165 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4166 unspare = B_TRUE;
4167
4168 /*
4169 * Erase the disk labels so the disk can be used for other things.
4170 * This must be done after all other error cases are handled,
4171 * but before we disembowel vd (so we can still do I/O to it).
4172 * But if we can't do it, don't treat the error as fatal --
4173 * it may be that the unwritability of the disk is the reason
4174 * it's being detached!
4175 */
4176 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4177
4178 /*
4179 * Remove vd from its parent and compact the parent's children.
4180 */
4181 vdev_remove_child(pvd, vd);
4182 vdev_compact_children(pvd);
4183
4184 /*
4185 * Remember one of the remaining children so we can get tvd below.
4186 */
4187 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4188
4189 /*
4190 * If we need to remove the remaining child from the list of hot spares,
4191 * do it now, marking the vdev as no longer a spare in the process.
4192 * We must do this before vdev_remove_parent(), because that can
4193 * change the GUID if it creates a new toplevel GUID. For a similar
4194 * reason, we must remove the spare now, in the same txg as the detach;
4195 * otherwise someone could attach a new sibling, change the GUID, and
4196 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4197 */
4198 if (unspare) {
4199 ASSERT(cvd->vdev_isspare);
4200 spa_spare_remove(cvd);
4201 unspare_guid = cvd->vdev_guid;
4202 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4203 cvd->vdev_unspare = B_TRUE;
4204 }
4205
4206 /*
4207 * If the parent mirror/replacing vdev only has one child,
4208 * the parent is no longer needed. Remove it from the tree.
4209 */
4210 if (pvd->vdev_children == 1) {
4211 if (pvd->vdev_ops == &vdev_spare_ops)
4212 cvd->vdev_unspare = B_FALSE;
4213 vdev_remove_parent(cvd);
4214 cvd->vdev_resilvering = B_FALSE;
4215 }
4216
4218 /*
4219 * We don't set tvd until now because the parent we just removed
4220 * may have been the previous top-level vdev.
4221 */
4222 tvd = cvd->vdev_top;
4223 ASSERT(tvd->vdev_parent == rvd);
4224
4225 /*
4226 * Reevaluate the parent vdev state.
4227 */
4228 vdev_propagate_state(cvd);
4229
4230 /*
4231 * If the 'autoexpand' property is set on the pool then automatically
4232 * try to expand the size of the pool. For example if the device we
4233 * just detached was smaller than the others, it may be possible to
4234 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4235 * first so that we can obtain the updated sizes of the leaf vdevs.
4236 */
4237 if (spa->spa_autoexpand) {
4238 vdev_reopen(tvd);
4239 vdev_expand(tvd, txg);
4240 }
4241
4242 vdev_config_dirty(tvd);
4243
4244 /*
4245 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4246 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4247 * But first make sure we're not on any *other* txg's DTL list, to
4248 * prevent vd from being accessed after it's freed.
4249 */
4250 vdpath = spa_strdup(vd->vdev_path);
4251 for (int t = 0; t < TXG_SIZE; t++)
4252 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4253 vd->vdev_detached = B_TRUE;
4254 vdev_dirty(tvd, VDD_DTL, vd, txg);
4255
4256 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4257
4258 /* hang on to the spa before we release the lock */
4259 spa_open_ref(spa, FTAG);
4260
4261 error = spa_vdev_exit(spa, vd, txg, 0);
4262
4263 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
4264 "vdev=%s", vdpath);
4265 spa_strfree(vdpath);
4266
4267 /*
4268 * If this was the removal of the original device in a hot spare vdev,
4269 * then we want to go through and remove the device from the hot spare
4270 * list of every other pool.
4271 */
4272 if (unspare) {
4273 spa_t *altspa = NULL;
4274
4275 mutex_enter(&spa_namespace_lock);
4276 while ((altspa = spa_next(altspa)) != NULL) {
4277 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4278 altspa == spa)
4279 continue;
4280
4281 spa_open_ref(altspa, FTAG);
4282 mutex_exit(&spa_namespace_lock);
4283 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4284 mutex_enter(&spa_namespace_lock);
4285 spa_close(altspa, FTAG);
4286 }
4287 mutex_exit(&spa_namespace_lock);
4288
4289 /* search the rest of the vdevs for spares to remove */
4290 spa_vdev_resilver_done(spa);
4291 }
4292
4293 /* all done with the spa; OK to release */
4294 mutex_enter(&spa_namespace_lock);
4295 spa_close(spa, FTAG);
4296 mutex_exit(&spa_namespace_lock);
4297
4298 return (error);
4299 }
4300
4301 /*
4302 * Split a set of devices from their mirrors, and create a new pool from them.
4303 */
4304 int
4305 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4306 nvlist_t *props, boolean_t exp)
4307 {
4308 int error = 0;
4309 uint64_t txg, *glist;
4310 spa_t *newspa;
4311 uint_t c, children, lastlog;
4312 nvlist_t **child, *nvl, *tmp;
4313 dmu_tx_t *tx;
4314 char *altroot = NULL;
4315 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4316 boolean_t activate_slog;
4317
4318 ASSERT(spa_writeable(spa));
4319
4320 txg = spa_vdev_enter(spa);
4321
4322 /* clear the log and flush everything up to now */
4323 activate_slog = spa_passivate_log(spa);
4324 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4325 error = spa_offline_log(spa);
4326 txg = spa_vdev_config_enter(spa);
4327
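/*
 * The log was passivated only so it could be emptied; if separate
 * log devices were in use before, put them back into service now.
 */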
4328 if (activate_slog)
4329 spa_activate_log(spa);
4330
4331 if (error != 0)
4332 return (spa_vdev_exit(spa, NULL, txg, error));
4333
4334 /* check new spa name before going any further */
4335 if (spa_lookup(newname) != NULL)
4336 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4337
4338 /*
4339 * scan through all the children to ensure they're all mirrors
4340 */
4341 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4342 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4343 &children) != 0)
4344 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4345
4346 /* first, check to ensure we've got the right child count */
4347 rvd = spa->spa_root_vdev;
4348 lastlog = 0;
4349 for (c = 0; c < rvd->vdev_children; c++) {
4350 vdev_t *vd = rvd->vdev_child[c];
4351
4352 /* don't count the holes & logs as children */
4353 if (vd->vdev_islog || vd->vdev_ishole) {
4354 if (lastlog == 0)
4355 lastlog = c;
4356 continue;
4357 }
4358
4359 lastlog = 0;
4360 }
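/*
 * 'lastlog', if nonzero, indexes the first vdev in the trailing run
 * of log/hole vdevs; the split request must name exactly one child
 * per top-level vdev that precedes that run.
 */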
4361 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4362 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4363
4364 /* next, ensure no spare or cache devices are part of the split */
4365 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4366 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4367 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4368
4369 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4370 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4371
4372 /* then, loop over each vdev and validate it */
4373 for (c = 0; c < children; c++) {
4374 uint64_t is_hole = 0;
4375
4376 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4377 &is_hole);
4378
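/*
 * A hole entry in the request is only acceptable if the
 * corresponding top-level vdev in this pool is itself a hole or log.
 */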
4379 if (is_hole != 0) {
4380 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4381 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4382 continue;
4383 } else {
4384 error = EINVAL;
4385 break;
4386 }
4387 }
4388
4389 /* which disk is going to be split? */
4390 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4391 &glist[c]) != 0) {
4392 error = EINVAL;
4393 break;
4394 }
4395
4396 /* look it up in the spa */
4397 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4398 if (vml[c] == NULL) {
4399 error = ENODEV;
4400 break;
4401 }
4402
4403 /* make sure there's nothing stopping the split */
4404 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4405 vml[c]->vdev_islog ||
4406 vml[c]->vdev_ishole ||
4407 vml[c]->vdev_isspare ||
4408 vml[c]->vdev_isl2cache ||
4409 !vdev_writeable(vml[c]) ||
4410 vml[c]->vdev_children != 0 ||
4411 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4412 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4413 error = EINVAL;
4414 break;
4415 }
4416
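/* The disk being split off must not hold the only valid copy of any data. */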
4417 if (vdev_dtl_required(vml[c])) {
4418 error = EBUSY;
4419 break;
4420 }
4421
4422 /* we need certain info from the top level */
4423 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4424 vml[c]->vdev_top->vdev_ms_array) == 0);
4425 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4426 vml[c]->vdev_top->vdev_ms_shift) == 0);
4427 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4428 vml[c]->vdev_top->vdev_asize) == 0);
4429 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4430 vml[c]->vdev_top->vdev_ashift) == 0);
4431 }
4432
4433 if (error != 0) {
4434 kmem_free(vml, children * sizeof (vdev_t *));
4435 kmem_free(glist, children * sizeof (uint64_t));
4436 return (spa_vdev_exit(spa, NULL, txg, error));
4437 }
4438
4439 /* stop writers from using the disks */
4440 for (c = 0; c < children; c++) {
4441 if (vml[c] != NULL)
4442 vml[c]->vdev_offline = B_TRUE;
4443 }
4444 vdev_reopen(spa->spa_root_vdev);
4445
4446 /*
4447 * Temporarily record the splitting vdevs in the spa config. This
4448 * will disappear once the config is regenerated.
4449 */
4450 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4451 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4452 glist, children) == 0);
4453 kmem_free(glist, children * sizeof (uint64_t));
4454
4455 mutex_enter(&spa->spa_props_lock);
4456 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4457 nvl) == 0);
4458 mutex_exit(&spa->spa_props_lock);
4459 spa->spa_config_splitting = nvl;
4460 vdev_config_dirty(spa->spa_root_vdev);
4461
4462 /* configure and create the new pool */
4463 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4464 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4465 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4466 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4467 spa_version(spa)) == 0);
4468 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4469 spa->spa_config_txg) == 0);
4470 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4471 spa_generate_guid(NULL)) == 0);
4472 (void) nvlist_lookup_string(props,
4473 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4474
4475 /* add the new pool to the namespace */
4476 newspa = spa_add(newname, config, altroot);
4477 newspa->spa_config_txg = spa->spa_config_txg;
4478 spa_set_log_state(newspa, SPA_LOG_CLEAR);
4479
4480 /* release the spa config lock, retaining the namespace lock */
4481 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4482
4483 if (zio_injection_enabled)
4484 zio_handle_panic_injection(spa, FTAG, 1);
4485
4486 spa_activate(newspa, spa_mode_global);
4487 spa_async_suspend(newspa);
4488
4489 /* create the new pool from the disks of the original pool */
4490 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4491 if (error)
4492 goto out;
4493
4494 /* if that worked, generate a real config for the new pool */
4495 if (newspa->spa_root_vdev != NULL) {
4496 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4497 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4498 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4499 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4500 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4501 B_TRUE));
4502 }
4503
4504 /* set the props */
4505 if (props != NULL) {
4506 spa_configfile_set(newspa, props, B_FALSE);
4507 error = spa_prop_set(newspa, props);
4508 if (error)
4509 goto out;
4510 }
4511
4512 /* flush everything */
4513 txg = spa_vdev_config_enter(newspa);
4514 vdev_config_dirty(newspa->spa_root_vdev);
4515 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4516
4517 if (zio_injection_enabled)
4518 zio_handle_panic_injection(spa, FTAG, 2);
4519
4520 spa_async_resume(newspa);
4521
4522 /* finally, update the original pool's config */
4523 txg = spa_vdev_config_enter(spa);
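/*
 * The history records logged below need an assigned transaction;
 * if the assignment fails, the split still proceeds but the log
 * records and the commit are skipped.
 */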
4524 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4525 error = dmu_tx_assign(tx, TXG_WAIT);
4526 if (error != 0)
4527 dmu_tx_abort(tx);
4528 for (c = 0; c < children; c++) {
4529 if (vml[c] != NULL) {
4530 vdev_split(vml[c]);
4531 if (error == 0)
4532 spa_history_log_internal(LOG_POOL_VDEV_DETACH,
4533 spa, tx, "vdev=%s",
4534 vml[c]->vdev_path);
4535 vdev_free(vml[c]);
4536 }
4537 }
4538 vdev_config_dirty(spa->spa_root_vdev);
4539 spa->spa_config_splitting = NULL;
4540 nvlist_free(nvl);
4541 if (error == 0)
4542 dmu_tx_commit(tx);
4543 (void) spa_vdev_exit(spa, NULL, txg, 0);
4544
4545 if (zio_injection_enabled)
4546 zio_handle_panic_injection(spa, FTAG, 3);
4547
4548 /* split is complete; log a history record */
4549 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
4550 "split new pool %s from pool %s", newname, spa_name(spa));
4551
4552 kmem_free(vml, children * sizeof (vdev_t *));
4553
4554 /* if we're not going to mount the filesystems in userland, export */
4555 if (exp)
4556 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
4557 B_FALSE, B_FALSE);
4558
4559 return (error);
4560
4561 out:
4562 spa_unload(newspa);
4563 spa_deactivate(newspa);
4564 spa_remove(newspa);
4565
4566 txg = spa_vdev_config_enter(spa);
4567
4568 /* re-online all offlined disks */
4569 for (c = 0; c < children; c++) {
4570 if (vml[c] != NULL)
4571 vml[c]->vdev_offline = B_FALSE;
4572 }
4573 vdev_reopen(spa->spa_root_vdev);
4574
4575 nvlist_free(spa->spa_config_splitting);
4576 spa->spa_config_splitting = NULL;
4577 (void) spa_vdev_exit(spa, NULL, txg, error);
4578
4579 kmem_free(vml, children * sizeof (vdev_t *));
4580 return (error);
4581 }
4582
4583 static nvlist_t *
4584 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
4585 {
4586 for (int i = 0; i < count; i++) {
4587 uint64_t guid;
4588
4589 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4590 &guid) == 0);
4591
4592 if (guid == target_guid)
4593 return (nvpp[i]);
4594 }
4595
4596 return (NULL);
4597 }
4598
4599 static void
4600 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4601 nvlist_t *dev_to_remove)
4602 {
4603 nvlist_t **newdev = NULL;
4604
4605 if (count > 1)
4606 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4607
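/*
 * Copy every entry except the one being removed. If this is the
 * last device, newdev remains NULL and an empty array is written
 * below.
 */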
4608 for (int i = 0, j = 0; i < count; i++) {
4609 if (dev[i] == dev_to_remove)
4610 continue;
4611 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4612 }
4613
4614 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4615 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4616
4617 for (int i = 0; i < count - 1; i++)
4618 nvlist_free(newdev[i]);
4619
4620 if (count > 1)
4621 kmem_free(newdev, (count - 1) * sizeof (void *));
4622 }
4623
4624 /*
4625 * Evacuate the device.
4626 */
4627 static int
4628 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
4629 {
4630 uint64_t txg;
4631 int error = 0;
4632
4633 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4634 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4635 ASSERT(vd == vd->vdev_top);
4636
4637 /*
4638 * Evacuate the device. We don't hold the config lock as writer
4639  * since we need to do I/O, but we do keep the
4640  * spa_namespace_lock held. Once this completes, the device
4641 * should no longer have any blocks allocated on it.
4642 */
4643 if (vd->vdev_islog) {
4644 if (vd->vdev_stat.vs_alloc != 0)
4645 error = spa_offline_log(spa);
4646 } else {
4647 error = ENOTSUP;
4648 }
4649
4650 if (error)
4651 return (error);
4652
4653 /*
4654 * The evacuation succeeded. Remove any remaining MOS metadata
4655 * associated with this vdev, and wait for these changes to sync.
4656 */
4657 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
4658 txg = spa_vdev_config_enter(spa);
4659 vd->vdev_removing = B_TRUE;
4660 vdev_dirty(vd, 0, NULL, txg);
4661 vdev_config_dirty(vd);
4662 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4663
4664 return (0);
4665 }
4666
4667 /*
4668 * Complete the removal by cleaning up the namespace.
4669 */
4670 static void
4671 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
4672 {
4673 vdev_t *rvd = spa->spa_root_vdev;
4674 uint64_t id = vd->vdev_id;
4675 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
4676
4677 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4678 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4679 ASSERT(vd == vd->vdev_top);
4680
4681 /*
4682  * Only remove devices that are empty.
4683 */
4684 if (vd->vdev_stat.vs_alloc != 0)
4685 return;
4686
4687 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4688
4689 if (list_link_active(&vd->vdev_state_dirty_node))
4690 vdev_state_clean(vd);
4691 if (list_link_active(&vd->vdev_config_dirty_node))
4692 vdev_config_clean(vd);
4693
4694 vdev_free(vd);
4695
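/*
 * If this was the last top-level vdev, simply shrink the child array;
 * otherwise leave a hole vdev in the freed slot so that the ids of
 * the remaining top-level vdevs (and the DVAs that reference them)
 * stay unchanged.
 */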
4696 if (last_vdev) {
4697 vdev_compact_children(rvd);
4698 } else {
4699 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
4700 vdev_add_child(rvd, vd);
4701 }
4702 vdev_config_dirty(rvd);
4703
4704 /*
4705 * Reassess the health of our root vdev.
4706 */
4707 vdev_reopen(rvd);
4708 }
4709
4710 /*
4711 * Remove a device from the pool -
4712 *
4713 * Removing a device from the vdev namespace requires several steps
4714 * and can take a significant amount of time. As a result we use
4715 * the spa_vdev_config_[enter/exit] functions which allow us to
4716 * grab and release the spa_config_lock while still holding the namespace
4717 * lock. During each step the configuration is synced out.
4718 */
4719
4720 /*
4721 * Remove a device from the pool. Currently, this supports removing only hot
4722 * spares, slogs, and level 2 ARC devices.
4723 */
4724 int
4725 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
4726 {
4727 vdev_t *vd;
4728 metaslab_group_t *mg;
4729 nvlist_t **spares, **l2cache, *nv;
4730 uint64_t txg = 0;
4731 uint_t nspares, nl2cache;
4732 int error = 0;
4733 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
4734
4735 ASSERT(spa_writeable(spa));
4736
4737 if (!locked)
4738 txg = spa_vdev_enter(spa);
4739
4740 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4741
4742 if (spa->spa_spares.sav_vdevs != NULL &&
4743 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4744 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
4745 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
4746 /*
4747 * Only remove the hot spare if it's not currently in use
4748 * in this pool.
4749 */
4750 if (vd == NULL || unspare) {
4751 spa_vdev_remove_aux(spa->spa_spares.sav_config,
4752 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
4753 spa_load_spares(spa);
4754 spa->spa_spares.sav_sync = B_TRUE;
4755 } else {
4756 error = EBUSY;
4757 }
4758 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
4759 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4760 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
4761 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
4762 /*
4763 * Cache devices can always be removed.
4764 */
4765 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
4766 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
4767 spa_load_l2cache(spa);
4768 spa->spa_l2cache.sav_sync = B_TRUE;
4769 } else if (vd != NULL && vd->vdev_islog) {
4770 ASSERT(!locked);
4771 ASSERT(vd == vd->vdev_top);
4772
4773 /*
4774 * XXX - Once we have bp-rewrite this should
4775 * become the common case.
4776 */
4777
4778 mg = vd->vdev_mg;
4779
4780 /*
4781 * Stop allocating from this vdev.
4782 */
4783 metaslab_group_passivate(mg);
4784
4785 /*
4786 * Wait for the youngest allocations and frees to sync,
4787 * and then wait for the deferral of those frees to finish.
4788 */
4789 spa_vdev_config_exit(spa, NULL,
4790 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
4791
4792 /*
4793 * Attempt to evacuate the vdev.
4794 */
4795 error = spa_vdev_remove_evacuate(spa, vd);
4796
4797 txg = spa_vdev_config_enter(spa);
4798
4799 /*
4800 * If we couldn't evacuate the vdev, unwind.
4801 */
4802 if (error) {
4803 metaslab_group_activate(mg);
4804 return (spa_vdev_exit(spa, NULL, txg, error));
4805 }
4806
4807 /*
4808 * Clean up the vdev namespace.
4809 */
4810 spa_vdev_remove_from_namespace(spa, vd);
4811
4812 } else if (vd != NULL) {
4813 /*
4814 * Normal vdevs cannot be removed (yet).
4815 */
4816 error = ENOTSUP;
4817 } else {
4818 /*
4819 * There is no vdev of any kind with the specified guid.
4820 */
4821 error = ENOENT;
4822 }
4823
4824 if (!locked)
4825 return (spa_vdev_exit(spa, NULL, txg, error));
4826
4827 return (error);
4828 }
4829
4830 /*
4831 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4832  * currently spared, so we can detach it.
4833 */
4834 static vdev_t *
4835 spa_vdev_resilver_done_hunt(vdev_t *vd)
4836 {
4837 vdev_t *newvd, *oldvd;
4838
4839 for (int c = 0; c < vd->vdev_children; c++) {
4840 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4841 if (oldvd != NULL)
4842 return (oldvd);
4843 }
4844
4845 /*
4846 * Check for a completed replacement. We always consider the first
4847 * vdev in the list to be the oldest vdev, and the last one to be
4848 * the newest (see spa_vdev_attach() for how that works). In
4849 * the case where the newest vdev is faulted, we will not automatically
4850 * remove it after a resilver completes. This is OK as it will require
4851 * user intervention to determine which disk the admin wishes to keep.
4852 */
4853 if (vd->vdev_ops == &vdev_replacing_ops) {
4854 ASSERT(vd->vdev_children > 1);
4855
4856 newvd = vd->vdev_child[vd->vdev_children - 1];
4857 oldvd = vd->vdev_child[0];
4858
4859 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4860 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4861 !vdev_dtl_required(oldvd))
4862 return (oldvd);
4863 }
4864
4865 /*
4866 * Check for a completed resilver with the 'unspare' flag set.
4867 */
4868 if (vd->vdev_ops == &vdev_spare_ops) {
4869 vdev_t *first = vd->vdev_child[0];
4870 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4871
4872 if (last->vdev_unspare) {
4873 oldvd = first;
4874 newvd = last;
4875 } else if (first->vdev_unspare) {
4876 oldvd = last;
4877 newvd = first;
4878 } else {
4879 oldvd = NULL;
4880 }
4881
4882 if (oldvd != NULL &&
4883 vdev_dtl_empty(newvd, DTL_MISSING) &&
4884 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4885 !vdev_dtl_required(oldvd))
4886 return (oldvd);
4887
4888 /*
4889 * If there are more than two spares attached to a disk,
4890 * and those spares are not required, then we want to
4891 * attempt to free them up now so that they can be used
4892 * by other pools. Once we're back down to a single
4893 * disk+spare, we stop removing them.
4894 */
4895 if (vd->vdev_children > 2) {
4896 newvd = vd->vdev_child[1];
4897
4898 if (newvd->vdev_isspare && last->vdev_isspare &&
4899 vdev_dtl_empty(last, DTL_MISSING) &&
4900 vdev_dtl_empty(last, DTL_OUTAGE) &&
4901 !vdev_dtl_required(newvd))
4902 return (newvd);
4903 }
4904 }
4905
4906 return (NULL);
4907 }
4908
4909 static void
4910 spa_vdev_resilver_done(spa_t *spa)
4911 {
4912 vdev_t *vd, *pvd, *ppvd;
4913 uint64_t guid, sguid, pguid, ppguid;
4914
4915 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4916
4917 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4918 pvd = vd->vdev_parent;
4919 ppvd = pvd->vdev_parent;
4920 guid = vd->vdev_guid;
4921 pguid = pvd->vdev_guid;
4922 ppguid = ppvd->vdev_guid;
4923 sguid = 0;
4924 /*
4925 * If we have just finished replacing a hot spared device, then
4926 * we need to detach the parent's first child (the original hot
4927 * spare) as well.
4928 */
4929 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
4930 ppvd->vdev_children == 2) {
4931 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4932 sguid = ppvd->vdev_child[1]->vdev_guid;
4933 }
4934 spa_config_exit(spa, SCL_ALL, FTAG);
4935 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4936 return;
4937 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4938 return;
4939 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4940 }
4941
4942 spa_config_exit(spa, SCL_ALL, FTAG);
4943 }
4944
4945 /*
4946 * Update the stored path or FRU for this vdev.
4947 */
4948 int
4949 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
4950 boolean_t ispath)
4951 {
4952 vdev_t *vd;
4953 boolean_t sync = B_FALSE;
4954
4955 ASSERT(spa_writeable(spa));
4956
4957 spa_vdev_state_enter(spa, SCL_ALL);
4958
4959 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4960 return (spa_vdev_state_exit(spa, NULL, ENOENT));
4961
4962 if (!vd->vdev_ops->vdev_op_leaf)
4963 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4964
4965 if (ispath) {
4966 if (strcmp(value, vd->vdev_path) != 0) {
4967 spa_strfree(vd->vdev_path);
4968 vd->vdev_path = spa_strdup(value);
4969 sync = B_TRUE;
4970 }
4971 } else {
4972 if (vd->vdev_fru == NULL) {
4973 vd->vdev_fru = spa_strdup(value);
4974 sync = B_TRUE;
4975 } else if (strcmp(value, vd->vdev_fru) != 0) {
4976 spa_strfree(vd->vdev_fru);
4977 vd->vdev_fru = spa_strdup(value);
4978 sync = B_TRUE;
4979 }
4980 }
4981
4982 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4983 }
4984
4985 int
4986 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
4987 {
4988 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
4989 }
4990
4991 int
4992 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
4993 {
4994 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
4995 }
4996
4997 /*
4998 * ==========================================================================
4999 * SPA Scanning
5000 * ==========================================================================
5001 */
5002
5003 int
5004 spa_scan_stop(spa_t *spa)
5005 {
5006 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5007 if (dsl_scan_resilvering(spa->spa_dsl_pool))
5008 return (EBUSY);
5009 return (dsl_scan_cancel(spa->spa_dsl_pool));
5010 }
5011
5012 int
5013 spa_scan(spa_t *spa, pool_scan_func_t func)
5014 {
5015 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5016
5017 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5018 return (ENOTSUP);
5019
5020 /*
5021 * If a resilver was requested, but there is no DTL on a
5022 * writeable leaf device, we have nothing to do.
5023 */
5024 if (func == POOL_SCAN_RESILVER &&
5025 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5026 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5027 return (0);
5028 }
5029
5030 return (dsl_scan(spa->spa_dsl_pool, func));
5031 }
5032
5033 /*
5034 * ==========================================================================
5035 * SPA async task processing
5036 * ==========================================================================
5037 */
5038
5039 static void
5040 spa_async_remove(spa_t *spa, vdev_t *vd)
5041 {
5042 if (vd->vdev_remove_wanted) {
5043 vd->vdev_remove_wanted = B_FALSE;
5044 vd->vdev_delayed_close = B_FALSE;
5045 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5046
5047 /*
5048 * We want to clear the stats, but we don't want to do a full
5049 * vdev_clear() as that will cause us to throw away
5050 * degraded/faulted state as well as attempt to reopen the
5051 * device, all of which is a waste.
5052 */
5053 vd->vdev_stat.vs_read_errors = 0;
5054 vd->vdev_stat.vs_write_errors = 0;
5055 vd->vdev_stat.vs_checksum_errors = 0;
5056
5057 vdev_state_dirty(vd->vdev_top);
5058 }
5059
5060 for (int c = 0; c < vd->vdev_children; c++)
5061 spa_async_remove(spa, vd->vdev_child[c]);
5062 }
5063
5064 static void
5065 spa_async_probe(spa_t *spa, vdev_t *vd)
5066 {
5067 if (vd->vdev_probe_wanted) {
5068 vd->vdev_probe_wanted = B_FALSE;
5069 vdev_reopen(vd); /* vdev_open() does the actual probe */
5070 }
5071
5072 for (int c = 0; c < vd->vdev_children; c++)
5073 spa_async_probe(spa, vd->vdev_child[c]);
5074 }
5075
5076 static void
5077 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5078 {
5079 sysevent_id_t eid;
5080 nvlist_t *attr;
5081 char *physpath;
5082
5083 if (!spa->spa_autoexpand)
5084 return;
5085
5086 for (int c = 0; c < vd->vdev_children; c++) {
5087 vdev_t *cvd = vd->vdev_child[c];
5088 spa_async_autoexpand(spa, cvd);
5089 }
5090
5091 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5092 return;
5093
5094 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5095 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5096
5097 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5098 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5099
5100 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5101 ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5102
5103 nvlist_free(attr);
5104 kmem_free(physpath, MAXPATHLEN);
5105 }
5106
5107 static void
5108 spa_async_thread(spa_t *spa)
5109 {
5110 int tasks;
5111
5112 ASSERT(spa->spa_sync_on);
5113
5114 mutex_enter(&spa->spa_async_lock);
5115 tasks = spa->spa_async_tasks;
5116 spa->spa_async_tasks = 0;
5117 mutex_exit(&spa->spa_async_lock);
5118
5119 /*
5120 * See if the config needs to be updated.
5121 */
5122 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5123 uint64_t old_space, new_space;
5124
5125 mutex_enter(&spa_namespace_lock);
5126 old_space = metaslab_class_get_space(spa_normal_class(spa));
5127 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5128 new_space = metaslab_class_get_space(spa_normal_class(spa));
5129 mutex_exit(&spa_namespace_lock);
5130
5131 /*
5132 * If the pool grew as a result of the config update,
5133 * then log an internal history event.
5134 */
5135 if (new_space != old_space) {
5136 spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
5137 spa, NULL,
5138 "pool '%s' size: %llu(+%llu)",
5139 spa_name(spa), new_space, new_space - old_space);
5140 }
5141 }
5142
5143 /*
5144 * See if any devices need to be marked REMOVED.
5145 */
5146 if (tasks & SPA_ASYNC_REMOVE) {
5147 spa_vdev_state_enter(spa, SCL_NONE);
5148 spa_async_remove(spa, spa->spa_root_vdev);
5149 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5150 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5151 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5152 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5153 (void) spa_vdev_state_exit(spa, NULL, 0);
5154 }
5155
5156 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5157 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5158 spa_async_autoexpand(spa, spa->spa_root_vdev);
5159 spa_config_exit(spa, SCL_CONFIG, FTAG);
5160 }
5161
5162 /*
5163 * See if any devices need to be probed.
5164 */
5165 if (tasks & SPA_ASYNC_PROBE) {
5166 spa_vdev_state_enter(spa, SCL_NONE);
5167 spa_async_probe(spa, spa->spa_root_vdev);
5168 (void) spa_vdev_state_exit(spa, NULL, 0);
5169 }
5170
5171 /*
5172 * If any devices are done replacing, detach them.
5173 */
5174 if (tasks & SPA_ASYNC_RESILVER_DONE)
5175 spa_vdev_resilver_done(spa);
5176
5177 /*
5178 * Kick off a resilver.
5179 */
5180 if (tasks & SPA_ASYNC_RESILVER)
5181 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5182
5183 /*
5184 * Let the world know that we're done.
5185 */
5186 mutex_enter(&spa->spa_async_lock);
5187 spa->spa_async_thread = NULL;
5188 cv_broadcast(&spa->spa_async_cv);
5189 mutex_exit(&spa->spa_async_lock);
5190 thread_exit();
5191 }
5192
5193 void
5194 spa_async_suspend(spa_t *spa)
5195 {
5196 mutex_enter(&spa->spa_async_lock);
5197 spa->spa_async_suspended++;
5198 while (spa->spa_async_thread != NULL)
5199 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5200 mutex_exit(&spa->spa_async_lock);
5201 }
5202
5203 void
5204 spa_async_resume(spa_t *spa)
5205 {
5206 mutex_enter(&spa->spa_async_lock);
5207 ASSERT(spa->spa_async_suspended != 0);
5208 spa->spa_async_suspended--;
5209 mutex_exit(&spa->spa_async_lock);
5210 }
5211
5212 static void
5213 spa_async_dispatch(spa_t *spa)
5214 {
5215 mutex_enter(&spa->spa_async_lock);
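/*
 * Start a worker thread only if there is pending work, async
 * processing is not suspended, no worker is already running, and
 * the root filesystem is writeable.
 */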
5216 if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5217 spa->spa_async_thread == NULL &&
5218 rootdir != NULL && !vn_is_readonly(rootdir))
5219 spa->spa_async_thread = thread_create(NULL, 0,
5220 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5221 mutex_exit(&spa->spa_async_lock);
5222 }
5223
5224 void
5225 spa_async_request(spa_t *spa, int task)
5226 {
5227 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5228 mutex_enter(&spa->spa_async_lock);
5229 spa->spa_async_tasks |= task;
5230 mutex_exit(&spa->spa_async_lock);
5231 }
5232
5233 /*
5234 * ==========================================================================
5235 * SPA syncing routines
5236 * ==========================================================================
5237 */
5238
5239 static int
5240 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5241 {
5242 bpobj_t *bpo = arg;
5243 bpobj_enqueue(bpo, bp, tx);
5244 return (0);
5245 }
5246
5247 static int
5248 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5249 {
5250 zio_t *zio = arg;
5251
5252 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5253 zio->io_flags));
5254 return (0);
5255 }
5256
5257 static void
5258 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5259 {
5260 char *packed = NULL;
5261 size_t bufsize;
5262 size_t nvsize = 0;
5263 dmu_buf_t *db;
5264
5265 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5266
5267 /*
5268 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5269 * information. This avoids the dbuf_will_dirty() path and
5270 * saves us a pre-read to get data we don't actually care about.
5271 */
5272 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5273 packed = kmem_alloc(bufsize, KM_SLEEP);
5274
5275 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5276 KM_SLEEP) == 0);
5277 bzero(packed + nvsize, bufsize - nvsize);
5278
5279 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5280
5281 kmem_free(packed, bufsize);
5282
5283 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5284 dmu_buf_will_dirty(db, tx);
5285 *(uint64_t *)db->db_data = nvsize;
5286 dmu_buf_rele(db, FTAG);
5287 }
5288
5289 static void
5290 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5291 const char *config, const char *entry)
5292 {
5293 nvlist_t *nvroot;
5294 nvlist_t **list;
5295 int i;
5296
5297 if (!sav->sav_sync)
5298 return;
5299
5300 /*
5301 * Update the MOS nvlist describing the list of available devices.
5302 * spa_validate_aux() will have already made sure this nvlist is
5303 * valid and the vdevs are labeled appropriately.
5304 */
5305 if (sav->sav_object == 0) {
5306 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5307 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5308 sizeof (uint64_t), tx);
5309 VERIFY(zap_update(spa->spa_meta_objset,
5310 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5311 &sav->sav_object, tx) == 0);
5312 }
5313
5314 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5315 if (sav->sav_count == 0) {
5316 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5317 } else {
5318 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5319 for (i = 0; i < sav->sav_count; i++)
5320 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5321 B_FALSE, VDEV_CONFIG_L2CACHE);
5322 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5323 sav->sav_count) == 0);
5324 for (i = 0; i < sav->sav_count; i++)
5325 nvlist_free(list[i]);
5326 kmem_free(list, sav->sav_count * sizeof (void *));
5327 }
5328
5329 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5330 nvlist_free(nvroot);
5331
5332 sav->sav_sync = B_FALSE;
5333 }
5334
5335 static void
5336 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5337 {
5338 nvlist_t *config;
5339
5340 if (list_is_empty(&spa->spa_config_dirty_list))
5341 return;
5342
5343 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5344
5345 config = spa_config_generate(spa, spa->spa_root_vdev,
5346 dmu_tx_get_txg(tx), B_FALSE);
5347
5348 spa_config_exit(spa, SCL_STATE, FTAG);
5349
5350 if (spa->spa_config_syncing)
5351 nvlist_free(spa->spa_config_syncing);
5352 spa->spa_config_syncing = config;
5353
5354 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5355 }
5356
5357 /*
5358 * Set zpool properties.
5359 */
5360 static void
5361 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5362 {
5363 spa_t *spa = arg1;
5364 objset_t *mos = spa->spa_meta_objset;
5365 nvlist_t *nvp = arg2;
5366 nvpair_t *elem;
5367 uint64_t intval;
5368 char *strval;
5369 zpool_prop_t prop;
5370 const char *propname;
5371 zprop_type_t proptype;
5372
5373 mutex_enter(&spa->spa_props_lock);
5374
5375 elem = NULL;
5376 while ((elem = nvlist_next_nvpair(nvp, elem))) {
5377 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5378 case ZPOOL_PROP_VERSION:
5379 /*
5380 * Only set version for non-zpool-creation cases
5381 * (set/import). spa_create() needs special care
5382 * for version setting.
5383 */
5384 if (tx->tx_txg != TXG_INITIAL) {
5385 VERIFY(nvpair_value_uint64(elem,
5386 &intval) == 0);
5387 ASSERT(intval <= SPA_VERSION);
5388 ASSERT(intval >= spa_version(spa));
5389 spa->spa_uberblock.ub_version = intval;
5390 vdev_config_dirty(spa->spa_root_vdev);
5391 }
5392 break;
5393
5394 case ZPOOL_PROP_ALTROOT:
5395 /*
5396 * 'altroot' is a non-persistent property. It should
5397 * have been set temporarily at creation or import time.
5398 */
5399 ASSERT(spa->spa_root != NULL);
5400 break;
5401
5402 case ZPOOL_PROP_READONLY:
5403 case ZPOOL_PROP_CACHEFILE:
5404 /*
5405  * 'readonly' and 'cachefile' are also non-persistent
5406 * properties.
5407 */
5408 break;
5409 case ZPOOL_PROP_COMMENT:
5410 VERIFY(nvpair_value_string(elem, &strval) == 0);
5411 if (spa->spa_comment != NULL)
5412 spa_strfree(spa->spa_comment);
5413 spa->spa_comment = spa_strdup(strval);
5414 /*
5415 * We need to dirty the configuration on all the vdevs
5416 * so that their labels get updated. It's unnecessary
5417  * to do this for pool creation since the vdevs'
5418  * configuration has already been dirtied.
5419 */
5420 if (tx->tx_txg != TXG_INITIAL)
5421 vdev_config_dirty(spa->spa_root_vdev);
5422 break;
5423 default:
5424 /*
5425 * Set pool property values in the poolprops mos object.
5426 */
5427 if (spa->spa_pool_props_object == 0) {
5428 VERIFY((spa->spa_pool_props_object =
5429 zap_create(mos, DMU_OT_POOL_PROPS,
5430 DMU_OT_NONE, 0, tx)) > 0);
5431
5432 VERIFY(zap_update(mos,
5433 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5434 8, 1, &spa->spa_pool_props_object, tx)
5435 == 0);
5436 }
5437
5438 /* normalize the property name */
5439 propname = zpool_prop_to_name(prop);
5440 proptype = zpool_prop_get_type(prop);
5441
5442 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5443 ASSERT(proptype == PROP_TYPE_STRING);
5444 VERIFY(nvpair_value_string(elem, &strval) == 0);
5445 VERIFY(zap_update(mos,
5446 spa->spa_pool_props_object, propname,
5447 1, strlen(strval) + 1, strval, tx) == 0);
5448
5449 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5450 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5451
5452 if (proptype == PROP_TYPE_INDEX) {
5453 const char *unused;
5454 VERIFY(zpool_prop_index_to_string(
5455 prop, intval, &unused) == 0);
5456 }
5457 VERIFY(zap_update(mos,
5458 spa->spa_pool_props_object, propname,
5459 8, 1, &intval, tx) == 0);
5460 } else {
5461 ASSERT(0); /* not allowed */
5462 }
5463
5464 switch (prop) {
5465 case ZPOOL_PROP_DELEGATION:
5466 spa->spa_delegation = intval;
5467 break;
5468 case ZPOOL_PROP_BOOTFS:
5469 spa->spa_bootfs = intval;
5470 break;
5471 case ZPOOL_PROP_FAILUREMODE:
5472 spa->spa_failmode = intval;
5473 break;
5474 case ZPOOL_PROP_AUTOEXPAND:
5475 spa->spa_autoexpand = intval;
5476 if (tx->tx_txg != TXG_INITIAL)
5477 spa_async_request(spa,
5478 SPA_ASYNC_AUTOEXPAND);
5479 break;
5480 case ZPOOL_PROP_DEDUPDITTO:
5481 spa->spa_dedup_ditto = intval;
5482 break;
5483 default:
5484 break;
5485 }
5486 }
5487
5488 /* log internal history if this is not a zpool create */
5489 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5490 tx->tx_txg != TXG_INITIAL) {
5491 spa_history_log_internal(LOG_POOL_PROPSET,
5492 spa, tx, "%s %lld %s",
5493 nvpair_name(elem), intval, spa_name(spa));
5494 }
5495 }
5496
5497 mutex_exit(&spa->spa_props_lock);
5498 }
5499
5500 /*
5501 * Perform one-time upgrade on-disk changes. spa_version() does not
5502 * reflect the new version this txg, so there must be no changes this
5503 * txg to anything that the upgrade code depends on after it executes.
5504 * Therefore this must be called after dsl_pool_sync() does the sync
5505 * tasks.
5506 */
5507 static void
5508 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
5509 {
5510 dsl_pool_t *dp = spa->spa_dsl_pool;
5511
5512 ASSERT(spa->spa_sync_pass == 1);
5513
5514 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5515 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5516 dsl_pool_create_origin(dp, tx);
5517
5518 /* Keeping the origin open increases spa_minref */
5519 spa->spa_minref += 3;
5520 }
5521
5522 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5523 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5524 dsl_pool_upgrade_clones(dp, tx);
5525 }
5526
5527 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5528 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5529 dsl_pool_upgrade_dir_clones(dp, tx);
5530
5531 /* Keeping the freedir open increases spa_minref */
5532 spa->spa_minref += 3;
5533 }
5534 }
5535
5536 /*
5537 * Sync the specified transaction group. New blocks may be dirtied as
5538 * part of the process, so we iterate until it converges.
5539 */
5540 void
5541 spa_sync(spa_t *spa, uint64_t txg)
5542 {
5543 dsl_pool_t *dp = spa->spa_dsl_pool;
5544 objset_t *mos = spa->spa_meta_objset;
5545 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5546 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5547 vdev_t *rvd = spa->spa_root_vdev;
5548 vdev_t *vd;
5549 dmu_tx_t *tx;
5550 int error;
5551
5552 VERIFY(spa_writeable(spa));
5553
5554 /*
5555 * Lock out configuration changes.
5556 */
5557 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5558
5559 spa->spa_syncing_txg = txg;
5560 spa->spa_sync_pass = 0;
5561
5562 /*
5563 * If there are any pending vdev state changes, convert them
5564 * into config changes that go out with this transaction group.
5565 */
5566 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5567 while (list_head(&spa->spa_state_dirty_list) != NULL) {
5568 /*
5569 * We need the write lock here because, for aux vdevs,
5570 * calling vdev_config_dirty() modifies sav_config.
5571 * This is ugly and will become unnecessary when we
5572 * eliminate the aux vdev wart by integrating all vdevs
5573 * into the root vdev tree.
5574 */
5575 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5576 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
5577 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
5578 vdev_state_clean(vd);
5579 vdev_config_dirty(vd);
5580 }
5581 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5582 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
5583 }
5584 spa_config_exit(spa, SCL_STATE, FTAG);
5585
5586 tx = dmu_tx_create_assigned(dp, txg);
5587
5588 /*
5589 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
5590 * set spa_deflate if we have no raid-z vdevs.
5591 */
5592 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
5593 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
5594 int i;
5595
5596 for (i = 0; i < rvd->vdev_children; i++) {
5597 vd = rvd->vdev_child[i];
5598 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
5599 break;
5600 }
5601 if (i == rvd->vdev_children) {
5602 spa->spa_deflate = TRUE;
5603 VERIFY(0 == zap_add(spa->spa_meta_objset,
5604 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5605 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
5606 }
5607 }
5608
5609 /*
5610 * If anything has changed in this txg, or if someone is waiting
5611  * for this txg to sync (e.g., spa_vdev_remove()), push the
5612 * deferred frees from the previous txg. If not, leave them
5613 * alone so that we don't generate work on an otherwise idle
5614 * system.
5615 */
5616 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
5617 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
5618 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
5619 ((dsl_scan_active(dp->dp_scan) ||
5620 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
5621 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5622 VERIFY3U(bpobj_iterate(defer_bpo,
5623 spa_free_sync_cb, zio, tx), ==, 0);
5624 VERIFY3U(zio_wait(zio), ==, 0);
5625 }
5626
5627 /*
5628 * Iterate to convergence.
5629 */
5630 do {
5631 int pass = ++spa->spa_sync_pass;
5632
5633 spa_sync_config_object(spa, tx);
5634 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
5635 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
5636 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
5637 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
5638 spa_errlog_sync(spa, txg);
5639 dsl_pool_sync(dp, txg);
5640
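/*
 * In the early sync passes, free blocks immediately; in later
 * passes, push the frees onto the deferred bpobj so each additional
 * pass does a bounded amount of work.
 */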
5641 if (pass <= SYNC_PASS_DEFERRED_FREE) {
5642 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5643 bplist_iterate(free_bpl, spa_free_sync_cb,
5644 zio, tx);
5645 VERIFY(zio_wait(zio) == 0);
5646 } else {
5647 bplist_iterate(free_bpl, bpobj_enqueue_cb,
5648 defer_bpo, tx);
5649 }
5650
5651 ddt_sync(spa, txg);
5652 dsl_scan_sync(dp, tx);
5653
5654 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
5655 vdev_sync(vd, txg);
5656
5657 if (pass == 1)
5658 spa_sync_upgrades(spa, tx);
5659
5660 } while (dmu_objset_is_dirty(mos, txg));
5661
5662 /*
5663 * Rewrite the vdev configuration (which includes the uberblock)
5664 * to commit the transaction group.
5665 *
5666 * If there are no dirty vdevs, we sync the uberblock to a few
5667 * random top-level vdevs that are known to be visible in the
5668 * config cache (see spa_vdev_add() for a complete description).
5669 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
5670 */
5671 for (;;) {
5672 /*
5673 * We hold SCL_STATE to prevent vdev open/close/etc.
5674 * while we're attempting to write the vdev labels.
5675 */
5676 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5677
5678 if (list_is_empty(&spa->spa_config_dirty_list)) {
5679 vdev_t *svd[SPA_DVAS_PER_BP];
5680 int svdcount = 0;
5681 int children = rvd->vdev_children;
5682 int c0 = spa_get_random(children);
5683
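/*
 * Starting from a random child, pick up to SPA_DVAS_PER_BP
 * top-level vdevs, skipping those with no metaslab array (holes)
 * and log devices.
 */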
5684 for (int c = 0; c < children; c++) {
5685 vd = rvd->vdev_child[(c0 + c) % children];
5686 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
5687 continue;
5688 svd[svdcount++] = vd;
5689 if (svdcount == SPA_DVAS_PER_BP)
5690 break;
5691 }
5692 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
5693 if (error != 0)
5694 error = vdev_config_sync(svd, svdcount, txg,
5695 B_TRUE);
5696 } else {
5697 error = vdev_config_sync(rvd->vdev_child,
5698 rvd->vdev_children, txg, B_FALSE);
5699 if (error != 0)
5700 error = vdev_config_sync(rvd->vdev_child,
5701 rvd->vdev_children, txg, B_TRUE);
5702 }
5703
5704 spa_config_exit(spa, SCL_STATE, FTAG);
5705
5706 if (error == 0)
5707 break;
5708 zio_suspend(spa, NULL);
5709 zio_resume_wait(spa);
5710 }
5711 dmu_tx_commit(tx);
5712
5713 /*
5714 * Clear the dirty config list.
5715 */
5716 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
5717 vdev_config_clean(vd);
5718
5719 /*
5720 * Now that the new config has synced transactionally,
5721 * let it become visible to the config cache.
5722 */
5723 if (spa->spa_config_syncing != NULL) {
5724 spa_config_set(spa, spa->spa_config_syncing);
5725 spa->spa_config_txg = txg;
5726 spa->spa_config_syncing = NULL;
5727 }
5728
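	/*
	 * The uberblock we just wrote becomes the last synced uberblock.
	 */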
5729 spa->spa_ubsync = spa->spa_uberblock;
5730
5731 dsl_pool_sync_done(dp, txg);
5732
5733 /*
5734 * Update usable space statistics.
5735 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);
5738
5739 spa_update_dspace(spa);
5740
5741 /*
5742 * It had better be the case that we didn't dirty anything
5743 * since vdev_config_sync().
5744 */
5745 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
5746 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
5747 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
5748
5749 spa->spa_sync_pass = 0;
5750
5751 spa_config_exit(spa, SCL_CONFIG, FTAG);
5752
5753 spa_handle_ignored_writes(spa);
5754
5755 /*
5756 * If any async tasks have been requested, kick them off.
5757 */
5758 spa_async_dispatch(spa);
5759 }
5760
5761 /*
5762 * Sync all pools. We don't want to hold the namespace lock across these
5763 * operations, so we take a reference on the spa_t and drop the lock during the
5764 * sync.
5765 */
5766 void
5767 spa_sync_allpools(void)
5768 {
5769 spa_t *spa = NULL;
5770 mutex_enter(&spa_namespace_lock);
5771 while ((spa = spa_next(spa)) != NULL) {
5772 if (spa_state(spa) != POOL_STATE_ACTIVE ||
5773 !spa_writeable(spa) || spa_suspended(spa))
5774 continue;
5775 spa_open_ref(spa, FTAG);
5776 mutex_exit(&spa_namespace_lock);
5777 txg_wait_synced(spa_get_dsl(spa), 0);
5778 mutex_enter(&spa_namespace_lock);
5779 spa_close(spa, FTAG);
5780 }
5781 mutex_exit(&spa_namespace_lock);
5782 }
5783
5784 /*
5785 * ==========================================================================
5786 * Miscellaneous routines
5787 * ==========================================================================
5788 */
5789
5790 /*
5791 * Remove all pools in the system.
5792 */
5793 void
5794 spa_evict_all(void)
5795 {
5796 spa_t *spa;
5797
5798 /*
5799 * Remove all cached state. All pools should be closed now,
5800 * so every spa in the AVL tree should be unreferenced.
5801 */
5802 mutex_enter(&spa_namespace_lock);
5803 while ((spa = spa_next(NULL)) != NULL) {
5804 /*
5805 * Stop async tasks. The async thread may need to detach
5806 * a device that's been replaced, which requires grabbing
5807 * spa_namespace_lock, so we must drop it here.
5808 */
5809 spa_open_ref(spa, FTAG);
5810 mutex_exit(&spa_namespace_lock);
5811 spa_async_suspend(spa);
5812 mutex_enter(&spa_namespace_lock);
5813 spa_close(spa, FTAG);
5814
5815 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5816 spa_unload(spa);
5817 spa_deactivate(spa);
5818 }
5819 spa_remove(spa);
5820 }
5821 mutex_exit(&spa_namespace_lock);
5822 }
5823
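/*
 * Look up a vdev in the given pool by guid.  If 'aux' is set, also search
 * the pool's L2ARC and spare auxiliary vdev lists.  Returns NULL if no
 * matching vdev is found.
 */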
5824 vdev_t *
5825 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5826 {
5827 vdev_t *vd;
5828 int i;
5829
5830 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5831 return (vd);
5832
5833 if (aux) {
5834 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5835 vd = spa->spa_l2cache.sav_vdevs[i];
5836 if (vd->vdev_guid == guid)
5837 return (vd);
5838 }
5839
5840 for (i = 0; i < spa->spa_spares.sav_count; i++) {
5841 vd = spa->spa_spares.sav_vdevs[i];
5842 if (vd->vdev_guid == guid)
5843 return (vd);
5844 }
5845 }
5846
5847 return (NULL);
5848 }
5849
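/*
 * Upgrade the pool's on-disk format to the given version.  The version is
 * recorded in the in-core uberblock, the root vdev's configuration is
 * dirtied, and we wait for the change to be synced out.
 */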
5850 void
5851 spa_upgrade(spa_t *spa, uint64_t version)
5852 {
5853 ASSERT(spa_writeable(spa));
5854
5855 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5856
5857 /*
5858 * This should only be called for a non-faulted pool, and since a
5859 * future version would result in an unopenable pool, this shouldn't be
5860 * possible.
5861 */
5862 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5863 ASSERT(version >= spa->spa_uberblock.ub_version);
5864
5865 spa->spa_uberblock.ub_version = version;
5866 vdev_config_dirty(spa->spa_root_vdev);
5867
5868 spa_config_exit(spa, SCL_ALL, FTAG);
5869
5870 txg_wait_synced(spa_get_dsl(spa), 0);
5871 }
5872
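/*
 * Returns B_TRUE if the pool has a spare with the given guid, either
 * already configured or still pending addition.
 */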
5873 boolean_t
5874 spa_has_spare(spa_t *spa, uint64_t guid)
5875 {
5876 int i;
5877 uint64_t spareguid;
5878 spa_aux_vdev_t *sav = &spa->spa_spares;
5879
5880 for (i = 0; i < sav->sav_count; i++)
5881 if (sav->sav_vdevs[i]->vdev_guid == guid)
5882 return (B_TRUE);
5883
5884 for (i = 0; i < sav->sav_npending; i++) {
5885 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5886 &spareguid) == 0 && spareguid == guid)
5887 return (B_TRUE);
5888 }
5889
5890 return (B_FALSE);
5891 }
5892
5893 /*
5894 * Check if a pool has an active shared spare device.
5895 * Note: reference count of an active spare is 2, as a spare and as a replace
5896 */
5897 static boolean_t
5898 spa_has_active_shared_spare(spa_t *spa)
5899 {
5900 int i, refcnt;
5901 uint64_t pool;
5902 spa_aux_vdev_t *sav = &spa->spa_spares;
5903
5904 for (i = 0; i < sav->sav_count; i++) {
5905 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5906 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5907 refcnt > 2)
5908 return (B_TRUE);
5909 }
5910
5911 return (B_FALSE);
5912 }
5913
5914 /*
5915 * Post a sysevent corresponding to the given event. The 'name' must be one of
5916 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
5917 * filled in from the spa and (optionally) the vdev. This doesn't do anything
5918 * in the userland libzpool, as we don't want consumers to misinterpret ztest
5919 * or zdb as real changes.
5920 */
5921 void
5922 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5923 {
5924 #ifdef _KERNEL
5925 sysevent_t *ev;
5926 sysevent_attr_list_t *attr = NULL;
5927 sysevent_value_t value;
5928 sysevent_id_t eid;
5929
5930 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5931 SE_SLEEP);
5932
5933 value.value_type = SE_DATA_TYPE_STRING;
5934 value.value.sv_string = spa_name(spa);
5935 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5936 goto done;
5937
5938 value.value_type = SE_DATA_TYPE_UINT64;
5939 value.value.sv_uint64 = spa_guid(spa);
5940 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5941 goto done;
5942
5943 if (vd) {
5944 value.value_type = SE_DATA_TYPE_UINT64;
5945 value.value.sv_uint64 = vd->vdev_guid;
5946 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5947 SE_SLEEP) != 0)
5948 goto done;
5949
5950 if (vd->vdev_path) {
5951 value.value_type = SE_DATA_TYPE_STRING;
5952 value.value.sv_string = vd->vdev_path;
5953 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5954 &value, SE_SLEEP) != 0)
5955 goto done;
5956 }
5957 }
5958
5959 if (sysevent_attach_attributes(ev, attr) != 0)
5960 goto done;
5961 attr = NULL;
5962
5963 (void) log_sysevent(ev, SE_SLEEP, &eid);
5964
5965 done:
5966 if (attr)
5967 sysevent_free_attr(attr);
5968 sysevent_free(ev);
#endif	/* _KERNEL */
5970 }