1693 persistent 'comment' field for a zpool
--- old/usr/src/uts/common/fs/zfs/spa.c
+++ new/usr/src/uts/common/fs/zfs/spa.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2011 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * This file contains all the routines used when modifying on-disk SPA state.
30 30 * This includes opening, importing, destroying, exporting a pool, and syncing a
31 31 * pool.
32 32 */
33 33
34 34 #include <sys/zfs_context.h>
35 35 #include <sys/fm/fs/zfs.h>
36 36 #include <sys/spa_impl.h>
37 37 #include <sys/zio.h>
38 38 #include <sys/zio_checksum.h>
39 39 #include <sys/dmu.h>
40 40 #include <sys/dmu_tx.h>
41 41 #include <sys/zap.h>
42 42 #include <sys/zil.h>
43 43 #include <sys/ddt.h>
44 44 #include <sys/vdev_impl.h>
45 45 #include <sys/metaslab.h>
46 46 #include <sys/metaslab_impl.h>
47 47 #include <sys/uberblock_impl.h>
48 48 #include <sys/txg.h>
49 49 #include <sys/avl.h>
50 50 #include <sys/dmu_traverse.h>
51 51 #include <sys/dmu_objset.h>
52 52 #include <sys/unique.h>
53 53 #include <sys/dsl_pool.h>
54 54 #include <sys/dsl_dataset.h>
55 55 #include <sys/dsl_dir.h>
56 56 #include <sys/dsl_prop.h>
57 57 #include <sys/dsl_synctask.h>
58 58 #include <sys/fs/zfs.h>
59 59 #include <sys/arc.h>
60 60 #include <sys/callb.h>
61 61 #include <sys/systeminfo.h>
62 62 #include <sys/spa_boot.h>
63 63 #include <sys/zfs_ioctl.h>
64 64 #include <sys/dsl_scan.h>
65 65
66 66 #ifdef _KERNEL
67 67 #include <sys/bootprops.h>
68 68 #include <sys/callb.h>
69 69 #include <sys/cpupart.h>
70 70 #include <sys/pool.h>
71 71 #include <sys/sysdc.h>
72 72 #include <sys/zone.h>
73 73 #endif /* _KERNEL */
74 74
75 75 #include "zfs_prop.h"
76 76 #include "zfs_comutil.h"
77 77
78 78 typedef enum zti_modes {
79 79 zti_mode_fixed, /* value is # of threads (min 1) */
80 80 zti_mode_online_percent, /* value is % of online CPUs */
81 81 zti_mode_batch, /* cpu-intensive; value is ignored */
82 82 zti_mode_null, /* don't create a taskq */
83 83 zti_nmodes
84 84 } zti_modes_t;
85 85
86 86 #define ZTI_FIX(n) { zti_mode_fixed, (n) }
87 87 #define ZTI_PCT(n) { zti_mode_online_percent, (n) }
88 88 #define ZTI_BATCH { zti_mode_batch, 0 }
89 89 #define ZTI_NULL { zti_mode_null, 0 }
90 90
91 91 #define ZTI_ONE ZTI_FIX(1)
92 92
93 93 typedef struct zio_taskq_info {
94 94 enum zti_modes zti_mode;
95 95 uint_t zti_value;
96 96 } zio_taskq_info_t;
97 97
98 98 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
99 99 "issue", "issue_high", "intr", "intr_high"
100 100 };
101 101
102 102 /*
103 103 * Define the taskq threads for the following I/O types:
104 104 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
105 105 */
106 106 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
107 107 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
108 108 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
109 109 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
110 110 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
111 111 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
112 112 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
113 113 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
114 114 };
115 115
116 116 static dsl_syncfunc_t spa_sync_props;
117 117 static boolean_t spa_has_active_shared_spare(spa_t *spa);
118 118 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
119 119 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
120 120 char **ereport);
121 121 static void spa_vdev_resilver_done(spa_t *spa);
122 122
123 123 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
124 124 id_t zio_taskq_psrset_bind = PS_NONE;
125 125 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
126 126 uint_t zio_taskq_basedc = 80; /* base duty cycle */
127 127
128 128 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
129 129
130 130 /*
131 131 * This (illegal) pool name is used when temporarily importing a spa_t in order
132 132 * to get the vdev stats associated with the imported devices.
133 133 */
134 134 #define TRYIMPORT_NAME "$import"
135 135
136 136 /*
137 137 * ==========================================================================
138 138 * SPA properties routines
139 139 * ==========================================================================
140 140 */
141 141
142 142 /*
143 143 * Add a (source=src, propname=propval) list to an nvlist.
144 144 */
145 145 static void
146 146 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
147 147 uint64_t intval, zprop_source_t src)
148 148 {
149 149 const char *propname = zpool_prop_to_name(prop);
150 150 nvlist_t *propval;
151 151
152 152 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
153 153 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
154 154
155 155 if (strval != NULL)
156 156 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
157 157 else
158 158 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
159 159
160 160 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
161 161 nvlist_free(propval);
162 162 }
163 163
164 164 /*
165 165 * Get property values from the spa configuration.
166 166 */
167 167 static void
168 168 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
169 169 {
170 170 uint64_t size;
171 171 uint64_t alloc;
172 172 uint64_t cap, version;
173 173 zprop_source_t src = ZPROP_SRC_NONE;
174 174 spa_config_dirent_t *dp;
175 175
176 176 ASSERT(MUTEX_HELD(&spa->spa_props_lock));
177 177
178 178 if (spa->spa_root_vdev != NULL) {
179 179 alloc = metaslab_class_get_alloc(spa_normal_class(spa));
180 180 size = metaslab_class_get_space(spa_normal_class(spa));
181 181 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
182 182 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
183 183 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
184 184 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
185 185 size - alloc, src);
186 186 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
187 187 (spa_mode(spa) == FREAD), src);
188 188
189 189 cap = (size == 0) ? 0 : (alloc * 100 / size);
190 190 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
191 191
192 192 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
193 193 ddt_get_pool_dedup_ratio(spa), src);
194 194
195 195 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
196 196 spa->spa_root_vdev->vdev_state, src);
197 197
198 198 version = spa_version(spa);
199 199 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
200 200 src = ZPROP_SRC_DEFAULT;
201 201 else
202 202 src = ZPROP_SRC_LOCAL;
203 203 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
204 204 }
205 205
206 206 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
207 207
208 + if (spa->spa_comment != NULL) {
209 + spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
210 + 0, ZPROP_SRC_LOCAL);
211 + }
212 +
208 213 if (spa->spa_root != NULL)
209 214 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
210 215 0, ZPROP_SRC_LOCAL);
211 216
212 217 if ((dp = list_head(&spa->spa_config_list)) != NULL) {
213 218 if (dp->scd_path == NULL) {
214 219 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
215 220 "none", 0, ZPROP_SRC_LOCAL);
216 221 } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
217 222 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
218 223 dp->scd_path, 0, ZPROP_SRC_LOCAL);
219 224 }
220 225 }
221 226 }
222 227
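With the hunk above, spa_prop_get_config() now emits a "comment" entry (source ZPROP_SRC_LOCAL) whenever spa_comment is cached. For context, a minimal consumer-side sketch of pulling that entry back out of the nvlist filled in by spa_prop_get(); the helper name spa_comment_from_props() is made up for illustration, while the libnvpair calls and the ZPROP_VALUE key are the standard ones already used by spa_prop_add_list() above.

static int
spa_comment_from_props(nvlist_t *props, char **commentp)
{
	nvlist_t *propval;
	int error;

	/*
	 * Each property is stored as a nested nvlist keyed by the
	 * property name, holding ZPROP_SOURCE and ZPROP_VALUE pairs
	 * (see spa_prop_add_list() above).
	 */
	error = nvlist_lookup_nvlist(props,
	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), &propval);
	if (error != 0)
		return (error);

	return (nvlist_lookup_string(propval, ZPROP_VALUE, commentp));
}
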
223 228 /*
224 229 * Get zpool property values.
225 230 */
226 231 int
227 232 spa_prop_get(spa_t *spa, nvlist_t **nvp)
228 233 {
229 234 objset_t *mos = spa->spa_meta_objset;
230 235 zap_cursor_t zc;
231 236 zap_attribute_t za;
232 237 int err;
233 238
234 239 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
235 240
236 241 mutex_enter(&spa->spa_props_lock);
237 242
238 243 /*
239 244 * Get properties from the spa config.
240 245 */
241 246 spa_prop_get_config(spa, nvp);
242 247
243 248 /* If no pool property object, no more prop to get. */
244 249 if (mos == NULL || spa->spa_pool_props_object == 0) {
245 250 mutex_exit(&spa->spa_props_lock);
246 251 return (0);
247 252 }
248 253
249 254 /*
250 255 * Get properties from the MOS pool property object.
251 256 */
252 257 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
253 258 (err = zap_cursor_retrieve(&zc, &za)) == 0;
254 259 zap_cursor_advance(&zc)) {
255 260 uint64_t intval = 0;
256 261 char *strval = NULL;
257 262 zprop_source_t src = ZPROP_SRC_DEFAULT;
258 263 zpool_prop_t prop;
259 264
260 265 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
261 266 continue;
262 267
263 268 switch (za.za_integer_length) {
264 269 case 8:
265 270 /* integer property */
266 271 if (za.za_first_integer !=
267 272 zpool_prop_default_numeric(prop))
268 273 src = ZPROP_SRC_LOCAL;
269 274
270 275 if (prop == ZPOOL_PROP_BOOTFS) {
271 276 dsl_pool_t *dp;
272 277 dsl_dataset_t *ds = NULL;
273 278
274 279 dp = spa_get_dsl(spa);
275 280 rw_enter(&dp->dp_config_rwlock, RW_READER);
276 281 if (err = dsl_dataset_hold_obj(dp,
277 282 za.za_first_integer, FTAG, &ds)) {
278 283 rw_exit(&dp->dp_config_rwlock);
279 284 break;
280 285 }
281 286
282 287 strval = kmem_alloc(
283 288 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
284 289 KM_SLEEP);
285 290 dsl_dataset_name(ds, strval);
286 291 dsl_dataset_rele(ds, FTAG);
287 292 rw_exit(&dp->dp_config_rwlock);
288 293 } else {
289 294 strval = NULL;
290 295 intval = za.za_first_integer;
291 296 }
292 297
293 298 spa_prop_add_list(*nvp, prop, strval, intval, src);
294 299
295 300 if (strval != NULL)
296 301 kmem_free(strval,
297 302 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
298 303
299 304 break;
300 305
301 306 case 1:
302 307 /* string property */
303 308 strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
304 309 err = zap_lookup(mos, spa->spa_pool_props_object,
305 310 za.za_name, 1, za.za_num_integers, strval);
306 311 if (err) {
307 312 kmem_free(strval, za.za_num_integers);
308 313 break;
309 314 }
310 315 spa_prop_add_list(*nvp, prop, strval, 0, src);
311 316 kmem_free(strval, za.za_num_integers);
312 317 break;
313 318
314 319 default:
315 320 break;
316 321 }
317 322 }
318 323 zap_cursor_fini(&zc);
319 324 mutex_exit(&spa->spa_props_lock);
320 325 out:
321 326 if (err && err != ENOENT) {
322 327 nvlist_free(*nvp);
323 328 *nvp = NULL;
324 329 return (err);
325 330 }
326 331
327 332 return (0);
328 333 }
329 334
330 335 /*
331 336 * Validate the given pool properties nvlist and modify the list
332 337 * for the property values to be set.
333 338 */
334 339 static int
335 340 spa_prop_validate(spa_t *spa, nvlist_t *props)
336 341 {
337 342 nvpair_t *elem;
338 343 int error = 0, reset_bootfs = 0;
339 344 uint64_t objnum;
340 345
341 346 elem = NULL;
342 347 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
343 348 zpool_prop_t prop;
344 349 char *propname, *strval;
345 350 uint64_t intval;
346 351 objset_t *os;
347 - char *slash;
352 + char *slash, *check;
348 353
349 354 propname = nvpair_name(elem);
350 355
351 356 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
352 357 return (EINVAL);
353 358
354 359 switch (prop) {
355 360 case ZPOOL_PROP_VERSION:
356 361 error = nvpair_value_uint64(elem, &intval);
357 362 if (!error &&
358 363 (intval < spa_version(spa) || intval > SPA_VERSION))
359 364 error = EINVAL;
360 365 break;
361 366
362 367 case ZPOOL_PROP_DELEGATION:
363 368 case ZPOOL_PROP_AUTOREPLACE:
364 369 case ZPOOL_PROP_LISTSNAPS:
365 370 case ZPOOL_PROP_AUTOEXPAND:
366 371 error = nvpair_value_uint64(elem, &intval);
367 372 if (!error && intval > 1)
368 373 error = EINVAL;
369 374 break;
370 375
371 376 case ZPOOL_PROP_BOOTFS:
372 377 /*
373 378 * If the pool version is less than SPA_VERSION_BOOTFS,
374 379 * or the pool is still being created (version == 0),
375 380 * the bootfs property cannot be set.
376 381 */
377 382 if (spa_version(spa) < SPA_VERSION_BOOTFS) {
378 383 error = ENOTSUP;
379 384 break;
380 385 }
381 386
382 387 /*
383 388 * Make sure the vdev config is bootable
384 389 */
385 390 if (!vdev_is_bootable(spa->spa_root_vdev)) {
386 391 error = ENOTSUP;
387 392 break;
388 393 }
389 394
390 395 reset_bootfs = 1;
391 396
392 397 error = nvpair_value_string(elem, &strval);
393 398
394 399 if (!error) {
395 400 uint64_t compress;
396 401
397 402 if (strval == NULL || strval[0] == '\0') {
398 403 objnum = zpool_prop_default_numeric(
399 404 ZPOOL_PROP_BOOTFS);
400 405 break;
401 406 }
402 407
403 408 if (error = dmu_objset_hold(strval, FTAG, &os))
404 409 break;
405 410
406 411 /* Must be ZPL and not gzip compressed. */
407 412
408 413 if (dmu_objset_type(os) != DMU_OST_ZFS) {
409 414 error = ENOTSUP;
410 415 } else if ((error = dsl_prop_get_integer(strval,
411 416 zfs_prop_to_name(ZFS_PROP_COMPRESSION),
412 417 &compress, NULL)) == 0 &&
413 418 !BOOTFS_COMPRESS_VALID(compress)) {
414 419 error = ENOTSUP;
415 420 } else {
416 421 objnum = dmu_objset_id(os);
417 422 }
418 423 dmu_objset_rele(os, FTAG);
419 424 }
420 425 break;
421 426
422 427 case ZPOOL_PROP_FAILUREMODE:
423 428 error = nvpair_value_uint64(elem, &intval);
424 429 if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
425 430 intval > ZIO_FAILURE_MODE_PANIC))
426 431 error = EINVAL;
427 432
428 433 /*
429 434 * This is a special case which only occurs when
430 435 * the pool has completely failed. This allows
431 436 * the user to change the in-core failmode property
432 437 * without syncing it out to disk (I/Os might
433 438 * currently be blocked). We do this by returning
434 439 * EIO to the caller (spa_prop_set) to trick it
435 440 * into thinking we encountered a property validation
436 441 * error.
437 442 */
438 443 if (!error && spa_suspended(spa)) {
439 444 spa->spa_failmode = intval;
440 445 error = EIO;
441 446 }
442 447 break;
443 448
444 449 case ZPOOL_PROP_CACHEFILE:
445 450 if ((error = nvpair_value_string(elem, &strval)) != 0)
446 451 break;
447 452
448 453 if (strval[0] == '\0')
449 454 break;
450 455
451 456 if (strcmp(strval, "none") == 0)
452 457 break;
453 458
454 459 if (strval[0] != '/') {
455 460 error = EINVAL;
456 461 break;
457 462 }
458 463
459 464 slash = strrchr(strval, '/');
460 465 ASSERT(slash != NULL);
461 466
462 467 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
463 468 strcmp(slash, "/..") == 0)
464 469 error = EINVAL;
465 470 break;
466 471
472 + case ZPOOL_PROP_COMMENT:
473 + if ((error = nvpair_value_string(elem, &strval)) != 0)
474 + break;
475 + for (check = strval; *check != '\0'; check++) {
476 + /*
477 + * The kernel doesn't have an easy isprint()
478 + * check. For this kernel check, we merely
479 + * check ASCII apart from DEL. Fix this if
480 + * there is an easy-to-use kernel isprint().
481 + */
482 + if (*check >= 0x7f) {
483 + error = EINVAL;
484 + break;
485 + }
487 + }
488 + if (strlen(strval) > ZPROP_MAX_COMMENT)
489 + error = E2BIG;
490 + break;
491 +
467 492 case ZPOOL_PROP_DEDUPDITTO:
468 493 if (spa_version(spa) < SPA_VERSION_DEDUP)
469 494 error = ENOTSUP;
470 495 else
471 496 error = nvpair_value_uint64(elem, &intval);
472 497 if (error == 0 &&
473 498 intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
474 499 error = EINVAL;
475 500 break;
476 501 }
477 502
478 503 if (error)
479 504 break;
480 505 }
481 506
482 507 if (!error && reset_bootfs) {
483 508 error = nvlist_remove(props,
484 509 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
485 510
486 511 if (!error) {
487 512 error = nvlist_add_uint64(props,
488 513 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
489 514 }
490 515 }
491 516
492 517 return (error);
493 518 }
494 519
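For context on how the new ZPOOL_PROP_COMMENT case gets exercised: a hedged userland sketch that stores a comment through libzfs, which lands in spa_prop_set() below and is screened by the spa_prop_validate() case above. It is illustrative only; it assumes the usual libzfs entry points (libzfs_init, zpool_open, zpool_set_prop, zpool_close, libzfs_fini), and the pool name "tank" is just an example.

#include <libzfs.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	zpool_handle_t *zhp;
	int error = 1;

	if (hdl == NULL)
		return (1);

	if ((zhp = zpool_open(hdl, "tank")) != NULL) {
		/*
		 * spa_prop_validate() rejects characters >= 0x7f
		 * (EINVAL) and strings longer than ZPROP_MAX_COMMENT
		 * (E2BIG); anything else is stored persistently.
		 */
		error = zpool_set_prop(zhp, "comment",
		    "scratch pool, ok to destroy");
		zpool_close(zhp);
	}
	libzfs_fini(hdl);
	return (error != 0);
}
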
495 520 void
496 521 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
497 522 {
498 523 char *cachefile;
499 524 spa_config_dirent_t *dp;
500 525
501 526 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
502 527 &cachefile) != 0)
503 528 return;
504 529
505 530 dp = kmem_alloc(sizeof (spa_config_dirent_t),
506 531 KM_SLEEP);
507 532
508 533 if (cachefile[0] == '\0')
509 534 dp->scd_path = spa_strdup(spa_config_path);
510 535 else if (strcmp(cachefile, "none") == 0)
511 536 dp->scd_path = NULL;
512 537 else
513 538 dp->scd_path = spa_strdup(cachefile);
514 539
515 540 list_insert_head(&spa->spa_config_list, dp);
516 541 if (need_sync)
517 542 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
518 543 }
519 544
520 545 int
521 546 spa_prop_set(spa_t *spa, nvlist_t *nvp)
522 547 {
523 548 int error;
524 549 nvpair_t *elem;
525 550 boolean_t need_sync = B_FALSE;
526 551 zpool_prop_t prop;
527 552
528 553 if ((error = spa_prop_validate(spa, nvp)) != 0)
529 554 return (error);
530 555
531 556 elem = NULL;
532 557 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
533 558 if ((prop = zpool_name_to_prop(
534 559 nvpair_name(elem))) == ZPROP_INVAL)
535 560 return (EINVAL);
536 561
537 562 if (prop == ZPOOL_PROP_CACHEFILE ||
538 563 prop == ZPOOL_PROP_ALTROOT ||
539 564 prop == ZPOOL_PROP_READONLY)
540 565 continue;
541 566
542 567 need_sync = B_TRUE;
543 568 break;
544 569 }
545 570
546 571 if (need_sync)
547 572 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
548 573 spa, nvp, 3));
549 574 else
550 575 return (0);
551 576 }
552 577
553 578 /*
554 579 * If the bootfs property value is dsobj, clear it.
555 580 */
556 581 void
557 582 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
558 583 {
559 584 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
560 585 VERIFY(zap_remove(spa->spa_meta_objset,
561 586 spa->spa_pool_props_object,
562 587 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
563 588 spa->spa_bootfs = 0;
564 589 }
565 590 }
566 591
567 592 /*
568 593 * Change the GUID for the pool. This is done so that we can later
569 594 * re-import a pool built from a clone of our own vdevs. We will modify
570 595 * the root vdev's guid, our own pool guid, and then mark all of our
571 596 * vdevs dirty. Note that we must make sure that all our vdevs are
572 597 * online when we do this, or else any vdevs that weren't present
573 598 * would be orphaned from our pool. We are also going to issue a
574 599 * sysevent to update any watchers.
575 600 */
576 601 int
577 602 spa_change_guid(spa_t *spa)
578 603 {
579 604 uint64_t oldguid, newguid;
580 605 uint64_t txg;
581 606
582 607 if (!(spa_mode_global & FWRITE))
583 608 return (EROFS);
584 609
585 610 txg = spa_vdev_enter(spa);
586 611
587 612 if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
588 613 return (spa_vdev_exit(spa, NULL, txg, ENXIO));
589 614
590 615 oldguid = spa_guid(spa);
591 616 newguid = spa_generate_guid(NULL);
592 617 ASSERT3U(oldguid, !=, newguid);
593 618
594 619 spa->spa_root_vdev->vdev_guid = newguid;
595 620 spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
596 621
597 622 vdev_config_dirty(spa->spa_root_vdev);
598 623
599 624 spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
600 625
601 626 return (spa_vdev_exit(spa, NULL, txg, 0));
602 627 }
603 628
604 629 /*
605 630 * ==========================================================================
606 631 * SPA state manipulation (open/create/destroy/import/export)
607 632 * ==========================================================================
608 633 */
609 634
610 635 static int
611 636 spa_error_entry_compare(const void *a, const void *b)
612 637 {
613 638 spa_error_entry_t *sa = (spa_error_entry_t *)a;
614 639 spa_error_entry_t *sb = (spa_error_entry_t *)b;
615 640 int ret;
616 641
617 642 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
618 643 sizeof (zbookmark_t));
619 644
620 645 if (ret < 0)
621 646 return (-1);
622 647 else if (ret > 0)
623 648 return (1);
624 649 else
625 650 return (0);
626 651 }
627 652
628 653 /*
629 654 * Utility function which retrieves copies of the current logs and
630 655 * re-initializes them in the process.
631 656 */
632 657 void
633 658 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
634 659 {
635 660 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
636 661
637 662 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
638 663 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
639 664
640 665 avl_create(&spa->spa_errlist_scrub,
641 666 spa_error_entry_compare, sizeof (spa_error_entry_t),
642 667 offsetof(spa_error_entry_t, se_avl));
643 668 avl_create(&spa->spa_errlist_last,
644 669 spa_error_entry_compare, sizeof (spa_error_entry_t),
645 670 offsetof(spa_error_entry_t, se_avl));
646 671 }
647 672
648 673 static taskq_t *
649 674 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
650 675 uint_t value)
651 676 {
652 677 uint_t flags = 0;
653 678 boolean_t batch = B_FALSE;
654 679
655 680 switch (mode) {
656 681 case zti_mode_null:
657 682 return (NULL); /* no taskq needed */
658 683
659 684 case zti_mode_fixed:
660 685 ASSERT3U(value, >=, 1);
661 686 value = MAX(value, 1);
662 687 break;
663 688
664 689 case zti_mode_batch:
665 690 batch = B_TRUE;
666 691 flags |= TASKQ_THREADS_CPU_PCT;
667 692 value = zio_taskq_batch_pct;
668 693 break;
669 694
670 695 case zti_mode_online_percent:
671 696 flags |= TASKQ_THREADS_CPU_PCT;
672 697 break;
673 698
674 699 default:
675 700 panic("unrecognized mode for %s taskq (%u:%u) in "
676 701 "spa_activate()",
677 702 name, mode, value);
678 703 break;
679 704 }
680 705
681 706 if (zio_taskq_sysdc && spa->spa_proc != &p0) {
682 707 if (batch)
683 708 flags |= TASKQ_DC_BATCH;
684 709
685 710 return (taskq_create_sysdc(name, value, 50, INT_MAX,
686 711 spa->spa_proc, zio_taskq_basedc, flags));
687 712 }
688 713 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
689 714 spa->spa_proc, flags));
690 715 }
691 716
692 717 static void
693 718 spa_create_zio_taskqs(spa_t *spa)
694 719 {
695 720 for (int t = 0; t < ZIO_TYPES; t++) {
696 721 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
697 722 const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
698 723 enum zti_modes mode = ztip->zti_mode;
699 724 uint_t value = ztip->zti_value;
700 725 char name[32];
701 726
702 727 (void) snprintf(name, sizeof (name),
703 728 "%s_%s", zio_type_name[t], zio_taskq_types[q]);
704 729
705 730 spa->spa_zio_taskq[t][q] =
706 731 spa_taskq_create(spa, name, mode, value);
707 732 }
708 733 }
709 734 }
710 735
711 736 #ifdef _KERNEL
712 737 static void
713 738 spa_thread(void *arg)
714 739 {
715 740 callb_cpr_t cprinfo;
716 741
717 742 spa_t *spa = arg;
718 743 user_t *pu = PTOU(curproc);
719 744
720 745 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
721 746 spa->spa_name);
722 747
723 748 ASSERT(curproc != &p0);
724 749 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
725 750 "zpool-%s", spa->spa_name);
726 751 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
727 752
728 753 /* bind this thread to the requested psrset */
729 754 if (zio_taskq_psrset_bind != PS_NONE) {
730 755 pool_lock();
731 756 mutex_enter(&cpu_lock);
732 757 mutex_enter(&pidlock);
733 758 mutex_enter(&curproc->p_lock);
734 759
735 760 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
736 761 0, NULL, NULL) == 0) {
737 762 curthread->t_bind_pset = zio_taskq_psrset_bind;
738 763 } else {
739 764 cmn_err(CE_WARN,
740 765 "Couldn't bind process for zfs pool \"%s\" to "
741 766 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
742 767 }
743 768
744 769 mutex_exit(&curproc->p_lock);
745 770 mutex_exit(&pidlock);
746 771 mutex_exit(&cpu_lock);
747 772 pool_unlock();
748 773 }
749 774
750 775 if (zio_taskq_sysdc) {
751 776 sysdc_thread_enter(curthread, 100, 0);
752 777 }
753 778
754 779 spa->spa_proc = curproc;
755 780 spa->spa_did = curthread->t_did;
756 781
757 782 spa_create_zio_taskqs(spa);
758 783
759 784 mutex_enter(&spa->spa_proc_lock);
760 785 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
761 786
762 787 spa->spa_proc_state = SPA_PROC_ACTIVE;
763 788 cv_broadcast(&spa->spa_proc_cv);
764 789
765 790 CALLB_CPR_SAFE_BEGIN(&cprinfo);
766 791 while (spa->spa_proc_state == SPA_PROC_ACTIVE)
767 792 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
768 793 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
769 794
770 795 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
771 796 spa->spa_proc_state = SPA_PROC_GONE;
772 797 spa->spa_proc = &p0;
773 798 cv_broadcast(&spa->spa_proc_cv);
774 799 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
775 800
776 801 mutex_enter(&curproc->p_lock);
777 802 lwp_exit();
778 803 }
779 804 #endif
780 805
781 806 /*
782 807 * Activate an uninitialized pool.
783 808 */
784 809 static void
785 810 spa_activate(spa_t *spa, int mode)
786 811 {
787 812 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
788 813
789 814 spa->spa_state = POOL_STATE_ACTIVE;
790 815 spa->spa_mode = mode;
791 816
792 817 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
793 818 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
794 819
795 820 /* Try to create a covering process */
796 821 mutex_enter(&spa->spa_proc_lock);
797 822 ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
798 823 ASSERT(spa->spa_proc == &p0);
799 824 spa->spa_did = 0;
800 825
801 826 /* Only create a process if we're going to be around a while. */
802 827 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
803 828 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
804 829 NULL, 0) == 0) {
805 830 spa->spa_proc_state = SPA_PROC_CREATED;
806 831 while (spa->spa_proc_state == SPA_PROC_CREATED) {
807 832 cv_wait(&spa->spa_proc_cv,
808 833 &spa->spa_proc_lock);
809 834 }
810 835 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
811 836 ASSERT(spa->spa_proc != &p0);
812 837 ASSERT(spa->spa_did != 0);
813 838 } else {
814 839 #ifdef _KERNEL
815 840 cmn_err(CE_WARN,
816 841 "Couldn't create process for zfs pool \"%s\"\n",
817 842 spa->spa_name);
818 843 #endif
819 844 }
820 845 }
821 846 mutex_exit(&spa->spa_proc_lock);
822 847
823 848 /* If we didn't create a process, we need to create our taskqs. */
824 849 if (spa->spa_proc == &p0) {
825 850 spa_create_zio_taskqs(spa);
826 851 }
827 852
828 853 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
829 854 offsetof(vdev_t, vdev_config_dirty_node));
830 855 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
831 856 offsetof(vdev_t, vdev_state_dirty_node));
832 857
833 858 txg_list_create(&spa->spa_vdev_txg_list,
834 859 offsetof(struct vdev, vdev_txg_node));
835 860
836 861 avl_create(&spa->spa_errlist_scrub,
837 862 spa_error_entry_compare, sizeof (spa_error_entry_t),
838 863 offsetof(spa_error_entry_t, se_avl));
839 864 avl_create(&spa->spa_errlist_last,
840 865 spa_error_entry_compare, sizeof (spa_error_entry_t),
841 866 offsetof(spa_error_entry_t, se_avl));
842 867 }
843 868
844 869 /*
845 870 * Opposite of spa_activate().
846 871 */
847 872 static void
848 873 spa_deactivate(spa_t *spa)
849 874 {
850 875 ASSERT(spa->spa_sync_on == B_FALSE);
851 876 ASSERT(spa->spa_dsl_pool == NULL);
852 877 ASSERT(spa->spa_root_vdev == NULL);
853 878 ASSERT(spa->spa_async_zio_root == NULL);
854 879 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
855 880
856 881 txg_list_destroy(&spa->spa_vdev_txg_list);
857 882
858 883 list_destroy(&spa->spa_config_dirty_list);
859 884 list_destroy(&spa->spa_state_dirty_list);
860 885
861 886 for (int t = 0; t < ZIO_TYPES; t++) {
862 887 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
863 888 if (spa->spa_zio_taskq[t][q] != NULL)
864 889 taskq_destroy(spa->spa_zio_taskq[t][q]);
865 890 spa->spa_zio_taskq[t][q] = NULL;
866 891 }
867 892 }
868 893
869 894 metaslab_class_destroy(spa->spa_normal_class);
870 895 spa->spa_normal_class = NULL;
871 896
872 897 metaslab_class_destroy(spa->spa_log_class);
873 898 spa->spa_log_class = NULL;
874 899
875 900 /*
876 901 * If this was part of an import or the open otherwise failed, we may
877 902 * still have errors left in the queues. Empty them just in case.
878 903 */
879 904 spa_errlog_drain(spa);
880 905
881 906 avl_destroy(&spa->spa_errlist_scrub);
882 907 avl_destroy(&spa->spa_errlist_last);
883 908
884 909 spa->spa_state = POOL_STATE_UNINITIALIZED;
885 910
886 911 mutex_enter(&spa->spa_proc_lock);
887 912 if (spa->spa_proc_state != SPA_PROC_NONE) {
888 913 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
889 914 spa->spa_proc_state = SPA_PROC_DEACTIVATE;
890 915 cv_broadcast(&spa->spa_proc_cv);
891 916 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
892 917 ASSERT(spa->spa_proc != &p0);
893 918 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
894 919 }
895 920 ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
896 921 spa->spa_proc_state = SPA_PROC_NONE;
897 922 }
898 923 ASSERT(spa->spa_proc == &p0);
899 924 mutex_exit(&spa->spa_proc_lock);
900 925
901 926 /*
902 927 * We want to make sure spa_thread() has actually exited the ZFS
903 928 * module, so that the module can't be unloaded out from underneath
904 929 * it.
905 930 */
906 931 if (spa->spa_did != 0) {
907 932 thread_join(spa->spa_did);
908 933 spa->spa_did = 0;
909 934 }
910 935 }
911 936
912 937 /*
913 938 * Verify a pool configuration, and construct the vdev tree appropriately. This
914 939 * will create all the necessary vdevs in the appropriate layout, with each vdev
915 940 * in the CLOSED state. This will prep the pool before open/creation/import.
916 941 * All vdev validation is done by the vdev_alloc() routine.
917 942 */
918 943 static int
919 944 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
920 945 uint_t id, int atype)
921 946 {
922 947 nvlist_t **child;
923 948 uint_t children;
924 949 int error;
925 950
926 951 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
927 952 return (error);
928 953
929 954 if ((*vdp)->vdev_ops->vdev_op_leaf)
930 955 return (0);
931 956
932 957 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
933 958 &child, &children);
934 959
935 960 if (error == ENOENT)
936 961 return (0);
937 962
938 963 if (error) {
939 964 vdev_free(*vdp);
940 965 *vdp = NULL;
941 966 return (EINVAL);
942 967 }
943 968
944 969 for (int c = 0; c < children; c++) {
945 970 vdev_t *vd;
946 971 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
947 972 atype)) != 0) {
948 973 vdev_free(*vdp);
949 974 *vdp = NULL;
950 975 return (error);
951 976 }
952 977 }
953 978
954 979 ASSERT(*vdp != NULL);
955 980
956 981 return (0);
957 982 }
958 983
959 984 /*
960 985 * Opposite of spa_load().
961 986 */
962 987 static void
963 988 spa_unload(spa_t *spa)
964 989 {
965 990 int i;
966 991
967 992 ASSERT(MUTEX_HELD(&spa_namespace_lock));
968 993
969 994 /*
970 995 * Stop async tasks.
971 996 */
972 997 spa_async_suspend(spa);
973 998
974 999 /*
975 1000 * Stop syncing.
976 1001 */
977 1002 if (spa->spa_sync_on) {
978 1003 txg_sync_stop(spa->spa_dsl_pool);
979 1004 spa->spa_sync_on = B_FALSE;
980 1005 }
981 1006
982 1007 /*
983 1008 * Wait for any outstanding async I/O to complete.
984 1009 */
985 1010 if (spa->spa_async_zio_root != NULL) {
986 1011 (void) zio_wait(spa->spa_async_zio_root);
987 1012 spa->spa_async_zio_root = NULL;
988 1013 }
989 1014
990 1015 bpobj_close(&spa->spa_deferred_bpobj);
991 1016
992 1017 /*
993 1018 * Close the dsl pool.
994 1019 */
995 1020 if (spa->spa_dsl_pool) {
996 1021 dsl_pool_close(spa->spa_dsl_pool);
997 1022 spa->spa_dsl_pool = NULL;
998 1023 spa->spa_meta_objset = NULL;
999 1024 }
1000 1025
1001 1026 ddt_unload(spa);
1002 1027
1003 1028 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1004 1029
1005 1030 /*
1006 1031 * Drop and purge level 2 cache
1007 1032 */
1008 1033 spa_l2cache_drop(spa);
1009 1034
1010 1035 /*
1011 1036 * Close all vdevs.
1012 1037 */
1013 1038 if (spa->spa_root_vdev)
1014 1039 vdev_free(spa->spa_root_vdev);
1015 1040 ASSERT(spa->spa_root_vdev == NULL);
1016 1041
1017 1042 for (i = 0; i < spa->spa_spares.sav_count; i++)
1018 1043 vdev_free(spa->spa_spares.sav_vdevs[i]);
1019 1044 if (spa->spa_spares.sav_vdevs) {
1020 1045 kmem_free(spa->spa_spares.sav_vdevs,
1021 1046 spa->spa_spares.sav_count * sizeof (void *));
1022 1047 spa->spa_spares.sav_vdevs = NULL;
1023 1048 }
1024 1049 if (spa->spa_spares.sav_config) {
1025 1050 nvlist_free(spa->spa_spares.sav_config);
1026 1051 spa->spa_spares.sav_config = NULL;
1027 1052 }
1028 1053 spa->spa_spares.sav_count = 0;
1029 1054
1030 1055 for (i = 0; i < spa->spa_l2cache.sav_count; i++)
1031 1056 vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1032 1057 if (spa->spa_l2cache.sav_vdevs) {
1033 1058 kmem_free(spa->spa_l2cache.sav_vdevs,
1034 1059 spa->spa_l2cache.sav_count * sizeof (void *));
1035 1060 spa->spa_l2cache.sav_vdevs = NULL;
1036 1061 }
1037 1062 if (spa->spa_l2cache.sav_config) {
1038 1063 nvlist_free(spa->spa_l2cache.sav_config);
1039 1064 spa->spa_l2cache.sav_config = NULL;
1040 1065 }
1041 1066 spa->spa_l2cache.sav_count = 0;
1042 1067
1043 1068 spa->spa_async_suspended = 0;
1044 1069
1070 + if (spa->spa_comment != NULL) {
1071 + spa_strfree(spa->spa_comment);
1072 + spa->spa_comment = NULL;
1073 + }
1074 +
1045 1075 spa_config_exit(spa, SCL_ALL, FTAG);
1046 1076 }
1047 1077
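spa_unload() above now releases the cached comment with spa_strfree(); the matching spa_strdup() appears in the spa_load() hunk later in this diff. The helper below is purely hypothetical (it is not part of the change) and only sketches the free-then-dup idiom, assuming the existing spa_strdup()/spa_strfree() utilities and the new spa_comment field.

/* Hypothetical helper, shown only to illustrate the idiom. */
static void
spa_comment_replace(spa_t *spa, const char *comment)
{
	/* Drop any previously cached comment first so nothing leaks. */
	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}
	if (comment != NULL)
		spa->spa_comment = spa_strdup(comment);
}
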
1048 1078 /*
1049 1079 * Load (or re-load) the current list of vdevs describing the active spares for
1050 1080 * this pool. When this is called, we have some form of basic information in
1051 1081 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
1052 1082 * then re-generate a more complete list including status information.
1053 1083 */
1054 1084 static void
1055 1085 spa_load_spares(spa_t *spa)
1056 1086 {
1057 1087 nvlist_t **spares;
1058 1088 uint_t nspares;
1059 1089 int i;
1060 1090 vdev_t *vd, *tvd;
1061 1091
1062 1092 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1063 1093
1064 1094 /*
1065 1095 * First, close and free any existing spare vdevs.
1066 1096 */
1067 1097 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1068 1098 vd = spa->spa_spares.sav_vdevs[i];
1069 1099
1070 1100 /* Undo the call to spa_activate() below */
1071 1101 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1072 1102 B_FALSE)) != NULL && tvd->vdev_isspare)
1073 1103 spa_spare_remove(tvd);
1074 1104 vdev_close(vd);
1075 1105 vdev_free(vd);
1076 1106 }
1077 1107
1078 1108 if (spa->spa_spares.sav_vdevs)
1079 1109 kmem_free(spa->spa_spares.sav_vdevs,
1080 1110 spa->spa_spares.sav_count * sizeof (void *));
1081 1111
1082 1112 if (spa->spa_spares.sav_config == NULL)
1083 1113 nspares = 0;
1084 1114 else
1085 1115 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1086 1116 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1087 1117
1088 1118 spa->spa_spares.sav_count = (int)nspares;
1089 1119 spa->spa_spares.sav_vdevs = NULL;
1090 1120
1091 1121 if (nspares == 0)
1092 1122 return;
1093 1123
1094 1124 /*
1095 1125 * Construct the array of vdevs, opening them to get status in the
1096 1126 * process. For each spare, there are potentially two different vdev_t
1097 1127 * structures associated with it: one in the list of spares (used only
1098 1128 * for basic validation purposes) and one in the active vdev
1099 1129 * configuration (if it's spared in). During this phase we open and
1100 1130 * validate each vdev on the spare list. If the vdev also exists in the
1101 1131 * active configuration, then we also mark this vdev as an active spare.
1102 1132 */
1103 1133 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1104 1134 KM_SLEEP);
1105 1135 for (i = 0; i < spa->spa_spares.sav_count; i++) {
1106 1136 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1107 1137 VDEV_ALLOC_SPARE) == 0);
1108 1138 ASSERT(vd != NULL);
1109 1139
1110 1140 spa->spa_spares.sav_vdevs[i] = vd;
1111 1141
1112 1142 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1113 1143 B_FALSE)) != NULL) {
1114 1144 if (!tvd->vdev_isspare)
1115 1145 spa_spare_add(tvd);
1116 1146
1117 1147 /*
1118 1148 * We only mark the spare active if we were successfully
1119 1149 * able to load the vdev. Otherwise, importing a pool
1120 1150 * with a bad active spare would result in strange
1121 1151 * behavior, because multiple pools would think the spare
1122 1152 * is actively in use.
1123 1153 *
1124 1154 * There is a vulnerability here to an equally bizarre
1125 1155 * circumstance, where a dead active spare is later
1126 1156 * brought back to life (onlined or otherwise). Given
1127 1157 * the rarity of this scenario, and the extra complexity
1128 1158 * it adds, we ignore the possibility.
1129 1159 */
1130 1160 if (!vdev_is_dead(tvd))
1131 1161 spa_spare_activate(tvd);
1132 1162 }
1133 1163
1134 1164 vd->vdev_top = vd;
1135 1165 vd->vdev_aux = &spa->spa_spares;
1136 1166
1137 1167 if (vdev_open(vd) != 0)
1138 1168 continue;
1139 1169
1140 1170 if (vdev_validate_aux(vd) == 0)
1141 1171 spa_spare_add(vd);
1142 1172 }
1143 1173
1144 1174 /*
1145 1175 * Recompute the stashed list of spares, with status information
1146 1176 * this time.
1147 1177 */
1148 1178 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1149 1179 DATA_TYPE_NVLIST_ARRAY) == 0);
1150 1180
1151 1181 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1152 1182 KM_SLEEP);
1153 1183 for (i = 0; i < spa->spa_spares.sav_count; i++)
1154 1184 spares[i] = vdev_config_generate(spa,
1155 1185 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1156 1186 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1157 1187 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1158 1188 for (i = 0; i < spa->spa_spares.sav_count; i++)
1159 1189 nvlist_free(spares[i]);
1160 1190 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1161 1191 }
1162 1192
1163 1193 /*
1164 1194 * Load (or re-load) the current list of vdevs describing the active l2cache for
1165 1195 * this pool. When this is called, we have some form of basic information in
1166 1196 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
1167 1197 * then re-generate a more complete list including status information.
1168 1198 * Devices which are already active have their details maintained, and are
1169 1199 * not re-opened.
1170 1200 */
1171 1201 static void
1172 1202 spa_load_l2cache(spa_t *spa)
1173 1203 {
1174 1204 nvlist_t **l2cache;
1175 1205 uint_t nl2cache;
1176 1206 int i, j, oldnvdevs;
1177 1207 uint64_t guid;
1178 1208 vdev_t *vd, **oldvdevs, **newvdevs;
1179 1209 spa_aux_vdev_t *sav = &spa->spa_l2cache;
1180 1210
1181 1211 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1182 1212
1183 1213 if (sav->sav_config != NULL) {
1184 1214 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1185 1215 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1186 1216 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1187 1217 } else {
1188 1218 nl2cache = 0;
1189 1219 }
1190 1220
1191 1221 oldvdevs = sav->sav_vdevs;
1192 1222 oldnvdevs = sav->sav_count;
1193 1223 sav->sav_vdevs = NULL;
1194 1224 sav->sav_count = 0;
1195 1225
1196 1226 /*
1197 1227 * Process new nvlist of vdevs.
1198 1228 */
1199 1229 for (i = 0; i < nl2cache; i++) {
1200 1230 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1201 1231 &guid) == 0);
1202 1232
1203 1233 newvdevs[i] = NULL;
1204 1234 for (j = 0; j < oldnvdevs; j++) {
1205 1235 vd = oldvdevs[j];
1206 1236 if (vd != NULL && guid == vd->vdev_guid) {
1207 1237 /*
1208 1238 * Retain previous vdev for add/remove ops.
1209 1239 */
1210 1240 newvdevs[i] = vd;
1211 1241 oldvdevs[j] = NULL;
1212 1242 break;
1213 1243 }
1214 1244 }
1215 1245
1216 1246 if (newvdevs[i] == NULL) {
1217 1247 /*
1218 1248 * Create new vdev
1219 1249 */
1220 1250 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1221 1251 VDEV_ALLOC_L2CACHE) == 0);
1222 1252 ASSERT(vd != NULL);
1223 1253 newvdevs[i] = vd;
1224 1254
1225 1255 /*
1226 1256 * Commit this vdev as an l2cache device,
1227 1257 * even if it fails to open.
1228 1258 */
1229 1259 spa_l2cache_add(vd);
1230 1260
1231 1261 vd->vdev_top = vd;
1232 1262 vd->vdev_aux = sav;
1233 1263
1234 1264 spa_l2cache_activate(vd);
1235 1265
1236 1266 if (vdev_open(vd) != 0)
1237 1267 continue;
1238 1268
1239 1269 (void) vdev_validate_aux(vd);
1240 1270
1241 1271 if (!vdev_is_dead(vd))
1242 1272 l2arc_add_vdev(spa, vd);
1243 1273 }
1244 1274 }
1245 1275
1246 1276 /*
1247 1277 * Purge vdevs that were dropped
1248 1278 */
1249 1279 for (i = 0; i < oldnvdevs; i++) {
1250 1280 uint64_t pool;
1251 1281
1252 1282 vd = oldvdevs[i];
1253 1283 if (vd != NULL) {
1254 1284 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1255 1285 pool != 0ULL && l2arc_vdev_present(vd))
1256 1286 l2arc_remove_vdev(vd);
1257 1287 (void) vdev_close(vd);
1258 1288 spa_l2cache_remove(vd);
1259 1289 }
1260 1290 }
1261 1291
1262 1292 if (oldvdevs)
1263 1293 kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1264 1294
1265 1295 if (sav->sav_config == NULL)
1266 1296 goto out;
1267 1297
1268 1298 sav->sav_vdevs = newvdevs;
1269 1299 sav->sav_count = (int)nl2cache;
1270 1300
1271 1301 /*
1272 1302 * Recompute the stashed list of l2cache devices, with status
1273 1303 * information this time.
1274 1304 */
1275 1305 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1276 1306 DATA_TYPE_NVLIST_ARRAY) == 0);
1277 1307
1278 1308 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1279 1309 for (i = 0; i < sav->sav_count; i++)
1280 1310 l2cache[i] = vdev_config_generate(spa,
1281 1311 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1282 1312 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1283 1313 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1284 1314 out:
1285 1315 for (i = 0; i < sav->sav_count; i++)
1286 1316 nvlist_free(l2cache[i]);
1287 1317 if (sav->sav_count)
1288 1318 kmem_free(l2cache, sav->sav_count * sizeof (void *));
1289 1319 }
1290 1320
1291 1321 static int
1292 1322 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1293 1323 {
1294 1324 dmu_buf_t *db;
1295 1325 char *packed = NULL;
1296 1326 size_t nvsize = 0;
1297 1327 int error;
1298 1328 *value = NULL;
1299 1329
1300 1330 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1301 1331 nvsize = *(uint64_t *)db->db_data;
1302 1332 dmu_buf_rele(db, FTAG);
1303 1333
1304 1334 packed = kmem_alloc(nvsize, KM_SLEEP);
1305 1335 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1306 1336 DMU_READ_PREFETCH);
1307 1337 if (error == 0)
1308 1338 error = nvlist_unpack(packed, nvsize, value, 0);
1309 1339 kmem_free(packed, nvsize);
1310 1340
1311 1341 return (error);
1312 1342 }
1313 1343
1314 1344 /*
1315 1345 * Checks to see if the given vdev could not be opened, in which case we post a
1316 1346 * sysevent to notify the autoreplace code that the device has been removed.
1317 1347 */
1318 1348 static void
1319 1349 spa_check_removed(vdev_t *vd)
1320 1350 {
1321 1351 for (int c = 0; c < vd->vdev_children; c++)
1322 1352 spa_check_removed(vd->vdev_child[c]);
1323 1353
1324 1354 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1325 1355 zfs_post_autoreplace(vd->vdev_spa, vd);
1326 1356 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1327 1357 }
1328 1358 }
1329 1359
1330 1360 /*
1331 1361 * Validate the current config against the MOS config
1332 1362 */
1333 1363 static boolean_t
1334 1364 spa_config_valid(spa_t *spa, nvlist_t *config)
1335 1365 {
1336 1366 vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1337 1367 nvlist_t *nv;
1338 1368
1339 1369 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1340 1370
1341 1371 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1342 1372 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1343 1373
1344 1374 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1345 1375
1346 1376 /*
1347 1377 * If we're doing a normal import, then build up any additional
1348 1378 * diagnostic information about missing devices in this config.
1349 1379 * We'll pass this up to the user for further processing.
1350 1380 */
1351 1381 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1352 1382 nvlist_t **child, *nv;
1353 1383 uint64_t idx = 0;
1354 1384
1355 1385 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1356 1386 KM_SLEEP);
1357 1387 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1358 1388
1359 1389 for (int c = 0; c < rvd->vdev_children; c++) {
1360 1390 vdev_t *tvd = rvd->vdev_child[c];
1361 1391 vdev_t *mtvd = mrvd->vdev_child[c];
1362 1392
1363 1393 if (tvd->vdev_ops == &vdev_missing_ops &&
1364 1394 mtvd->vdev_ops != &vdev_missing_ops &&
1365 1395 mtvd->vdev_islog)
1366 1396 child[idx++] = vdev_config_generate(spa, mtvd,
1367 1397 B_FALSE, 0);
1368 1398 }
1369 1399
1370 1400 if (idx) {
1371 1401 VERIFY(nvlist_add_nvlist_array(nv,
1372 1402 ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1373 1403 VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1374 1404 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1375 1405
1376 1406 for (int i = 0; i < idx; i++)
1377 1407 nvlist_free(child[i]);
1378 1408 }
1379 1409 nvlist_free(nv);
1380 1410 kmem_free(child, rvd->vdev_children * sizeof (char **));
1381 1411 }
1382 1412
1383 1413 /*
1384 1414 * Compare the root vdev tree with the information we have
1385 1415 * from the MOS config (mrvd). Check each top-level vdev
1386 1416 * with the corresponding MOS config top-level (mtvd).
1387 1417 */
1388 1418 for (int c = 0; c < rvd->vdev_children; c++) {
1389 1419 vdev_t *tvd = rvd->vdev_child[c];
1390 1420 vdev_t *mtvd = mrvd->vdev_child[c];
1391 1421
1392 1422 /*
1393 1423 * Resolve any "missing" vdevs in the current configuration.
1394 1424 * If we find that the MOS config has more accurate information
1395 1425 * about the top-level vdev then use that vdev instead.
1396 1426 */
1397 1427 if (tvd->vdev_ops == &vdev_missing_ops &&
1398 1428 mtvd->vdev_ops != &vdev_missing_ops) {
1399 1429
1400 1430 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1401 1431 continue;
1402 1432
1403 1433 /*
1404 1434 * Device specific actions.
1405 1435 */
1406 1436 if (mtvd->vdev_islog) {
1407 1437 spa_set_log_state(spa, SPA_LOG_CLEAR);
1408 1438 } else {
1409 1439 /*
1410 1440 * XXX - once we have 'readonly' pool
1411 1441 * support we should be able to handle
1412 1442 * missing data devices by transitioning
1413 1443 * the pool to readonly.
1414 1444 */
1415 1445 continue;
1416 1446 }
1417 1447
1418 1448 /*
1419 1449 * Swap the missing vdev with the data we were
1420 1450 * able to obtain from the MOS config.
1421 1451 */
1422 1452 vdev_remove_child(rvd, tvd);
1423 1453 vdev_remove_child(mrvd, mtvd);
1424 1454
1425 1455 vdev_add_child(rvd, mtvd);
1426 1456 vdev_add_child(mrvd, tvd);
1427 1457
1428 1458 spa_config_exit(spa, SCL_ALL, FTAG);
1429 1459 vdev_load(mtvd);
1430 1460 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1431 1461
1432 1462 vdev_reopen(rvd);
1433 1463 } else if (mtvd->vdev_islog) {
1434 1464 /*
1435 1465 * Load the slog device's state from the MOS config
1436 1466 * since it's possible that the label does not
1437 1467 * contain the most up-to-date information.
1438 1468 */
1439 1469 vdev_load_log_state(tvd, mtvd);
1440 1470 vdev_reopen(tvd);
1441 1471 }
1442 1472 }
1443 1473 vdev_free(mrvd);
1444 1474 spa_config_exit(spa, SCL_ALL, FTAG);
1445 1475
1446 1476 /*
1447 1477 * Ensure we were able to validate the config.
1448 1478 */
1449 1479 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1450 1480 }
1451 1481
1452 1482 /*
1453 1483 * Check for missing log devices
1454 1484 */
1455 1485 static int
1456 1486 spa_check_logs(spa_t *spa)
1457 1487 {
1458 1488 switch (spa->spa_log_state) {
1459 1489 case SPA_LOG_MISSING:
1460 1490 /* need to recheck in case slog has been restored */
1461 1491 case SPA_LOG_UNKNOWN:
1462 1492 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1463 1493 DS_FIND_CHILDREN)) {
1464 1494 spa_set_log_state(spa, SPA_LOG_MISSING);
1465 1495 return (1);
1466 1496 }
1467 1497 break;
1468 1498 }
1469 1499 return (0);
1470 1500 }
1471 1501
1472 1502 static boolean_t
1473 1503 spa_passivate_log(spa_t *spa)
1474 1504 {
1475 1505 vdev_t *rvd = spa->spa_root_vdev;
1476 1506 boolean_t slog_found = B_FALSE;
1477 1507
1478 1508 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1479 1509
1480 1510 if (!spa_has_slogs(spa))
1481 1511 return (B_FALSE);
1482 1512
1483 1513 for (int c = 0; c < rvd->vdev_children; c++) {
1484 1514 vdev_t *tvd = rvd->vdev_child[c];
1485 1515 metaslab_group_t *mg = tvd->vdev_mg;
1486 1516
1487 1517 if (tvd->vdev_islog) {
1488 1518 metaslab_group_passivate(mg);
1489 1519 slog_found = B_TRUE;
1490 1520 }
1491 1521 }
1492 1522
1493 1523 return (slog_found);
1494 1524 }
1495 1525
1496 1526 static void
1497 1527 spa_activate_log(spa_t *spa)
1498 1528 {
1499 1529 vdev_t *rvd = spa->spa_root_vdev;
1500 1530
1501 1531 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1502 1532
1503 1533 for (int c = 0; c < rvd->vdev_children; c++) {
1504 1534 vdev_t *tvd = rvd->vdev_child[c];
1505 1535 metaslab_group_t *mg = tvd->vdev_mg;
1506 1536
1507 1537 if (tvd->vdev_islog)
1508 1538 metaslab_group_activate(mg);
1509 1539 }
1510 1540 }
1511 1541
1512 1542 int
1513 1543 spa_offline_log(spa_t *spa)
1514 1544 {
1515 1545 int error = 0;
1516 1546
1517 1547 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1518 1548 NULL, DS_FIND_CHILDREN)) == 0) {
1519 1549
1520 1550 /*
1521 1551 * We successfully offlined the log device, sync out the
1522 1552 * current txg so that the "stubby" block can be removed
1523 1553 * by zil_sync().
1524 1554 */
1525 1555 txg_wait_synced(spa->spa_dsl_pool, 0);
1526 1556 }
1527 1557 return (error);
1528 1558 }
1529 1559
1530 1560 static void
1531 1561 spa_aux_check_removed(spa_aux_vdev_t *sav)
1532 1562 {
1533 1563 for (int i = 0; i < sav->sav_count; i++)
1534 1564 spa_check_removed(sav->sav_vdevs[i]);
1535 1565 }
1536 1566
1537 1567 void
1538 1568 spa_claim_notify(zio_t *zio)
1539 1569 {
1540 1570 spa_t *spa = zio->io_spa;
1541 1571
1542 1572 if (zio->io_error)
1543 1573 return;
1544 1574
1545 1575 mutex_enter(&spa->spa_props_lock); /* any mutex will do */
1546 1576 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1547 1577 spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1548 1578 mutex_exit(&spa->spa_props_lock);
1549 1579 }
1550 1580
1551 1581 typedef struct spa_load_error {
1552 1582 uint64_t sle_meta_count;
1553 1583 uint64_t sle_data_count;
1554 1584 } spa_load_error_t;
1555 1585
1556 1586 static void
1557 1587 spa_load_verify_done(zio_t *zio)
1558 1588 {
1559 1589 blkptr_t *bp = zio->io_bp;
1560 1590 spa_load_error_t *sle = zio->io_private;
1561 1591 dmu_object_type_t type = BP_GET_TYPE(bp);
1562 1592 int error = zio->io_error;
1563 1593
1564 1594 if (error) {
1565 1595 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
1566 1596 type != DMU_OT_INTENT_LOG)
1567 1597 atomic_add_64(&sle->sle_meta_count, 1);
1568 1598 else
1569 1599 atomic_add_64(&sle->sle_data_count, 1);
1570 1600 }
1571 1601 zio_data_buf_free(zio->io_data, zio->io_size);
1572 1602 }
1573 1603
1574 1604 /*ARGSUSED*/
1575 1605 static int
1576 1606 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1577 1607 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1578 1608 {
1579 1609 if (bp != NULL) {
1580 1610 zio_t *rio = arg;
1581 1611 size_t size = BP_GET_PSIZE(bp);
1582 1612 void *data = zio_data_buf_alloc(size);
1583 1613
1584 1614 zio_nowait(zio_read(rio, spa, bp, data, size,
1585 1615 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1586 1616 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1587 1617 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1588 1618 }
1589 1619 return (0);
1590 1620 }
1591 1621
1592 1622 static int
1593 1623 spa_load_verify(spa_t *spa)
1594 1624 {
1595 1625 zio_t *rio;
1596 1626 spa_load_error_t sle = { 0 };
1597 1627 zpool_rewind_policy_t policy;
1598 1628 boolean_t verify_ok = B_FALSE;
1599 1629 int error;
1600 1630
1601 1631 zpool_get_rewind_policy(spa->spa_config, &policy);
1602 1632
1603 1633 if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1604 1634 return (0);
1605 1635
1606 1636 rio = zio_root(spa, NULL, &sle,
1607 1637 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1608 1638
1609 1639 error = traverse_pool(spa, spa->spa_verify_min_txg,
1610 1640 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1611 1641
1612 1642 (void) zio_wait(rio);
1613 1643
1614 1644 spa->spa_load_meta_errors = sle.sle_meta_count;
1615 1645 spa->spa_load_data_errors = sle.sle_data_count;
1616 1646
1617 1647 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1618 1648 sle.sle_data_count <= policy.zrp_maxdata) {
1619 1649 int64_t loss = 0;
1620 1650
1621 1651 verify_ok = B_TRUE;
1622 1652 spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1623 1653 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1624 1654
1625 1655 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1626 1656 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1627 1657 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1628 1658 VERIFY(nvlist_add_int64(spa->spa_load_info,
1629 1659 ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1630 1660 VERIFY(nvlist_add_uint64(spa->spa_load_info,
1631 1661 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1632 1662 } else {
1633 1663 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1634 1664 }
1635 1665
1636 1666 if (error) {
1637 1667 if (error != ENXIO && error != EIO)
1638 1668 error = EIO;
1639 1669 return (error);
1640 1670 }
1641 1671
1642 1672 return (verify_ok ? 0 : EIO);
1643 1673 }
1644 1674
1645 1675 /*
1646 1676 * Find a value in the pool props object.
1647 1677 */
1648 1678 static void
1649 1679 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1650 1680 {
1651 1681 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1652 1682 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1653 1683 }
1654 1684
1655 1685 /*
1656 1686 * Find a value in the pool directory object.
1657 1687 */
1658 1688 static int
1659 1689 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1660 1690 {
1661 1691 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1662 1692 name, sizeof (uint64_t), 1, val));
1663 1693 }
1664 1694
1665 1695 static int
1666 1696 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1667 1697 {
1668 1698 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1669 1699 return (err);
1670 1700 }
1671 1701
1672 1702 /*
1673 1703 * Fix up config after a partly-completed split. This is done with the
1674 1704 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
1675 1705 * pool have that entry in their config, but only the splitting one contains
1676 1706 * a list of all the guids of the vdevs that are being split off.
1677 1707 *
1678 1708 * This function determines what to do with that list: either rejoin
1679 1709 * all the disks to the pool, or complete the splitting process. To attempt
1680 1710 * the rejoin, each disk that is offlined is marked online again, and
1681 1711 * we do a reopen() call. If the vdev label for every disk that was
1682 1712 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1683 1713 * then we call vdev_split() on each disk, and complete the split.
1684 1714 *
1685 1715 * Otherwise we leave the config alone, with all the vdevs in place in
1686 1716 * the original pool.
1687 1717 */
1688 1718 static void
1689 1719 spa_try_repair(spa_t *spa, nvlist_t *config)
1690 1720 {
1691 1721 uint_t extracted;
1692 1722 uint64_t *glist;
1693 1723 uint_t i, gcount;
1694 1724 nvlist_t *nvl;
1695 1725 vdev_t **vd;
1696 1726 boolean_t attempt_reopen;
1697 1727
1698 1728 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1699 1729 return;
1700 1730
1701 1731 /* check that the config is complete */
1702 1732 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1703 1733 &glist, &gcount) != 0)
1704 1734 return;
1705 1735
1706 1736 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1707 1737
1708 1738 /* attempt to online all the vdevs & validate */
1709 1739 attempt_reopen = B_TRUE;
1710 1740 for (i = 0; i < gcount; i++) {
1711 1741 if (glist[i] == 0) /* vdev is hole */
1712 1742 continue;
1713 1743
1714 1744 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1715 1745 if (vd[i] == NULL) {
1716 1746 /*
1717 1747 * Don't bother attempting to reopen the disks;
1718 1748 * just do the split.
1719 1749 */
1720 1750 attempt_reopen = B_FALSE;
1721 1751 } else {
1722 1752 /* attempt to re-online it */
1723 1753 vd[i]->vdev_offline = B_FALSE;
1724 1754 }
1725 1755 }
1726 1756
1727 1757 if (attempt_reopen) {
1728 1758 vdev_reopen(spa->spa_root_vdev);
1729 1759
1730 1760 /* check each device to see what state it's in */
1731 1761 for (extracted = 0, i = 0; i < gcount; i++) {
1732 1762 if (vd[i] != NULL &&
1733 1763 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1734 1764 break;
1735 1765 ++extracted;
1736 1766 }
1737 1767 }
1738 1768
1739 1769 /*
1740 1770 * If every disk has been moved to the new pool, or if we never
1741 1771 * even attempted to look at them, then we split them off for
1742 1772 * good.
1743 1773 */
1744 1774 if (!attempt_reopen || gcount == extracted) {
1745 1775 for (i = 0; i < gcount; i++)
1746 1776 if (vd[i] != NULL)
1747 1777 vdev_split(vd[i]);
1748 1778 vdev_reopen(spa->spa_root_vdev);
1749 1779 }
1750 1780
1751 1781 kmem_free(vd, gcount * sizeof (vdev_t *));
1752 1782 }
1753 1783
1754 1784 static int
1755 1785 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1756 1786 boolean_t mosconfig)
1757 1787 {
1758 1788 nvlist_t *config = spa->spa_config;
1759 1789 char *ereport = FM_EREPORT_ZFS_POOL;
1790 + char *comment;
1760 1791 int error;
1761 1792 uint64_t pool_guid;
1762 1793 nvlist_t *nvl;
1763 1794
1764 1795 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1765 1796 return (EINVAL);
1766 1797
1798 + ASSERT(spa->spa_comment == NULL);
1799 + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1800 + spa->spa_comment = spa_strdup(comment);
1801 +
1767 1802 /*
1768 1803 * Versioning wasn't explicitly added to the label until later, so if
1769 1804 * it's not present treat it as the initial version.
1770 1805 */
1771 1806 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1772 1807 &spa->spa_ubsync.ub_version) != 0)
1773 1808 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1774 1809
1775 1810 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1776 1811 &spa->spa_config_txg);
1777 1812
1778 1813 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1779 1814 spa_guid_exists(pool_guid, 0)) {
1780 1815 error = EEXIST;
1781 1816 } else {
1782 1817 spa->spa_config_guid = pool_guid;
1783 1818
1784 1819 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1785 1820 &nvl) == 0) {
1786 1821 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1787 1822 KM_SLEEP) == 0);
1788 1823 }
1789 1824
1790 1825 gethrestime(&spa->spa_loaded_ts);
1791 1826 error = spa_load_impl(spa, pool_guid, config, state, type,
1792 1827 mosconfig, &ereport);
1793 1828 }
1794 1829
1795 1830 spa->spa_minref = refcount_count(&spa->spa_refcount);
1796 1831 if (error) {
1797 1832 if (error != EEXIST) {
1798 1833 spa->spa_loaded_ts.tv_sec = 0;
1799 1834 spa->spa_loaded_ts.tv_nsec = 0;
1800 1835 }
1801 1836 if (error != EBADF) {
1802 1837 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1803 1838 }
1804 1839 }
1805 1840 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1806 1841 spa->spa_ena = 0;
1807 1842
1808 1843 return (error);
1809 1844 }
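
The new ZPOOL_CONFIG_COMMENT handling added to spa_load() above is just an optional string lookup in the pool's config nvlist. For comparison, here is a minimal userland sketch of the same lookup pattern; it is an illustration only, assuming ZPOOL_CONFIG_COMMENT expands to the literal string "comment" and using the stock libnvpair API (build with something like cc comment.c -lnvpair).

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *config;
	char *comment;

	/* Build a toy config; a real consumer gets this from the kernel. */
	if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	(void) nvlist_add_string(config, "comment",
	    "scratch pool, ok to destroy");

	/* Same optional-lookup pattern as the new code in spa_load(). */
	if (nvlist_lookup_string(config, "comment", &comment) == 0)
		(void) printf("pool comment: %s\n", comment);

	nvlist_free(config);
	return (0);
}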
1810 1845
1811 1846 /*
1812 1847 * Load an existing storage pool, using the pool's builtin spa_config as a
1813 1848 * source of configuration information.
1814 1849 */
1815 1850 static int
1816 1851 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1817 1852 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1818 1853 char **ereport)
1819 1854 {
1820 1855 int error = 0;
1821 1856 nvlist_t *nvroot = NULL;
1822 1857 vdev_t *rvd;
1823 1858 uberblock_t *ub = &spa->spa_uberblock;
1824 1859 uint64_t children, config_cache_txg = spa->spa_config_txg;
1825 1860 int orig_mode = spa->spa_mode;
1826 1861 int parse;
1827 1862 uint64_t obj;
1828 1863
1829 1864 /*
1830 1865 * If this is an untrusted config, access the pool in read-only mode.
1831 1866 * This prevents things like resilvering recently removed devices.
1832 1867 */
1833 1868 if (!mosconfig)
1834 1869 spa->spa_mode = FREAD;
1835 1870
1836 1871 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1837 1872
1838 1873 spa->spa_load_state = state;
1839 1874
1840 1875 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1841 1876 return (EINVAL);
1842 1877
1843 1878 parse = (type == SPA_IMPORT_EXISTING ?
1844 1879 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1845 1880
1846 1881 /*
1847 1882 * Create "The Godfather" zio to hold all async IOs
1848 1883 */
1849 1884 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1850 1885 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1851 1886
1852 1887 /*
1853 1888 * Parse the configuration into a vdev tree. We explicitly set the
1854 1889 * value that will be returned by spa_version() since parsing the
1855 1890 * configuration requires knowing the version number.
1856 1891 */
1857 1892 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1858 1893 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1859 1894 spa_config_exit(spa, SCL_ALL, FTAG);
1860 1895
1861 1896 if (error != 0)
1862 1897 return (error);
1863 1898
1864 1899 ASSERT(spa->spa_root_vdev == rvd);
1865 1900
1866 1901 if (type != SPA_IMPORT_ASSEMBLE) {
1867 1902 ASSERT(spa_guid(spa) == pool_guid);
1868 1903 }
1869 1904
1870 1905 /*
1871 1906 * Try to open all vdevs, loading each label in the process.
1872 1907 */
1873 1908 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1874 1909 error = vdev_open(rvd);
1875 1910 spa_config_exit(spa, SCL_ALL, FTAG);
1876 1911 if (error != 0)
1877 1912 return (error);
1878 1913
1879 1914 /*
1880 1915 * We need to validate the vdev labels against the configuration that
1881 1916 * we have in hand, which is dependent on the setting of mosconfig. If
1882 1917 * mosconfig is true then we're validating the vdev labels based on
1883 1918 * that config. Otherwise, we're validating against the cached config
1884 1919 * (zpool.cache) that was read when we loaded the zfs module, and then
1885 1920 * later we will recursively call spa_load() and validate against
1886 1921 * the vdev config.
1887 1922 *
1888 1923 * If we're assembling a new pool that's been split off from an
1889 1924 * existing pool, the labels haven't yet been updated so we skip
1890 1925 * validation for now.
1891 1926 */
1892 1927 if (type != SPA_IMPORT_ASSEMBLE) {
1893 1928 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1894 1929 error = vdev_validate(rvd);
1895 1930 spa_config_exit(spa, SCL_ALL, FTAG);
1896 1931
1897 1932 if (error != 0)
1898 1933 return (error);
1899 1934
1900 1935 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1901 1936 return (ENXIO);
1902 1937 }
1903 1938
1904 1939 /*
1905 1940 * Find the best uberblock.
1906 1941 */
1907 1942 vdev_uberblock_load(NULL, rvd, ub);
1908 1943
1909 1944 /*
1910 1945 * If we weren't able to find a single valid uberblock, return failure.
1911 1946 */
1912 1947 if (ub->ub_txg == 0)
1913 1948 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
1914 1949
1915 1950 /*
1916 1951 * If the pool is newer than the code, we can't open it.
1917 1952 */
1918 1953 if (ub->ub_version > SPA_VERSION)
1919 1954 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
1920 1955
1921 1956 /*
1922 1957 * If the vdev guid sum doesn't match the uberblock, we have an
1923 1958 * incomplete configuration. We first check to see if the pool
1924 1959 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
1925 1960 * If it is, defer the vdev_guid_sum check till later so we
1926 1961 * can handle missing vdevs.
1927 1962 */
1928 1963 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1929 1964 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1930 1965 rvd->vdev_guid_sum != ub->ub_guid_sum)
1931 1966 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1932 1967
1933 1968 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1934 1969 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1935 1970 spa_try_repair(spa, config);
1936 1971 spa_config_exit(spa, SCL_ALL, FTAG);
1937 1972 nvlist_free(spa->spa_config_splitting);
1938 1973 spa->spa_config_splitting = NULL;
1939 1974 }
1940 1975
1941 1976 /*
1942 1977 * Initialize internal SPA structures.
1943 1978 */
1944 1979 spa->spa_state = POOL_STATE_ACTIVE;
1945 1980 spa->spa_ubsync = spa->spa_uberblock;
1946 1981 spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1947 1982 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1948 1983 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1949 1984 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
1950 1985 spa->spa_claim_max_txg = spa->spa_first_txg;
1951 1986 spa->spa_prev_software_version = ub->ub_software_version;
1952 1987
1953 1988 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1954 1989 if (error)
1955 1990 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1956 1991 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1957 1992
1958 1993 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
1959 1994 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1960 1995
1961 1996 if (!mosconfig) {
1962 1997 uint64_t hostid;
1963 1998 nvlist_t *policy = NULL, *nvconfig;
1964 1999
1965 2000 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
1966 2001 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
1967 2002
1968 2003 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
1969 2004 ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1970 2005 char *hostname;
1971 2006 unsigned long myhostid = 0;
1972 2007
1973 2008 VERIFY(nvlist_lookup_string(nvconfig,
1974 2009 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1975 2010
1976 2011 #ifdef _KERNEL
1977 2012 myhostid = zone_get_hostid(NULL);
1978 2013 #else /* _KERNEL */
1979 2014 /*
1980 2015 * We're emulating the system's hostid in userland, so
1981 2016 * we can't use zone_get_hostid().
1982 2017 */
1983 2018 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1984 2019 #endif /* _KERNEL */
1985 2020 if (hostid != 0 && myhostid != 0 &&
1986 2021 hostid != myhostid) {
1987 2022 nvlist_free(nvconfig);
1988 2023 cmn_err(CE_WARN, "pool '%s' could not be "
1989 2024 "loaded as it was last accessed by "
1990 2025 "another system (host: %s hostid: 0x%lx). "
1991 2026 "See: http://www.sun.com/msg/ZFS-8000-EY",
1992 2027 spa_name(spa), hostname,
1993 2028 (unsigned long)hostid);
1994 2029 return (EBADF);
1995 2030 }
1996 2031 }
1997 2032 if (nvlist_lookup_nvlist(spa->spa_config,
1998 2033 ZPOOL_REWIND_POLICY, &policy) == 0)
1999 2034 VERIFY(nvlist_add_nvlist(nvconfig,
2000 2035 ZPOOL_REWIND_POLICY, policy) == 0);
2001 2036
2002 2037 spa_config_set(spa, nvconfig);
2003 2038 spa_unload(spa);
2004 2039 spa_deactivate(spa);
2005 2040 spa_activate(spa, orig_mode);
2006 2041
2007 2042 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2008 2043 }
2009 2044
2010 2045 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2011 2046 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2012 2047 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2013 2048 if (error != 0)
2014 2049 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2015 2050
2016 2051 /*
2017 2052 * Load the bit that tells us to use the new accounting function
2018 2053 * (raid-z deflation). If we have an older pool, this will not
2019 2054 * be present.
2020 2055 */
2021 2056 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2022 2057 if (error != 0 && error != ENOENT)
2023 2058 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2024 2059
2025 2060 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2026 2061 &spa->spa_creation_version);
2027 2062 if (error != 0 && error != ENOENT)
2028 2063 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2029 2064
2030 2065 /*
2031 2066 * Load the persistent error log. If we have an older pool, this will
2032 2067 * not be present.
2033 2068 */
2034 2069 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2035 2070 if (error != 0 && error != ENOENT)
2036 2071 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2037 2072
2038 2073 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2039 2074 &spa->spa_errlog_scrub);
2040 2075 if (error != 0 && error != ENOENT)
2041 2076 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2042 2077
2043 2078 /*
2044 2079 * Load the history object. If we have an older pool, this
2045 2080 * will not be present.
2046 2081 */
2047 2082 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2048 2083 if (error != 0 && error != ENOENT)
2049 2084 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2050 2085
2051 2086 /*
2052 2087 * If we're assembling the pool from the split-off vdevs of
2053 2088 * an existing pool, we don't want to attach the spares & cache
2054 2089 * devices.
2055 2090 */
2056 2091
2057 2092 /*
2058 2093 * Load any hot spares for this pool.
2059 2094 */
2060 2095 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2061 2096 if (error != 0 && error != ENOENT)
2062 2097 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2063 2098 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2064 2099 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2065 2100 if (load_nvlist(spa, spa->spa_spares.sav_object,
2066 2101 &spa->spa_spares.sav_config) != 0)
2067 2102 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2068 2103
2069 2104 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2070 2105 spa_load_spares(spa);
2071 2106 spa_config_exit(spa, SCL_ALL, FTAG);
2072 2107 } else if (error == 0) {
2073 2108 spa->spa_spares.sav_sync = B_TRUE;
2074 2109 }
2075 2110
2076 2111 /*
2077 2112 * Load any level 2 ARC devices for this pool.
2078 2113 */
2079 2114 error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2080 2115 &spa->spa_l2cache.sav_object);
2081 2116 if (error != 0 && error != ENOENT)
2082 2117 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2083 2118 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2084 2119 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2085 2120 if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2086 2121 &spa->spa_l2cache.sav_config) != 0)
2087 2122 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2088 2123
2089 2124 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2090 2125 spa_load_l2cache(spa);
2091 2126 spa_config_exit(spa, SCL_ALL, FTAG);
2092 2127 } else if (error == 0) {
2093 2128 spa->spa_l2cache.sav_sync = B_TRUE;
2094 2129 }
2095 2130
2096 2131 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2097 2132
2098 2133 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2099 2134 if (error && error != ENOENT)
2100 2135 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2101 2136
2102 2137 if (error == 0) {
2103 2138 uint64_t autoreplace;
2104 2139
2105 2140 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2106 2141 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2107 2142 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2108 2143 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2109 2144 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2110 2145 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2111 2146 &spa->spa_dedup_ditto);
2112 2147
2113 2148 spa->spa_autoreplace = (autoreplace != 0);
2114 2149 }
2115 2150
2116 2151 /*
2117 2152 * If the 'autoreplace' property is set, then post a resource notifying
2118 2153 * the ZFS DE that it should not issue any faults for unopenable
2119 2154 * devices. We also iterate over the vdevs, and post a sysevent for any
2120 2155 * unopenable vdevs so that the normal autoreplace handler can take
2121 2156 * over.
2122 2157 */
2123 2158 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2124 2159 spa_check_removed(spa->spa_root_vdev);
2125 2160 /*
2126 2161 * For the import case, this is done in spa_import(), because
2127 2162 * at this point we're using the spare definitions from
2128 2163 * the MOS config, not necessarily from the userland config.
2129 2164 */
2130 2165 if (state != SPA_LOAD_IMPORT) {
2131 2166 spa_aux_check_removed(&spa->spa_spares);
2132 2167 spa_aux_check_removed(&spa->spa_l2cache);
2133 2168 }
2134 2169 }
2135 2170
2136 2171 /*
2137 2172 * Load the vdev state for all toplevel vdevs.
2138 2173 */
2139 2174 vdev_load(rvd);
2140 2175
2141 2176 /*
2142 2177 * Propagate the leaf DTLs we just loaded all the way up the tree.
2143 2178 */
2144 2179 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2145 2180 vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2146 2181 spa_config_exit(spa, SCL_ALL, FTAG);
2147 2182
2148 2183 /*
2149 2184 * Load the DDTs (dedup tables).
2150 2185 */
2151 2186 error = ddt_load(spa);
2152 2187 if (error != 0)
2153 2188 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2154 2189
2155 2190 spa_update_dspace(spa);
2156 2191
2157 2192 /*
2158 2193 * Validate the config, using the MOS config to fill in any
2159 2194 * information which might be missing. If we fail to validate
2160 2195 * the config then declare the pool unfit for use. If we're
2161 2196 * assembling a pool from a split, the log is not transferred
2162 2197 * over.
2163 2198 */
2164 2199 if (type != SPA_IMPORT_ASSEMBLE) {
2165 2200 nvlist_t *nvconfig;
2166 2201
2167 2202 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2168 2203 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2169 2204
2170 2205 if (!spa_config_valid(spa, nvconfig)) {
2171 2206 nvlist_free(nvconfig);
2172 2207 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2173 2208 ENXIO));
2174 2209 }
2175 2210 nvlist_free(nvconfig);
2176 2211
2177 2212 /*
2178 2213 * Now that we've validate the config, check the state of the
2179 2214 * root vdev. If it can't be opened, it indicates one or
2180 2215 * more toplevel vdevs are faulted.
2181 2216 */
2182 2217 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2183 2218 return (ENXIO);
2184 2219
2185 2220 if (spa_check_logs(spa)) {
2186 2221 *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2187 2222 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2188 2223 }
2189 2224 }
2190 2225
2191 2226 /*
2192 2227 * We've successfully opened the pool, verify that we're ready
2193 2228 * to start pushing transactions.
2194 2229 */
2195 2230 if (state != SPA_LOAD_TRYIMPORT) {
2196 2231 if (error = spa_load_verify(spa))
2197 2232 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2198 2233 error));
2199 2234 }
2200 2235
2201 2236 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2202 2237 spa->spa_load_max_txg == UINT64_MAX)) {
2203 2238 dmu_tx_t *tx;
2204 2239 int need_update = B_FALSE;
2205 2240
2206 2241 ASSERT(state != SPA_LOAD_TRYIMPORT);
2207 2242
2208 2243 /*
2209 2244 * Claim log blocks that haven't been committed yet.
2210 2245 * This must all happen in a single txg.
2211 2246 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2212 2247 * invoked from zil_claim_log_block()'s i/o done callback.
2213 2248 * Price of rollback is that we abandon the log.
2214 2249 */
2215 2250 spa->spa_claiming = B_TRUE;
2216 2251
2217 2252 tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2218 2253 spa_first_txg(spa));
2219 2254 (void) dmu_objset_find(spa_name(spa),
2220 2255 zil_claim, tx, DS_FIND_CHILDREN);
2221 2256 dmu_tx_commit(tx);
2222 2257
2223 2258 spa->spa_claiming = B_FALSE;
2224 2259
2225 2260 spa_set_log_state(spa, SPA_LOG_GOOD);
2226 2261 spa->spa_sync_on = B_TRUE;
2227 2262 txg_sync_start(spa->spa_dsl_pool);
2228 2263
2229 2264 /*
2230 2265 * Wait for all claims to sync. We sync up to the highest
2231 2266 * claimed log block birth time so that claimed log blocks
2232 2267 * don't appear to be from the future. spa_claim_max_txg
2233 2268 * will have been set for us by either zil_check_log_chain()
2234 2269 * (invoked from spa_check_logs()) or zil_claim() above.
2235 2270 */
2236 2271 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2237 2272
2238 2273 /*
2239 2274 * If the config cache is stale, or we have uninitialized
2240 2275 * metaslabs (see spa_vdev_add()), then update the config.
2241 2276 *
2242 2277 * If this is a verbatim import, trust the current
2243 2278 * in-core spa_config and update the disk labels.
2244 2279 */
2245 2280 if (config_cache_txg != spa->spa_config_txg ||
2246 2281 state == SPA_LOAD_IMPORT ||
2247 2282 state == SPA_LOAD_RECOVER ||
2248 2283 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2249 2284 need_update = B_TRUE;
2250 2285
2251 2286 for (int c = 0; c < rvd->vdev_children; c++)
2252 2287 if (rvd->vdev_child[c]->vdev_ms_array == 0)
2253 2288 need_update = B_TRUE;
2254 2289
2255 2290 /*
2256 2291 * Update the config cache asynchronously in case we're the
2257 2292 * root pool, in which case the config cache isn't writable yet.
2258 2293 */
2259 2294 if (need_update)
2260 2295 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2261 2296
2262 2297 /*
2263 2298 * Check all DTLs to see if anything needs resilvering.
2264 2299 */
2265 2300 if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2266 2301 vdev_resilver_needed(rvd, NULL, NULL))
2267 2302 spa_async_request(spa, SPA_ASYNC_RESILVER);
2268 2303
2269 2304 /*
2270 2305 * Delete any inconsistent datasets.
2271 2306 */
2272 2307 (void) dmu_objset_find(spa_name(spa),
2273 2308 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2274 2309
2275 2310 /*
2276 2311 * Clean up any stale temporary dataset userrefs.
2277 2312 */
2278 2313 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2279 2314 }
2280 2315
2281 2316 return (0);
2282 2317 }
2283 2318
2284 2319 static int
2285 2320 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2286 2321 {
2287 2322 int mode = spa->spa_mode;
2288 2323
2289 2324 spa_unload(spa);
2290 2325 spa_deactivate(spa);
2291 2326
2292 2327 spa->spa_load_max_txg--;
2293 2328
2294 2329 spa_activate(spa, mode);
2295 2330 spa_async_suspend(spa);
2296 2331
2297 2332 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2298 2333 }
2299 2334
2300 2335 static int
2301 2336 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2302 2337 uint64_t max_request, int rewind_flags)
2303 2338 {
2304 2339 nvlist_t *config = NULL;
2305 2340 int load_error, rewind_error;
2306 2341 uint64_t safe_rewind_txg;
2307 2342 uint64_t min_txg;
2308 2343
2309 2344 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2310 2345 spa->spa_load_max_txg = spa->spa_load_txg;
2311 2346 spa_set_log_state(spa, SPA_LOG_CLEAR);
2312 2347 } else {
2313 2348 spa->spa_load_max_txg = max_request;
2314 2349 }
2315 2350
2316 2351 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2317 2352 mosconfig);
2318 2353 if (load_error == 0)
2319 2354 return (0);
2320 2355
2321 2356 if (spa->spa_root_vdev != NULL)
2322 2357 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2323 2358
2324 2359 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2325 2360 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2326 2361
2327 2362 if (rewind_flags & ZPOOL_NEVER_REWIND) {
2328 2363 nvlist_free(config);
2329 2364 return (load_error);
2330 2365 }
2331 2366
2332 2367 /* Price of rolling back is discarding txgs, including log */
2333 2368 if (state == SPA_LOAD_RECOVER)
2334 2369 spa_set_log_state(spa, SPA_LOG_CLEAR);
2335 2370
2336 2371 spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2337 2372 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2338 2373 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2339 2374 TXG_INITIAL : safe_rewind_txg;
2340 2375
2341 2376 /*
2342 2377 * Continue as long as we're finding errors, we're still within
2343 2378 * the acceptable rewind range, and we're still finding uberblocks
2344 2379 */
2345 2380 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2346 2381 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2347 2382 if (spa->spa_load_max_txg < safe_rewind_txg)
2348 2383 spa->spa_extreme_rewind = B_TRUE;
2349 2384 rewind_error = spa_load_retry(spa, state, mosconfig);
2350 2385 }
2351 2386
2352 2387 spa->spa_extreme_rewind = B_FALSE;
2353 2388 spa->spa_load_max_txg = UINT64_MAX;
2354 2389
2355 2390 if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2356 2391 spa_config_set(spa, config);
2357 2392
2358 2393 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
2359 2394 }
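
The rewind logic in spa_load_best() is easier to see in isolation: keep lowering the maximum allowed txg and retrying while the load keeps failing and the txg stays inside the acceptable window. Below is a standalone sketch of just that control flow; the try_load() stub and the txg values are invented for illustration, whereas the real code retries through spa_load_retry() and bounds the search with safe_rewind_txg.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for a load attempt: txgs above 95 are "damaged". */
static int
try_load(uint64_t txg)
{
	return (txg > 95 ? -1 : 0);	/* nonzero means the load failed */
}

int
main(void)
{
	uint64_t max_txg = 100;		/* most recent uberblock txg */
	uint64_t min_txg = 90;		/* analogue of safe_rewind_txg */
	uint64_t txg = max_txg;
	int error = try_load(txg);

	/* Keep rewinding while loads fail and we stay inside the window. */
	while (error != 0 && txg > min_txg) {
		txg--;
		error = try_load(txg);
	}
	(void) printf("load %s at txg %llu\n",
	    error ? "failed" : "succeeded", (unsigned long long)txg);
	return (error ? 1 : 0);
}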
2360 2395
2361 2396 /*
2362 2397 * Pool Open/Import
2363 2398 *
2364 2399 * The import case is identical to an open except that the configuration is sent
2365 2400 * down from userland, instead of grabbed from the configuration cache. For the
2366 2401 * case of an open, the pool configuration will exist in the
2367 2402 * POOL_STATE_UNINITIALIZED state.
2368 2403 *
2369 2404 * The stats information (gen/count/ustats) is used to gather vdev statistics at
2370 2405 * the same time we open the pool, without having to keep around the spa_t in some
2371 2406 * ambiguous state.
2372 2407 */
2373 2408 static int
2374 2409 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2375 2410 nvlist_t **config)
2376 2411 {
2377 2412 spa_t *spa;
2378 2413 spa_load_state_t state = SPA_LOAD_OPEN;
2379 2414 int error;
2380 2415 int locked = B_FALSE;
2381 2416
2382 2417 *spapp = NULL;
2383 2418
2384 2419 /*
2385 2420 * As disgusting as this is, we need to support recursive calls to this
2386 2421 * function because dsl_dir_open() is called during spa_load(), and ends
2387 2422 * up calling spa_open() again. The real fix is to figure out how to
2388 2423 * avoid dsl_dir_open() calling this in the first place.
2389 2424 */
2390 2425 if (mutex_owner(&spa_namespace_lock) != curthread) {
2391 2426 mutex_enter(&spa_namespace_lock);
2392 2427 locked = B_TRUE;
2393 2428 }
2394 2429
2395 2430 if ((spa = spa_lookup(pool)) == NULL) {
2396 2431 if (locked)
2397 2432 mutex_exit(&spa_namespace_lock);
2398 2433 return (ENOENT);
2399 2434 }
2400 2435
2401 2436 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2402 2437 zpool_rewind_policy_t policy;
2403 2438
2404 2439 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2405 2440 &policy);
2406 2441 if (policy.zrp_request & ZPOOL_DO_REWIND)
2407 2442 state = SPA_LOAD_RECOVER;
2408 2443
2409 2444 spa_activate(spa, spa_mode_global);
2410 2445
2411 2446 if (state != SPA_LOAD_RECOVER)
2412 2447 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2413 2448
2414 2449 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2415 2450 policy.zrp_request);
2416 2451
2417 2452 if (error == EBADF) {
2418 2453 /*
2419 2454 * If vdev_validate() returns failure (indicated by
2420 2455 * EBADF), it indicates that one of the vdevs indicates
2421 2456 * that the pool has been exported or destroyed. If
2422 2457 * this is the case, the config cache is out of sync and
2423 2458 * we should remove the pool from the namespace.
2424 2459 */
2425 2460 spa_unload(spa);
2426 2461 spa_deactivate(spa);
2427 2462 spa_config_sync(spa, B_TRUE, B_TRUE);
2428 2463 spa_remove(spa);
2429 2464 if (locked)
2430 2465 mutex_exit(&spa_namespace_lock);
2431 2466 return (ENOENT);
2432 2467 }
2433 2468
2434 2469 if (error) {
2435 2470 /*
2436 2471 * We can't open the pool, but we still have useful
2437 2472 * information: the state of each vdev after the
2438 2473 * attempted vdev_open(). Return this to the user.
2439 2474 */
2440 2475 if (config != NULL && spa->spa_config) {
2441 2476 VERIFY(nvlist_dup(spa->spa_config, config,
2442 2477 KM_SLEEP) == 0);
2443 2478 VERIFY(nvlist_add_nvlist(*config,
2444 2479 ZPOOL_CONFIG_LOAD_INFO,
2445 2480 spa->spa_load_info) == 0);
2446 2481 }
2447 2482 spa_unload(spa);
2448 2483 spa_deactivate(spa);
2449 2484 spa->spa_last_open_failed = error;
2450 2485 if (locked)
2451 2486 mutex_exit(&spa_namespace_lock);
2452 2487 *spapp = NULL;
2453 2488 return (error);
2454 2489 }
2455 2490 }
2456 2491
2457 2492 spa_open_ref(spa, tag);
2458 2493
2459 2494 if (config != NULL)
2460 2495 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2461 2496
2462 2497 /*
2463 2498 * If we've recovered the pool, pass back any information we
2464 2499 * gathered while doing the load.
2465 2500 */
2466 2501 if (state == SPA_LOAD_RECOVER) {
2467 2502 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2468 2503 spa->spa_load_info) == 0);
2469 2504 }
2470 2505
2471 2506 if (locked) {
2472 2507 spa->spa_last_open_failed = 0;
2473 2508 spa->spa_last_ubsync_txg = 0;
2474 2509 spa->spa_load_txg = 0;
2475 2510 mutex_exit(&spa_namespace_lock);
2476 2511 }
2477 2512
2478 2513 *spapp = spa;
2479 2514
2480 2515 return (0);
2481 2516 }
2482 2517
2483 2518 int
2484 2519 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2485 2520 nvlist_t **config)
2486 2521 {
2487 2522 return (spa_open_common(name, spapp, tag, policy, config));
2488 2523 }
2489 2524
2490 2525 int
2491 2526 spa_open(const char *name, spa_t **spapp, void *tag)
2492 2527 {
2493 2528 return (spa_open_common(name, spapp, tag, NULL, NULL));
2494 2529 }
2495 2530
2496 2531 /*
2497 2532 * Lookup the given spa_t, incrementing the inject count in the process,
2498 2533 * preventing it from being exported or destroyed.
2499 2534 */
2500 2535 spa_t *
2501 2536 spa_inject_addref(char *name)
2502 2537 {
2503 2538 spa_t *spa;
2504 2539
2505 2540 mutex_enter(&spa_namespace_lock);
2506 2541 if ((spa = spa_lookup(name)) == NULL) {
2507 2542 mutex_exit(&spa_namespace_lock);
2508 2543 return (NULL);
2509 2544 }
2510 2545 spa->spa_inject_ref++;
2511 2546 mutex_exit(&spa_namespace_lock);
2512 2547
2513 2548 return (spa);
2514 2549 }
2515 2550
2516 2551 void
2517 2552 spa_inject_delref(spa_t *spa)
2518 2553 {
2519 2554 mutex_enter(&spa_namespace_lock);
2520 2555 spa->spa_inject_ref--;
2521 2556 mutex_exit(&spa_namespace_lock);
2522 2557 }
2523 2558
2524 2559 /*
2525 2560 * Add spares device information to the nvlist.
2526 2561 */
2527 2562 static void
2528 2563 spa_add_spares(spa_t *spa, nvlist_t *config)
2529 2564 {
2530 2565 nvlist_t **spares;
2531 2566 uint_t i, nspares;
2532 2567 nvlist_t *nvroot;
2533 2568 uint64_t guid;
2534 2569 vdev_stat_t *vs;
2535 2570 uint_t vsc;
2536 2571 uint64_t pool;
2537 2572
2538 2573 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2539 2574
2540 2575 if (spa->spa_spares.sav_count == 0)
2541 2576 return;
2542 2577
2543 2578 VERIFY(nvlist_lookup_nvlist(config,
2544 2579 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2545 2580 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2546 2581 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2547 2582 if (nspares != 0) {
2548 2583 VERIFY(nvlist_add_nvlist_array(nvroot,
2549 2584 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2550 2585 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2551 2586 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2552 2587
2553 2588 /*
2554 2589 * Go through and find any spares which have since been
2555 2590 * repurposed as an active spare. If this is the case, update
2556 2591 * their status appropriately.
2557 2592 */
2558 2593 for (i = 0; i < nspares; i++) {
2559 2594 VERIFY(nvlist_lookup_uint64(spares[i],
2560 2595 ZPOOL_CONFIG_GUID, &guid) == 0);
2561 2596 if (spa_spare_exists(guid, &pool, NULL) &&
2562 2597 pool != 0ULL) {
2563 2598 VERIFY(nvlist_lookup_uint64_array(
2564 2599 spares[i], ZPOOL_CONFIG_VDEV_STATS,
2565 2600 (uint64_t **)&vs, &vsc) == 0);
2566 2601 vs->vs_state = VDEV_STATE_CANT_OPEN;
2567 2602 vs->vs_aux = VDEV_AUX_SPARED;
2568 2603 }
2569 2604 }
2570 2605 }
2571 2606 }
2572 2607
2573 2608 /*
2574 2609 * Add l2cache device information to the nvlist, including vdev stats.
2575 2610 */
2576 2611 static void
2577 2612 spa_add_l2cache(spa_t *spa, nvlist_t *config)
2578 2613 {
2579 2614 nvlist_t **l2cache;
2580 2615 uint_t i, j, nl2cache;
2581 2616 nvlist_t *nvroot;
2582 2617 uint64_t guid;
2583 2618 vdev_t *vd;
2584 2619 vdev_stat_t *vs;
2585 2620 uint_t vsc;
2586 2621
2587 2622 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2588 2623
2589 2624 if (spa->spa_l2cache.sav_count == 0)
2590 2625 return;
2591 2626
2592 2627 VERIFY(nvlist_lookup_nvlist(config,
2593 2628 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2594 2629 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
2595 2630 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2596 2631 if (nl2cache != 0) {
2597 2632 VERIFY(nvlist_add_nvlist_array(nvroot,
2598 2633 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2599 2634 VERIFY(nvlist_lookup_nvlist_array(nvroot,
2600 2635 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2601 2636
2602 2637 /*
2603 2638 * Update level 2 cache device stats.
2604 2639 */
2605 2640
2606 2641 for (i = 0; i < nl2cache; i++) {
2607 2642 VERIFY(nvlist_lookup_uint64(l2cache[i],
2608 2643 ZPOOL_CONFIG_GUID, &guid) == 0);
2609 2644
2610 2645 vd = NULL;
2611 2646 for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2612 2647 if (guid ==
2613 2648 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2614 2649 vd = spa->spa_l2cache.sav_vdevs[j];
2615 2650 break;
2616 2651 }
2617 2652 }
2618 2653 ASSERT(vd != NULL);
2619 2654
2620 2655 VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2621 2656 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2622 2657 == 0);
2623 2658 vdev_get_stats(vd, vs);
2624 2659 }
2625 2660 }
2626 2661 }
2627 2662
2628 2663 int
2629 2664 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
2630 2665 {
2631 2666 int error;
2632 2667 spa_t *spa;
2633 2668
2634 2669 *config = NULL;
2635 2670 error = spa_open_common(name, &spa, FTAG, NULL, config);
2636 2671
2637 2672 if (spa != NULL) {
2638 2673 /*
2639 2674 * This still leaves a window of inconsistency where the spares
2640 2675 * or l2cache devices could change and the config would be
2641 2676 * self-inconsistent.
2642 2677 */
2643 2678 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2644 2679
2645 2680 if (*config != NULL) {
2646 2681 uint64_t loadtimes[2];
2647 2682
2648 2683 loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2649 2684 loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2650 2685 VERIFY(nvlist_add_uint64_array(*config,
2651 2686 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2652 2687
2653 2688 VERIFY(nvlist_add_uint64(*config,
2654 2689 ZPOOL_CONFIG_ERRCOUNT,
2655 2690 spa_get_errlog_size(spa)) == 0);
2656 2691
2657 2692 if (spa_suspended(spa))
2658 2693 VERIFY(nvlist_add_uint64(*config,
2659 2694 ZPOOL_CONFIG_SUSPENDED,
2660 2695 spa->spa_failmode) == 0);
2661 2696
2662 2697 spa_add_spares(spa, *config);
2663 2698 spa_add_l2cache(spa, *config);
2664 2699 }
2665 2700 }
2666 2701
2667 2702 /*
2668 2703 * We want to get the alternate root even for faulted pools, so we cheat
2669 2704 * and call spa_lookup() directly.
2670 2705 */
2671 2706 if (altroot) {
2672 2707 if (spa == NULL) {
2673 2708 mutex_enter(&spa_namespace_lock);
2674 2709 spa = spa_lookup(name);
2675 2710 if (spa)
2676 2711 spa_altroot(spa, altroot, buflen);
2677 2712 else
2678 2713 altroot[0] = '\0';
2679 2714 spa = NULL;
2680 2715 mutex_exit(&spa_namespace_lock);
2681 2716 } else {
2682 2717 spa_altroot(spa, altroot, buflen);
2683 2718 }
2684 2719 }
2685 2720
2686 2721 if (spa != NULL) {
2687 2722 spa_config_exit(spa, SCL_CONFIG, FTAG);
2688 2723 spa_close(spa, FTAG);
2689 2724 }
2690 2725
2691 2726 return (error);
2692 2727 }
2693 2728
2694 2729 /*
2695 2730 * Validate that the auxiliary device array is well formed. We must have an
2696 2731 * array of nvlists, each of which describes a valid leaf vdev. If this is an
2697 2732 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
2698 2733 * specified, as long as they are well-formed.
2699 2734 */
2700 2735 static int
2701 2736 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
2702 2737 spa_aux_vdev_t *sav, const char *config, uint64_t version,
2703 2738 vdev_labeltype_t label)
2704 2739 {
2705 2740 nvlist_t **dev;
2706 2741 uint_t i, ndev;
2707 2742 vdev_t *vd;
2708 2743 int error;
2709 2744
2710 2745 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2711 2746
2712 2747 /*
2713 2748 * It's acceptable to have no devs specified.
2714 2749 */
2715 2750 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
2716 2751 return (0);
2717 2752
2718 2753 if (ndev == 0)
2719 2754 return (EINVAL);
2720 2755
2721 2756 /*
2722 2757 * Make sure the pool is formatted with a version that supports this
2723 2758 * device type.
2724 2759 */
2725 2760 if (spa_version(spa) < version)
2726 2761 return (ENOTSUP);
2727 2762
2728 2763 /*
2729 2764 * Set the pending device list so we correctly handle device in-use
2730 2765 * checking.
2731 2766 */
2732 2767 sav->sav_pending = dev;
2733 2768 sav->sav_npending = ndev;
2734 2769
2735 2770 for (i = 0; i < ndev; i++) {
2736 2771 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
2737 2772 mode)) != 0)
2738 2773 goto out;
2739 2774
2740 2775 if (!vd->vdev_ops->vdev_op_leaf) {
2741 2776 vdev_free(vd);
2742 2777 error = EINVAL;
2743 2778 goto out;
2744 2779 }
2745 2780
2746 2781 /*
2747 2782 * The L2ARC currently only supports disk devices in
2748 2783 * kernel context. For user-level testing, we allow it.
2749 2784 */
2750 2785 #ifdef _KERNEL
2751 2786 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
2752 2787 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
2753 2788 error = ENOTBLK;
2754 2789 goto out;
2755 2790 }
2756 2791 #endif
2757 2792 vd->vdev_top = vd;
2758 2793
2759 2794 if ((error = vdev_open(vd)) == 0 &&
2760 2795 (error = vdev_label_init(vd, crtxg, label)) == 0) {
2761 2796 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
2762 2797 vd->vdev_guid) == 0);
2763 2798 }
2764 2799
2765 2800 vdev_free(vd);
2766 2801
2767 2802 if (error &&
2768 2803 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
2769 2804 goto out;
2770 2805 else
2771 2806 error = 0;
2772 2807 }
2773 2808
2774 2809 out:
2775 2810 sav->sav_pending = NULL;
2776 2811 sav->sav_npending = 0;
2777 2812 return (error);
2778 2813 }
2779 2814
2780 2815 static int
2781 2816 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2782 2817 {
2783 2818 int error;
2784 2819
2785 2820 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2786 2821
2787 2822 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2788 2823 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2789 2824 VDEV_LABEL_SPARE)) != 0) {
2790 2825 return (error);
2791 2826 }
2792 2827
2793 2828 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2794 2829 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2795 2830 VDEV_LABEL_L2CACHE));
2796 2831 }
2797 2832
2798 2833 static void
2799 2834 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2800 2835 const char *config)
2801 2836 {
2802 2837 int i;
2803 2838
2804 2839 if (sav->sav_config != NULL) {
2805 2840 nvlist_t **olddevs;
2806 2841 uint_t oldndevs;
2807 2842 nvlist_t **newdevs;
2808 2843
2809 2844 /*
2810 2845 * Generate new dev list by concatentating with the
2811 2846 * current dev list.
2812 2847 */
2813 2848 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2814 2849 &olddevs, &oldndevs) == 0);
2815 2850
2816 2851 newdevs = kmem_alloc(sizeof (void *) *
2817 2852 (ndevs + oldndevs), KM_SLEEP);
2818 2853 for (i = 0; i < oldndevs; i++)
2819 2854 VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2820 2855 KM_SLEEP) == 0);
2821 2856 for (i = 0; i < ndevs; i++)
2822 2857 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2823 2858 KM_SLEEP) == 0);
2824 2859
2825 2860 VERIFY(nvlist_remove(sav->sav_config, config,
2826 2861 DATA_TYPE_NVLIST_ARRAY) == 0);
2827 2862
2828 2863 VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2829 2864 config, newdevs, ndevs + oldndevs) == 0);
2830 2865 for (i = 0; i < oldndevs + ndevs; i++)
2831 2866 nvlist_free(newdevs[i]);
2832 2867 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2833 2868 } else {
2834 2869 /*
2835 2870 * Generate a new dev list.
2836 2871 */
2837 2872 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2838 2873 KM_SLEEP) == 0);
2839 2874 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2840 2875 devs, ndevs) == 0);
2841 2876 }
2842 2877 }
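
spa_set_aux_vdevs() above is essentially nvlist bookkeeping: look up the existing device array, append the new entries, and write the combined array back under the same name. The userland sketch below shows that concatenate-and-replace pattern with stock libnvpair calls; it is an illustration only, with "spares" and "path" standing in for the corresponding ZPOOL_CONFIG_* strings.

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *sav_config, *olddev, *newdev, *combined[2];
	nvlist_t **olddevs;
	uint_t oldndevs;

	/* Existing config with one "spares" entry. */
	(void) nvlist_alloc(&sav_config, NV_UNIQUE_NAME, 0);
	(void) nvlist_alloc(&olddev, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_string(olddev, "path", "/dev/dsk/c1t0d0s0");
	(void) nvlist_add_nvlist_array(sav_config, "spares", &olddev, 1);

	/* A new device to append. */
	(void) nvlist_alloc(&newdev, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_string(newdev, "path", "/dev/dsk/c1t1d0s0");

	/* Concatenate: copy the old entries, then add the new one. */
	(void) nvlist_lookup_nvlist_array(sav_config, "spares",
	    &olddevs, &oldndevs);
	(void) nvlist_dup(olddevs[0], &combined[0], 0);
	combined[1] = newdev;
	(void) nvlist_remove(sav_config, "spares", DATA_TYPE_NVLIST_ARRAY);
	(void) nvlist_add_nvlist_array(sav_config, "spares", combined, 2);

	nvlist_print(stdout, sav_config);

	/* nvlist_add_nvlist_array() copies, so free our own lists. */
	nvlist_free(combined[0]);
	nvlist_free(newdev);
	nvlist_free(olddev);
	nvlist_free(sav_config);
	return (0);
}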
2843 2878
2844 2879 /*
2845 2880 * Stop and drop level 2 ARC devices
2846 2881 */
2847 2882 void
2848 2883 spa_l2cache_drop(spa_t *spa)
2849 2884 {
2850 2885 vdev_t *vd;
2851 2886 int i;
2852 2887 spa_aux_vdev_t *sav = &spa->spa_l2cache;
2853 2888
2854 2889 for (i = 0; i < sav->sav_count; i++) {
2855 2890 uint64_t pool;
2856 2891
2857 2892 vd = sav->sav_vdevs[i];
2858 2893 ASSERT(vd != NULL);
2859 2894
2860 2895 if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2861 2896 pool != 0ULL && l2arc_vdev_present(vd))
2862 2897 l2arc_remove_vdev(vd);
2863 2898 if (vd->vdev_isl2cache)
2864 2899 spa_l2cache_remove(vd);
2865 2900 vdev_clear_stats(vd);
2866 2901 (void) vdev_close(vd);
2867 2902 }
2868 2903 }
2869 2904
2870 2905 /*
2871 2906 * Pool Creation
2872 2907 */
2873 2908 int
2874 2909 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2875 2910 const char *history_str, nvlist_t *zplprops)
2876 2911 {
2877 2912 spa_t *spa;
2878 2913 char *altroot = NULL;
2879 2914 vdev_t *rvd;
2880 2915 dsl_pool_t *dp;
2881 2916 dmu_tx_t *tx;
2882 2917 int error = 0;
2883 2918 uint64_t txg = TXG_INITIAL;
2884 2919 nvlist_t **spares, **l2cache;
2885 2920 uint_t nspares, nl2cache;
2886 2921 uint64_t version, obj;
2887 2922
2888 2923 /*
2889 2924 * If this pool already exists, return failure.
2890 2925 */
2891 2926 mutex_enter(&spa_namespace_lock);
2892 2927 if (spa_lookup(pool) != NULL) {
2893 2928 mutex_exit(&spa_namespace_lock);
2894 2929 return (EEXIST);
2895 2930 }
2896 2931
2897 2932 /*
2898 2933 * Allocate a new spa_t structure.
2899 2934 */
2900 2935 (void) nvlist_lookup_string(props,
2901 2936 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2902 2937 spa = spa_add(pool, NULL, altroot);
2903 2938 spa_activate(spa, spa_mode_global);
2904 2939
2905 2940 if (props && (error = spa_prop_validate(spa, props))) {
2906 2941 spa_deactivate(spa);
2907 2942 spa_remove(spa);
2908 2943 mutex_exit(&spa_namespace_lock);
2909 2944 return (error);
2910 2945 }
2911 2946
2912 2947 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2913 2948 &version) != 0)
2914 2949 version = SPA_VERSION;
2915 2950 ASSERT(version <= SPA_VERSION);
2916 2951
2917 2952 spa->spa_first_txg = txg;
2918 2953 spa->spa_uberblock.ub_txg = txg - 1;
2919 2954 spa->spa_uberblock.ub_version = version;
2920 2955 spa->spa_ubsync = spa->spa_uberblock;
2921 2956
2922 2957 /*
2923 2958 * Create "The Godfather" zio to hold all async IOs
2924 2959 */
2925 2960 spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2926 2961 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2927 2962
2928 2963 /*
2929 2964 * Create the root vdev.
2930 2965 */
2931 2966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2932 2967
2933 2968 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2934 2969
2935 2970 ASSERT(error != 0 || rvd != NULL);
2936 2971 ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2937 2972
2938 2973 if (error == 0 && !zfs_allocatable_devs(nvroot))
2939 2974 error = EINVAL;
2940 2975
2941 2976 if (error == 0 &&
2942 2977 (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2943 2978 (error = spa_validate_aux(spa, nvroot, txg,
2944 2979 VDEV_ALLOC_ADD)) == 0) {
2945 2980 for (int c = 0; c < rvd->vdev_children; c++) {
2946 2981 vdev_metaslab_set_size(rvd->vdev_child[c]);
2947 2982 vdev_expand(rvd->vdev_child[c], txg);
2948 2983 }
2949 2984 }
2950 2985
2951 2986 spa_config_exit(spa, SCL_ALL, FTAG);
2952 2987
2953 2988 if (error != 0) {
2954 2989 spa_unload(spa);
2955 2990 spa_deactivate(spa);
2956 2991 spa_remove(spa);
2957 2992 mutex_exit(&spa_namespace_lock);
2958 2993 return (error);
2959 2994 }
2960 2995
2961 2996 /*
2962 2997 * Get the list of spares, if specified.
2963 2998 */
2964 2999 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2965 3000 &spares, &nspares) == 0) {
2966 3001 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
2967 3002 KM_SLEEP) == 0);
2968 3003 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2969 3004 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2970 3005 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2971 3006 spa_load_spares(spa);
2972 3007 spa_config_exit(spa, SCL_ALL, FTAG);
2973 3008 spa->spa_spares.sav_sync = B_TRUE;
2974 3009 }
2975 3010
2976 3011 /*
2977 3012 * Get the list of level 2 cache devices, if specified.
2978 3013 */
2979 3014 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2980 3015 &l2cache, &nl2cache) == 0) {
2981 3016 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2982 3017 NV_UNIQUE_NAME, KM_SLEEP) == 0);
2983 3018 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2984 3019 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2985 3020 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2986 3021 spa_load_l2cache(spa);
2987 3022 spa_config_exit(spa, SCL_ALL, FTAG);
2988 3023 spa->spa_l2cache.sav_sync = B_TRUE;
2989 3024 }
2990 3025
2991 3026 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
2992 3027 spa->spa_meta_objset = dp->dp_meta_objset;
2993 3028
2994 3029 /*
2995 3030 * Create DDTs (dedup tables).
2996 3031 */
2997 3032 ddt_create(spa);
2998 3033
2999 3034 spa_update_dspace(spa);
3000 3035
3001 3036 tx = dmu_tx_create_assigned(dp, txg);
3002 3037
3003 3038 /*
3004 3039 * Create the pool config object.
3005 3040 */
3006 3041 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3007 3042 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3008 3043 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3009 3044
3010 3045 if (zap_add(spa->spa_meta_objset,
3011 3046 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3012 3047 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3013 3048 cmn_err(CE_PANIC, "failed to add pool config");
3014 3049 }
3015 3050
3016 3051 if (zap_add(spa->spa_meta_objset,
3017 3052 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3018 3053 sizeof (uint64_t), 1, &version, tx) != 0) {
3019 3054 cmn_err(CE_PANIC, "failed to add pool version");
3020 3055 }
3021 3056
3022 3057 /* Newly created pools with the right version are always deflated. */
3023 3058 if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3024 3059 spa->spa_deflate = TRUE;
3025 3060 if (zap_add(spa->spa_meta_objset,
3026 3061 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3027 3062 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3028 3063 cmn_err(CE_PANIC, "failed to add deflate");
3029 3064 }
3030 3065 }
3031 3066
3032 3067 /*
3033 3068 * Create the deferred-free bpobj. Turn off compression
3034 3069 * because sync-to-convergence takes longer if the blocksize
3035 3070 * keeps changing.
3036 3071 */
3037 3072 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3038 3073 dmu_object_set_compress(spa->spa_meta_objset, obj,
3039 3074 ZIO_COMPRESS_OFF, tx);
3040 3075 if (zap_add(spa->spa_meta_objset,
3041 3076 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3042 3077 sizeof (uint64_t), 1, &obj, tx) != 0) {
3043 3078 cmn_err(CE_PANIC, "failed to add bpobj");
3044 3079 }
3045 3080 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3046 3081 spa->spa_meta_objset, obj));
3047 3082
3048 3083 /*
3049 3084 * Create the pool's history object.
3050 3085 */
3051 3086 if (version >= SPA_VERSION_ZPOOL_HISTORY)
3052 3087 spa_history_create_obj(spa, tx);
3053 3088
3054 3089 /*
3055 3090 * Set pool properties.
3056 3091 */
3057 3092 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3058 3093 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3059 3094 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3060 3095 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3061 3096
3062 3097 if (props != NULL) {
3063 3098 spa_configfile_set(spa, props, B_FALSE);
3064 3099 spa_sync_props(spa, props, tx);
3065 3100 }
3066 3101
3067 3102 dmu_tx_commit(tx);
3068 3103
3069 3104 spa->spa_sync_on = B_TRUE;
3070 3105 txg_sync_start(spa->spa_dsl_pool);
3071 3106
3072 3107 /*
3073 3108 * We explicitly wait for the first transaction to complete so that our
3074 3109 * bean counters are appropriately updated.
3075 3110 */
3076 3111 txg_wait_synced(spa->spa_dsl_pool, txg);
3077 3112
3078 3113 spa_config_sync(spa, B_FALSE, B_TRUE);
3079 3114
3080 3115 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
3081 3116 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
3082 3117 spa_history_log_version(spa, LOG_POOL_CREATE);
3083 3118
3084 3119 spa->spa_minref = refcount_count(&spa->spa_refcount);
3085 3120
3086 3121 mutex_exit(&spa_namespace_lock);
3087 3122
3088 3123 return (0);
3089 3124 }
3090 3125
3091 3126 #ifdef _KERNEL
3092 3127 /*
3093 3128 * Get the root pool information from the root disk, then import the root pool
3094 3129 * during the system boot up time.
3095 3130 */
3096 3131 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3097 3132
3098 3133 static nvlist_t *
3099 3134 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3100 3135 {
3101 3136 nvlist_t *config;
3102 3137 nvlist_t *nvtop, *nvroot;
3103 3138 uint64_t pgid;
3104 3139
3105 3140 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3106 3141 return (NULL);
3107 3142
3108 3143 /*
3109 3144 * Add this top-level vdev to the child array.
3110 3145 */
3111 3146 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3112 3147 &nvtop) == 0);
3113 3148 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3114 3149 &pgid) == 0);
3115 3150 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3116 3151
3117 3152 /*
3118 3153 * Put this pool's top-level vdevs into a root vdev.
3119 3154 */
3120 3155 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3121 3156 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3122 3157 VDEV_TYPE_ROOT) == 0);
3123 3158 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3124 3159 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3125 3160 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3126 3161 &nvtop, 1) == 0);
3127 3162
3128 3163 /*
3129 3164 * Replace the existing vdev_tree with the new root vdev in
3130 3165 * this pool's configuration (remove the old, add the new).
3131 3166 */
3132 3167 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3133 3168 nvlist_free(nvroot);
3134 3169 return (config);
3135 3170 }
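
spa_generate_rootconf() mainly rewrites the label's config so the boot device's top-level vdev sits under a synthetic root vdev. A userland sketch of that wrapping step follows; it is an illustration only, with "type", "path", "id", and "children" standing in for the corresponding ZPOOL_CONFIG_* names.

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvtop, *nvroot;

	/* Toy top-level vdev; the real one comes from the disk label. */
	(void) nvlist_alloc(&nvtop, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_string(nvtop, "type", "disk");
	(void) nvlist_add_string(nvtop, "path", "/dev/dsk/c0t0d0s0");

	/* Wrap it in a root vdev with a one-element children array. */
	(void) nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_string(nvroot, "type", "root");
	(void) nvlist_add_uint64(nvroot, "id", 0ULL);
	(void) nvlist_add_nvlist_array(nvroot, "children", &nvtop, 1);

	nvlist_print(stdout, nvroot);
	nvlist_free(nvtop);
	nvlist_free(nvroot);
	return (0);
}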
3136 3171
3137 3172 /*
3138 3173 * Walk the vdev tree and see if we can find a device with "better"
3139 3174 * configuration. A configuration is "better" if the label on that
3140 3175 * device has a more recent txg.
3141 3176 */
3142 3177 static void
3143 3178 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3144 3179 {
3145 3180 for (int c = 0; c < vd->vdev_children; c++)
3146 3181 spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3147 3182
3148 3183 if (vd->vdev_ops->vdev_op_leaf) {
3149 3184 nvlist_t *label;
3150 3185 uint64_t label_txg;
3151 3186
3152 3187 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3153 3188 &label) != 0)
3154 3189 return;
3155 3190
3156 3191 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3157 3192 &label_txg) == 0);
3158 3193
3159 3194 /*
3160 3195 * Do we have a better boot device?
3161 3196 */
3162 3197 if (label_txg > *txg) {
3163 3198 *txg = label_txg;
3164 3199 *avd = vd;
3165 3200 }
3166 3201 nvlist_free(label);
3167 3202 }
3168 3203 }
3169 3204
3170 3205 /*
3171 3206 * Import a root pool.
3172 3207 *
3173 3208 * For x86, devpath_list will consist of devid and/or physpath name of
3174 3209 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3175 3210 * The GRUB "findroot" command will return the vdev we should boot.
3176 3211 *
3177 3212 * For Sparc, devpath_list consists of the physpath name of the booting device,
3178 3213 * whether the rootpool is a single-device pool or a mirrored pool.
3179 3214 * e.g.
3180 3215 * "/pci@1f,0/ide@d/disk@0,0:a"
3181 3216 */
3182 3217 int
3183 3218 spa_import_rootpool(char *devpath, char *devid)
3184 3219 {
3185 3220 spa_t *spa;
3186 3221 vdev_t *rvd, *bvd, *avd = NULL;
3187 3222 nvlist_t *config, *nvtop;
3188 3223 uint64_t guid, txg;
3189 3224 char *pname;
3190 3225 int error;
3191 3226
3192 3227 /*
3193 3228 * Read the label from the boot device and generate a configuration.
3194 3229 */
3195 3230 config = spa_generate_rootconf(devpath, devid, &guid);
3196 3231 #if defined(_OBP) && defined(_KERNEL)
3197 3232 if (config == NULL) {
3198 3233 if (strstr(devpath, "/iscsi/ssd") != NULL) {
3199 3234 /* iscsi boot */
3200 3235 get_iscsi_bootpath_phy(devpath);
3201 3236 config = spa_generate_rootconf(devpath, devid, &guid);
3202 3237 }
3203 3238 }
3204 3239 #endif
3205 3240 if (config == NULL) {
3206 3241 cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
3207 3242 devpath);
3208 3243 return (EIO);
3209 3244 }
3210 3245
3211 3246 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3212 3247 &pname) == 0);
3213 3248 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3214 3249
3215 3250 mutex_enter(&spa_namespace_lock);
3216 3251 if ((spa = spa_lookup(pname)) != NULL) {
3217 3252 /*
3218 3253 * Remove the existing root pool from the namespace so that we
3219 3254 * can replace it with the correct config we just read in.
3220 3255 */
3221 3256 spa_remove(spa);
3222 3257 }
3223 3258
3224 3259 spa = spa_add(pname, config, NULL);
3225 3260 spa->spa_is_root = B_TRUE;
3226 3261 spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3227 3262
3228 3263 /*
3229 3264 * Build up a vdev tree based on the boot device's label config.
3230 3265 */
3231 3266 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3232 3267 &nvtop) == 0);
3233 3268 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3234 3269 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3235 3270 VDEV_ALLOC_ROOTPOOL);
3236 3271 spa_config_exit(spa, SCL_ALL, FTAG);
3237 3272 if (error) {
3238 3273 mutex_exit(&spa_namespace_lock);
3239 3274 nvlist_free(config);
3240 3275 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
3241 3276 pname);
3242 3277 return (error);
3243 3278 }
3244 3279
3245 3280 /*
3246 3281 * Get the boot vdev.
3247 3282 */
3248 3283 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3249 3284 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
3250 3285 (u_longlong_t)guid);
3251 3286 error = ENOENT;
3252 3287 goto out;
3253 3288 }
3254 3289
3255 3290 /*
3256 3291 * Determine if there is a better boot device.
3257 3292 */
3258 3293 avd = bvd;
3259 3294 spa_alt_rootvdev(rvd, &avd, &txg);
3260 3295 if (avd != bvd) {
3261 3296 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3262 3297 "try booting from '%s'", avd->vdev_path);
3263 3298 error = EINVAL;
3264 3299 goto out;
3265 3300 }
3266 3301
3267 3302 /*
3268 3303 * If the boot device is part of a spare vdev then ensure that
3269 3304 * we're booting off the active spare.
3270 3305 */
3271 3306 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3272 3307 !bvd->vdev_isspare) {
3273 3308 cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3274 3309 "try booting from '%s'",
3275 3310 bvd->vdev_parent->
3276 3311 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3277 3312 error = EINVAL;
3278 3313 goto out;
3279 3314 }
3280 3315
3281 3316 error = 0;
3282 3317 spa_history_log_version(spa, LOG_POOL_IMPORT);
3283 3318 out:
3284 3319 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3285 3320 vdev_free(rvd);
3286 3321 spa_config_exit(spa, SCL_ALL, FTAG);
3287 3322 mutex_exit(&spa_namespace_lock);
3288 3323
3289 3324 nvlist_free(config);
3290 3325 return (error);
3291 3326 }
3292 3327
3293 3328 #endif
3294 3329
3295 3330 /*
3296 3331 * Import a non-root pool into the system.
3297 3332 */
3298 3333 int
3299 3334 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3300 3335 {
3301 3336 spa_t *spa;
3302 3337 char *altroot = NULL;
3303 3338 spa_load_state_t state = SPA_LOAD_IMPORT;
3304 3339 zpool_rewind_policy_t policy;
3305 3340 uint64_t mode = spa_mode_global;
3306 3341 uint64_t readonly = B_FALSE;
3307 3342 int error;
3308 3343 nvlist_t *nvroot;
3309 3344 nvlist_t **spares, **l2cache;
3310 3345 uint_t nspares, nl2cache;
3311 3346
3312 3347 /*
3313 3348 * If a pool with this name exists, return failure.
3314 3349 */
3315 3350 mutex_enter(&spa_namespace_lock);
3316 3351 if (spa_lookup(pool) != NULL) {
3317 3352 mutex_exit(&spa_namespace_lock);
3318 3353 return (EEXIST);
3319 3354 }
3320 3355
3321 3356 /*
3322 3357 * Create and initialize the spa structure.
3323 3358 */
3324 3359 (void) nvlist_lookup_string(props,
3325 3360 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3326 3361 (void) nvlist_lookup_uint64(props,
3327 3362 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3328 3363 if (readonly)
3329 3364 mode = FREAD;
3330 3365 spa = spa_add(pool, config, altroot);
3331 3366 spa->spa_import_flags = flags;
3332 3367
3333 3368 /*
3334 3369 * Verbatim import - Take a pool and insert it into the namespace
3335 3370 * as if it had been loaded at boot.
3336 3371 */
3337 3372 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3338 3373 if (props != NULL)
3339 3374 spa_configfile_set(spa, props, B_FALSE);
3340 3375
3341 3376 spa_config_sync(spa, B_FALSE, B_TRUE);
3342 3377
3343 3378 mutex_exit(&spa_namespace_lock);
3344 3379 spa_history_log_version(spa, LOG_POOL_IMPORT);
3345 3380
3346 3381 return (0);
3347 3382 }
3348 3383
3349 3384 spa_activate(spa, mode);
3350 3385
3351 3386 /*
3352 3387 * Don't start async tasks until we know everything is healthy.
3353 3388 */
3354 3389 spa_async_suspend(spa);
3355 3390
3356 3391 zpool_get_rewind_policy(config, &policy);
3357 3392 if (policy.zrp_request & ZPOOL_DO_REWIND)
3358 3393 state = SPA_LOAD_RECOVER;
3359 3394
3360 3395 /*
3361 3396 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig
3362 3397 * because the user-supplied config is actually the one to trust when
3363 3398 * doing an import.
3364 3399 */
3365 3400 if (state != SPA_LOAD_RECOVER)
3366 3401 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3367 3402
3368 3403 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3369 3404 policy.zrp_request);
3370 3405
3371 3406 /*
3372 3407 * Propagate anything learned while loading the pool and pass it
3373 3408 * back to caller (i.e. rewind info, missing devices, etc).
3374 3409 */
3375 3410 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3376 3411 spa->spa_load_info) == 0);
3377 3412
3378 3413 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3379 3414 /*
3380 3415 * Toss any existing sparelist, as it doesn't have any validity
3381 3416 * anymore, and conflicts with spa_has_spare().
3382 3417 */
3383 3418 if (spa->spa_spares.sav_config) {
3384 3419 nvlist_free(spa->spa_spares.sav_config);
3385 3420 spa->spa_spares.sav_config = NULL;
3386 3421 spa_load_spares(spa);
3387 3422 }
3388 3423 if (spa->spa_l2cache.sav_config) {
3389 3424 nvlist_free(spa->spa_l2cache.sav_config);
3390 3425 spa->spa_l2cache.sav_config = NULL;
3391 3426 spa_load_l2cache(spa);
3392 3427 }
3393 3428
3394 3429 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3395 3430 &nvroot) == 0);
3396 3431 if (error == 0)
3397 3432 error = spa_validate_aux(spa, nvroot, -1ULL,
3398 3433 VDEV_ALLOC_SPARE);
3399 3434 if (error == 0)
3400 3435 error = spa_validate_aux(spa, nvroot, -1ULL,
3401 3436 VDEV_ALLOC_L2CACHE);
3402 3437 spa_config_exit(spa, SCL_ALL, FTAG);
3403 3438
3404 3439 if (props != NULL)
3405 3440 spa_configfile_set(spa, props, B_FALSE);
3406 3441
3407 3442 if (error != 0 || (props && spa_writeable(spa) &&
3408 3443 (error = spa_prop_set(spa, props)))) {
3409 3444 spa_unload(spa);
3410 3445 spa_deactivate(spa);
3411 3446 spa_remove(spa);
3412 3447 mutex_exit(&spa_namespace_lock);
3413 3448 return (error);
3414 3449 }
3415 3450
3416 3451 spa_async_resume(spa);
3417 3452
3418 3453 /*
3419 3454 * Override any spares and level 2 cache devices as specified by
3420 3455 * the user, as these may have correct device names/devids, etc.
3421 3456 */
3422 3457 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3423 3458 &spares, &nspares) == 0) {
3424 3459 if (spa->spa_spares.sav_config)
3425 3460 VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3426 3461 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3427 3462 else
3428 3463 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3429 3464 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3430 3465 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3431 3466 ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3432 3467 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3433 3468 spa_load_spares(spa);
3434 3469 spa_config_exit(spa, SCL_ALL, FTAG);
3435 3470 spa->spa_spares.sav_sync = B_TRUE;
3436 3471 }
3437 3472 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3438 3473 &l2cache, &nl2cache) == 0) {
3439 3474 if (spa->spa_l2cache.sav_config)
3440 3475 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3441 3476 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3442 3477 else
3443 3478 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3444 3479 NV_UNIQUE_NAME, KM_SLEEP) == 0);
3445 3480 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3446 3481 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3447 3482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3448 3483 spa_load_l2cache(spa);
3449 3484 spa_config_exit(spa, SCL_ALL, FTAG);
3450 3485 spa->spa_l2cache.sav_sync = B_TRUE;
3451 3486 }
3452 3487
3453 3488 /*
3454 3489 * Check for any removed devices.
3455 3490 */
3456 3491 if (spa->spa_autoreplace) {
3457 3492 spa_aux_check_removed(&spa->spa_spares);
3458 3493 spa_aux_check_removed(&spa->spa_l2cache);
3459 3494 }
3460 3495
3461 3496 if (spa_writeable(spa)) {
3462 3497 /*
3463 3498 * Update the config cache to include the newly-imported pool.
3464 3499 */
3465 3500 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3466 3501 }
3467 3502
3468 3503 /*
3469 3504 * It's possible that the pool was expanded while it was exported.
3470 3505 * We kick off an async task to handle this for us.
3471 3506 */
3472 3507 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3473 3508
3474 3509 mutex_exit(&spa_namespace_lock);
3475 3510 spa_history_log_version(spa, LOG_POOL_IMPORT);
3476 3511
3477 3512 return (0);
3478 3513 }
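
/*
 * Illustrative sketch (not part of this change): how a hypothetical in-kernel
 * caller might drive the verbatim path above, handing spa_import() a
 * caller-supplied config plus ZFS_IMPORT_VERBATIM so the pool is inserted
 * into the namespace without re-reading its labels.
 */
static int
example_import_verbatim(const char *poolname, nvlist_t *config)
{
	/* NULL props: keep the pool's existing properties unchanged */
	return (spa_import(poolname, config, NULL, ZFS_IMPORT_VERBATIM));
}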
3479 3514
3480 3515 nvlist_t *
3481 3516 spa_tryimport(nvlist_t *tryconfig)
3482 3517 {
3483 3518 nvlist_t *config = NULL;
3484 3519 char *poolname;
3485 3520 spa_t *spa;
3486 3521 uint64_t state;
3487 3522 int error;
3488 3523
3489 3524 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3490 3525 return (NULL);
3491 3526
3492 3527 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3493 3528 return (NULL);
3494 3529
3495 3530 /*
3496 3531 * Create and initialize the spa structure.
3497 3532 */
3498 3533 mutex_enter(&spa_namespace_lock);
3499 3534 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3500 3535 spa_activate(spa, FREAD);
3501 3536
3502 3537 /*
3503 3538 * Pass off the heavy lifting to spa_load().
3504 3539 * Pass TRUE for mosconfig because the user-supplied config
3505 3540 * is actually the one to trust when doing an import.
3506 3541 */
3507 3542 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3508 3543
3509 3544 /*
3510 3545 * If 'tryconfig' was at least parsable, return the current config.
3511 3546 */
3512 3547 if (spa->spa_root_vdev != NULL) {
3513 3548 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3514 3549 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3515 3550 poolname) == 0);
3516 3551 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3517 3552 state) == 0);
3518 3553 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3519 3554 spa->spa_uberblock.ub_timestamp) == 0);
3520 3555
3521 3556 /*
3522 3557 * If the bootfs property exists on this pool then we
3523 3558 * copy it out so that external consumers can tell which
3524 3559 * pools are bootable.
3525 3560 */
3526 3561 if ((!error || error == EEXIST) && spa->spa_bootfs) {
3527 3562 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3528 3563
3529 3564 /*
3530 3565 * We have to play games with the name since the
3531 3566 * pool was opened as TRYIMPORT_NAME.
3532 3567 */
3533 3568 if (dsl_dsobj_to_dsname(spa_name(spa),
3534 3569 spa->spa_bootfs, tmpname) == 0) {
3535 3570 char *cp;
3536 3571 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3537 3572
3538 3573 cp = strchr(tmpname, '/');
3539 3574 if (cp == NULL) {
3540 3575 (void) strlcpy(dsname, tmpname,
3541 3576 MAXPATHLEN);
3542 3577 } else {
3543 3578 (void) snprintf(dsname, MAXPATHLEN,
3544 3579 "%s/%s", poolname, ++cp);
3545 3580 }
3546 3581 VERIFY(nvlist_add_string(config,
3547 3582 ZPOOL_CONFIG_BOOTFS, dsname) == 0);
3548 3583 kmem_free(dsname, MAXPATHLEN);
3549 3584 }
3550 3585 kmem_free(tmpname, MAXPATHLEN);
3551 3586 }
3552 3587
3553 3588 /*
3554 3589 * Add the list of hot spares and level 2 cache devices.
3555 3590 */
3556 3591 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3557 3592 spa_add_spares(spa, config);
3558 3593 spa_add_l2cache(spa, config);
3559 3594 spa_config_exit(spa, SCL_CONFIG, FTAG);
3560 3595 }
3561 3596
3562 3597 spa_unload(spa);
3563 3598 spa_deactivate(spa);
3564 3599 spa_remove(spa);
3565 3600 mutex_exit(&spa_namespace_lock);
3566 3601
3567 3602 return (config);
3568 3603 }
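
/*
 * Illustrative sketch (not part of this change): probing a config with
 * spa_tryimport(). The tryconfig is assumed to come from label discovery
 * and must carry at least the pool name and state checked above; any
 * returned config belongs to the caller and must be freed.
 */
static void
example_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = spa_tryimport(tryconfig);

	if (config != NULL) {
		/* inspect ZPOOL_CONFIG_POOL_NAME, _STATE, _TIMESTAMP, ... */
		nvlist_free(config);
	}
}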
3569 3604
3570 3605 /*
3571 3606 * Pool export/destroy
3572 3607 *
3573 3608 * The act of destroying or exporting a pool is very simple. We make sure there
3574 3609 * is no more pending I/O and any references to the pool are gone. Then, we
3575 3610 * update the pool state and sync all the labels to disk, removing the
3576 3611 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
3577 3612 * we don't sync the labels or remove the configuration cache.
3578 3613 */
3579 3614 static int
3580 3615 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
3581 3616 boolean_t force, boolean_t hardforce)
3582 3617 {
3583 3618 spa_t *spa;
3584 3619
3585 3620 if (oldconfig)
3586 3621 *oldconfig = NULL;
3587 3622
3588 3623 if (!(spa_mode_global & FWRITE))
3589 3624 return (EROFS);
3590 3625
3591 3626 mutex_enter(&spa_namespace_lock);
3592 3627 if ((spa = spa_lookup(pool)) == NULL) {
3593 3628 mutex_exit(&spa_namespace_lock);
3594 3629 return (ENOENT);
3595 3630 }
3596 3631
3597 3632 /*
3598 3633 * Put a hold on the pool, drop the namespace lock, stop async tasks,
3599 3634 * reacquire the namespace lock, and see if we can export.
3600 3635 */
3601 3636 spa_open_ref(spa, FTAG);
3602 3637 mutex_exit(&spa_namespace_lock);
3603 3638 spa_async_suspend(spa);
3604 3639 mutex_enter(&spa_namespace_lock);
3605 3640 spa_close(spa, FTAG);
3606 3641
3607 3642 /*
3608 3643 * The pool will be in core if it's openable,
3609 3644 * in which case we can modify its state.
3610 3645 */
3611 3646 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
3612 3647 /*
3613 3648 * Objsets may be open only because they're dirty, so we
3614 3649 * have to force a sync before checking spa_refcnt.
3615 3650 */
3616 3651 txg_wait_synced(spa->spa_dsl_pool, 0);
3617 3652
3618 3653 /*
3619 3654 * A pool cannot be exported or destroyed if there are active
3620 3655 * references. If we are resetting a pool, allow references by
3621 3656 * fault injection handlers.
3622 3657 */
3623 3658 if (!spa_refcount_zero(spa) ||
3624 3659 (spa->spa_inject_ref != 0 &&
3625 3660 new_state != POOL_STATE_UNINITIALIZED)) {
3626 3661 spa_async_resume(spa);
3627 3662 mutex_exit(&spa_namespace_lock);
3628 3663 return (EBUSY);
3629 3664 }
3630 3665
3631 3666 /*
3632 3667 * A pool cannot be exported if it has an active shared spare.
3633 3668 * This prevents other pools from stealing the active spare
3634 3669 * from an exported pool. At the user's explicit request, such a
3635 3670 * pool can still be forcibly exported.
3636 3671 */
3637 3672 if (!force && new_state == POOL_STATE_EXPORTED &&
3638 3673 spa_has_active_shared_spare(spa)) {
3639 3674 spa_async_resume(spa);
3640 3675 mutex_exit(&spa_namespace_lock);
3641 3676 return (EXDEV);
3642 3677 }
3643 3678
3644 3679 /*
3645 3680 * We want this to be reflected on every label,
3646 3681 * so mark them all dirty. spa_unload() will do the
3647 3682 * final sync that pushes these changes out.
3648 3683 */
3649 3684 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
3650 3685 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3651 3686 spa->spa_state = new_state;
3652 3687 spa->spa_final_txg = spa_last_synced_txg(spa) +
3653 3688 TXG_DEFER_SIZE + 1;
3654 3689 vdev_config_dirty(spa->spa_root_vdev);
3655 3690 spa_config_exit(spa, SCL_ALL, FTAG);
3656 3691 }
3657 3692 }
3658 3693
3659 3694 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
3660 3695
3661 3696 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3662 3697 spa_unload(spa);
3663 3698 spa_deactivate(spa);
3664 3699 }
3665 3700
3666 3701 if (oldconfig && spa->spa_config)
3667 3702 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
3668 3703
3669 3704 if (new_state != POOL_STATE_UNINITIALIZED) {
3670 3705 if (!hardforce)
3671 3706 spa_config_sync(spa, B_TRUE, B_TRUE);
3672 3707 spa_remove(spa);
3673 3708 }
3674 3709 mutex_exit(&spa_namespace_lock);
3675 3710
3676 3711 return (0);
3677 3712 }
3678 3713
3679 3714 /*
3680 3715 * Destroy a storage pool.
3681 3716 */
3682 3717 int
3683 3718 spa_destroy(char *pool)
3684 3719 {
3685 3720 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3686 3721 B_FALSE, B_FALSE));
3687 3722 }
3688 3723
3689 3724 /*
3690 3725 * Export a storage pool.
3691 3726 */
3692 3727 int
3693 3728 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3694 3729 boolean_t hardforce)
3695 3730 {
3696 3731 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3697 3732 force, hardforce));
3698 3733 }
3699 3734
3700 3735 /*
3701 3736 * Similar to spa_export(), this unloads the spa_t without actually removing it
3702 3737 * from the namespace in any way.
3703 3738 */
3704 3739 int
3705 3740 spa_reset(char *pool)
3706 3741 {
3707 3742 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3708 3743 B_FALSE, B_FALSE));
3709 3744 }
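
/*
 * Illustrative sketch (not part of this change): the three wrappers above
 * all funnel into spa_export_common(); an export may optionally hand back
 * the pool's final config. The pool name below is hypothetical.
 */
static int
example_export(void)
{
	char poolname[] = "tank";	/* hypothetical pool name */
	nvlist_t *oldconfig = NULL;
	int error;

	error = spa_export(poolname, &oldconfig, B_FALSE, B_FALSE);
	if (error == 0 && oldconfig != NULL)
		nvlist_free(oldconfig);	/* caller owns the duplicated config */
	return (error);
}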
3710 3745
3711 3746 /*
3712 3747 * ==========================================================================
3713 3748 * Device manipulation
3714 3749 * ==========================================================================
3715 3750 */
3716 3751
3717 3752 /*
3718 3753 * Add a device to a storage pool.
3719 3754 */
3720 3755 int
3721 3756 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
3722 3757 {
3723 3758 uint64_t txg, id;
3724 3759 int error;
3725 3760 vdev_t *rvd = spa->spa_root_vdev;
3726 3761 vdev_t *vd, *tvd;
3727 3762 nvlist_t **spares, **l2cache;
3728 3763 uint_t nspares, nl2cache;
3729 3764
3730 3765 ASSERT(spa_writeable(spa));
3731 3766
3732 3767 txg = spa_vdev_enter(spa);
3733 3768
3734 3769 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
3735 3770 VDEV_ALLOC_ADD)) != 0)
3736 3771 return (spa_vdev_exit(spa, NULL, txg, error));
3737 3772
3738 3773 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
3739 3774
3740 3775 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
3741 3776 &nspares) != 0)
3742 3777 nspares = 0;
3743 3778
3744 3779 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
3745 3780 &nl2cache) != 0)
3746 3781 nl2cache = 0;
3747 3782
3748 3783 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
3749 3784 return (spa_vdev_exit(spa, vd, txg, EINVAL));
3750 3785
3751 3786 if (vd->vdev_children != 0 &&
3752 3787 (error = vdev_create(vd, txg, B_FALSE)) != 0)
3753 3788 return (spa_vdev_exit(spa, vd, txg, error));
3754 3789
3755 3790 /*
3756 3791 * We must validate the spares and l2cache devices after checking the
3757 3792 * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
3758 3793 */
3759 3794 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
3760 3795 return (spa_vdev_exit(spa, vd, txg, error));
3761 3796
3762 3797 /*
3763 3798 * Transfer each new top-level vdev from vd to rvd.
3764 3799 */
3765 3800 for (int c = 0; c < vd->vdev_children; c++) {
3766 3801
3767 3802 /*
3768 3803 * Set the vdev id to the first hole, if one exists.
3769 3804 */
3770 3805 for (id = 0; id < rvd->vdev_children; id++) {
3771 3806 if (rvd->vdev_child[id]->vdev_ishole) {
3772 3807 vdev_free(rvd->vdev_child[id]);
3773 3808 break;
3774 3809 }
3775 3810 }
3776 3811 tvd = vd->vdev_child[c];
3777 3812 vdev_remove_child(vd, tvd);
3778 3813 tvd->vdev_id = id;
3779 3814 vdev_add_child(rvd, tvd);
3780 3815 vdev_config_dirty(tvd);
3781 3816 }
3782 3817
3783 3818 if (nspares != 0) {
3784 3819 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
3785 3820 ZPOOL_CONFIG_SPARES);
3786 3821 spa_load_spares(spa);
3787 3822 spa->spa_spares.sav_sync = B_TRUE;
3788 3823 }
3789 3824
3790 3825 if (nl2cache != 0) {
3791 3826 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
3792 3827 ZPOOL_CONFIG_L2CACHE);
3793 3828 spa_load_l2cache(spa);
3794 3829 spa->spa_l2cache.sav_sync = B_TRUE;
3795 3830 }
3796 3831
3797 3832 /*
3798 3833 * We have to be careful when adding new vdevs to an existing pool.
3799 3834 * If other threads start allocating from these vdevs before we
3800 3835 * sync the config cache, and we lose power, then upon reboot we may
3801 3836 * fail to open the pool because there are DVAs that the config cache
3802 3837 * can't translate. Therefore, we first add the vdevs without
3803 3838 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
3804 3839 * and then let spa_config_update() initialize the new metaslabs.
3805 3840 *
3806 3841 * spa_load() checks for added-but-not-initialized vdevs, so that
3807 3842 * if we lose power at any point in this sequence, the remaining
3808 3843 * steps will be completed the next time we load the pool.
3809 3844 */
3810 3845 (void) spa_vdev_exit(spa, vd, txg, 0);
3811 3846
3812 3847 mutex_enter(&spa_namespace_lock);
3813 3848 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3814 3849 mutex_exit(&spa_namespace_lock);
3815 3850
3816 3851 return (0);
3817 3852 }
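
/*
 * Illustrative sketch (not part of this change): adding a single hot spare.
 * 'spare' is assumed to be a fully-formed leaf vdev nvlist (type, path,
 * guid) prepared by the caller; spa_vdev_add() picks it up through the
 * ZPOOL_CONFIG_SPARES lookup above.
 */
static int
example_add_spare(spa_t *spa, nvlist_t *spare)
{
	nvlist_t *nvroot;
	int error;

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spare, 1) == 0);

	error = spa_vdev_add(spa, nvroot);
	nvlist_free(nvroot);
	return (error);
}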
3818 3853
3819 3854 /*
3820 3855 * Attach a device to a mirror. The arguments are the path to any device
3821 3856 * in the mirror, and the nvroot for the new device. If the path specifies
3822 3857 * a device that is not mirrored, we automatically insert the mirror vdev.
3823 3858 *
3824 3859 * If 'replacing' is specified, the new device is intended to replace the
3825 3860 * existing device; in this case the two devices are made into their own
3826 3861 * mirror using the 'replacing' vdev, which is functionally identical to
3827 3862 * the mirror vdev (it actually reuses all the same ops) but has a few
3828 3863 * extra rules: you can't attach to it after it's been created, and upon
3829 3864 * completion of resilvering, the first disk (the one being replaced)
3830 3865 * is automatically detached.
3831 3866 */
3832 3867 int
3833 3868 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3834 3869 {
3835 3870 uint64_t txg, dtl_max_txg;
3836 3871 vdev_t *rvd = spa->spa_root_vdev;
3837 3872 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3838 3873 vdev_ops_t *pvops;
3839 3874 char *oldvdpath, *newvdpath;
3840 3875 int newvd_isspare;
3841 3876 int error;
3842 3877
3843 3878 ASSERT(spa_writeable(spa));
3844 3879
3845 3880 txg = spa_vdev_enter(spa);
3846 3881
3847 3882 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3848 3883
3849 3884 if (oldvd == NULL)
3850 3885 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3851 3886
3852 3887 if (!oldvd->vdev_ops->vdev_op_leaf)
3853 3888 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3854 3889
3855 3890 pvd = oldvd->vdev_parent;
3856 3891
3857 3892 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3858 3893 VDEV_ALLOC_ADD)) != 0)
3859 3894 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3860 3895
3861 3896 if (newrootvd->vdev_children != 1)
3862 3897 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3863 3898
3864 3899 newvd = newrootvd->vdev_child[0];
3865 3900
3866 3901 if (!newvd->vdev_ops->vdev_op_leaf)
3867 3902 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3868 3903
3869 3904 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3870 3905 return (spa_vdev_exit(spa, newrootvd, txg, error));
3871 3906
3872 3907 /*
3873 3908 * Spares can't replace logs
3874 3909 */
3875 3910 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3876 3911 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3877 3912
3878 3913 if (!replacing) {
3879 3914 /*
3880 3915 * For attach, the only allowable parent is a mirror or the root
3881 3916 * vdev.
3882 3917 */
3883 3918 if (pvd->vdev_ops != &vdev_mirror_ops &&
3884 3919 pvd->vdev_ops != &vdev_root_ops)
3885 3920 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3886 3921
3887 3922 pvops = &vdev_mirror_ops;
3888 3923 } else {
3889 3924 /*
3890 3925 * Active hot spares can only be replaced by inactive hot
3891 3926 * spares.
3892 3927 */
3893 3928 if (pvd->vdev_ops == &vdev_spare_ops &&
3894 3929 oldvd->vdev_isspare &&
3895 3930 !spa_has_spare(spa, newvd->vdev_guid))
3896 3931 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3897 3932
3898 3933 /*
3899 3934 * If the source is a hot spare, and the parent isn't already a
3900 3935 * spare, then we want to create a new hot spare. Otherwise, we
3901 3936 * want to create a replacing vdev. The user is not allowed to
3902 3937 * attach to a spared vdev child unless the 'isspare' state is
3903 3938 * the same (spare replaces spare, non-spare replaces
3904 3939 * non-spare).
3905 3940 */
3906 3941 if (pvd->vdev_ops == &vdev_replacing_ops &&
3907 3942 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
3908 3943 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3909 3944 } else if (pvd->vdev_ops == &vdev_spare_ops &&
3910 3945 newvd->vdev_isspare != oldvd->vdev_isspare) {
3911 3946 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3912 3947 }
3913 3948
3914 3949 if (newvd->vdev_isspare)
3915 3950 pvops = &vdev_spare_ops;
3916 3951 else
3917 3952 pvops = &vdev_replacing_ops;
3918 3953 }
3919 3954
3920 3955 /*
3921 3956 * Make sure the new device is big enough.
3922 3957 */
3923 3958 if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3924 3959 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3925 3960
3926 3961 /*
3927 3962 * The new device cannot have a higher alignment requirement
3928 3963 * than the top-level vdev.
3929 3964 */
3930 3965 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3931 3966 return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3932 3967
3933 3968 /*
3934 3969 * If this is an in-place replacement, update oldvd's path and devid
3935 3970 * to make it distinguishable from newvd, and unopenable from now on.
3936 3971 */
3937 3972 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3938 3973 spa_strfree(oldvd->vdev_path);
3939 3974 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3940 3975 KM_SLEEP);
3941 3976 (void) sprintf(oldvd->vdev_path, "%s/%s",
3942 3977 newvd->vdev_path, "old");
3943 3978 if (oldvd->vdev_devid != NULL) {
3944 3979 spa_strfree(oldvd->vdev_devid);
3945 3980 oldvd->vdev_devid = NULL;
3946 3981 }
3947 3982 }
3948 3983
3949 3984 /* mark the device being resilvered */
3950 3985 newvd->vdev_resilvering = B_TRUE;
3951 3986
3952 3987 /*
3953 3988 * If the parent is not a mirror, or if we're replacing, insert the new
3954 3989 * mirror/replacing/spare vdev above oldvd.
3955 3990 */
3956 3991 if (pvd->vdev_ops != pvops)
3957 3992 pvd = vdev_add_parent(oldvd, pvops);
3958 3993
3959 3994 ASSERT(pvd->vdev_top->vdev_parent == rvd);
3960 3995 ASSERT(pvd->vdev_ops == pvops);
3961 3996 ASSERT(oldvd->vdev_parent == pvd);
3962 3997
3963 3998 /*
3964 3999 * Extract the new device from its root and add it to pvd.
3965 4000 */
3966 4001 vdev_remove_child(newrootvd, newvd);
3967 4002 newvd->vdev_id = pvd->vdev_children;
3968 4003 newvd->vdev_crtxg = oldvd->vdev_crtxg;
3969 4004 vdev_add_child(pvd, newvd);
3970 4005
3971 4006 tvd = newvd->vdev_top;
3972 4007 ASSERT(pvd->vdev_top == tvd);
3973 4008 ASSERT(tvd->vdev_parent == rvd);
3974 4009
3975 4010 vdev_config_dirty(tvd);
3976 4011
3977 4012 /*
3978 4013 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
3979 4014 * for any dmu_sync-ed blocks. It will propagate upward when
3980 4015 * spa_vdev_exit() calls vdev_dtl_reassess().
3981 4016 */
3982 4017 dtl_max_txg = txg + TXG_CONCURRENT_STATES;
3983 4018
3984 4019 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
3985 4020 dtl_max_txg - TXG_INITIAL);
3986 4021
3987 4022 if (newvd->vdev_isspare) {
3988 4023 spa_spare_activate(newvd);
3989 4024 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
3990 4025 }
3991 4026
3992 4027 oldvdpath = spa_strdup(oldvd->vdev_path);
3993 4028 newvdpath = spa_strdup(newvd->vdev_path);
3994 4029 newvd_isspare = newvd->vdev_isspare;
3995 4030
3996 4031 /*
3997 4032 * Mark newvd's DTL dirty in this txg.
3998 4033 */
3999 4034 vdev_dirty(tvd, VDD_DTL, newvd, txg);
4000 4035
4001 4036 /*
4002 4037 * Restart the resilver
4003 4038 */
4004 4039 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4005 4040
4006 4041 /*
4007 4042 * Commit the config
4008 4043 */
4009 4044 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4010 4045
4011 4046 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
4012 4047 "%s vdev=%s %s vdev=%s",
4013 4048 replacing && newvd_isspare ? "spare in" :
4014 4049 replacing ? "replace" : "attach", newvdpath,
4015 4050 replacing ? "for" : "to", oldvdpath);
4016 4051
4017 4052 spa_strfree(oldvdpath);
4018 4053 spa_strfree(newvdpath);
4019 4054
4020 4055 if (spa->spa_bootfs)
4021 4056 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4022 4057
4023 4058 return (0);
4024 4059 }
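
/*
 * Illustrative sketch (not part of this change): replacing the device with
 * guid 'oldguid' by the single-child tree in 'nvroot', which the caller is
 * assumed to have built from a userland request. Passing B_FALSE instead
 * would attach 'nvroot' as a new mirror side of 'oldguid'.
 */
static int
example_replace(spa_t *spa, uint64_t oldguid, nvlist_t *nvroot)
{
	return (spa_vdev_attach(spa, oldguid, nvroot, B_TRUE));
}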
4025 4060
4026 4061 /*
4027 4062 * Detach a device from a mirror or replacing vdev.
4028 4063 * If 'replace_done' is specified, only detach if the parent
4029 4064 * is a replacing vdev.
4030 4065 */
4031 4066 int
4032 4067 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4033 4068 {
4034 4069 uint64_t txg;
4035 4070 int error;
4036 4071 vdev_t *rvd = spa->spa_root_vdev;
4037 4072 vdev_t *vd, *pvd, *cvd, *tvd;
4038 4073 boolean_t unspare = B_FALSE;
4039 4074 uint64_t unspare_guid;
4040 4075 char *vdpath;
4041 4076
4042 4077 ASSERT(spa_writeable(spa));
4043 4078
4044 4079 txg = spa_vdev_enter(spa);
4045 4080
4046 4081 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4047 4082
4048 4083 if (vd == NULL)
4049 4084 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4050 4085
4051 4086 if (!vd->vdev_ops->vdev_op_leaf)
4052 4087 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4053 4088
4054 4089 pvd = vd->vdev_parent;
4055 4090
4056 4091 /*
4057 4092 * If the parent/child relationship is not as expected, don't do it.
4058 4093 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4059 4094 * vdev that's replacing B with C. The user's intent in replacing
4060 4095 * is to go from M(A,B) to M(A,C). If the user decides to cancel
4061 4096 * the replace by detaching C, the expected behavior is to end up
4062 4097 * M(A,B). But suppose that right after deciding to detach C,
4063 4098 * the replacement of B completes. We would have M(A,C), and then
4064 4099 * ask to detach C, which would leave us with just A -- not what
4065 4100 * the user wanted. To prevent this, we make sure that the
4066 4101 * parent/child relationship hasn't changed -- in this example,
4067 4102 * that C's parent is still the replacing vdev R.
4068 4103 */
4069 4104 if (pvd->vdev_guid != pguid && pguid != 0)
4070 4105 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4071 4106
4072 4107 /*
4073 4108 * Only 'replacing' or 'spare' vdevs can be replaced.
4074 4109 */
4075 4110 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4076 4111 pvd->vdev_ops != &vdev_spare_ops)
4077 4112 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4078 4113
4079 4114 ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4080 4115 spa_version(spa) >= SPA_VERSION_SPARES);
4081 4116
4082 4117 /*
4083 4118 * Only mirror, replacing, and spare vdevs support detach.
4084 4119 */
4085 4120 if (pvd->vdev_ops != &vdev_replacing_ops &&
4086 4121 pvd->vdev_ops != &vdev_mirror_ops &&
4087 4122 pvd->vdev_ops != &vdev_spare_ops)
4088 4123 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4089 4124
4090 4125 /*
4091 4126 * If this device has the only valid copy of some data,
4092 4127 * we cannot safely detach it.
4093 4128 */
4094 4129 if (vdev_dtl_required(vd))
4095 4130 return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4096 4131
4097 4132 ASSERT(pvd->vdev_children >= 2);
4098 4133
4099 4134 /*
4100 4135 * If we are detaching the second disk from a replacing vdev, then
4101 4136 * check to see if we changed the original vdev's path to have "/old"
4102 4137 * at the end in spa_vdev_attach(). If so, undo that change now.
4103 4138 */
4104 4139 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4105 4140 vd->vdev_path != NULL) {
4106 4141 size_t len = strlen(vd->vdev_path);
4107 4142
4108 4143 for (int c = 0; c < pvd->vdev_children; c++) {
4109 4144 cvd = pvd->vdev_child[c];
4110 4145
4111 4146 if (cvd == vd || cvd->vdev_path == NULL)
4112 4147 continue;
4113 4148
4114 4149 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4115 4150 strcmp(cvd->vdev_path + len, "/old") == 0) {
4116 4151 spa_strfree(cvd->vdev_path);
4117 4152 cvd->vdev_path = spa_strdup(vd->vdev_path);
4118 4153 break;
4119 4154 }
4120 4155 }
4121 4156 }
4122 4157
4123 4158 /*
4124 4159 * If we are detaching the original disk from a spare, then it implies
4125 4160 * that the spare should become a real disk, and be removed from the
4126 4161 * active spare list for the pool.
4127 4162 */
4128 4163 if (pvd->vdev_ops == &vdev_spare_ops &&
4129 4164 vd->vdev_id == 0 &&
4130 4165 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4131 4166 unspare = B_TRUE;
4132 4167
4133 4168 /*
4134 4169 * Erase the disk labels so the disk can be used for other things.
4135 4170 * This must be done after all other error cases are handled,
4136 4171 * but before we disembowel vd (so we can still do I/O to it).
4137 4172 * But if we can't do it, don't treat the error as fatal --
4138 4173 * it may be that the unwritability of the disk is the reason
4139 4174 * it's being detached!
4140 4175 */
4141 4176 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4142 4177
4143 4178 /*
4144 4179 * Remove vd from its parent and compact the parent's children.
4145 4180 */
4146 4181 vdev_remove_child(pvd, vd);
4147 4182 vdev_compact_children(pvd);
4148 4183
4149 4184 /*
4150 4185 * Remember one of the remaining children so we can get tvd below.
4151 4186 */
4152 4187 cvd = pvd->vdev_child[pvd->vdev_children - 1];
4153 4188
4154 4189 /*
4155 4190 * If we need to remove the remaining child from the list of hot spares,
4156 4191 * do it now, marking the vdev as no longer a spare in the process.
4157 4192 * We must do this before vdev_remove_parent(), because that can
4158 4193 * change the GUID if it creates a new toplevel GUID. For a similar
4159 4194 * reason, we must remove the spare now, in the same txg as the detach;
4160 4195 * otherwise someone could attach a new sibling, change the GUID, and
4161 4196 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4162 4197 */
4163 4198 if (unspare) {
4164 4199 ASSERT(cvd->vdev_isspare);
4165 4200 spa_spare_remove(cvd);
4166 4201 unspare_guid = cvd->vdev_guid;
4167 4202 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4168 4203 cvd->vdev_unspare = B_TRUE;
4169 4204 }
4170 4205
4171 4206 /*
4172 4207 * If the parent mirror/replacing vdev only has one child,
4173 4208 * the parent is no longer needed. Remove it from the tree.
4174 4209 */
4175 4210 if (pvd->vdev_children == 1) {
4176 4211 if (pvd->vdev_ops == &vdev_spare_ops)
4177 4212 cvd->vdev_unspare = B_FALSE;
4178 4213 vdev_remove_parent(cvd);
4179 4214 cvd->vdev_resilvering = B_FALSE;
4180 4215 }
4181 4216
4182 4217
4183 4218 /*
4184 4219 * We don't set tvd until now because the parent we just removed
4185 4220 * may have been the previous top-level vdev.
4186 4221 */
4187 4222 tvd = cvd->vdev_top;
4188 4223 ASSERT(tvd->vdev_parent == rvd);
4189 4224
4190 4225 /*
4191 4226 * Reevaluate the parent vdev state.
4192 4227 */
4193 4228 vdev_propagate_state(cvd);
4194 4229
4195 4230 /*
4196 4231 * If the 'autoexpand' property is set on the pool then automatically
4197 4232 * try to expand the size of the pool. For example if the device we
4198 4233 * just detached was smaller than the others, it may be possible to
4199 4234 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4200 4235 * first so that we can obtain the updated sizes of the leaf vdevs.
4201 4236 */
4202 4237 if (spa->spa_autoexpand) {
4203 4238 vdev_reopen(tvd);
4204 4239 vdev_expand(tvd, txg);
4205 4240 }
4206 4241
4207 4242 vdev_config_dirty(tvd);
4208 4243
4209 4244 /*
4210 4245 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
4211 4246 * vd->vdev_detached is set and free vd's DTL object in syncing context.
4212 4247 * But first make sure we're not on any *other* txg's DTL list, to
4213 4248 * prevent vd from being accessed after it's freed.
4214 4249 */
4215 4250 vdpath = spa_strdup(vd->vdev_path);
4216 4251 for (int t = 0; t < TXG_SIZE; t++)
4217 4252 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4218 4253 vd->vdev_detached = B_TRUE;
4219 4254 vdev_dirty(tvd, VDD_DTL, vd, txg);
4220 4255
4221 4256 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4222 4257
4223 4258 /* hang on to the spa before we release the lock */
4224 4259 spa_open_ref(spa, FTAG);
4225 4260
4226 4261 error = spa_vdev_exit(spa, vd, txg, 0);
4227 4262
4228 4263 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
4229 4264 "vdev=%s", vdpath);
4230 4265 spa_strfree(vdpath);
4231 4266
4232 4267 /*
4233 4268 * If this was the removal of the original device in a hot spare vdev,
4234 4269 * then we want to go through and remove the device from the hot spare
4235 4270 * list of every other pool.
4236 4271 */
4237 4272 if (unspare) {
4238 4273 spa_t *altspa = NULL;
4239 4274
4240 4275 mutex_enter(&spa_namespace_lock);
4241 4276 while ((altspa = spa_next(altspa)) != NULL) {
4242 4277 if (altspa->spa_state != POOL_STATE_ACTIVE ||
4243 4278 altspa == spa)
4244 4279 continue;
4245 4280
4246 4281 spa_open_ref(altspa, FTAG);
4247 4282 mutex_exit(&spa_namespace_lock);
4248 4283 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4249 4284 mutex_enter(&spa_namespace_lock);
4250 4285 spa_close(altspa, FTAG);
4251 4286 }
4252 4287 mutex_exit(&spa_namespace_lock);
4253 4288
4254 4289 /* search the rest of the vdevs for spares to remove */
4255 4290 spa_vdev_resilver_done(spa);
4256 4291 }
4257 4292
4258 4293 /* all done with the spa; OK to release */
4259 4294 mutex_enter(&spa_namespace_lock);
4260 4295 spa_close(spa, FTAG);
4261 4296 mutex_exit(&spa_namespace_lock);
4262 4297
4263 4298 return (error);
4264 4299 }
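
/*
 * Illustrative sketch (not part of this change): cancelling a replacement by
 * detaching a leaf. Passing pguid == 0 skips the parent-identity check
 * described above; replace_done == B_FALSE also allows detaching from a
 * plain mirror.
 */
static int
example_detach(spa_t *spa, uint64_t guid)
{
	return (spa_vdev_detach(spa, guid, 0, B_FALSE));
}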
4265 4300
4266 4301 /*
4267 4302 * Split a set of devices from their mirrors, and create a new pool from them.
4268 4303 */
4269 4304 int
4270 4305 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4271 4306 nvlist_t *props, boolean_t exp)
4272 4307 {
4273 4308 int error = 0;
4274 4309 uint64_t txg, *glist;
4275 4310 spa_t *newspa;
4276 4311 uint_t c, children, lastlog;
4277 4312 nvlist_t **child, *nvl, *tmp;
4278 4313 dmu_tx_t *tx;
4279 4314 char *altroot = NULL;
4280 4315 vdev_t *rvd, **vml = NULL; /* vdev modify list */
4281 4316 boolean_t activate_slog;
4282 4317
4283 4318 ASSERT(spa_writeable(spa));
4284 4319
4285 4320 txg = spa_vdev_enter(spa);
4286 4321
4287 4322 /* clear the log and flush everything up to now */
4288 4323 activate_slog = spa_passivate_log(spa);
4289 4324 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4290 4325 error = spa_offline_log(spa);
4291 4326 txg = spa_vdev_config_enter(spa);
4292 4327
4293 4328 if (activate_slog)
4294 4329 spa_activate_log(spa);
4295 4330
4296 4331 if (error != 0)
4297 4332 return (spa_vdev_exit(spa, NULL, txg, error));
4298 4333
4299 4334 /* check new spa name before going any further */
4300 4335 if (spa_lookup(newname) != NULL)
4301 4336 return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4302 4337
4303 4338 /*
4304 4339 * scan through all the children to ensure they're all mirrors
4305 4340 */
4306 4341 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4307 4342 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4308 4343 &children) != 0)
4309 4344 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4310 4345
4311 4346 /* first, check to ensure we've got the right child count */
4312 4347 rvd = spa->spa_root_vdev;
4313 4348 lastlog = 0;
4314 4349 for (c = 0; c < rvd->vdev_children; c++) {
4315 4350 vdev_t *vd = rvd->vdev_child[c];
4316 4351
4317 4352 /* don't count the holes & logs as children */
4318 4353 if (vd->vdev_islog || vd->vdev_ishole) {
4319 4354 if (lastlog == 0)
4320 4355 lastlog = c;
4321 4356 continue;
4322 4357 }
4323 4358
4324 4359 lastlog = 0;
4325 4360 }
4326 4361 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4327 4362 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4328 4363
4329 4364 /* next, ensure no spare or cache devices are part of the split */
4330 4365 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4331 4366 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4332 4367 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4333 4368
4334 4369 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4335 4370 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4336 4371
4337 4372 /* then, loop over each vdev and validate it */
4338 4373 for (c = 0; c < children; c++) {
4339 4374 uint64_t is_hole = 0;
4340 4375
4341 4376 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4342 4377 &is_hole);
4343 4378
4344 4379 if (is_hole != 0) {
4345 4380 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4346 4381 spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4347 4382 continue;
4348 4383 } else {
4349 4384 error = EINVAL;
4350 4385 break;
4351 4386 }
4352 4387 }
4353 4388
4354 4389 /* which disk is going to be split? */
4355 4390 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4356 4391 &glist[c]) != 0) {
4357 4392 error = EINVAL;
4358 4393 break;
4359 4394 }
4360 4395
4361 4396 /* look it up in the spa */
4362 4397 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4363 4398 if (vml[c] == NULL) {
4364 4399 error = ENODEV;
4365 4400 break;
4366 4401 }
4367 4402
4368 4403 /* make sure there's nothing stopping the split */
4369 4404 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4370 4405 vml[c]->vdev_islog ||
4371 4406 vml[c]->vdev_ishole ||
4372 4407 vml[c]->vdev_isspare ||
4373 4408 vml[c]->vdev_isl2cache ||
4374 4409 !vdev_writeable(vml[c]) ||
4375 4410 vml[c]->vdev_children != 0 ||
4376 4411 vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4377 4412 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4378 4413 error = EINVAL;
4379 4414 break;
4380 4415 }
4381 4416
4382 4417 if (vdev_dtl_required(vml[c])) {
4383 4418 error = EBUSY;
4384 4419 break;
4385 4420 }
4386 4421
4387 4422 /* we need certain info from the top level */
4388 4423 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4389 4424 vml[c]->vdev_top->vdev_ms_array) == 0);
4390 4425 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4391 4426 vml[c]->vdev_top->vdev_ms_shift) == 0);
4392 4427 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4393 4428 vml[c]->vdev_top->vdev_asize) == 0);
4394 4429 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4395 4430 vml[c]->vdev_top->vdev_ashift) == 0);
4396 4431 }
4397 4432
4398 4433 if (error != 0) {
4399 4434 kmem_free(vml, children * sizeof (vdev_t *));
4400 4435 kmem_free(glist, children * sizeof (uint64_t));
4401 4436 return (spa_vdev_exit(spa, NULL, txg, error));
4402 4437 }
4403 4438
4404 4439 /* stop writers from using the disks */
4405 4440 for (c = 0; c < children; c++) {
4406 4441 if (vml[c] != NULL)
4407 4442 vml[c]->vdev_offline = B_TRUE;
4408 4443 }
4409 4444 vdev_reopen(spa->spa_root_vdev);
4410 4445
4411 4446 /*
4412 4447 * Temporarily record the splitting vdevs in the spa config. This
4413 4448 * will disappear once the config is regenerated.
4414 4449 */
4415 4450 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4416 4451 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4417 4452 glist, children) == 0);
4418 4453 kmem_free(glist, children * sizeof (uint64_t));
4419 4454
4420 4455 mutex_enter(&spa->spa_props_lock);
4421 4456 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4422 4457 nvl) == 0);
4423 4458 mutex_exit(&spa->spa_props_lock);
4424 4459 spa->spa_config_splitting = nvl;
4425 4460 vdev_config_dirty(spa->spa_root_vdev);
4426 4461
4427 4462 /* configure and create the new pool */
4428 4463 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4429 4464 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4430 4465 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4431 4466 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4432 4467 spa_version(spa)) == 0);
4433 4468 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4434 4469 spa->spa_config_txg) == 0);
4435 4470 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4436 4471 spa_generate_guid(NULL)) == 0);
4437 4472 (void) nvlist_lookup_string(props,
4438 4473 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4439 4474
4440 4475 /* add the new pool to the namespace */
4441 4476 newspa = spa_add(newname, config, altroot);
4442 4477 newspa->spa_config_txg = spa->spa_config_txg;
4443 4478 spa_set_log_state(newspa, SPA_LOG_CLEAR);
4444 4479
4445 4480 /* release the spa config lock, retaining the namespace lock */
4446 4481 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4447 4482
4448 4483 if (zio_injection_enabled)
4449 4484 zio_handle_panic_injection(spa, FTAG, 1);
4450 4485
4451 4486 spa_activate(newspa, spa_mode_global);
4452 4487 spa_async_suspend(newspa);
4453 4488
4454 4489 /* create the new pool from the disks of the original pool */
4455 4490 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4456 4491 if (error)
4457 4492 goto out;
4458 4493
4459 4494 /* if that worked, generate a real config for the new pool */
4460 4495 if (newspa->spa_root_vdev != NULL) {
4461 4496 VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4462 4497 NV_UNIQUE_NAME, KM_SLEEP) == 0);
4463 4498 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4464 4499 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4465 4500 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4466 4501 B_TRUE));
4467 4502 }
4468 4503
4469 4504 /* set the props */
4470 4505 if (props != NULL) {
4471 4506 spa_configfile_set(newspa, props, B_FALSE);
4472 4507 error = spa_prop_set(newspa, props);
4473 4508 if (error)
4474 4509 goto out;
4475 4510 }
4476 4511
4477 4512 /* flush everything */
4478 4513 txg = spa_vdev_config_enter(newspa);
4479 4514 vdev_config_dirty(newspa->spa_root_vdev);
4480 4515 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4481 4516
4482 4517 if (zio_injection_enabled)
4483 4518 zio_handle_panic_injection(spa, FTAG, 2);
4484 4519
4485 4520 spa_async_resume(newspa);
4486 4521
4487 4522 /* finally, update the original pool's config */
4488 4523 txg = spa_vdev_config_enter(spa);
4489 4524 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4490 4525 error = dmu_tx_assign(tx, TXG_WAIT);
4491 4526 if (error != 0)
4492 4527 dmu_tx_abort(tx);
4493 4528 for (c = 0; c < children; c++) {
4494 4529 if (vml[c] != NULL) {
4495 4530 vdev_split(vml[c]);
4496 4531 if (error == 0)
4497 4532 spa_history_log_internal(LOG_POOL_VDEV_DETACH,
4498 4533 spa, tx, "vdev=%s",
4499 4534 vml[c]->vdev_path);
4500 4535 vdev_free(vml[c]);
4501 4536 }
4502 4537 }
4503 4538 vdev_config_dirty(spa->spa_root_vdev);
4504 4539 spa->spa_config_splitting = NULL;
4505 4540 nvlist_free(nvl);
4506 4541 if (error == 0)
4507 4542 dmu_tx_commit(tx);
4508 4543 (void) spa_vdev_exit(spa, NULL, txg, 0);
4509 4544
4510 4545 if (zio_injection_enabled)
4511 4546 zio_handle_panic_injection(spa, FTAG, 3);
4512 4547
4513 4548 /* split is complete; log a history record */
4514 4549 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
4515 4550 "split new pool %s from pool %s", newname, spa_name(spa));
4516 4551
4517 4552 kmem_free(vml, children * sizeof (vdev_t *));
4518 4553
4519 4554 /* if we're not going to mount the filesystems in userland, export */
4520 4555 if (exp)
4521 4556 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
4522 4557 B_FALSE, B_FALSE);
4523 4558
4524 4559 return (error);
4525 4560
4526 4561 out:
4527 4562 spa_unload(newspa);
4528 4563 spa_deactivate(newspa);
4529 4564 spa_remove(newspa);
4530 4565
4531 4566 txg = spa_vdev_config_enter(spa);
4532 4567
4533 4568 /* re-online all offlined disks */
4534 4569 for (c = 0; c < children; c++) {
4535 4570 if (vml[c] != NULL)
4536 4571 vml[c]->vdev_offline = B_FALSE;
4537 4572 }
4538 4573 vdev_reopen(spa->spa_root_vdev);
4539 4574
4540 4575 nvlist_free(spa->spa_config_splitting);
4541 4576 spa->spa_config_splitting = NULL;
4542 4577 (void) spa_vdev_exit(spa, NULL, txg, error);
4543 4578
4544 4579 kmem_free(vml, children * sizeof (vdev_t *));
4545 4580 return (error);
4546 4581 }
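
/*
 * Illustrative sketch (not part of this change): splitting one side of each
 * mirror into a new, exported pool. 'config' is assumed to carry a
 * ZPOOL_CONFIG_VDEV_TREE naming exactly one healthy leaf per top-level
 * mirror, as validated above; the new pool name is hypothetical.
 */
static int
example_split(spa_t *spa, nvlist_t *config)
{
	char newname[] = "tanksplit";	/* hypothetical new pool name */

	return (spa_vdev_split_mirror(spa, newname, config, NULL, B_TRUE));
}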
4547 4582
4548 4583 static nvlist_t *
4549 4584 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
4550 4585 {
4551 4586 for (int i = 0; i < count; i++) {
4552 4587 uint64_t guid;
4553 4588
4554 4589 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4555 4590 &guid) == 0);
4556 4591
4557 4592 if (guid == target_guid)
4558 4593 return (nvpp[i]);
4559 4594 }
4560 4595
4561 4596 return (NULL);
4562 4597 }
4563 4598
4564 4599 static void
4565 4600 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4566 4601 nvlist_t *dev_to_remove)
4567 4602 {
4568 4603 nvlist_t **newdev = NULL;
4569 4604
4570 4605 if (count > 1)
4571 4606 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4572 4607
4573 4608 for (int i = 0, j = 0; i < count; i++) {
4574 4609 if (dev[i] == dev_to_remove)
4575 4610 continue;
4576 4611 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4577 4612 }
4578 4613
4579 4614 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4580 4615 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4581 4616
4582 4617 for (int i = 0; i < count - 1; i++)
4583 4618 nvlist_free(newdev[i]);
4584 4619
4585 4620 if (count > 1)
4586 4621 kmem_free(newdev, (count - 1) * sizeof (void *));
4587 4622 }
4588 4623
4589 4624 /*
4590 4625 * Evacuate the device.
4591 4626 */
4592 4627 static int
4593 4628 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
4594 4629 {
4595 4630 uint64_t txg;
4596 4631 int error = 0;
4597 4632
4598 4633 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4599 4634 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4600 4635 ASSERT(vd == vd->vdev_top);
4601 4636
4602 4637 /*
4603 4638 * Evacuate the device. We don't hold the config lock as writer
4604 4639 * since we need to do I/O but we do keep the
4605 4640 * spa_namespace_lock held. Once this completes the device
4606 4641 * should no longer have any blocks allocated on it.
4607 4642 */
4608 4643 if (vd->vdev_islog) {
4609 4644 if (vd->vdev_stat.vs_alloc != 0)
4610 4645 error = spa_offline_log(spa);
4611 4646 } else {
4612 4647 error = ENOTSUP;
4613 4648 }
4614 4649
4615 4650 if (error)
4616 4651 return (error);
4617 4652
4618 4653 /*
4619 4654 * The evacuation succeeded. Remove any remaining MOS metadata
4620 4655 * associated with this vdev, and wait for these changes to sync.
4621 4656 */
4622 4657 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
4623 4658 txg = spa_vdev_config_enter(spa);
4624 4659 vd->vdev_removing = B_TRUE;
4625 4660 vdev_dirty(vd, 0, NULL, txg);
4626 4661 vdev_config_dirty(vd);
4627 4662 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4628 4663
4629 4664 return (0);
4630 4665 }
4631 4666
4632 4667 /*
4633 4668 * Complete the removal by cleaning up the namespace.
4634 4669 */
4635 4670 static void
4636 4671 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
4637 4672 {
4638 4673 vdev_t *rvd = spa->spa_root_vdev;
4639 4674 uint64_t id = vd->vdev_id;
4640 4675 boolean_t last_vdev = (id == (rvd->vdev_children - 1));
4641 4676
4642 4677 ASSERT(MUTEX_HELD(&spa_namespace_lock));
4643 4678 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4644 4679 ASSERT(vd == vd->vdev_top);
4645 4680
4646 4681 /*
4647 4682 * Only remove any devices which are empty.
4648 4683 */
4649 4684 if (vd->vdev_stat.vs_alloc != 0)
4650 4685 return;
4651 4686
4652 4687 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4653 4688
4654 4689 if (list_link_active(&vd->vdev_state_dirty_node))
4655 4690 vdev_state_clean(vd);
4656 4691 if (list_link_active(&vd->vdev_config_dirty_node))
4657 4692 vdev_config_clean(vd);
4658 4693
4659 4694 vdev_free(vd);
4660 4695
4661 4696 if (last_vdev) {
4662 4697 vdev_compact_children(rvd);
4663 4698 } else {
4664 4699 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
4665 4700 vdev_add_child(rvd, vd);
4666 4701 }
4667 4702 vdev_config_dirty(rvd);
4668 4703
4669 4704 /*
4670 4705 * Reassess the health of our root vdev.
4671 4706 */
4672 4707 vdev_reopen(rvd);
4673 4708 }
4674 4709
4675 4710 /*
4676 4711 * Remove a device from the pool -
4677 4712 *
4678 4713 * Removing a device from the vdev namespace requires several steps
4679 4714 * and can take a significant amount of time. As a result we use
4680 4715 * the spa_vdev_config_[enter/exit] functions which allow us to
4681 4716 * grab and release the spa_config_lock while still holding the namespace
4682 4717 * lock. During each step the configuration is synced out.
4683 4718 */
4684 4719
4685 4720 /*
4686 4721 * Remove a device from the pool. Currently, this supports removing only hot
4687 4722 * spares, slogs, and level 2 ARC devices.
4688 4723 */
4689 4724 int
4690 4725 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
4691 4726 {
4692 4727 vdev_t *vd;
4693 4728 metaslab_group_t *mg;
4694 4729 nvlist_t **spares, **l2cache, *nv;
4695 4730 uint64_t txg = 0;
4696 4731 uint_t nspares, nl2cache;
4697 4732 int error = 0;
4698 4733 boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
4699 4734
4700 4735 ASSERT(spa_writeable(spa));
4701 4736
4702 4737 if (!locked)
4703 4738 txg = spa_vdev_enter(spa);
4704 4739
4705 4740 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4706 4741
4707 4742 if (spa->spa_spares.sav_vdevs != NULL &&
4708 4743 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4709 4744 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
4710 4745 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
4711 4746 /*
4712 4747 * Only remove the hot spare if it's not currently in use
4713 4748 * in this pool.
4714 4749 */
4715 4750 if (vd == NULL || unspare) {
4716 4751 spa_vdev_remove_aux(spa->spa_spares.sav_config,
4717 4752 ZPOOL_CONFIG_SPARES, spares, nspares, nv);
4718 4753 spa_load_spares(spa);
4719 4754 spa->spa_spares.sav_sync = B_TRUE;
4720 4755 } else {
4721 4756 error = EBUSY;
4722 4757 }
4723 4758 } else if (spa->spa_l2cache.sav_vdevs != NULL &&
4724 4759 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4725 4760 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
4726 4761 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
4727 4762 /*
4728 4763 * Cache devices can always be removed.
4729 4764 */
4730 4765 spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
4731 4766 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
4732 4767 spa_load_l2cache(spa);
4733 4768 spa->spa_l2cache.sav_sync = B_TRUE;
4734 4769 } else if (vd != NULL && vd->vdev_islog) {
4735 4770 ASSERT(!locked);
4736 4771 ASSERT(vd == vd->vdev_top);
4737 4772
4738 4773 /*
4739 4774 * XXX - Once we have bp-rewrite this should
4740 4775 * become the common case.
4741 4776 */
4742 4777
4743 4778 mg = vd->vdev_mg;
4744 4779
4745 4780 /*
4746 4781 * Stop allocating from this vdev.
4747 4782 */
4748 4783 metaslab_group_passivate(mg);
4749 4784
4750 4785 /*
4751 4786 * Wait for the youngest allocations and frees to sync,
4752 4787 * and then wait for the deferral of those frees to finish.
4753 4788 */
4754 4789 spa_vdev_config_exit(spa, NULL,
4755 4790 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
4756 4791
4757 4792 /*
4758 4793 * Attempt to evacuate the vdev.
4759 4794 */
4760 4795 error = spa_vdev_remove_evacuate(spa, vd);
4761 4796
4762 4797 txg = spa_vdev_config_enter(spa);
4763 4798
4764 4799 /*
4765 4800 * If we couldn't evacuate the vdev, unwind.
4766 4801 */
4767 4802 if (error) {
4768 4803 metaslab_group_activate(mg);
4769 4804 return (spa_vdev_exit(spa, NULL, txg, error));
4770 4805 }
4771 4806
4772 4807 /*
4773 4808 * Clean up the vdev namespace.
4774 4809 */
4775 4810 spa_vdev_remove_from_namespace(spa, vd);
4776 4811
4777 4812 } else if (vd != NULL) {
4778 4813 /*
4779 4814 * Normal vdevs cannot be removed (yet).
4780 4815 */
4781 4816 error = ENOTSUP;
4782 4817 } else {
4783 4818 /*
4784 4819 * There is no vdev of any kind with the specified guid.
4785 4820 */
4786 4821 error = ENOENT;
4787 4822 }
4788 4823
4789 4824 if (!locked)
4790 4825 return (spa_vdev_exit(spa, NULL, txg, error));
4791 4826
4792 4827 return (error);
4793 4828 }
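
/*
 * Illustrative sketch (not part of this change): removing an auxiliary
 * device (hot spare or cache device) by guid. With unspare == B_FALSE an
 * in-use hot spare is left alone and EBUSY is returned instead.
 */
static int
example_remove_aux(spa_t *spa, uint64_t guid)
{
	return (spa_vdev_remove(spa, guid, B_FALSE));
}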
4794 4829
4795 4830 /*
4796 4831 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4797 4832 * currently spared, so we can detach it.
4798 4833 */
4799 4834 static vdev_t *
4800 4835 spa_vdev_resilver_done_hunt(vdev_t *vd)
4801 4836 {
4802 4837 vdev_t *newvd, *oldvd;
4803 4838
4804 4839 for (int c = 0; c < vd->vdev_children; c++) {
4805 4840 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4806 4841 if (oldvd != NULL)
4807 4842 return (oldvd);
4808 4843 }
4809 4844
4810 4845 /*
4811 4846 * Check for a completed replacement. We always consider the first
4812 4847 * vdev in the list to be the oldest vdev, and the last one to be
4813 4848 * the newest (see spa_vdev_attach() for how that works). In
4814 4849 * the case where the newest vdev is faulted, we will not automatically
4815 4850 * remove it after a resilver completes. This is OK as it will require
4816 4851 * user intervention to determine which disk the admin wishes to keep.
4817 4852 */
4818 4853 if (vd->vdev_ops == &vdev_replacing_ops) {
4819 4854 ASSERT(vd->vdev_children > 1);
4820 4855
4821 4856 newvd = vd->vdev_child[vd->vdev_children - 1];
4822 4857 oldvd = vd->vdev_child[0];
4823 4858
4824 4859 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4825 4860 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4826 4861 !vdev_dtl_required(oldvd))
4827 4862 return (oldvd);
4828 4863 }
4829 4864
4830 4865 /*
4831 4866 * Check for a completed resilver with the 'unspare' flag set.
4832 4867 */
4833 4868 if (vd->vdev_ops == &vdev_spare_ops) {
4834 4869 vdev_t *first = vd->vdev_child[0];
4835 4870 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4836 4871
4837 4872 if (last->vdev_unspare) {
4838 4873 oldvd = first;
4839 4874 newvd = last;
4840 4875 } else if (first->vdev_unspare) {
4841 4876 oldvd = last;
4842 4877 newvd = first;
4843 4878 } else {
4844 4879 oldvd = NULL;
4845 4880 }
4846 4881
4847 4882 if (oldvd != NULL &&
4848 4883 vdev_dtl_empty(newvd, DTL_MISSING) &&
4849 4884 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4850 4885 !vdev_dtl_required(oldvd))
4851 4886 return (oldvd);
4852 4887
4853 4888 /*
4854 4889 * If there are more than two spares attached to a disk,
4855 4890 * and those spares are not required, then we want to
4856 4891 * attempt to free them up now so that they can be used
4857 4892 * by other pools. Once we're back down to a single
4858 4893 * disk+spare, we stop removing them.
4859 4894 */
4860 4895 if (vd->vdev_children > 2) {
4861 4896 newvd = vd->vdev_child[1];
4862 4897
4863 4898 if (newvd->vdev_isspare && last->vdev_isspare &&
4864 4899 vdev_dtl_empty(last, DTL_MISSING) &&
4865 4900 vdev_dtl_empty(last, DTL_OUTAGE) &&
4866 4901 !vdev_dtl_required(newvd))
4867 4902 return (newvd);
4868 4903 }
4869 4904 }
4870 4905
4871 4906 return (NULL);
4872 4907 }
4873 4908
4874 4909 static void
4875 4910 spa_vdev_resilver_done(spa_t *spa)
4876 4911 {
4877 4912 vdev_t *vd, *pvd, *ppvd;
4878 4913 uint64_t guid, sguid, pguid, ppguid;
4879 4914
4880 4915 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4881 4916
4882 4917 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4883 4918 pvd = vd->vdev_parent;
4884 4919 ppvd = pvd->vdev_parent;
4885 4920 guid = vd->vdev_guid;
4886 4921 pguid = pvd->vdev_guid;
4887 4922 ppguid = ppvd->vdev_guid;
4888 4923 sguid = 0;
4889 4924 /*
4890 4925 * If we have just finished replacing a hot spared device, then
4891 4926 * we need to detach the parent's first child (the original hot
4892 4927 * spare) as well.
4893 4928 */
4894 4929 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
4895 4930 ppvd->vdev_children == 2) {
4896 4931 ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4897 4932 sguid = ppvd->vdev_child[1]->vdev_guid;
4898 4933 }
4899 4934 spa_config_exit(spa, SCL_ALL, FTAG);
4900 4935 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4901 4936 return;
4902 4937 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4903 4938 return;
4904 4939 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4905 4940 }
4906 4941
4907 4942 spa_config_exit(spa, SCL_ALL, FTAG);
4908 4943 }
4909 4944
4910 4945 /*
4911 4946 * Update the stored path or FRU for this vdev.
4912 4947 */
4913 4948 int
4914 4949 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
4915 4950 boolean_t ispath)
4916 4951 {
4917 4952 vdev_t *vd;
4918 4953 boolean_t sync = B_FALSE;
4919 4954
4920 4955 ASSERT(spa_writeable(spa));
4921 4956
4922 4957 spa_vdev_state_enter(spa, SCL_ALL);
4923 4958
4924 4959 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4925 4960 return (spa_vdev_state_exit(spa, NULL, ENOENT));
4926 4961
4927 4962 if (!vd->vdev_ops->vdev_op_leaf)
4928 4963 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4929 4964
4930 4965 if (ispath) {
4931 4966 if (strcmp(value, vd->vdev_path) != 0) {
4932 4967 spa_strfree(vd->vdev_path);
4933 4968 vd->vdev_path = spa_strdup(value);
4934 4969 sync = B_TRUE;
4935 4970 }
4936 4971 } else {
4937 4972 if (vd->vdev_fru == NULL) {
4938 4973 vd->vdev_fru = spa_strdup(value);
4939 4974 sync = B_TRUE;
4940 4975 } else if (strcmp(value, vd->vdev_fru) != 0) {
4941 4976 spa_strfree(vd->vdev_fru);
4942 4977 vd->vdev_fru = spa_strdup(value);
4943 4978 sync = B_TRUE;
4944 4979 }
4945 4980 }
4946 4981
4947 4982 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4948 4983 }
4949 4984
4950 4985 int
4951 4986 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
4952 4987 {
4953 4988 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
4954 4989 }
4955 4990
4956 4991 int
4957 4992 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
4958 4993 {
4959 4994 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
4960 4995 }
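/*
 * Hedged sketch of a caller: roughly how an ioctl handler (in the style of
 * zfs_ioctl.c) might drive the two wrappers above.  The handler name below is
 * an illustrative assumption, not part of this file.
 */
static int
example_ioc_vdev_setpath(zfs_cmd_t *zc)
{
	spa_t *spa;
	int error;

	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
		return (error);

	/* zc_guid selects the leaf vdev, zc_value carries the new path */
	error = spa_vdev_setpath(spa, zc->zc_guid, zc->zc_value);

	spa_close(spa, FTAG);
	return (error);
}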
4961 4996
4962 4997 /*
4963 4998 * ==========================================================================
4964 4999 * SPA Scanning
4965 5000 * ==========================================================================
4966 5001 */
4967 5002
4968 5003 int
4969 5004 spa_scan_stop(spa_t *spa)
4970 5005 {
4971 5006 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4972 5007 if (dsl_scan_resilvering(spa->spa_dsl_pool))
4973 5008 return (EBUSY);
4974 5009 return (dsl_scan_cancel(spa->spa_dsl_pool));
4975 5010 }
4976 5011
4977 5012 int
4978 5013 spa_scan(spa_t *spa, pool_scan_func_t func)
4979 5014 {
4980 5015 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4981 5016
4982 5017 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
4983 5018 return (ENOTSUP);
4984 5019
4985 5020 /*
4986 5021 * If a resilver was requested, but there is no DTL on a
4987 5022 * writeable leaf device, we have nothing to do.
4988 5023 */
4989 5024 if (func == POOL_SCAN_RESILVER &&
4990 5025 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
4991 5026 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4992 5027 return (0);
4993 5028 }
4994 5029
4995 5030 return (dsl_scan(spa->spa_dsl_pool, func));
4996 5031 }
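/*
 * Hedged usage sketch: a scrub request enters through spa_scan() with
 * POOL_SCAN_SCRUB, and a stop request goes through spa_scan_stop().  The
 * wrapper below is illustrative only.
 */
static int
example_request_scrub(spa_t *spa, boolean_t stop)
{
	if (stop)
		return (spa_scan_stop(spa));
	return (spa_scan(spa, POOL_SCAN_SCRUB));
}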
4997 5032
4998 5033 /*
4999 5034 * ==========================================================================
5000 5035 * SPA async task processing
5001 5036 * ==========================================================================
5002 5037 */
5003 5038
5004 5039 static void
5005 5040 spa_async_remove(spa_t *spa, vdev_t *vd)
5006 5041 {
5007 5042 if (vd->vdev_remove_wanted) {
5008 5043 vd->vdev_remove_wanted = B_FALSE;
5009 5044 vd->vdev_delayed_close = B_FALSE;
5010 5045 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5011 5046
5012 5047 /*
5013 5048 * We want to clear the stats, but we don't want to do a full
5014 5049 * vdev_clear() as that will cause us to throw away
5015 5050 * degraded/faulted state as well as attempt to reopen the
5016 5051 * device, all of which is a waste.
5017 5052 */
5018 5053 vd->vdev_stat.vs_read_errors = 0;
5019 5054 vd->vdev_stat.vs_write_errors = 0;
5020 5055 vd->vdev_stat.vs_checksum_errors = 0;
5021 5056
5022 5057 vdev_state_dirty(vd->vdev_top);
5023 5058 }
5024 5059
5025 5060 for (int c = 0; c < vd->vdev_children; c++)
5026 5061 spa_async_remove(spa, vd->vdev_child[c]);
5027 5062 }
5028 5063
5029 5064 static void
5030 5065 spa_async_probe(spa_t *spa, vdev_t *vd)
5031 5066 {
5032 5067 if (vd->vdev_probe_wanted) {
5033 5068 vd->vdev_probe_wanted = B_FALSE;
5034 5069 vdev_reopen(vd); /* vdev_open() does the actual probe */
5035 5070 }
5036 5071
5037 5072 for (int c = 0; c < vd->vdev_children; c++)
5038 5073 spa_async_probe(spa, vd->vdev_child[c]);
5039 5074 }
5040 5075
5041 5076 static void
5042 5077 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5043 5078 {
5044 5079 sysevent_id_t eid;
5045 5080 nvlist_t *attr;
5046 5081 char *physpath;
5047 5082
5048 5083 if (!spa->spa_autoexpand)
5049 5084 return;
5050 5085
5051 5086 for (int c = 0; c < vd->vdev_children; c++) {
5052 5087 vdev_t *cvd = vd->vdev_child[c];
5053 5088 spa_async_autoexpand(spa, cvd);
5054 5089 }
5055 5090
5056 5091 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5057 5092 return;
5058 5093
5059 5094 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5060 5095 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5061 5096
5062 5097 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5063 5098 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5064 5099
5065 5100 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5066 5101 ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5067 5102
5068 5103 nvlist_free(attr);
5069 5104 kmem_free(physpath, MAXPATHLEN);
5070 5105 }
5071 5106
5072 5107 static void
5073 5108 spa_async_thread(spa_t *spa)
5074 5109 {
5075 5110 int tasks;
5076 5111
5077 5112 ASSERT(spa->spa_sync_on);
5078 5113
5079 5114 mutex_enter(&spa->spa_async_lock);
5080 5115 tasks = spa->spa_async_tasks;
5081 5116 spa->spa_async_tasks = 0;
5082 5117 mutex_exit(&spa->spa_async_lock);
5083 5118
5084 5119 /*
5085 5120 * See if the config needs to be updated.
5086 5121 */
5087 5122 if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5088 5123 uint64_t old_space, new_space;
5089 5124
5090 5125 mutex_enter(&spa_namespace_lock);
5091 5126 old_space = metaslab_class_get_space(spa_normal_class(spa));
5092 5127 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5093 5128 new_space = metaslab_class_get_space(spa_normal_class(spa));
5094 5129 mutex_exit(&spa_namespace_lock);
5095 5130
5096 5131 /*
5097 5132 * If the pool grew as a result of the config update,
5098 5133 * then log an internal history event.
5099 5134 */
5100 5135 if (new_space != old_space) {
5101 5136 spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
5102 5137 spa, NULL,
5103 5138 "pool '%s' size: %llu(+%llu)",
5104 5139 spa_name(spa), new_space, new_space - old_space);
5105 5140 }
5106 5141 }
5107 5142
5108 5143 /*
5109 5144 * See if any devices need to be marked REMOVED.
5110 5145 */
5111 5146 if (tasks & SPA_ASYNC_REMOVE) {
5112 5147 spa_vdev_state_enter(spa, SCL_NONE);
5113 5148 spa_async_remove(spa, spa->spa_root_vdev);
5114 5149 for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5115 5150 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5116 5151 for (int i = 0; i < spa->spa_spares.sav_count; i++)
5117 5152 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5118 5153 (void) spa_vdev_state_exit(spa, NULL, 0);
5119 5154 }
5120 5155
5121 5156 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5122 5157 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5123 5158 spa_async_autoexpand(spa, spa->spa_root_vdev);
5124 5159 spa_config_exit(spa, SCL_CONFIG, FTAG);
5125 5160 }
5126 5161
5127 5162 /*
5128 5163 * See if any devices need to be probed.
5129 5164 */
5130 5165 if (tasks & SPA_ASYNC_PROBE) {
5131 5166 spa_vdev_state_enter(spa, SCL_NONE);
5132 5167 spa_async_probe(spa, spa->spa_root_vdev);
5133 5168 (void) spa_vdev_state_exit(spa, NULL, 0);
5134 5169 }
5135 5170
5136 5171 /*
5137 5172 * If any devices are done replacing, detach them.
5138 5173 */
5139 5174 if (tasks & SPA_ASYNC_RESILVER_DONE)
5140 5175 spa_vdev_resilver_done(spa);
5141 5176
5142 5177 /*
5143 5178 * Kick off a resilver.
5144 5179 */
5145 5180 if (tasks & SPA_ASYNC_RESILVER)
5146 5181 dsl_resilver_restart(spa->spa_dsl_pool, 0);
5147 5182
5148 5183 /*
5149 5184 * Let the world know that we're done.
5150 5185 */
5151 5186 mutex_enter(&spa->spa_async_lock);
5152 5187 spa->spa_async_thread = NULL;
5153 5188 cv_broadcast(&spa->spa_async_cv);
5154 5189 mutex_exit(&spa->spa_async_lock);
5155 5190 thread_exit();
5156 5191 }
5157 5192
5158 5193 void
5159 5194 spa_async_suspend(spa_t *spa)
5160 5195 {
5161 5196 mutex_enter(&spa->spa_async_lock);
5162 5197 spa->spa_async_suspended++;
5163 5198 while (spa->spa_async_thread != NULL)
5164 5199 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5165 5200 mutex_exit(&spa->spa_async_lock);
5166 5201 }
5167 5202
5168 5203 void
5169 5204 spa_async_resume(spa_t *spa)
5170 5205 {
5171 5206 mutex_enter(&spa->spa_async_lock);
5172 5207 ASSERT(spa->spa_async_suspended != 0);
5173 5208 spa->spa_async_suspended--;
5174 5209 mutex_exit(&spa->spa_async_lock);
5175 5210 }
5176 5211
5177 5212 static void
5178 5213 spa_async_dispatch(spa_t *spa)
5179 5214 {
5180 5215 mutex_enter(&spa->spa_async_lock);
5181 5216 if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5182 5217 spa->spa_async_thread == NULL &&
5183 5218 rootdir != NULL && !vn_is_readonly(rootdir))
5184 5219 spa->spa_async_thread = thread_create(NULL, 0,
5185 5220 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5186 5221 mutex_exit(&spa->spa_async_lock);
5187 5222 }
5188 5223
5189 5224 void
5190 5225 spa_async_request(spa_t *spa, int task)
5191 5226 {
5192 5227 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5193 5228 mutex_enter(&spa->spa_async_lock);
5194 5229 spa->spa_async_tasks |= task;
5195 5230 mutex_exit(&spa->spa_async_lock);
5196 5231 }
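/*
 * Hedged sketch of the request/dispatch pattern: a producer records what it
 * wants done, ORs the corresponding SPA_ASYNC_* bit into spa_async_tasks via
 * spa_async_request(), and the work is only performed once spa_sync() calls
 * spa_async_dispatch() and the async thread runs.  The helper below is
 * illustrative only.
 */
static void
example_flag_vdev_removed(spa_t *spa, vdev_t *vd)
{
	vd->vdev_remove_wanted = B_TRUE;		/* consumed by spa_async_remove() */
	spa_async_request(spa, SPA_ASYNC_REMOVE);	/* picked up at next dispatch */
}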
5197 5232
5198 5233 /*
5199 5234 * ==========================================================================
5200 5235 * SPA syncing routines
5201 5236 * ==========================================================================
5202 5237 */
5203 5238
5204 5239 static int
5205 5240 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5206 5241 {
5207 5242 bpobj_t *bpo = arg;
5208 5243 bpobj_enqueue(bpo, bp, tx);
5209 5244 return (0);
5210 5245 }
5211 5246
5212 5247 static int
5213 5248 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5214 5249 {
5215 5250 zio_t *zio = arg;
5216 5251
5217 5252 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5218 5253 zio->io_flags));
5219 5254 return (0);
5220 5255 }
5221 5256
5222 5257 static void
5223 5258 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5224 5259 {
5225 5260 char *packed = NULL;
5226 5261 size_t bufsize;
5227 5262 size_t nvsize = 0;
5228 5263 dmu_buf_t *db;
5229 5264
5230 5265 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5231 5266
5232 5267 /*
5233 5268 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5234 5269 * information. This avoids the dbuf_will_dirty() path and
5235 5270 * saves us a pre-read to get data we don't actually care about.
5236 5271 */
5237 5272 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
5238 5273 packed = kmem_alloc(bufsize, KM_SLEEP);
5239 5274
5240 5275 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5241 5276 KM_SLEEP) == 0);
5242 5277 bzero(packed + nvsize, bufsize - nvsize);
5243 5278
5244 5279 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5245 5280
5246 5281 kmem_free(packed, bufsize);
5247 5282
5248 5283 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5249 5284 dmu_buf_will_dirty(db, tx);
5250 5285 *(uint64_t *)db->db_data = nvsize;
5251 5286 dmu_buf_rele(db, FTAG);
5252 5287 }
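/*
 * Worked example of the round-up above, assuming SPA_CONFIG_BLOCKSIZE is
 * (1 << 14), i.e. 16K:
 *
 *	nvsize  = 9000
 *	bufsize = P2ROUNDUP(9000, 16384) = 16384
 *
 * dmu_write() then covers the entire 16K block, the tail beyond nvsize is
 * zeroed by the bzero() above, and the dbuf_will_dirty() path never has to
 * pre-read old block contents that are about to be overwritten anyway.
 */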
5253 5288
5254 5289 static void
5255 5290 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5256 5291 const char *config, const char *entry)
5257 5292 {
5258 5293 nvlist_t *nvroot;
5259 5294 nvlist_t **list;
5260 5295 int i;
5261 5296
5262 5297 if (!sav->sav_sync)
5263 5298 return;
5264 5299
5265 5300 /*
5266 5301 * Update the MOS nvlist describing the list of available devices.
5267 5302 * spa_validate_aux() will have already made sure this nvlist is
5268 5303 * valid and the vdevs are labeled appropriately.
5269 5304 */
5270 5305 if (sav->sav_object == 0) {
5271 5306 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5272 5307 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5273 5308 sizeof (uint64_t), tx);
5274 5309 VERIFY(zap_update(spa->spa_meta_objset,
5275 5310 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5276 5311 &sav->sav_object, tx) == 0);
5277 5312 }
5278 5313
5279 5314 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5280 5315 if (sav->sav_count == 0) {
5281 5316 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5282 5317 } else {
5283 5318 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5284 5319 for (i = 0; i < sav->sav_count; i++)
5285 5320 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5286 5321 B_FALSE, VDEV_CONFIG_L2CACHE);
5287 5322 VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5288 5323 sav->sav_count) == 0);
5289 5324 for (i = 0; i < sav->sav_count; i++)
5290 5325 nvlist_free(list[i]);
5291 5326 kmem_free(list, sav->sav_count * sizeof (void *));
5292 5327 }
5293 5328
5294 5329 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5295 5330 nvlist_free(nvroot);
5296 5331
5297 5332 sav->sav_sync = B_FALSE;
5298 5333 }
5299 5334
5300 5335 static void
5301 5336 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5302 5337 {
5303 5338 nvlist_t *config;
5304 5339
5305 5340 if (list_is_empty(&spa->spa_config_dirty_list))
5306 5341 return;
5307 5342
5308 5343 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5309 5344
5310 5345 config = spa_config_generate(spa, spa->spa_root_vdev,
5311 5346 dmu_tx_get_txg(tx), B_FALSE);
5312 5347
5313 5348 spa_config_exit(spa, SCL_STATE, FTAG);
5314 5349
5315 5350 if (spa->spa_config_syncing)
5316 5351 nvlist_free(spa->spa_config_syncing);
5317 5352 spa->spa_config_syncing = config;
5318 5353
5319 5354 spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5320 5355 }
5321 5356
5322 5357 /*
5323 5358 * Set zpool properties.
5324 5359 */
5325 5360 static void
5326 5361 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5327 5362 {
5328 5363 spa_t *spa = arg1;
5329 5364 objset_t *mos = spa->spa_meta_objset;
5330 5365 nvlist_t *nvp = arg2;
5331 5366 nvpair_t *elem;
5332 5367 uint64_t intval;
5333 5368 char *strval;
5334 5369 zpool_prop_t prop;
5335 5370 const char *propname;
5336 5371 zprop_type_t proptype;
5337 5372
5338 5373 mutex_enter(&spa->spa_props_lock);
5339 5374
5340 5375 elem = NULL;
5341 5376 while ((elem = nvlist_next_nvpair(nvp, elem))) {
5342 5377 switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
5343 5378 case ZPOOL_PROP_VERSION:
5344 5379 /*
5345 5380 * Only set version for non-zpool-creation cases
5346 5381 * (set/import). spa_create() needs special care
5347 5382 * for version setting.
5348 5383 */
5349 5384 if (tx->tx_txg != TXG_INITIAL) {
5350 5385 VERIFY(nvpair_value_uint64(elem,
5351 5386 &intval) == 0);
5352 5387 ASSERT(intval <= SPA_VERSION);
5353 5388 ASSERT(intval >= spa_version(spa));
5354 5389 spa->spa_uberblock.ub_version = intval;
5355 5390 vdev_config_dirty(spa->spa_root_vdev);
5356 5391 }
5357 5392 break;
5358 5393
5359 5394 case ZPOOL_PROP_ALTROOT:
5360 5395 /*
5361 5396 * 'altroot' is a non-persistent property. It should
5362 5397 * have been set temporarily at creation or import time.
5363 5398 */
5364 5399 ASSERT(spa->spa_root != NULL);
5365 5400 break;
5366 5401
5367 5402 case ZPOOL_PROP_READONLY:
5368 5403 case ZPOOL_PROP_CACHEFILE:
5369 5404 /*
5370 5405 * 'readonly' and 'cachefile' are also non-persistent
5371 5406 * properties.
5372 5407 */
5373 5408 break;
5409 + case ZPOOL_PROP_COMMENT:
5410 + VERIFY(nvpair_value_string(elem, &strval) == 0);
5411 + if (spa->spa_comment != NULL)
5412 + spa_strfree(spa->spa_comment);
5413 + spa->spa_comment = spa_strdup(strval);
5414 + /*
5415 + * We need to dirty the configuration on all the vdevs
5416 + * so that their labels get updated. It's unnecessary
5417 + * to do this for pool creation since the vdev's
5418 + * configuration has already been dirtied.
5419 + */
5420 + if (tx->tx_txg != TXG_INITIAL)
5421 + vdev_config_dirty(spa->spa_root_vdev);
5422 + break;
5374 5423 default:
5375 5424 /*
5376 5425 * Set pool property values in the poolprops mos object.
5377 5426 */
5378 5427 if (spa->spa_pool_props_object == 0) {
5379 5428 VERIFY((spa->spa_pool_props_object =
5380 5429 zap_create(mos, DMU_OT_POOL_PROPS,
5381 5430 DMU_OT_NONE, 0, tx)) > 0);
5382 5431
5383 5432 VERIFY(zap_update(mos,
5384 5433 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5385 5434 8, 1, &spa->spa_pool_props_object, tx)
5386 5435 == 0);
5387 5436 }
5388 5437
5389 5438 /* normalize the property name */
5390 5439 propname = zpool_prop_to_name(prop);
5391 5440 proptype = zpool_prop_get_type(prop);
5392 5441
5393 5442 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5394 5443 ASSERT(proptype == PROP_TYPE_STRING);
5395 5444 VERIFY(nvpair_value_string(elem, &strval) == 0);
5396 5445 VERIFY(zap_update(mos,
5397 5446 spa->spa_pool_props_object, propname,
5398 5447 1, strlen(strval) + 1, strval, tx) == 0);
5399 5448
5400 5449 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5401 5450 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5402 5451
5403 5452 if (proptype == PROP_TYPE_INDEX) {
5404 5453 const char *unused;
5405 5454 VERIFY(zpool_prop_index_to_string(
5406 5455 prop, intval, &unused) == 0);
5407 5456 }
5408 5457 VERIFY(zap_update(mos,
5409 5458 spa->spa_pool_props_object, propname,
5410 5459 8, 1, &intval, tx) == 0);
5411 5460 } else {
5412 5461 ASSERT(0); /* not allowed */
5413 5462 }
5414 5463
5415 5464 switch (prop) {
5416 5465 case ZPOOL_PROP_DELEGATION:
5417 5466 spa->spa_delegation = intval;
5418 5467 break;
5419 5468 case ZPOOL_PROP_BOOTFS:
5420 5469 spa->spa_bootfs = intval;
5421 5470 break;
5422 5471 case ZPOOL_PROP_FAILUREMODE:
5423 5472 spa->spa_failmode = intval;
5424 5473 break;
5425 5474 case ZPOOL_PROP_AUTOEXPAND:
5426 5475 spa->spa_autoexpand = intval;
5427 5476 if (tx->tx_txg != TXG_INITIAL)
5428 5477 spa_async_request(spa,
5429 5478 SPA_ASYNC_AUTOEXPAND);
5430 5479 break;
5431 5480 case ZPOOL_PROP_DEDUPDITTO:
5432 5481 spa->spa_dedup_ditto = intval;
5433 5482 break;
5434 5483 default:
5435 5484 break;
5436 5485 }
5437 5486 }
5438 5487
5439 5488 /* log internal history if this is not a zpool create */
5440 5489 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5441 5490 tx->tx_txg != TXG_INITIAL) {
5442 5491 spa_history_log_internal(LOG_POOL_PROPSET,
5443 5492 spa, tx, "%s %lld %s",
5444 5493 nvpair_name(elem), intval, spa_name(spa));
5445 5494 }
5446 5495 }
5447 5496
5448 5497 mutex_exit(&spa->spa_props_lock);
5449 5498 }
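/*
 * Hedged sketch tying the new ZPOOL_PROP_COMMENT case above to a setter.
 * spa_prop_set() (defined elsewhere in this file) validates the nvlist and
 * arranges for spa_sync_props() to run in syncing context; the helper below
 * only illustrates building the nvlist and is not part of this change.
 */
static int
example_set_pool_comment(spa_t *spa, const char *comment)
{
	nvlist_t *props;
	int error;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_COMMENT), comment) == 0);

	error = spa_prop_set(spa, props);
	nvlist_free(props);
	return (error);
}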
5450 5499
5451 5500 /*
5452 5501 * Perform one-time upgrade on-disk changes. spa_version() does not
5453 5502 * reflect the new version this txg, so there must be no changes this
5454 5503 * txg to anything that the upgrade code depends on after it executes.
5455 5504 * Therefore this must be called after dsl_pool_sync() does the sync
5456 5505 * tasks.
5457 5506 */
5458 5507 static void
5459 5508 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
5460 5509 {
5461 5510 dsl_pool_t *dp = spa->spa_dsl_pool;
5462 5511
5463 5512 ASSERT(spa->spa_sync_pass == 1);
5464 5513
5465 5514 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5466 5515 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5467 5516 dsl_pool_create_origin(dp, tx);
5468 5517
5469 5518 /* Keeping the origin open increases spa_minref */
5470 5519 spa->spa_minref += 3;
5471 5520 }
5472 5521
5473 5522 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5474 5523 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5475 5524 dsl_pool_upgrade_clones(dp, tx);
5476 5525 }
5477 5526
5478 5527 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5479 5528 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5480 5529 dsl_pool_upgrade_dir_clones(dp, tx);
5481 5530
5482 5531 /* Keeping the freedir open increases spa_minref */
5483 5532 spa->spa_minref += 3;
5484 5533 }
5485 5534 }
5486 5535
5487 5536 /*
5488 5537 * Sync the specified transaction group. New blocks may be dirtied as
5489 5538 * part of the process, so we iterate until it converges.
5490 5539 */
5491 5540 void
5492 5541 spa_sync(spa_t *spa, uint64_t txg)
5493 5542 {
5494 5543 dsl_pool_t *dp = spa->spa_dsl_pool;
5495 5544 objset_t *mos = spa->spa_meta_objset;
5496 5545 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5497 5546 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5498 5547 vdev_t *rvd = spa->spa_root_vdev;
5499 5548 vdev_t *vd;
5500 5549 dmu_tx_t *tx;
5501 5550 int error;
5502 5551
5503 5552 VERIFY(spa_writeable(spa));
5504 5553
5505 5554 /*
5506 5555 * Lock out configuration changes.
5507 5556 */
5508 5557 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5509 5558
5510 5559 spa->spa_syncing_txg = txg;
5511 5560 spa->spa_sync_pass = 0;
5512 5561
5513 5562 /*
5514 5563 * If there are any pending vdev state changes, convert them
5515 5564 * into config changes that go out with this transaction group.
5516 5565 */
5517 5566 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5518 5567 while (list_head(&spa->spa_state_dirty_list) != NULL) {
5519 5568 /*
5520 5569 * We need the write lock here because, for aux vdevs,
5521 5570 * calling vdev_config_dirty() modifies sav_config.
5522 5571 * This is ugly and will become unnecessary when we
5523 5572 * eliminate the aux vdev wart by integrating all vdevs
5524 5573 * into the root vdev tree.
5525 5574 */
5526 5575 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5527 5576 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
5528 5577 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
5529 5578 vdev_state_clean(vd);
5530 5579 vdev_config_dirty(vd);
5531 5580 }
5532 5581 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5533 5582 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
5534 5583 }
5535 5584 spa_config_exit(spa, SCL_STATE, FTAG);
5536 5585
5537 5586 tx = dmu_tx_create_assigned(dp, txg);
5538 5587
5539 5588 /*
5540 5589 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
5541 5590 * set spa_deflate if we have no raid-z vdevs.
5542 5591 */
5543 5592 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
5544 5593 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
5545 5594 int i;
5546 5595
5547 5596 for (i = 0; i < rvd->vdev_children; i++) {
5548 5597 vd = rvd->vdev_child[i];
5549 5598 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
5550 5599 break;
5551 5600 }
5552 5601 if (i == rvd->vdev_children) {
5553 5602 spa->spa_deflate = TRUE;
5554 5603 VERIFY(0 == zap_add(spa->spa_meta_objset,
5555 5604 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5556 5605 sizeof (uint64_t), 1, &spa->spa_deflate, tx));
5557 5606 }
5558 5607 }
5559 5608
5560 5609 /*
5561 5610 * If anything has changed in this txg, or if someone is waiting
5562 5611 * for this txg to sync (eg, spa_vdev_remove()), push the
5563 5612 * deferred frees from the previous txg. If not, leave them
5564 5613 * alone so that we don't generate work on an otherwise idle
5565 5614 * system.
5566 5615 */
5567 5616 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
5568 5617 !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
5569 5618 !txg_list_empty(&dp->dp_sync_tasks, txg) ||
5570 5619 ((dsl_scan_active(dp->dp_scan) ||
5571 5620 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
5572 5621 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5573 5622 VERIFY3U(bpobj_iterate(defer_bpo,
5574 5623 spa_free_sync_cb, zio, tx), ==, 0);
5575 5624 VERIFY3U(zio_wait(zio), ==, 0);
5576 5625 }
5577 5626
5578 5627 /*
5579 5628 * Iterate to convergence.
5580 5629 */
5581 5630 do {
5582 5631 int pass = ++spa->spa_sync_pass;
5583 5632
5584 5633 spa_sync_config_object(spa, tx);
5585 5634 spa_sync_aux_dev(spa, &spa->spa_spares, tx,
5586 5635 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
5587 5636 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
5588 5637 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
5589 5638 spa_errlog_sync(spa, txg);
5590 5639 dsl_pool_sync(dp, txg);
5591 5640
5592 5641 if (pass <= SYNC_PASS_DEFERRED_FREE) {
5593 5642 zio_t *zio = zio_root(spa, NULL, NULL, 0);
5594 5643 bplist_iterate(free_bpl, spa_free_sync_cb,
5595 5644 zio, tx);
5596 5645 VERIFY(zio_wait(zio) == 0);
5597 5646 } else {
5598 5647 bplist_iterate(free_bpl, bpobj_enqueue_cb,
5599 5648 defer_bpo, tx);
5600 5649 }
5601 5650
5602 5651 ddt_sync(spa, txg);
5603 5652 dsl_scan_sync(dp, tx);
5604 5653
5605 5654 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
5606 5655 vdev_sync(vd, txg);
5607 5656
5608 5657 if (pass == 1)
5609 5658 spa_sync_upgrades(spa, tx);
5610 5659
5611 5660 } while (dmu_objset_is_dirty(mos, txg));
5612 5661
5613 5662 /*
5614 5663 * Rewrite the vdev configuration (which includes the uberblock)
5615 5664 * to commit the transaction group.
5616 5665 *
5617 5666 * If there are no dirty vdevs, we sync the uberblock to a few
5618 5667 * random top-level vdevs that are known to be visible in the
5619 5668 * config cache (see spa_vdev_add() for a complete description).
5620 5669 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
5621 5670 */
5622 5671 for (;;) {
5623 5672 /*
5624 5673 * We hold SCL_STATE to prevent vdev open/close/etc.
5625 5674 * while we're attempting to write the vdev labels.
5626 5675 */
5627 5676 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5628 5677
5629 5678 if (list_is_empty(&spa->spa_config_dirty_list)) {
5630 5679 vdev_t *svd[SPA_DVAS_PER_BP];
5631 5680 int svdcount = 0;
5632 5681 int children = rvd->vdev_children;
5633 5682 int c0 = spa_get_random(children);
5634 5683
5635 5684 for (int c = 0; c < children; c++) {
5636 5685 vd = rvd->vdev_child[(c0 + c) % children];
5637 5686 if (vd->vdev_ms_array == 0 || vd->vdev_islog)
5638 5687 continue;
5639 5688 svd[svdcount++] = vd;
5640 5689 if (svdcount == SPA_DVAS_PER_BP)
5641 5690 break;
5642 5691 }
5643 5692 error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
5644 5693 if (error != 0)
5645 5694 error = vdev_config_sync(svd, svdcount, txg,
5646 5695 B_TRUE);
5647 5696 } else {
5648 5697 error = vdev_config_sync(rvd->vdev_child,
5649 5698 rvd->vdev_children, txg, B_FALSE);
5650 5699 if (error != 0)
5651 5700 error = vdev_config_sync(rvd->vdev_child,
5652 5701 rvd->vdev_children, txg, B_TRUE);
5653 5702 }
5654 5703
5655 5704 spa_config_exit(spa, SCL_STATE, FTAG);
5656 5705
5657 5706 if (error == 0)
5658 5707 break;
5659 5708 zio_suspend(spa, NULL);
5660 5709 zio_resume_wait(spa);
5661 5710 }
5662 5711 dmu_tx_commit(tx);
5663 5712
5664 5713 /*
5665 5714 * Clear the dirty config list.
5666 5715 */
5667 5716 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
5668 5717 vdev_config_clean(vd);
5669 5718
5670 5719 /*
5671 5720 * Now that the new config has synced transactionally,
5672 5721 * let it become visible to the config cache.
5673 5722 */
5674 5723 if (spa->spa_config_syncing != NULL) {
5675 5724 spa_config_set(spa, spa->spa_config_syncing);
5676 5725 spa->spa_config_txg = txg;
5677 5726 spa->spa_config_syncing = NULL;
5678 5727 }
5679 5728
5680 5729 spa->spa_ubsync = spa->spa_uberblock;
5681 5730
5682 5731 dsl_pool_sync_done(dp, txg);
5683 5732
5684 5733 /*
5685 5734 * Update usable space statistics.
5686 5735 */
5687 5736 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
5688 5737 vdev_sync_done(vd, txg);
5689 5738
5690 5739 spa_update_dspace(spa);
5691 5740
5692 5741 /*
5693 5742 * It had better be the case that we didn't dirty anything
5694 5743 * since vdev_config_sync().
5695 5744 */
5696 5745 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
5697 5746 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
5698 5747 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
5699 5748
5700 5749 spa->spa_sync_pass = 0;
5701 5750
5702 5751 spa_config_exit(spa, SCL_CONFIG, FTAG);
5703 5752
5704 5753 spa_handle_ignored_writes(spa);
5705 5754
5706 5755 /*
5707 5756 * If any async tasks have been requested, kick them off.
5708 5757 */
5709 5758 spa_async_dispatch(spa);
5710 5759 }
5711 5760
5712 5761 /*
5713 5762 * Sync all pools. We don't want to hold the namespace lock across these
5714 5763 * operations, so we take a reference on the spa_t and drop the lock during the
5715 5764 * sync.
5716 5765 */
5717 5766 void
5718 5767 spa_sync_allpools(void)
5719 5768 {
5720 5769 spa_t *spa = NULL;
5721 5770 mutex_enter(&spa_namespace_lock);
5722 5771 while ((spa = spa_next(spa)) != NULL) {
5723 5772 if (spa_state(spa) != POOL_STATE_ACTIVE ||
5724 5773 !spa_writeable(spa) || spa_suspended(spa))
5725 5774 continue;
5726 5775 spa_open_ref(spa, FTAG);
5727 5776 mutex_exit(&spa_namespace_lock);
5728 5777 txg_wait_synced(spa_get_dsl(spa), 0);
5729 5778 mutex_enter(&spa_namespace_lock);
5730 5779 spa_close(spa, FTAG);
5731 5780 }
5732 5781 mutex_exit(&spa_namespace_lock);
5733 5782 }
5734 5783
5735 5784 /*
5736 5785 * ==========================================================================
5737 5786 * Miscellaneous routines
5738 5787 * ==========================================================================
5739 5788 */
5740 5789
5741 5790 /*
5742 5791 * Remove all pools in the system.
5743 5792 */
5744 5793 void
5745 5794 spa_evict_all(void)
5746 5795 {
5747 5796 spa_t *spa;
5748 5797
5749 5798 /*
5750 5799 * Remove all cached state. All pools should be closed now,
5751 5800 * so every spa in the AVL tree should be unreferenced.
5752 5801 */
5753 5802 mutex_enter(&spa_namespace_lock);
5754 5803 while ((spa = spa_next(NULL)) != NULL) {
5755 5804 /*
5756 5805 * Stop async tasks. The async thread may need to detach
5757 5806 * a device that's been replaced, which requires grabbing
5758 5807 * spa_namespace_lock, so we must drop it here.
5759 5808 */
5760 5809 spa_open_ref(spa, FTAG);
5761 5810 mutex_exit(&spa_namespace_lock);
5762 5811 spa_async_suspend(spa);
5763 5812 mutex_enter(&spa_namespace_lock);
5764 5813 spa_close(spa, FTAG);
5765 5814
5766 5815 if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5767 5816 spa_unload(spa);
5768 5817 spa_deactivate(spa);
5769 5818 }
5770 5819 spa_remove(spa);
5771 5820 }
5772 5821 mutex_exit(&spa_namespace_lock);
5773 5822 }
5774 5823
5775 5824 vdev_t *
5776 5825 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5777 5826 {
5778 5827 vdev_t *vd;
5779 5828 int i;
5780 5829
5781 5830 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5782 5831 return (vd);
5783 5832
5784 5833 if (aux) {
5785 5834 for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5786 5835 vd = spa->spa_l2cache.sav_vdevs[i];
5787 5836 if (vd->vdev_guid == guid)
5788 5837 return (vd);
5789 5838 }
5790 5839
5791 5840 for (i = 0; i < spa->spa_spares.sav_count; i++) {
5792 5841 vd = spa->spa_spares.sav_vdevs[i];
5793 5842 if (vd->vdev_guid == guid)
5794 5843 return (vd);
5795 5844 }
5796 5845 }
5797 5846
5798 5847 return (NULL);
5799 5848 }
5800 5849
5801 5850 void
5802 5851 spa_upgrade(spa_t *spa, uint64_t version)
5803 5852 {
5804 5853 ASSERT(spa_writeable(spa));
5805 5854
5806 5855 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5807 5856
5808 5857 /*
5809 5858 * This should only be called for a non-faulted pool, and since a
5810 5859 * future version would result in an unopenable pool, this shouldn't be
5811 5860 * possible.
5812 5861 */
5813 5862 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5814 5863 ASSERT(version >= spa->spa_uberblock.ub_version);
5815 5864
5816 5865 spa->spa_uberblock.ub_version = version;
5817 5866 vdev_config_dirty(spa->spa_root_vdev);
5818 5867
5819 5868 spa_config_exit(spa, SCL_ALL, FTAG);
5820 5869
5821 5870 txg_wait_synced(spa_get_dsl(spa), 0);
5822 5871 }
5823 5872
5824 5873 boolean_t
5825 5874 spa_has_spare(spa_t *spa, uint64_t guid)
5826 5875 {
5827 5876 int i;
5828 5877 uint64_t spareguid;
5829 5878 spa_aux_vdev_t *sav = &spa->spa_spares;
5830 5879
5831 5880 for (i = 0; i < sav->sav_count; i++)
5832 5881 if (sav->sav_vdevs[i]->vdev_guid == guid)
5833 5882 return (B_TRUE);
5834 5883
5835 5884 for (i = 0; i < sav->sav_npending; i++) {
5836 5885 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5837 5886 &spareguid) == 0 && spareguid == guid)
5838 5887 return (B_TRUE);
5839 5888 }
5840 5889
5841 5890 return (B_FALSE);
5842 5891 }
5843 5892
5844 5893 /*
5845 5894 * Check if a pool has an active shared spare device.
5846 5895 * Note: the reference count of an active spare is 2: once as a spare and once as a replacement
5847 5896 */
5848 5897 static boolean_t
5849 5898 spa_has_active_shared_spare(spa_t *spa)
5850 5899 {
5851 5900 int i, refcnt;
5852 5901 uint64_t pool;
5853 5902 spa_aux_vdev_t *sav = &spa->spa_spares;
5854 5903
5855 5904 for (i = 0; i < sav->sav_count; i++) {
5856 5905 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5857 5906 &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5858 5907 refcnt > 2)
5859 5908 return (B_TRUE);
5860 5909 }
5861 5910
5862 5911 return (B_FALSE);
5863 5912 }
5864 5913
5865 5914 /*
5866 5915 * Post a sysevent corresponding to the given event. The 'name' must be one of
5867 5916 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
5868 5917 * filled in from the spa and (optionally) the vdev. This doesn't do anything
5869 5918 * in the userland libzpool, as we don't want consumers to misinterpret ztest
5870 5919 * or zdb as real changes.
5871 5920 */
5872 5921 void
5873 5922 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5874 5923 {
5875 5924 #ifdef _KERNEL
5876 5925 sysevent_t *ev;
5877 5926 sysevent_attr_list_t *attr = NULL;
5878 5927 sysevent_value_t value;
5879 5928 sysevent_id_t eid;
5880 5929
5881 5930 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5882 5931 SE_SLEEP);
5883 5932
5884 5933 value.value_type = SE_DATA_TYPE_STRING;
5885 5934 value.value.sv_string = spa_name(spa);
5886 5935 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5887 5936 goto done;
5888 5937
5889 5938 value.value_type = SE_DATA_TYPE_UINT64;
5890 5939 value.value.sv_uint64 = spa_guid(spa);
5891 5940 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5892 5941 goto done;
5893 5942
5894 5943 if (vd) {
5895 5944 value.value_type = SE_DATA_TYPE_UINT64;
5896 5945 value.value.sv_uint64 = vd->vdev_guid;
5897 5946 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5898 5947 SE_SLEEP) != 0)
5899 5948 goto done;
5900 5949
5901 5950 if (vd->vdev_path) {
5902 5951 value.value_type = SE_DATA_TYPE_STRING;
5903 5952 value.value.sv_string = vd->vdev_path;
5904 5953 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5905 5954 &value, SE_SLEEP) != 0)
5906 5955 goto done;
5907 5956 }
5908 5957 }
5909 5958
5910 5959 if (sysevent_attach_attributes(ev, attr) != 0)
5911 5960 goto done;
5912 5961 attr = NULL;
5913 5962
5914 5963 (void) log_sysevent(ev, SE_SLEEP, &eid);
5915 5964
5916 5965 done:
5917 5966 if (attr)
5918 5967 sysevent_free_attr(attr);
5919 5968 sysevent_free(ev);
5920 5969 #endif
5921 5970 }
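/*
 * Hedged usage example: callers pass one of the EC_ZFS event names from
 * sys/sysevent/eventdefs.h, with or without a vdev, e.g.:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 */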