4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright 2017 Nexenta Systems, Inc.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Toomas Soome <tsoome@me.com>
28 * Copyright 2017 Joyent, Inc.
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/fm/fs/zfs.h>
33 #include <sys/spa.h>
34 #include <sys/spa_impl.h>
35 #include <sys/bpobj.h>
36 #include <sys/dmu.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/vdev_impl.h>
40 #include <sys/uberblock_impl.h>
41 #include <sys/metaslab.h>
42 #include <sys/metaslab_impl.h>
43 #include <sys/space_map.h>
44 #include <sys/space_reftree.h>
45 #include <sys/zio.h>
46 #include <sys/zap.h>
47 #include <sys/fs/zfs.h>
48 #include <sys/arc.h>
49 #include <sys/zil.h>
50 #include <sys/dsl_scan.h>
51 #include <sys/abd.h>
52
53 /*
54 * Virtual device management.
55 */
56
57 static vdev_ops_t *vdev_ops_table[] = {
58 &vdev_root_ops,
59 &vdev_raidz_ops,
60 &vdev_mirror_ops,
61 &vdev_replacing_ops,
62 &vdev_spare_ops,
63 &vdev_disk_ops,
64 &vdev_file_ops,
65 &vdev_missing_ops,
66 &vdev_hole_ops,
67 &vdev_indirect_ops,
68 NULL
69 };
70
71 /* maximum scrub/resilver I/O queue per leaf vdev */
72 int zfs_scrub_limit = 10;
73
74 /*
75 * When a vdev is added, it will be divided into approximately (but no
76 * more than) this number of metaslabs.
77 */
78 int metaslabs_per_vdev = 200;
79
80 boolean_t vdev_validate_skip = B_FALSE;
81
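/*
 * Log a debug message for a vdev, prefixed with its type and path when the
 * path is known, or with its id and guid otherwise.
 */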
82 /*PRINTFLIKE2*/
83 void
84 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
85 {
86 va_list adx;
87 char buf[256];
88
89 va_start(adx, fmt);
90 (void) vsnprintf(buf, sizeof (buf), fmt, adx);
91 va_end(adx);
92
93 if (vd->vdev_path != NULL) {
94 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
95 vd->vdev_path, buf);
96 } else {
97 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
98 vd->vdev_ops->vdev_op_type,
99 (u_longlong_t)vd->vdev_id,
100 (u_longlong_t)vd->vdev_guid, buf);
101 }
102 }
103
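/*
 * Recursively dump the vdev tree rooted at 'vd' to the debug log, indenting
 * each level of children by two spaces.
 */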
104 void
105 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
106 {
107 char state[20];
108
109 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
110 zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
111 vd->vdev_ops->vdev_op_type);
112 return;
113 }
114
115 switch (vd->vdev_state) {
116 case VDEV_STATE_UNKNOWN:
117 (void) snprintf(state, sizeof (state), "unknown");
118 break;
119 case VDEV_STATE_CLOSED:
120 (void) snprintf(state, sizeof (state), "closed");
121 break;
122 case VDEV_STATE_OFFLINE:
123 (void) snprintf(state, sizeof (state), "offline");
124 break;
125 case VDEV_STATE_REMOVED:
126 (void) snprintf(state, sizeof (state), "removed");
127 break;
128 case VDEV_STATE_CANT_OPEN:
129 (void) snprintf(state, sizeof (state), "can't open");
130 break;
131 case VDEV_STATE_FAULTED:
132 (void) snprintf(state, sizeof (state), "faulted");
133 break;
134 case VDEV_STATE_DEGRADED:
135 (void) snprintf(state, sizeof (state), "degraded");
136 break;
137 case VDEV_STATE_HEALTHY:
138 (void) snprintf(state, sizeof (state), "healthy");
139 break;
140 default:
141 (void) snprintf(state, sizeof (state), "<state %u>",
142 (uint_t)vd->vdev_state);
143 }
144
145 zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
146 "", vd->vdev_id, vd->vdev_ops->vdev_op_type,
147 vd->vdev_islog ? " (log)" : "",
148 (u_longlong_t)vd->vdev_guid,
149 vd->vdev_path ? vd->vdev_path : "N/A", state);
150
151 for (uint64_t i = 0; i < vd->vdev_children; i++)
152 vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
153 }
154
155 /*
156 * Given a vdev type, return the appropriate ops vector.
157 */
158 static vdev_ops_t *
159 vdev_getops(const char *type)
160 {
161 vdev_ops_t *ops, **opspp;
162
163 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
164 if (strcmp(ops->vdev_op_type, type) == 0)
165 break;
166
167 return (ops);
168 }
169
170 /*
171 * Default asize function: return the MAX of psize with the asize of
172 * all children. This is what's used by anything other than RAID-Z.
173 */
174 uint64_t
175 vdev_default_asize(vdev_t *vd, uint64_t psize)
176 {
177 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
178 uint64_t csize;
179
180 for (int c = 0; c < vd->vdev_children; c++) {
181 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
182 asize = MAX(asize, csize);
183 }
184
185 return (asize);
186 }
187
188 /*
189 * Get the minimum allocatable size. We define the allocatable size as
295 cvd->vdev_parent = pvd;
296
297 if (pvd == NULL)
298 return;
299
300 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
301
302 oldsize = pvd->vdev_children * sizeof (vdev_t *);
303 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
304 newsize = pvd->vdev_children * sizeof (vdev_t *);
305
306 newchild = kmem_zalloc(newsize, KM_SLEEP);
307 if (pvd->vdev_child != NULL) {
308 bcopy(pvd->vdev_child, newchild, oldsize);
309 kmem_free(pvd->vdev_child, oldsize);
310 }
311
312 pvd->vdev_child = newchild;
313 pvd->vdev_child[id] = cvd;
314
315 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
316 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
317
318 /*
319 * Walk up all ancestors to update guid sum.
320 */
321 for (; pvd != NULL; pvd = pvd->vdev_parent)
322 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
323 }
324
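/*
 * Unlink 'cvd' from its parent 'pvd', clearing its child slot and removing
 * its guid sum from all ancestors.
 */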
325 void
326 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
327 {
328 int c;
329 uint_t id = cvd->vdev_id;
330
331 ASSERT(cvd->vdev_parent == pvd);
332
333 if (pvd == NULL)
334 return;
376
377 for (int c = newc = 0; c < oldc; c++) {
378 if ((cvd = pvd->vdev_child[c]) != NULL) {
379 newchild[newc] = cvd;
380 cvd->vdev_id = newc++;
381 }
382 }
383
384 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
385 pvd->vdev_child = newchild;
386 pvd->vdev_children = newc;
387 }
388
389 /*
390 * Allocate and minimally initialize a vdev_t.
391 */
392 vdev_t *
393 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
394 {
395 vdev_t *vd;
396 vdev_indirect_config_t *vic;
397
398 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
399 vic = &vd->vdev_indirect_config;
400
401 if (spa->spa_root_vdev == NULL) {
402 ASSERT(ops == &vdev_root_ops);
403 spa->spa_root_vdev = vd;
404 spa->spa_load_guid = spa_generate_guid(NULL);
405 }
406
407 if (guid == 0 && ops != &vdev_hole_ops) {
408 if (spa->spa_root_vdev == vd) {
409 /*
410 * The root vdev's guid will also be the pool guid,
411 * which must be unique among all pools.
412 */
413 guid = spa_generate_guid(NULL);
414 } else {
415 /*
416 * Any other vdev's guid must be unique within the pool.
417 */
418 guid = spa_generate_guid(spa);
419 }
420 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
421 }
422
423 vd->vdev_spa = spa;
424 vd->vdev_id = id;
425 vd->vdev_guid = guid;
426 vd->vdev_guid_sum = guid;
427 vd->vdev_ops = ops;
428 vd->vdev_state = VDEV_STATE_CLOSED;
429 vd->vdev_ishole = (ops == &vdev_hole_ops);
430 vic->vic_prev_indirect_vdev = UINT64_MAX;
431
432 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
433 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
434 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
435
436 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
437 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
438 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
439 mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
440 for (int t = 0; t < DTL_TYPES; t++) {
441 vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
442 }
443 txg_list_create(&vd->vdev_ms_list, spa,
444 offsetof(struct metaslab, ms_txg_node));
445 txg_list_create(&vd->vdev_dtl_list, spa,
446 offsetof(struct vdev, vdev_dtl_node));
447 vd->vdev_stat.vs_timestamp = gethrtime();
448 vdev_queue_init(vd);
449 vdev_cache_init(vd);
450
451 return (vd);
452 }
453
454 /*
455 * Allocate a new vdev. The 'alloctype' is used to control whether we are
456 * creating a new vdev or loading an existing one - the behavior is slightly
457 * different for each case.
458 */
459 int
460 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
461 int alloctype)
462 {
463 vdev_ops_t *ops;
464 char *type;
465 uint64_t guid = 0, islog, nparity;
466 vdev_t *vd;
467 vdev_indirect_config_t *vic;
468
469 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
470
471 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
472 return (SET_ERROR(EINVAL));
473
474 if ((ops = vdev_getops(type)) == NULL)
475 return (SET_ERROR(EINVAL));
476
477 /*
478 * If this is a load, get the vdev guid from the nvlist.
479 * Otherwise, vdev_alloc_common() will generate one for us.
480 */
481 if (alloctype == VDEV_ALLOC_LOAD) {
482 uint64_t label_id;
483
484 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
485 label_id != id)
486 return (SET_ERROR(EINVAL));
487
490 } else if (alloctype == VDEV_ALLOC_SPARE) {
491 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
492 return (SET_ERROR(EINVAL));
493 } else if (alloctype == VDEV_ALLOC_L2CACHE) {
494 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
495 return (SET_ERROR(EINVAL));
496 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
497 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
498 return (SET_ERROR(EINVAL));
499 }
500
501 /*
502 * The first allocated vdev must be of type 'root'.
503 */
504 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
505 return (SET_ERROR(EINVAL));
506
507 /*
508 * Determine whether we're a log vdev.
509 */
510 islog = 0;
511 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
512 if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
513 return (SET_ERROR(ENOTSUP));
514
515 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
516 return (SET_ERROR(ENOTSUP));
517
518 /*
519 * Set the nparity property for RAID-Z vdevs.
520 */
521 nparity = -1ULL;
522 if (ops == &vdev_raidz_ops) {
523 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
524 &nparity) == 0) {
525 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
526 return (SET_ERROR(EINVAL));
527 /*
528 * Previous versions could only support 1 or 2 parity
529 * devices.
530 */
531 if (nparity > 1 &&
532 spa_version(spa) < SPA_VERSION_RAIDZ2)
533 return (SET_ERROR(ENOTSUP));
534 if (nparity > 2 &&
535 spa_version(spa) < SPA_VERSION_RAIDZ3)
536 return (SET_ERROR(ENOTSUP));
537 } else {
538 /*
539 * We require the parity to be specified for SPAs that
540 * support multiple parity levels.
541 */
542 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
543 return (SET_ERROR(EINVAL));
544 /*
545 * Otherwise, we default to 1 parity device for RAID-Z.
546 */
547 nparity = 1;
548 }
549 } else {
550 nparity = 0;
551 }
552 ASSERT(nparity != -1ULL);
553
554 vd = vdev_alloc_common(spa, id, guid, ops);
555 vic = &vd->vdev_indirect_config;
556
557 vd->vdev_islog = islog;
558 vd->vdev_nparity = nparity;
559
560 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
561 vd->vdev_path = spa_strdup(vd->vdev_path);
562 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
563 vd->vdev_devid = spa_strdup(vd->vdev_devid);
564 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
565 &vd->vdev_physpath) == 0)
566 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
567 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
568 vd->vdev_fru = spa_strdup(vd->vdev_fru);
569
570 /*
571 * Set the whole_disk property. If it's not specified, leave the value
572 * as -1.
573 */
574 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
575 &vd->vdev_wholedisk) != 0)
576 vd->vdev_wholedisk = -1ULL;
577
578 ASSERT0(vic->vic_mapping_object);
579 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
580 &vic->vic_mapping_object);
581 ASSERT0(vic->vic_births_object);
582 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
583 &vic->vic_births_object);
584 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
585 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
586 &vic->vic_prev_indirect_vdev);
587
588 /*
589 * Look for the 'not present' flag. This will only be set if the device
590 * was not present at the time of import.
591 */
592 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
593 &vd->vdev_not_present);
594
595 /*
596 * Get the alignment requirement.
597 */
598 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
599
600 /*
601 * Retrieve the vdev creation time.
602 */
603 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
604 &vd->vdev_crtxg);
605
606 /*
607 * If we're a top-level vdev, try to load the allocation parameters.
608 */
609 if (parent && !parent->vdev_parent &&
610 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
611 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
612 &vd->vdev_ms_array);
613 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
614 &vd->vdev_ms_shift);
615 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
616 &vd->vdev_asize);
617 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
618 &vd->vdev_removing);
619 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
620 &vd->vdev_top_zap);
621 } else {
622 ASSERT0(vd->vdev_top_zap);
623 }
624
625 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
626 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
627 alloctype == VDEV_ALLOC_ADD ||
628 alloctype == VDEV_ALLOC_SPLIT ||
629 alloctype == VDEV_ALLOC_ROOTPOOL);
630 vd->vdev_mg = metaslab_group_create(islog ?
631 spa_log_class(spa) : spa_normal_class(spa), vd);
632 }
633
634 if (vd->vdev_ops->vdev_op_leaf &&
635 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
636 (void) nvlist_lookup_uint64(nv,
637 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
638 } else {
639 ASSERT0(vd->vdev_leaf_zap);
640 }
641
642 /*
643 * If we're a leaf vdev, try to load the DTL object and other state.
644 */
645
646 if (vd->vdev_ops->vdev_op_leaf &&
647 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
648 alloctype == VDEV_ALLOC_ROOTPOOL)) {
649 if (alloctype == VDEV_ALLOC_LOAD) {
650 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
651 &vd->vdev_dtl_object);
693 }
694 }
695 }
696
697 /*
698 * Add ourselves to the parent's list of children.
699 */
700 vdev_add_child(parent, vd);
701
702 *vdp = vd;
703
704 return (0);
705 }
706
707 void
708 vdev_free(vdev_t *vd)
709 {
710 spa_t *spa = vd->vdev_spa;
711
712 /*
713 * vdev_free() implies closing the vdev first. This is simpler than
714 * trying to ensure complicated semantics for all callers.
715 */
716 vdev_close(vd);
717
718 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
719 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
720
721 /*
722 * Free all children.
723 */
724 for (int c = 0; c < vd->vdev_children; c++)
725 vdev_free(vd->vdev_child[c]);
726
727 ASSERT(vd->vdev_child == NULL);
728 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
729
730 /*
731 * Discard allocation state.
732 */
760 spa_strfree(vd->vdev_physpath);
761 if (vd->vdev_fru)
762 spa_strfree(vd->vdev_fru);
763
764 if (vd->vdev_isspare)
765 spa_spare_remove(vd);
766 if (vd->vdev_isl2cache)
767 spa_l2cache_remove(vd);
768
769 txg_list_destroy(&vd->vdev_ms_list);
770 txg_list_destroy(&vd->vdev_dtl_list);
771
772 mutex_enter(&vd->vdev_dtl_lock);
773 space_map_close(vd->vdev_dtl_sm);
774 for (int t = 0; t < DTL_TYPES; t++) {
775 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
776 range_tree_destroy(vd->vdev_dtl[t]);
777 }
778 mutex_exit(&vd->vdev_dtl_lock);
779
780 EQUIV(vd->vdev_indirect_births != NULL,
781 vd->vdev_indirect_mapping != NULL);
782 if (vd->vdev_indirect_births != NULL) {
783 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
784 vdev_indirect_births_close(vd->vdev_indirect_births);
785 }
786
787 if (vd->vdev_obsolete_sm != NULL) {
788 ASSERT(vd->vdev_removing ||
789 vd->vdev_ops == &vdev_indirect_ops);
790 space_map_close(vd->vdev_obsolete_sm);
791 vd->vdev_obsolete_sm = NULL;
792 }
793 range_tree_destroy(vd->vdev_obsolete_segments);
794 rw_destroy(&vd->vdev_indirect_rwlock);
795 mutex_destroy(&vd->vdev_obsolete_lock);
796
797 mutex_destroy(&vd->vdev_queue_lock);
798 mutex_destroy(&vd->vdev_dtl_lock);
799 mutex_destroy(&vd->vdev_stat_lock);
800 mutex_destroy(&vd->vdev_probe_lock);
801
802 if (vd == spa->spa_root_vdev)
803 spa->spa_root_vdev = NULL;
804
805 kmem_free(vd, sizeof (vdev_t));
806 }
807
808 /*
809 * Transfer top-level vdev state from svd to tvd.
810 */
811 static void
812 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
813 {
814 spa_t *spa = svd->vdev_spa;
815 metaslab_t *msp;
816 vdev_t *vd;
817 int t;
818
819 ASSERT(tvd == tvd->vdev_top);
820
821 tvd->vdev_ms_array = svd->vdev_ms_array;
822 tvd->vdev_ms_shift = svd->vdev_ms_shift;
823 tvd->vdev_ms_count = svd->vdev_ms_count;
824 tvd->vdev_top_zap = svd->vdev_top_zap;
854 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
855 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
856 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
857 }
858
859 if (list_link_active(&svd->vdev_config_dirty_node)) {
860 vdev_config_clean(svd);
861 vdev_config_dirty(tvd);
862 }
863
864 if (list_link_active(&svd->vdev_state_dirty_node)) {
865 vdev_state_clean(svd);
866 vdev_state_dirty(tvd);
867 }
868
869 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
870 svd->vdev_deflate_ratio = 0;
871
872 tvd->vdev_islog = svd->vdev_islog;
873 svd->vdev_islog = 0;
874 }
875
876 static void
877 vdev_top_update(vdev_t *tvd, vdev_t *vd)
878 {
879 if (vd == NULL)
880 return;
881
882 vd->vdev_top = tvd;
883
884 for (int c = 0; c < vd->vdev_children; c++)
885 vdev_top_update(tvd, vd->vdev_child[c]);
886 }
887
888 /*
889 * Add a mirror/replacing vdev above an existing vdev.
890 */
891 vdev_t *
892 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
893 {
894 spa_t *spa = cvd->vdev_spa;
895 vdev_t *pvd = cvd->vdev_parent;
896 vdev_t *mvd;
897
898 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
899
900 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
901
902 mvd->vdev_asize = cvd->vdev_asize;
903 mvd->vdev_min_asize = cvd->vdev_min_asize;
904 mvd->vdev_max_asize = cvd->vdev_max_asize;
905 mvd->vdev_psize = cvd->vdev_psize;
906 mvd->vdev_ashift = cvd->vdev_ashift;
907 mvd->vdev_state = cvd->vdev_state;
908 mvd->vdev_crtxg = cvd->vdev_crtxg;
909
910 vdev_remove_child(pvd, cvd);
911 vdev_add_child(pvd, mvd);
912 cvd->vdev_id = mvd->vdev_children;
913 vdev_add_child(mvd, cvd);
914 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
915
916 if (mvd == mvd->vdev_top)
917 vdev_top_transfer(cvd, mvd);
918
919 return (mvd);
920 }
921
922 /*
923 * Remove a 1-way mirror/replacing vdev from the tree.
924 */
925 void
966 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
967 {
968 spa_t *spa = vd->vdev_spa;
969 objset_t *mos = spa->spa_meta_objset;
970 uint64_t m;
971 uint64_t oldc = vd->vdev_ms_count;
972 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
973 metaslab_t **mspp;
974 int error;
975
976 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
977
978 /*
979 * This vdev is not being allocated from yet or is a hole.
980 */
981 if (vd->vdev_ms_shift == 0)
982 return (0);
983
984 ASSERT(!vd->vdev_ishole);
985
986 ASSERT(oldc <= newc);
987
988 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
989
990 if (oldc != 0) {
991 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
992 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
993 }
994
995 vd->vdev_ms = mspp;
996 vd->vdev_ms_count = newc;
997
998 for (m = oldc; m < newc; m++) {
999 uint64_t object = 0;
1000
1001 /*
1002 * vdev_ms_array may be 0 if we are creating the "fake"
1003 * metaslabs for an indirect vdev for zdb's leak detection.
1004 * See zdb_leak_init().
1005 */
1006 if (txg == 0 && vd->vdev_ms_array != 0) {
1007 error = dmu_read(mos, vd->vdev_ms_array,
1008 m * sizeof (uint64_t), sizeof (uint64_t), &object,
1009 DMU_READ_PREFETCH);
1010 if (error != 0) {
1011 vdev_dbgmsg(vd, "unable to read the metaslab "
1012 "array [error=%d]", error);
1013 return (error);
1014 }
1015 }
1016
1017 error = metaslab_init(vd->vdev_mg, m, object, txg,
1018 &(vd->vdev_ms[m]));
1019 if (error != 0) {
1020 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
1021 error);
1022 return (error);
1023 }
1024 }
1025
1026 if (txg == 0)
1027 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
1028
1029 /*
1030 * If the vdev is being removed we don't activate
1031 * the metaslabs since we want to ensure that no new
1032 * allocations are performed on this device.
1033 */
1034 if (oldc == 0 && !vd->vdev_removing)
1035 metaslab_group_activate(vd->vdev_mg);
1036
1037 if (txg == 0)
1038 spa_config_exit(spa, SCL_ALLOC, FTAG);
1039
1040 return (0);
1041 }
1042
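/*
 * Release a vdev's metaslab state: passivate the metaslab group, finalize
 * each metaslab, and free the vdev_ms array.
 */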
1043 void
1044 vdev_metaslab_fini(vdev_t *vd)
1045 {
1046 if (vd->vdev_ms != NULL) {
1047 uint64_t count = vd->vdev_ms_count;
1048
1049 metaslab_group_passivate(vd->vdev_mg);
1050 for (uint64_t m = 0; m < count; m++) {
1051 metaslab_t *msp = vd->vdev_ms[m];
1052
1053 if (msp != NULL)
1054 metaslab_fini(msp);
1055 }
1056 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1057 vd->vdev_ms = NULL;
1058
1059 vd->vdev_ms_count = 0;
1060 }
1061 ASSERT0(vd->vdev_ms_count);
1062 }
1063
1064 typedef struct vdev_probe_stats {
1065 boolean_t vps_readable;
1066 boolean_t vps_writeable;
1067 int vps_flags;
1068 } vdev_probe_stats_t;
1069
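/*
 * Completion callback for the probe I/Os issued against a vdev. Records
 * whether the device proved readable and writeable; if the probe fails,
 * posts an ereport and fails the waiting parent zios with ENXIO.
 */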
1070 static void
1071 vdev_probe_done(zio_t *zio)
1072 {
1073 spa_t *spa = zio->io_spa;
1074 vdev_t *vd = zio->io_vd;
1075 vdev_probe_stats_t *vps = zio->io_private;
1076
1077 ASSERT(vd->vdev_probe_zio != NULL);
1078
1079 if (zio->io_type == ZIO_TYPE_READ) {
1080 if (zio->io_error == 0)
1081 vps->vps_readable = 1;
1085 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1086 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1087 } else {
1088 abd_free(zio->io_abd);
1089 }
1090 } else if (zio->io_type == ZIO_TYPE_WRITE) {
1091 if (zio->io_error == 0)
1092 vps->vps_writeable = 1;
1093 abd_free(zio->io_abd);
1094 } else if (zio->io_type == ZIO_TYPE_NULL) {
1095 zio_t *pio;
1096
1097 vd->vdev_cant_read |= !vps->vps_readable;
1098 vd->vdev_cant_write |= !vps->vps_writeable;
1099
1100 if (vdev_readable(vd) &&
1101 (vdev_writeable(vd) || !spa_writeable(spa))) {
1102 zio->io_error = 0;
1103 } else {
1104 ASSERT(zio->io_error != 0);
1105 vdev_dbgmsg(vd, "failed probe");
1106 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1107 spa, vd, NULL, 0, 0);
1108 zio->io_error = SET_ERROR(ENXIO);
1109 }
1110
1111 mutex_enter(&vd->vdev_probe_lock);
1112 ASSERT(vd->vdev_probe_zio == zio);
1113 vd->vdev_probe_zio = NULL;
1114 mutex_exit(&vd->vdev_probe_lock);
1115
1116 zio_link_t *zl = NULL;
1117 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1118 if (!vdev_accessible(vd, pio))
1119 pio->io_error = SET_ERROR(ENXIO);
1120
1121 kmem_free(vps, sizeof (*vps));
1122 }
1123 }
1124
1125 /*
1253 * in a single thread so that the same thread holds the
1254 * spa_namespace_lock
1255 */
1256 if (vdev_uses_zvols(vd)) {
1257 for (int c = 0; c < children; c++)
1258 vd->vdev_child[c]->vdev_open_error =
1259 vdev_open(vd->vdev_child[c]);
1260 return;
1261 }
1262 tq = taskq_create("vdev_open", children, minclsyspri,
1263 children, children, TASKQ_PREPOPULATE);
1264
1265 for (int c = 0; c < children; c++)
1266 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1267 TQ_SLEEP) != NULL);
1268
1269 taskq_destroy(tq);
1270 }
1271
1272 /*
1273 * Compute the raidz-deflation ratio. Note, we hard-code
1274 * in 128k (1 << 17) because it is the "typical" blocksize.
1275 * Even though SPA_MAXBLOCKSIZE has changed, this algorithm must not change;
1276 * otherwise it would inconsistently account for existing bp's.
1277 */
1278 static void
1279 vdev_set_deflate_ratio(vdev_t *vd)
1280 {
1281 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
1282 vd->vdev_deflate_ratio = (1 << 17) /
1283 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
1284 }
1285 }
1286
1287 /*
1288 * Prepare a virtual device for access.
1289 */
1290 int
1291 vdev_open(vdev_t *vd)
1292 {
1293 spa_t *spa = vd->vdev_spa;
1294 int error;
1295 uint64_t osize = 0;
1296 uint64_t max_osize = 0;
1297 uint64_t asize, max_asize, psize;
1298 uint64_t ashift = 0;
1299
1300 ASSERT(vd->vdev_open_thread == curthread ||
1301 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1302 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1303 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1304 vd->vdev_state == VDEV_STATE_OFFLINE);
1305
1306 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1307 vd->vdev_cant_read = B_FALSE;
1308 vd->vdev_cant_write = B_FALSE;
1309 vd->vdev_min_asize = vdev_get_min_asize(vd);
1310
1311 /*
1312 * If this vdev is not removed, check its fault status. If it's
1313 * faulted, bail out of the open.
1314 */
1315 if (!vd->vdev_removed && vd->vdev_faulted) {
1316 ASSERT(vd->vdev_children == 0);
1317 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1318 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1319 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1320 vd->vdev_label_aux);
1321 return (SET_ERROR(ENXIO));
1322 } else if (vd->vdev_offline) {
1323 ASSERT(vd->vdev_children == 0);
1324 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1325 return (SET_ERROR(ENXIO));
1326 }
1327
1328 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1329
1330 /*
1331 * Reset the vdev_reopening flag so that we actually close
1332 * the vdev on error.
1333 */
1334 vd->vdev_reopening = B_FALSE;
1335 if (zio_injection_enabled && error == 0)
1336 error = zio_handle_device_injection(vd, NULL, ENXIO);
1337
1338 if (error) {
1339 if (vd->vdev_removed &&
1340 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1341 vd->vdev_removed = B_FALSE;
1342
1343 if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
1344 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
1345 vd->vdev_stat.vs_aux);
1346 } else {
1347 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1348 vd->vdev_stat.vs_aux);
1349 }
1350 return (error);
1351 }
1352
1353 vd->vdev_removed = B_FALSE;
1354
1355 /*
1356 * Recheck the faulted flag now that we have confirmed that
1357 * the vdev is accessible. If we're faulted, bail.
1358 */
1359 if (vd->vdev_faulted) {
1360 ASSERT(vd->vdev_children == 0);
1361 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1362 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1363 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1364 vd->vdev_label_aux);
1365 return (SET_ERROR(ENXIO));
1366 }
1367
1368 if (vd->vdev_degraded) {
1369 ASSERT(vd->vdev_children == 0);
1489 spa->spa_min_ashift = vd->vdev_ashift;
1490 }
1491
1492 /*
1493 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1494 * resilver. But don't do this if we are doing a reopen for a scrub,
1495 * since this would just restart the scrub we are already doing.
1496 */
1497 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1498 vdev_resilver_needed(vd, NULL, NULL))
1499 spa_async_request(spa, SPA_ASYNC_RESILVER);
1500
1501 return (0);
1502 }
1503
1504 /*
1505 * Called once the vdevs are all opened, this routine validates the label
1506 * contents. This needs to be done before vdev_load() so that we don't
1507 * inadvertently do repair I/Os to the wrong device.
1508 *
1509 * This function will only return failure if one of the vdevs indicates that it
1510 * has since been destroyed or exported. This is only possible if
1511 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
1512 * will be updated but the function will return 0.
1513 */
1514 int
1515 vdev_validate(vdev_t *vd)
1516 {
1517 spa_t *spa = vd->vdev_spa;
1518 nvlist_t *label;
1519 uint64_t guid = 0, aux_guid = 0, top_guid;
1520 uint64_t state;
1521 nvlist_t *nvl;
1522 uint64_t txg;
1523
1524 if (vdev_validate_skip)
1525 return (0);
1526
1527 for (uint64_t c = 0; c < vd->vdev_children; c++)
1528 if (vdev_validate(vd->vdev_child[c]) != 0)
1529 return (SET_ERROR(EBADF));
1530
1531 /*
1532 * If the device has already failed, or was marked offline, don't do
1533 * any further validation. Otherwise, label I/O will fail and we will
1534 * overwrite the previous state.
1535 */
1536 if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
1537 return (0);
1538
1539 /*
1540 * If we are performing an extreme rewind, we allow for a label that
1541 * was modified at a point after the current txg.
1542 */
1543 if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0)
1544 txg = UINT64_MAX;
1545 else
1546 txg = spa_last_synced_txg(spa);
1547
1548 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
1549 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1550 VDEV_AUX_BAD_LABEL);
1551 vdev_dbgmsg(vd, "vdev_validate: failed reading config");
1552 return (0);
1553 }
1554
1555 /*
1556 * Determine if this vdev has been split off into another
1557 * pool. If so, then refuse to open it.
1558 */
1559 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1560 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1561 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1562 VDEV_AUX_SPLIT_POOL);
1563 nvlist_free(label);
1564 vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
1565 return (0);
1566 }
1567
1568 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
1569 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1570 VDEV_AUX_CORRUPT_DATA);
1571 nvlist_free(label);
1572 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
1573 ZPOOL_CONFIG_POOL_GUID);
1574 return (0);
1575 }
1576
1577 /*
1578 * If config is not trusted then ignore the spa guid check. This is
1579 * necessary because if the machine crashed during a re-guid the new
1580 * guid might have been written to all of the vdev labels, but not the
1581 * cached config. The check will be performed again once we have the
1582 * trusted config from the MOS.
1583 */
1584 if (spa->spa_trust_config && guid != spa_guid(spa)) {
1585 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1586 VDEV_AUX_CORRUPT_DATA);
1587 nvlist_free(label);
1588 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
1589 "match config (%llu != %llu)", (u_longlong_t)guid,
1590 (u_longlong_t)spa_guid(spa));
1591 return (0);
1592 }
1593
1594 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1595 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1596 &aux_guid) != 0)
1597 aux_guid = 0;
1598
1599 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
1600 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1601 VDEV_AUX_CORRUPT_DATA);
1602 nvlist_free(label);
1603 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
1604 ZPOOL_CONFIG_GUID);
1605 return (0);
1606 }
1607
1608 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
1609 != 0) {
1610 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1611 VDEV_AUX_CORRUPT_DATA);
1612 nvlist_free(label);
1613 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
1614 ZPOOL_CONFIG_TOP_GUID);
1615 return (0);
1616 }
1617
1618 /*
1619 * If this vdev just became a top-level vdev because its sibling was
1620 * detached, it will have adopted the parent's vdev guid -- but the
1621 * label may or may not be on disk yet. Fortunately, either version
1622 * of the label will have the same top guid, so if we're a top-level
1623 * vdev, we can safely compare to that instead.
1624 * However, if the config comes from a cachefile that failed to update
1625 * after the detach, a top-level vdev will appear as a non top-level
1626 * vdev in the config. Also relax the constraints if we perform an
1627 * extreme rewind.
1628 *
1629 * If we split this vdev off instead, then we also check the
1630 * original pool's guid. We don't want to consider the vdev
1631 * corrupt if it is partway through a split operation.
1632 */
1633 if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
1634 boolean_t mismatch = B_FALSE;
1635 if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
1636 if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
1637 mismatch = B_TRUE;
1638 } else {
1639 if (vd->vdev_guid != top_guid &&
1640 vd->vdev_top->vdev_guid != guid)
1641 mismatch = B_TRUE;
1642 }
1643
1644 if (mismatch) {
1645 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1646 VDEV_AUX_CORRUPT_DATA);
1647 nvlist_free(label);
1648 vdev_dbgmsg(vd, "vdev_validate: config guid "
1649 "doesn't match label guid");
1650 vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
1651 (u_longlong_t)vd->vdev_guid,
1652 (u_longlong_t)vd->vdev_top->vdev_guid);
1653 vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
1654 "aux_guid %llu", (u_longlong_t)guid,
1655 (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
1656 return (0);
1657 }
1658 }
1659
1660 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1661 &state) != 0) {
1662 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1663 VDEV_AUX_CORRUPT_DATA);
1664 nvlist_free(label);
1665 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
1666 ZPOOL_CONFIG_POOL_STATE);
1667 return (0);
1668 }
1669
1670 nvlist_free(label);
1671
1672 /*
1673 * If this is a verbatim import, no need to check the
1674 * state of the pool.
1675 */
1676 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1677 spa_load_state(spa) == SPA_LOAD_OPEN &&
1678 state != POOL_STATE_ACTIVE) {
1679 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
1680 "for spa %s", (u_longlong_t)state, spa->spa_name);
1681 return (SET_ERROR(EBADF));
1682 }
1683
1684 /*
1685 * If we were able to open and validate a vdev that was
1686 * previously marked permanently unavailable, clear that state
1687 * now.
1688 */
1689 if (vd->vdev_not_present)
1690 vd->vdev_not_present = 0;
1691
1692 return (0);
1693 }
1694
1695 static void
1696 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
1697 {
1698 if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
1699 if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
1700 zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
1701 "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
1702 dvd->vdev_path, svd->vdev_path);
1703 spa_strfree(dvd->vdev_path);
1704 dvd->vdev_path = spa_strdup(svd->vdev_path);
1705 }
1706 } else if (svd->vdev_path != NULL) {
1707 dvd->vdev_path = spa_strdup(svd->vdev_path);
1708 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
1709 (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
1710 }
1711 }
1712
1713 /*
1714 * Recursively copy vdev paths from one vdev to another. Source and destination
1715 * vdev trees must have same geometry otherwise return error. Intended to copy
1716 * paths from userland config into MOS config.
1717 */
1718 int
1719 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
1720 {
1721 if ((svd->vdev_ops == &vdev_missing_ops) ||
1722 (svd->vdev_ishole && dvd->vdev_ishole) ||
1723 (dvd->vdev_ops == &vdev_indirect_ops))
1724 return (0);
1725
1726 if (svd->vdev_ops != dvd->vdev_ops) {
1727 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
1728 svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
1729 return (SET_ERROR(EINVAL));
1730 }
1731
1732 if (svd->vdev_guid != dvd->vdev_guid) {
1733 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
1734 "%llu)", (u_longlong_t)svd->vdev_guid,
1735 (u_longlong_t)dvd->vdev_guid);
1736 return (SET_ERROR(EINVAL));
1737 }
1738
1739 if (svd->vdev_children != dvd->vdev_children) {
1740 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
1741 "%llu != %llu", (u_longlong_t)svd->vdev_children,
1742 (u_longlong_t)dvd->vdev_children);
1743 return (SET_ERROR(EINVAL));
1744 }
1745
1746 for (uint64_t i = 0; i < svd->vdev_children; i++) {
1747 int error = vdev_copy_path_strict(svd->vdev_child[i],
1748 dvd->vdev_child[i]);
1749 if (error != 0)
1750 return (error);
1751 }
1752
1753 if (svd->vdev_ops->vdev_op_leaf)
1754 vdev_copy_path_impl(svd, dvd);
1755
1756 return (0);
1757 }
1758
1759 static void
1760 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
1761 {
1762 ASSERT(stvd->vdev_top == stvd);
1763 ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
1764
1765 for (uint64_t i = 0; i < dvd->vdev_children; i++) {
1766 vdev_copy_path_search(stvd, dvd->vdev_child[i]);
1767 }
1768
1769 if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
1770 return;
1771
1772 /*
1773 * The idea here is that while a vdev can shift positions within
1774 * a top vdev (when replacing, attaching mirror, etc.) it cannot
1775 * step outside of it.
1776 */
1777 vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
1778
1779 if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
1780 return;
1781
1782 ASSERT(vd->vdev_ops->vdev_op_leaf);
1783
1784 vdev_copy_path_impl(vd, dvd);
1785 }
1786
1787 /*
1788 * Recursively copy vdev paths from one root vdev to another. Source and
1789 * destination vdev trees may differ in geometry. For each destination leaf
1790 * vdev, search a vdev with the same guid and top vdev id in the source.
1791 * Intended to copy paths from userland config into MOS config.
1792 */
1793 void
1794 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
1795 {
1796 uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
1797 ASSERT(srvd->vdev_ops == &vdev_root_ops);
1798 ASSERT(drvd->vdev_ops == &vdev_root_ops);
1799
1800 for (uint64_t i = 0; i < children; i++) {
1801 vdev_copy_path_search(srvd->vdev_child[i],
1802 drvd->vdev_child[i]);
1803 }
1804 }
1805
1806 /*
1807 * Close a virtual device.
1808 */
1809 void
1810 vdev_close(vdev_t *vd)
1811 {
1812 spa_t *spa = vd->vdev_spa;
1813 vdev_t *pvd = vd->vdev_parent;
1814
1815 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1816
1817 /*
1818 * If our parent is reopening, then we are as well, unless we are
1819 * going offline.
1820 */
1821 if (pvd != NULL && pvd->vdev_reopening)
1822 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1823
1824 vd->vdev_ops->vdev_op_close(vd);
1825
1826 vdev_cache_purge(vd);
1878 vdev_reopen(vdev_t *vd)
1879 {
1880 spa_t *spa = vd->vdev_spa;
1881
1882 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1883
1884 /* set the reopening flag unless we're taking the vdev offline */
1885 vd->vdev_reopening = !vd->vdev_offline;
1886 vdev_close(vd);
1887 (void) vdev_open(vd);
1888
1889 /*
1890 * Call vdev_validate() here to make sure we have the same device.
1891 * Otherwise, a device with an invalid label could be successfully
1892 * opened in response to vdev_reopen().
1893 */
1894 if (vd->vdev_aux) {
1895 (void) vdev_validate_aux(vd);
1896 if (vdev_readable(vd) && vdev_writeable(vd) &&
1897 vd->vdev_aux == &spa->spa_l2cache &&
1898 !l2arc_vdev_present(vd))
1899 l2arc_add_vdev(spa, vd);
1900 } else {
1901 (void) vdev_validate(vd);
1902 }
1903
1904 /*
1905 * Reassess parent vdev's health.
1906 */
1907 vdev_propagate_state(vd);
1908 }
1909
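/*
 * Open a vdev that is being created. Unlike a normal open, the request
 * fails if any of its components cannot be opened.
 */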
1910 int
1911 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1912 {
1913 int error;
1914
1915 /*
1916 * Normally, partial opens (e.g. of a mirror) are allowed.
1917 * For a create, however, we want to fail the request if
1918 * there are any components we can't open.
1919 */
1920 error = vdev_open(vd);
1921
1934 return (error);
1935 }
1936
1937 return (0);
1938 }
1939
1940 void
1941 vdev_metaslab_set_size(vdev_t *vd)
1942 {
1943 /*
1944 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
1945 */
1946 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
1947 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1948 }
1949
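/*
 * Mark a top-level vdev's metaslab (VDD_METASLAB) or DTL (VDD_DTL) state
 * dirty in the given txg so that it is visited by vdev_sync().
 */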
1950 void
1951 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1952 {
1953 ASSERT(vd == vd->vdev_top);
1954 /* indirect vdevs don't have metaslabs or dtls */
1955 ASSERT(vdev_is_concrete(vd) || flags == 0);
1956 ASSERT(ISP2(flags));
1957 ASSERT(spa_writeable(vd->vdev_spa));
1958
1959 if (flags & VDD_METASLAB)
1960 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1961
1962 if (flags & VDD_DTL)
1963 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1964
1965 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1966 }
1967
1968 void
1969 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1970 {
1971 for (int c = 0; c < vd->vdev_children; c++)
1972 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1973
1974 if (vd->vdev_ops->vdev_op_leaf)
1975 vdev_dirty(vd->vdev_top, flags, vd, txg);
2005 * comprising only those txgs which appear in more than 'maxfaults' children;
2006 * those are the txgs we don't have enough replication to read. For example,
2007 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
2008 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
2009 * two child DTL_MISSING maps.
2010 *
2011 * It should be clear from the above that to compute the DTLs and outage maps
2012 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
2013 * Therefore, that is all we keep on disk. When loading the pool, or after
2014 * a configuration change, we generate all other DTLs from first principles.
2015 */
2016 void
2017 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2018 {
2019 range_tree_t *rt = vd->vdev_dtl[t];
2020
2021 ASSERT(t < DTL_TYPES);
2022 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2023 ASSERT(spa_writeable(vd->vdev_spa));
2024
2025 mutex_enter(&vd->vdev_dtl_lock);
2026 if (!range_tree_contains(rt, txg, size))
2027 range_tree_add(rt, txg, size);
2028 mutex_exit(&vd->vdev_dtl_lock);
2029 }
2030
2031 boolean_t
2032 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
2033 {
2034 range_tree_t *rt = vd->vdev_dtl[t];
2035 boolean_t dirty = B_FALSE;
2036
2037 ASSERT(t < DTL_TYPES);
2038 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
2039
2040 /*
2041 * While we are loading the pool, the DTLs have not been loaded yet.
2042 * Ignore the DTLs and try all devices. This avoids a recursive
2043 * mutex enter on the vdev_dtl_lock, and also makes us try hard
2044 * when loading the pool (relying on the checksum to ensure that
2045 * we get the right data -- note that while loading, we are
2046 * only reading the MOS, which is always checksummed).
2047 */
2048 if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
2049 return (B_FALSE);
2050
2051 mutex_enter(&vd->vdev_dtl_lock);
2052 if (range_tree_space(rt) != 0)
2053 dirty = range_tree_contains(rt, txg, size);
2054 mutex_exit(&vd->vdev_dtl_lock);
2055
2056 return (dirty);
2057 }
2058
2059 boolean_t
2060 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
2061 {
2062 range_tree_t *rt = vd->vdev_dtl[t];
2063 boolean_t empty;
2064
2065 mutex_enter(&vd->vdev_dtl_lock);
2066 empty = (range_tree_space(rt) == 0);
2067 mutex_exit(&vd->vdev_dtl_lock);
2068
2069 return (empty);
2070 }
2071
2072 /*
2073 * Returns the lowest txg in the DTL range.
2074 */
2075 static uint64_t
2076 vdev_dtl_min(vdev_t *vd)
2077 {
2078 range_seg_t *rs;
2079
2080 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
2081 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
2082 ASSERT0(vd->vdev_children);
2083
2084 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
2085 return (rs->rs_start - 1);
2086 }
2087
2140 }
2141 return (B_FALSE);
2142 }
2143
2144 /*
2145 * Reassess DTLs after a config change or scrub completion.
2146 */
2147 void
2148 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
2149 {
2150 spa_t *spa = vd->vdev_spa;
2151 avl_tree_t reftree;
2152 int minref;
2153
2154 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2155
2156 for (int c = 0; c < vd->vdev_children; c++)
2157 vdev_dtl_reassess(vd->vdev_child[c], txg,
2158 scrub_txg, scrub_done);
2159
2160 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
2161 return;
2162
2163 if (vd->vdev_ops->vdev_op_leaf) {
2164 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
2165
2166 mutex_enter(&vd->vdev_dtl_lock);
2167
2168 /*
2169 * If we've completed a scan cleanly then determine
2170 * if this vdev should remove any DTLs. We only want to
2171 * excise regions on vdevs that were available during
2172 * the entire duration of this scan.
2173 */
2174 if (scrub_txg != 0 &&
2175 (spa->spa_scrub_started ||
2176 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
2177 vdev_dtl_should_excise(vd)) {
2178 /*
2179 * We completed a scrub up to scrub_txg. If we
2180 * did it without rebooting, then the scrub dtl
2246 for (int c = 0; c < vd->vdev_children; c++) {
2247 vdev_t *cvd = vd->vdev_child[c];
2248 mutex_enter(&cvd->vdev_dtl_lock);
2249 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
2250 mutex_exit(&cvd->vdev_dtl_lock);
2251 }
2252 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
2253 space_reftree_destroy(&reftree);
2254 }
2255 mutex_exit(&vd->vdev_dtl_lock);
2256 }
2257
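/*
 * Load a leaf vdev's DTL_MISSING from its on-disk space map; interior
 * vdevs simply recurse into their children.
 */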
2258 int
2259 vdev_dtl_load(vdev_t *vd)
2260 {
2261 spa_t *spa = vd->vdev_spa;
2262 objset_t *mos = spa->spa_meta_objset;
2263 int error = 0;
2264
2265 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2266 ASSERT(vdev_is_concrete(vd));
2267
2268 error = space_map_open(&vd->vdev_dtl_sm, mos,
2269 vd->vdev_dtl_object, 0, -1ULL, 0);
2270 if (error)
2271 return (error);
2272 ASSERT(vd->vdev_dtl_sm != NULL);
2273
2274 mutex_enter(&vd->vdev_dtl_lock);
2275
2276 /*
2277 * Now that we've opened the space_map we need to update
2278 * the in-core DTL.
2279 */
2280 space_map_update(vd->vdev_dtl_sm);
2281
2282 error = space_map_load(vd->vdev_dtl_sm,
2283 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2284 mutex_exit(&vd->vdev_dtl_lock);
2285
2286 return (error);
2287 }
2288
2289 for (int c = 0; c < vd->vdev_children; c++) {
2328 !vd->vdev_top->vdev_removing) {
2329 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
2330 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
2331 }
2332 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
2333 vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
2334 }
2335 }
2336 for (uint64_t i = 0; i < vd->vdev_children; i++) {
2337 vdev_construct_zaps(vd->vdev_child[i], tx);
2338 }
2339 }
2340
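/*
 * Persist a leaf vdev's DTL_MISSING to its space map for the given txg,
 * allocating, truncating, or freeing the space map object as needed.
 */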
2341 void
2342 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
2343 {
2344 spa_t *spa = vd->vdev_spa;
2345 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
2346 objset_t *mos = spa->spa_meta_objset;
2347 range_tree_t *rtsync;
2348 dmu_tx_t *tx;
2349 uint64_t object = space_map_object(vd->vdev_dtl_sm);
2350
2351 ASSERT(vdev_is_concrete(vd));
2352 ASSERT(vd->vdev_ops->vdev_op_leaf);
2353
2354 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2355
2356 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
2357 mutex_enter(&vd->vdev_dtl_lock);
2358 space_map_free(vd->vdev_dtl_sm, tx);
2359 space_map_close(vd->vdev_dtl_sm);
2360 vd->vdev_dtl_sm = NULL;
2361 mutex_exit(&vd->vdev_dtl_lock);
2362
2363 /*
2364 * We only destroy the leaf ZAP for detached leaves or for
2365 * removed log devices. Removed data devices handle leaf ZAP
2366 * cleanup later, once cancellation is no longer possible.
2367 */
2368 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
2369 vd->vdev_top->vdev_islog)) {
2370 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
2371 vd->vdev_leaf_zap = 0;
2372 }
2373
2374 dmu_tx_commit(tx);
2375 return;
2376 }
2377
2378 if (vd->vdev_dtl_sm == NULL) {
2379 uint64_t new_object;
2380
2381 new_object = space_map_alloc(mos, tx);
2382 VERIFY3U(new_object, !=, 0);
2383
2384 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
2385 0, -1ULL, 0));
2386 ASSERT(vd->vdev_dtl_sm != NULL);
2387 }
2388
2389 rtsync = range_tree_create(NULL, NULL);
2390
2391 mutex_enter(&vd->vdev_dtl_lock);
2392 range_tree_walk(rt, range_tree_add, rtsync);
2393 mutex_exit(&vd->vdev_dtl_lock);
2394
2395 space_map_truncate(vd->vdev_dtl_sm, tx);
2396 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
2397 range_tree_vacate(rtsync, NULL, NULL);
2398
2399 range_tree_destroy(rtsync);
2400
2401 /*
2402 * If the object for the space map has changed then dirty
2403 * the top level so that we update the config.
2404 */
2405 if (object != space_map_object(vd->vdev_dtl_sm)) {
2406 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2407 "new object %llu", (u_longlong_t)txg, spa_name(spa),
2408 (u_longlong_t)object,
2409 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2410 vdev_config_dirty(vd->vdev_top);
2411 }
2412
2413 dmu_tx_commit(tx);
2414
2415 mutex_enter(&vd->vdev_dtl_lock);
2416 space_map_update(vd->vdev_dtl_sm);
2417 mutex_exit(&vd->vdev_dtl_lock);
2418 }
2419
2420 /*
2421 * Determine whether the specified vdev can be offlined/detached/removed
2422 * without losing data.
2423 */
2424 boolean_t
2425 vdev_dtl_required(vdev_t *vd)
2426 {
2427 spa_t *spa = vd->vdev_spa;
2428 vdev_t *tvd = vd->vdev_top;
2429 uint8_t cant_read = vd->vdev_cant_read;
2474 } else {
2475 for (int c = 0; c < vd->vdev_children; c++) {
2476 vdev_t *cvd = vd->vdev_child[c];
2477 uint64_t cmin, cmax;
2478
2479 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2480 thismin = MIN(thismin, cmin);
2481 thismax = MAX(thismax, cmax);
2482 needed = B_TRUE;
2483 }
2484 }
2485 }
2486
2487 if (needed && minp) {
2488 *minp = thismin;
2489 *maxp = thismax;
2490 }
2491 return (needed);
2492 }
2493
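/*
 * Load a vdev's persistent state from the MOS: metaslabs for top-level
 * vdevs, the DTL for leaves, and the obsolete space map if one exists.
 * Recurses over all children.
 */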
2494 int
2495 vdev_load(vdev_t *vd)
2496 {
2497 int error = 0;
2498 /*
2499 * Recursively load all children.
2500 */
2501 for (int c = 0; c < vd->vdev_children; c++) {
2502 error = vdev_load(vd->vdev_child[c]);
2503 if (error != 0) {
2504 return (error);
2505 }
2506 }
2507
2508 vdev_set_deflate_ratio(vd);
2509
2510 /*
2511 * If this is a top-level vdev, initialize its metaslabs.
2512 */
2513 if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2514 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2515 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2516 VDEV_AUX_CORRUPT_DATA);
2517 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2518 "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2519 (u_longlong_t)vd->vdev_asize);
2520 return (SET_ERROR(ENXIO));
2521 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
2522 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2523 "[error=%d]", error);
2524 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2525 VDEV_AUX_CORRUPT_DATA);
2526 return (error);
2527 }
2528 }
2529
2530 /*
2531 * If this is a leaf vdev, load its DTL.
2532 */
2533 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2534 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2535 VDEV_AUX_CORRUPT_DATA);
2536 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
2537 "[error=%d]", error);
2538 return (error);
2539 }
2540
2541 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
2542 if (obsolete_sm_object != 0) {
2543 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2544 ASSERT(vd->vdev_asize != 0);
2545 ASSERT(vd->vdev_obsolete_sm == NULL);
2546
2547 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2548 obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2549 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2550 VDEV_AUX_CORRUPT_DATA);
2551 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2552 "obsolete spacemap (obj %llu) [error=%d]",
2553 (u_longlong_t)obsolete_sm_object, error);
2554 return (error);
2555 }
2556 space_map_update(vd->vdev_obsolete_sm);
2557 }
2558
2559 return (0);
2560 }
2561
2562 /*
2563 * The special vdev case is used for hot spares and l2cache devices. Its
2564 * sole purpose is to set the vdev state for the associated vdev. To do this,
2565 * we make sure that we can open the underlying device, then try to read the
2566 * label, and make sure that the label is sane and that it hasn't been
2567 * repurposed to another pool.
2568 */
2569 int
2570 vdev_validate_aux(vdev_t *vd)
2571 {
2572 nvlist_t *label;
2573 uint64_t guid, version;
2574 uint64_t state;
2575
2576 if (!vdev_readable(vd))
2577 return (0);
2578
2579 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2584
2585 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2586 !SPA_VERSION_IS_SUPPORTED(version) ||
2587 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2588 guid != vd->vdev_guid ||
2589 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2590 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2591 VDEV_AUX_CORRUPT_DATA);
2592 nvlist_free(label);
2593 return (-1);
2594 }
2595
2596 /*
2597 * We don't actually check the pool state here. If it's in fact in
2598 * use by another pool, we update this fact on the fly when requested.
2599 */
2600 nvlist_free(label);
2601 return (0);
2602 }
2603
2604 /*
2605 * Free the objects used to store this vdev's spacemaps, and the array
2606 * that points to them.
2607 */
2608 void
2609 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
2610 {
2611 if (vd->vdev_ms_array == 0)
2612 return;
2613
2614 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2615 uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
2616 size_t array_bytes = array_count * sizeof (uint64_t);
2617 uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
2618 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
2619 array_bytes, smobj_array, 0));
2620
2621 for (uint64_t i = 0; i < array_count; i++) {
2622 uint64_t smobj = smobj_array[i];
2623 if (smobj == 0)
2624 continue;
2625
2626 space_map_free_obj(mos, smobj, tx);
2627 }
2628
2629 kmem_free(smobj_array, array_bytes);
2630 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
2631 vd->vdev_ms_array = 0;
2632 }
2633
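/*
 * Reclaim the metadata of an emptied top-level vdev: verify its metaslabs
 * hold no allocations, then destroy its space maps and, for log devices,
 * its top-level ZAP.
 */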
2634 static void
2635 vdev_remove_empty(vdev_t *vd, uint64_t txg)
2636 {
2637 spa_t *spa = vd->vdev_spa;
2638 dmu_tx_t *tx;
2639
2640 ASSERT(vd == vd->vdev_top);
2641 ASSERT3U(txg, ==, spa_syncing_txg(spa));
2642
2643 if (vd->vdev_ms != NULL) {
2644 metaslab_group_t *mg = vd->vdev_mg;
2645
2646 metaslab_group_histogram_verify(mg);
2647 metaslab_class_histogram_verify(mg->mg_class);
2648
2649 for (int m = 0; m < vd->vdev_ms_count; m++) {
2650 metaslab_t *msp = vd->vdev_ms[m];
2651
2652 if (msp == NULL || msp->ms_sm == NULL)
2653 continue;
2654
2655 mutex_enter(&msp->ms_lock);
2656 /*
2657 * If the metaslab was not loaded when the vdev
2658 * was removed then the histogram accounting may
2659 * not be accurate. Update the histogram information
2660 * here so that we ensure that the metaslab group
2661 * and metaslab class are up-to-date.
2662 */
2663 metaslab_group_histogram_remove(mg, msp);
2664
2665 VERIFY0(space_map_allocated(msp->ms_sm));
2666 space_map_close(msp->ms_sm);
2667 msp->ms_sm = NULL;
2668 mutex_exit(&msp->ms_lock);
2669 }
2670
2671 metaslab_group_histogram_verify(mg);
2672 metaslab_class_histogram_verify(mg->mg_class);
2673 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
2674 ASSERT0(mg->mg_histogram[i]);
2675 }
2676
2677 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2678 vdev_destroy_spacemaps(vd, tx);
2679
2680 if (vd->vdev_islog && vd->vdev_top_zap != 0) {
2681 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
2682 vd->vdev_top_zap = 0;
2683 }
2684 dmu_tx_commit(tx);
2685 }
2686
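/*
 * Called after the txg has synced: finish any metaslabs cleaned in this
 * txg and, if any were processed, reassess the metaslab group.
 */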
2687 void
2688 vdev_sync_done(vdev_t *vd, uint64_t txg)
2689 {
2690 metaslab_t *msp;
2691 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2692
2693 ASSERT(vdev_is_concrete(vd));
2694
2695 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
2696 metaslab_sync_done(msp, txg);
2697
2698 if (reassess)
2699 metaslab_sync_reassess(vd->vdev_mg);
2700 }
2701
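/*
 * Sync a vdev's dirty state for the given txg: obsolete segments, the
 * metaslab array object, dirty metaslabs, and child DTLs, reclaiming
 * metadata once a removing vdev is empty.
 */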
2702 void
2703 vdev_sync(vdev_t *vd, uint64_t txg)
2704 {
2705 spa_t *spa = vd->vdev_spa;
2706 vdev_t *lvd;
2707 metaslab_t *msp;
2708 dmu_tx_t *tx;
2709
2710 if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
2711 dmu_tx_t *tx;
2712
2713 ASSERT(vd->vdev_removing ||
2714 vd->vdev_ops == &vdev_indirect_ops);
2715
2716 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2717 vdev_indirect_sync_obsolete(vd, tx);
2718 dmu_tx_commit(tx);
2719
2720 /*
2721 * If the vdev is indirect, it can't have dirty
2722 * metaslabs or DTLs.
2723 */
2724 if (vd->vdev_ops == &vdev_indirect_ops) {
2725 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
2726 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
2727 return;
2728 }
2729 }
2730
2731 ASSERT(vdev_is_concrete(vd));
2732
2733 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
2734 !vd->vdev_removing) {
2735 ASSERT(vd == vd->vdev_top);
2736 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
2737 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2738 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2739 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2740 ASSERT(vd->vdev_ms_array != 0);
2741 vdev_config_dirty(vd);
2742 dmu_tx_commit(tx);
2743 }
2744
2745 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2746 metaslab_sync(msp, txg);
2747 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2748 }
2749
2750 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2751 vdev_dtl_sync(lvd, txg);
2752
2753 /*
2754 * Remove the metadata associated with this vdev once it's empty.
2755 * Note that this is typically used for log/cache device removal;
2756 * we don't empty top-level vdevs when removing them. But if
2757 * a top-level vdev happens to be emptied, this is not harmful.
2758 */
2759 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) {
2760 vdev_remove_empty(vd, txg);
2761 }
2762
2763 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2764 }
2765
2766 uint64_t
2767 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2768 {
2769 return (vd->vdev_ops->vdev_op_asize(vd, psize));
2770 }
2771
2772 /*
2773 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
2774 * not be opened, and no I/O is attempted.
2775 */
2776 int
2777 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2778 {
2779 vdev_t *vd, *tvd;
2780
2781 spa_vdev_state_enter(spa, SCL_NONE);
2782
2866 */
2867 int
2868 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2869 {
2870 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2871 boolean_t wasoffline;
2872 vdev_state_t oldstate;
2873
2874 spa_vdev_state_enter(spa, SCL_NONE);
2875
2876 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2877 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2878
2879 if (!vd->vdev_ops->vdev_op_leaf)
2880 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2881
2882 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
2883 oldstate = vd->vdev_state;
2884
2885 tvd = vd->vdev_top;
2886 vd->vdev_offline = B_FALSE;
2887 vd->vdev_tmpoffline = B_FALSE;
2888 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2889 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2890
2891 /* XXX - L2ARC 1.0 does not support expansion */
2892 if (!vd->vdev_aux) {
2893 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2894 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2895 }
2896
2897 vdev_reopen(tvd);
2898 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2899
2900 if (!vd->vdev_aux) {
2901 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2902 pvd->vdev_expanding = B_FALSE;
2903 }
2904
2905 if (newstate)
2906 *newstate = vd->vdev_state;
2907 if ((flags & ZFS_ONLINE_UNSPARE) &&
2956 * don't allow it to be offlined. Log devices are always
2957 * expendable.
2958 */
2959 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2960 vdev_dtl_required(vd))
2961 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2962
2963 /*
2964 * If the top-level is a slog and it has had allocations
2965 * then proceed. We check that the vdev's metaslab group
2966 * is not NULL since it's possible that we have just
2967 * added this vdev but not yet initialized its metaslabs.
2968 */
2969 if (tvd->vdev_islog && mg != NULL) {
2970 /*
2971 * Prevent any future allocations.
2972 */
2973 metaslab_group_passivate(mg);
2974 (void) spa_vdev_state_exit(spa, vd, 0);
2975
2976 error = spa_reset_logs(spa);
2977
2978 spa_vdev_state_enter(spa, SCL_ALLOC);
2979
2980 /*
2981 * Check to see if the config has changed.
2982 */
2983 if (error || generation != spa->spa_config_generation) {
2984 metaslab_group_activate(mg);
2985 if (error)
2986 return (spa_vdev_state_exit(spa,
2987 vd, error));
2988 (void) spa_vdev_state_exit(spa, vd, 0);
2989 goto top;
2990 }
2991 ASSERT0(tvd->vdev_stat.vs_alloc);
2992 }
2993
2994 /*
2995 * Offline this device and reopen its top-level vdev.
2996 * If the top-level vdev is a log device then just offline
3023 int
3024 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
3025 {
3026 int error;
3027
3028 mutex_enter(&spa->spa_vdev_top_lock);
3029 error = vdev_offline_locked(spa, guid, flags);
3030 mutex_exit(&spa->spa_vdev_top_lock);
3031
3032 return (error);
3033 }
3034
3035 /*
3036 * Clear the error counts associated with this vdev. Unlike vdev_online() and
3037 * vdev_offline(), we assume the spa config is locked. We also clear all
3038 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
3039 */
3040 void
3041 vdev_clear(spa_t *spa, vdev_t *vd)
3042 {
3043 vdev_t *rvd = spa->spa_root_vdev;
3044
3045 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3046
3047 if (vd == NULL)
3048 vd = rvd;
3049
3050 vd->vdev_stat.vs_read_errors = 0;
3051 vd->vdev_stat.vs_write_errors = 0;
3052 vd->vdev_stat.vs_checksum_errors = 0;
3053
3054 for (int c = 0; c < vd->vdev_children; c++)
3055 vdev_clear(spa, vd->vdev_child[c]);
3056
3057 /*
3058 * It makes no sense to "clear" an indirect vdev.
3059 */
3060 if (!vdev_is_concrete(vd))
3061 return;
3062
3063 /*
3064 * If we're in the FAULTED state or have experienced failed I/O, then
3065 * clear the persistent state and attempt to reopen the device. We
3066 * also mark the vdev config dirty, so that the new faulted state is
3067 * written out to disk.
3068 */
3069 if (vd->vdev_faulted || vd->vdev_degraded ||
3070 !vdev_readable(vd) || !vdev_writeable(vd)) {
3071
3072 /*
3073 * When reopening in response to a clear event, it may be due to
3074 * a fmadm repair request. In this case, if the device is
3075 * still broken, we want to post the ereport again.
3076 */
3077 vd->vdev_forcefault = B_TRUE;
3078
3079 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
3080 vd->vdev_cant_read = B_FALSE;
3081 vd->vdev_cant_write = B_FALSE;
3082
3097 * When clearing a FMA-diagnosed fault, we always want to
3098 * unspare the device, as we assume that the original spare was
3099 * done in response to the FMA fault.
3100 */
3101 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
3102 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3103 vd->vdev_parent->vdev_child[0] == vd)
3104 vd->vdev_unspare = B_TRUE;
3105 }
3106
3107 boolean_t
3108 vdev_is_dead(vdev_t *vd)
3109 {
3110 /*
3111 * Holes and missing devices are always considered "dead".
3112 * This simplifies the code since we don't have to check for
3113 * these types of devices in the various code paths.
3114 * Instead we rely on the fact that we skip over dead devices
3115 * before issuing I/O to them.
3116 */
3117 return (vd->vdev_state < VDEV_STATE_DEGRADED ||
3118 vd->vdev_ops == &vdev_hole_ops ||
3119 vd->vdev_ops == &vdev_missing_ops);
3120 }
3121
3122 boolean_t
3123 vdev_readable(vdev_t *vd)
3124 {
3125 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
3126 }
3127
3128 boolean_t
3129 vdev_writeable(vdev_t *vd)
3130 {
3131 return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
3132 vdev_is_concrete(vd));
3133 }
3134
3135 boolean_t
3136 vdev_allocatable(vdev_t *vd)
3137 {
3138 uint64_t state = vd->vdev_state;
3139
3140 /*
3141 * We currently allow allocations from vdevs which may be in the
3142 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
3143 * fails to reopen then we'll catch it later when we're holding
3144 * the proper locks. Note that we have to get the vdev state
3145 * in a local variable because although it changes atomically,
3146 * we're asking two separate questions about it.
3147 */
3148 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
3149 !vd->vdev_cant_write && vdev_is_concrete(vd) &&
3150 vd->vdev_mg->mg_initialized);
3151 }
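/*
 * A minimal, self-contained sketch of the "snapshot once, ask two
 * questions" pattern described above. The function and parameter
 * names below are illustrative only and are not part of the ZFS
 * code; they simply show why the shared value is copied into a local
 * before both comparisons.
 */
static boolean_t
example_state_allows_alloc(volatile uint64_t *shared_state,
    uint64_t closed, uint64_t degraded)
{
	uint64_t state = *shared_state;	/* read the shared value exactly once */

	/*
	 * Both tests evaluate the same snapshot, so a concurrent state
	 * change cannot make the two questions disagree with each other.
	 */
	return (!(state < degraded && state != closed));
}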
3152
3153 boolean_t
3154 vdev_accessible(vdev_t *vd, zio_t *zio)
3155 {
3156 ASSERT(zio->io_vd == vd);
3157
3158 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
3159 return (B_FALSE);
3160
3161 if (zio->io_type == ZIO_TYPE_READ)
3162 return (!vd->vdev_cant_read);
3163
3164 if (zio->io_type == ZIO_TYPE_WRITE)
3165 return (!vd->vdev_cant_write);
3166
3167 return (B_TRUE);
3168 }
3169
3178 vdev_t *tvd = vd->vdev_top;
3179
3180 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
3181
3182 mutex_enter(&vd->vdev_stat_lock);
3183 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
3184 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
3185 vs->vs_state = vd->vdev_state;
3186 vs->vs_rsize = vdev_get_min_asize(vd);
3187 if (vd->vdev_ops->vdev_op_leaf)
3188 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
3189 /*
3190 * Report expandable space on top-level, non-auxiliary devices only.
3191 * The expandable space is reported in terms of metaslab-sized units
3192 * since that determines how much space the pool can expand.
3193 */
3194 if (vd->vdev_aux == NULL && tvd != NULL) {
3195 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
3196 spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
3197 }
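/*
 * Worked example with illustrative numbers: with 1 GB metaslabs
 * (tvd->vdev_ms_shift == 30) and a raw difference of 10 GB plus
 * 37 MB, P2ALIGN() rounds down to the metaslab boundary and
 * vs_esize is reported as exactly 10 GB, since the pool can only
 * grow by whole metaslabs.
 */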
3198 if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
3199 vdev_is_concrete(vd)) {
3200 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
3201 }
3202
3203 /*
3204 * If we're getting stats on the root vdev, aggregate the I/O counts
3205 * over all top-level vdevs (i.e. the direct children of the root).
3206 */
3207 if (vd == rvd) {
3208 for (int c = 0; c < rvd->vdev_children; c++) {
3209 vdev_t *cvd = rvd->vdev_child[c];
3210 vdev_stat_t *cvs = &cvd->vdev_stat;
3211
3212 for (int t = 0; t < ZIO_TYPES; t++) {
3213 vs->vs_ops[t] += cvs->vs_ops[t];
3214 vs->vs_bytes[t] += cvs->vs_bytes[t];
3215 }
3216 cvs->vs_scan_removing = cvd->vdev_removing;
3217 }
3218 }
3219 mutex_exit(&vd->vdev_stat_lock);
3220 }
3221
3222 void
3223 vdev_clear_stats(vdev_t *vd)
3224 {
3225 mutex_enter(&vd->vdev_stat_lock);
3226 vd->vdev_stat.vs_space = 0;
3227 vd->vdev_stat.vs_dspace = 0;
3228 vd->vdev_stat.vs_alloc = 0;
3229 mutex_exit(&vd->vdev_stat_lock);
3230 }
3231
3232 void
3233 vdev_scan_stat_init(vdev_t *vd)
3234 {
3287
3288 if (flags & ZIO_FLAG_IO_REPAIR) {
3289 if (flags & ZIO_FLAG_SCAN_THREAD) {
3290 dsl_scan_phys_t *scn_phys =
3291 &spa->spa_dsl_pool->dp_scan->scn_phys;
3292 uint64_t *processed = &scn_phys->scn_processed;
3293
3294 /* XXX cleanup? */
3295 if (vd->vdev_ops->vdev_op_leaf)
3296 atomic_add_64(processed, psize);
3297 vs->vs_scan_processed += psize;
3298 }
3299
3300 if (flags & ZIO_FLAG_SELF_HEAL)
3301 vs->vs_self_healed += psize;
3302 }
3303
3304 vs->vs_ops[type]++;
3305 vs->vs_bytes[type] += psize;
3306
3307 mutex_exit(&vd->vdev_stat_lock);
3308 return;
3309 }
3310
3311 if (flags & ZIO_FLAG_SPECULATIVE)
3312 return;
3313
3314 /*
3315 * If this is an I/O error that is going to be retried, then ignore the
3316 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
3317 * hard errors, when in reality they can happen for any number of
3318 * innocuous reasons (bus resets, MPxIO link failure, etc.).
3319 */
3320 if (zio->io_error == EIO &&
3321 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
3322 return;
3323
3324 /*
3325 * Intent log writes won't propagate their error to the root
3326 * I/O so don't mark these types of failures as pool-level
3327 * errors.
3328 */
3329 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
3330 return;
3331
3332 mutex_enter(&vd->vdev_stat_lock);
3333 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
3334 if (zio->io_error == ECKSUM)
3335 vs->vs_checksum_errors++;
3336 else
3337 vs->vs_read_errors++;
3338 }
3339 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
3340 vs->vs_write_errors++;
3341 mutex_exit(&vd->vdev_stat_lock);
3342
3343 if (spa->spa_load_state == SPA_LOAD_NONE &&
3344 type == ZIO_TYPE_WRITE && txg != 0 &&
3345 (!(flags & ZIO_FLAG_IO_REPAIR) ||
3346 (flags & ZIO_FLAG_SCAN_THREAD) ||
3347 spa->spa_claiming)) {
3348 /*
3349 * This is either a normal write (not a repair), or it's
3350 * a repair induced by the scrub thread, or it's a repair
3351 * made by zil_claim() during spa_load() in the first txg.
3352 * In the normal case, we commit the DTL change in the same
3353 * txg as the block was born. In the scrub-induced repair
3354 * case, we know that scrubs run in first-pass syncing context,
3355 * so we commit the DTL change in spa_syncing_txg(spa).
3356 * In the zil_claim() case, we commit in spa_first_txg(spa).
3357 *
3358 * We currently do not make DTL entries for failed spontaneous
3359 * self-healing writes triggered by normal (non-scrubbing)
3360 * reads, because we have no transactional context in which to
3361 * do so -- and it's not clear that it'd be desirable anyway.
3362 */
3363 if (vd->vdev_ops->vdev_op_leaf) {
3364 uint64_t commit_txg = txg;
3399
3400 ASSERT(vd == vd->vdev_top);
3401
3402 /*
3403 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
3404 * factor. We must calculate this here and not at the root vdev
3405 * because the root vdev's psize-to-asize is simply the max of its
3406 * children's, and thus not accurate enough for us.
3407 */
3408 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
3409 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
3410 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
3411 vd->vdev_deflate_ratio;
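/*
 * Worked example (illustrative): vdev_deflate_ratio is 512 for a
 * plain disk or mirror top-level vdev, and roughly 409 for a
 * hypothetical 5-wide raidz1 with 512-byte sectors (see the
 * computation in vdev_metaslab_init()). With a ratio of 409, every
 * 512 bytes of allocated asize is counted as about 409 bytes of
 * deflated space, so vs_dspace reflects usable data rather than
 * data plus parity.
 */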
3412
3413 mutex_enter(&vd->vdev_stat_lock);
3414 vd->vdev_stat.vs_alloc += alloc_delta;
3415 vd->vdev_stat.vs_space += space_delta;
3416 vd->vdev_stat.vs_dspace += dspace_delta;
3417 mutex_exit(&vd->vdev_stat_lock);
3418
3419 if (mc == spa_normal_class(spa)) {
3420 mutex_enter(&rvd->vdev_stat_lock);
3421 rvd->vdev_stat.vs_alloc += alloc_delta;
3422 rvd->vdev_stat.vs_space += space_delta;
3423 rvd->vdev_stat.vs_dspace += dspace_delta;
3424 mutex_exit(&rvd->vdev_stat_lock);
3425 }
3426
3427 if (mc != NULL) {
3428 ASSERT(rvd == vd->vdev_parent);
3429 ASSERT(vd->vdev_ms_count != 0);
3430
3431 metaslab_class_space_update(mc,
3432 alloc_delta, defer_delta, space_delta, dspace_delta);
3433 }
3434 }
3435
3436 /*
3437 * Mark a top-level vdev's config as dirty, placing it on the dirty list
3438 * so that it will be written out next time the vdev configuration is synced.
3439 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
3489 return;
3490 }
3491
3492 /*
3493 * The dirty list is protected by the SCL_CONFIG lock. The caller
3494 * must either hold SCL_CONFIG as writer, or must be the sync thread
3495 * (which holds SCL_CONFIG as reader). There's only one sync thread,
3496 * so this is sufficient to ensure mutual exclusion.
3497 */
3498 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3499 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3500 spa_config_held(spa, SCL_CONFIG, RW_READER)));
3501
3502 if (vd == rvd) {
3503 for (c = 0; c < rvd->vdev_children; c++)
3504 vdev_config_dirty(rvd->vdev_child[c]);
3505 } else {
3506 ASSERT(vd == vd->vdev_top);
3507
3508 if (!list_link_active(&vd->vdev_config_dirty_node) &&
3509 vdev_is_concrete(vd)) {
3510 list_insert_head(&spa->spa_config_dirty_list, vd);
3511 }
3512 }
3513 }
3514
3515 void
3516 vdev_config_clean(vdev_t *vd)
3517 {
3518 spa_t *spa = vd->vdev_spa;
3519
3520 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3521 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3522 spa_config_held(spa, SCL_CONFIG, RW_READER)));
3523
3524 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
3525 list_remove(&spa->spa_config_dirty_list, vd);
3526 }
3527
3528 /*
3529 * Mark a top-level vdev's state as dirty, so that the next pass of
3530 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
3531 * the state changes from larger config changes because they require
3532 * much less locking, and are often needed for administrative actions.
3533 */
3534 void
3535 vdev_state_dirty(vdev_t *vd)
3536 {
3537 spa_t *spa = vd->vdev_spa;
3538
3539 ASSERT(spa_writeable(spa));
3540 ASSERT(vd == vd->vdev_top);
3541
3542 /*
3543 * The state list is protected by the SCL_STATE lock. The caller
3544 * must either hold SCL_STATE as writer, or must be the sync thread
3545 * (which holds SCL_STATE as reader). There's only one sync thread,
3546 * so this is sufficient to ensure mutual exclusion.
3547 */
3548 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3549 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3550 spa_config_held(spa, SCL_STATE, RW_READER)));
3551
3552 if (!list_link_active(&vd->vdev_state_dirty_node) &&
3553 vdev_is_concrete(vd))
3554 list_insert_head(&spa->spa_state_dirty_list, vd);
3555 }
3556
3557 void
3558 vdev_state_clean(vdev_t *vd)
3559 {
3560 spa_t *spa = vd->vdev_spa;
3561
3562 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3563 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3564 spa_config_held(spa, SCL_STATE, RW_READER)));
3565
3566 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
3567 list_remove(&spa->spa_state_dirty_list, vd);
3568 }
3569
3570 /*
3571 * Propagate vdev state up from children to parent.
3572 */
3573 void
3574 vdev_propagate_state(vdev_t *vd)
3575 {
3576 spa_t *spa = vd->vdev_spa;
3577 vdev_t *rvd = spa->spa_root_vdev;
3578 int degraded = 0, faulted = 0;
3579 int corrupted = 0;
3580 vdev_t *child;
3581
3582 if (vd->vdev_children > 0) {
3583 for (int c = 0; c < vd->vdev_children; c++) {
3584 child = vd->vdev_child[c];
3585
3586 /*
3587 * Don't factor holes or indirect vdevs into the
3588 * decision.
3589 */
3590 if (!vdev_is_concrete(child))
3591 continue;
3592
3593 if (!vdev_readable(child) ||
3594 (!vdev_writeable(child) && spa_writeable(spa))) {
3595 /*
3596 * Root special: if there is a top-level log
3597 * device, treat the root vdev as if it were
3598 * degraded.
3599 */
3600 if (child->vdev_islog && vd == rvd)
3601 degraded++;
3602 else
3603 faulted++;
3604 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
3605 degraded++;
3606 }
3607
3608 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
3609 corrupted++;
3610 }
3745 case VDEV_AUX_BAD_LABEL:
3746 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3747 break;
3748 default:
3749 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3750 }
3751
3752 zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3753 }
3754
3755 /* Erase any notion of persistent removed state */
3756 vd->vdev_removed = B_FALSE;
3757 } else {
3758 vd->vdev_removed = B_FALSE;
3759 }
3760
3761 if (!isopen && vd->vdev_parent)
3762 vdev_propagate_state(vd->vdev_parent);
3763 }
3764
3765 boolean_t
3766 vdev_children_are_offline(vdev_t *vd)
3767 {
3768 ASSERT(!vd->vdev_ops->vdev_op_leaf);
3769
3770 for (uint64_t i = 0; i < vd->vdev_children; i++) {
3771 if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
3772 return (B_FALSE);
3773 }
3774
3775 return (B_TRUE);
3776 }
3777
3778 /*
3779 * Check the vdev configuration to ensure that it's capable of supporting
3780 * a root pool. We do not support partial configuration.
3781 * In addition, only a single top-level vdev is allowed.
3782 */
3783 boolean_t
3784 vdev_is_bootable(vdev_t *vd)
3785 {
3786 if (!vd->vdev_ops->vdev_op_leaf) {
3787 char *vdev_type = vd->vdev_ops->vdev_op_type;
3788
3789 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3790 vd->vdev_children > 1) {
3791 return (B_FALSE);
3792 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
3793 strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
3794 return (B_FALSE);
3795 }
3796 }
3797
3798 for (int c = 0; c < vd->vdev_children; c++) {
3799 if (!vdev_is_bootable(vd->vdev_child[c]))
3800 return (B_FALSE);
3801 }
3802 return (B_TRUE);
3803 }
3804
3805 boolean_t
3806 vdev_is_concrete(vdev_t *vd)
3807 {
3808 vdev_ops_t *ops = vd->vdev_ops;
3809 if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
3810 ops == &vdev_missing_ops || ops == &vdev_root_ops) {
3811 return (B_FALSE);
3812 } else {
3813 return (B_TRUE);
3814 }
3815 }
3816
3817 /*
3818 * Determine if a log device has valid content. If the vdev was
3819 * removed or faulted in the MOS config then we know that
3820 * the content on the log device has already been written to the pool.
3821 */
3822 boolean_t
3823 vdev_log_state_valid(vdev_t *vd)
3824 {
3825 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3826 !vd->vdev_removed)
3827 return (B_TRUE);
3828
3829 for (int c = 0; c < vd->vdev_children; c++)
3830 if (vdev_log_state_valid(vd->vdev_child[c]))
3831 return (B_TRUE);
3832
3833 return (B_FALSE);
3834 }
3835
3836 /*
3837 * Expand a vdev if possible.
3838 */
3839 void
3840 vdev_expand(vdev_t *vd, uint64_t txg)
3841 {
3842 ASSERT(vd->vdev_top == vd);
3843 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3844
3845 vdev_set_deflate_ratio(vd);
3846
3847 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
3848 vdev_is_concrete(vd)) {
3849 VERIFY(vdev_metaslab_init(vd, txg) == 0);
3850 vdev_config_dirty(vd);
3851 }
3852 }
3853
3854 /*
3855 * Split a vdev.
3856 */
3857 void
3858 vdev_split(vdev_t *vd)
3859 {
3860 vdev_t *cvd, *pvd = vd->vdev_parent;
3861
3862 vdev_remove_child(pvd, vd);
3863 vdev_compact_children(pvd);
3864
3865 cvd = pvd->vdev_child[0];
3866 if (pvd->vdev_children == 1) {
3867 vdev_remove_parent(cvd);
3868 cvd->vdev_splitting = B_TRUE;
3879 vdev_deadman(cvd);
3880 }
3881
3882 if (vd->vdev_ops->vdev_op_leaf) {
3883 vdev_queue_t *vq = &vd->vdev_queue;
3884
3885 mutex_enter(&vq->vq_lock);
3886 if (avl_numnodes(&vq->vq_active_tree) > 0) {
3887 spa_t *spa = vd->vdev_spa;
3888 zio_t *fio;
3889 uint64_t delta;
3890
3891 /*
3892 * Look at the head of all the pending queues;
3893 * if any I/O has been outstanding for longer than
3894 * the spa_deadman_synctime, we panic the system.
3895 */
3896 fio = avl_first(&vq->vq_active_tree);
3897 delta = gethrtime() - fio->io_timestamp;
3898 if (delta > spa_deadman_synctime(spa)) {
3899 vdev_dbgmsg(vd, "SLOW IO: zio timestamp "
3900 "%lluns, delta %lluns, last io %lluns",
3901 fio->io_timestamp, (u_longlong_t)delta,
3902 vq->vq_io_complete_ts);
3903 fm_panic("I/O to pool '%s' appears to be "
3904 "hung.", spa_name(spa));
3905 }
3906 }
3907 mutex_exit(&vq->vq_lock);
3908 }
3909 }
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25 * Copyright 2018 Nexenta Systems, Inc.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Toomas Soome <tsoome@me.com>
28 * Copyright 2017 Joyent, Inc.
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/fm/fs/zfs.h>
33 #include <sys/spa.h>
34 #include <sys/spa_impl.h>
35 #include <sys/dmu.h>
36 #include <sys/dmu_tx.h>
37 #include <sys/vdev_impl.h>
38 #include <sys/uberblock_impl.h>
39 #include <sys/metaslab.h>
40 #include <sys/metaslab_impl.h>
41 #include <sys/space_map.h>
42 #include <sys/space_reftree.h>
43 #include <sys/zio.h>
44 #include <sys/zap.h>
45 #include <sys/fs/zfs.h>
46 #include <sys/arc.h>
47 #include <sys/zil.h>
48 #include <sys/dsl_scan.h>
49 #include <sys/abd.h>
50
51 /*
52 * Virtual device management.
53 */
54
55 static vdev_ops_t *vdev_ops_table[] = {
56 &vdev_root_ops,
57 &vdev_raidz_ops,
58 &vdev_mirror_ops,
59 &vdev_replacing_ops,
60 &vdev_spare_ops,
61 &vdev_disk_ops,
62 &vdev_file_ops,
63 &vdev_missing_ops,
64 &vdev_hole_ops,
65 NULL
66 };
67
68 /* maximum scrub/resilver I/O queue per leaf vdev */
69 int zfs_scrub_limit = 10;
70
71 /*
72 * alpha for exponential moving average of I/O latency (in 1/10th of a percent)
73 */
74 int zfs_vs_latency_alpha = 100;
75
76 /*
77 * When a vdev is added, it will be divided into approximately (but no
78 * more than) this number of metaslabs.
79 */
80 int metaslabs_per_vdev = 200;
81
82 /*
83 * Given a vdev type, return the appropriate ops vector.
84 */
85 static vdev_ops_t *
86 vdev_getops(const char *type)
87 {
88 vdev_ops_t *ops, **opspp;
89
90 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
91 if (strcmp(ops->vdev_op_type, type) == 0)
92 break;
93
94 return (ops);
95 }
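/*
 * Illustrative usage sketch (the helper below is never called and its
 * name is made up): an unrecognized type string walks off the end of
 * the NULL-terminated vdev_ops_table and yields NULL, which
 * vdev_alloc() treats as EINVAL.
 */
static boolean_t
example_vdev_type_is_known(const char *type)
{
	return (vdev_getops(type) != NULL);
}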
96
97 boolean_t
98 vdev_is_special(vdev_t *vd)
99 {
100 return (vd ? vd->vdev_isspecial : B_FALSE);
101 }
102
103 /*
104 * Default asize function: return the MAX of psize with the asize of
105 * all children. This is what's used by anything other than RAID-Z.
106 */
107 uint64_t
108 vdev_default_asize(vdev_t *vd, uint64_t psize)
109 {
110 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
111 uint64_t csize;
112
113 for (int c = 0; c < vd->vdev_children; c++) {
114 csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
115 asize = MAX(asize, csize);
116 }
117
118 return (asize);
119 }
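/*
 * A self-contained sketch of the rule above, assuming a power-of-two
 * allocation granularity: round the physical size up to the top-level
 * vdev's 1 << ashift boundary, then take the maximum over the
 * children's requirements. The function name and the local rounding
 * macro are illustrative only.
 */
#define	EXAMPLE_ROUNDUP(x, align)	(((x) + (align) - 1) & ~((align) - 1))

static uint64_t
example_default_asize(uint64_t psize, uint64_t ashift,
    const uint64_t *child_asize, int children)
{
	uint64_t asize = EXAMPLE_ROUNDUP(psize, 1ULL << ashift);

	/* Every child must be able to hold the block, so take the max. */
	for (int c = 0; c < children; c++) {
		if (child_asize[c] > asize)
			asize = child_asize[c];
	}

	return (asize);
}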
120
121 /*
122 * Get the minimum allocatable size. We define the allocatable size as
228 cvd->vdev_parent = pvd;
229
230 if (pvd == NULL)
231 return;
232
233 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
234
235 oldsize = pvd->vdev_children * sizeof (vdev_t *);
236 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
237 newsize = pvd->vdev_children * sizeof (vdev_t *);
238
239 newchild = kmem_zalloc(newsize, KM_SLEEP);
240 if (pvd->vdev_child != NULL) {
241 bcopy(pvd->vdev_child, newchild, oldsize);
242 kmem_free(pvd->vdev_child, oldsize);
243 }
244
245 pvd->vdev_child = newchild;
246 pvd->vdev_child[id] = cvd;
247
248 cvd->vdev_isspecial_child =
249 (pvd->vdev_isspecial || pvd->vdev_isspecial_child);
250
251 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
252 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
253
254 /*
255 * Walk up all ancestors to update guid sum.
256 */
257 for (; pvd != NULL; pvd = pvd->vdev_parent)
258 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
259 }
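/*
 * A compact sketch of the invariant maintained by the loop above:
 * every vdev's guid_sum equals its own guid plus the guid_sums of all
 * of its descendants, which is why adding a child adds the child's
 * guid_sum to every ancestor. The root's guid_sum is later compared
 * against the uberblock's guid sum to detect missing or swapped
 * devices. The example type and function are illustrative only.
 */
struct example_node {
	uint64_t guid;
	uint64_t guid_sum;		/* guid + sum over all descendants */
	struct example_node *parent;
};

static void
example_attach(struct example_node *parent, struct example_node *child)
{
	child->parent = parent;
	for (; parent != NULL; parent = parent->parent)
		parent->guid_sum += child->guid_sum;
}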
260
261 void
262 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
263 {
264 int c;
265 uint_t id = cvd->vdev_id;
266
267 ASSERT(cvd->vdev_parent == pvd);
268
269 if (pvd == NULL)
270 return;
312
313 for (int c = newc = 0; c < oldc; c++) {
314 if ((cvd = pvd->vdev_child[c]) != NULL) {
315 newchild[newc] = cvd;
316 cvd->vdev_id = newc++;
317 }
318 }
319
320 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
321 pvd->vdev_child = newchild;
322 pvd->vdev_children = newc;
323 }
324
325 /*
326 * Allocate and minimally initialize a vdev_t.
327 */
328 vdev_t *
329 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
330 {
331 vdev_t *vd;
332
333 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
334
335 if (spa->spa_root_vdev == NULL) {
336 ASSERT(ops == &vdev_root_ops);
337 spa->spa_root_vdev = vd;
338 spa->spa_load_guid = spa_generate_guid(NULL);
339 }
340
341 if (guid == 0 && ops != &vdev_hole_ops) {
342 if (spa->spa_root_vdev == vd) {
343 /*
344 * The root vdev's guid will also be the pool guid,
345 * which must be unique among all pools.
346 */
347 guid = spa_generate_guid(NULL);
348 } else {
349 /*
350 * Any other vdev's guid must be unique within the pool.
351 */
352 guid = spa_generate_guid(spa);
353 }
354 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
355 }
356
357 vd->vdev_spa = spa;
358 vd->vdev_id = id;
359 vd->vdev_guid = guid;
360 vd->vdev_guid_sum = guid;
361 vd->vdev_ops = ops;
362 vd->vdev_state = VDEV_STATE_CLOSED;
363 vd->vdev_ishole = (ops == &vdev_hole_ops);
364
365 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
366 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
367 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
368 mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
369 rw_init(&vd->vdev_tsd_lock, NULL, RW_DEFAULT, NULL);
370 for (int t = 0; t < DTL_TYPES; t++) {
371 vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
372 &vd->vdev_dtl_lock);
373 }
374 txg_list_create(&vd->vdev_ms_list, spa,
375 offsetof(struct metaslab, ms_txg_node));
376 txg_list_create(&vd->vdev_dtl_list, spa,
377 offsetof(struct vdev, vdev_dtl_node));
378 vd->vdev_stat.vs_timestamp = gethrtime();
379 vdev_queue_init(vd);
380 vdev_cache_init(vd);
381
382 return (vd);
383 }
384
385 /*
386 * Allocate a new vdev. The 'alloctype' is used to control whether we are
387 * creating a new vdev or loading an existing one - the behavior is slightly
388 * different for each case.
389 */
390 int
391 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
392 int alloctype)
393 {
394 vdev_ops_t *ops;
395 char *type;
396 uint64_t guid = 0, nparity;
397 uint64_t isspecial = 0, islog = 0;
398 vdev_t *vd;
399
400 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
401
402 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
403 return (SET_ERROR(EINVAL));
404
405 if ((ops = vdev_getops(type)) == NULL)
406 return (SET_ERROR(EINVAL));
407
408 /*
409 * If this is a load, get the vdev guid from the nvlist.
410 * Otherwise, vdev_alloc_common() will generate one for us.
411 */
412 if (alloctype == VDEV_ALLOC_LOAD) {
413 uint64_t label_id;
414
415 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
416 label_id != id)
417 return (SET_ERROR(EINVAL));
418
421 } else if (alloctype == VDEV_ALLOC_SPARE) {
422 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
423 return (SET_ERROR(EINVAL));
424 } else if (alloctype == VDEV_ALLOC_L2CACHE) {
425 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
426 return (SET_ERROR(EINVAL));
427 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
428 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
429 return (SET_ERROR(EINVAL));
430 }
431
432 /*
433 * The first allocated vdev must be of type 'root'.
434 */
435 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
436 return (SET_ERROR(EINVAL));
437
438 /*
439 * Determine whether we're a log vdev.
440 */
441 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
442 if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
443 return (SET_ERROR(ENOTSUP));
444
445 /*
446 * Determine whether we're a special vdev.
447 */
448 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPECIAL, &isspecial);
449 if (isspecial && spa_version(spa) < SPA_VERSION_FEATURES)
450 return (SET_ERROR(ENOTSUP));
451
452 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
453 return (SET_ERROR(ENOTSUP));
454
455 /*
456 * Set the nparity property for RAID-Z vdevs.
457 */
458 nparity = -1ULL;
459 if (ops == &vdev_raidz_ops) {
460 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
461 &nparity) == 0) {
462 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
463 return (SET_ERROR(EINVAL));
464 /*
465 * Previous versions could only support 1 or 2 parity
466 * devices.
467 */
468 if (nparity > 1 &&
469 spa_version(spa) < SPA_VERSION_RAIDZ2)
470 return (SET_ERROR(ENOTSUP));
471 if (nparity > 2 &&
472 spa_version(spa) < SPA_VERSION_RAIDZ3)
473 return (SET_ERROR(ENOTSUP));
474 } else {
475 /*
476 * We require the parity to be specified for SPAs that
477 * support multiple parity levels.
478 */
479 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
480 return (SET_ERROR(EINVAL));
481 /*
482 * Otherwise, we default to 1 parity device for RAID-Z.
483 */
484 nparity = 1;
485 }
486 } else {
487 nparity = 0;
488 }
489 ASSERT(nparity != -1ULL);
490
491 vd = vdev_alloc_common(spa, id, guid, ops);
492
493 vd->vdev_islog = islog;
494 vd->vdev_isspecial = isspecial;
495 vd->vdev_nparity = nparity;
496 vd->vdev_isspecial_child = (parent != NULL &&
497 (parent->vdev_isspecial || parent->vdev_isspecial_child));
498
499 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
500 vd->vdev_path = spa_strdup(vd->vdev_path);
501 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
502 vd->vdev_devid = spa_strdup(vd->vdev_devid);
503 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
504 &vd->vdev_physpath) == 0)
505 vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
506 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
507 vd->vdev_fru = spa_strdup(vd->vdev_fru);
508
509 #ifdef _KERNEL
510 if (vd->vdev_path) {
511 char dev_path[MAXPATHLEN];
512 char *last_slash = NULL;
513 kstat_t *exist = NULL;
514
515 if (strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) == 0)
516 last_slash = strrchr(vd->vdev_path, '/');
517
518 (void) snprintf(dev_path, sizeof (dev_path), "%s:%s",
519 spa->spa_name, last_slash != NULL ? last_slash + 1 : vd->vdev_path);
520
521 exist = kstat_hold_byname("zfs", 0, dev_path, ALL_ZONES);
522
523 if (!exist) {
524 vd->vdev_iokstat = kstat_create("zfs", 0, dev_path,
525 "zfs", KSTAT_TYPE_IO, 1, 0);
526
527 if (vd->vdev_iokstat) {
528 vd->vdev_iokstat->ks_lock =
529 &spa->spa_iokstat_lock;
530 kstat_install(vd->vdev_iokstat);
531 }
532 } else {
533 kstat_rele(exist);
534 }
535 }
536 #endif
537
538 /*
539 * Set the whole_disk property. If it's not specified, leave the value
540 * as -1.
541 */
542 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
543 &vd->vdev_wholedisk) != 0)
544 vd->vdev_wholedisk = -1ULL;
545
546 /*
547 * Set the is_ssd property. If it's not specified, either the media
548 * is not SSD or the request failed, so we assume it's not.
549 */
550 if (nvlist_lookup_boolean(nv, ZPOOL_CONFIG_IS_SSD) == 0)
551 vd->vdev_is_ssd = B_TRUE;
552 else
553 vd->vdev_is_ssd = B_FALSE;
554
555 /*
556 * Look for the 'not present' flag. This will only be set if the device
557 * was not present at the time of import.
558 */
559 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
560 &vd->vdev_not_present);
561
562 /*
563 * Get the alignment requirement.
564 */
565 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
566
567 /*
568 * Retrieve the vdev creation time.
569 */
570 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
571 &vd->vdev_crtxg);
572
573 /*
574 * If we're a top-level vdev, try to load the allocation parameters.
575 */
576 if (parent && !parent->vdev_parent &&
577 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
578 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
579 &vd->vdev_ms_array);
580 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
581 &vd->vdev_ms_shift);
582 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
583 &vd->vdev_asize);
584 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
585 &vd->vdev_removing);
586 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
587 &vd->vdev_top_zap);
588 } else {
589 ASSERT0(vd->vdev_top_zap);
590 }
591
592 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
593 metaslab_class_t *mc = isspecial ? spa_special_class(spa) :
594 (islog ? spa_log_class(spa) : spa_normal_class(spa));
595
596 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
597 alloctype == VDEV_ALLOC_ADD ||
598 alloctype == VDEV_ALLOC_SPLIT ||
599 alloctype == VDEV_ALLOC_ROOTPOOL);
600
601 vd->vdev_mg = metaslab_group_create(mc, vd);
602 }
603
604 if (vd->vdev_ops->vdev_op_leaf &&
605 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
606 (void) nvlist_lookup_uint64(nv,
607 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
608 } else {
609 ASSERT0(vd->vdev_leaf_zap);
610 }
611
612 /*
613 * If we're a leaf vdev, try to load the DTL object and other state.
614 */
614 */
615 
616 if (vd->vdev_ops->vdev_op_leaf &&
617 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
618 alloctype == VDEV_ALLOC_ROOTPOOL)) {
619 if (alloctype == VDEV_ALLOC_LOAD) {
620 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
621 &vd->vdev_dtl_object);
663 }
664 }
665 }
666
667 /*
668 * Add ourselves to the parent's list of children.
669 */
670 vdev_add_child(parent, vd);
671
672 *vdp = vd;
673
674 return (0);
675 }
676
677 void
678 vdev_free(vdev_t *vd)
679 {
680 spa_t *spa = vd->vdev_spa;
681
682 /*
683 * Scan queues are normally destroyed at the end of a scan. If the
684 * queue exists here, that implies the vdev is being removed while
685 * the scan is still running.
686 */
687 if (vd->vdev_scan_io_queue != NULL) {
688 dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
689 vd->vdev_scan_io_queue = NULL;
690 }
691
692 /*
693 * vdev_free() implies closing the vdev first. This is simpler than
694 * trying to ensure complicated semantics for all callers.
695 */
696 vdev_close(vd);
697
698 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
699 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
700
701 /*
702 * Free all children.
703 */
704 for (int c = 0; c < vd->vdev_children; c++)
705 vdev_free(vd->vdev_child[c]);
706
707 ASSERT(vd->vdev_child == NULL);
708 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
709
710 /*
711 * Discard allocation state.
712 */
740 spa_strfree(vd->vdev_physpath);
741 if (vd->vdev_fru)
742 spa_strfree(vd->vdev_fru);
743
744 if (vd->vdev_isspare)
745 spa_spare_remove(vd);
746 if (vd->vdev_isl2cache)
747 spa_l2cache_remove(vd);
748
749 txg_list_destroy(&vd->vdev_ms_list);
750 txg_list_destroy(&vd->vdev_dtl_list);
751
752 mutex_enter(&vd->vdev_dtl_lock);
753 space_map_close(vd->vdev_dtl_sm);
754 for (int t = 0; t < DTL_TYPES; t++) {
755 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
756 range_tree_destroy(vd->vdev_dtl[t]);
757 }
758 mutex_exit(&vd->vdev_dtl_lock);
759
760 if (vd->vdev_iokstat) {
761 kstat_delete(vd->vdev_iokstat);
762 vd->vdev_iokstat = NULL;
763 }
764 mutex_destroy(&vd->vdev_dtl_lock);
765 mutex_destroy(&vd->vdev_stat_lock);
766 mutex_destroy(&vd->vdev_probe_lock);
767 mutex_destroy(&vd->vdev_scan_io_queue_lock);
768 rw_destroy(&vd->vdev_tsd_lock);
769
770 if (vd == spa->spa_root_vdev)
771 spa->spa_root_vdev = NULL;
772
773 ASSERT3P(vd->vdev_scan_io_queue, ==, NULL);
774
775 kmem_free(vd, sizeof (vdev_t));
776 }
777
778 /*
779 * Transfer top-level vdev state from svd to tvd.
780 */
781 static void
782 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
783 {
784 spa_t *spa = svd->vdev_spa;
785 metaslab_t *msp;
786 vdev_t *vd;
787 int t;
788
789 ASSERT(tvd == tvd->vdev_top);
790
791 tvd->vdev_ms_array = svd->vdev_ms_array;
792 tvd->vdev_ms_shift = svd->vdev_ms_shift;
793 tvd->vdev_ms_count = svd->vdev_ms_count;
794 tvd->vdev_top_zap = svd->vdev_top_zap;
824 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
825 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
826 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
827 }
828
829 if (list_link_active(&svd->vdev_config_dirty_node)) {
830 vdev_config_clean(svd);
831 vdev_config_dirty(tvd);
832 }
833
834 if (list_link_active(&svd->vdev_state_dirty_node)) {
835 vdev_state_clean(svd);
836 vdev_state_dirty(tvd);
837 }
838
839 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
840 svd->vdev_deflate_ratio = 0;
841
842 tvd->vdev_islog = svd->vdev_islog;
843 svd->vdev_islog = 0;
844
845 tvd->vdev_isspecial = svd->vdev_isspecial;
846 svd->vdev_isspecial = 0;
847 svd->vdev_isspecial_child = tvd->vdev_isspecial;
848
849 dsl_scan_io_queue_vdev_xfer(svd, tvd);
850 }
851
852 static void
853 vdev_top_update(vdev_t *tvd, vdev_t *vd)
854 {
855 if (vd == NULL)
856 return;
857
858 vd->vdev_top = tvd;
859
860 for (int c = 0; c < vd->vdev_children; c++)
861 vdev_top_update(tvd, vd->vdev_child[c]);
862 }
863
864 /*
865 * Add a mirror/replacing vdev above an existing vdev.
866 */
867 vdev_t *
868 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
869 {
870 spa_t *spa = cvd->vdev_spa;
871 vdev_t *pvd = cvd->vdev_parent;
872 vdev_t *mvd;
873
874 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
875
876 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
877
878 mvd->vdev_asize = cvd->vdev_asize;
879 mvd->vdev_min_asize = cvd->vdev_min_asize;
880 mvd->vdev_max_asize = cvd->vdev_max_asize;
881 mvd->vdev_ashift = cvd->vdev_ashift;
882 mvd->vdev_state = cvd->vdev_state;
883 mvd->vdev_crtxg = cvd->vdev_crtxg;
884
885 vdev_remove_child(pvd, cvd);
886 vdev_add_child(pvd, mvd);
887 cvd->vdev_id = mvd->vdev_children;
888 vdev_add_child(mvd, cvd);
889 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
890
891 if (mvd == mvd->vdev_top)
892 vdev_top_transfer(cvd, mvd);
893
894 return (mvd);
895 }
896
897 /*
898 * Remove a 1-way mirror/replacing vdev from the tree.
899 */
900 void
941 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
942 {
943 spa_t *spa = vd->vdev_spa;
944 objset_t *mos = spa->spa_meta_objset;
945 uint64_t m;
946 uint64_t oldc = vd->vdev_ms_count;
947 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
948 metaslab_t **mspp;
949 int error;
950
951 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
952
953 /*
954 * This vdev is not being allocated from yet or is a hole.
955 */
956 if (vd->vdev_ms_shift == 0)
957 return (0);
958
959 ASSERT(!vd->vdev_ishole);
960
961 /*
962 * Compute the raidz-deflation ratio. Note that we hard-code
963 * 128k (1 << 17) because it is the "typical" blocksize.
964 * Even though SPA_MAXBLOCKSIZE changed, this algorithm must not change;
965 * otherwise it would inconsistently account for existing bp's.
966 */
967 vd->vdev_deflate_ratio = (1 << 17) /
968 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
969
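/*
 * Worked example (illustrative): a plain disk or mirror maps 128K of
 * psize to 128K of asize, so the ratio is 131072 / 256 == 512. A
 * hypothetical 5-wide raidz1 with 512-byte sectors maps 128K of psize
 * to roughly 160K of asize (256 data sectors plus 64 parity sectors),
 * so the ratio is 131072 / 320 == 409; deflated space accounting
 * later scales raw asize by about 409/512 to report only the data
 * portion.
 */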
970 ASSERT(oldc <= newc);
971
972 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
973
974 if (oldc != 0) {
975 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
976 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
977 }
978
979 vd->vdev_ms = mspp;
980 vd->vdev_ms_count = newc;
981
982 for (m = oldc; m < newc; m++) {
983 uint64_t object = 0;
984
985 if (txg == 0) {
986 error = dmu_read(mos, vd->vdev_ms_array,
987 m * sizeof (uint64_t), sizeof (uint64_t), &object,
988 DMU_READ_PREFETCH);
989 if (error)
990 return (error);
991 }
992
993 error = metaslab_init(vd->vdev_mg, m, object, txg,
994 &(vd->vdev_ms[m]));
995 if (error)
996 return (error);
997 }
998
999 if (txg == 0)
1000 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
1001
1002 /*
1003 * If the vdev is being removed we don't activate
1004 * the metaslabs since we want to ensure that no new
1005 * allocations are performed on this device.
1006 */
1007 if (oldc == 0 && !vd->vdev_removing)
1008 metaslab_group_activate(vd->vdev_mg);
1009
1010 if (txg == 0)
1011 spa_config_exit(spa, SCL_ALLOC, FTAG);
1012
1013 return (0);
1014 }
1015
1016 void
1017 vdev_metaslab_fini(vdev_t *vd)
1018 {
1019 uint64_t m;
1020 uint64_t count = vd->vdev_ms_count;
1021
1022 if (vd->vdev_ms != NULL) {
1023 metaslab_group_passivate(vd->vdev_mg);
1024 for (m = 0; m < count; m++) {
1025 metaslab_t *msp = vd->vdev_ms[m];
1026
1027 if (msp != NULL)
1028 metaslab_fini(msp);
1029 }
1030 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1031 vd->vdev_ms = NULL;
1032 }
1033 }
1034
1035 typedef struct vdev_probe_stats {
1036 boolean_t vps_readable;
1037 boolean_t vps_writeable;
1038 int vps_flags;
1039 } vdev_probe_stats_t;
1040
1041 static void
1042 vdev_probe_done(zio_t *zio)
1043 {
1044 spa_t *spa = zio->io_spa;
1045 vdev_t *vd = zio->io_vd;
1046 vdev_probe_stats_t *vps = zio->io_private;
1047
1048 ASSERT(vd->vdev_probe_zio != NULL);
1049
1050 if (zio->io_type == ZIO_TYPE_READ) {
1051 if (zio->io_error == 0)
1052 vps->vps_readable = 1;
1056 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1057 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1058 } else {
1059 abd_free(zio->io_abd);
1060 }
1061 } else if (zio->io_type == ZIO_TYPE_WRITE) {
1062 if (zio->io_error == 0)
1063 vps->vps_writeable = 1;
1064 abd_free(zio->io_abd);
1065 } else if (zio->io_type == ZIO_TYPE_NULL) {
1066 zio_t *pio;
1067
1068 vd->vdev_cant_read |= !vps->vps_readable;
1069 vd->vdev_cant_write |= !vps->vps_writeable;
1070
1071 if (vdev_readable(vd) &&
1072 (vdev_writeable(vd) || !spa_writeable(spa))) {
1073 zio->io_error = 0;
1074 } else {
1075 ASSERT(zio->io_error != 0);
1076 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1077 spa, vd, NULL, 0, 0);
1078 zio->io_error = SET_ERROR(ENXIO);
1079 }
1080
1081 mutex_enter(&vd->vdev_probe_lock);
1082 ASSERT(vd->vdev_probe_zio == zio);
1083 vd->vdev_probe_zio = NULL;
1084 mutex_exit(&vd->vdev_probe_lock);
1085
1086 zio_link_t *zl = NULL;
1087 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1088 if (!vdev_accessible(vd, pio))
1089 pio->io_error = SET_ERROR(ENXIO);
1090
1091 kmem_free(vps, sizeof (*vps));
1092 }
1093 }
1094
1095 /*
1223 * in a single thread so that the same thread holds the
1224 * spa_namespace_lock
1225 */
1226 if (vdev_uses_zvols(vd)) {
1227 for (int c = 0; c < children; c++)
1228 vd->vdev_child[c]->vdev_open_error =
1229 vdev_open(vd->vdev_child[c]);
1230 return;
1231 }
1232 tq = taskq_create("vdev_open", children, minclsyspri,
1233 children, children, TASKQ_PREPOPULATE);
1234
1235 for (int c = 0; c < children; c++)
1236 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
1237 TQ_SLEEP) != NULL);
1238
1239 taskq_destroy(tq);
1240 }
1241
1242 /*
1243 * Prepare a virtual device for access.
1244 */
1245 int
1246 vdev_open(vdev_t *vd)
1247 {
1248 spa_t *spa = vd->vdev_spa;
1249 int error;
1250 uint64_t osize = 0;
1251 uint64_t max_osize = 0;
1252 uint64_t asize, max_asize, psize;
1253 uint64_t ashift = 0;
1254
1255 ASSERT(vd->vdev_open_thread == curthread ||
1256 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1257 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1258 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1259 vd->vdev_state == VDEV_STATE_OFFLINE);
1260
1261 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1262 vd->vdev_cant_read = B_FALSE;
1263 vd->vdev_cant_write = B_FALSE;
1264 vd->vdev_min_asize = vdev_get_min_asize(vd);
1265
1266 /*
1267 * If the vdev isn't removed and is faulted for reasons other than a
1268 * failed open, or if it's offline, bail out.
1269 */
1270 if (!vd->vdev_removed && vd->vdev_faulted &&
1271 vd->vdev_label_aux != VDEV_AUX_OPEN_FAILED) {
1272 ASSERT(vd->vdev_children == 0);
1273 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1274 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1275 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1276 vd->vdev_label_aux);
1277 return (SET_ERROR(ENXIO));
1278 } else if (vd->vdev_offline) {
1279 ASSERT(vd->vdev_children == 0);
1280 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1281 return (SET_ERROR(ENXIO));
1282 }
1283
1284 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
1285
1286 /*
1287 * Reset the vdev_reopening flag so that we actually close
1288 * the vdev on error.
1289 */
1290 vd->vdev_reopening = B_FALSE;
1291 if (zio_injection_enabled && error == 0)
1292 error = zio_handle_device_injection(vd, NULL, ENXIO);
1293
1294 if (error) {
1295 if (vd->vdev_removed &&
1296 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1297 vd->vdev_removed = B_FALSE;
1298
1299 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1300 vd->vdev_stat.vs_aux);
1301 return (error);
1302 }
1303
1304 vd->vdev_removed = B_FALSE;
1305
1306 /*
1307 * Recheck the faulted flag now that we have confirmed that
1308 * the vdev is accessible. If we're faulted, bail.
1309 */
1310 if (vd->vdev_faulted) {
1311 ASSERT(vd->vdev_children == 0);
1312 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
1313 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
1314 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1315 vd->vdev_label_aux);
1316 return (SET_ERROR(ENXIO));
1317 }
1318
1319 if (vd->vdev_degraded) {
1320 ASSERT(vd->vdev_children == 0);
1440 spa->spa_min_ashift = vd->vdev_ashift;
1441 }
1442
1443 /*
1444 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1445 * resilver. But don't do this if we are doing a reopen for a scrub,
1446 * since this would just restart the scrub we are already doing.
1447 */
1448 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1449 vdev_resilver_needed(vd, NULL, NULL))
1450 spa_async_request(spa, SPA_ASYNC_RESILVER);
1451
1452 return (0);
1453 }
1454
1455 /*
1456 * Called once the vdevs are all opened, this routine validates the label
1457 * contents. This needs to be done before vdev_load() so that we don't
1458 * inadvertently do repair I/Os to the wrong device.
1459 *
1460 * If 'strict' is false ignore the spa guid check. This is necessary because
1461 * if the machine crashed during a re-guid the new guid might have been written
1462 * to all of the vdev labels, but not the cached config. The strict check
1463 * will be performed when the pool is opened again using the mos config.
1464 *
1465 * This function will only return failure if one of the vdevs indicates that it
1466 * has since been destroyed or exported. This is only possible if
1467 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
1468 * will be updated but the function will return 0.
1469 */
1470 int
1471 vdev_validate(vdev_t *vd, boolean_t strict)
1472 {
1473 spa_t *spa = vd->vdev_spa;
1474 nvlist_t *label;
1475 uint64_t guid = 0, top_guid;
1476 uint64_t state;
1477
1478 for (int c = 0; c < vd->vdev_children; c++)
1479 if (vdev_validate(vd->vdev_child[c], strict) != 0)
1480 return (SET_ERROR(EBADF));
1481
1482 /*
1483 * If the device has already failed, or was marked offline, don't do
1484 * any further validation. Otherwise, label I/O will fail and we will
1485 * overwrite the previous state.
1486 */
1487 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1488 uint64_t aux_guid = 0;
1489 nvlist_t *nvl;
1490 uint64_t txg = spa_last_synced_txg(spa) != 0 ?
1491 spa_last_synced_txg(spa) : -1ULL;
1492
1493 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
1494 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1495 VDEV_AUX_BAD_LABEL);
1496 return (0);
1497 }
1498
1499 /*
1500 * Determine if this vdev has been split off into another
1501 * pool. If so, then refuse to open it.
1502 */
1503 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
1504 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
1505 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1506 VDEV_AUX_SPLIT_POOL);
1507 nvlist_free(label);
1508 return (0);
1509 }
1510
1511 if (strict && (nvlist_lookup_uint64(label,
1512 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
1513 guid != spa_guid(spa))) {
1514 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1515 VDEV_AUX_CORRUPT_DATA);
1516 nvlist_free(label);
1517 return (0);
1518 }
1519
1520 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
1521 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
1522 &aux_guid) != 0)
1523 aux_guid = 0;
1524
1525 /*
1526 * If this vdev just became a top-level vdev because its
1527 * sibling was detached, it will have adopted the parent's
1528 * vdev guid -- but the label may or may not be on disk yet.
1529 * Fortunately, either version of the label will have the
1530 * same top guid, so if we're a top-level vdev, we can
1531 * safely compare to that instead.
1532 *
1533 * If we split this vdev off instead, then we also check the
1534 * original pool's guid. We don't want to consider the vdev
1535 * corrupt if it is partway through a split operation.
1536 */
1537 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1538 &guid) != 0 ||
1539 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1540 &top_guid) != 0 ||
1541 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
1542 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1543 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1544 VDEV_AUX_CORRUPT_DATA);
1545 nvlist_free(label);
1546 return (0);
1547 }
1548
1549 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1550 &state) != 0) {
1551 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1552 VDEV_AUX_CORRUPT_DATA);
1553 nvlist_free(label);
1554 return (0);
1555 }
1556
1557 nvlist_free(label);
1558
1559 /*
1560 * If this is a verbatim import, no need to check the
1561 * state of the pool.
1562 */
1563 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
1564 spa_load_state(spa) == SPA_LOAD_OPEN &&
1565 state != POOL_STATE_ACTIVE)
1566 return (SET_ERROR(EBADF));
1567
1568 /*
1569 * If we were able to open and validate a vdev that was
1570 * previously marked permanently unavailable, clear that state
1571 * now.
1572 */
1573 if (vd->vdev_not_present)
1574 vd->vdev_not_present = 0;
1575 }
1576
1577 return (0);
1578 }
1579
1580 /*
1581 * Close a virtual device.
1582 */
1583 void
1584 vdev_close(vdev_t *vd)
1585 {
1586 spa_t *spa = vd->vdev_spa;
1587 vdev_t *pvd = vd->vdev_parent;
1588
1589 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1590
1591 /*
1592 * If our parent is reopening, then we are as well, unless we are
1593 * going offline.
1594 */
1595 if (pvd != NULL && pvd->vdev_reopening)
1596 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
1597
1598 vd->vdev_ops->vdev_op_close(vd);
1599
1600 vdev_cache_purge(vd);
1652 vdev_reopen(vdev_t *vd)
1653 {
1654 spa_t *spa = vd->vdev_spa;
1655
1656 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1657
1658 /* set the reopening flag unless we're taking the vdev offline */
1659 vd->vdev_reopening = !vd->vdev_offline;
1660 vdev_close(vd);
1661 (void) vdev_open(vd);
1662
1663 /*
1664 * Call vdev_validate() here to make sure we have the same device.
1665 * Otherwise, a device with an invalid label could be successfully
1666 * opened in response to vdev_reopen().
1667 */
1668 if (vd->vdev_aux) {
1669 (void) vdev_validate_aux(vd);
1670 if (vdev_readable(vd) && vdev_writeable(vd) &&
1671 vd->vdev_aux == &spa->spa_l2cache &&
1672 !l2arc_vdev_present(vd)) {
1673 /*
1674 * When reopening we can assume persistent L2ARC is
1675 * supported, since we've already opened the device
1676 * in the past and prepended an L2ARC uberblock.
1677 */
1678 l2arc_add_vdev(spa, vd, B_TRUE);
1679 }
1680 } else {
1681 (void) vdev_validate(vd, B_TRUE);
1682 }
1683
1684 /*
1685 * Reassess parent vdev's health.
1686 */
1687 vdev_propagate_state(vd);
1688 }
1689
1690 int
1691 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1692 {
1693 int error;
1694
1695 /*
1696 * Normally, partial opens (e.g. of a mirror) are allowed.
1697 * For a create, however, we want to fail the request if
1698 * there are any components we can't open.
1699 */
1700 error = vdev_open(vd);
1701
1714 return (error);
1715 }
1716
1717 return (0);
1718 }
1719
1720 void
1721 vdev_metaslab_set_size(vdev_t *vd)
1722 {
1723 /*
1724 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
1725 */
1726 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
1727 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1728 }
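/*
 * A self-contained sketch of the sizing rule above, with highbit64()
 * open-coded and the lower bound written as a local constant (24 is
 * assumed here for SPA_MAXBLOCKSHIFT, i.e. a 16M floor). For a 2^40
 * byte (1 TB) vdev this gives a shift of 33, i.e. 8 GB metaslabs and
 * about 128 of them, comfortably under the 200 target.
 */
static uint64_t
example_ms_shift(uint64_t asize, uint64_t target_count)
{
	const uint64_t floor_shift = 24;	/* assumed SPA_MAXBLOCKSHIFT */
	uint64_t chunk = asize / target_count;
	uint64_t shift = 0;

	/* highbit64(x): 1-based index of the highest set bit, 0 for x == 0. */
	while (chunk != 0) {
		chunk >>= 1;
		shift++;
	}

	return (shift > floor_shift ? shift : floor_shift);
}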
1729
1730 void
1731 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1732 {
1733 ASSERT(vd == vd->vdev_top);
1734 ASSERT(!vd->vdev_ishole);
1735 ASSERT(ISP2(flags));
1736 ASSERT(spa_writeable(vd->vdev_spa));
1737
1738 if (flags & VDD_METASLAB)
1739 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1740
1741 if (flags & VDD_DTL)
1742 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1743
1744 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1745 }
1746
1747 void
1748 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
1749 {
1750 for (int c = 0; c < vd->vdev_children; c++)
1751 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
1752
1753 if (vd->vdev_ops->vdev_op_leaf)
1754 vdev_dirty(vd->vdev_top, flags, vd, txg);
1784 * comprising only those txgs which appear in more than 'maxfaults' children;
1785 * those are the txgs we don't have enough replication to read. For example,
1786 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1787 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1788 * two child DTL_MISSING maps.
1789 *
1790 * It should be clear from the above that to compute the DTLs and outage maps
1791 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1792 * Therefore, that is all we keep on disk. When loading the pool, or after
1793 * a configuration change, we generate all other DTLs from first principles.
1794 */
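/*
 * A self-contained sketch of the rule above, for one txg at a time:
 * given a per-child "missing" flag, the txg belongs in the parent's
 * DTL_MISSING only when more than 'maxfaults' children are missing it.
 * The real code applies the same rule to whole txg ranges at once via
 * a space reftree whose minimum reference count is nparity + 1 for
 * RAID-Z (and the full child count for mirrors). The names below are
 * illustrative only.
 */
static boolean_t
example_txg_missing_in_parent(const boolean_t *child_missing,
    int children, int maxfaults)
{
	int missing = 0;

	for (int c = 0; c < children; c++) {
		if (child_missing[c])
			missing++;
	}

	return (missing > maxfaults);
}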
1795 void
1796 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1797 {
1798 range_tree_t *rt = vd->vdev_dtl[t];
1799
1800 ASSERT(t < DTL_TYPES);
1801 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1802 ASSERT(spa_writeable(vd->vdev_spa));
1803
1804 mutex_enter(rt->rt_lock);
1805 if (!range_tree_contains(rt, txg, size))
1806 range_tree_add(rt, txg, size);
1807 mutex_exit(rt->rt_lock);
1808 }
1809
1810 boolean_t
1811 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1812 {
1813 range_tree_t *rt = vd->vdev_dtl[t];
1814 boolean_t dirty = B_FALSE;
1815
1816 ASSERT(t < DTL_TYPES);
1817 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1818
1819 mutex_enter(rt->rt_lock);
1820 if (range_tree_space(rt) != 0)
1821 dirty = range_tree_contains(rt, txg, size);
1822 mutex_exit(rt->rt_lock);
1823
1824 return (dirty);
1825 }
1826
1827 boolean_t
1828 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1829 {
1830 range_tree_t *rt = vd->vdev_dtl[t];
1831 boolean_t empty;
1832
1833 mutex_enter(rt->rt_lock);
1834 empty = (range_tree_space(rt) == 0);
1835 mutex_exit(rt->rt_lock);
1836
1837 return (empty);
1838 }
1839
1840 /*
1841 * Returns the lowest txg in the DTL range.
1842 */
1843 static uint64_t
1844 vdev_dtl_min(vdev_t *vd)
1845 {
1846 range_seg_t *rs;
1847
1848 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
1849 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
1850 ASSERT0(vd->vdev_children);
1851
1852 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
1853 return (rs->rs_start - 1);
1854 }
1855
1908 }
1909 return (B_FALSE);
1910 }
1911
1912 /*
1913 * Reassess DTLs after a config change or scrub completion.
1914 */
1915 void
1916 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1917 {
1918 spa_t *spa = vd->vdev_spa;
1919 avl_tree_t reftree;
1920 int minref;
1921
1922 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1923
1924 for (int c = 0; c < vd->vdev_children; c++)
1925 vdev_dtl_reassess(vd->vdev_child[c], txg,
1926 scrub_txg, scrub_done);
1927
1928 if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
1929 return;
1930
1931 if (vd->vdev_ops->vdev_op_leaf) {
1932 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
1933
1934 mutex_enter(&vd->vdev_dtl_lock);
1935
1936 /*
1937 * If we've completed a scan cleanly then determine
1938 * if this vdev should remove any DTLs. We only want to
1939 * excise regions on vdevs that were available during
1940 * the entire duration of this scan.
1941 */
1942 if (scrub_txg != 0 &&
1943 (spa->spa_scrub_started ||
1944 (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
1945 vdev_dtl_should_excise(vd)) {
1946 /*
1947 * We completed a scrub up to scrub_txg. If we
1948 * did it without rebooting, then the scrub dtl
2014 for (int c = 0; c < vd->vdev_children; c++) {
2015 vdev_t *cvd = vd->vdev_child[c];
2016 mutex_enter(&cvd->vdev_dtl_lock);
2017 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
2018 mutex_exit(&cvd->vdev_dtl_lock);
2019 }
2020 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
2021 space_reftree_destroy(&reftree);
2022 }
2023 mutex_exit(&vd->vdev_dtl_lock);
2024 }
2025
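/*
 * Load a leaf vdev's on-disk DTL (the DTL_MISSING space map) into the
 * in-core range tree; for interior vdevs, recurse into the children.
 */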
2026 int
2027 vdev_dtl_load(vdev_t *vd)
2028 {
2029 spa_t *spa = vd->vdev_spa;
2030 objset_t *mos = spa->spa_meta_objset;
2031 int error = 0;
2032
2033 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2034 ASSERT(!vd->vdev_ishole);
2035
2036 error = space_map_open(&vd->vdev_dtl_sm, mos,
2037 vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
2038 if (error)
2039 return (error);
2040 ASSERT(vd->vdev_dtl_sm != NULL);
2041
2042 mutex_enter(&vd->vdev_dtl_lock);
2043
2044 /*
2045 * Now that we've opened the space_map we need to update
2046 * the in-core DTL.
2047 */
2048 space_map_update(vd->vdev_dtl_sm);
2049
2050 error = space_map_load(vd->vdev_dtl_sm,
2051 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2052 mutex_exit(&vd->vdev_dtl_lock);
2053
2054 return (error);
2055 }
2056
2057 for (int c = 0; c < vd->vdev_children; c++) {
2096 !vd->vdev_top->vdev_removing) {
2097 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
2098 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
2099 }
2100 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
2101 vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
2102 }
2103 }
2104 for (uint64_t i = 0; i < vd->vdev_children; i++) {
2105 vdev_construct_zaps(vd->vdev_child[i], tx);
2106 }
2107 }
2108
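/*
 * Write a leaf vdev's in-core DTL_MISSING out to its space map in the
 * MOS, allocating the space map on first use and freeing it when the
 * vdev has been detached or its top-level vdev is being removed.
 */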
2109 void
2110 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
2111 {
2112 spa_t *spa = vd->vdev_spa;
2113 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
2114 objset_t *mos = spa->spa_meta_objset;
2115 range_tree_t *rtsync;
2116 kmutex_t rtlock;
2117 dmu_tx_t *tx;
2118 uint64_t object = space_map_object(vd->vdev_dtl_sm);
2119
2120 ASSERT(!vd->vdev_ishole);
2121 ASSERT(vd->vdev_ops->vdev_op_leaf);
2122
2123 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2124
2125 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
2126 mutex_enter(&vd->vdev_dtl_lock);
2127 space_map_free(vd->vdev_dtl_sm, tx);
2128 space_map_close(vd->vdev_dtl_sm);
2129 vd->vdev_dtl_sm = NULL;
2130 mutex_exit(&vd->vdev_dtl_lock);
2131
2132 /*
2133 * We only destroy the leaf ZAP for detached leaves or for
2133 * removed log or special devices. Removed data devices handle leaf ZAP
2135 * cleanup later, once cancellation is no longer possible.
2136 */
2137 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
2138 vd->vdev_top->vdev_islog || vd->vdev_top->vdev_isspecial)) {
2139 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
2140 vd->vdev_leaf_zap = 0;
2141 }
2142
2143 dmu_tx_commit(tx);
2144 return;
2145 }
2146
2147 if (vd->vdev_dtl_sm == NULL) {
2148 uint64_t new_object;
2149
2150 new_object = space_map_alloc(mos, tx);
2151 VERIFY3U(new_object, !=, 0);
2152
2153 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
2154 0, -1ULL, 0, &vd->vdev_dtl_lock));
2155 ASSERT(vd->vdev_dtl_sm != NULL);
2156 }
2157
2158 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
2159
2160 rtsync = range_tree_create(NULL, NULL, &rtlock);
2161
2162 mutex_enter(&rtlock);
2163
2164 mutex_enter(&vd->vdev_dtl_lock);
2165 range_tree_walk(rt, range_tree_add, rtsync);
2166 mutex_exit(&vd->vdev_dtl_lock);
2167
2168 space_map_truncate(vd->vdev_dtl_sm, tx);
2169 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
2170 range_tree_vacate(rtsync, NULL, NULL);
2171
2172 range_tree_destroy(rtsync);
2173
2174 mutex_exit(&rtlock);
2175 mutex_destroy(&rtlock);
2176
2177 /*
2178 * If the object for the space map has changed then dirty
2179 * the top level so that we update the config.
2180 */
2181 if (object != space_map_object(vd->vdev_dtl_sm)) {
2182 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
2183 "new object %llu", txg, spa_name(spa), object,
2184 space_map_object(vd->vdev_dtl_sm));
2185 vdev_config_dirty(vd->vdev_top);
2186 }
2187
2188 dmu_tx_commit(tx);
2189
2190 mutex_enter(&vd->vdev_dtl_lock);
2191 space_map_update(vd->vdev_dtl_sm);
2192 mutex_exit(&vd->vdev_dtl_lock);
2193 }
2194
2195 /*
2196 * Determine whether the specified vdev can be offlined/detached/removed
2197 * without losing data.
2198 */
2199 boolean_t
2200 vdev_dtl_required(vdev_t *vd)
2201 {
2202 spa_t *spa = vd->vdev_spa;
2203 vdev_t *tvd = vd->vdev_top;
2204 uint8_t cant_read = vd->vdev_cant_read;
2249 } else {
2250 for (int c = 0; c < vd->vdev_children; c++) {
2251 vdev_t *cvd = vd->vdev_child[c];
2252 uint64_t cmin, cmax;
2253
2254 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2255 thismin = MIN(thismin, cmin);
2256 thismax = MAX(thismax, cmax);
2257 needed = B_TRUE;
2258 }
2259 }
2260 }
2261
2262 if (needed && minp) {
2263 *minp = thismin;
2264 *maxp = thismax;
2265 }
2266 return (needed);
2267 }
2268
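/*
 * Load the per-vdev on-disk state: metaslabs for top-level vdevs and
 * DTLs for leaves. Any failure marks the vdev VDEV_STATE_CANT_OPEN
 * with VDEV_AUX_CORRUPT_DATA.
 */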
2269 void
2270 vdev_load(vdev_t *vd)
2271 {
2272 /*
2273 * Recursively load all children.
2274 */
2275 for (int c = 0; c < vd->vdev_children; c++)
2276 vdev_load(vd->vdev_child[c]);
2277
2278 /*
2279 * If this is a top-level vdev, initialize its metaslabs.
2280 */
2281 if (vd == vd->vdev_top && !vd->vdev_ishole &&
2282 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
2283 vdev_metaslab_init(vd, 0) != 0))
2284 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2285 VDEV_AUX_CORRUPT_DATA);
2286
2287 /*
2288 * If this is a leaf vdev, load its DTL.
2289 */
2290 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
2291 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2292 VDEV_AUX_CORRUPT_DATA);
2293 }
2294
2295 /*
2296 * The special vdev case is used for hot spares and l2cache devices. Its
2297 * sole purpose is to set the vdev state for the associated vdev. To do this,
2298 * we make sure that we can open the underlying device, then try to read the
2299 * label, and make sure that the label is sane and that it hasn't been
2300 * repurposed to another pool.
2301 */
2302 int
2303 vdev_validate_aux(vdev_t *vd)
2304 {
2305 nvlist_t *label;
2306 uint64_t guid, version;
2307 uint64_t state;
2308
2309 if (!vdev_readable(vd))
2310 return (0);
2311
2312 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2317
2318 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2319 !SPA_VERSION_IS_SUPPORTED(version) ||
2320 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2321 guid != vd->vdev_guid ||
2322 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2323 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2324 VDEV_AUX_CORRUPT_DATA);
2325 nvlist_free(label);
2326 return (-1);
2327 }
2328
2329 /*
2330 * We don't actually check the pool state here. If it's in fact in
2331 * use by another pool, we update this fact on the fly when requested.
2332 */
2333 nvlist_free(label);
2334 return (0);
2335 }
2336
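/*
 * Free the on-disk metadata of a top-level vdev that is being removed:
 * each metaslab's space map, the metaslab array object and, for log or
 * special vdevs, the top-level ZAP.
 */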
2337 void
2338 vdev_remove(vdev_t *vd, uint64_t txg)
2339 {
2340 spa_t *spa = vd->vdev_spa;
2341 objset_t *mos = spa->spa_meta_objset;
2342 dmu_tx_t *tx;
2343
2344 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2345 ASSERT(vd == vd->vdev_top);
2346 ASSERT3U(txg, ==, spa_syncing_txg(spa));
2347
2348 if (vd->vdev_ms != NULL) {
2349 metaslab_group_t *mg = vd->vdev_mg;
2350
2351 metaslab_group_histogram_verify(mg);
2352 metaslab_class_histogram_verify(mg->mg_class);
2353
2354 for (int m = 0; m < vd->vdev_ms_count; m++) {
2355 metaslab_t *msp = vd->vdev_ms[m];
2356
2357 if (msp == NULL || msp->ms_sm == NULL)
2358 continue;
2359
2360 mutex_enter(&msp->ms_lock);
2361 /*
2362 * If the metaslab was not loaded when the vdev
2363 * was removed then the histogram accounting may
2364 * not be accurate. Update the histogram information
2365 * here so that we ensure that the metaslab group
2366 * and metaslab class are up-to-date.
2367 */
2368 metaslab_group_histogram_remove(mg, msp);
2369
2370 VERIFY0(space_map_allocated(msp->ms_sm));
2371 space_map_free(msp->ms_sm, tx);
2372 space_map_close(msp->ms_sm);
2373 msp->ms_sm = NULL;
2374 mutex_exit(&msp->ms_lock);
2375 }
2376
2377 metaslab_group_histogram_verify(mg);
2378 metaslab_class_histogram_verify(mg->mg_class);
2379 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
2380 ASSERT0(mg->mg_histogram[i]);
2381
2382 }
2383
2384 if (vd->vdev_ms_array) {
2385 (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
2386 vd->vdev_ms_array = 0;
2387 }
2388
2389 if ((vd->vdev_islog || vd->vdev_isspecial) &&
2390 vd->vdev_top_zap != 0) {
2391 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
2392 vd->vdev_top_zap = 0;
2393 }
2394 dmu_tx_commit(tx);
2395 }
2396
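/*
 * Called once txg 'txg' has synced: run metaslab_sync_done() on every
 * metaslab dirtied in that txg and, if any were, reassess the vdev's
 * metaslab group.
 */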
2397 void
2398 vdev_sync_done(vdev_t *vd, uint64_t txg)
2399 {
2400 metaslab_t *msp;
2401 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
2402
2403 ASSERT(!vd->vdev_ishole);
2404
2405 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL)
2406 metaslab_sync_done(msp, txg);
2407
2408 if (reassess)
2409 metaslab_sync_reassess(vd->vdev_mg);
2410 }
2411
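/*
 * Sync a top-level vdev for txg 'txg': allocate the metaslab array on
 * first use, free the vdev's metadata once it is empty and marked
 * removing, sync all dirty metaslabs and DTLs, and re-add the vdev to
 * the spa's per-txg list for the TXG_CLEAN(txg) pass.
 */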
2412 void
2413 vdev_sync(vdev_t *vd, uint64_t txg)
2414 {
2415 spa_t *spa = vd->vdev_spa;
2416 vdev_t *lvd;
2417 metaslab_t *msp;
2418 dmu_tx_t *tx;
2419
2420 ASSERT(!vd->vdev_ishole);
2421
2422 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
2423 ASSERT(vd == vd->vdev_top);
2424 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2425 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
2426 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
2427 ASSERT(vd->vdev_ms_array != 0);
2428 vdev_config_dirty(vd);
2429 dmu_tx_commit(tx);
2430 }
2431
2432 /*
2433 * Remove the metadata associated with this vdev once it's empty.
2434 */
2435 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
2436 vdev_remove(vd, txg);
2437
2438 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
2439 metaslab_sync(msp, txg);
2440 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
2441 }
2442
2443 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
2444 vdev_dtl_sync(lvd, txg);
2445
2446 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
2447 }
2448
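/*
 * Convert a logical I/O size to the physical allocation size on this
 * vdev (e.g. including RAID-Z parity and padding), as defined by the
 * vdev's ops vector.
 */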
2449 uint64_t
2450 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
2451 {
2452 return (vd->vdev_ops->vdev_op_asize(vd, psize));
2453 }
2454
2455 /*
2456 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
2457 * not be opened, and no I/O is attempted.
2458 */
2459 int
2460 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
2461 {
2462 vdev_t *vd, *tvd;
2463
2464 spa_vdev_state_enter(spa, SCL_NONE);
2465
2549 */
2550 int
2551 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
2552 {
2553 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
2554 boolean_t wasoffline;
2555 vdev_state_t oldstate;
2556
2557 spa_vdev_state_enter(spa, SCL_NONE);
2558
2559 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
2560 return (spa_vdev_state_exit(spa, NULL, ENODEV));
2561
2562 if (!vd->vdev_ops->vdev_op_leaf)
2563 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
2564
2565 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
2566 oldstate = vd->vdev_state;
2567
2568 tvd = vd->vdev_top;
2569 vd->vdev_offline = 0ULL;
2570 vd->vdev_tmpoffline = 0ULL;
2571 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
2572 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
2573
2574 /* XXX - L2ARC 1.0 does not support expansion */
2575 if (!vd->vdev_aux) {
2576 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2577 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
2578 }
2579
2580 vdev_reopen(tvd);
2581 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
2582
2583 if (!vd->vdev_aux) {
2584 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2585 pvd->vdev_expanding = B_FALSE;
2586 }
2587
2588 if (newstate)
2589 *newstate = vd->vdev_state;
2590 if ((flags & ZFS_ONLINE_UNSPARE) &&
2639 * don't allow it to be offlined. Log devices are always
2640 * expendable.
2641 */
2642 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2643 vdev_dtl_required(vd))
2644 return (spa_vdev_state_exit(spa, NULL, EBUSY));
2645
2646 /*
2647 * If the top-level is a slog and it has had allocations
2648 * then proceed. We check that the vdev's metaslab group
2649 * is not NULL since it's possible that we may have just
2650 * added this vdev but not yet initialized its metaslabs.
2651 */
2652 if (tvd->vdev_islog && mg != NULL) {
2653 /*
2654 * Prevent any future allocations.
2655 */
2656 metaslab_group_passivate(mg);
2657 (void) spa_vdev_state_exit(spa, vd, 0);
2658
2659 error = spa_offline_log(spa);
2660
2661 spa_vdev_state_enter(spa, SCL_ALLOC);
2662
2663 /*
2664 * Check to see if the config has changed.
2665 */
2666 if (error || generation != spa->spa_config_generation) {
2667 metaslab_group_activate(mg);
2668 if (error)
2669 return (spa_vdev_state_exit(spa,
2670 vd, error));
2671 (void) spa_vdev_state_exit(spa, vd, 0);
2672 goto top;
2673 }
2674 ASSERT0(tvd->vdev_stat.vs_alloc);
2675 }
2676
2677 /*
2678 * Offline this device and reopen its top-level vdev.
2679 * If the top-level vdev is a log device then just offline
2706 int
2707 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2708 {
2709 int error;
2710
2711 mutex_enter(&spa->spa_vdev_top_lock);
2712 error = vdev_offline_locked(spa, guid, flags);
2713 mutex_exit(&spa->spa_vdev_top_lock);
2714
2715 return (error);
2716 }
2717
2718 /*
2719 * Clear the error counts associated with this vdev. Unlike vdev_online() and
2720 * vdev_offline(), we assume the spa config is locked. We also clear all
2721 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
2722 */
2723 void
2724 vdev_clear(spa_t *spa, vdev_t *vd)
2725 {
2726 int c;
2727 vdev_t *rvd = spa->spa_root_vdev;
2728
2729 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2730
2731 if (vd == NULL) {
2732 vd = rvd;
2733
2734 /* Go through spare and l2cache vdevs */
2735 for (c = 0; c < spa->spa_spares.sav_count; c++)
2736 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
2737 for (c = 0; c < spa->spa_l2cache.sav_count; c++)
2738 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
2739 }
2740
2741 vd->vdev_stat.vs_read_errors = 0;
2742 vd->vdev_stat.vs_write_errors = 0;
2743 vd->vdev_stat.vs_checksum_errors = 0;
2744
2745 /*
2746 * If all disk vdevs failed at the same time (e.g. due to a
2747 * disconnected cable), that suspends I/O activity to the pool,
2748 * which stalls spa_sync if there happened to be any dirty data.
2749 * As a consequence, this flag might not be cleared, because it
2750 * is only lowered by spa_async_remove (which cannot run). This
2751 * then prevents zio_resume from succeeding even if vdev reopen
2752 * succeeds, leading to an indefinitely suspended pool. So we
2753 * lower the flag here to allow zio_resume to succeed, provided
2754 * reopening of the vdevs succeeds.
2755 */
2756 vd->vdev_remove_wanted = B_FALSE;
2757
2758 for (c = 0; c < vd->vdev_children; c++)
2759 vdev_clear(spa, vd->vdev_child[c]);
2760
2761 /*
2762 * If we're in the FAULTED state or have experienced failed I/O, then
2763 * clear the persistent state and attempt to reopen the device. We
2764 * also mark the vdev config dirty, so that the new faulted state is
2765 * written out to disk.
2766 */
2767 if (vd->vdev_faulted || vd->vdev_degraded ||
2768 !vdev_readable(vd) || !vdev_writeable(vd)) {
2769
2770 /*
2771 * When reopening in response to a clear event, it may be due to
2772 * a fmadm repair request. In this case, if the device is
2773 * still broken, we still want to post the ereport again.
2774 */
2775 vd->vdev_forcefault = B_TRUE;
2776
2777 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2778 vd->vdev_cant_read = B_FALSE;
2779 vd->vdev_cant_write = B_FALSE;
2780
2795 * When clearing a FMA-diagnosed fault, we always want to
2796 * unspare the device, as we assume that the original spare was
2797 * done in response to the FMA fault.
2798 */
2799 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2800 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2801 vd->vdev_parent->vdev_child[0] == vd)
2802 vd->vdev_unspare = B_TRUE;
2803 }
2804
2805 boolean_t
2806 vdev_is_dead(vdev_t *vd)
2807 {
2808 /*
2809 * Holes and missing devices are always considered "dead".
2810 * This simplifies the code since we don't have to check for
2811 * these types of devices in the various code paths.
2812 * Instead we rely on the fact that we skip over dead devices
2813 * before issuing I/O to them.
2814 */
2815 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2816 vd->vdev_ops == &vdev_missing_ops);
2817 }
2818
2819 boolean_t
2820 vdev_readable(vdev_t *vd)
2821 {
2822 return (vd != NULL && !vdev_is_dead(vd) && !vd->vdev_cant_read);
2823 }
2824
2825 boolean_t
2826 vdev_writeable(vdev_t *vd)
2827 {
2828 return (vd != NULL && !vdev_is_dead(vd) && !vd->vdev_cant_write);
2829 }
2830
2831 boolean_t
2832 vdev_allocatable(vdev_t *vd)
2833 {
2834 uint64_t state = vd->vdev_state;
2835
2836 /*
2837 * We currently allow allocations from vdevs which may be in the
2838 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2839 * fails to reopen then we'll catch it later when we're holding
2840 * the proper locks. Note that we have to get the vdev state
2841 * in a local variable because although it changes atomically,
2842 * we're asking two separate questions about it.
2843 */
2844 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2845 !vd->vdev_cant_write && !vd->vdev_ishole &&
2846 vd->vdev_mg->mg_initialized);
2847 }
2848
2849 boolean_t
2850 vdev_accessible(vdev_t *vd, zio_t *zio)
2851 {
2852 ASSERT(zio->io_vd == vd);
2853
2854 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2855 return (B_FALSE);
2856
2857 if (zio->io_type == ZIO_TYPE_READ)
2858 return (!vd->vdev_cant_read);
2859
2860 if (zio->io_type == ZIO_TYPE_WRITE)
2861 return (!vd->vdev_cant_write);
2862
2863 return (B_TRUE);
2864 }
2865
2874 vdev_t *tvd = vd->vdev_top;
2875
2876 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2877
2878 mutex_enter(&vd->vdev_stat_lock);
2879 bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2880 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2881 vs->vs_state = vd->vdev_state;
2882 vs->vs_rsize = vdev_get_min_asize(vd);
2883 if (vd->vdev_ops->vdev_op_leaf)
2884 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2885 /*
2886 * Report expandable space on top-level, non-auxiliary devices only.
2887 * The expandable space is reported in terms of metaslab sized units
2888 * since that determines how much space the pool can expand.
2889 */
2890 if (vd->vdev_aux == NULL && tvd != NULL) {
2891 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
2892 spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
2893 }
2894 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2895 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2896 }
2897
2898 /*
2899 * If we're getting stats on the root vdev, aggregate the I/O counts
2900 * over all top-level vdevs (i.e. the direct children of the root).
2901 */
2902 if (vd == rvd) {
2903 for (int c = 0; c < rvd->vdev_children; c++) {
2904 vdev_t *cvd = rvd->vdev_child[c];
2905 vdev_stat_t *cvs = &cvd->vdev_stat;
2906
2907 for (int t = 0; t < ZIO_TYPES; t++) {
2908 vs->vs_ops[t] += cvs->vs_ops[t];
2909 vs->vs_bytes[t] += cvs->vs_bytes[t];
2910 vs->vs_iotime[t] += cvs->vs_iotime[t];
2911 vs->vs_latency[t] += cvs->vs_latency[t];
2912 }
2913 cvs->vs_scan_removing = cvd->vdev_removing;
2914 }
2915 }
2916 mutex_exit(&vd->vdev_stat_lock);
2917 }
2918
2919 void
2920 vdev_clear_stats(vdev_t *vd)
2921 {
2922 mutex_enter(&vd->vdev_stat_lock);
2923 vd->vdev_stat.vs_space = 0;
2924 vd->vdev_stat.vs_dspace = 0;
2925 vd->vdev_stat.vs_alloc = 0;
2926 mutex_exit(&vd->vdev_stat_lock);
2927 }
2928
2929 void
2930 vdev_scan_stat_init(vdev_t *vd)
2931 {
2984
2985 if (flags & ZIO_FLAG_IO_REPAIR) {
2986 if (flags & ZIO_FLAG_SCAN_THREAD) {
2987 dsl_scan_phys_t *scn_phys =
2988 &spa->spa_dsl_pool->dp_scan->scn_phys;
2989 uint64_t *processed = &scn_phys->scn_processed;
2990
2991 /* XXX cleanup? */
2992 if (vd->vdev_ops->vdev_op_leaf)
2993 atomic_add_64(processed, psize);
2994 vs->vs_scan_processed += psize;
2995 }
2996
2997 if (flags & ZIO_FLAG_SELF_HEAL)
2998 vs->vs_self_healed += psize;
2999 }
3000
3001 vs->vs_ops[type]++;
3002 vs->vs_bytes[type] += psize;
3003
3004 /*
3005 * While measuring each delta in nanoseconds, we should keep
3006 * cumulative iotime in microseconds so it doesn't overflow on
3007 * a busy system.
3008 */
3009 vs->vs_iotime[type] += (zio->io_vd_timestamp) / 1000;
3010
3011 /*
3012 * Latency is an exponential moving average of iotime deltas, with a
3013 * tunable alpha expressed in tenths of a percent (e.g. 50 => 5%).
3014 */
3015 vs->vs_latency[type] += ((int64_t)zio->io_vd_timestamp -
3016 vs->vs_latency[type]) * zfs_vs_latency_alpha / 1000;
3017
3018 mutex_exit(&vd->vdev_stat_lock);
3019 return;
3020 }
3021
3022 if (flags & ZIO_FLAG_SPECULATIVE)
3023 return;
3024
3025 /*
3026 * If this is an I/O error that is going to be retried, then ignore the
3027 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
3028 * hard errors, when in reality they can happen for any number of
3029 * innocuous reasons (bus resets, MPxIO link failure, etc).
3030 */
3031 if (zio->io_error == EIO &&
3032 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
3033 return;
3034
3035 /*
3036 * Intent logs writes won't propagate their error to the root
3037 * I/O so don't mark these types of failures as pool-level
3038 * errors.
3039 */
3040 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
3041 return;
3042
3043 mutex_enter(&vd->vdev_stat_lock);
3044 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
3045 if (zio->io_error == ECKSUM)
3046 vs->vs_checksum_errors++;
3047 else
3048 vs->vs_read_errors++;
3049 }
3050 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
3051 vs->vs_write_errors++;
3052 mutex_exit(&vd->vdev_stat_lock);
3053
3054 if ((vd->vdev_isspecial || vd->vdev_isspecial_child) &&
3055 (vs->vs_checksum_errors != 0 || vs->vs_read_errors != 0 ||
3056 vs->vs_write_errors != 0 || !vdev_readable(vd) ||
3057 !vdev_writeable(vd)) && !spa->spa_special_has_errors) {
3058 /* all new writes will be placed on normal */
3059 cmn_err(CE_WARN, "New writes to special vdev [%s] "
3060 "will be stopped", (vd->vdev_path != NULL) ?
3061 vd->vdev_path : "undefined");
3062 spa->spa_special_has_errors = B_TRUE;
3063 }
3064
3065 if (type == ZIO_TYPE_WRITE && txg != 0 &&
3066 (!(flags & ZIO_FLAG_IO_REPAIR) ||
3067 (flags & ZIO_FLAG_SCAN_THREAD) ||
3068 spa->spa_claiming)) {
3069 /*
3070 * This is either a normal write (not a repair), or it's
3071 * a repair induced by the scrub thread, or it's a repair
3072 * made by zil_claim() during spa_load() in the first txg.
3073 * In the normal case, we commit the DTL change in the same
3074 * txg as the block was born. In the scrub-induced repair
3075 * case, we know that scrubs run in first-pass syncing context,
3076 * so we commit the DTL change in spa_syncing_txg(spa).
3077 * In the zil_claim() case, we commit in spa_first_txg(spa).
3078 *
3079 * We currently do not make DTL entries for failed spontaneous
3080 * self-healing writes triggered by normal (non-scrubbing)
3081 * reads, because we have no transactional context in which to
3082 * do so -- and it's not clear that it'd be desirable anyway.
3083 */
3084 if (vd->vdev_ops->vdev_op_leaf) {
3085 uint64_t commit_txg = txg;
3120
3121 ASSERT(vd == vd->vdev_top);
3122
3123 /*
3124 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
3125 * factor. We must calculate this here and not at the root vdev
3126 * because the root vdev's psize-to-asize is simply the max of its
3127 * children's, thus not accurate enough for us.
3128 */
3129 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
3130 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
3131 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
3132 vd->vdev_deflate_ratio;
3133
3134 mutex_enter(&vd->vdev_stat_lock);
3135 vd->vdev_stat.vs_alloc += alloc_delta;
3136 vd->vdev_stat.vs_space += space_delta;
3137 vd->vdev_stat.vs_dspace += dspace_delta;
3138 mutex_exit(&vd->vdev_stat_lock);
3139
3140 if (mc == spa_normal_class(spa) || mc == spa_special_class(spa)) {
3141 mutex_enter(&rvd->vdev_stat_lock);
3142 rvd->vdev_stat.vs_alloc += alloc_delta;
3143 rvd->vdev_stat.vs_space += space_delta;
3144 rvd->vdev_stat.vs_dspace += dspace_delta;
3145 mutex_exit(&rvd->vdev_stat_lock);
3146 }
3147
3148 if (mc != NULL) {
3149 ASSERT(rvd == vd->vdev_parent);
3150 ASSERT(vd->vdev_ms_count != 0);
3151
3152 metaslab_class_space_update(mc,
3153 alloc_delta, defer_delta, space_delta, dspace_delta);
3154 }
3155 }
3156
3157 /*
3158 * Mark a top-level vdev's config as dirty, placing it on the dirty list
3159 * so that it will be written out next time the vdev configuration is synced.
3160 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
3210 return;
3211 }
3212
3213 /*
3214 * The dirty list is protected by the SCL_CONFIG lock. The caller
3215 * must either hold SCL_CONFIG as writer, or must be the sync thread
3216 * (which holds SCL_CONFIG as reader). There's only one sync thread,
3217 * so this is sufficient to ensure mutual exclusion.
3218 */
3219 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3220 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3221 spa_config_held(spa, SCL_CONFIG, RW_READER)));
3222
3223 if (vd == rvd) {
3224 for (c = 0; c < rvd->vdev_children; c++)
3225 vdev_config_dirty(rvd->vdev_child[c]);
3226 } else {
3227 ASSERT(vd == vd->vdev_top);
3228
3229 if (!list_link_active(&vd->vdev_config_dirty_node) &&
3230 !vd->vdev_ishole)
3231 list_insert_head(&spa->spa_config_dirty_list, vd);
3232 }
3233 }
3234
3235 void
3236 vdev_config_clean(vdev_t *vd)
3237 {
3238 spa_t *spa = vd->vdev_spa;
3239
3240 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
3241 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3242 spa_config_held(spa, SCL_CONFIG, RW_READER)));
3243
3244 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
3245 list_remove(&spa->spa_config_dirty_list, vd);
3246 }
3247
3248 /*
3249 * Mark a top-level vdev's state as dirty, so that the next pass of
3250 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
3251 * the state changes from larger config changes because they require
3252 * much less locking, and are often needed for administrative actions.
3253 */
3254 void
3255 vdev_state_dirty(vdev_t *vd)
3256 {
3257 spa_t *spa = vd->vdev_spa;
3258
3259 ASSERT(spa_writeable(spa));
3260 ASSERT(vd == vd->vdev_top);
3261
3262 /*
3263 * The state list is protected by the SCL_STATE lock. The caller
3264 * must either hold SCL_STATE as writer, or must be the sync thread
3265 * (which holds SCL_STATE as reader). There's only one sync thread,
3266 * so this is sufficient to ensure mutual exclusion.
3267 */
3268 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3269 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3270 spa_config_held(spa, SCL_STATE, RW_READER)));
3271
3272 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
3273 list_insert_head(&spa->spa_state_dirty_list, vd);
3274 }
3275
3276 void
3277 vdev_state_clean(vdev_t *vd)
3278 {
3279 spa_t *spa = vd->vdev_spa;
3280
3281 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
3282 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
3283 spa_config_held(spa, SCL_STATE, RW_READER)));
3284
3285 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
3286 list_remove(&spa->spa_state_dirty_list, vd);
3287 }
3288
3289 /*
3290 * Propagate vdev state up from children to parent.
3291 */
3292 void
3293 vdev_propagate_state(vdev_t *vd)
3294 {
3295 spa_t *spa = vd->vdev_spa;
3296 vdev_t *rvd = spa->spa_root_vdev;
3297 int degraded = 0, faulted = 0;
3298 int corrupted = 0;
3299 vdev_t *child;
3300
3301 if (vd->vdev_children > 0) {
3302 for (int c = 0; c < vd->vdev_children; c++) {
3303 child = vd->vdev_child[c];
3304
3305 /*
3306 * Don't factor holes into the decision.
3307 */
3308 if (child->vdev_ishole)
3309 continue;
3310
3311 if (!vdev_readable(child) ||
3312 (!vdev_writeable(child) && spa_writeable(spa))) {
3313 /*
3314 * Root special: if there is a top-level log
3315 * device, treat the root vdev as if it were
3316 * degraded.
3317 */
3318 if (child->vdev_islog && vd == rvd)
3319 degraded++;
3320 else
3321 faulted++;
3322 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
3323 degraded++;
3324 }
3325
3326 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
3327 corrupted++;
3328 }
3463 case VDEV_AUX_BAD_LABEL:
3464 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
3465 break;
3466 default:
3467 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
3468 }
3469
3470 zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
3471 }
3472
3473 /* Erase any notion of persistent removed state */
3474 vd->vdev_removed = B_FALSE;
3475 } else {
3476 vd->vdev_removed = B_FALSE;
3477 }
3478
3479 if (!isopen && vd->vdev_parent)
3480 vdev_propagate_state(vd->vdev_parent);
3481 }
3482
3483 /*
3484 * Check the vdev configuration to ensure that it's capable of supporting
3485 * a root pool. We do not support partial configuration.
3486 * In addition, only a single top-level vdev is allowed.
3487 */
3488 boolean_t
3489 vdev_is_bootable(vdev_t *vd)
3490 {
3491 if (!vd->vdev_ops->vdev_op_leaf) {
3492 char *vdev_type = vd->vdev_ops->vdev_op_type;
3493
3494 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
3495 vd->vdev_children > 1) {
3496 return (B_FALSE);
3497 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
3498 return (B_FALSE);
3499 }
3500 }
3501
3502 for (int c = 0; c < vd->vdev_children; c++) {
3503 if (!vdev_is_bootable(vd->vdev_child[c]))
3504 return (B_FALSE);
3505 }
3506 return (B_TRUE);
3507 }
3508
3509 /*
3510 * Load the state from the original vdev tree (ovd) which
3511 * we've retrieved from the MOS config object. If the original
3512 * vdev was offline or faulted then we transfer that state to the
3513 * device in the current vdev tree (nvd).
3514 */
3515 void
3516 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
3517 {
3518 spa_t *spa = nvd->vdev_spa;
3519
3520 ASSERT(nvd->vdev_top->vdev_islog);
3521 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3522 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
3523
3524 for (int c = 0; c < nvd->vdev_children; c++)
3525 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
3526
3527 if (nvd->vdev_ops->vdev_op_leaf) {
3528 /*
3529 * Restore the persistent vdev state
3530 */
3531 nvd->vdev_offline = ovd->vdev_offline;
3532 nvd->vdev_faulted = ovd->vdev_faulted;
3533 nvd->vdev_degraded = ovd->vdev_degraded;
3534 nvd->vdev_removed = ovd->vdev_removed;
3535 }
3536 }
3537
3538 /*
3539 * Determine if a log device has valid content. If the vdev was
3540 * removed or faulted in the MOS config then we know that
3541 * the content on the log device has already been written to the pool.
3542 */
3543 boolean_t
3544 vdev_log_state_valid(vdev_t *vd)
3545 {
3546 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
3547 !vd->vdev_removed)
3548 return (B_TRUE);
3549
3550 for (int c = 0; c < vd->vdev_children; c++)
3551 if (vdev_log_state_valid(vd->vdev_child[c]))
3552 return (B_TRUE);
3553
3554 return (B_FALSE);
3555 }
3556
3557 /*
3558 * Expand a vdev if possible.
3559 */
3560 void
3561 vdev_expand(vdev_t *vd, uint64_t txg)
3562 {
3563 ASSERT(vd->vdev_top == vd);
3564 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3565
3566 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
3567 VERIFY(vdev_metaslab_init(vd, txg) == 0);
3568 vdev_config_dirty(vd);
3569 }
3570 }
3571
3572 /*
3573 * Split a vdev.
3574 */
3575 void
3576 vdev_split(vdev_t *vd)
3577 {
3578 vdev_t *cvd, *pvd = vd->vdev_parent;
3579
3580 vdev_remove_child(pvd, vd);
3581 vdev_compact_children(pvd);
3582
3583 cvd = pvd->vdev_child[0];
3584 if (pvd->vdev_children == 1) {
3585 vdev_remove_parent(cvd);
3586 cvd->vdev_splitting = B_TRUE;
3597 vdev_deadman(cvd);
3598 }
3599
3600 if (vd->vdev_ops->vdev_op_leaf) {
3601 vdev_queue_t *vq = &vd->vdev_queue;
3602
3603 mutex_enter(&vq->vq_lock);
3604 if (avl_numnodes(&vq->vq_active_tree) > 0) {
3605 spa_t *spa = vd->vdev_spa;
3606 zio_t *fio;
3607 uint64_t delta;
3608
3609 /*
3610 * Look at the head of all the pending queues,
3611 * if any I/O has been outstanding for longer than
3612 * the spa_deadman_synctime we panic the system.
3613 */
3614 fio = avl_first(&vq->vq_active_tree);
3615 delta = gethrtime() - fio->io_timestamp;
3616 if (delta > spa_deadman_synctime(spa)) {
3617 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
3618 "delta %lluns, last io %lluns",
3619 fio->io_timestamp, delta,
3620 vq->vq_io_complete_ts);
3621 fm_panic("I/O to pool '%s' appears to be "
3622 "hung.", spa_name(spa));
3623 }
3624 }
3625 mutex_exit(&vq->vq_lock);
3626 }
3627 }
3628
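/*
 * Check whether this L2ARC device is dedicated to caching DDT data:
 * true only when the vdev was configured for DDT use (vdev_l2ad_ddt)
 * and the pool-wide policy limits the DDT to the L2ARC
 * (zfs_ddt_limit_type == DDT_LIMIT_TO_L2ARC).
 */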
3629 boolean_t
3630 vdev_type_is_ddt(vdev_t *vd)
3631 {
3632 uint64_t pool;
3633
3634 if (vd->vdev_l2ad_ddt == 1 &&
3635 zfs_ddt_limit_type == DDT_LIMIT_TO_L2ARC) {
3636 ASSERT(spa_l2cache_exists(vd->vdev_guid, &pool));
3637 ASSERT(vd->vdev_isl2cache);
3638 return (B_TRUE);
3639 }
3640 return (B_FALSE);
3641 }
3642
3643 /* count leaf vdev(s) under the given vdev */
3644 uint_t
3645 vdev_count_leaf_vdevs(vdev_t *vd)
3646 {
3647 uint_t cnt = 0;
3648
3649 if (vd->vdev_ops->vdev_op_leaf)
3650 return (1);
3651
3652 /* if this is not a leaf vdev - visit children */
3653 for (int c = 0; c < vd->vdev_children; c++)
3654 cnt += vdev_count_leaf_vdevs(vd->vdev_child[c]);
3655
3656 return (cnt);
3657 }
3658
3659 /*
3660 * Implements the per-vdev portion of manual TRIM. The function passes over
3661 * all metaslabs on this vdev and performs a metaslab_trim_all on them. It's
3662 * also responsible for rate-control if spa_man_trim_rate is non-zero.
3663 */
3664 void
3665 vdev_man_trim(vdev_trim_info_t *vti)
3666 {
3667 clock_t t = ddi_get_lbolt();
3668 spa_t *spa = vti->vti_vdev->vdev_spa;
3669 vdev_t *vd = vti->vti_vdev;
3670
3671 vd->vdev_man_trimming = B_TRUE;
3672 vd->vdev_trim_prog = 0;
3673
3674 spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER);
3675 for (uint64_t i = 0; i < vti->vti_vdev->vdev_ms_count &&
3676 !spa->spa_man_trim_stop; i++) {
3677 uint64_t delta;
3678 metaslab_t *msp = vd->vdev_ms[i];
3679 zio_t *trim_io = metaslab_trim_all(msp, &delta);
3680
3681 atomic_add_64(&vd->vdev_trim_prog, msp->ms_size);
3682 spa_config_exit(spa, SCL_STATE_ALL, FTAG);
3683
3684 (void) zio_wait(trim_io);
3685
3686 /* delay loop to handle fixed-rate trimming */
3687 for (;;) {
3688 uint64_t rate = spa->spa_man_trim_rate;
3689 uint64_t sleep_delay;
3690
3691 if (rate == 0) {
3692 /* No delay, just update 't' and move on. */
3693 t = ddi_get_lbolt();
3694 break;
3695 }
3696
3697 sleep_delay = (delta * hz) / rate;
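			/*
			 * Illustrative only: assuming 'delta' and
			 * spa_man_trim_rate are both expressed in bytes,
			 * trimming an 8 GiB metaslab at 100 MiB/s yields a
			 * delay of roughly 82 seconds, in lbolt ticks.
			 */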
3698 mutex_enter(&spa->spa_man_trim_lock);
3699 (void) cv_timedwait(&spa->spa_man_trim_update_cv,
3700 &spa->spa_man_trim_lock, t + sleep_delay);
3701 mutex_exit(&spa->spa_man_trim_lock);
3702
3703 /* If interrupted, don't try to relock, get out */
3704 if (spa->spa_man_trim_stop)
3705 goto out;
3706
3707 /* Timeout passed, move on to the next metaslab. */
3708 if (ddi_get_lbolt() >= t + sleep_delay) {
3709 t += sleep_delay;
3710 break;
3711 }
3712 }
3713 spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER);
3714 }
3715 spa_config_exit(spa, SCL_STATE_ALL, FTAG);
3716 out:
3717 vd->vdev_man_trimming = B_FALSE;
3718 /*
3719 * Ensure we're marked as "completed" even if we've had to stop
3720 * before processing all metaslabs.
3721 */
3722 vd->vdev_trim_prog = vd->vdev_asize;
3723
3724 ASSERT(vti->vti_done_cb != NULL);
3725 vti->vti_done_cb(vti->vti_done_arg);
3726
3727 kmem_free(vti, sizeof (*vti));
3728 }
3729
3730 /*
3731 * Runs through all metaslabs on the vdev and does their autotrim processing.
3732 */
3733 void
3734 vdev_auto_trim(vdev_trim_info_t *vti)
3735 {
3736 vdev_t *vd = vti->vti_vdev;
3737 spa_t *spa = vd->vdev_spa;
3738 uint64_t txg = vti->vti_txg;
3739
3740 if (vd->vdev_man_trimming)
3741 goto out;
3742
3743 spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_READER);
3744 for (uint64_t i = 0; i < vd->vdev_ms_count; i++)
3745 metaslab_auto_trim(vd->vdev_ms[i], txg);
3746 spa_config_exit(spa, SCL_STATE_ALL, FTAG);
3747 out:
3748 ASSERT(vti->vti_done_cb != NULL);
3749 vti->vti_done_cb(vti->vti_done_arg);
3750
3751 kmem_free(vti, sizeof (*vti));
3752 }