484 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
485 }
486
487 vd->vdev_spa = spa;
488 vd->vdev_id = id;
489 vd->vdev_guid = guid;
490 vd->vdev_guid_sum = guid;
491 vd->vdev_ops = ops;
492 vd->vdev_state = VDEV_STATE_CLOSED;
493 vd->vdev_ishole = (ops == &vdev_hole_ops);
494 vic->vic_prev_indirect_vdev = UINT64_MAX;
495
496 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
497 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
498 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
499
500 list_link_init(&vd->vdev_leaf_node);
501 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
502 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
503 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
504 mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
505 mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
506 mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
507 cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
508 cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
509
510 for (int t = 0; t < DTL_TYPES; t++) {
511 vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
512 }
513 txg_list_create(&vd->vdev_ms_list, spa,
514 offsetof(struct metaslab, ms_txg_node));
515 txg_list_create(&vd->vdev_dtl_list, spa,
516 offsetof(struct vdev, vdev_dtl_node));
517 vd->vdev_stat.vs_timestamp = gethrtime();
518 vdev_queue_init(vd);
519 vdev_cache_init(vd);
520
521 return (vd);
522 }
523
524 /*
872 }
873 mutex_exit(&vd->vdev_dtl_lock);
874
875 EQUIV(vd->vdev_indirect_births != NULL,
876 vd->vdev_indirect_mapping != NULL);
877 if (vd->vdev_indirect_births != NULL) {
878 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
879 vdev_indirect_births_close(vd->vdev_indirect_births);
880 }
881
882 if (vd->vdev_obsolete_sm != NULL) {
883 ASSERT(vd->vdev_removing ||
884 vd->vdev_ops == &vdev_indirect_ops);
885 space_map_close(vd->vdev_obsolete_sm);
886 vd->vdev_obsolete_sm = NULL;
887 }
888 range_tree_destroy(vd->vdev_obsolete_segments);
889 rw_destroy(&vd->vdev_indirect_rwlock);
890 mutex_destroy(&vd->vdev_obsolete_lock);
891
892 mutex_destroy(&vd->vdev_queue_lock);
893 mutex_destroy(&vd->vdev_dtl_lock);
894 mutex_destroy(&vd->vdev_stat_lock);
895 mutex_destroy(&vd->vdev_probe_lock);
896 mutex_destroy(&vd->vdev_initialize_lock);
897 mutex_destroy(&vd->vdev_initialize_io_lock);
898 cv_destroy(&vd->vdev_initialize_io_cv);
899 cv_destroy(&vd->vdev_initialize_cv);
900
901 if (vd == spa->spa_root_vdev)
902 spa->spa_root_vdev = NULL;
903
904 kmem_free(vd, sizeof (vdev_t));
905 }
906
907 /*
908 * Transfer top-level vdev state from svd to tvd.
909 */
910 static void
911 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
912 {
1234
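/*
 * Tear down the metaslab state of this vdev: close the checkpoint
 * space map (if any), passivate the metaslab group, and finalize and
 * free the metaslab array.  May be called more than once for the same
 * vdev (e.g. while destroying a pool), so each step is idempotent.
 */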
1235 void
1236 vdev_metaslab_fini(vdev_t *vd)
1237 {
1238 if (vd->vdev_checkpoint_sm != NULL) {
1239 ASSERT(spa_feature_is_active(vd->vdev_spa,
1240 SPA_FEATURE_POOL_CHECKPOINT));
1241 space_map_close(vd->vdev_checkpoint_sm);
1242 /*
1243 * Even though we close the space map, we need to set its
1244 * pointer to NULL. The reason is that vdev_metaslab_fini()
1245 * may be called multiple times for certain operations
1246 * (e.g. when destroying a pool), so we need to ensure that
1247 * this clause never executes twice. This logic is similar
1248 * to the one used for the vdev_ms clause below.
1249 */
1250 vd->vdev_checkpoint_sm = NULL;
1251 }
1252
1253 if (vd->vdev_ms != NULL) {
1254 uint64_t count = vd->vdev_ms_count;
1255
1256 metaslab_group_passivate(vd->vdev_mg);
1257 for (uint64_t m = 0; m < count; m++) {
1258 metaslab_t *msp = vd->vdev_ms[m];
1259
1260 if (msp != NULL)
1261 metaslab_fini(msp);
1262 }
1263 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1264 vd->vdev_ms = NULL;
1265
1266 vd->vdev_ms_count = 0;
1267 }
1268 ASSERT0(vd->vdev_ms_count);
1269 }
1270
1271 typedef struct vdev_probe_stats {
1272 boolean_t vps_readable;
1273 boolean_t vps_writeable;
1274 int vps_flags;
1275 } vdev_probe_stats_t;
1276
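/*
 * Completion callback for the probe I/Os issued against a vdev.  The
 * vdev_probe_stats_t hanging off zio->io_private records whether the
 * device turned out to be readable and writeable.
 */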
1277 static void
1278 vdev_probe_done(zio_t *zio)
1279 {
1280 spa_t *spa = zio->io_spa;
1281 vdev_t *vd = zio->io_vd;
1282 vdev_probe_stats_t *vps = zio->io_private;
1283
1284 ASSERT(vd->vdev_probe_zio != NULL);
1285
1286 if (zio->io_type == ZIO_TYPE_READ) {
2532 mutex_exit(&vd->vdev_dtl_lock);
2533 }
2534
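/*
 * Load a leaf vdev's DTL (dirty time log) from its space map object
 * into the in-core DTL_MISSING range tree.  For interior vdevs, recurse
 * into the children instead.
 */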
2535 int
2536 vdev_dtl_load(vdev_t *vd)
2537 {
2538 spa_t *spa = vd->vdev_spa;
2539 objset_t *mos = spa->spa_meta_objset;
2540 int error = 0;
2541
2542 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2543 ASSERT(vdev_is_concrete(vd));
2544
2545 error = space_map_open(&vd->vdev_dtl_sm, mos,
2546 vd->vdev_dtl_object, 0, -1ULL, 0);
2547 if (error)
2548 return (error);
2549 ASSERT(vd->vdev_dtl_sm != NULL);
2550
2551 mutex_enter(&vd->vdev_dtl_lock);
2552
2553 /*
2554 * Now that we've opened the space_map we need to update
2555 * the in-core DTL.
2556 */
2557 space_map_update(vd->vdev_dtl_sm);
2558
2559 error = space_map_load(vd->vdev_dtl_sm,
2560 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2561 mutex_exit(&vd->vdev_dtl_lock);
2562
2563 return (error);
2564 }
2565
2566 for (int c = 0; c < vd->vdev_children; c++) {
2567 error = vdev_dtl_load(vd->vdev_child[c]);
2568 if (error != 0)
2569 break;
2570 }
2571
2572 return (error);
2573 }
2574
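/*
 * Persist this vdev's allocation bias in its top-level ZAP
 * (VDEV_TOP_ZAP_ALLOCATION_BIAS), from which vdev_load() rederives
 * vdev_alloc_bias on import.
 */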
2575 static void
2576 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
2577 {
2578 spa_t *spa = vd->vdev_spa;
2698
2699 space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
2700 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
2701 range_tree_vacate(rtsync, NULL, NULL);
2702
2703 range_tree_destroy(rtsync);
2704
2705 /*
2706 * If the object for the space map has changed then dirty
2707 * the top level so that we update the config.
2708 */
2709 if (object != space_map_object(vd->vdev_dtl_sm)) {
2710 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2711 "new object %llu", (u_longlong_t)txg, spa_name(spa),
2712 (u_longlong_t)object,
2713 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2714 vdev_config_dirty(vd->vdev_top);
2715 }
2716
2717 dmu_tx_commit(tx);
2718
2719 mutex_enter(&vd->vdev_dtl_lock);
2720 space_map_update(vd->vdev_dtl_sm);
2721 mutex_exit(&vd->vdev_dtl_lock);
2722 }
2723
2724 /*
2725 * Determine whether the specified vdev can be offlined/detached/removed
2726 * without losing data.
2727 */
2728 boolean_t
2729 vdev_dtl_required(vdev_t *vd)
2730 {
2731 spa_t *spa = vd->vdev_spa;
2732 vdev_t *tvd = vd->vdev_top;
2733 uint8_t cant_read = vd->vdev_cant_read;
2734 boolean_t required;
2735
2736 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2737
2738 if (vd == spa->spa_root_vdev || vd == tvd)
2739 return (B_TRUE);
2740
2741 /*
2844 VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
2845 bias_str) == 0) {
2846 ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
2847 vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
2848 }
2849 }
2850
2851 /*
2852 * If this is a top-level vdev, initialize its metaslabs.
2853 */
2854 if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2855 vdev_metaslab_group_create(vd);
2856
2857 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2858 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2859 VDEV_AUX_CORRUPT_DATA);
2860 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2861 "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2862 (u_longlong_t)vd->vdev_asize);
2863 return (SET_ERROR(ENXIO));
2864 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
2865 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2866 "[error=%d]", error);
2867 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2868 VDEV_AUX_CORRUPT_DATA);
2869 return (error);
2870 }
2871
2872 uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
2873 if (checkpoint_sm_obj != 0) {
2874 objset_t *mos = spa_meta_objset(vd->vdev_spa);
2875 ASSERT(vd->vdev_asize != 0);
2876 ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
2877
2878 if ((error = space_map_open(&vd->vdev_checkpoint_sm,
2879 mos, checkpoint_sm_obj, 0, vd->vdev_asize,
2880 vd->vdev_ashift))) {
2881 vdev_dbgmsg(vd, "vdev_load: space_map_open "
2882 "failed for checkpoint spacemap (obj %llu) "
2883 "[error=%d]",
2884 (u_longlong_t)checkpoint_sm_obj, error);
2885 return (error);
2886 }
2887 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2888 space_map_update(vd->vdev_checkpoint_sm);
2889
2890 /*
2891 * Since the checkpoint_sm contains free entries
2892 * exclusively we can use sm_alloc to indicate the
2893 * cumulative checkpointed space that has been freed.
2894 */
2895 vd->vdev_stat.vs_checkpoint_space =
2896 -vd->vdev_checkpoint_sm->sm_alloc;
2897 vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
2898 vd->vdev_stat.vs_checkpoint_space;
2899 }
2900 }
2901
2902 /*
2903 * If this is a leaf vdev, load its DTL.
2904 */
2905 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2906 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2907 VDEV_AUX_CORRUPT_DATA);
2908 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
2909 "[error=%d]", error);
2910 return (error);
2911 }
2912
2913 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
2914 if (obsolete_sm_object != 0) {
2915 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2916 ASSERT(vd->vdev_asize != 0);
2917 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
2918
2919 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2920 obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2921 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2922 VDEV_AUX_CORRUPT_DATA);
2923 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2924 "obsolete spacemap (obj %llu) [error=%d]",
2925 (u_longlong_t)obsolete_sm_object, error);
2926 return (error);
2927 }
2928 space_map_update(vd->vdev_obsolete_sm);
2929 }
2930
2931 return (0);
2932 }
2933
2934 /*
2935 * The special vdev case is used for hot spares and l2cache devices. Its
2936 * sole purpose is to set the vdev state for the associated vdev. To do this,
2937 * we make sure that we can open the underlying device, then try to read the
2938 * label, and make sure that the label is sane and that it hasn't been
2939 * repurposed to another pool.
2940 */
2941 int
2942 vdev_validate_aux(vdev_t *vd)
2943 {
2944 nvlist_t *label;
2945 uint64_t guid, version;
2946 uint64_t state;
2947
2948 if (!vdev_readable(vd))
2995 if (smobj == 0)
2996 continue;
2997
2998 space_map_free_obj(mos, smobj, tx);
2999 }
3000
3001 kmem_free(smobj_array, array_bytes);
3002 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
3003 vd->vdev_ms_array = 0;
3004 }
3005
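/*
 * Destroy the on-disk metadata of an empty log device that is being
 * removed: verify its metaslab space maps hold no allocated space and
 * close them, then free the vdev's space maps and top-level ZAP in the
 * currently syncing txg.
 */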
3006 static void
3007 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3008 {
3009 spa_t *spa = vd->vdev_spa;
3010
3011 ASSERT(vd->vdev_islog);
3012 ASSERT(vd == vd->vdev_top);
3013 ASSERT3U(txg, ==, spa_syncing_txg(spa));
3014
3015 if (vd->vdev_ms != NULL) {
3016 metaslab_group_t *mg = vd->vdev_mg;
3017
3018 metaslab_group_histogram_verify(mg);
3019 metaslab_class_histogram_verify(mg->mg_class);
3020
3021 for (int m = 0; m < vd->vdev_ms_count; m++) {
3022 metaslab_t *msp = vd->vdev_ms[m];
3023
3024 if (msp == NULL || msp->ms_sm == NULL)
3025 continue;
3026
3027 mutex_enter(&msp->ms_lock);
3028 /*
3029 * If the metaslab was not loaded when the vdev
3030 * was removed then the histogram accounting may
3031 * not be accurate. Update the histogram information
3032 * here to ensure that the metaslab group and
3033 * metaslab class are up-to-date.
3034 */
3035 metaslab_group_histogram_remove(mg, msp);
3036
3037 VERIFY0(space_map_allocated(msp->ms_sm));
3038 space_map_close(msp->ms_sm);
3039 msp->ms_sm = NULL;
3040 mutex_exit(&msp->ms_lock);
3041 }
3042
3043 if (vd->vdev_checkpoint_sm != NULL) {
3044 ASSERT(spa_has_checkpoint(spa));
3045 space_map_close(vd->vdev_checkpoint_sm);
3046 vd->vdev_checkpoint_sm = NULL;
3047 }
3048
3049 metaslab_group_histogram_verify(mg);
3050 metaslab_class_histogram_verify(mg->mg_class);
3051
3052 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
3053 ASSERT0(mg->mg_histogram[i]);
3054 }
3055
3056 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3057
3058 vdev_destroy_spacemaps(vd, tx);
3059 if (vd->vdev_top_zap != 0) {
3060 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3061 vd->vdev_top_zap = 0;
3062 }
3063
3064 dmu_tx_commit(tx);
3065 }
3066
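/*
 * Finish syncing this vdev for the given txg: run metaslab_sync_done()
 * on every metaslab that was dirty in the txg, then reassess the
 * metaslab group if any metaslabs were processed.
 */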
3067 void
3068 vdev_sync_done(vdev_t *vd, uint64_t txg)
3069 {
3070 metaslab_t *msp;
3071 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3072
3073 ASSERT(vdev_is_concrete(vd));
3074
3075 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3076 != NULL)
3077 metaslab_sync_done(msp, txg);
3078
3079 if (reassess)
3080 metaslab_sync_reassess(vd->vdev_mg);
3081 }
3082
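/*
 * Write out this vdev's dirty state for the given txg: obsolete
 * (remapped) segments, the metaslab array object (created on first
 * use), dirty metaslabs, and dirty leaf DTLs.  An empty log device
 * being removed also has its metadata destroyed here.
 */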
3083 void
3084 vdev_sync(vdev_t *vd, uint64_t txg)
3085 {
3086 spa_t *spa = vd->vdev_spa;
3087 vdev_t *lvd;
3088 metaslab_t *msp;
3089 dmu_tx_t *tx;
3090
3091 if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3092 dmu_tx_t *tx;
3093
3094 ASSERT(vd->vdev_removing ||
3095 vd->vdev_ops == &vdev_indirect_ops);
3096
3097 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3098 vdev_indirect_sync_obsolete(vd, tx);
3099 dmu_tx_commit(tx);
3100
3101 /*
3102 * If the vdev is indirect, it can't have dirty
3103 * metaslabs or DTLs.
3104 */
3105 if (vd->vdev_ops == &vdev_indirect_ops) {
3106 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3107 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
3108 return;
3109 }
3110 }
3111
3112 ASSERT(vdev_is_concrete(vd));
3113
3114 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3115 !vd->vdev_removing) {
3116 ASSERT(vd == vd->vdev_top);
3117 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3118 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3119 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3120 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3121 ASSERT(vd->vdev_ms_array != 0);
3122 vdev_config_dirty(vd);
3123 dmu_tx_commit(tx);
3124 }
3125
3126 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3127 metaslab_sync(msp, txg);
3128 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3129 }
3130
3131 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3132 vdev_dtl_sync(lvd, txg);
3133
3134 /*
3135 * If this is an empty log device being removed, destroy the
3136 * metadata associated with it.
3137 */
3138 if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3139 vdev_remove_empty_log(vd, txg);
3140
3141 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
3142 }
3143
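/*
 * Convert a physical size (psize) to an allocatable size (asize) by
 * delegating to this vdev's vdev_op_asize method.
 */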
3144 uint64_t
3145 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3146 {
3147 return (vd->vdev_ops->vdev_op_asize(vd, psize));
3148 }
3149
3150 /*
3151 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
3152 * not be opened, and no I/O is attempted.
3153 */
3154 int
3155 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3156 {
3157 vdev_t *vd, *tvd;
3158
3159 spa_vdev_state_enter(spa, SCL_NONE);
3160
3161 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3351 * If the top-level is a slog and it has had allocations
3352 * then proceed. We check that the vdev's metaslab group
3353 * is not NULL since it's possible that we may have just
3354 * added this vdev but not yet initialized its metaslabs.
3355 */
3356 if (tvd->vdev_islog && mg != NULL) {
3357 /*
3358 * Prevent any future allocations.
3359 */
3360 metaslab_group_passivate(mg);
3361 (void) spa_vdev_state_exit(spa, vd, 0);
3362
3363 error = spa_reset_logs(spa);
3364
3365 /*
3366 * If the log device was successfully reset but has
3367 * checkpointed data, do not offline it.
3368 */
3369 if (error == 0 &&
3370 tvd->vdev_checkpoint_sm != NULL) {
3371 ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
3372 !=, 0);
3373 error = ZFS_ERR_CHECKPOINT_EXISTS;
3374 }
3375
3376 spa_vdev_state_enter(spa, SCL_ALLOC);
3377
3378 /*
3379 * Check to see if the config has changed.
3380 */
3381 if (error || generation != spa->spa_config_generation) {
3382 metaslab_group_activate(mg);
3383 if (error)
3384 return (spa_vdev_state_exit(spa,
3385 vd, error));
3386 (void) spa_vdev_state_exit(spa, vd, 0);
3387 goto top;
3388 }
3389 ASSERT0(tvd->vdev_stat.vs_alloc);
3390 }
3391
3392 /*
|
484 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
485 }
486
487 vd->vdev_spa = spa;
488 vd->vdev_id = id;
489 vd->vdev_guid = guid;
490 vd->vdev_guid_sum = guid;
491 vd->vdev_ops = ops;
492 vd->vdev_state = VDEV_STATE_CLOSED;
493 vd->vdev_ishole = (ops == &vdev_hole_ops);
494 vic->vic_prev_indirect_vdev = UINT64_MAX;
495
496 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
497 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
498 vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
499
500 list_link_init(&vd->vdev_leaf_node);
501 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
502 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
503 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
504 mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
505 mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
506 cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
507 cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
508
509 for (int t = 0; t < DTL_TYPES; t++) {
510 vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
511 }
512 txg_list_create(&vd->vdev_ms_list, spa,
513 offsetof(struct metaslab, ms_txg_node));
514 txg_list_create(&vd->vdev_dtl_list, spa,
515 offsetof(struct vdev, vdev_dtl_node));
516 vd->vdev_stat.vs_timestamp = gethrtime();
517 vdev_queue_init(vd);
518 vdev_cache_init(vd);
519
520 return (vd);
521 }
522
523 /*
871 }
872 mutex_exit(&vd->vdev_dtl_lock);
873
874 EQUIV(vd->vdev_indirect_births != NULL,
875 vd->vdev_indirect_mapping != NULL);
876 if (vd->vdev_indirect_births != NULL) {
877 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
878 vdev_indirect_births_close(vd->vdev_indirect_births);
879 }
880
881 if (vd->vdev_obsolete_sm != NULL) {
882 ASSERT(vd->vdev_removing ||
883 vd->vdev_ops == &vdev_indirect_ops);
884 space_map_close(vd->vdev_obsolete_sm);
885 vd->vdev_obsolete_sm = NULL;
886 }
887 range_tree_destroy(vd->vdev_obsolete_segments);
888 rw_destroy(&vd->vdev_indirect_rwlock);
889 mutex_destroy(&vd->vdev_obsolete_lock);
890
891 mutex_destroy(&vd->vdev_dtl_lock);
892 mutex_destroy(&vd->vdev_stat_lock);
893 mutex_destroy(&vd->vdev_probe_lock);
894 mutex_destroy(&vd->vdev_initialize_lock);
895 mutex_destroy(&vd->vdev_initialize_io_lock);
896 cv_destroy(&vd->vdev_initialize_io_cv);
897 cv_destroy(&vd->vdev_initialize_cv);
898
899 if (vd == spa->spa_root_vdev)
900 spa->spa_root_vdev = NULL;
901
902 kmem_free(vd, sizeof (vdev_t));
903 }
904
905 /*
906 * Transfer top-level vdev state from svd to tvd.
907 */
908 static void
909 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
910 {
1232
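/*
 * Tear down the metaslab state of this vdev: close the checkpoint
 * space map (if any), passivate the metaslab group, and finalize and
 * free the metaslab array.  May be called more than once for the same
 * vdev (e.g. while destroying a pool), so each step is idempotent.
 */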
1233 void
1234 vdev_metaslab_fini(vdev_t *vd)
1235 {
1236 if (vd->vdev_checkpoint_sm != NULL) {
1237 ASSERT(spa_feature_is_active(vd->vdev_spa,
1238 SPA_FEATURE_POOL_CHECKPOINT));
1239 space_map_close(vd->vdev_checkpoint_sm);
1240 /*
1241 * Even though we close the space map, we need to set its
1242 * pointer to NULL. The reason is that vdev_metaslab_fini()
1243 * may be called multiple times for certain operations
1244 * (e.g. when destroying a pool), so we need to ensure that
1245 * this clause never executes twice. This logic is similar
1246 * to the one used for the vdev_ms clause below.
1247 */
1248 vd->vdev_checkpoint_sm = NULL;
1249 }
1250
1251 if (vd->vdev_ms != NULL) {
1252 metaslab_group_t *mg = vd->vdev_mg;
1253 metaslab_group_passivate(mg);
1254
1255 uint64_t count = vd->vdev_ms_count;
1256 for (uint64_t m = 0; m < count; m++) {
1257 metaslab_t *msp = vd->vdev_ms[m];
1258 if (msp != NULL)
1259 metaslab_fini(msp);
1260 }
1261 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1262 vd->vdev_ms = NULL;
1263
1264 vd->vdev_ms_count = 0;
1265
1266 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
1267 ASSERT0(mg->mg_histogram[i]);
1268 }
1269 ASSERT0(vd->vdev_ms_count);
1270 }
1271
1272 typedef struct vdev_probe_stats {
1273 boolean_t vps_readable;
1274 boolean_t vps_writeable;
1275 int vps_flags;
1276 } vdev_probe_stats_t;
1277
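/*
 * Completion callback for the probe I/Os issued against a vdev.  The
 * vdev_probe_stats_t hanging off zio->io_private records whether the
 * device turned out to be readable and writeable.
 */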
1278 static void
1279 vdev_probe_done(zio_t *zio)
1280 {
1281 spa_t *spa = zio->io_spa;
1282 vdev_t *vd = zio->io_vd;
1283 vdev_probe_stats_t *vps = zio->io_private;
1284
1285 ASSERT(vd->vdev_probe_zio != NULL);
1286
1287 if (zio->io_type == ZIO_TYPE_READ) {
2533 mutex_exit(&vd->vdev_dtl_lock);
2534 }
2535
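/*
 * Load a leaf vdev's DTL (dirty time log) from its space map object
 * into the in-core DTL_MISSING range tree.  For interior vdevs, recurse
 * into the children instead.
 */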
2536 int
2537 vdev_dtl_load(vdev_t *vd)
2538 {
2539 spa_t *spa = vd->vdev_spa;
2540 objset_t *mos = spa->spa_meta_objset;
2541 int error = 0;
2542
2543 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2544 ASSERT(vdev_is_concrete(vd));
2545
2546 error = space_map_open(&vd->vdev_dtl_sm, mos,
2547 vd->vdev_dtl_object, 0, -1ULL, 0);
2548 if (error)
2549 return (error);
2550 ASSERT(vd->vdev_dtl_sm != NULL);
2551
2552 mutex_enter(&vd->vdev_dtl_lock);
2553 error = space_map_load(vd->vdev_dtl_sm,
2554 vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2555 mutex_exit(&vd->vdev_dtl_lock);
2556
2557 return (error);
2558 }
2559
2560 for (int c = 0; c < vd->vdev_children; c++) {
2561 error = vdev_dtl_load(vd->vdev_child[c]);
2562 if (error != 0)
2563 break;
2564 }
2565
2566 return (error);
2567 }
2568
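/*
 * Persist this vdev's allocation bias in its top-level ZAP
 * (VDEV_TOP_ZAP_ALLOCATION_BIAS), from which vdev_load() rederives
 * vdev_alloc_bias on import.
 */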
2569 static void
2570 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
2571 {
2572 spa_t *spa = vd->vdev_spa;
2692
2693 space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
2694 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
2695 range_tree_vacate(rtsync, NULL, NULL);
2696
2697 range_tree_destroy(rtsync);
2698
2699 /*
2700 * If the object for the space map has changed then dirty
2701 * the top level so that we update the config.
2702 */
2703 if (object != space_map_object(vd->vdev_dtl_sm)) {
2704 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2705 "new object %llu", (u_longlong_t)txg, spa_name(spa),
2706 (u_longlong_t)object,
2707 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2708 vdev_config_dirty(vd->vdev_top);
2709 }
2710
2711 dmu_tx_commit(tx);
2712 }
2713
2714 /*
2715 * Determine whether the specified vdev can be offlined/detached/removed
2716 * without losing data.
2717 */
2718 boolean_t
2719 vdev_dtl_required(vdev_t *vd)
2720 {
2721 spa_t *spa = vd->vdev_spa;
2722 vdev_t *tvd = vd->vdev_top;
2723 uint8_t cant_read = vd->vdev_cant_read;
2724 boolean_t required;
2725
2726 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2727
2728 if (vd == spa->spa_root_vdev || vd == tvd)
2729 return (B_TRUE);
2730
2731 /*
2834 VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
2835 bias_str) == 0) {
2836 ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
2837 vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
2838 }
2839 }
2840
2841 /*
2842 * If this is a top-level vdev, initialize its metaslabs.
2843 */
2844 if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2845 vdev_metaslab_group_create(vd);
2846
2847 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2848 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2849 VDEV_AUX_CORRUPT_DATA);
2850 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2851 "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2852 (u_longlong_t)vd->vdev_asize);
2853 return (SET_ERROR(ENXIO));
2854 }
2855
2856 error = vdev_metaslab_init(vd, 0);
2857 if (error != 0) {
2858 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2859 "[error=%d]", error);
2860 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2861 VDEV_AUX_CORRUPT_DATA);
2862 return (error);
2863 }
2864
2865 uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
2866 if (checkpoint_sm_obj != 0) {
2867 objset_t *mos = spa_meta_objset(vd->vdev_spa);
2868 ASSERT(vd->vdev_asize != 0);
2869 ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
2870
2871 error = space_map_open(&vd->vdev_checkpoint_sm,
2872 mos, checkpoint_sm_obj, 0, vd->vdev_asize,
2873 vd->vdev_ashift);
2874 if (error != 0) {
2875 vdev_dbgmsg(vd, "vdev_load: space_map_open "
2876 "failed for checkpoint spacemap (obj %llu) "
2877 "[error=%d]",
2878 (u_longlong_t)checkpoint_sm_obj, error);
2879 return (error);
2880 }
2881 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2882
2883 /*
2884 * Since the checkpoint_sm contains free entries
2885 * exclusively we can use space_map_allocated() to
2886 * indicate the cumulative checkpointed space that
2887 * has been freed.
2888 */
2889 vd->vdev_stat.vs_checkpoint_space =
2890 -space_map_allocated(vd->vdev_checkpoint_sm);
2891 vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
2892 vd->vdev_stat.vs_checkpoint_space;
2893 }
2894 }
2895
2896 /*
2897 * If this is a leaf vdev, load its DTL.
2898 */
2899 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2900 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2901 VDEV_AUX_CORRUPT_DATA);
2902 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
2903 "[error=%d]", error);
2904 return (error);
2905 }
2906
2907 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
2908 if (obsolete_sm_object != 0) {
2909 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2910 ASSERT(vd->vdev_asize != 0);
2911 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
2912
2913 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2914 obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2915 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2916 VDEV_AUX_CORRUPT_DATA);
2917 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2918 "obsolete spacemap (obj %llu) [error=%d]",
2919 (u_longlong_t)obsolete_sm_object, error);
2920 return (error);
2921 }
2922 }
2923
2924 return (0);
2925 }
2926
2927 /*
2928 * The special vdev case is used for hot spares and l2cache devices. Its
2929 * sole purpose is to set the vdev state for the associated vdev. To do this,
2930 * we make sure that we can open the underlying device, then try to read the
2931 * label, and make sure that the label is sane and that it hasn't been
2932 * repurposed to another pool.
2933 */
2934 int
2935 vdev_validate_aux(vdev_t *vd)
2936 {
2937 nvlist_t *label;
2938 uint64_t guid, version;
2939 uint64_t state;
2940
2941 if (!vdev_readable(vd))
2988 if (smobj == 0)
2989 continue;
2990
2991 space_map_free_obj(mos, smobj, tx);
2992 }
2993
2994 kmem_free(smobj_array, array_bytes);
2995 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
2996 vd->vdev_ms_array = 0;
2997 }
2998
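/*
 * Destroy the on-disk metadata of an empty log device that is being
 * removed: free the vdev's space maps and unlink its top-level ZAP in
 * the currently syncing txg.
 */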
2999 static void
3000 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3001 {
3002 spa_t *spa = vd->vdev_spa;
3003
3004 ASSERT(vd->vdev_islog);
3005 ASSERT(vd == vd->vdev_top);
3006 ASSERT3U(txg, ==, spa_syncing_txg(spa));
3007
3008 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3009
3010 vdev_destroy_spacemaps(vd, tx);
3011 if (vd->vdev_top_zap != 0) {
3012 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3013 vd->vdev_top_zap = 0;
3014 }
3015
3016 dmu_tx_commit(tx);
3017 }
3018
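/*
 * Finish syncing this vdev for the given txg: run metaslab_sync_done()
 * on every metaslab that was dirty in the txg, then reassess the
 * metaslab group if any metaslabs were processed.
 */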
3019 void
3020 vdev_sync_done(vdev_t *vd, uint64_t txg)
3021 {
3022 metaslab_t *msp;
3023 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3024
3025 ASSERT(vdev_is_concrete(vd));
3026
3027 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3028 != NULL)
3029 metaslab_sync_done(msp, txg);
3030
3031 if (reassess)
3032 metaslab_sync_reassess(vd->vdev_mg);
3033 }
3034
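/*
 * Write out this vdev's dirty state for the given txg: obsolete
 * (remapped) segments, the metaslab array object (created on first
 * use), dirty metaslabs, and dirty leaf DTLs.  An empty log device
 * being removed also has its metadata destroyed here.
 */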
3035 void
3036 vdev_sync(vdev_t *vd, uint64_t txg)
3037 {
3038 spa_t *spa = vd->vdev_spa;
3039 vdev_t *lvd;
3040 metaslab_t *msp;
3041
3042 ASSERT3U(txg, ==, spa->spa_syncing_txg);
3043 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3044 if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3045 ASSERT(vd->vdev_removing ||
3046 vd->vdev_ops == &vdev_indirect_ops);
3047
3048 vdev_indirect_sync_obsolete(vd, tx);
3049
3050 /*
3051 * If the vdev is indirect, it can't have dirty
3052 * metaslabs or DTLs.
3053 */
3054 if (vd->vdev_ops == &vdev_indirect_ops) {
3055 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3056 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
3057 dmu_tx_commit(tx);
3058 return;
3059 }
3060 }
3061
3062 ASSERT(vdev_is_concrete(vd));
3063
3064 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3065 !vd->vdev_removing) {
3066 ASSERT(vd == vd->vdev_top);
3067 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3068 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3069 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3070 ASSERT(vd->vdev_ms_array != 0);
3071 vdev_config_dirty(vd);
3072 }
3073
3074 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3075 metaslab_sync(msp, txg);
3076 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3077 }
3078
3079 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3080 vdev_dtl_sync(lvd, txg);
3081
3082 /*
3083 * If this is an empty log device being removed, destroy the
3084 * metadata associated with it.
3085 */
3086 if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3087 vdev_remove_empty_log(vd, txg);
3088
3089 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
3090 dmu_tx_commit(tx);
3091 }
3092
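/*
 * Convert a physical size (psize) to an allocatable size (asize) by
 * delegating to this vdev's vdev_op_asize method.
 */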
3093 uint64_t
3094 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3095 {
3096 return (vd->vdev_ops->vdev_op_asize(vd, psize));
3097 }
3098
3099 /*
3100 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
3101 * not be opened, and no I/O is attempted.
3102 */
3103 int
3104 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3105 {
3106 vdev_t *vd, *tvd;
3107
3108 spa_vdev_state_enter(spa, SCL_NONE);
3109
3110 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3300 * If the top-level is a slog and it has had allocations
3301 * then proceed. We check that the vdev's metaslab group
3302 * is not NULL since it's possible that we may have just
3303 * added this vdev but not yet initialized its metaslabs.
3304 */
3305 if (tvd->vdev_islog && mg != NULL) {
3306 /*
3307 * Prevent any future allocations.
3308 */
3309 metaslab_group_passivate(mg);
3310 (void) spa_vdev_state_exit(spa, vd, 0);
3311
3312 error = spa_reset_logs(spa);
3313
3314 /*
3315 * If the log device was successfully reset but has
3316 * checkpointed data, do not offline it.
3317 */
3318 if (error == 0 &&
3319 tvd->vdev_checkpoint_sm != NULL) {
3320 error = ZFS_ERR_CHECKPOINT_EXISTS;
3321 }
3322
3323 spa_vdev_state_enter(spa, SCL_ALLOC);
3324
3325 /*
3326 * Check to see if the config has changed.
3327 */
3328 if (error || generation != spa->spa_config_generation) {
3329 metaslab_group_activate(mg);
3330 if (error)
3331 return (spa_vdev_state_exit(spa,
3332 vd, error));
3333 (void) spa_vdev_state_exit(spa, vd, 0);
3334 goto top;
3335 }
3336 ASSERT0(tvd->vdev_stat.vs_alloc);
3337 }
3338
3339 /*
|