484                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 485         }
 486 
 487         vd->vdev_spa = spa;
 488         vd->vdev_id = id;
 489         vd->vdev_guid = guid;
 490         vd->vdev_guid_sum = guid;
 491         vd->vdev_ops = ops;
 492         vd->vdev_state = VDEV_STATE_CLOSED;
 493         vd->vdev_ishole = (ops == &vdev_hole_ops);
 494         vic->vic_prev_indirect_vdev = UINT64_MAX;
 495 
 496         rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 497         mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 498         vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
 499 
 500         list_link_init(&vd->vdev_leaf_node);
 501         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 502         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 503         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 504         mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 505         mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 506         mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 507         cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 508         cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 509 
 510         for (int t = 0; t < DTL_TYPES; t++) {
 511                 vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
 512         }
 513         txg_list_create(&vd->vdev_ms_list, spa,
 514             offsetof(struct metaslab, ms_txg_node));
 515         txg_list_create(&vd->vdev_dtl_list, spa,
 516             offsetof(struct vdev, vdev_dtl_node));
 517         vd->vdev_stat.vs_timestamp = gethrtime();
 518         vdev_queue_init(vd);
 519         vdev_cache_init(vd);
 520 
 521         return (vd);
 522 }
 523 
 524 /*
 
 872         }
 873         mutex_exit(&vd->vdev_dtl_lock);
 874 
 875         EQUIV(vd->vdev_indirect_births != NULL,
 876             vd->vdev_indirect_mapping != NULL);
 877         if (vd->vdev_indirect_births != NULL) {
 878                 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 879                 vdev_indirect_births_close(vd->vdev_indirect_births);
 880         }
 881 
 882         if (vd->vdev_obsolete_sm != NULL) {
 883                 ASSERT(vd->vdev_removing ||
 884                     vd->vdev_ops == &vdev_indirect_ops);
 885                 space_map_close(vd->vdev_obsolete_sm);
 886                 vd->vdev_obsolete_sm = NULL;
 887         }
 888         range_tree_destroy(vd->vdev_obsolete_segments);
 889         rw_destroy(&vd->vdev_indirect_rwlock);
 890         mutex_destroy(&vd->vdev_obsolete_lock);
 891 
 892         mutex_destroy(&vd->vdev_queue_lock);
 893         mutex_destroy(&vd->vdev_dtl_lock);
 894         mutex_destroy(&vd->vdev_stat_lock);
 895         mutex_destroy(&vd->vdev_probe_lock);
 896         mutex_destroy(&vd->vdev_initialize_lock);
 897         mutex_destroy(&vd->vdev_initialize_io_lock);
 898         cv_destroy(&vd->vdev_initialize_io_cv);
 899         cv_destroy(&vd->vdev_initialize_cv);
 900 
 901         if (vd == spa->spa_root_vdev)
 902                 spa->spa_root_vdev = NULL;
 903 
 904         kmem_free(vd, sizeof (vdev_t));
 905 }
 906 
 907 /*
 908  * Transfer top-level vdev state from svd to tvd.
 909  */
 910 static void
 911 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 912 {
 
1234 
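      /*
       * Tear down a vdev's metaslab state: close its checkpoint space
       * map, passivate its metaslab group, and fini each metaslab.
       * Written to be idempotent, since it may be called more than once
       * (e.g. when a pool is destroyed).
       */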
1235 void
1236 vdev_metaslab_fini(vdev_t *vd)
1237 {
1238         if (vd->vdev_checkpoint_sm != NULL) {
1239                 ASSERT(spa_feature_is_active(vd->vdev_spa,
1240                     SPA_FEATURE_POOL_CHECKPOINT));
1241                 space_map_close(vd->vdev_checkpoint_sm);
1242                 /*
1243                  * Even though we close the space map, we need to set its
1244                  * pointer to NULL. The reason is that vdev_metaslab_fini()
1245                  * may be called multiple times for certain operations
 1246                  * (e.g. when destroying a pool) so we need to ensure that
1247                  * this clause never executes twice. This logic is similar
1248                  * to the one used for the vdev_ms clause below.
1249                  */
1250                 vd->vdev_checkpoint_sm = NULL;
1251         }
1252 
1253         if (vd->vdev_ms != NULL) {
1254                 uint64_t count = vd->vdev_ms_count;
1255 
1256                 metaslab_group_passivate(vd->vdev_mg);
1257                 for (uint64_t m = 0; m < count; m++) {
1258                         metaslab_t *msp = vd->vdev_ms[m];
1259 
1260                         if (msp != NULL)
1261                                 metaslab_fini(msp);
1262                 }
1263                 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1264                 vd->vdev_ms = NULL;
1265 
1266                 vd->vdev_ms_count = 0;
1267         }
1268         ASSERT0(vd->vdev_ms_count);
1269 }
1270 
1271 typedef struct vdev_probe_stats {
1272         boolean_t       vps_readable;
1273         boolean_t       vps_writeable;
1274         int             vps_flags;
1275 } vdev_probe_stats_t;
1276 
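      /*
       * Completion callback for the per-vdev probe zio; it records in
       * vdev_probe_stats_t whether the device proved readable and
       * writeable.
       */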
1277 static void
1278 vdev_probe_done(zio_t *zio)
1279 {
1280         spa_t *spa = zio->io_spa;
1281         vdev_t *vd = zio->io_vd;
1282         vdev_probe_stats_t *vps = zio->io_private;
1283 
1284         ASSERT(vd->vdev_probe_zio != NULL);
1285 
1286         if (zio->io_type == ZIO_TYPE_READ) {
 
2532         mutex_exit(&vd->vdev_dtl_lock);
2533 }
2534 
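      /*
       * Load a vdev's DTL (dirty time log) from disk: for a leaf vdev,
       * read its DTL space map into the in-core DTL_MISSING range tree;
       * for an interior vdev, recurse into the children.
       */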
2535 int
2536 vdev_dtl_load(vdev_t *vd)
2537 {
2538         spa_t *spa = vd->vdev_spa;
2539         objset_t *mos = spa->spa_meta_objset;
2540         int error = 0;
2541 
2542         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2543                 ASSERT(vdev_is_concrete(vd));
2544 
2545                 error = space_map_open(&vd->vdev_dtl_sm, mos,
2546                     vd->vdev_dtl_object, 0, -1ULL, 0);
2547                 if (error)
2548                         return (error);
2549                 ASSERT(vd->vdev_dtl_sm != NULL);
2550 
2551                 mutex_enter(&vd->vdev_dtl_lock);
2552 
2553                 /*
2554                  * Now that we've opened the space_map we need to update
2555                  * the in-core DTL.
2556                  */
2557                 space_map_update(vd->vdev_dtl_sm);
2558 
2559                 error = space_map_load(vd->vdev_dtl_sm,
2560                     vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2561                 mutex_exit(&vd->vdev_dtl_lock);
2562 
2563                 return (error);
2564         }
2565 
2566         for (int c = 0; c < vd->vdev_children; c++) {
2567                 error = vdev_dtl_load(vd->vdev_child[c]);
2568                 if (error != 0)
2569                         break;
2570         }
2571 
2572         return (error);
2573 }
2574 
2575 static void
2576 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
2577 {
2578         spa_t *spa = vd->vdev_spa;
 
2698 
2699         space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
2700         space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
2701         range_tree_vacate(rtsync, NULL, NULL);
2702 
2703         range_tree_destroy(rtsync);
2704 
2705         /*
2706          * If the object for the space map has changed then dirty
2707          * the top level so that we update the config.
2708          */
2709         if (object != space_map_object(vd->vdev_dtl_sm)) {
2710                 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2711                     "new object %llu", (u_longlong_t)txg, spa_name(spa),
2712                     (u_longlong_t)object,
2713                     (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2714                 vdev_config_dirty(vd->vdev_top);
2715         }
2716 
2717         dmu_tx_commit(tx);
2718 
2719         mutex_enter(&vd->vdev_dtl_lock);
2720         space_map_update(vd->vdev_dtl_sm);
2721         mutex_exit(&vd->vdev_dtl_lock);
2722 }
2723 
2724 /*
2725  * Determine whether the specified vdev can be offlined/detached/removed
2726  * without losing data.
2727  */
2728 boolean_t
2729 vdev_dtl_required(vdev_t *vd)
2730 {
2731         spa_t *spa = vd->vdev_spa;
2732         vdev_t *tvd = vd->vdev_top;
2733         uint8_t cant_read = vd->vdev_cant_read;
2734         boolean_t required;
2735 
2736         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2737 
2738         if (vd == spa->spa_root_vdev || vd == tvd)
2739                 return (B_TRUE);
2740 
2741         /*
 
2844                     VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
2845                     bias_str) == 0) {
2846                         ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
2847                         vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
2848                 }
2849         }
2850 
2851         /*
2852          * If this is a top-level vdev, initialize its metaslabs.
2853          */
2854         if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2855                 vdev_metaslab_group_create(vd);
2856 
2857                 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2858                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2859                             VDEV_AUX_CORRUPT_DATA);
2860                         vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2861                             "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2862                             (u_longlong_t)vd->vdev_asize);
2863                         return (SET_ERROR(ENXIO));
2864                 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
2865                         vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2866                             "[error=%d]", error);
2867                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2868                             VDEV_AUX_CORRUPT_DATA);
2869                         return (error);
2870                 }
2871 
2872                 uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
2873                 if (checkpoint_sm_obj != 0) {
2874                         objset_t *mos = spa_meta_objset(vd->vdev_spa);
2875                         ASSERT(vd->vdev_asize != 0);
2876                         ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
2877 
2878                         if ((error = space_map_open(&vd->vdev_checkpoint_sm,
2879                             mos, checkpoint_sm_obj, 0, vd->vdev_asize,
2880                             vd->vdev_ashift))) {
2881                                 vdev_dbgmsg(vd, "vdev_load: space_map_open "
2882                                     "failed for checkpoint spacemap (obj %llu) "
2883                                     "[error=%d]",
2884                                     (u_longlong_t)checkpoint_sm_obj, error);
2885                                 return (error);
2886                         }
2887                         ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2888                         space_map_update(vd->vdev_checkpoint_sm);
2889 
2890                         /*
2891                          * Since the checkpoint_sm contains free entries
 2892                          * exclusively, we can use sm_alloc to indicate the
 2893                          * cumulative checkpointed space that has been freed.
2894                          */
2895                         vd->vdev_stat.vs_checkpoint_space =
2896                             -vd->vdev_checkpoint_sm->sm_alloc;
2897                         vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
2898                             vd->vdev_stat.vs_checkpoint_space;
2899                 }
2900         }
2901 
2902         /*
2903          * If this is a leaf vdev, load its DTL.
2904          */
2905         if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2906                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2907                     VDEV_AUX_CORRUPT_DATA);
2908                 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
2909                     "[error=%d]", error);
2910                 return (error);
2911         }
2912 
2913         uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
2914         if (obsolete_sm_object != 0) {
2915                 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2916                 ASSERT(vd->vdev_asize != 0);
2917                 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
2918 
2919                 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2920                     obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2921                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2922                             VDEV_AUX_CORRUPT_DATA);
2923                         vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2924                             "obsolete spacemap (obj %llu) [error=%d]",
2925                             (u_longlong_t)obsolete_sm_object, error);
2926                         return (error);
2927                 }
2928                 space_map_update(vd->vdev_obsolete_sm);
2929         }
2930 
2931         return (0);
2932 }
2933 
2934 /*
2935  * The special vdev case is used for hot spares and l2cache devices.  Its
 2936  * sole purpose is to set the vdev state for the associated vdev.  To do this,
2937  * we make sure that we can open the underlying device, then try to read the
2938  * label, and make sure that the label is sane and that it hasn't been
2939  * repurposed to another pool.
2940  */
2941 int
2942 vdev_validate_aux(vdev_t *vd)
2943 {
2944         nvlist_t *label;
2945         uint64_t guid, version;
2946         uint64_t state;
2947 
2948         if (!vdev_readable(vd))
 
2995                 if (smobj == 0)
2996                         continue;
2997 
2998                 space_map_free_obj(mos, smobj, tx);
2999         }
3000 
3001         kmem_free(smobj_array, array_bytes);
3002         VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
3003         vd->vdev_ms_array = 0;
3004 }
3005 
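      /*
       * Destroy the on-disk metadata of an empty log device that is
       * being removed: reconcile the metaslab histogram accounting,
       * close its space maps (including the checkpoint space map), and
       * free its space map objects and top-level ZAP.
       */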
3006 static void
3007 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3008 {
3009         spa_t *spa = vd->vdev_spa;
3010 
3011         ASSERT(vd->vdev_islog);
3012         ASSERT(vd == vd->vdev_top);
3013         ASSERT3U(txg, ==, spa_syncing_txg(spa));
3014 
3015         if (vd->vdev_ms != NULL) {
3016                 metaslab_group_t *mg = vd->vdev_mg;
3017 
3018                 metaslab_group_histogram_verify(mg);
3019                 metaslab_class_histogram_verify(mg->mg_class);
3020 
3021                 for (int m = 0; m < vd->vdev_ms_count; m++) {
3022                         metaslab_t *msp = vd->vdev_ms[m];
3023 
3024                         if (msp == NULL || msp->ms_sm == NULL)
3025                                 continue;
3026 
3027                         mutex_enter(&msp->ms_lock);
3028                         /*
3029                          * If the metaslab was not loaded when the vdev
3030                          * was removed then the histogram accounting may
3031                          * not be accurate. Update the histogram information
 3032                          * here to ensure that the metaslab group and
 3033                          * metaslab class are up-to-date.
3034                          */
3035                         metaslab_group_histogram_remove(mg, msp);
3036 
3037                         VERIFY0(space_map_allocated(msp->ms_sm));
3038                         space_map_close(msp->ms_sm);
3039                         msp->ms_sm = NULL;
3040                         mutex_exit(&msp->ms_lock);
3041                 }
3042 
3043                 if (vd->vdev_checkpoint_sm != NULL) {
3044                         ASSERT(spa_has_checkpoint(spa));
3045                         space_map_close(vd->vdev_checkpoint_sm);
3046                         vd->vdev_checkpoint_sm = NULL;
3047                 }
3048 
3049                 metaslab_group_histogram_verify(mg);
3050                 metaslab_class_histogram_verify(mg->mg_class);
3051 
3052                 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
3053                         ASSERT0(mg->mg_histogram[i]);
3054         }
3055 
3056         dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3057 
3058         vdev_destroy_spacemaps(vd, tx);
3059         if (vd->vdev_top_zap != 0) {
3060                 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3061                 vd->vdev_top_zap = 0;
3062         }
3063 
3064         dmu_tx_commit(tx);
3065 }
3066 
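      /*
       * Called once the txg has synced: finish each metaslab that was
       * dirty this txg (now on the TXG_CLEAN list) and, if any were
       * processed, reassess the vdev's metaslab group.
       */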
3067 void
3068 vdev_sync_done(vdev_t *vd, uint64_t txg)
3069 {
3070         metaslab_t *msp;
3071         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3072 
3073         ASSERT(vdev_is_concrete(vd));
3074 
3075         while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3076             != NULL)
3077                 metaslab_sync_done(msp, txg);
3078 
3079         if (reassess)
3080                 metaslab_sync_reassess(vd->vdev_mg);
3081 }
3082 
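      /*
       * Sync a top-level vdev's dirty state for this txg: write out any
       * obsolete segments, allocate the metaslab array object on first
       * use, sync the dirty metaslabs and leaf-vdev DTLs, and destroy
       * the metadata of an empty log device that is being removed.
       */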
3083 void
3084 vdev_sync(vdev_t *vd, uint64_t txg)
3085 {
3086         spa_t *spa = vd->vdev_spa;
3087         vdev_t *lvd;
3088         metaslab_t *msp;
3089         dmu_tx_t *tx;
3090 
3091         if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3092                 dmu_tx_t *tx;
3093 
3094                 ASSERT(vd->vdev_removing ||
3095                     vd->vdev_ops == &vdev_indirect_ops);
3096 
3097                 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3098                 vdev_indirect_sync_obsolete(vd, tx);
3099                 dmu_tx_commit(tx);
3100 
3101                 /*
3102                  * If the vdev is indirect, it can't have dirty
3103                  * metaslabs or DTLs.
3104                  */
3105                 if (vd->vdev_ops == &vdev_indirect_ops) {
3106                         ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3107                         ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
3108                         return;
3109                 }
3110         }
3111 
3112         ASSERT(vdev_is_concrete(vd));
3113 
3114         if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3115             !vd->vdev_removing) {
3116                 ASSERT(vd == vd->vdev_top);
3117                 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3118                 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3119                 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3120                     DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3121                 ASSERT(vd->vdev_ms_array != 0);
3122                 vdev_config_dirty(vd);
3123                 dmu_tx_commit(tx);
3124         }
3125 
3126         while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3127                 metaslab_sync(msp, txg);
3128                 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3129         }
3130 
3131         while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3132                 vdev_dtl_sync(lvd, txg);
3133 
3134         /*
3135          * If this is an empty log device being removed, destroy the
3136          * metadata associated with it.
3137          */
3138         if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3139                 vdev_remove_empty_log(vd, txg);
3140 
3141         (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
3142 }
3143 
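      /*
       * Convert a physical size into the allocated size for this vdev,
       * delegating to the vdev-type-specific asize method.
       */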
3144 uint64_t
3145 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3146 {
3147         return (vd->vdev_ops->vdev_op_asize(vd, psize));
3148 }
3149 
3150 /*
3151  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
3152  * not be opened, and no I/O is attempted.
3153  */
3154 int
3155 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3156 {
3157         vdev_t *vd, *tvd;
3158 
3159         spa_vdev_state_enter(spa, SCL_NONE);
3160 
3161         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 
3351                  * If the top-level is a slog and it has had allocations
3352                  * then proceed.  We check that the vdev's metaslab group
3353                  * is not NULL since it's possible that we may have just
3354                  * added this vdev but not yet initialized its metaslabs.
3355                  */
3356                 if (tvd->vdev_islog && mg != NULL) {
3357                         /*
3358                          * Prevent any future allocations.
3359                          */
3360                         metaslab_group_passivate(mg);
3361                         (void) spa_vdev_state_exit(spa, vd, 0);
3362 
3363                         error = spa_reset_logs(spa);
3364 
3365                         /*
3366                          * If the log device was successfully reset but has
3367                          * checkpointed data, do not offline it.
3368                          */
3369                         if (error == 0 &&
3370                             tvd->vdev_checkpoint_sm != NULL) {
3371                                 ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
3372                                     !=, 0);
3373                                 error = ZFS_ERR_CHECKPOINT_EXISTS;
3374                         }
3375 
3376                         spa_vdev_state_enter(spa, SCL_ALLOC);
3377 
3378                         /*
3379                          * Check to see if the config has changed.
3380                          */
3381                         if (error || generation != spa->spa_config_generation) {
3382                                 metaslab_group_activate(mg);
3383                                 if (error)
3384                                         return (spa_vdev_state_exit(spa,
3385                                             vd, error));
3386                                 (void) spa_vdev_state_exit(spa, vd, 0);
3387                                 goto top;
3388                         }
3389                         ASSERT0(tvd->vdev_stat.vs_alloc);
3390                 }
3391 
3392                 /*
 
 | 
 
 
 484                 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 485         }
 486 
 487         vd->vdev_spa = spa;
 488         vd->vdev_id = id;
 489         vd->vdev_guid = guid;
 490         vd->vdev_guid_sum = guid;
 491         vd->vdev_ops = ops;
 492         vd->vdev_state = VDEV_STATE_CLOSED;
 493         vd->vdev_ishole = (ops == &vdev_hole_ops);
 494         vic->vic_prev_indirect_vdev = UINT64_MAX;
 495 
 496         rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 497         mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 498         vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
 499 
 500         list_link_init(&vd->vdev_leaf_node);
 501         mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 502         mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 503         mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 504         mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 505         mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 506         cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 507         cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 508 
 509         for (int t = 0; t < DTL_TYPES; t++) {
 510                 vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
 511         }
 512         txg_list_create(&vd->vdev_ms_list, spa,
 513             offsetof(struct metaslab, ms_txg_node));
 514         txg_list_create(&vd->vdev_dtl_list, spa,
 515             offsetof(struct vdev, vdev_dtl_node));
 516         vd->vdev_stat.vs_timestamp = gethrtime();
 517         vdev_queue_init(vd);
 518         vdev_cache_init(vd);
 519 
 520         return (vd);
 521 }
 522 
 523 /*
 
 871         }
 872         mutex_exit(&vd->vdev_dtl_lock);
 873 
 874         EQUIV(vd->vdev_indirect_births != NULL,
 875             vd->vdev_indirect_mapping != NULL);
 876         if (vd->vdev_indirect_births != NULL) {
 877                 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 878                 vdev_indirect_births_close(vd->vdev_indirect_births);
 879         }
 880 
 881         if (vd->vdev_obsolete_sm != NULL) {
 882                 ASSERT(vd->vdev_removing ||
 883                     vd->vdev_ops == &vdev_indirect_ops);
 884                 space_map_close(vd->vdev_obsolete_sm);
 885                 vd->vdev_obsolete_sm = NULL;
 886         }
 887         range_tree_destroy(vd->vdev_obsolete_segments);
 888         rw_destroy(&vd->vdev_indirect_rwlock);
 889         mutex_destroy(&vd->vdev_obsolete_lock);
 890 
 891         mutex_destroy(&vd->vdev_dtl_lock);
 892         mutex_destroy(&vd->vdev_stat_lock);
 893         mutex_destroy(&vd->vdev_probe_lock);
 894         mutex_destroy(&vd->vdev_initialize_lock);
 895         mutex_destroy(&vd->vdev_initialize_io_lock);
 896         cv_destroy(&vd->vdev_initialize_io_cv);
 897         cv_destroy(&vd->vdev_initialize_cv);
 898 
 899         if (vd == spa->spa_root_vdev)
 900                 spa->spa_root_vdev = NULL;
 901 
 902         kmem_free(vd, sizeof (vdev_t));
 903 }
 904 
 905 /*
 906  * Transfer top-level vdev state from svd to tvd.
 907  */
 908 static void
 909 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 910 {
 
1232 
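      /*
       * Tear down a vdev's metaslab state: close its checkpoint space
       * map, passivate its metaslab group, and fini each metaslab.
       * Written to be idempotent, since it may be called more than once
       * (e.g. when a pool is destroyed).
       */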
1233 void
1234 vdev_metaslab_fini(vdev_t *vd)
1235 {
1236         if (vd->vdev_checkpoint_sm != NULL) {
1237                 ASSERT(spa_feature_is_active(vd->vdev_spa,
1238                     SPA_FEATURE_POOL_CHECKPOINT));
1239                 space_map_close(vd->vdev_checkpoint_sm);
1240                 /*
1241                  * Even though we close the space map, we need to set its
1242                  * pointer to NULL. The reason is that vdev_metaslab_fini()
1243                  * may be called multiple times for certain operations
 1244                  * (e.g. when destroying a pool) so we need to ensure that
1245                  * this clause never executes twice. This logic is similar
1246                  * to the one used for the vdev_ms clause below.
1247                  */
1248                 vd->vdev_checkpoint_sm = NULL;
1249         }
1250 
1251         if (vd->vdev_ms != NULL) {
1252                 metaslab_group_t *mg = vd->vdev_mg;
1253                 metaslab_group_passivate(mg);
1254 
1255                 uint64_t count = vd->vdev_ms_count;
1256                 for (uint64_t m = 0; m < count; m++) {
1257                         metaslab_t *msp = vd->vdev_ms[m];
1258                         if (msp != NULL)
1259                                 metaslab_fini(msp);
1260                 }
1261                 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1262                 vd->vdev_ms = NULL;
1263 
1264                 vd->vdev_ms_count = 0;
1265 
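                      /*
                       * With the group passivated and all metaslabs
                       * finalized, the group histogram should be empty.
                       */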
1266                 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
1267                         ASSERT0(mg->mg_histogram[i]);
1268         }
1269         ASSERT0(vd->vdev_ms_count);
1270 }
1271 
1272 typedef struct vdev_probe_stats {
1273         boolean_t       vps_readable;
1274         boolean_t       vps_writeable;
1275         int             vps_flags;
1276 } vdev_probe_stats_t;
1277 
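      /*
       * Completion callback for the per-vdev probe zio; it records in
       * vdev_probe_stats_t whether the device proved readable and
       * writeable.
       */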
1278 static void
1279 vdev_probe_done(zio_t *zio)
1280 {
1281         spa_t *spa = zio->io_spa;
1282         vdev_t *vd = zio->io_vd;
1283         vdev_probe_stats_t *vps = zio->io_private;
1284 
1285         ASSERT(vd->vdev_probe_zio != NULL);
1286 
1287         if (zio->io_type == ZIO_TYPE_READ) {
 
2533         mutex_exit(&vd->vdev_dtl_lock);
2534 }
2535 
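      /*
       * Load a vdev's DTL (dirty time log) from disk: for a leaf vdev,
       * read its DTL space map into the in-core DTL_MISSING range tree;
       * for an interior vdev, recurse into the children.
       */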
2536 int
2537 vdev_dtl_load(vdev_t *vd)
2538 {
2539         spa_t *spa = vd->vdev_spa;
2540         objset_t *mos = spa->spa_meta_objset;
2541         int error = 0;
2542 
2543         if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
2544                 ASSERT(vdev_is_concrete(vd));
2545 
2546                 error = space_map_open(&vd->vdev_dtl_sm, mos,
2547                     vd->vdev_dtl_object, 0, -1ULL, 0);
2548                 if (error)
2549                         return (error);
2550                 ASSERT(vd->vdev_dtl_sm != NULL);
2551 
2552                 mutex_enter(&vd->vdev_dtl_lock);
2553                 error = space_map_load(vd->vdev_dtl_sm,
2554                     vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
2555                 mutex_exit(&vd->vdev_dtl_lock);
2556 
2557                 return (error);
2558         }
2559 
2560         for (int c = 0; c < vd->vdev_children; c++) {
2561                 error = vdev_dtl_load(vd->vdev_child[c]);
2562                 if (error != 0)
2563                         break;
2564         }
2565 
2566         return (error);
2567 }
2568 
2569 static void
2570 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
2571 {
2572         spa_t *spa = vd->vdev_spa;
 
2692 
2693         space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
2694         space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
2695         range_tree_vacate(rtsync, NULL, NULL);
2696 
2697         range_tree_destroy(rtsync);
2698 
2699         /*
2700          * If the object for the space map has changed then dirty
2701          * the top level so that we update the config.
2702          */
2703         if (object != space_map_object(vd->vdev_dtl_sm)) {
2704                 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
2705                     "new object %llu", (u_longlong_t)txg, spa_name(spa),
2706                     (u_longlong_t)object,
2707                     (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
2708                 vdev_config_dirty(vd->vdev_top);
2709         }
2710 
2711         dmu_tx_commit(tx);
2712 }
2713 
2714 /*
2715  * Determine whether the specified vdev can be offlined/detached/removed
2716  * without losing data.
2717  */
2718 boolean_t
2719 vdev_dtl_required(vdev_t *vd)
2720 {
2721         spa_t *spa = vd->vdev_spa;
2722         vdev_t *tvd = vd->vdev_top;
2723         uint8_t cant_read = vd->vdev_cant_read;
2724         boolean_t required;
2725 
2726         ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2727 
2728         if (vd == spa->spa_root_vdev || vd == tvd)
2729                 return (B_TRUE);
2730 
2731         /*
 
2834                     VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
2835                     bias_str) == 0) {
2836                         ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
2837                         vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
2838                 }
2839         }
2840 
2841         /*
2842          * If this is a top-level vdev, initialize its metaslabs.
2843          */
2844         if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
2845                 vdev_metaslab_group_create(vd);
2846 
2847                 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
2848                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2849                             VDEV_AUX_CORRUPT_DATA);
2850                         vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
2851                             "asize=%llu", (u_longlong_t)vd->vdev_ashift,
2852                             (u_longlong_t)vd->vdev_asize);
2853                         return (SET_ERROR(ENXIO));
2854                 }
2855 
2856                 error = vdev_metaslab_init(vd, 0);
2857                 if (error != 0) {
2858                         vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
2859                             "[error=%d]", error);
2860                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2861                             VDEV_AUX_CORRUPT_DATA);
2862                         return (error);
2863                 }
2864 
2865                 uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
2866                 if (checkpoint_sm_obj != 0) {
2867                         objset_t *mos = spa_meta_objset(vd->vdev_spa);
2868                         ASSERT(vd->vdev_asize != 0);
2869                         ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
2870 
2871                         error = space_map_open(&vd->vdev_checkpoint_sm,
2872                             mos, checkpoint_sm_obj, 0, vd->vdev_asize,
2873                             vd->vdev_ashift);
2874                         if (error != 0) {
2875                                 vdev_dbgmsg(vd, "vdev_load: space_map_open "
2876                                     "failed for checkpoint spacemap (obj %llu) "
2877                                     "[error=%d]",
2878                                     (u_longlong_t)checkpoint_sm_obj, error);
2879                                 return (error);
2880                         }
2881                         ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2882 
2883                         /*
2884                          * Since the checkpoint_sm contains free entries
 2885                          * exclusively, we can use space_map_allocated() to
2886                          * indicate the cumulative checkpointed space that
2887                          * has been freed.
2888                          */
2889                         vd->vdev_stat.vs_checkpoint_space =
2890                             -space_map_allocated(vd->vdev_checkpoint_sm);
2891                         vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
2892                             vd->vdev_stat.vs_checkpoint_space;
2893                 }
2894         }
2895 
2896         /*
2897          * If this is a leaf vdev, load its DTL.
2898          */
2899         if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
2900                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2901                     VDEV_AUX_CORRUPT_DATA);
2902                 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
2903                     "[error=%d]", error);
2904                 return (error);
2905         }
2906 
2907         uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
2908         if (obsolete_sm_object != 0) {
2909                 objset_t *mos = vd->vdev_spa->spa_meta_objset;
2910                 ASSERT(vd->vdev_asize != 0);
2911                 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
2912 
2913                 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
2914                     obsolete_sm_object, 0, vd->vdev_asize, 0))) {
2915                         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2916                             VDEV_AUX_CORRUPT_DATA);
2917                         vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
2918                             "obsolete spacemap (obj %llu) [error=%d]",
2919                             (u_longlong_t)obsolete_sm_object, error);
2920                         return (error);
2921                 }
2922         }
2923 
2924         return (0);
2925 }
2926 
2927 /*
2928  * The special vdev case is used for hot spares and l2cache devices.  Its
 2929  * sole purpose is to set the vdev state for the associated vdev.  To do this,
2930  * we make sure that we can open the underlying device, then try to read the
2931  * label, and make sure that the label is sane and that it hasn't been
2932  * repurposed to another pool.
2933  */
2934 int
2935 vdev_validate_aux(vdev_t *vd)
2936 {
2937         nvlist_t *label;
2938         uint64_t guid, version;
2939         uint64_t state;
2940 
2941         if (!vdev_readable(vd))
 
2988                 if (smobj == 0)
2989                         continue;
2990 
2991                 space_map_free_obj(mos, smobj, tx);
2992         }
2993 
2994         kmem_free(smobj_array, array_bytes);
2995         VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
2996         vd->vdev_ms_array = 0;
2997 }
2998 
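      /*
       * Destroy the on-disk metadata of an empty log device that is
       * being removed: free its space map objects and its top-level ZAP.
       */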
2999 static void
3000 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
3001 {
3002         spa_t *spa = vd->vdev_spa;
3003 
3004         ASSERT(vd->vdev_islog);
3005         ASSERT(vd == vd->vdev_top);
3006         ASSERT3U(txg, ==, spa_syncing_txg(spa));
3007 
3008         dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3009 
3010         vdev_destroy_spacemaps(vd, tx);
3011         if (vd->vdev_top_zap != 0) {
3012                 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
3013                 vd->vdev_top_zap = 0;
3014         }
3015 
3016         dmu_tx_commit(tx);
3017 }
3018 
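      /*
       * Called once the txg has synced: finish each metaslab that was
       * dirty this txg (now on the TXG_CLEAN list) and, if any were
       * processed, reassess the vdev's metaslab group.
       */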
3019 void
3020 vdev_sync_done(vdev_t *vd, uint64_t txg)
3021 {
3022         metaslab_t *msp;
3023         boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
3024 
3025         ASSERT(vdev_is_concrete(vd));
3026 
3027         while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
3028             != NULL)
3029                 metaslab_sync_done(msp, txg);
3030 
3031         if (reassess)
3032                 metaslab_sync_reassess(vd->vdev_mg);
3033 }
3034 
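      /*
       * Sync a top-level vdev's dirty state for this txg: write out any
       * obsolete segments, allocate the metaslab array object on first
       * use, sync the dirty metaslabs and leaf-vdev DTLs, and destroy
       * the metadata of an empty log device that is being removed.
       */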
3035 void
3036 vdev_sync(vdev_t *vd, uint64_t txg)
3037 {
3038         spa_t *spa = vd->vdev_spa;
3039         vdev_t *lvd;
3040         metaslab_t *msp;
3041 
3042         ASSERT3U(txg, ==, spa->spa_syncing_txg);
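              /*
               * The tx created here is committed on every path out of
               * this function, including the early return for indirect
               * vdevs.
               */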
3043         dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3044         if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
3045                 ASSERT(vd->vdev_removing ||
3046                     vd->vdev_ops == &vdev_indirect_ops);
3047 
3048                 vdev_indirect_sync_obsolete(vd, tx);
3049 
3050                 /*
3051                  * If the vdev is indirect, it can't have dirty
3052                  * metaslabs or DTLs.
3053                  */
3054                 if (vd->vdev_ops == &vdev_indirect_ops) {
3055                         ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
3056                         ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
3057                         dmu_tx_commit(tx);
3058                         return;
3059                 }
3060         }
3061 
3062         ASSERT(vdev_is_concrete(vd));
3063 
3064         if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
3065             !vd->vdev_removing) {
3066                 ASSERT(vd == vd->vdev_top);
3067                 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
3068                 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
3069                     DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
3070                 ASSERT(vd->vdev_ms_array != 0);
3071                 vdev_config_dirty(vd);
3072         }
3073 
3074         while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
3075                 metaslab_sync(msp, txg);
3076                 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
3077         }
3078 
3079         while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
3080                 vdev_dtl_sync(lvd, txg);
3081 
3082         /*
3083          * If this is an empty log device being removed, destroy the
3084          * metadata associated with it.
3085          */
3086         if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
3087                 vdev_remove_empty_log(vd, txg);
3088 
3089         (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
3090         dmu_tx_commit(tx);
3091 }
3092 
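      /*
       * Convert a physical size into the allocated size for this vdev,
       * delegating to the vdev-type-specific asize method.
       */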
3093 uint64_t
3094 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
3095 {
3096         return (vd->vdev_ops->vdev_op_asize(vd, psize));
3097 }
3098 
3099 /*
3100  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
3101  * not be opened, and no I/O is attempted.
3102  */
3103 int
3104 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
3105 {
3106         vdev_t *vd, *tvd;
3107 
3108         spa_vdev_state_enter(spa, SCL_NONE);
3109 
3110         if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 
3300                  * If the top-level is a slog and it has had allocations
3301                  * then proceed.  We check that the vdev's metaslab group
3302                  * is not NULL since it's possible that we may have just
3303                  * added this vdev but not yet initialized its metaslabs.
3304                  */
3305                 if (tvd->vdev_islog && mg != NULL) {
3306                         /*
3307                          * Prevent any future allocations.
3308                          */
3309                         metaslab_group_passivate(mg);
3310                         (void) spa_vdev_state_exit(spa, vd, 0);
3311 
3312                         error = spa_reset_logs(spa);
3313 
3314                         /*
3315                          * If the log device was successfully reset but has
3316                          * checkpointed data, do not offline it.
3317                          */
3318                         if (error == 0 &&
3319                             tvd->vdev_checkpoint_sm != NULL) {
3320                                 error = ZFS_ERR_CHECKPOINT_EXISTS;
3321                         }
3322 
3323                         spa_vdev_state_enter(spa, SCL_ALLOC);
3324 
3325                         /*
3326                          * Check to see if the config has changed.
3327                          */
3328                         if (error || generation != spa->spa_config_generation) {
3329                                 metaslab_group_activate(mg);
3330                                 if (error)
3331                                         return (spa_vdev_state_exit(spa,
3332                                             vd, error));
3333                                 (void) spa_vdev_state_exit(spa, vd, 0);
3334                                 goto top;
3335                         }
3336                         ASSERT0(tvd->vdev_stat.vs_alloc);
3337                 }
3338 
3339                 /*
 