768         if (expected_refcount != actual_refcount) {
 769                 (void) printf("space map refcount mismatch: expected %lld != "
 770                     "actual %lld\n",
 771                     (longlong_t)expected_refcount,
 772                     (longlong_t)actual_refcount);
 773                 return (2);
 774         }
 775         return (0);
 776 }
 777 
 778 static void
 779 dump_spacemap(objset_t *os, space_map_t *sm)
 780 {
 781         char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 782             "INVALID", "INVALID", "INVALID", "INVALID" };
 783 
 784         if (sm == NULL)
 785                 return;
 786 
 787         (void) printf("space map object %llu:\n",
 788             (longlong_t)sm->sm_phys->smp_object);
 789         (void) printf("  smp_objsize = 0x%llx\n",
 790             (longlong_t)sm->sm_phys->smp_objsize);
 791         (void) printf("  smp_alloc = 0x%llx\n",
 792             (longlong_t)sm->sm_phys->smp_alloc);
 793 
 794         /*
 795          * Print out the freelist entries in both encoded and decoded form.
 796          */
 797         uint8_t mapshift = sm->sm_shift;
 798         int64_t alloc = 0;
 799         uint64_t word;
 800         for (uint64_t offset = 0; offset < space_map_length(sm);
 801             offset += sizeof (word)) {
 802 
 803                 VERIFY0(dmu_read(os, space_map_object(sm), offset,
 804                     sizeof (word), &word, DMU_READ_PREFETCH));
 805 
 806                 if (sm_entry_is_debug(word)) {
 807                         (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 808                             (u_longlong_t)(offset / sizeof (word)),
 809                             ddata[SM_DEBUG_ACTION_DECODE(word)],
 810                             (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
 811                             (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
 812                         continue;
 813                 }
 814 
 815                 uint8_t words;
 816                 char entry_type;
 817                 uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 818 
 819                 if (sm_entry_is_single_word(word)) {
 820                         entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 821                             'A' : 'F';
 822                         entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
 823                             sm->sm_start;
 824                         entry_run = SM_RUN_DECODE(word) << mapshift;
 825                         words = 1;
 826                 } else {
 827                         /* it is a two-word entry so we read another word */
 828                         ASSERT(sm_entry_is_double_word(word));
 829 
 830                         uint64_t extra_word;
 831                         offset += sizeof (extra_word);
 832                         VERIFY0(dmu_read(os, space_map_object(sm), offset,
 833                             sizeof (extra_word), &extra_word,
 834                             DMU_READ_PREFETCH));
 835 
 836                         ASSERT3U(offset, <=, space_map_length(sm));
 837 
 838                         entry_run = SM2_RUN_DECODE(word) << mapshift;
 839                         entry_vdev = SM2_VDEV_DECODE(word);
 840                         entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 841                             'A' : 'F';
 842                         entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 843                             mapshift) + sm->sm_start;
 844                         words = 2;
 845                 }
 846 
 847                 (void) printf("\t    [%6llu]    %c  range:"
 848                     " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 849                     (u_longlong_t)(offset / sizeof (word)),
 850                     entry_type, (u_longlong_t)entry_off,
 851                     (u_longlong_t)(entry_off + entry_run),
 852                     (u_longlong_t)entry_run,
 853                     (u_longlong_t)entry_vdev, words);
 854 
 855                 if (entry_type == 'A')
 856                         alloc += entry_run;
 857                 else
 858                         alloc -= entry_run;
 859         }
 860         if ((uint64_t)alloc != space_map_allocated(sm)) {
 861                 (void) printf("space_map_object alloc (%lld) INCONSISTENT "
 862                     "with space map summary (%lld)\n",
 863                     (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 864         }
 865 }
 866 
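     /*
      * Print a summary of a loaded metaslab's allocatable space, including
      * its largest free block and the percentage of free space.
      */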
 867 static void
 868 dump_metaslab_stats(metaslab_t *msp)
 869 {
 870         char maxbuf[32];
 871         range_tree_t *rt = msp->ms_allocatable;
 872         avl_tree_t *t = &msp->ms_allocatable_by_size;
 873         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 874 
 875         /* make sure nicenum has enough space */
 876         CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
 877 
 878         zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
 879 
 880         (void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 
 904                 mutex_enter(&msp->ms_lock);
 905                 VERIFY0(metaslab_load(msp));
 906                 range_tree_stat_verify(msp->ms_allocatable);
 907                 dump_metaslab_stats(msp);
 908                 metaslab_unload(msp);
 909                 mutex_exit(&msp->ms_lock);
 910         }
 911 
 912         if (dump_opt['m'] > 1 && sm != NULL &&
 913             spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 914                 /*
 915                  * The space map histogram represents free space in chunks
 916                  * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 917                  */
 918                 (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 919                     (u_longlong_t)msp->ms_fragmentation);
 920                 dump_histogram(sm->sm_phys->smp_histogram,
 921                     SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 922         }
 923 
 924         if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
 925                 ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 926 
 927                 dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 928         }
 929 }
 930 
 931 static void
 932 print_vdev_metaslab_header(vdev_t *vd)
 933 {
 934         vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 935         const char *bias_str;
 936 
 937         bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
 938             VDEV_ALLOC_BIAS_LOG :
 939             (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
 940             (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP :
 941             vd->vdev_islog ? "log" : "";
 942 
 943         (void) printf("\tvdev %10llu   %s\n"
 944             "\t%-10s%5llu   %-19s   %-15s   %-12s\n",
 945             (u_longlong_t)vd->vdev_id, bias_str,
 946             "metaslabs", (u_longlong_t)vd->vdev_ms_count,
 947             "offset", "spacemap", "free");
 948         (void) printf("\t%15s   %19s   %15s   %12s\n",
 
3081 static void
3082 zdb_leak(void *arg, uint64_t start, uint64_t size)
3083 {
3084         vdev_t *vd = arg;
3085 
3086         (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
3087             (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
3088 }
3089 
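     /*
      * Stub metaslab ops with no allocator.  zdb_leak_init() installs these
      * so the allocator cannot use the ms_allocatable trees while they are
      * repurposed for leak detection.
      */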
3090 static metaslab_ops_t zdb_metaslab_ops = {
3091         NULL    /* alloc */
3092 };
3093 
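     /*
      * Walk the on-disk DDT and account for deduplication: for every entry
      * referenced more than once, add the space saved to zcb_dedup_asize
      * and zcb_dedup_blocks, and, unless leak detection is disabled
      * (dump_opt['L']), load the entry into the in-core DDT so the later
      * block traversal can find it.
      */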
3094 static void
3095 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
3096 {
3097         ddt_bookmark_t ddb;
3098         ddt_entry_t dde;
3099         int error;
3100 
3101         bzero(&ddb, sizeof (ddb));
3102         while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
3103                 blkptr_t blk;
3104                 ddt_phys_t *ddp = dde.dde_phys;
3105 
3106                 if (ddb.ddb_class == DDT_CLASS_UNIQUE)
3107                         return;
3108 
3109                 ASSERT(ddt_phys_total_refcnt(&dde) > 1);
3110 
3111                 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
3112                         if (ddp->ddp_phys_birth == 0)
3113                                 continue;
3114                         ddt_bp_create(ddb.ddb_checksum,
3115                             &dde.dde_key, ddp, &blk);
3116                         if (p == DDT_PHYS_DITTO) {
3117                                 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
3118                         } else {
3119                                 zcb->zcb_dedup_asize +=
3120                                     BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
3121                                 zcb->zcb_dedup_blocks++;
3122                         }
3123                 }
3124                 if (!dump_opt['L']) {
3125                         ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
3126                         ddt_enter(ddt);
3127                         VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
3128                         ddt_exit(ddt);
3129                 }
3130         }
3131 
3132         ASSERT(error == ENOENT);
3133 }
3134 
3135 /* ARGSUSED */
3136 static void
3137 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3138     uint64_t size, void *arg)
3139 {
3140         /*
3141          * This callback was called through a remap from
3142          * a device being removed. Therefore, the vdev that
3143          * this callback is applied to is a concrete
3144          * vdev.
3145          */
3146         ASSERT(vdev_is_concrete(vd));
3147 
3148         VERIFY0(metaslab_claim_impl(vd, offset, size,
3149             spa_min_claim_txg(vd->vdev_spa)));
3150 }
3151 
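     /*
      * Claim a segment of an indirect vdev by remapping it onto the
      * concrete vdev(s) backing it and claiming each resulting segment
      * (see claim_segment_impl_cb() above).
      */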
3152 static void
3153 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
3154 {
3155         vdev_t *vd = arg;
3156 
3157         vdev_indirect_ops.vdev_op_remap(vd, offset, size,
3158             claim_segment_impl_cb, NULL);
3159 }
3160 
3161 /*
3162  * After accounting for all allocated blocks that are directly referenced,
3163  * we might have missed a reference to a block from a partially complete
3164  * (and thus unused) indirect mapping object. We perform a secondary pass
3165  * through the metaslabs we have already mapped and claim the destination
3166  * blocks.
3167  */
3168 static void
3169 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
3170 {
3171         if (spa->spa_vdev_removal == NULL)
3172                 return;
3173 
3174         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3175 
3176         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
3177         vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
3178         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3179 
3180         for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
3181                 metaslab_t *msp = vd->vdev_ms[msi];
3182 
3183                 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
3184                         break;
3185 
3186                 ASSERT0(range_tree_space(svr->svr_allocd_segs));
3187 
3188                 if (msp->ms_sm != NULL) {
3189                         VERIFY0(space_map_load(msp->ms_sm,
3190                             svr->svr_allocd_segs, SM_ALLOC));
 
3242 static uint32_t *
3243 zdb_load_obsolete_counts(vdev_t *vd)
3244 {
3245         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3246         spa_t *spa = vd->vdev_spa;
3247         spa_condensing_indirect_phys_t *scip =
3248             &spa->spa_condensing_indirect_phys;
3249         uint32_t *counts;
3250 
3251         EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
3252         counts = vdev_indirect_mapping_load_obsolete_counts(vim);
3253         if (vd->vdev_obsolete_sm != NULL) {
3254                 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3255                     vd->vdev_obsolete_sm);
3256         }
3257         if (scip->scip_vdev == vd->vdev_id &&
3258             scip->scip_prev_obsolete_sm_object != 0) {
3259                 space_map_t *prev_obsolete_sm = NULL;
3260                 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
3261                     scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
3262                 space_map_update(prev_obsolete_sm);
3263                 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3264                     prev_obsolete_sm);
3265                 space_map_close(prev_obsolete_sm);
3266         }
3267         return (counts);
3268 }
3269 
3270 typedef struct checkpoint_sm_exclude_entry_arg {
3271         vdev_t *cseea_vd;
3272         uint64_t cseea_checkpoint_size;
3273 } checkpoint_sm_exclude_entry_arg_t;
3274 
3275 static int
3276 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
3277 {
3278         checkpoint_sm_exclude_entry_arg_t *cseea = arg;
3279         vdev_t *vd = cseea->cseea_vd;
3280         metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
3281         uint64_t end = sme->sme_offset + sme->sme_run;
3282 
 
3336          * 2] There is a checkpoint, but no checkpointed blocks
3337          *    have been freed yet
3338          * 3] The current vdev is indirect
3339          *
3340          * In these cases we return immediately.
3341          */
3342         if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
3343             VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
3344                 return;
3345 
3346         VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
3347             VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
3348             &checkpoint_sm_obj));
3349 
3350         checkpoint_sm_exclude_entry_arg_t cseea;
3351         cseea.cseea_vd = vd;
3352         cseea.cseea_checkpoint_size = 0;
3353 
3354         VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
3355             checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
3356         space_map_update(checkpoint_sm);
3357 
3358         VERIFY0(space_map_iterate(checkpoint_sm,
3359             checkpoint_sm_exclude_entry_cb, &cseea));
3360         space_map_close(checkpoint_sm);
3361 
3362         zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
3363 }
3364 
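     /*
      * Exclude checkpointed blocks from leak detection on every top-level
      * vdev; see zdb_leak_init_vdev_exclude_checkpoint().
      */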
3365 static void
3366 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
3367 {
3368         vdev_t *rvd = spa->spa_root_vdev;
3369         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3370                 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
3371                 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
3372         }
3373 }
3374 
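     /*
      * For each metaslab of every concrete top-level vdev, load the
      * entries of the metaslab's space map into its ms_allocatable tree
      * using the given maptype (SM_ALLOC when doing leak detection).
      */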
3375 static void
3376 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
3377 {
3378         vdev_t *rvd = spa->spa_root_vdev;
3379         for (uint64_t i = 0; i < rvd->vdev_children; i++) {
3380                 vdev_t *vd = rvd->vdev_child[i];
3381 
3382                 ASSERT3U(i, ==, vd->vdev_id);
3383 
3384                 if (vd->vdev_ops == &vdev_indirect_ops)
3385                         continue;
3386 
3387                 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 
3444                 if (ent_offset >= msp->ms_start + msp->ms_size)
3445                         break;
3446 
3447                 /*
3448                  * Mappings do not cross metaslab boundaries,
3449                  * because we create them by walking the metaslabs.
3450                  */
3451                 ASSERT3U(ent_offset + ent_len, <=,
3452                     msp->ms_start + msp->ms_size);
3453                 range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
3454         }
3455 
3456         if (!msp->ms_loaded)
3457                 msp->ms_loaded = B_TRUE;
3458         mutex_exit(&msp->ms_lock);
3459 }
3460 
3461 static void
3462 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
3463 {
3464         vdev_t *rvd = spa->spa_root_vdev;
3465         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3466                 vdev_t *vd = rvd->vdev_child[c];
3467 
3468                 ASSERT3U(c, ==, vd->vdev_id);
3469 
3470                 if (vd->vdev_ops != &vdev_indirect_ops)
3471                         continue;
3472 
3473                 /*
3474                  * Note: we don't check for mapping leaks on
3475                  * removing vdevs because their ms_allocatable trees
3476                  * are used to look for leaks in allocated space.
3477                  */
3478                 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
3479 
3480                 /*
3481                  * Normally, indirect vdevs don't have any
3482                  * metaslabs.  We want to set them up for
3483                  * zio_claim().
 
3490 
3491                         (void) fprintf(stderr,
3492                             "\rloading indirect vdev %llu, "
3493                             "metaslab %llu of %llu ...",
3494                             (longlong_t)vd->vdev_id,
3495                             (longlong_t)vd->vdev_ms[m]->ms_id,
3496                             (longlong_t)vd->vdev_ms_count);
3497 
3498                         load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
3499                             &vim_idx);
3500                 }
3501                 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
3502         }
3503 }
3504 
3505 static void
3506 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
3507 {
3508         zcb->zcb_spa = spa;
3509 
3510         if (!dump_opt['L']) {
3511                 dsl_pool_t *dp = spa->spa_dsl_pool;
3512                 vdev_t *rvd = spa->spa_root_vdev;
3513 
3514                 /*
3515                  * We are going to be changing the meaning of the metaslab's
3516                  * ms_allocatable.  Ensure that the allocator doesn't try to
3517                  * use the tree.
3518                  */
3519                 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
3520                 spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
3521 
3522                 zcb->zcb_vd_obsolete_counts =
3523                     umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
3524                     UMEM_NOFAIL);
3525 
3526                 /*
3527                  * For leak detection, we overload the ms_allocatable trees
3528                  * to contain allocated segments instead of free segments.
3529                  * As a result, we can't use the normal metaslab_load/unload
3530                  * interfaces.
 
3535                 /*
3536                  * On load_concrete_ms_allocatable_trees() we loaded all the
3537                  * allocated entries from the ms_sm to the ms_allocatable for
3538                  * each metaslab. If the pool has a checkpoint or is in the
3539                  * middle of discarding a checkpoint, some of these blocks
3540                  * may have been freed but their ms_sm may not have been
3541                  * updated because they are referenced by the checkpoint. In
3542                  * order to avoid false-positives during leak-detection, we
3543                  * go through the vdev's checkpoint space map and exclude all
3544                  * its entries from their relevant ms_allocatable.
3545                  *
3546                  * We also aggregate the space held by the checkpoint and add
3547                  * it to zcb_checkpoint_size.
3548                  *
3549                  * Note that at this point we are also verifying that all the
3550                  * entries on the checkpoint_sm are marked as allocated in
3551                  * the ms_sm of their relevant metaslab.
3552                  * [see comment in checkpoint_sm_exclude_entry_cb()]
3553                  */
3554                 zdb_leak_init_exclude_checkpoint(spa, zcb);
3555 
3556                 /* for cleaner progress output */
3557                 (void) fprintf(stderr, "\n");
3558 
3559                 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3560                         ASSERT(spa_feature_is_enabled(spa,
3561                             SPA_FEATURE_DEVICE_REMOVAL));
3562                         (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
3563                             increment_indirect_mapping_cb, zcb, NULL);
3564                 }
3565         } else {
3566                 /*
3567                  * If leak tracing is disabled, we still need to consider
3568                  * any checkpointed space in our space verification.
3569                  */
3570                 zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
3571         }
3572 
3573         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3574         zdb_ddt_leak_init(spa, zcb);
3575         spa_config_exit(spa, SCL_CONFIG, FTAG);
3576 }
3577 
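     /*
      * Check the indirect vdev's mapping for entries whose obsolete-space
      * accounting does not match what the traversal observed, and report
      * any mismatch.  Returns B_TRUE if the mismatch is treated as a leak.
      */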
3578 static boolean_t
3579 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
3580 {
3581         boolean_t leaks = B_FALSE;
3582         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3583         uint64_t total_leaked = 0;
3584 
3585         ASSERT(vim != NULL);
3586 
3587         for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
3588                 vdev_indirect_mapping_entry_phys_t *vimep =
3589                     &vim->vim_entries[i];
3590                 uint64_t obsolete_bytes = 0;
3591                 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 
3631                     (u_longlong_t)vd->vdev_id, pct_leaked,
3632                     (u_longlong_t)total_leaked);
3633         } else if (total_leaked > 0) {
3634                 (void) printf("obsolete indirect mapping count mismatch "
3635                     "for vdev %llu -- %llx total bytes mismatched\n",
3636                     (u_longlong_t)vd->vdev_id,
3637                     (u_longlong_t)total_leaked);
3638                 leaks |= B_TRUE;
3639         }
3640 
3641         vdev_indirect_mapping_free_obsolete_counts(vim,
3642             zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3643         zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
3644 
3645         return (leaks);
3646 }
3647 
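     /*
      * Report any space still present in the ms_allocatable trees as
      * leaked, empty the trees, and release the per-vdev obsolete counts.
      * Returns B_TRUE if any leaks or mismatches were found.
      */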
3648 static boolean_t
3649 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
3650 {
3651         boolean_t leaks = B_FALSE;
3652         if (!dump_opt['L']) {
3653                 vdev_t *rvd = spa->spa_root_vdev;
3654                 for (unsigned c = 0; c < rvd->vdev_children; c++) {
3655                         vdev_t *vd = rvd->vdev_child[c];
3656                         metaslab_group_t *mg = vd->vdev_mg;
3657 
3658                         if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
3659                                 leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
3660                         }
3661 
3662                         for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3663                                 metaslab_t *msp = vd->vdev_ms[m];
3664                                 ASSERT3P(mg, ==, msp->ms_group);
3665 
3666                                 /*
3667                                  * ms_allocatable has been overloaded
3668                                  * to contain allocated segments. Now that
3669                                  * we finished traversing all blocks, any
3670                                  * block that remains in the ms_allocatable
3671                                  * represents an allocated block that we
3672                                  * did not claim during the traversal.
3673                                  * Claimed blocks would have been removed
3674                                  * from the ms_allocatable.  For indirect
3675                                  * vdevs, space remaining in the tree
3676                                  * represents parts of the mapping that are
3677                                  * not referenced, which is not a bug.
3678                                  */
3679                                 if (vd->vdev_ops == &vdev_indirect_ops) {
3680                                         range_tree_vacate(msp->ms_allocatable,
3681                                             NULL, NULL);
3682                                 } else {
3683                                         range_tree_vacate(msp->ms_allocatable,
3684                                             zdb_leak, vd);
3685                                 }
3686 
3687                                 if (msp->ms_loaded) {
3688                                         msp->ms_loaded = B_FALSE;
3689                                 }
3690                         }
3691                 }
3692 
3693                 umem_free(zcb->zcb_vd_obsolete_counts,
3694                     rvd->vdev_children * sizeof (uint32_t *));
3695                 zcb->zcb_vd_obsolete_counts = NULL;
3696         }
3697         return (leaks);
3698 }
3699 
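     /*
      * Iteration callback for the deferred-free bpobj, the free bpobj, and
      * the async-destroy bptree: count each block pointer as
      * ZDB_OT_DEFERRED.
      */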
3700 /* ARGSUSED */
3701 static int
3702 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3703 {
3704         zdb_cb_t *zcb = arg;
3705 
3706         if (dump_opt['b'] >= 5) {
3707                 char blkbuf[BP_SPRINTF_LEN];
3708                 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3709                 (void) printf("[%s] %s\n",
3710                     "deferred free", blkbuf);
3711         }
3712         zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
3713         return (0);
3714 }
3715 
3716 static int
3717 dump_block_stats(spa_t *spa)
3718 {
3719         zdb_cb_t zcb;
3720         zdb_blkstats_t *zb, *tzb;
3721         uint64_t norm_alloc, norm_space, total_alloc, total_found;
3722         int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
3723         boolean_t leaks = B_FALSE;
3724         int err;
3725 
3726         bzero(&zcb, sizeof (zcb));
3727         (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
3728             (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
3729             (dump_opt['c'] == 1) ? "metadata " : "",
3730             dump_opt['c'] ? "checksums " : "",
3731             (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
3732             !dump_opt['L'] ? "nothing leaked " : "");
3733 
3734         /*
3735          * Load all space maps as SM_ALLOC maps, then traverse the pool
3736          * claiming each block we discover.  If the pool is perfectly
3737          * consistent, the space maps will be empty when we're done.
3738          * Anything left over is a leak; any block we can't claim (because
3739          * it's not part of any space map) is a double allocation,
3740          * reference to a freed block, or an unclaimed log block.
3741          */
3742         zdb_leak_init(spa, &zcb);
3743 
3744         /*
3745          * If there's a deferred-free bplist, process that first.
3746          */
3747         (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
3748             count_block_cb, &zcb, NULL);
3749 
3750         if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3751                 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
3752                     count_block_cb, &zcb, NULL);
3753         }
3754 
3755         zdb_claim_removing(spa, &zcb);
3756 
3757         if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
3758                 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
3759                     spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
3760                     &zcb, NULL));
3761         }
 
3800                 }
3801         }
3802 
3803         /*
3804          * Report any leaked segments.
3805          */
3806         leaks |= zdb_leak_fini(spa, &zcb);
3807 
3808         tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
3809 
3810         norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
3811         norm_space = metaslab_class_get_space(spa_normal_class(spa));
3812 
3813         total_alloc = norm_alloc +
3814             metaslab_class_get_alloc(spa_log_class(spa)) +
3815             metaslab_class_get_alloc(spa_special_class(spa)) +
3816             metaslab_class_get_alloc(spa_dedup_class(spa));
3817         total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
3818             zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
3819 
3820         if (total_found == total_alloc) {
3821                 if (!dump_opt['L'])
3822                         (void) printf("\n\tNo leaks (block sum matches space"
3823                             " maps exactly)\n");
3824         } else {
3825                 (void) printf("block traversal size %llu != alloc %llu "
3826                     "(%s %lld)\n",
3827                     (u_longlong_t)total_found,
3828                     (u_longlong_t)total_alloc,
3829                     (dump_opt['L']) ? "unreachable" : "leaked",
3830                     (longlong_t)(total_alloc - total_found));
3831                 leaks = B_TRUE;
3832         }
3833 
3834         if (tzb->zb_count == 0)
3835                 return (2);
3836 
3837         (void) printf("\n");
3838         (void) printf("\t%-16s %14llu\n", "bp count:",
3839             (u_longlong_t)tzb->zb_count);
3840         (void) printf("\t%-16s %14llu\n", "ganged count:",
3841             (u_longlong_t)tzb->zb_gangs);
3842         (void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
3843             (u_longlong_t)tzb->zb_lsize,
3844             (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 
4144         int ret = 0;
4145 
4146         spa_condensing_indirect_phys_t *scip =
4147             &spa->spa_condensing_indirect_phys;
4148         if (scip->scip_next_mapping_object != 0) {
4149                 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
4150                 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
4151                 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
4152 
4153                 (void) printf("Condensing indirect vdev %llu: new mapping "
4154                     "object %llu, prev obsolete sm %llu\n",
4155                     (u_longlong_t)scip->scip_vdev,
4156                     (u_longlong_t)scip->scip_next_mapping_object,
4157                     (u_longlong_t)scip->scip_prev_obsolete_sm_object);
4158                 if (scip->scip_prev_obsolete_sm_object != 0) {
4159                         space_map_t *prev_obsolete_sm = NULL;
4160                         VERIFY0(space_map_open(&prev_obsolete_sm,
4161                             spa->spa_meta_objset,
4162                             scip->scip_prev_obsolete_sm_object,
4163                             0, vd->vdev_asize, 0));
4164                         space_map_update(prev_obsolete_sm);
4165                         dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
4166                         (void) printf("\n");
4167                         space_map_close(prev_obsolete_sm);
4168                 }
4169 
4170                 scip_count += 2;
4171         }
4172 
4173         for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
4174                 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
4175                 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
4176 
4177                 if (vic->vic_mapping_object != 0) {
4178                         ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
4179                             vd->vdev_removing);
4180                         indirect_vdev_count++;
4181 
4182                         if (vd->vdev_indirect_mapping->vim_havecounts) {
4183                                 obsolete_counts_count++;
4184                         }
 
4350                 (void) fprintf(stderr,
4351                     "\rverifying vdev %llu, space map entry %llu of %llu ...",
4352                     (longlong_t)vd->vdev_id,
4353                     (longlong_t)vcsec->vcsec_entryid,
4354                     (longlong_t)vcsec->vcsec_num_entries);
4355         }
4356         vcsec->vcsec_entryid++;
4357 
4358         /*
4359          * See comment in checkpoint_sm_exclude_entry_cb()
4360          */
4361         VERIFY3U(sme->sme_offset, >=, ms->ms_start);
4362         VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4363 
4364         /*
4365          * The entries in the vdev_checkpoint_sm should be marked as
4366          * allocated in the checkpointed state of the pool, therefore
4367                  * their respective ms_allocatable trees should not contain them.
4368          */
4369         mutex_enter(&ms->ms_lock);
4370         range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
4371         mutex_exit(&ms->ms_lock);
4372 
4373         return (0);
4374 }
4375 
4376 /*
4377  * Verify that all segments in the vdev_checkpoint_sm are allocated
4378  * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
4379  * ms_allocatable).
4380  *
4381  * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
4382  * each vdev in the current state of the pool to the metaslab space maps
4383  * (ms_sm) of the checkpointed state of the pool.
4384  *
4385  * Note that the function changes the state of the ms_allocatable
4386  * trees of the current spa_t. The entries of these ms_allocatable
4387  * trees are cleared out and then repopulated with the free
4388  * entries of their respective ms_sm space maps.
4389  */
4390 static void
 
 
4413                         continue;
4414                 }
4415 
4416                 /*
4417                  * If the checkpoint space map doesn't exist, then nothing
4418                  * here is checkpointed so there's nothing to verify.
4419                  */
4420                 if (current_vd->vdev_top_zap == 0 ||
4421                     zap_contains(spa_meta_objset(current),
4422                     current_vd->vdev_top_zap,
4423                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4424                         continue;
4425 
4426                 VERIFY0(zap_lookup(spa_meta_objset(current),
4427                     current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4428                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
4429 
4430                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
4431                     checkpoint_sm_obj, 0, current_vd->vdev_asize,
4432                     current_vd->vdev_ashift));
4433                 space_map_update(checkpoint_sm);
4434 
4435                 verify_checkpoint_sm_entry_cb_arg_t vcsec;
4436                 vcsec.vcsec_vd = ckpoint_vd;
4437                 vcsec.vcsec_entryid = 0;
4438                 vcsec.vcsec_num_entries =
4439                     space_map_length(checkpoint_sm) / sizeof (uint64_t);
4440                 VERIFY0(space_map_iterate(checkpoint_sm,
4441                     verify_checkpoint_sm_entry_cb, &vcsec));
4442                 dump_spacemap(current->spa_meta_objset, checkpoint_sm);
4443                 space_map_close(checkpoint_sm);
4444         }
4445 
4446         /*
4447          * If we've added vdevs since we took the checkpoint, ensure
4448          * that their checkpoint space maps are empty.
4449          */
4450         if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
4451                 for (uint64_t c = ckpoint_rvd->vdev_children;
4452                     c < current_rvd->vdev_children; c++) {
4453                         vdev_t *current_vd = current_rvd->vdev_child[c];
4454                         ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
4455                 }
4456         }
4457 
4458         /* for cleaner progress output */
4459         (void) fprintf(stderr, "\n");
4460 }
 
4500                             "\rverifying vdev %llu of %llu, "
4501                             "metaslab %llu of %llu ...",
4502                             (longlong_t)current_vd->vdev_id,
4503                             (longlong_t)current_rvd->vdev_children,
4504                             (longlong_t)current_vd->vdev_ms[m]->ms_id,
4505                             (longlong_t)current_vd->vdev_ms_count);
4506 
4507                         /*
4508                          * We walk through the ms_allocatable trees that
4509                          * are loaded with the allocated blocks from the
4510                          * ms_sm spacemaps of the checkpoint. For each of
4511                          * these ranges we ensure that it does not appear
4512                          * in the ms_allocatable trees of the current
4513                          * state, which are loaded with the ranges that
4514                          * are currently free.
4515                          *
4516                          * This way we ensure that none of the blocks that
4517                          * are part of the checkpoint were freed by mistake.
4518                          */
4519                         range_tree_walk(ckpoint_msp->ms_allocatable,
4520                             (range_tree_func_t *)range_tree_verify,
4521                             current_msp->ms_allocatable);
4522                 }
4523         }
4524 
4525         /* for cleaner progress output */
4526         (void) fprintf(stderr, "\n");
4527 }
4528 
4529 static void
4530 verify_checkpoint_blocks(spa_t *spa)
4531 {
4532         spa_t *checkpoint_spa;
4533         char *checkpoint_pool;
4534         nvlist_t *config = NULL;
4535         int error = 0;
4536 
4537         /*
4538          * We import the checkpointed state of the pool (under a different
4539          * name) so we can do verification on it against the current state
4540          * of the pool.
4541          */
4542         checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
4543             NULL);
4544         ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
4545 
4546         error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
4547         if (error != 0) {
4548                 fatal("Tried to open pool \"%s\" but spa_open() failed with "
4549                     "error %d\n", checkpoint_pool, error);
4550         }
4551 
 
4577 
4578         for (uint64_t i = 0; i < rvd->vdev_children; i++) {
4579                 vdev_t *vd = rvd->vdev_child[i];
4580 
4581                 space_map_t *checkpoint_sm = NULL;
4582                 uint64_t checkpoint_sm_obj;
4583 
4584                 if (vd->vdev_top_zap == 0)
4585                         continue;
4586 
4587                 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4588                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4589                         continue;
4590 
4591                 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4592                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4593                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
4594 
4595                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4596                     checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4597                 space_map_update(checkpoint_sm);
4598                 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
4599                 space_map_close(checkpoint_sm);
4600         }
4601 }
4602 
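     /*
      * Top-level checkpoint verification: if the pool_checkpoint feature is
      * active, read the checkpointed uberblock from the MOS and verify the
      * checkpointed state against the current state of the pool.
      */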
4603 static int
4604 verify_checkpoint(spa_t *spa)
4605 {
4606         uberblock_t checkpoint;
4607         int error;
4608 
4609         if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
4610                 return (0);
4611 
4612         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4613             DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4614             sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4615 
4616         if (error == ENOENT && !dump_opt['L']) {
4617                 /*
 
 | 
 
 
 768         if (expected_refcount != actual_refcount) {
 769                 (void) printf("space map refcount mismatch: expected %lld != "
 770                     "actual %lld\n",
 771                     (longlong_t)expected_refcount,
 772                     (longlong_t)actual_refcount);
 773                 return (2);
 774         }
 775         return (0);
 776 }
 777 
 778 static void
 779 dump_spacemap(objset_t *os, space_map_t *sm)
 780 {
 781         char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 782             "INVALID", "INVALID", "INVALID", "INVALID" };
 783 
 784         if (sm == NULL)
 785                 return;
 786 
 787         (void) printf("space map object %llu:\n",
 788             (longlong_t)sm->sm_object);
 789         (void) printf("  smp_length = 0x%llx\n",
 790             (longlong_t)sm->sm_phys->smp_length);
 791         (void) printf("  smp_alloc = 0x%llx\n",
 792             (longlong_t)sm->sm_phys->smp_alloc);
 793 
 794         if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
 795                 return;
 796 
 797         /*
 798          * Print out the freelist entries in both encoded and decoded form.
 799          */
 800         uint8_t mapshift = sm->sm_shift;
 801         int64_t alloc = 0;
 802         uint64_t word, entry_id = 0;
 803         for (uint64_t offset = 0; offset < space_map_length(sm);
 804             offset += sizeof (word)) {
 805 
 806                 VERIFY0(dmu_read(os, space_map_object(sm), offset,
 807                     sizeof (word), &word, DMU_READ_PREFETCH));
 808 
 809                 if (sm_entry_is_debug(word)) {
 810                         (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
 811                             (u_longlong_t)entry_id,
 812                             ddata[SM_DEBUG_ACTION_DECODE(word)],
 813                             (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
 814                             (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
 815                         entry_id++;
 816                         continue;
 817                 }
 818 
 819                 uint8_t words;
 820                 char entry_type;
 821                 uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 822 
 823                 if (sm_entry_is_single_word(word)) {
 824                         entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 825                             'A' : 'F';
 826                         entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
 827                             sm->sm_start;
 828                         entry_run = SM_RUN_DECODE(word) << mapshift;
 829                         words = 1;
 830                 } else {
 831                         /* it is a two-word entry so we read another word */
 832                         ASSERT(sm_entry_is_double_word(word));
 833 
 834                         uint64_t extra_word;
 835                         offset += sizeof (extra_word);
 836                         VERIFY0(dmu_read(os, space_map_object(sm), offset,
 837                             sizeof (extra_word), &extra_word,
 838                             DMU_READ_PREFETCH));
 839 
 840                         ASSERT3U(offset, <=, space_map_length(sm));
 841 
 842                         entry_run = SM2_RUN_DECODE(word) << mapshift;
 843                         entry_vdev = SM2_VDEV_DECODE(word);
 844                         entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 845                             'A' : 'F';
 846                         entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 847                             mapshift) + sm->sm_start;
 848                         words = 2;
 849                 }
 850 
 851                 (void) printf("\t    [%6llu]    %c  range:"
 852                     " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 853                     (u_longlong_t)entry_id,
 854                     entry_type, (u_longlong_t)entry_off,
 855                     (u_longlong_t)(entry_off + entry_run),
 856                     (u_longlong_t)entry_run,
 857                     (u_longlong_t)entry_vdev, words);
 858 
 859                 if (entry_type == 'A')
 860                         alloc += entry_run;
 861                 else
 862                         alloc -= entry_run;
 863                 entry_id++;
 864         }
 865         if (alloc != space_map_allocated(sm)) {
 866                 (void) printf("space_map_object alloc (%lld) INCONSISTENT "
 867                     "with space map summary (%lld)\n",
 868                     (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 869         }
 870 }
 871 
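     /*
      * Print a summary of a loaded metaslab's allocatable space, including
      * its largest free block and the percentage of free space.
      */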
 872 static void
 873 dump_metaslab_stats(metaslab_t *msp)
 874 {
 875         char maxbuf[32];
 876         range_tree_t *rt = msp->ms_allocatable;
 877         avl_tree_t *t = &msp->ms_allocatable_by_size;
 878         int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 879 
 880         /* make sure nicenum has enough space */
 881         CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
 882 
 883         zdb_nicenum(metaslab_block_maxsize(msp), maxbuf, sizeof (maxbuf));
 884 
 885         (void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 
 909                 mutex_enter(&msp->ms_lock);
 910                 VERIFY0(metaslab_load(msp));
 911                 range_tree_stat_verify(msp->ms_allocatable);
 912                 dump_metaslab_stats(msp);
 913                 metaslab_unload(msp);
 914                 mutex_exit(&msp->ms_lock);
 915         }
 916 
 917         if (dump_opt['m'] > 1 && sm != NULL &&
 918             spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 919                 /*
 920                  * The space map histogram represents free space in chunks
 921                  * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 922                  */
 923                 (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 924                     (u_longlong_t)msp->ms_fragmentation);
 925                 dump_histogram(sm->sm_phys->smp_histogram,
 926                     SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 927         }
 928 
 929         ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 930         dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 931 }
 932 
 933 static void
 934 print_vdev_metaslab_header(vdev_t *vd)
 935 {
 936         vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 937         const char *bias_str;
 938 
 939         bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
 940             VDEV_ALLOC_BIAS_LOG :
 941             (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
 942             (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP :
 943             vd->vdev_islog ? "log" : "";
 944 
 945         (void) printf("\tvdev %10llu   %s\n"
 946             "\t%-10s%5llu   %-19s   %-15s   %-12s\n",
 947             (u_longlong_t)vd->vdev_id, bias_str,
 948             "metaslabs", (u_longlong_t)vd->vdev_ms_count,
 949             "offset", "spacemap", "free");
 950         (void) printf("\t%15s   %19s   %15s   %12s\n",
 
3083 static void
3084 zdb_leak(void *arg, uint64_t start, uint64_t size)
3085 {
3086         vdev_t *vd = arg;
3087 
3088         (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
3089             (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
3090 }
3091 
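     /*
      * Stub metaslab ops with no allocator.  zdb_leak_init() installs these
      * so the allocator cannot use the ms_allocatable trees while they are
      * repurposed for leak detection.
      */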
3092 static metaslab_ops_t zdb_metaslab_ops = {
3093         NULL    /* alloc */
3094 };
3095 
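     /*
      * Walk the on-disk DDT and account for deduplication: for every entry
      * referenced more than once, add the space saved to zcb_dedup_asize
      * and zcb_dedup_blocks, and load the entry into the in-core DDT so the
      * later block traversal can find it.  Only called when leak detection
      * is enabled.
      */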
3096 static void
3097 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
3098 {
3099         ddt_bookmark_t ddb;
3100         ddt_entry_t dde;
3101         int error;
3102 
3103         ASSERT(!dump_opt['L']);
3104 
3105         bzero(&ddb, sizeof (ddb));
3106         while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
3107                 blkptr_t blk;
3108                 ddt_phys_t *ddp = dde.dde_phys;
3109 
3110                 if (ddb.ddb_class == DDT_CLASS_UNIQUE)
3111                         return;
3112 
3113                 ASSERT(ddt_phys_total_refcnt(&dde) > 1);
3114 
3115                 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
3116                         if (ddp->ddp_phys_birth == 0)
3117                                 continue;
3118                         ddt_bp_create(ddb.ddb_checksum,
3119                             &dde.dde_key, ddp, &blk);
3120                         if (p == DDT_PHYS_DITTO) {
3121                                 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
3122                         } else {
3123                                 zcb->zcb_dedup_asize +=
3124                                     BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
3125                                 zcb->zcb_dedup_blocks++;
3126                         }
3127                 }
3128                 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
3129                 ddt_enter(ddt);
3130                 VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
3131                 ddt_exit(ddt);
3132         }
3133 
3134         ASSERT(error == ENOENT);
3135 }
3136 
3137 /* ARGSUSED */
3138 static void
3139 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3140     uint64_t size, void *arg)
3141 {
3142         /*
3143          * This callback was called through a remap from
3144          * a device being removed. Therefore, the vdev that
3145          * this callback is applied to is a concrete
3146          * vdev.
3147          */
3148         ASSERT(vdev_is_concrete(vd));
3149 
3150         VERIFY0(metaslab_claim_impl(vd, offset, size,
3151             spa_min_claim_txg(vd->vdev_spa)));
3152 }
3153 
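     /*
      * Claim a segment of an indirect vdev by remapping it onto the
      * concrete vdev(s) backing it and claiming each resulting segment
      * (see claim_segment_impl_cb() above).
      */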
3154 static void
3155 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
3156 {
3157         vdev_t *vd = arg;
3158 
3159         vdev_indirect_ops.vdev_op_remap(vd, offset, size,
3160             claim_segment_impl_cb, NULL);
3161 }
3162 
3163 /*
3164  * After accounting for all allocated blocks that are directly referenced,
3165  * we might have missed a reference to a block from a partially complete
3166  * (and thus unused) indirect mapping object. We perform a secondary pass
3167  * through the metaslabs we have already mapped and claim the destination
3168  * blocks.
3169  */
3170 static void
3171 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
3172 {
3173         if (dump_opt['L'])
3174                 return;
3175 
3176         if (spa->spa_vdev_removal == NULL)
3177                 return;
3178 
3179         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3180 
3181         spa_vdev_removal_t *svr = spa->spa_vdev_removal;
3182         vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
3183         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3184 
3185         for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
3186                 metaslab_t *msp = vd->vdev_ms[msi];
3187 
3188                 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
3189                         break;
3190 
3191                 ASSERT0(range_tree_space(svr->svr_allocd_segs));
3192 
3193                 if (msp->ms_sm != NULL) {
3194                         VERIFY0(space_map_load(msp->ms_sm,
3195                             svr->svr_allocd_segs, SM_ALLOC));
 
3247 static uint32_t *
3248 zdb_load_obsolete_counts(vdev_t *vd)
3249 {
3250         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3251         spa_t *spa = vd->vdev_spa;
3252         spa_condensing_indirect_phys_t *scip =
3253             &spa->spa_condensing_indirect_phys;
3254         uint32_t *counts;
3255 
3256         EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL);
3257         counts = vdev_indirect_mapping_load_obsolete_counts(vim);
3258         if (vd->vdev_obsolete_sm != NULL) {
3259                 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3260                     vd->vdev_obsolete_sm);
3261         }
3262         if (scip->scip_vdev == vd->vdev_id &&
3263             scip->scip_prev_obsolete_sm_object != 0) {
3264                 space_map_t *prev_obsolete_sm = NULL;
3265                 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
3266                     scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
3267                 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3268                     prev_obsolete_sm);
3269                 space_map_close(prev_obsolete_sm);
3270         }
3271         return (counts);
3272 }
3273 
3274 typedef struct checkpoint_sm_exclude_entry_arg {
3275         vdev_t *cseea_vd;
3276         uint64_t cseea_checkpoint_size;
3277 } checkpoint_sm_exclude_entry_arg_t;
3278 
3279 static int
3280 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
3281 {
3282         checkpoint_sm_exclude_entry_arg_t *cseea = arg;
3283         vdev_t *vd = cseea->cseea_vd;
3284         metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
3285         uint64_t end = sme->sme_offset + sme->sme_run;
3286 
 
3340          * 2] There is a checkpoint, but no checkpointed blocks
3341          *    have been freed yet
3342          * 3] The current vdev is indirect
3343          *
3344          * In these cases we return immediately.
3345          */
3346         if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
3347             VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
3348                 return;
3349 
3350         VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
3351             VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
3352             &checkpoint_sm_obj));
3353 
3354         checkpoint_sm_exclude_entry_arg_t cseea;
3355         cseea.cseea_vd = vd;
3356         cseea.cseea_checkpoint_size = 0;
3357 
3358         VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
3359             checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
3360 
3361         VERIFY0(space_map_iterate(checkpoint_sm,
3362             space_map_length(checkpoint_sm),
3363             checkpoint_sm_exclude_entry_cb, &cseea));
3364         space_map_close(checkpoint_sm);
3365 
3366         zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
3367 }
3368 
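     /*
      * Exclude checkpointed blocks from leak detection on every top-level
      * vdev; see zdb_leak_init_vdev_exclude_checkpoint().
      */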
3369 static void
3370 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
3371 {
3372         ASSERT(!dump_opt['L']);
3373 
3374         vdev_t *rvd = spa->spa_root_vdev;
3375         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3376                 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
3377                 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
3378         }
3379 }
3380 
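     /*
      * For each metaslab of every concrete top-level vdev, load the
      * entries of the metaslab's space map into its ms_allocatable tree
      * using the given maptype (SM_ALLOC when doing leak detection).
      */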
3381 static void
3382 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
3383 {
3384         vdev_t *rvd = spa->spa_root_vdev;
3385         for (uint64_t i = 0; i < rvd->vdev_children; i++) {
3386                 vdev_t *vd = rvd->vdev_child[i];
3387 
3388                 ASSERT3U(i, ==, vd->vdev_id);
3389 
3390                 if (vd->vdev_ops == &vdev_indirect_ops)
3391                         continue;
3392 
3393                 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 
3450                 if (ent_offset >= msp->ms_start + msp->ms_size)
3451                         break;
3452 
3453                 /*
3454                  * Mappings do not cross metaslab boundaries,
3455                  * because we create them by walking the metaslabs.
3456                  */
3457                 ASSERT3U(ent_offset + ent_len, <=,
3458                     msp->ms_start + msp->ms_size);
3459                 range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
3460         }
3461 
3462         if (!msp->ms_loaded)
3463                 msp->ms_loaded = B_TRUE;
3464         mutex_exit(&msp->ms_lock);
3465 }
3466 
3467 static void
3468 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
3469 {
3470         ASSERT(!dump_opt['L']);
3471 
3472         vdev_t *rvd = spa->spa_root_vdev;
3473         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3474                 vdev_t *vd = rvd->vdev_child[c];
3475 
3476                 ASSERT3U(c, ==, vd->vdev_id);
3477 
3478                 if (vd->vdev_ops != &vdev_indirect_ops)
3479                         continue;
3480 
3481                 /*
3482                  * Note: we don't check for mapping leaks on
3483                  * removing vdevs because their ms_allocatable trees
3484                  * are used to look for leaks in allocated space.
3485                  */
3486                 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
3487 
3488                 /*
3489                  * Normally, indirect vdevs don't have any
3490                  * metaslabs.  We want to set them up for
3491                  * zio_claim().
 
3498 
3499                         (void) fprintf(stderr,
3500                             "\rloading indirect vdev %llu, "
3501                             "metaslab %llu of %llu ...",
3502                             (longlong_t)vd->vdev_id,
3503                             (longlong_t)vd->vdev_ms[m]->ms_id,
3504                             (longlong_t)vd->vdev_ms_count);
3505 
3506                         load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
3507                             &vim_idx);
3508                 }
3509                 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
3510         }
3511 }
3512 
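     /*
      * Descriptive summary (added by the editor): prepare the pool for
      * leak detection (unless -L was given).  Point the metaslab classes
      * at zdb's own ops so the allocator leaves the ms_allocatable trees
      * alone, load those trees with allocated segments, exclude any
      * checkpointed space, and account for obsolete (removed-vdev)
      * mappings and DDT references.
      */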
3513 static void
3514 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
3515 {
3516         zcb->zcb_spa = spa;
3517 
3518         if (dump_opt['L'])
3519                 return;
3520 
3521         dsl_pool_t *dp = spa->spa_dsl_pool;
3522         vdev_t *rvd = spa->spa_root_vdev;
3523 
3524         /*
3525          * We are going to be changing the meaning of the metaslab's
3526          * ms_allocatable.  Ensure that the allocator doesn't try to
3527          * use the tree.
3528          */
3529         spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
3530         spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
3531 
3532         zcb->zcb_vd_obsolete_counts =
3533             umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
3534             UMEM_NOFAIL);
3535 
3536         /*
3537          * For leak detection, we overload the ms_allocatable trees
3538          * to contain allocated segments instead of free segments.
3539          * As a result, we can't use the normal metaslab_load/unload
3540          * interfaces.
 
3545         /*
3546          * On load_concrete_ms_allocatable_trees() we loaded all the
3547          * allocated entries from the ms_sm to the ms_allocatable for
3548          * each metaslab. If the pool has a checkpoint or is in the
3549          * middle of discarding a checkpoint, some of these blocks
3550          * may have been freed but their ms_sm may not have been
3551          * updated because they are referenced by the checkpoint. In
3552          * order to avoid false-positives during leak-detection, we
3553          * go through the vdev's checkpoint space map and exclude all
3554          * its entries from their relevant ms_allocatable.
3555          *
3556          * We also aggregate the space held by the checkpoint and add
3557          * it to zcb_checkpoint_size.
3558          *
3559          * Note that at this point we are also verifying that all the
3560          * entries on the checkpoint_sm are marked as allocated in
3561          * the ms_sm of their relevant metaslab.
3562          * [see comment in checkpoint_sm_exclude_entry_cb()]
3563          */
3564         zdb_leak_init_exclude_checkpoint(spa, zcb);
3565         ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
3566 
3567         /* for cleaner progress output */
3568         (void) fprintf(stderr, "\n");
3569 
3570         if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3571                 ASSERT(spa_feature_is_enabled(spa,
3572                     SPA_FEATURE_DEVICE_REMOVAL));
3573                 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
3574                     increment_indirect_mapping_cb, zcb, NULL);
3575         }
3576 
3577         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3578         zdb_ddt_leak_init(spa, zcb);
3579         spa_config_exit(spa, SCL_CONFIG, FTAG);
3580 }
3581 
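     /*
      * Descriptive summary (added by the editor): cross-check each
      * indirect mapping entry's obsolete space (the portion of the
      * mapping that was never claimed during traversal) against the
      * obsolete counts zdb accumulated, and report any mismatch.
      */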
3582 static boolean_t
3583 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
3584 {
3585         boolean_t leaks = B_FALSE;
3586         vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3587         uint64_t total_leaked = 0;
3588 
3589         ASSERT(vim != NULL);
3590 
3591         for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
3592                 vdev_indirect_mapping_entry_phys_t *vimep =
3593                     &vim->vim_entries[i];
3594                 uint64_t obsolete_bytes = 0;
3595                 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
 
3635                     (u_longlong_t)vd->vdev_id, pct_leaked,
3636                     (u_longlong_t)total_leaked);
3637         } else if (total_leaked > 0) {
3638                 (void) printf("obsolete indirect mapping count mismatch "
3639                     "for vdev %llu -- %llx total bytes mismatched\n",
3640                     (u_longlong_t)vd->vdev_id,
3641                     (u_longlong_t)total_leaked);
3642                 leaks |= B_TRUE;
3643         }
3644 
3645         vdev_indirect_mapping_free_obsolete_counts(vim,
3646             zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3647         zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
3648 
3649         return (leaks);
3650 }
3651 
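     /*
      * Descriptive summary (added by the editor): after traversal,
      * anything still left in a concrete vdev's ms_allocatable tree is an
      * allocated block that was never claimed and is reported as leaked;
      * indirect vdevs are checked for obsolete-count mismatches instead.
      * Returns B_TRUE if any leaks were found.
      */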
3652 static boolean_t
3653 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
3654 {
3655         if (dump_opt['L'])
3656                 return (B_FALSE);
3657 
3658         boolean_t leaks = B_FALSE;
3659 
3660         vdev_t *rvd = spa->spa_root_vdev;
3661         for (unsigned c = 0; c < rvd->vdev_children; c++) {
3662                 vdev_t *vd = rvd->vdev_child[c];
3663 #if DEBUG
3664                 metaslab_group_t *mg = vd->vdev_mg;
3665 #endif
3666 
3667                 if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
3668                         leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
3669                 }
3670 
3671                 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3672                         metaslab_t *msp = vd->vdev_ms[m];
3673                         ASSERT3P(mg, ==, msp->ms_group);
3674 
3675                         /*
3676                          * ms_allocatable has been overloaded
3677                          * to contain allocated segments. Now that
3678                          * we finished traversing all blocks, any
3679                          * block that remains in the ms_allocatable
3680                          * represents an allocated block that we
3681                          * did not claim during the traversal.
3682                          * Claimed blocks would have been removed
3683                          * from the ms_allocatable.  For indirect
3684                          * vdevs, space remaining in the tree
3685                          * represents parts of the mapping that are
3686                          * not referenced, which is not a bug.
3687                          */
3688                         if (vd->vdev_ops == &vdev_indirect_ops) {
3689                                 range_tree_vacate(msp->ms_allocatable,
3690                                     NULL, NULL);
3691                         } else {
3692                                 range_tree_vacate(msp->ms_allocatable,
3693                                     zdb_leak, vd);
3694                         }
3695 
3696                         if (msp->ms_loaded) {
3697                                 msp->ms_loaded = B_FALSE;
3698                         }
3699                 }
3700 
3701         }
3702 
3703         umem_free(zcb->zcb_vd_obsolete_counts,
3704             rvd->vdev_children * sizeof (uint32_t *));
3705         zcb->zcb_vd_obsolete_counts = NULL;
3706 
3707         return (leaks);
3708 }
3709 
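     /*
      * Descriptive summary (added by the editor): bpobj/bptree iteration
      * callback that tallies a block from a deferred-free or
      * async-destroy list under the ZDB_OT_DEFERRED bucket, printing the
      * block pointer at -bbbbb and above.
      */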
3710 /* ARGSUSED */
3711 static int
3712 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3713 {
3714         zdb_cb_t *zcb = arg;
3715 
3716         if (dump_opt['b'] >= 5) {
3717                 char blkbuf[BP_SPRINTF_LEN];
3718                 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
3719                 (void) printf("[%s] %s\n",
3720                     "deferred free", blkbuf);
3721         }
3722         zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
3723         return (0);
3724 }
3725 
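     /*
      * Descriptive summary (added by the editor): traverse every block in
      * the pool, gathering per-type statistics, and (unless -L)
      * cross-check the traversal totals against the space maps to detect
      * leaked or double-allocated blocks.
      */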
3726 static int
3727 dump_block_stats(spa_t *spa)
3728 {
3729         zdb_cb_t zcb;
3730         zdb_blkstats_t *zb, *tzb;
3731         uint64_t norm_alloc, norm_space, total_alloc, total_found;
3732         int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
3733         boolean_t leaks = B_FALSE;
3734         int err;
3735 
3736         bzero(&zcb, sizeof (zcb));
3737         (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
3738             (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
3739             (dump_opt['c'] == 1) ? "metadata " : "",
3740             dump_opt['c'] ? "checksums " : "",
3741             (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
3742             !dump_opt['L'] ? "nothing leaked " : "");
3743 
3744         /*
3745          * When leak detection is enabled we load all space maps as SM_ALLOC
3746          * maps, then traverse the pool claiming each block we discover. If
3747          * the pool is perfectly consistent, the segment trees will be empty
3748          * when we're done. Anything left over is a leak; any block we can't
3749          * claim (because it's not part of any space map) is a double
3750          * allocation, reference to a freed block, or an unclaimed log block.
3751          *
3752          * When leak detection is disabled (-L option) we still traverse the
3753          * pool claiming each block we discover, but we skip opening any space
3754          * maps.
3755          */
3757         zdb_leak_init(spa, &zcb);
3758 
3759         /*
3760          * If there's a deferred-free bplist, process that first.
3761          */
3762         (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
3763             count_block_cb, &zcb, NULL);
3764 
3765         if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3766                 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
3767                     count_block_cb, &zcb, NULL);
3768         }
3769 
3770         zdb_claim_removing(spa, &zcb);
3771 
3772         if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
3773                 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
3774                     spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
3775                     &zcb, NULL));
3776         }
 
3815                 }
3816         }
3817 
3818         /*
3819          * Report any leaked segments.
3820          */
3821         leaks |= zdb_leak_fini(spa, &zcb);
3822 
3823         tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
3824 
3825         norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
3826         norm_space = metaslab_class_get_space(spa_normal_class(spa));
3827 
3828         total_alloc = norm_alloc +
3829             metaslab_class_get_alloc(spa_log_class(spa)) +
3830             metaslab_class_get_alloc(spa_special_class(spa)) +
3831             metaslab_class_get_alloc(spa_dedup_class(spa));
3832         total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
3833             zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
3834 
3835         if (total_found == total_alloc && !dump_opt['L']) {
3836                 (void) printf("\n\tNo leaks (block sum matches space"
3837                     " maps exactly)\n");
3838         } else if (!dump_opt['L']) {
3839                 (void) printf("block traversal size %llu != alloc %llu "
3840                     "(%s %lld)\n",
3841                     (u_longlong_t)total_found,
3842                     (u_longlong_t)total_alloc,
3843                     (dump_opt['L']) ? "unreachable" : "leaked",
3844                     (longlong_t)(total_alloc - total_found));
3845                 leaks = B_TRUE;
3846         }
3847 
3848         if (tzb->zb_count == 0)
3849                 return (2);
3850 
3851         (void) printf("\n");
3852         (void) printf("\t%-16s %14llu\n", "bp count:",
3853             (u_longlong_t)tzb->zb_count);
3854         (void) printf("\t%-16s %14llu\n", "ganged count:",
3855             (u_longlong_t)tzb->zb_gangs);
3856         (void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
3857             (u_longlong_t)tzb->zb_lsize,
3858             (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
 
4158         int ret = 0;
4159 
4160         spa_condensing_indirect_phys_t *scip =
4161             &spa->spa_condensing_indirect_phys;
4162         if (scip->scip_next_mapping_object != 0) {
4163                 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
4164                 ASSERT(scip->scip_prev_obsolete_sm_object != 0);
4165                 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
4166 
4167                 (void) printf("Condensing indirect vdev %llu: new mapping "
4168                     "object %llu, prev obsolete sm %llu\n",
4169                     (u_longlong_t)scip->scip_vdev,
4170                     (u_longlong_t)scip->scip_next_mapping_object,
4171                     (u_longlong_t)scip->scip_prev_obsolete_sm_object);
4172                 if (scip->scip_prev_obsolete_sm_object != 0) {
4173                         space_map_t *prev_obsolete_sm = NULL;
4174                         VERIFY0(space_map_open(&prev_obsolete_sm,
4175                             spa->spa_meta_objset,
4176                             scip->scip_prev_obsolete_sm_object,
4177                             0, vd->vdev_asize, 0));
4178                         dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
4179                         (void) printf("\n");
4180                         space_map_close(prev_obsolete_sm);
4181                 }
4182 
4183                 scip_count += 2;
4184         }
4185 
4186         for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
4187                 vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
4188                 vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
4189 
4190                 if (vic->vic_mapping_object != 0) {
4191                         ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
4192                             vd->vdev_removing);
4193                         indirect_vdev_count++;
4194 
4195                         if (vd->vdev_indirect_mapping->vim_havecounts) {
4196                                 obsolete_counts_count++;
4197                         }
 
4363                 (void) fprintf(stderr,
4364                     "\rverifying vdev %llu, space map entry %llu of %llu ...",
4365                     (longlong_t)vd->vdev_id,
4366                     (longlong_t)vcsec->vcsec_entryid,
4367                     (longlong_t)vcsec->vcsec_num_entries);
4368         }
4369         vcsec->vcsec_entryid++;
4370 
4371         /*
4372          * See comment in checkpoint_sm_exclude_entry_cb()
4373          */
4374         VERIFY3U(sme->sme_offset, >=, ms->ms_start);
4375         VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4376 
4377         /*
4378          * The entries in the vdev_checkpoint_sm should be marked as
4379          * allocated in the checkpointed state of the pool, therefore
4380          * their respective ms_allocatable trees should not contain them.
4381          */
4382         mutex_enter(&ms->ms_lock);
4383         range_tree_verify_not_present(ms->ms_allocatable,
4384             sme->sme_offset, sme->sme_run);
4385         mutex_exit(&ms->ms_lock);
4386 
4387         return (0);
4388 }
4389 
4390 /*
4391  * Verify that all segments in the vdev_checkpoint_sm are allocated
4392  * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
4393  * ms_allocatable).
4394  *
4395  * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
4396  * each vdev in the current state of the pool to the metaslab space maps
4397  * (ms_sm) of the checkpointed state of the pool.
4398  *
4399  * Note that the function changes the state of the ms_allocatable
4400  * trees of the current spa_t. The entries of these ms_allocatable
4401  * trees are cleared out and then repopulated with the free
4402  * entries of their respective ms_sm space maps.
4403  */
4404 static void
 
 
4427                         continue;
4428                 }
4429 
4430                 /*
4431                  * If the checkpoint space map doesn't exist, then nothing
4432                  * here is checkpointed so there's nothing to verify.
4433                  */
4434                 if (current_vd->vdev_top_zap == 0 ||
4435                     zap_contains(spa_meta_objset(current),
4436                     current_vd->vdev_top_zap,
4437                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4438                         continue;
4439 
4440                 VERIFY0(zap_lookup(spa_meta_objset(current),
4441                     current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4442                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
4443 
4444                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
4445                     checkpoint_sm_obj, 0, current_vd->vdev_asize,
4446                     current_vd->vdev_ashift));
4447 
4448                 verify_checkpoint_sm_entry_cb_arg_t vcsec;
4449                 vcsec.vcsec_vd = ckpoint_vd;
4450                 vcsec.vcsec_entryid = 0;
4451                 vcsec.vcsec_num_entries =
4452                     space_map_length(checkpoint_sm) / sizeof (uint64_t);
4453                 VERIFY0(space_map_iterate(checkpoint_sm,
4454                     space_map_length(checkpoint_sm),
4455                     verify_checkpoint_sm_entry_cb, &vcsec));
4456                 dump_spacemap(current->spa_meta_objset, checkpoint_sm);
4457                 space_map_close(checkpoint_sm);
4458         }
4459 
4460         /*
4461          * If we've added vdevs since we took the checkpoint, ensure
4462          * that their checkpoint space maps are empty.
4463          */
4464         if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
4465                 for (uint64_t c = ckpoint_rvd->vdev_children;
4466                     c < current_rvd->vdev_children; c++) {
4467                         vdev_t *current_vd = current_rvd->vdev_child[c];
4468                         ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
4469                 }
4470         }
4471 
4472         /* for cleaner progress output */
4473         (void) fprintf(stderr, "\n");
4474 }
 
4514                             "\rverifying vdev %llu of %llu, "
4515                             "metaslab %llu of %llu ...",
4516                             (longlong_t)current_vd->vdev_id,
4517                             (longlong_t)current_rvd->vdev_children,
4518                             (longlong_t)current_vd->vdev_ms[m]->ms_id,
4519                             (longlong_t)current_vd->vdev_ms_count);
4520 
4521                         /*
4522                          * We walk through the ms_allocatable trees that
4523                          * are loaded with the allocated blocks from the
4524                          * ms_sm spacemaps of the checkpoint. For each of
4525                          * these ranges we ensure that it does not exist in
4526                          * the ms_allocatable trees of the current state,
4527                          * which are loaded with the ranges that are
4528                          * currently free.
4529                          *
4530                          * This way we ensure that none of the blocks that
4531                          * are part of the checkpoint were freed by mistake.
4532                          */
4533                         range_tree_walk(ckpoint_msp->ms_allocatable,
4534                             (range_tree_func_t *)range_tree_verify_not_present,
4535                             current_msp->ms_allocatable);
4536                 }
4537         }
4538 
4539         /* for cleaner progress output */
4540         (void) fprintf(stderr, "\n");
4541 }
4542 
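     /*
      * Descriptive summary (added by the editor): verify that no block
      * referenced by the checkpoint has been freed in the current state
      * of the pool, by importing the checkpointed state under a different
      * name and comparing its space maps with those of the current state.
      */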
4543 static void
4544 verify_checkpoint_blocks(spa_t *spa)
4545 {
4546         ASSERT(!dump_opt['L']);
4547 
4548         spa_t *checkpoint_spa;
4549         char *checkpoint_pool;
4550         nvlist_t *config = NULL;
4551         int error = 0;
4552 
4553         /*
4554          * We import the checkpointed state of the pool (under a different
4555          * name) so we can verify it against the current state of
4556          * the pool.
4557          */
4558         checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
4559             NULL);
4560         ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
4561 
4562         error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
4563         if (error != 0) {
4564                 fatal("Tried to open pool \"%s\" but spa_open() failed with "
4565                     "error %d\n", checkpoint_pool, error);
4566         }
4567 
 
4593 
4594         for (uint64_t i = 0; i < rvd->vdev_children; i++) {
4595                 vdev_t *vd = rvd->vdev_child[i];
4596 
4597                 space_map_t *checkpoint_sm = NULL;
4598                 uint64_t checkpoint_sm_obj;
4599 
4600                 if (vd->vdev_top_zap == 0)
4601                         continue;
4602 
4603                 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4604                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4605                         continue;
4606 
4607                 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4608                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4609                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
4610 
4611                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4612                     checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4613                 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
4614                 space_map_close(checkpoint_sm);
4615         }
4616 }
4617 
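     /*
      * Descriptive summary (added by the editor): top-level checkpoint
      * verification.  If the pool_checkpoint feature is active, look up
      * the checkpointed uberblock in the MOS pool directory and verify
      * the checkpointed state against the current state of the pool.
      */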
4618 static int
4619 verify_checkpoint(spa_t *spa)
4620 {
4621         uberblock_t checkpoint;
4622         int error;
4623 
4624         if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
4625                 return (0);
4626 
4627         error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4628             DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
4629             sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
4630 
4631         if (error == ENOENT && !dump_opt['L']) {
4632                 /*
 