10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

--- old/usr/src/cmd/zdb/zdb.c
+++ new/usr/src/cmd/zdb/zdb.c
[... 777 lines elided ...]
 778  778  static void
 779  779  dump_spacemap(objset_t *os, space_map_t *sm)
 780  780  {
 781  781          char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 782  782              "INVALID", "INVALID", "INVALID", "INVALID" };
 783  783  
 784  784          if (sm == NULL)
 785  785                  return;
 786  786  
 787  787          (void) printf("space map object %llu:\n",
 788      -            (longlong_t)sm->sm_phys->smp_object);
 789      -        (void) printf("  smp_objsize = 0x%llx\n",
 790      -            (longlong_t)sm->sm_phys->smp_objsize);
      788 +            (longlong_t)sm->sm_object);
      789 +        (void) printf("  smp_length = 0x%llx\n",
      790 +            (longlong_t)sm->sm_phys->smp_length);
 791  791          (void) printf("  smp_alloc = 0x%llx\n",
 792  792              (longlong_t)sm->sm_phys->smp_alloc);
 793  793  
      794 +        if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
      795 +                return;
      796 +
 794  797          /*
 795  798           * Print out the freelist entries in both encoded and decoded form.
 796  799           */
 797  800          uint8_t mapshift = sm->sm_shift;
 798  801          int64_t alloc = 0;
 799      -        uint64_t word;
      802 +        uint64_t word, entry_id = 0;
 800  803          for (uint64_t offset = 0; offset < space_map_length(sm);
 801  804              offset += sizeof (word)) {
 802  805  
 803  806                  VERIFY0(dmu_read(os, space_map_object(sm), offset,
 804  807                      sizeof (word), &word, DMU_READ_PREFETCH));
 805  808  
 806  809                  if (sm_entry_is_debug(word)) {
 807      -                        (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 808      -                            (u_longlong_t)(offset / sizeof (word)),
      810 +                        (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
      811 +                            (u_longlong_t)entry_id,
 809  812                              ddata[SM_DEBUG_ACTION_DECODE(word)],
 810  813                              (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
 811  814                              (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
      815 +                        entry_id++;
 812  816                          continue;
 813  817                  }
 814  818  
 815  819                  uint8_t words;
 816  820                  char entry_type;
 817  821                  uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
 818  822  
 819  823                  if (sm_entry_is_single_word(word)) {
 820  824                          entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
 821  825                              'A' : 'F';
[... 17 lines elided ...]
 839  843                          entry_vdev = SM2_VDEV_DECODE(word);
 840  844                          entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
 841  845                              'A' : 'F';
 842  846                          entry_off = (SM2_OFFSET_DECODE(extra_word) <<
 843  847                              mapshift) + sm->sm_start;
 844  848                          words = 2;
 845  849                  }
 846  850  
 847  851                  (void) printf("\t    [%6llu]    %c  range:"
 848  852                      " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
 849      -                    (u_longlong_t)(offset / sizeof (word)),
      853 +                    (u_longlong_t)entry_id,
 850  854                      entry_type, (u_longlong_t)entry_off,
 851  855                      (u_longlong_t)(entry_off + entry_run),
 852  856                      (u_longlong_t)entry_run,
 853  857                      (u_longlong_t)entry_vdev, words);
 854  858  
 855  859                  if (entry_type == 'A')
 856  860                          alloc += entry_run;
 857  861                  else
 858  862                          alloc -= entry_run;
      863 +                entry_id++;
 859  864          }
 860      -        if ((uint64_t)alloc != space_map_allocated(sm)) {
      865 +        if (alloc != space_map_allocated(sm)) {
 861  866                  (void) printf("space_map_object alloc (%lld) INCONSISTENT "
 862  867                      "with space map summary (%lld)\n",
 863  868                      (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 864  869          }
 865  870  }
 866  871  
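The switch above from printing offset / sizeof (word) to an explicit
entry_id is needed because, with the two-word entries introduced by the
SPACEMAP_V2 feature, a single logical entry may occupy two 64-bit words,
so the word index no longer matches the entry index. A minimal
standalone sketch of the distinction (is_two_word() is a hypothetical
stand-in for the double-word test zdb uses; the assumed prefix bits are
for illustration only):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for zdb's double-word entry test. */
    static int
    is_two_word(uint64_t word)
    {
            return ((word >> 62) == 2);     /* assumed prefix encoding */
    }

    static uint64_t
    count_entries(const uint64_t *words, uint64_t nwords)
    {
            uint64_t entry_id = 0;

            for (uint64_t w = 0; w < nwords; w++) {
                    if (is_two_word(words[w]))
                            w++;    /* the entry consumed a second word */
                    entry_id++;
            }
            return (entry_id);      /* <= nwords, hence the new counter */
    }

    int
    main(void)
    {
            uint64_t words[] = { 0x1, 0x2ULL << 62, 0x0, 0x3 };

            /* four words, but only three logical entries */
            (void) printf("%llu\n",
                (unsigned long long)count_entries(words, 4));
            return (0);
    }
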
 867  872  static void
 868  873  dump_metaslab_stats(metaslab_t *msp)
 869  874  {
 870  875          char maxbuf[32];
[... 43 lines elided ...]
 914  919                  /*
 915  920                   * The space map histogram represents free space in chunks
 916  921                   * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
 917  922                   */
 918  923                  (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
 919  924                      (u_longlong_t)msp->ms_fragmentation);
 920  925                  dump_histogram(sm->sm_phys->smp_histogram,
 921  926                      SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
 922  927          }
 923  928  
 924      -        if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
 925      -                ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 926      -
 927      -                dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 928      -        }
      929 +        ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
      930 +        dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 929  931  }
 930  932  
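The histogram comment above is easiest to see with concrete numbers:
bucket b counts free segments of size roughly 2^(sm_shift + b), so with
sm_shift = 9, bucket 0 holds 512-byte runs and bucket 3 holds 4 KiB
runs. A small sketch of that mapping (assumes plain power-of-two
bucketing, which is how the on-disk histogram is keyed):

    #include <stdint.h>
    #include <stdio.h>

    /* Bucket index for a free segment of the given size in bytes. */
    static int
    histogram_bucket(uint64_t size, int sm_shift)
    {
            int b = 0;

            size >>= sm_shift;      /* bucket 0 == 2^sm_shift bytes */
            while (size > 1) {
                    size >>= 1;
                    b++;
            }
            return (b);
    }

    int
    main(void)
    {
            /* with sm_shift = 9: 512 -> bucket 0, 4096 -> bucket 3 */
            (void) printf("%d %d\n", histogram_bucket(512, 9),
                histogram_bucket(4096, 9));
            return (0);
    }
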
 931  933  static void
 932  934  print_vdev_metaslab_header(vdev_t *vd)
 933  935  {
 934  936          vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 935  937          const char *bias_str;
 936  938  
 937  939          bias_str = (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) ?
 938  940              VDEV_ALLOC_BIAS_LOG :
[... 2152 lines elided ...]
3091 3093          NULL    /* alloc */
3092 3094  };
3093 3095  
3094 3096  static void
3095 3097  zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
3096 3098  {
3097 3099          ddt_bookmark_t ddb;
3098 3100          ddt_entry_t dde;
3099 3101          int error;
3100 3102  
     3103 +        ASSERT(!dump_opt['L']);
     3104 +
3101 3105          bzero(&ddb, sizeof (ddb));
3102 3106          while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
3103 3107                  blkptr_t blk;
3104 3108                  ddt_phys_t *ddp = dde.dde_phys;
3105 3109  
3106 3110                  if (ddb.ddb_class == DDT_CLASS_UNIQUE)
3107 3111                          return;
3108 3112  
3109 3113                  ASSERT(ddt_phys_total_refcnt(&dde) > 1);
3110 3114  
[... 3 lines elided ...]
3114 3118                          ddt_bp_create(ddb.ddb_checksum,
3115 3119                              &dde.dde_key, ddp, &blk);
3116 3120                          if (p == DDT_PHYS_DITTO) {
3117 3121                                  zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
3118 3122                          } else {
3119 3123                                  zcb->zcb_dedup_asize +=
3120 3124                                      BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
3121 3125                                  zcb->zcb_dedup_blocks++;
3122 3126                          }
3123 3127                  }
3124      -                if (!dump_opt['L']) {
3125      -                        ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
3126      -                        ddt_enter(ddt);
3127      -                        VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
3128      -                        ddt_exit(ddt);
3129      -                }
     3128 +                ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
     3129 +                ddt_enter(ddt);
     3130 +                VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
     3131 +                ddt_exit(ddt);
3130 3132          }
3131 3133  
3132 3134          ASSERT(error == ENOENT);
3133 3135  }
3134 3136  
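Note the new ASSERT at the top of zdb_ddt_leak_init() above: this
change repeatedly replaces per-site if (!dump_opt['L']) checks with a
single early return in the caller plus an assertion in the callee. A
minimal sketch of that refactoring pattern (names are illustrative, not
from zdb):

    #include <assert.h>

    static int leak_tracking_disabled;  /* stand-in for dump_opt['L'] */

    static void
    leak_init_ddt(void)
    {
            /* callers have already screened out the disabled case */
            assert(!leak_tracking_disabled);
            /* ... unconditional leak-tracking work ... */
    }

    static void
    leak_init(void)
    {
            if (leak_tracking_disabled)
                    return;         /* one early return replaces N checks */

            leak_init_ddt();
    }

    int
    main(void)
    {
            leak_init();                    /* helper runs */
            leak_tracking_disabled = 1;
            leak_init();                    /* helper is skipped */
            return (0);
    }
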
3135 3137  /* ARGSUSED */
3136 3138  static void
3137 3139  claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
3138 3140      uint64_t size, void *arg)
3139 3141  {
[... 21 lines elided ...]
3161 3163  /*
3162 3164   * After accounting for all allocated blocks that are directly referenced,
3163 3165   * we might have missed a reference to a block from a partially complete
3164 3166   * (and thus unused) indirect mapping object. We perform a secondary pass
3165 3167   * through the metaslabs we have already mapped and claim the destination
3166 3168   * blocks.
3167 3169   */
3168 3170  static void
3169 3171  zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
3170 3172  {
     3173 +        if (dump_opt['L'])
     3174 +                return;
     3175 +
3171 3176          if (spa->spa_vdev_removal == NULL)
3172 3177                  return;
3173 3178  
3174 3179          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3175 3180  
3176 3181          spa_vdev_removal_t *svr = spa->spa_vdev_removal;
3177 3182          vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
3178 3183          vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
3179 3184  
3180 3185          for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
[... 71 lines elided ...]
3252 3257          counts = vdev_indirect_mapping_load_obsolete_counts(vim);
3253 3258          if (vd->vdev_obsolete_sm != NULL) {
3254 3259                  vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3255 3260                      vd->vdev_obsolete_sm);
3256 3261          }
3257 3262          if (scip->scip_vdev == vd->vdev_id &&
3258 3263              scip->scip_prev_obsolete_sm_object != 0) {
3259 3264                  space_map_t *prev_obsolete_sm = NULL;
3260 3265                  VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
3261 3266                      scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
3262      -                space_map_update(prev_obsolete_sm);
3263 3267                  vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
3264 3268                      prev_obsolete_sm);
3265 3269                  space_map_close(prev_obsolete_sm);
3266 3270          }
3267 3271          return (counts);
3268 3272  }
3269 3273  
3270 3274  typedef struct checkpoint_sm_exclude_entry_arg {
3271 3275          vdev_t *cseea_vd;
3272 3276          uint64_t cseea_checkpoint_size;
[... 73 lines elided ...]
3346 3350          VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
3347 3351              VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
3348 3352              &checkpoint_sm_obj));
3349 3353  
3350 3354          checkpoint_sm_exclude_entry_arg_t cseea;
3351 3355          cseea.cseea_vd = vd;
3352 3356          cseea.cseea_checkpoint_size = 0;
3353 3357  
3354 3358          VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
3355 3359              checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
3356      -        space_map_update(checkpoint_sm);
3357 3360  
3358 3361          VERIFY0(space_map_iterate(checkpoint_sm,
     3362 +            space_map_length(checkpoint_sm),
3359 3363              checkpoint_sm_exclude_entry_cb, &cseea));
3360 3364          space_map_close(checkpoint_sm);
3361 3365  
3362 3366          zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
3363 3367  }
3364 3368  
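Two related API changes are visible in this hunk: the
space_map_update() call is gone (after this change space_map_open()
leaves the in-core map ready to use, so the separate refresh is no
longer needed), and space_map_iterate() now takes the length to walk as
an explicit argument. A self-contained sketch of the new call shape,
using stand-in types rather than the real ones from sys/space_map.h:

    #include <stdint.h>

    typedef struct space_map { uint64_t sm_length; } space_map_t;
    typedef int (*sm_cb_t)(uint64_t word, void *arg);

    static uint64_t
    space_map_length(const space_map_t *sm)
    {
            return (sm->sm_length);         /* bytes of synced entries */
    }

    /*
     * Stand-in with the new argument order: the caller says how far
     * to iterate, instead of the iterator trusting cached state that
     * previously had to be refreshed with space_map_update().
     */
    static int
    space_map_iterate(const space_map_t *sm, uint64_t length,
        sm_cb_t cb, void *arg)
    {
            (void) sm;
            for (uint64_t off = 0; off < length;
                off += sizeof (uint64_t)) {
                    int err = cb(0 /* word read at off */, arg);
                    if (err != 0)
                            return (err);
            }
            return (0);
    }

    static int
    count_cb(uint64_t word, void *arg)
    {
            (void) word;
            (*(int *)arg)++;
            return (0);
    }

    int
    main(void)
    {
            space_map_t sm = { 32 };        /* four 8-byte words */
            int n = 0;

            (void) space_map_iterate(&sm, space_map_length(&sm),
                count_cb, &n);
            return (n == 4 ? 0 : 1);
    }
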
3365 3369  static void
3366 3370  zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
3367 3371  {
     3372 +        ASSERT(!dump_opt['L']);
     3373 +
3368 3374          vdev_t *rvd = spa->spa_root_vdev;
3369 3375          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3370 3376                  ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
3371 3377                  zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
3372 3378          }
3373 3379  }
3374 3380  
3375 3381  static void
3376 3382  load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
3377 3383  {
[... 76 lines elided ...]
3454 3460          }
3455 3461  
3456 3462          if (!msp->ms_loaded)
3457 3463                  msp->ms_loaded = B_TRUE;
3458 3464          mutex_exit(&msp->ms_lock);
3459 3465  }
3460 3466  
3461 3467  static void
3462 3468  zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
3463 3469  {
     3470 +        ASSERT(!dump_opt['L']);
     3471 +
3464 3472          vdev_t *rvd = spa->spa_root_vdev;
3465 3473          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
3466 3474                  vdev_t *vd = rvd->vdev_child[c];
3467 3475  
3468 3476                  ASSERT3U(c, ==, vd->vdev_id);
3469 3477  
3470 3478                  if (vd->vdev_ops != &vdev_indirect_ops)
3471 3479                          continue;
3472 3480  
3473 3481                  /*
[... 26 lines elided ...]
3500 3508                  }
3501 3509                  ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
3502 3510          }
3503 3511  }
3504 3512  
3505 3513  static void
3506 3514  zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
3507 3515  {
3508 3516          zcb->zcb_spa = spa;
3509 3517  
3510      -        if (!dump_opt['L']) {
3511      -                dsl_pool_t *dp = spa->spa_dsl_pool;
3512      -                vdev_t *rvd = spa->spa_root_vdev;
     3518 +        if (dump_opt['L'])
     3519 +                return;
3513 3520  
3514      -                /*
3515      -                 * We are going to be changing the meaning of the metaslab's
3516      -                 * ms_allocatable.  Ensure that the allocator doesn't try to
3517      -                 * use the tree.
3518      -                 */
3519      -                spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
3520      -                spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
     3521 +        dsl_pool_t *dp = spa->spa_dsl_pool;
     3522 +        vdev_t *rvd = spa->spa_root_vdev;
3521 3523  
3522      -                zcb->zcb_vd_obsolete_counts =
3523      -                    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
3524      -                    UMEM_NOFAIL);
     3524 +        /*
     3525 +         * We are going to be changing the meaning of the metaslab's
     3526 +         * ms_allocatable.  Ensure that the allocator doesn't try to
     3527 +         * use the tree.
     3528 +         */
     3529 +        spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
     3530 +        spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
3525 3531  
3526      -                /*
3527      -                 * For leak detection, we overload the ms_allocatable trees
3528      -                 * to contain allocated segments instead of free segments.
3529      -                 * As a result, we can't use the normal metaslab_load/unload
3530      -                 * interfaces.
3531      -                 */
3532      -                zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
3533      -                load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
     3532 +        zcb->zcb_vd_obsolete_counts =
     3533 +            umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
     3534 +            UMEM_NOFAIL);
3534 3535  
3535      -                /*
3536      -                 * On load_concrete_ms_allocatable_trees() we loaded all the
3537      -                 * allocated entries from the ms_sm to the ms_allocatable for
3538      -                 * each metaslab. If the pool has a checkpoint or is in the
3539      -                 * middle of discarding a checkpoint, some of these blocks
3540      -                 * may have been freed but their ms_sm may not have been
3541      -                 * updated because they are referenced by the checkpoint. In
3542      -                 * order to avoid false-positives during leak-detection, we
3543      -                 * go through the vdev's checkpoint space map and exclude all
3544      -                 * its entries from their relevant ms_allocatable.
3545      -                 *
3546      -                 * We also aggregate the space held by the checkpoint and add
3547      -                 * it to zcb_checkpoint_size.
3548      -                 *
3549      -                 * Note that at this point we are also verifying that all the
3550      -                 * entries on the checkpoint_sm are marked as allocated in
3551      -                 * the ms_sm of their relevant metaslab.
3552      -                 * [see comment in checkpoint_sm_exclude_entry_cb()]
3553      -                 */
3554      -                zdb_leak_init_exclude_checkpoint(spa, zcb);
     3536 +        /*
     3537 +         * For leak detection, we overload the ms_allocatable trees
     3538 +         * to contain allocated segments instead of free segments.
     3539 +         * As a result, we can't use the normal metaslab_load/unload
     3540 +         * interfaces.
     3541 +         */
     3542 +        zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
     3543 +        load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
3555 3544  
3556      -                /* for cleaner progress output */
3557      -                (void) fprintf(stderr, "\n");
     3545 +        /*
     3546 +         * On load_concrete_ms_allocatable_trees() we loaded all the
     3547 +         * allocated entries from the ms_sm to the ms_allocatable for
     3548 +         * each metaslab. If the pool has a checkpoint or is in the
     3549 +         * middle of discarding a checkpoint, some of these blocks
     3550 +         * may have been freed but their ms_sm may not have been
     3551 +         * updated because they are referenced by the checkpoint. In
     3552 +         * order to avoid false-positives during leak-detection, we
     3553 +         * go through the vdev's checkpoint space map and exclude all
     3554 +         * its entries from their relevant ms_allocatable.
     3555 +         *
     3556 +         * We also aggregate the space held by the checkpoint and add
     3557 +         * it to zcb_checkpoint_size.
     3558 +         *
     3559 +         * Note that at this point we are also verifying that all the
     3560 +         * entries on the checkpoint_sm are marked as allocated in
     3561 +         * the ms_sm of their relevant metaslab.
     3562 +         * [see comment in checkpoint_sm_exclude_entry_cb()]
     3563 +         */
     3564 +        zdb_leak_init_exclude_checkpoint(spa, zcb);
     3565 +        ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
3558 3566  
3559      -                if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
3560      -                        ASSERT(spa_feature_is_enabled(spa,
3561      -                            SPA_FEATURE_DEVICE_REMOVAL));
3562      -                        (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
3563      -                            increment_indirect_mapping_cb, zcb, NULL);
3564      -                }
3565      -        } else {
3566      -                /*
3567      -                 * If leak tracing is disabled, we still need to consider
3568      -                 * any checkpointed space in our space verification.
3569      -                 */
3570      -                zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
     3567 +        /* for cleaner progress output */
     3568 +        (void) fprintf(stderr, "\n");
     3569 +
     3570 +        if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
     3571 +                ASSERT(spa_feature_is_enabled(spa,
     3572 +                    SPA_FEATURE_DEVICE_REMOVAL));
     3573 +                (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
     3574 +                    increment_indirect_mapping_cb, zcb, NULL);
3571 3575          }
3572 3576  
3573 3577          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3574 3578          zdb_ddt_leak_init(spa, zcb);
3575 3579          spa_config_exit(spa, SCL_CONFIG, FTAG);
3576 3580  }
3577 3581  
3578 3582  static boolean_t
3579 3583  zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
3580 3584  {
[... 60 lines elided ...]
3641 3645          vdev_indirect_mapping_free_obsolete_counts(vim,
3642 3646              zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
3643 3647          zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
3644 3648  
3645 3649          return (leaks);
3646 3650  }
3647 3651  
3648 3652  static boolean_t
3649 3653  zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
3650 3654  {
     3655 +        if (dump_opt['L'])
     3656 +                return (B_FALSE);
     3657 +
3651 3658          boolean_t leaks = B_FALSE;
3652      -        if (!dump_opt['L']) {
3653      -                vdev_t *rvd = spa->spa_root_vdev;
3654      -                for (unsigned c = 0; c < rvd->vdev_children; c++) {
3655      -                        vdev_t *vd = rvd->vdev_child[c];
3656      -                        metaslab_group_t *mg = vd->vdev_mg;
3657 3659  
3658      -                        if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
3659      -                                leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
3660      -                        }
     3660 +        vdev_t *rvd = spa->spa_root_vdev;
     3661 +        for (unsigned c = 0; c < rvd->vdev_children; c++) {
     3662 +                vdev_t *vd = rvd->vdev_child[c];
     3663 +#if DEBUG
     3664 +                metaslab_group_t *mg = vd->vdev_mg;
     3665 +#endif
3661 3666  
3662      -                        for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
3663      -                                metaslab_t *msp = vd->vdev_ms[m];
3664      -                                ASSERT3P(mg, ==, msp->ms_group);
     3667 +                if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
     3668 +                        leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
     3669 +                }
3665 3670  
3666      -                                /*
3667      -                                 * ms_allocatable has been overloaded
3668      -                                 * to contain allocated segments. Now that
3669      -                                 * we finished traversing all blocks, any
3670      -                                 * block that remains in the ms_allocatable
3671      -                                 * represents an allocated block that we
3672      -                                 * did not claim during the traversal.
3673      -                                 * Claimed blocks would have been removed
3674      -                                 * from the ms_allocatable.  For indirect
3675      -                                 * vdevs, space remaining in the tree
3676      -                                 * represents parts of the mapping that are
3677      -                                 * not referenced, which is not a bug.
3678      -                                 */
3679      -                                if (vd->vdev_ops == &vdev_indirect_ops) {
3680      -                                        range_tree_vacate(msp->ms_allocatable,
3681      -                                            NULL, NULL);
3682      -                                } else {
3683      -                                        range_tree_vacate(msp->ms_allocatable,
3684      -                                            zdb_leak, vd);
3685      -                                }
     3671 +                for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
     3672 +                        metaslab_t *msp = vd->vdev_ms[m];
     3673 +                        ASSERT3P(mg, ==, msp->ms_group);
3686 3674  
3687      -                                if (msp->ms_loaded) {
3688      -                                        msp->ms_loaded = B_FALSE;
3689      -                                }
     3675 +                        /*
     3676 +                         * ms_allocatable has been overloaded
     3677 +                         * to contain allocated segments. Now that
     3678 +                         * we finished traversing all blocks, any
     3679 +                         * block that remains in the ms_allocatable
     3680 +                         * represents an allocated block that we
     3681 +                         * did not claim during the traversal.
     3682 +                         * Claimed blocks would have been removed
     3683 +                         * from the ms_allocatable.  For indirect
     3684 +                         * vdevs, space remaining in the tree
     3685 +                         * represents parts of the mapping that are
     3686 +                         * not referenced, which is not a bug.
     3687 +                         */
     3688 +                        if (vd->vdev_ops == &vdev_indirect_ops) {
     3689 +                                range_tree_vacate(msp->ms_allocatable,
     3690 +                                    NULL, NULL);
     3691 +                        } else {
     3692 +                                range_tree_vacate(msp->ms_allocatable,
     3693 +                                    zdb_leak, vd);
3690 3694                          }
     3695 +
     3696 +                        if (msp->ms_loaded) {
     3697 +                                msp->ms_loaded = B_FALSE;
     3698 +                        }
3691 3699                  }
3692 3700  
3693      -                umem_free(zcb->zcb_vd_obsolete_counts,
3694      -                    rvd->vdev_children * sizeof (uint32_t *));
3695      -                zcb->zcb_vd_obsolete_counts = NULL;
3696 3701          }
     3702 +
     3703 +        umem_free(zcb->zcb_vd_obsolete_counts,
     3704 +            rvd->vdev_children * sizeof (uint32_t *));
     3705 +        zcb->zcb_vd_obsolete_counts = NULL;
     3706 +
3697 3707          return (leaks);
3698 3708  }
3699 3709  
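The #if DEBUG guard added above exists because mg is consumed only by
the ASSERT3P() inside the metaslab loop; in non-DEBUG builds the
assertion compiles away and the variable would draw a set-but-unused
warning. The same idiom with standard C assert() and NDEBUG, as a
minimal sketch:

    #include <assert.h>

    static int
    pop(int *stack, int *depth)
    {
    #ifndef NDEBUG
            int before = *depth;    /* used only by the assert below */
    #endif
            int v = stack[--*depth];

            assert(*depth == before - 1);   /* vanishes under NDEBUG */
            return (v);
    }

    int
    main(void)
    {
            int stack[4] = { 1, 2, 3, 4 };
            int depth = 4;

            return (pop(stack, &depth) == 4 ? 0 : 1);
    }
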
3700 3710  /* ARGSUSED */
3701 3711  static int
3702 3712  count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
3703 3713  {
3704 3714          zdb_cb_t *zcb = arg;
3705 3715  
3706 3716          if (dump_opt['b'] >= 5) {
[... 18 lines elided ...]
3725 3735  
3726 3736          bzero(&zcb, sizeof (zcb));
3727 3737          (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
3728 3738              (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
3729 3739              (dump_opt['c'] == 1) ? "metadata " : "",
3730 3740              dump_opt['c'] ? "checksums " : "",
3731 3741              (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
3732 3742              !dump_opt['L'] ? "nothing leaked " : "");
3733 3743  
3734 3744          /*
3735      -         * Load all space maps as SM_ALLOC maps, then traverse the pool
3736      -         * claiming each block we discover.  If the pool is perfectly
3737      -         * consistent, the space maps will be empty when we're done.
3738      -         * Anything left over is a leak; any block we can't claim (because
3739      -         * it's not part of any space map) is a double allocation,
3740      -         * reference to a freed block, or an unclaimed log block.
     3745 +         * When leak detection is enabled we load all space maps as SM_ALLOC
     3746 +         * maps, then traverse the pool claiming each block we discover. If
     3747 +         * the pool is perfectly consistent, the segment trees will be empty
     3748 +         * when we're done. Anything left over is a leak; any block we can't
     3749 +         * claim (because it's not part of any space map) is a double
     3750 +         * allocation, reference to a freed block, or an unclaimed log block.
     3751 +         *
     3752 +         * When leak detection is disabled (-L option) we still traverse the
     3753 +         * pool claiming each block we discover, but we skip opening any space
     3754 +         * maps.
3741 3755           */
     3756 +        bzero(&zcb, sizeof (zdb_cb_t));
3742 3757          zdb_leak_init(spa, &zcb);
3743 3758  
3744 3759          /*
3745 3760           * If there's a deferred-free bplist, process that first.
3746 3761           */
3747 3762          (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
3748 3763              count_block_cb, &zcb, NULL);
3749 3764  
3750 3765          if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
3751 3766                  (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
[... 58 lines elided ...]
3810 3825          norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
3811 3826          norm_space = metaslab_class_get_space(spa_normal_class(spa));
3812 3827  
3813 3828          total_alloc = norm_alloc +
3814 3829              metaslab_class_get_alloc(spa_log_class(spa)) +
3815 3830              metaslab_class_get_alloc(spa_special_class(spa)) +
3816 3831              metaslab_class_get_alloc(spa_dedup_class(spa));
3817 3832          total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
3818 3833              zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
3819 3834  
3820      -        if (total_found == total_alloc) {
3821      -                if (!dump_opt['L'])
3822      -                        (void) printf("\n\tNo leaks (block sum matches space"
3823      -                            " maps exactly)\n");
3824      -        } else {
     3835 +        if (total_found == total_alloc && !dump_opt['L']) {
     3836 +                (void) printf("\n\tNo leaks (block sum matches space"
     3837 +                    " maps exactly)\n");
     3838 +        } else if (!dump_opt['L']) {
3825 3839                  (void) printf("block traversal size %llu != alloc %llu "
3826 3840                      "(%s %lld)\n",
3827 3841                      (u_longlong_t)total_found,
3828 3842                      (u_longlong_t)total_alloc,
3829 3843                      (dump_opt['L']) ? "unreachable" : "leaked",
3830 3844                      (longlong_t)(total_alloc - total_found));
3831 3845                  leaks = B_TRUE;
3832 3846          }
3833 3847  
3834 3848          if (tzb->zb_count == 0)
[... 319 lines elided ...]
4154 4168                      "object %llu, prev obsolete sm %llu\n",
4155 4169                      (u_longlong_t)scip->scip_vdev,
4156 4170                      (u_longlong_t)scip->scip_next_mapping_object,
4157 4171                      (u_longlong_t)scip->scip_prev_obsolete_sm_object);
4158 4172                  if (scip->scip_prev_obsolete_sm_object != 0) {
4159 4173                          space_map_t *prev_obsolete_sm = NULL;
4160 4174                          VERIFY0(space_map_open(&prev_obsolete_sm,
4161 4175                              spa->spa_meta_objset,
4162 4176                              scip->scip_prev_obsolete_sm_object,
4163 4177                              0, vd->vdev_asize, 0));
4164      -                        space_map_update(prev_obsolete_sm);
4165 4178                          dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
4166 4179                          (void) printf("\n");
4167 4180                          space_map_close(prev_obsolete_sm);
4168 4181                  }
4169 4182  
4170 4183                  scip_count += 2;
4171 4184          }
4172 4185  
4173 4186          for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
4174 4187                  vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
[... 185 lines elided ...]
4360 4373           */
4361 4374          VERIFY3U(sme->sme_offset, >=, ms->ms_start);
4362 4375          VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
4363 4376  
4364 4377          /*
4365 4378           * The entries in the vdev_checkpoint_sm should be marked as
4366 4379           * allocated in the checkpointed state of the pool, therefore
 4367 4380           * their respective ms_allocatable trees should not contain them.
4368 4381           */
4369 4382          mutex_enter(&ms->ms_lock);
4370      -        range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
     4383 +        range_tree_verify_not_present(ms->ms_allocatable,
     4384 +            sme->sme_offset, sme->sme_run);
4371 4385          mutex_exit(&ms->ms_lock);
4372 4386  
4373 4387          return (0);
4374 4388  }
4375 4389  
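range_tree_verify() becomes range_tree_verify_not_present() here, which
states the contract directly: the call must fail loudly if any part of
the segment is present in the tree. A hedged sketch of that contract
(only the function name appears in this diff; the body below is an
assumption for illustration, not the actual implementation):

    void
    range_tree_verify_not_present(range_tree_t *rt, uint64_t off,
        uint64_t size)
    {
            /* assumed body: panic if any overlapping segment exists */
            if (range_tree_find(rt, off, size) != NULL)
                    panic("segment already present in range tree");
    }
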
4376 4390  /*
4377 4391   * Verify that all segments in the vdev_checkpoint_sm are allocated
4378 4392   * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
4379 4393   * ms_allocatable).
4380 4394   *
[... 42 lines elided ...]
4423 4437                      VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4424 4438                          continue;
4425 4439  
4426 4440                  VERIFY0(zap_lookup(spa_meta_objset(current),
4427 4441                      current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4428 4442                      sizeof (uint64_t), 1, &checkpoint_sm_obj));
4429 4443  
4430 4444                  VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
4431 4445                      checkpoint_sm_obj, 0, current_vd->vdev_asize,
4432 4446                      current_vd->vdev_ashift));
4433      -                space_map_update(checkpoint_sm);
4434 4447  
4435 4448                  verify_checkpoint_sm_entry_cb_arg_t vcsec;
4436 4449                  vcsec.vcsec_vd = ckpoint_vd;
4437 4450                  vcsec.vcsec_entryid = 0;
4438 4451                  vcsec.vcsec_num_entries =
4439 4452                      space_map_length(checkpoint_sm) / sizeof (uint64_t);
4440 4453                  VERIFY0(space_map_iterate(checkpoint_sm,
     4454 +                    space_map_length(checkpoint_sm),
4441 4455                      verify_checkpoint_sm_entry_cb, &vcsec));
4442 4456                  dump_spacemap(current->spa_meta_objset, checkpoint_sm);
4443 4457                  space_map_close(checkpoint_sm);
4444 4458          }
4445 4459  
4446 4460          /*
4447 4461           * If we've added vdevs since we took the checkpoint, ensure
4448 4462           * that their checkpoint space maps are empty.
4449 4463           */
4450 4464          if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
[... 59 lines elided ...]
4510 4524                           * ms_sm spacemaps of the checkpoint. For each
4511 4525                           * one of these ranges we ensure that none of them
4512 4526                           * exists in the ms_allocatable trees of the
4513 4527                           * current state which are loaded with the ranges
4514 4528                           * that are currently free.
4515 4529                           *
4516 4530                           * This way we ensure that none of the blocks that
4517 4531                           * are part of the checkpoint were freed by mistake.
4518 4532                           */
4519 4533                          range_tree_walk(ckpoint_msp->ms_allocatable,
4520      -                            (range_tree_func_t *)range_tree_verify,
     4534 +                            (range_tree_func_t *)range_tree_verify_not_present,
4521 4535                              current_msp->ms_allocatable);
4522 4536                  }
4523 4537          }
4524 4538  
4525 4539          /* for cleaner progress output */
4526 4540          (void) fprintf(stderr, "\n");
4527 4541  }
4528 4542  
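The walk above passes range_tree_verify_not_present() straight to
range_tree_walk() with the current tree as the callback argument, so
every segment of the checkpointed tree is checked for absence from the
current one. A self-contained sketch of that composition with toy types
(the real range trees are AVL-backed):

    #include <assert.h>
    #include <stdint.h>

    typedef void walk_func_t(void *arg, uint64_t start, uint64_t size);

    /* toy "range tree": a fixed array of disjoint segments */
    typedef struct {
            struct { uint64_t start, size; } segs[2];
            int nsegs;
    } toy_tree_t;

    static void
    toy_walk(const toy_tree_t *t, walk_func_t *func, void *arg)
    {
            for (int i = 0; i < t->nsegs; i++)
                    func(arg, t->segs[i].start, t->segs[i].size);
    }

    /* callback: assert [start, start + size) overlaps nothing in arg */
    static void
    toy_verify_not_present(void *arg, uint64_t start, uint64_t size)
    {
            const toy_tree_t *t = arg;

            for (int i = 0; i < t->nsegs; i++)
                    assert(start + size <= t->segs[i].start ||
                        start >= t->segs[i].start + t->segs[i].size);
    }

    int
    main(void)
    {
            toy_tree_t checkpointed = { { { 0, 512 }, { 1024, 512 } }, 2 };
            toy_tree_t current = { { { 4096, 512 }, { 8192, 512 } }, 2 };

            /* checkpointed segments must be absent from current */
            toy_walk(&checkpointed, toy_verify_not_present, &current);
            return (0);
    }
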
4529 4543  static void
4530 4544  verify_checkpoint_blocks(spa_t *spa)
4531 4545  {
     4546 +        ASSERT(!dump_opt['L']);
     4547 +
4532 4548          spa_t *checkpoint_spa;
4533 4549          char *checkpoint_pool;
4534 4550          nvlist_t *config = NULL;
4535 4551          int error = 0;
4536 4552  
4537 4553          /*
4538 4554           * We import the checkpointed state of the pool (under a different
4539 4555           * name) so we can do verification on it against the current state
4540 4556           * of the pool.
4541 4557           */
[... 45 lines elided ...]
4587 4603                  if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
4588 4604                      VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
4589 4605                          continue;
4590 4606  
4591 4607                  VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
4592 4608                      VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
4593 4609                      sizeof (uint64_t), 1, &checkpoint_sm_obj));
4594 4610  
4595 4611                  VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
4596 4612                      checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
4597      -                space_map_update(checkpoint_sm);
4598 4613                  dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
4599 4614                  space_map_close(checkpoint_sm);
4600 4615          }
4601 4616  }
4602 4617  
4603 4618  static int
4604 4619  verify_checkpoint(spa_t *spa)
4605 4620  {
4606 4621          uberblock_t checkpoint;
4607 4622          int error;
[... 1081 lines elided ...]