Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

@@ -783,34 +783,38 @@
 
         if (sm == NULL)
                 return;
 
         (void) printf("space map object %llu:\n",
-            (longlong_t)sm->sm_phys->smp_object);
-        (void) printf("  smp_objsize = 0x%llx\n",
-            (longlong_t)sm->sm_phys->smp_objsize);
+            (longlong_t)sm->sm_object);
+        (void) printf("  smp_length = 0x%llx\n",
+            (longlong_t)sm->sm_phys->smp_length);
         (void) printf("  smp_alloc = 0x%llx\n",
             (longlong_t)sm->sm_phys->smp_alloc);
 
+        if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+                return;
+
         /*
          * Print out the freelist entries in both encoded and decoded form.
          */
         uint8_t mapshift = sm->sm_shift;
         int64_t alloc = 0;
-        uint64_t word;
+        uint64_t word, entry_id = 0;
         for (uint64_t offset = 0; offset < space_map_length(sm);
             offset += sizeof (word)) {
 
                 VERIFY0(dmu_read(os, space_map_object(sm), offset,
                     sizeof (word), &word, DMU_READ_PREFETCH));
 
                 if (sm_entry_is_debug(word)) {
-                        (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
-                            (u_longlong_t)(offset / sizeof (word)),
+                        (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
+                            (u_longlong_t)entry_id,
                             ddata[SM_DEBUG_ACTION_DECODE(word)],
                             (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
                             (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+                        entry_id++;
                         continue;
                 }
 
                 uint8_t words;
                 char entry_type;

@@ -844,22 +848,23 @@
                         words = 2;
                 }
 
                 (void) printf("\t    [%6llu]    %c  range:"
                     " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
-                    (u_longlong_t)(offset / sizeof (word)),
+                    (u_longlong_t)entry_id,
                     entry_type, (u_longlong_t)entry_off,
                     (u_longlong_t)(entry_off + entry_run),
                     (u_longlong_t)entry_run,
                     (u_longlong_t)entry_vdev, words);
 
                 if (entry_type == 'A')
                         alloc += entry_run;
                 else
                         alloc -= entry_run;
+                entry_id++;
         }
-        if ((uint64_t)alloc != space_map_allocated(sm)) {
+        if (alloc != space_map_allocated(sm)) {
                 (void) printf("space_map_object alloc (%lld) INCONSISTENT "
                     "with space map summary (%lld)\n",
                     (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
         }
 }

@@ -919,15 +924,12 @@
                     (u_longlong_t)msp->ms_fragmentation);
                 dump_histogram(sm->sm_phys->smp_histogram,
                     SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
         }
 
-        if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
                 ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
                 dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
-        }
 }
 
 static void
 print_vdev_metaslab_header(vdev_t *vd)
 {

@@ -3096,10 +3098,12 @@
 {
         ddt_bookmark_t ddb;
         ddt_entry_t dde;
         int error;
 
+        ASSERT(!dump_opt['L']);
+
         bzero(&ddb, sizeof (ddb));
         while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
                 blkptr_t blk;
                 ddt_phys_t *ddp = dde.dde_phys;
 

@@ -3119,17 +3123,15 @@
                                 zcb->zcb_dedup_asize +=
                                     BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
                                 zcb->zcb_dedup_blocks++;
                         }
                 }
-                if (!dump_opt['L']) {
                         ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
                         ddt_enter(ddt);
                         VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
                         ddt_exit(ddt);
                 }
-        }
 
         ASSERT(error == ENOENT);
 }
 
 /* ARGSUSED */

@@ -3166,10 +3168,13 @@
  * blocks.
  */
 static void
 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
 {
+        if (dump_opt['L'])
+                return;
+
         if (spa->spa_vdev_removal == NULL)
                 return;
 
         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 

@@ -3257,11 +3262,10 @@
         if (scip->scip_vdev == vd->vdev_id &&
             scip->scip_prev_obsolete_sm_object != 0) {
                 space_map_t *prev_obsolete_sm = NULL;
                 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
                     scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
-                space_map_update(prev_obsolete_sm);
                 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
                     prev_obsolete_sm);
                 space_map_close(prev_obsolete_sm);
         }
         return (counts);

@@ -3351,22 +3355,24 @@
         cseea.cseea_vd = vd;
         cseea.cseea_checkpoint_size = 0;
 
         VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
             checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-        space_map_update(checkpoint_sm);
 
         VERIFY0(space_map_iterate(checkpoint_sm,
+            space_map_length(checkpoint_sm),
             checkpoint_sm_exclude_entry_cb, &cseea));
         space_map_close(checkpoint_sm);
 
         zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
 }
 
 static void
 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
 {
+        ASSERT(!dump_opt['L']);
+
         vdev_t *rvd = spa->spa_root_vdev;
         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
                 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
         }

@@ -3459,10 +3465,12 @@
 }
 
 static void
 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
 {
+        ASSERT(!dump_opt['L']);
+
         vdev_t *rvd = spa->spa_root_vdev;
         for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                 vdev_t *vd = rvd->vdev_child[c];
 
                 ASSERT3U(c, ==, vd->vdev_id);

@@ -3505,11 +3513,13 @@
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
         zcb->zcb_spa = spa;
 
-        if (!dump_opt['L']) {
+        if (dump_opt['L'])
+                return;
+
                 dsl_pool_t *dp = spa->spa_dsl_pool;
                 vdev_t *rvd = spa->spa_root_vdev;
 
                 /*
                  * We are going to be changing the meaning of the metaslab's

@@ -3550,10 +3560,11 @@
                  * entries on the checkpoint_sm are marked as allocated in
                  * the ms_sm of their relevant metaslab.
                  * [see comment in checkpoint_sm_exclude_entry_cb()]
                  */
                 zdb_leak_init_exclude_checkpoint(spa, zcb);
+        ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
 
                 /* for cleaner progress output */
                 (void) fprintf(stderr, "\n");
 
                 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {

@@ -3560,17 +3571,10 @@
                         ASSERT(spa_feature_is_enabled(spa,
                             SPA_FEATURE_DEVICE_REMOVAL));
                         (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
                             increment_indirect_mapping_cb, zcb, NULL);
                 }
-        } else {
-                /*
-                 * If leak tracing is disabled, we still need to consider
-                 * any checkpointed space in our space verification.
-                 */
-                zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
-        }
 
         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
         zdb_ddt_leak_init(spa, zcb);
         spa_config_exit(spa, SCL_CONFIG, FTAG);
 }

@@ -3646,16 +3650,21 @@
 }
 
 static boolean_t
 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
 {
+        if (dump_opt['L'])
+                return (B_FALSE);
+
         boolean_t leaks = B_FALSE;
-        if (!dump_opt['L']) {
+
                 vdev_t *rvd = spa->spa_root_vdev;
                 for (unsigned c = 0; c < rvd->vdev_children; c++) {
                         vdev_t *vd = rvd->vdev_child[c];
+#if DEBUG
                         metaslab_group_t *mg = vd->vdev_mg;
+#endif
 
                         if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
                                 leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
                         }
 

@@ -3686,16 +3695,17 @@
 
                                 if (msp->ms_loaded) {
                                         msp->ms_loaded = B_FALSE;
                                 }
                         }
+
                 }
 
                 umem_free(zcb->zcb_vd_obsolete_counts,
                     rvd->vdev_children * sizeof (uint32_t *));
                 zcb->zcb_vd_obsolete_counts = NULL;
-        }
+
         return (leaks);
 }
 
 /* ARGSUSED */
 static int

@@ -3730,17 +3740,22 @@
             dump_opt['c'] ? "checksums " : "",
             (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
             !dump_opt['L'] ? "nothing leaked " : "");
 
         /*
-         * Load all space maps as SM_ALLOC maps, then traverse the pool
-         * claiming each block we discover.  If the pool is perfectly
-         * consistent, the space maps will be empty when we're done.
-         * Anything left over is a leak; any block we can't claim (because
-         * it's not part of any space map) is a double allocation,
-         * reference to a freed block, or an unclaimed log block.
+         * When leak detection is enabled we load all space maps as SM_ALLOC
+         * maps, then traverse the pool claiming each block we discover. If
+         * the pool is perfectly consistent, the segment trees will be empty
+         * when we're done. Anything left over is a leak; any block we can't
+         * claim (because it's not part of any space map) is a double
+         * allocation, reference to a freed block, or an unclaimed log block.
+         *
+         * When leak detection is disabled (-L option) we still traverse the
+         * pool claiming each block we discover, but we skip opening any space
+         * maps.
          */
+        bzero(&zcb, sizeof (zdb_cb_t));
         zdb_leak_init(spa, &zcb);
 
         /*
          * If there's a deferred-free bplist, process that first.
          */

@@ -3815,15 +3830,14 @@
             metaslab_class_get_alloc(spa_special_class(spa)) +
             metaslab_class_get_alloc(spa_dedup_class(spa));
         total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
             zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
 
-        if (total_found == total_alloc) {
-                if (!dump_opt['L'])
+        if (total_found == total_alloc && !dump_opt['L']) {
                         (void) printf("\n\tNo leaks (block sum matches space"
                             " maps exactly)\n");
-        } else {
+        } else if (!dump_opt['L']) {
                 (void) printf("block traversal size %llu != alloc %llu "
                     "(%s %lld)\n",
                     (u_longlong_t)total_found,
                     (u_longlong_t)total_alloc,
                     (dump_opt['L']) ? "unreachable" : "leaked",

@@ -4159,11 +4173,10 @@
                         space_map_t *prev_obsolete_sm = NULL;
                         VERIFY0(space_map_open(&prev_obsolete_sm,
                             spa->spa_meta_objset,
                             scip->scip_prev_obsolete_sm_object,
                             0, vd->vdev_asize, 0));
-                        space_map_update(prev_obsolete_sm);
                         dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
                         (void) printf("\n");
                         space_map_close(prev_obsolete_sm);
                 }
 

@@ -4365,11 +4378,12 @@
          * The entries in the vdev_checkpoint_sm should be marked as
          * allocated in the checkpointed state of the pool, therefore
          * their respective ms_allocatable trees should not contain them.
          */
         mutex_enter(&ms->ms_lock);
-        range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+        range_tree_verify_not_present(ms->ms_allocatable,
+            sme->sme_offset, sme->sme_run);
         mutex_exit(&ms->ms_lock);
 
         return (0);
 }
 

@@ -4428,18 +4442,18 @@
                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
 
                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
                     checkpoint_sm_obj, 0, current_vd->vdev_asize,
                     current_vd->vdev_ashift));
-                space_map_update(checkpoint_sm);
 
                 verify_checkpoint_sm_entry_cb_arg_t vcsec;
                 vcsec.vcsec_vd = ckpoint_vd;
                 vcsec.vcsec_entryid = 0;
                 vcsec.vcsec_num_entries =
                     space_map_length(checkpoint_sm) / sizeof (uint64_t);
                 VERIFY0(space_map_iterate(checkpoint_sm,
+                    space_map_length(checkpoint_sm),
                     verify_checkpoint_sm_entry_cb, &vcsec));
                 dump_spacemap(current->spa_meta_objset, checkpoint_sm);
                 space_map_close(checkpoint_sm);
         }
 

@@ -4515,11 +4529,11 @@
                          *
                          * This way we ensure that none of the blocks that
                          * are part of the checkpoint were freed by mistake.
                          */
                         range_tree_walk(ckpoint_msp->ms_allocatable,
-                            (range_tree_func_t *)range_tree_verify,
+                            (range_tree_func_t *)range_tree_verify_not_present,
                             current_msp->ms_allocatable);
                 }
         }
 
         /* for cleaner progress output */

@@ -4527,10 +4541,12 @@
 }
 
 static void
 verify_checkpoint_blocks(spa_t *spa)
 {
+        ASSERT(!dump_opt['L']);
+
         spa_t *checkpoint_spa;
         char *checkpoint_pool;
         nvlist_t *config = NULL;
         int error = 0;
 

@@ -4592,11 +4608,10 @@
                     VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
                     sizeof (uint64_t), 1, &checkpoint_sm_obj));
 
                 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
                     checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-                space_map_update(checkpoint_sm);
                 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
                 space_map_close(checkpoint_sm);
         }
 }