10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
        
*** 783,816 ****
  
          if (sm == NULL)
                  return;
  
          (void) printf("space map object %llu:\n",
!             (longlong_t)sm->sm_phys->smp_object);
!         (void) printf("  smp_objsize = 0x%llx\n",
!             (longlong_t)sm->sm_phys->smp_objsize);
          (void) printf("  smp_alloc = 0x%llx\n",
              (longlong_t)sm->sm_phys->smp_alloc);
  
          /*
           * Print out the freelist entries in both encoded and decoded form.
           */
          uint8_t mapshift = sm->sm_shift;
          int64_t alloc = 0;
!         uint64_t word;
          for (uint64_t offset = 0; offset < space_map_length(sm);
              offset += sizeof (word)) {
  
                  VERIFY0(dmu_read(os, space_map_object(sm), offset,
                      sizeof (word), &word, DMU_READ_PREFETCH));
  
                  if (sm_entry_is_debug(word)) {
!                         (void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
!                             (u_longlong_t)(offset / sizeof (word)),
                              ddata[SM_DEBUG_ACTION_DECODE(word)],
                              (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
                              (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
                          continue;
                  }
  
                  uint8_t words;
                  char entry_type;
--- 783,820 ----
  
          if (sm == NULL)
                  return;
  
          (void) printf("space map object %llu:\n",
!             (longlong_t)sm->sm_object);
!         (void) printf("  smp_length = 0x%llx\n",
!             (longlong_t)sm->sm_phys->smp_length);
          (void) printf("  smp_alloc = 0x%llx\n",
              (longlong_t)sm->sm_phys->smp_alloc);
  
+         if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+                 return;
+ 
          /*
           * Print out the freelist entries in both encoded and decoded form.
           */
          uint8_t mapshift = sm->sm_shift;
          int64_t alloc = 0;
!         uint64_t word, entry_id = 0;
          for (uint64_t offset = 0; offset < space_map_length(sm);
              offset += sizeof (word)) {
  
                  VERIFY0(dmu_read(os, space_map_object(sm), offset,
                      sizeof (word), &word, DMU_READ_PREFETCH));
  
                  if (sm_entry_is_debug(word)) {
!                         (void) printf("\t    [%6llu] %s: txg %llu pass %llu\n",
!                             (u_longlong_t)entry_id,
                              ddata[SM_DEBUG_ACTION_DECODE(word)],
                              (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
                              (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+                         entry_id++;
                          continue;
                  }
  
                  uint8_t words;
                  char entry_type;
*** 844,865 ****
                          words = 2;
                  }
  
                  (void) printf("\t    [%6llu]    %c  range:"
                      " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
!                     (u_longlong_t)(offset / sizeof (word)),
                      entry_type, (u_longlong_t)entry_off,
                      (u_longlong_t)(entry_off + entry_run),
                      (u_longlong_t)entry_run,
                      (u_longlong_t)entry_vdev, words);
  
                  if (entry_type == 'A')
                          alloc += entry_run;
                  else
                          alloc -= entry_run;
          }
!         if ((uint64_t)alloc != space_map_allocated(sm)) {
                  (void) printf("space_map_object alloc (%lld) INCONSISTENT "
                      "with space map summary (%lld)\n",
                      (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
          }
  }
--- 848,870 ----
                          words = 2;
                  }
  
                  (void) printf("\t    [%6llu]    %c  range:"
                      " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
!                     (u_longlong_t)entry_id,
                      entry_type, (u_longlong_t)entry_off,
                      (u_longlong_t)(entry_off + entry_run),
                      (u_longlong_t)entry_run,
                      (u_longlong_t)entry_vdev, words);
  
                  if (entry_type == 'A')
                          alloc += entry_run;
                  else
                          alloc -= entry_run;
+                 entry_id++;
          }
!         if (alloc != space_map_allocated(sm)) {
                  (void) printf("space_map_object alloc (%lld) INCONSISTENT "
                      "with space map summary (%lld)\n",
                      (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
          }
  }
*** 919,933 ****
                      (u_longlong_t)msp->ms_fragmentation);
                  dump_histogram(sm->sm_phys->smp_histogram,
                      SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
          }
  
-         if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
                  ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
- 
                  dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
-         }
  }
  
  static void
  print_vdev_metaslab_header(vdev_t *vd)
  {
--- 924,935 ----
*** 3096,3105 ****
--- 3098,3109 ----
  {
          ddt_bookmark_t ddb;
          ddt_entry_t dde;
          int error;
  
+         ASSERT(!dump_opt['L']);
+ 
          bzero(&ddb, sizeof (ddb));
          while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
                  blkptr_t blk;
                  ddt_phys_t *ddp = dde.dde_phys;
  
*** 3119,3135 ****
                                  zcb->zcb_dedup_asize +=
                                      BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
                                  zcb->zcb_dedup_blocks++;
                          }
                  }
-                 if (!dump_opt['L']) {
                          ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
                          ddt_enter(ddt);
                          VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
                          ddt_exit(ddt);
                  }
-         }
  
          ASSERT(error == ENOENT);
  }
  
  /* ARGSUSED */
--- 3123,3137 ----
*** 3166,3175 ****
--- 3168,3180 ----
   * blocks.
   */
  static void
  zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
  {
+         if (dump_opt['L'])
+                 return;
+ 
          if (spa->spa_vdev_removal == NULL)
                  return;
  
          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
  
*** 3257,3267 ****
          if (scip->scip_vdev == vd->vdev_id &&
              scip->scip_prev_obsolete_sm_object != 0) {
                  space_map_t *prev_obsolete_sm = NULL;
                  VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
                      scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
-                 space_map_update(prev_obsolete_sm);
                  vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
                      prev_obsolete_sm);
                  space_map_close(prev_obsolete_sm);
          }
          return (counts);
--- 3262,3271 ----
*** 3351,3372 ****
          cseea.cseea_vd = vd;
          cseea.cseea_checkpoint_size = 0;
  
          VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
              checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-         space_map_update(checkpoint_sm);
  
          VERIFY0(space_map_iterate(checkpoint_sm,
              checkpoint_sm_exclude_entry_cb, &cseea));
          space_map_close(checkpoint_sm);
  
          zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
  }
  
  static void
  zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
  {
          vdev_t *rvd = spa->spa_root_vdev;
          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                  ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
                  zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
          }
--- 3355,3378 ----
          cseea.cseea_vd = vd;
          cseea.cseea_checkpoint_size = 0;
  
          VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
              checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
  
          VERIFY0(space_map_iterate(checkpoint_sm,
+             space_map_length(checkpoint_sm),
              checkpoint_sm_exclude_entry_cb, &cseea));
          space_map_close(checkpoint_sm);
  
          zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
  }
  
  static void
  zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
  {
+         ASSERT(!dump_opt['L']);
+ 
          vdev_t *rvd = spa->spa_root_vdev;
          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                  ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
                  zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
          }
*** 3459,3468 ****
--- 3465,3476 ----
  }
  
  static void
  zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
  {
+         ASSERT(!dump_opt['L']);
+ 
          vdev_t *rvd = spa->spa_root_vdev;
          for (uint64_t c = 0; c < rvd->vdev_children; c++) {
                  vdev_t *vd = rvd->vdev_child[c];
  
                  ASSERT3U(c, ==, vd->vdev_id);
*** 3505,3515 ****
  static void
  zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
  {
          zcb->zcb_spa = spa;
  
!         if (!dump_opt['L']) {
                  dsl_pool_t *dp = spa->spa_dsl_pool;
                  vdev_t *rvd = spa->spa_root_vdev;
  
                  /*
                   * We are going to be changing the meaning of the metaslab's
--- 3513,3525 ----
  static void
  zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
  {
          zcb->zcb_spa = spa;
  
!         if (dump_opt['L'])
!                 return;
! 
          dsl_pool_t *dp = spa->spa_dsl_pool;
          vdev_t *rvd = spa->spa_root_vdev;
  
          /*
           * We are going to be changing the meaning of the metaslab's
*** 3550,3559 ****
--- 3560,3570 ----
           * entries on the checkpoint_sm are marked as allocated in
           * the ms_sm of their relevant metaslab.
           * [see comment in checkpoint_sm_exclude_entry_cb()]
           */
          zdb_leak_init_exclude_checkpoint(spa, zcb);
+         ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
  
          /* for cleaner progress output */
          (void) fprintf(stderr, "\n");
  
          if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
*** 3560,3576 ****
                          ASSERT(spa_feature_is_enabled(spa,
                              SPA_FEATURE_DEVICE_REMOVAL));
                          (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
                              increment_indirect_mapping_cb, zcb, NULL);
                  }
-         } else {
-                 /*
-                  * If leak tracing is disabled, we still need to consider
-                  * any checkpointed space in our space verification.
-                  */
-                 zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
-         }
  
          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
          zdb_ddt_leak_init(spa, zcb);
          spa_config_exit(spa, SCL_CONFIG, FTAG);
  }
--- 3571,3580 ----
*** 3646,3661 ****
  }
  
  static boolean_t
  zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
  {
          boolean_t leaks = B_FALSE;
!         if (!dump_opt['L']) {
                  vdev_t *rvd = spa->spa_root_vdev;
                  for (unsigned c = 0; c < rvd->vdev_children; c++) {
                          vdev_t *vd = rvd->vdev_child[c];
                          metaslab_group_t *mg = vd->vdev_mg;
  
                          if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
                                  leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
                          }
  
--- 3650,3670 ----
  }
  
  static boolean_t
  zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
  {
+         if (dump_opt['L'])
+                 return (B_FALSE);
+ 
          boolean_t leaks = B_FALSE;
! 
          vdev_t *rvd = spa->spa_root_vdev;
          for (unsigned c = 0; c < rvd->vdev_children; c++) {
                  vdev_t *vd = rvd->vdev_child[c];
+ #if DEBUG
                  metaslab_group_t *mg = vd->vdev_mg;
+ #endif
  
                  if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
                          leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
                  }
  
*** 3686,3701 ****
  
                                  if (msp->ms_loaded) {
                                          msp->ms_loaded = B_FALSE;
                                  }
                          }
                  }
  
                  umem_free(zcb->zcb_vd_obsolete_counts,
                      rvd->vdev_children * sizeof (uint32_t *));
                  zcb->zcb_vd_obsolete_counts = NULL;
!         }
          return (leaks);
  }
  
  /* ARGSUSED */
  static int
--- 3695,3711 ----
  
                          if (msp->ms_loaded) {
                                  msp->ms_loaded = B_FALSE;
                          }
                  }
+ 
          }
  
          umem_free(zcb->zcb_vd_obsolete_counts,
              rvd->vdev_children * sizeof (uint32_t *));
          zcb->zcb_vd_obsolete_counts = NULL;
! 
          return (leaks);
  }
  
  /* ARGSUSED */
  static int
*** 3730,3746 ****
              dump_opt['c'] ? "checksums " : "",
              (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
              !dump_opt['L'] ? "nothing leaked " : "");
  
          /*
!          * Load all space maps as SM_ALLOC maps, then traverse the pool
!          * claiming each block we discover.  If the pool is perfectly
!          * consistent, the space maps will be empty when we're done.
!          * Anything left over is a leak; any block we can't claim (because
!          * it's not part of any space map) is a double allocation,
!          * reference to a freed block, or an unclaimed log block.
           */
          zdb_leak_init(spa, &zcb);
  
          /*
           * If there's a deferred-free bplist, process that first.
           */
--- 3740,3761 ----
              dump_opt['c'] ? "checksums " : "",
              (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
              !dump_opt['L'] ? "nothing leaked " : "");
  
          /*
!          * When leak detection is enabled we load all space maps as SM_ALLOC
!          * maps, then traverse the pool claiming each block we discover. If
!          * the pool is perfectly consistent, the segment trees will be empty
!          * when we're done. Anything left over is a leak; any block we can't
!          * claim (because it's not part of any space map) is a double
!          * allocation, reference to a freed block, or an unclaimed log block.
!          *
!          * When leak detection is disabled (-L option) we still traverse the
!          * pool claiming each block we discover, but we skip opening any space
!          * maps.
           */
+         bzero(&zcb, sizeof (zdb_cb_t));
          zdb_leak_init(spa, &zcb);
  
          /*
           * If there's a deferred-free bplist, process that first.
           */
*** 3815,3829 ****
              metaslab_class_get_alloc(spa_special_class(spa)) +
              metaslab_class_get_alloc(spa_dedup_class(spa));
          total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
              zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
  
!         if (total_found == total_alloc) {
!                 if (!dump_opt['L'])
                          (void) printf("\n\tNo leaks (block sum matches space"
                              " maps exactly)\n");
!         } else {
                  (void) printf("block traversal size %llu != alloc %llu "
                      "(%s %lld)\n",
                      (u_longlong_t)total_found,
                      (u_longlong_t)total_alloc,
                      (dump_opt['L']) ? "unreachable" : "leaked",
--- 3830,3843 ----
              metaslab_class_get_alloc(spa_special_class(spa)) +
              metaslab_class_get_alloc(spa_dedup_class(spa));
          total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
              zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
  
!         if (total_found == total_alloc && !dump_opt['L']) {
                  (void) printf("\n\tNo leaks (block sum matches space"
                      " maps exactly)\n");
!         } else if (!dump_opt['L']) {
                  (void) printf("block traversal size %llu != alloc %llu "
                      "(%s %lld)\n",
                      (u_longlong_t)total_found,
                      (u_longlong_t)total_alloc,
                      (dump_opt['L']) ? "unreachable" : "leaked",
*** 4159,4169 ****
                          space_map_t *prev_obsolete_sm = NULL;
                          VERIFY0(space_map_open(&prev_obsolete_sm,
                              spa->spa_meta_objset,
                              scip->scip_prev_obsolete_sm_object,
                              0, vd->vdev_asize, 0));
-                         space_map_update(prev_obsolete_sm);
                          dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
                          (void) printf("\n");
                          space_map_close(prev_obsolete_sm);
                  }
  
--- 4173,4182 ----
*** 4365,4375 ****
           * The entries in the vdev_checkpoint_sm should be marked as
           * allocated in the checkpointed state of the pool, therefore
           *           their respective ms_allocatable trees should not contain them.
           */
          mutex_enter(&ms->ms_lock);
!         range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
          mutex_exit(&ms->ms_lock);
  
          return (0);
  }
  
--- 4378,4389 ----
           * The entries in the vdev_checkpoint_sm should be marked as
           * allocated in the checkpointed state of the pool, therefore
           *           their respective ms_allocatable trees should not contain them.
           */
          mutex_enter(&ms->ms_lock);
!         range_tree_verify_not_present(ms->ms_allocatable,
!             sme->sme_offset, sme->sme_run);
          mutex_exit(&ms->ms_lock);
  
          return (0);
  }
  
*** 4428,4445 ****
                      sizeof (uint64_t), 1, &checkpoint_sm_obj));
  
                  VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
                      checkpoint_sm_obj, 0, current_vd->vdev_asize,
                      current_vd->vdev_ashift));
-                 space_map_update(checkpoint_sm);
  
                  verify_checkpoint_sm_entry_cb_arg_t vcsec;
                  vcsec.vcsec_vd = ckpoint_vd;
                  vcsec.vcsec_entryid = 0;
                  vcsec.vcsec_num_entries =
                      space_map_length(checkpoint_sm) / sizeof (uint64_t);
                  VERIFY0(space_map_iterate(checkpoint_sm,
                      verify_checkpoint_sm_entry_cb, &vcsec));
                  dump_spacemap(current->spa_meta_objset, checkpoint_sm);
                  space_map_close(checkpoint_sm);
          }
  
--- 4442,4459 ----
                      sizeof (uint64_t), 1, &checkpoint_sm_obj));
  
                  VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
                      checkpoint_sm_obj, 0, current_vd->vdev_asize,
                      current_vd->vdev_ashift));
  
                  verify_checkpoint_sm_entry_cb_arg_t vcsec;
                  vcsec.vcsec_vd = ckpoint_vd;
                  vcsec.vcsec_entryid = 0;
                  vcsec.vcsec_num_entries =
                      space_map_length(checkpoint_sm) / sizeof (uint64_t);
                  VERIFY0(space_map_iterate(checkpoint_sm,
+                     space_map_length(checkpoint_sm),
                      verify_checkpoint_sm_entry_cb, &vcsec));
                  dump_spacemap(current->spa_meta_objset, checkpoint_sm);
                  space_map_close(checkpoint_sm);
          }
  
*** 4515,4525 ****
                           *
                           * This way we ensure that none of the blocks that
                           * are part of the checkpoint were freed by mistake.
                           */
                          range_tree_walk(ckpoint_msp->ms_allocatable,
!                             (range_tree_func_t *)range_tree_verify,
                              current_msp->ms_allocatable);
                  }
          }
  
          /* for cleaner progress output */
--- 4529,4539 ----
                           *
                           * This way we ensure that none of the blocks that
                           * are part of the checkpoint were freed by mistake.
                           */
                          range_tree_walk(ckpoint_msp->ms_allocatable,
!                             (range_tree_func_t *)range_tree_verify_not_present,
                              current_msp->ms_allocatable);
                  }
          }
  
          /* for cleaner progress output */
*** 4527,4536 ****
--- 4541,4552 ----
  }
  
  static void
  verify_checkpoint_blocks(spa_t *spa)
  {
+         ASSERT(!dump_opt['L']);
+ 
          spa_t *checkpoint_spa;
          char *checkpoint_pool;
          nvlist_t *config = NULL;
          int error = 0;
  
*** 4592,4602 ****
                      VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
                      sizeof (uint64_t), 1, &checkpoint_sm_obj));
  
                  VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
                      checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
-                 space_map_update(checkpoint_sm);
                  dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
                  space_map_close(checkpoint_sm);
          }
  }
  
--- 4608,4617 ----