Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
        
*** 21,31 ****
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  /*
!  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
   */
  
  #include <sys/zfs_context.h>
  #include <sys/spa.h>
  #include <sys/dmu.h>
--- 21,31 ----
  /*
   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  /*
!  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   */
  
  #include <sys/zfs_context.h>
  #include <sys/spa.h>
  #include <sys/dmu.h>
*** 79,111 ****
          return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
  }
  
  /*
   * Iterate through the space map, invoking the callback on each (non-debug)
!  * space map entry.
   */
  int
! space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
  {
!         uint64_t sm_len = space_map_length(sm);
!         ASSERT3U(sm->sm_blksz, !=, 0);
  
!         dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
              ZIO_PRIORITY_SYNC_READ);
  
-         uint64_t blksz = sm->sm_blksz;
          int error = 0;
!         for (uint64_t block_base = 0; block_base < sm_len && error == 0;
              block_base += blksz) {
                  dmu_buf_t *db;
                  error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
                      block_base, FTAG, &db, DMU_READ_PREFETCH);
                  if (error != 0)
                          return (error);
  
                  uint64_t *block_start = db->db_data;
!                 uint64_t block_length = MIN(sm_len - block_base, blksz);
                  uint64_t *block_end = block_start +
                      (block_length / sizeof (uint64_t));
  
                  VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
                  VERIFY3U(block_length, !=, 0);
--- 79,113 ----
          return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
  }
  
  /*
   * Iterate through the space map, invoking the callback on each (non-debug)
!  * space map entry. Stop after reading 'end' bytes of the space map.
   */
  int
! space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
  {
!         uint64_t blksz = sm->sm_blksz;
  
!         ASSERT3U(blksz, !=, 0);
!         ASSERT3U(end, <=, space_map_length(sm));
!         ASSERT0(P2PHASE(end, sizeof (uint64_t)));
! 
!         dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
              ZIO_PRIORITY_SYNC_READ);
  
          int error = 0;
!         for (uint64_t block_base = 0; block_base < end && error == 0;
              block_base += blksz) {
                  dmu_buf_t *db;
                  error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
                      block_base, FTAG, &db, DMU_READ_PREFETCH);
                  if (error != 0)
                          return (error);
  
                  uint64_t *block_start = db->db_data;
!                 uint64_t block_length = MIN(end - block_base, blksz);
                  uint64_t *block_end = block_start +
                      (block_length / sizeof (uint64_t));
  
                  VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
                  VERIFY3U(block_length, !=, 0);
*** 184,194 ****
           * Find the offset of the last word in the space map and use
           * that to read the last block of the space map with
           * dmu_buf_hold().
           */
          uint64_t last_word_offset =
!             sm->sm_phys->smp_objsize - sizeof (uint64_t);
          error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
              FTAG, &db, DMU_READ_NO_PREFETCH);
          if (error != 0)
                  return (error);
  
--- 186,196 ----
           * Find the offset of the last word in the space map and use
           * that to read the last block of the space map with
           * dmu_buf_hold().
           */
          uint64_t last_word_offset =
!             sm->sm_phys->smp_length - sizeof (uint64_t);
          error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
              FTAG, &db, DMU_READ_NO_PREFETCH);
          if (error != 0)
                  return (error);
  
*** 197,207 ****
          ASSERT3U(bufsz, >=, db->db_size);
          ASSERT(nwords != NULL);
  
          uint64_t *words = db->db_data;
          *nwords =
!             (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
  
          ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
  
          uint64_t n = *nwords;
          uint64_t j = n - 1;
--- 199,209 ----
          ASSERT3U(bufsz, >=, db->db_size);
          ASSERT(nwords != NULL);
  
          uint64_t *words = db->db_data;
          *nwords =
!             (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
  
          ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
  
          uint64_t n = *nwords;
          uint64_t j = n - 1;
*** 296,307 ****
  
                  for (uint64_t i = 0; i < nwords; i++) {
                          uint64_t e = buf[i];
  
                          if (sm_entry_is_debug(e)) {
!                                 sm->sm_phys->smp_objsize -= sizeof (uint64_t);
!                                 space_map_update(sm);
                                  continue;
                          }
  
                          int words = 1;
                          uint64_t raw_offset, raw_run, vdev_id;
--- 298,308 ----
  
                  for (uint64_t i = 0; i < nwords; i++) {
                          uint64_t e = buf[i];
  
                          if (sm_entry_is_debug(e)) {
!                                 sm->sm_phys->smp_length -= sizeof (uint64_t);
                                  continue;
                          }
  
                          int words = 1;
                          uint64_t raw_offset, raw_run, vdev_id;
*** 352,370 ****
  
                          if (type == SM_ALLOC)
                                  sm->sm_phys->smp_alloc -= entry_run;
                          else
                                  sm->sm_phys->smp_alloc += entry_run;
!                         sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
!                         space_map_update(sm);
                  }
          }
  
          if (space_map_length(sm) == 0) {
                  ASSERT0(error);
!                 ASSERT0(sm->sm_phys->smp_objsize);
!                 ASSERT0(sm->sm_alloc);
          }
  
          zio_buf_free(buf, bufsz);
          return (error);
  }
--- 353,369 ----
  
                          if (type == SM_ALLOC)
                                  sm->sm_phys->smp_alloc -= entry_run;
                          else
                                  sm->sm_phys->smp_alloc += entry_run;
!                         sm->sm_phys->smp_length -= words * sizeof (uint64_t);
                  }
          }
  
          if (space_map_length(sm) == 0) {
                  ASSERT0(error);
!                 ASSERT0(space_map_allocated(sm));
          }
  
          zio_buf_free(buf, bufsz);
          return (error);
  }
*** 389,430 ****
  
          return (0);
  }
  
  /*
!  * Load the space map disk into the specified range tree. Segments of maptype
!  * are added to the range tree, other segment types are removed.
   */
  int
! space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
  {
-         uint64_t space;
-         int err;
          space_map_load_arg_t smla;
  
          VERIFY0(range_tree_space(rt));
-         space = space_map_allocated(sm);
  
!         if (maptype == SM_FREE) {
                  range_tree_add(rt, sm->sm_start, sm->sm_size);
-                 space = sm->sm_size - space;
-         }
  
          smla.smla_rt = rt;
          smla.smla_sm = sm;
          smla.smla_type = maptype;
!         err = space_map_iterate(sm, space_map_load_callback, &smla);
  
!         if (err == 0) {
!                 VERIFY3U(range_tree_space(rt), ==, space);
!         } else {
                  range_tree_vacate(rt, NULL, NULL);
-         }
  
          return (err);
  }
  
  void
  space_map_histogram_clear(space_map_t *sm)
  {
          if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
                  return;
--- 388,433 ----
  
          return (0);
  }
  
  /*
!  * Load the space map into the range tree, like space_map_load(), but only
!  * read the first 'length' bytes of the space map.
   */
  int
! space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
!     uint64_t length)
  {
          space_map_load_arg_t smla;
  
          VERIFY0(range_tree_space(rt));
  
!         if (maptype == SM_FREE)
                  range_tree_add(rt, sm->sm_start, sm->sm_size);
  
          smla.smla_rt = rt;
          smla.smla_sm = sm;
          smla.smla_type = maptype;
!         int err = space_map_iterate(sm, length,
!             space_map_load_callback, &smla);
  
!         if (err != 0)
                  range_tree_vacate(rt, NULL, NULL);
  
          return (err);
  }
  
+ /*
+  * Load the space map from disk into the specified range tree. Segments of
+  * maptype are added to the range tree; other segment types are removed.
+  */
+ int
+ space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+ {
+         return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+ }
+ 
  void
  space_map_histogram_clear(space_map_t *sm)
  {
          if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
                  return;
*** 504,517 ****
          uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
              SM_DEBUG_ACTION_ENCODE(maptype) |
              SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
              SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
  
!         dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
              sizeof (dentry), &dentry, tx);
  
!         sm->sm_phys->smp_objsize += sizeof (dentry);
  }
  
  /*
   * Writes one or more entries given a segment.
   *
--- 507,520 ----
          uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
              SM_DEBUG_ACTION_ENCODE(maptype) |
              SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
              SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
  
!         dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
              sizeof (dentry), &dentry, tx);
  
!         sm->sm_phys->smp_length += sizeof (dentry);
  }
  
  /*
   * Writes one or more entries given a segment.
   *
*** 539,549 ****
          ASSERT3U(db->db_size, ==, sm->sm_blksz);
  
          uint64_t *block_base = db->db_data;
          uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
          uint64_t *block_cursor = block_base +
!             (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
  
          ASSERT3P(block_cursor, <=, block_end);
  
          uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
          uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
--- 542,552 ----
          ASSERT3U(db->db_size, ==, sm->sm_blksz);
  
          uint64_t *block_base = db->db_data;
          uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
          uint64_t *block_cursor = block_base +
!             (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
  
          ASSERT3P(block_cursor, <=, block_end);
  
          uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
          uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
*** 562,572 ****
                   * writing again from the beginning.
                   */
                  if (block_cursor == block_end) {
                          dmu_buf_rele(db, tag);
  
!                         uint64_t next_word_offset = sm->sm_phys->smp_objsize;
                          VERIFY0(dmu_buf_hold(sm->sm_os,
                              space_map_object(sm), next_word_offset,
                              tag, &db, DMU_READ_PREFETCH));
                          dmu_buf_will_dirty(db, tx);
  
--- 565,575 ----
                   * writing again from the beginning.
                   */
                  if (block_cursor == block_end) {
                          dmu_buf_rele(db, tag);
  
!                         uint64_t next_word_offset = sm->sm_phys->smp_length;
                          VERIFY0(dmu_buf_hold(sm->sm_os,
                              space_map_object(sm), next_word_offset,
                              tag, &db, DMU_READ_PREFETCH));
                          dmu_buf_will_dirty(db, tx);
  
*** 592,602 ****
                          *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
                              SM_DEBUG_ACTION_ENCODE(0) |
                              SM_DEBUG_SYNCPASS_ENCODE(0) |
                              SM_DEBUG_TXG_ENCODE(0);
                          block_cursor++;
!                         sm->sm_phys->smp_objsize += sizeof (uint64_t);
                          ASSERT3P(block_cursor, ==, block_end);
                          continue;
                  }
  
                  uint64_t run_len = MIN(size, run_max);
--- 595,605 ----
                          *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
                              SM_DEBUG_ACTION_ENCODE(0) |
                              SM_DEBUG_SYNCPASS_ENCODE(0) |
                              SM_DEBUG_TXG_ENCODE(0);
                          block_cursor++;
!                         sm->sm_phys->smp_length += sizeof (uint64_t);
                          ASSERT3P(block_cursor, ==, block_end);
                          continue;
                  }
  
                  uint64_t run_len = MIN(size, run_max);
*** 623,633 ****
                  default:
                          panic("%d-word space map entries are not supported",
                              words);
                          break;
                  }
!                 sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
  
                  start += run_len;
                  size -= run_len;
          }
          ASSERT0(size);
--- 626,636 ----
                  default:
                          panic("%d-word space map entries are not supported",
                              words);
                          break;
                  }
!                 sm->sm_phys->smp_length += words * sizeof (uint64_t);
  
                  start += run_len;
                  size -= run_len;
          }
          ASSERT0(size);
*** 650,660 ****
  #ifdef DEBUG
          /*
           * We do this right after we write the intro debug entry
           * because the estimate does not take it into account.
           */
!         uint64_t initial_objsize = sm->sm_phys->smp_objsize;
          uint64_t estimated_growth =
              space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
          uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
  #endif
  
--- 653,663 ----
  #ifdef DEBUG
          /*
           * We do this right after we write the intro debug entry
           * because the estimate does not take it into account.
           */
!         uint64_t initial_objsize = sm->sm_phys->smp_length;
          uint64_t estimated_growth =
              space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
          uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
  #endif
  
*** 661,671 ****
          /*
           * Find the offset right after the last word in the space map
           * and use that to get a hold of the last block, so we can
           * start appending to it.
           */
!         uint64_t next_word_offset = sm->sm_phys->smp_objsize;
          VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
              next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
          ASSERT3U(db->db_size, ==, sm->sm_blksz);
  
          dmu_buf_will_dirty(db, tx);
--- 664,674 ----
          /*
           * Find the offset right after the last word in the space map
           * and use that to get a hold of the last block, so we can
           * start appending to it.
           */
!         uint64_t next_word_offset = sm->sm_phys->smp_length;
          VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
              next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
          ASSERT3U(db->db_size, ==, sm->sm_blksz);
  
          dmu_buf_will_dirty(db, tx);
*** 709,719 ****
           * We expect our estimation to be based on the worst case
           * scenario [see comment in space_map_estimate_optimal_size()].
           * Therefore we expect the actual objsize to be equal or less
           * than whatever we estimated it to be.
           */
!         ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
  #endif
  }
  
  /*
   * Note: This function manipulates the state of the given space map but
--- 712,722 ----
           * We expect our estimation to be based on the worst case
           * scenario [see comment in space_map_estimate_optimal_size()].
           * Therefore we expect the actual objsize to be equal or less
           * than whatever we estimated it to be.
           */
!         ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
  #endif
  }
  
  /*
   * Note: This function manipulates the state of the given space map but
*** 865,891 ****
                  bzero(sm->sm_phys->smp_histogram,
                      sizeof (sm->sm_phys->smp_histogram));
          }
  
          dmu_buf_will_dirty(sm->sm_dbuf, tx);
!         sm->sm_phys->smp_objsize = 0;
          sm->sm_phys->smp_alloc = 0;
  }
  
- /*
-  * Update the in-core space_map allocation and length values.
-  */
- void
- space_map_update(space_map_t *sm)
- {
-         if (sm == NULL)
-                 return;
- 
-         sm->sm_alloc = sm->sm_phys->smp_alloc;
-         sm->sm_length = sm->sm_phys->smp_objsize;
- }
- 
  uint64_t
  space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
  {
          spa_t *spa = dmu_objset_spa(os);
          uint64_t object;
--- 868,881 ----
                  bzero(sm->sm_phys->smp_histogram,
                      sizeof (sm->sm_phys->smp_histogram));
          }
  
          dmu_buf_will_dirty(sm->sm_dbuf, tx);
!         sm->sm_phys->smp_length = 0;
          sm->sm_phys->smp_alloc = 0;
  }
  
  uint64_t
  space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
  {
          spa_t *spa = dmu_objset_spa(os);
          uint64_t object;
*** 1063,1096 ****
  space_map_object(space_map_t *sm)
  {
          return (sm != NULL ? sm->sm_object : 0);
  }
  
! /*
!  * Returns the already synced, on-disk allocated space.
!  */
! uint64_t
  space_map_allocated(space_map_t *sm)
  {
!         return (sm != NULL ? sm->sm_alloc : 0);
  }
  
- /*
-  * Returns the already synced, on-disk length;
-  */
  uint64_t
  space_map_length(space_map_t *sm)
  {
!         return (sm != NULL ? sm->sm_length : 0);
! }
! 
! /*
!  * Returns the allocated space that is currently syncing.
!  */
! int64_t
! space_map_alloc_delta(space_map_t *sm)
! {
!         if (sm == NULL)
!                 return (0);
!         ASSERT(sm->sm_dbuf != NULL);
!         return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
  }
--- 1053,1068 ----
  space_map_object(space_map_t *sm)
  {
          return (sm != NULL ? sm->sm_object : 0);
  }
  
! int64_t
  space_map_allocated(space_map_t *sm)
  {
!         return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
  }
  
  uint64_t
  space_map_length(space_map_t *sm)
  {
!         return (sm != NULL ? sm->sm_phys->smp_length : 0);
  }