Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

@@ -21,11 +21,11 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>

@@ -79,33 +79,35 @@
         return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
 }
 
 /*
  * Iterate through the space map, invoking the callback on each (non-debug)
- * space map entry.
+ * space map entry. Stop after reading 'end' bytes of the space map.
  */
 int
-space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
 {
-        uint64_t sm_len = space_map_length(sm);
-        ASSERT3U(sm->sm_blksz, !=, 0);
+        uint64_t blksz = sm->sm_blksz;
 
-        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+        ASSERT3U(blksz, !=, 0);
+        ASSERT3U(end, <=, space_map_length(sm));
+        ASSERT0(P2PHASE(end, sizeof (uint64_t)));
+
+        dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
             ZIO_PRIORITY_SYNC_READ);
 
-        uint64_t blksz = sm->sm_blksz;
         int error = 0;
-        for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+        for (uint64_t block_base = 0; block_base < end && error == 0;
             block_base += blksz) {
                 dmu_buf_t *db;
                 error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
                     block_base, FTAG, &db, DMU_READ_PREFETCH);
                 if (error != 0)
                         return (error);
 
                 uint64_t *block_start = db->db_data;
-                uint64_t block_length = MIN(sm_len - block_base, blksz);
+                uint64_t block_length = MIN(end - block_base, blksz);
                 uint64_t *block_end = block_start +
                     (block_length / sizeof (uint64_t));
 
                 VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
                 VERIFY3U(block_length, !=, 0);

@@ -184,11 +186,11 @@
          * Find the offset of the last word in the space map and use
          * that to read the last block of the space map with
          * dmu_buf_hold().
          */
         uint64_t last_word_offset =
-            sm->sm_phys->smp_objsize - sizeof (uint64_t);
+            sm->sm_phys->smp_length - sizeof (uint64_t);
         error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
             FTAG, &db, DMU_READ_NO_PREFETCH);
         if (error != 0)
                 return (error);
 

@@ -197,11 +199,11 @@
         ASSERT3U(bufsz, >=, db->db_size);
         ASSERT(nwords != NULL);
 
         uint64_t *words = db->db_data;
         *nwords =
-            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 
         ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 
         uint64_t n = *nwords;
         uint64_t j = n - 1;

@@ -296,12 +298,11 @@
 
                 for (uint64_t i = 0; i < nwords; i++) {
                         uint64_t e = buf[i];
 
                         if (sm_entry_is_debug(e)) {
-                                sm->sm_phys->smp_objsize -= sizeof (uint64_t);
-                                space_map_update(sm);
+                                sm->sm_phys->smp_length -= sizeof (uint64_t);
                                 continue;
                         }
 
                         int words = 1;
                         uint64_t raw_offset, raw_run, vdev_id;

@@ -352,19 +353,17 @@
 
                         if (type == SM_ALLOC)
                                 sm->sm_phys->smp_alloc -= entry_run;
                         else
                                 sm->sm_phys->smp_alloc += entry_run;
-                        sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
-                        space_map_update(sm);
+                        sm->sm_phys->smp_length -= words * sizeof (uint64_t);
                 }
         }
 
         if (space_map_length(sm) == 0) {
                 ASSERT0(error);
-                ASSERT0(sm->sm_phys->smp_objsize);
-                ASSERT0(sm->sm_alloc);
+                ASSERT0(space_map_allocated(sm));
         }
 
         zio_buf_free(buf, bufsz);
         return (error);
 }

@@ -389,42 +388,46 @@
 
         return (0);
 }
 
 /*
- * Load the space map disk into the specified range tree. Segments of maptype
- * are added to the range tree, other segment types are removed.
+ * Load the space map into the range tree, like space_map_load(), but only
+ * read the first 'length' bytes of the space map.
  */
 int
-space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    uint64_t length)
 {
-        uint64_t space;
-        int err;
         space_map_load_arg_t smla;
 
         VERIFY0(range_tree_space(rt));
-        space = space_map_allocated(sm);
 
-        if (maptype == SM_FREE) {
+        if (maptype == SM_FREE)
                 range_tree_add(rt, sm->sm_start, sm->sm_size);
-                space = sm->sm_size - space;
-        }
 
         smla.smla_rt = rt;
         smla.smla_sm = sm;
         smla.smla_type = maptype;
-        err = space_map_iterate(sm, space_map_load_callback, &smla);
+        int err = space_map_iterate(sm, length,
+            space_map_load_callback, &smla);
 
-        if (err == 0) {
-                VERIFY3U(range_tree_space(rt), ==, space);
-        } else {
+        if (err != 0)
                 range_tree_vacate(rt, NULL, NULL);
-        }
 
         return (err);
 }
 
+/*
+ * Load the space map from disk into the specified range tree. Segments of
+ * maptype are added to the range tree; other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+        return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
+
 void
 space_map_histogram_clear(space_map_t *sm)
 {
         if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
                 return;

@@ -504,14 +507,14 @@
         uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
             SM_DEBUG_ACTION_ENCODE(maptype) |
             SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
             SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
-        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+        dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
             sizeof (dentry), &dentry, tx);
 
-        sm->sm_phys->smp_objsize += sizeof (dentry);
+        sm->sm_phys->smp_length += sizeof (dentry);
 }
 
 /*
  * Writes one or more entries given a segment.
  *

@@ -539,11 +542,11 @@
         ASSERT3U(db->db_size, ==, sm->sm_blksz);
 
         uint64_t *block_base = db->db_data;
         uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
         uint64_t *block_cursor = block_base +
-            (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+            (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
 
         ASSERT3P(block_cursor, <=, block_end);
 
         uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
         uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;

@@ -562,11 +565,11 @@
                  * writing again from the beginning.
                  */
                 if (block_cursor == block_end) {
                         dmu_buf_rele(db, tag);
 
-                        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+                        uint64_t next_word_offset = sm->sm_phys->smp_length;
                         VERIFY0(dmu_buf_hold(sm->sm_os,
                             space_map_object(sm), next_word_offset,
                             tag, &db, DMU_READ_PREFETCH));
                         dmu_buf_will_dirty(db, tx);
 

@@ -592,11 +595,11 @@
                         *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
                             SM_DEBUG_ACTION_ENCODE(0) |
                             SM_DEBUG_SYNCPASS_ENCODE(0) |
                             SM_DEBUG_TXG_ENCODE(0);
                         block_cursor++;
-                        sm->sm_phys->smp_objsize += sizeof (uint64_t);
+                        sm->sm_phys->smp_length += sizeof (uint64_t);
                         ASSERT3P(block_cursor, ==, block_end);
                         continue;
                 }
 
                 uint64_t run_len = MIN(size, run_max);

@@ -623,11 +626,11 @@
                 default:
                         panic("%d-word space map entries are not supported",
                             words);
                         break;
                 }
-                sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+                sm->sm_phys->smp_length += words * sizeof (uint64_t);
 
                 start += run_len;
                 size -= run_len;
         }
         ASSERT0(size);

@@ -650,11 +653,11 @@
 #ifdef DEBUG
         /*
          * We do this right after we write the intro debug entry
          * because the estimate does not take it into account.
          */
-        uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+        uint64_t initial_objsize = sm->sm_phys->smp_length;
         uint64_t estimated_growth =
             space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
         uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
 #endif
 

@@ -661,11 +664,11 @@
         /*
          * Find the offset right after the last word in the space map
          * and use that to get a hold of the last block, so we can
          * start appending to it.
          */
-        uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+        uint64_t next_word_offset = sm->sm_phys->smp_length;
         VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
             next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
         ASSERT3U(db->db_size, ==, sm->sm_blksz);
 
         dmu_buf_will_dirty(db, tx);

@@ -709,11 +712,11 @@
          * We expect our estimation to be based on the worst case
          * scenario [see comment in space_map_estimate_optimal_size()].
          * Therefore we expect the actual objsize to be equal or less
          * than whatever we estimated it to be.
          */
-        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+        ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
 #endif
 }
 
 /*
  * Note: This function manipulates the state of the given space map but

@@ -865,27 +868,14 @@
                 bzero(sm->sm_phys->smp_histogram,
                     sizeof (sm->sm_phys->smp_histogram));
         }
 
         dmu_buf_will_dirty(sm->sm_dbuf, tx);
-        sm->sm_phys->smp_objsize = 0;
+        sm->sm_phys->smp_length = 0;
         sm->sm_phys->smp_alloc = 0;
 }
 
-/*
- * Update the in-core space_map allocation and length values.
- */
-void
-space_map_update(space_map_t *sm)
-{
-        if (sm == NULL)
-                return;
-
-        sm->sm_alloc = sm->sm_phys->smp_alloc;
-        sm->sm_length = sm->sm_phys->smp_objsize;
-}
-
 uint64_t
 space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
 {
         spa_t *spa = dmu_objset_spa(os);
         uint64_t object;

@@ -1063,34 +1053,16 @@
 space_map_object(space_map_t *sm)
 {
         return (sm != NULL ? sm->sm_object : 0);
 }
 
-/*
- * Returns the already synced, on-disk allocated space.
- */
-uint64_t
+int64_t
 space_map_allocated(space_map_t *sm)
 {
-        return (sm != NULL ? sm->sm_alloc : 0);
+        return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
 }
 
-/*
- * Returns the already synced, on-disk length;
- */
 uint64_t
 space_map_length(space_map_t *sm)
 {
-        return (sm != NULL ? sm->sm_length : 0);
-}
-
-/*
- * Returns the allocated space that is currently syncing.
- */
-int64_t
-space_map_alloc_delta(space_map_t *sm)
-{
-        if (sm == NULL)
-                return (0);
-        ASSERT(sm->sm_dbuf != NULL);
-        return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
+        return (sm != NULL ? sm->sm_phys->smp_length : 0);
 }