Print this page
10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

@@ -338,21 +338,48 @@
  * metaslab needs to condense then we must set the ms_condensing flag to
  * ensure that allocations are not performed on the metaslab that is
  * being written.
  */
 struct metaslab {
+        /*
+         * This is the main lock of the metaslab and its purpose is to
+         * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+         * metaslab_free_concrete(), etc.] with our various syncing
+         * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
+         *
+         * The lock is also used during some miscellaneous operations like
+         * using the metaslab's histogram for the metaslab group's histogram
+         * aggregation, or marking the metaslab for initialization.
+         */
         kmutex_t        ms_lock;
+
+        /*
+         * Acquired together with the ms_lock whenever we expect to
+         * write to metaslab data on-disk (i.e. flushing entries to
+         * the metaslab's space map). It helps coordinate readers of
+         * the metaslab's space map [see spa_vdev_remove_thread()]
+         * with writers [see metaslab_sync()].
+         *
+         * Note that metaslab_load(), even though a reader, uses
+         * a completely different mechanism to deal with the reading
+         * of the metaslab's space map based on ms_synced_length. That
+         * said, the function still uses the ms_sync_lock after it
+         * has read the ms_sm [see relevant comment in metaslab_load()
+         * as to why].
+         */
         kmutex_t        ms_sync_lock;
+
         kcondvar_t      ms_load_cv;
         space_map_t     *ms_sm;
         uint64_t        ms_id;
         uint64_t        ms_start;
         uint64_t        ms_size;
         uint64_t        ms_fragmentation;
 
         range_tree_t    *ms_allocating[TXG_SIZE];
         range_tree_t    *ms_allocatable;
+        uint64_t        ms_allocated_this_txg;
 
         /*
          * The following range trees are accessed only from syncing context.
          * ms_free*tree only have entries while syncing, and are empty
          * between syncs.

@@ -373,10 +400,59 @@
          * and ms_loading.
          */
         boolean_t       ms_loaded;
         boolean_t       ms_loading;
 
+        /*
+         * The following histograms count entries that are in the
+         * metaslab's space map (and its histogram) but are not in
+         * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+         * or ms_defer[].
+         *
+         * When the metaslab is not loaded, its ms_weight needs to
+         * reflect what is allocatable (i.e. what will be part of
+         * ms_allocatable if it is loaded).  The weight is computed from
+         * the spacemap histogram, but that includes ranges that are
+         * not yet allocatable (because they are in ms_freed,
+         * ms_freeing, or ms_defer[]).  Therefore, when calculating the
+         * weight, we need to remove those ranges.
+         *
+         * The ranges in the ms_freed and ms_defer[] range trees are all
+         * present in the spacemap.  However, the spacemap may have
+         * multiple entries to represent a contiguous range, because it
+         * is written across multiple sync passes, but the changes of
+         * all sync passes are consolidated into the range trees.
+         * Adjacent ranges that are freed in different sync passes of
+         * one txg will be represented separately (as 2 or more entries)
+         * in the space map (and its histogram), but these adjacent
+         * ranges will be consolidated (represented as one entry) in the
+         * ms_freed/ms_defer[] range trees (and their histograms).
+         *
+         * When calculating the weight, we cannot simply subtract the
+         * range trees' histograms from the spacemap's histogram,
+         * because the range trees' histograms may have entries in
+         * higher buckets than the spacemap, due to consolidation.
+         * Instead we must subtract the exact entries that were added to
+         * the spacemap's histogram.  ms_synchist and ms_deferhist[]
+         * represent these exact entries, so we can subtract them from
+         * the spacemap's histogram when calculating ms_weight.
+         *
+         * ms_synchist represents the same ranges as ms_freeing +
+         * ms_freed, but without consolidation across sync passes.
+         *
+         * ms_deferhist[i] represents the same ranges as ms_defer[i],
+         * but without consolidation across sync passes.
+         */
+        uint64_t        ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+        uint64_t        ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+        /*
+         * Tracks the exact amount of allocated space of this metaslab
+         * (and specifically the metaslab's space map) up to the most
+         * recently completed sync pass [see usage in metaslab_sync()].
+         */
+        uint64_t        ms_allocated_space;
         int64_t         ms_deferspace;  /* sum of ms_defer[] space      */
         uint64_t        ms_weight;      /* weight vs. others in group   */
         uint64_t        ms_activation_weight;   /* activation weight    */
 
         /*

@@ -409,10 +485,13 @@
 
         metaslab_group_t *ms_group;     /* metaslab group               */
         avl_node_t      ms_group_node;  /* node in metaslab group tree  */
         txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
 
+        /* updated every time we are done syncing the metaslab's space map */
+        uint64_t        ms_synced_length;
+
         boolean_t       ms_new;
 };
 
 #ifdef  __cplusplus
 }