10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

          --- old/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
          +++ new/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
[... 332 lines elided ...]
 333  333   *
 334  334   * As the space map grows (as a result of the appends) it will
 335  335   * eventually become space-inefficient.  When the metaslab's in-core
 336  336   * free tree is zfs_condense_pct/100 times the size of the minimal
 337  337   * on-disk representation, we rewrite it in its minimized form.  If a
 338  338   * metaslab needs to condense then we must set the ms_condensing flag to
 339  339   * ensure that allocations are not performed on the metaslab that is
 340  340   * being written.
 341  341   */
 342  342  struct metaslab {
      343 +        /*
      344 +         * This is the main lock of the metaslab and its purpose is to
      345 +         * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
      346 +         * metaslab_free_concrete(), etc.] with our various syncing
      347 +         * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
      348 +         *
      349 +         * The lock is also used during some miscellaneous operations like
      350 +         * using the metaslab's histogram for the metaslab group's histogram
      351 +         * aggregation, or marking the metaslab for initialization.
      352 +         */
 343  353          kmutex_t        ms_lock;
      354 +
      355 +        /*
      356 +         * Acquired together with the ms_lock whenever we expect to
      357 +         * write the metaslab's data on-disk (i.e. flushing entries to
      358 +         * the metaslab's space map). It helps coordinate readers of
      359 +         * the metaslab's space map [see spa_vdev_remove_thread()]
      360 +         * with writers [see metaslab_sync()].
      361 +         *
      362 +         * Note that metaslab_load(), even though it is a reader,
      363 +         * uses a completely different mechanism, based on
      364 +         * ms_synced_length, to read the metaslab's space map.
      365 +         * That said, the function still takes the ms_sync_lock
      366 +         * after it has read the ms_sm [see the relevant comment
      367 +         * in metaslab_load() as to why].
      368 +         */
 344  369          kmutex_t        ms_sync_lock;
      370 +
 345  371          kcondvar_t      ms_load_cv;
 346  372          space_map_t     *ms_sm;
 347  373          uint64_t        ms_id;
 348  374          uint64_t        ms_start;
 349  375          uint64_t        ms_size;
 350  376          uint64_t        ms_fragmentation;
 351  377  
 352  378          range_tree_t    *ms_allocating[TXG_SIZE];
 353  379          range_tree_t    *ms_allocatable;
      380 +        uint64_t        ms_allocated_this_txg;
 354  381  
 355  382          /*
 356  383           * The following range trees are accessed only from syncing context.
 357  384           * ms_free*tree only have entries while syncing, and are empty
 358  385           * between syncs.
 359  386           */
 360  387          range_tree_t    *ms_freeing;    /* to free this syncing txg */
 361  388          range_tree_t    *ms_freed;      /* already freed this syncing txg */
 362  389          range_tree_t    *ms_defer[TXG_DEFER_SIZE];
 363  390          range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
[... 4 lines elided ...]
 368  395  
 369  396          uint64_t        ms_initializing; /* leaves initializing this ms */
 370  397  
 371  398          /*
 372  399           * We must always hold the ms_lock when modifying ms_loaded
 373  400           * and ms_loading.
 374  401           */
 375  402          boolean_t       ms_loaded;
 376  403          boolean_t       ms_loading;
 377  404  
      405 +        /*
      406 +         * The following histograms count entries that are in the
      407 +         * metaslab's space map (and its histogram) but are not in
      408 +         * ms_allocatable yet, because they are in ms_freed, ms_freeing,
      409 +         * or ms_defer[].
      410 +         *
      411 +         * When the metaslab is not loaded, its ms_weight needs to
      412 +         * reflect what is allocatable (i.e. what will be part of
      413 +         * ms_allocatable if it is loaded).  The weight is computed from
      414 +         * the spacemap histogram, but that includes ranges that are
      415 +         * not yet allocatable (because they are in ms_freed,
      416 +         * ms_freeing, or ms_defer[]).  Therefore, when calculating the
      417 +         * weight, we need to remove those ranges.
      418 +         *
      419 +         * The ranges in the ms_freed and ms_defer[] range trees are all
      420 +         * present in the spacemap.  However, the spacemap may have
      421 +         * multiple entries to represent a contiguous range, because it
      422 +         * is written across multiple sync passes, but the changes of
      423 +         * all sync passes are consolidated into the range trees.
      424 +         * Adjacent ranges that are freed in different sync passes of
      425 +         * one txg will be represented separately (as 2 or more entries)
      426 +         * in the space map (and its histogram), but these adjacent
      427 +         * ranges will be consolidated (represented as one entry) in the
      428 +         * ms_freed/ms_defer[] range trees (and their histograms).
      429 +         *
      430 +         * When calculating the weight, we cannot simply subtract the
      431 +         * range trees' histograms from the spacemap's histogram,
      432 +         * because the range trees' histograms may have entries in
      433 +         * higher buckets than the spacemap, due to consolidation.
      434 +         * Instead we must subtract the exact entries that were added to
      435 +         * the spacemap's histogram.  ms_synchist and ms_deferhist[]
      436 +         * represent these exact entries, so we can subtract them from
      437 +         * the spacemap's histogram when calculating ms_weight.
      438 +         *
      439 +         * ms_synchist represents the same ranges as ms_freeing +
      440 +         * ms_freed, but without consolidation across sync passes.
      441 +         *
      442 +         * ms_deferhist[i] represents the same ranges as ms_defer[i],
      443 +         * but without consolidation across sync passes.
      444 +         */
      445 +        uint64_t        ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
      446 +        uint64_t        ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
      447 +
      448 +        /*
      449 +         * Tracks the exact amount of allocated space of this metaslab
      450 +         * (and specifically the metaslab's space map) up to the most
      451 +         * recently completed sync pass [see usage in metaslab_sync()].
      452 +         */
      453 +        uint64_t        ms_allocated_space;
 378  454          int64_t         ms_deferspace;  /* sum of ms_defermap[] space   */
 379  455          uint64_t        ms_weight;      /* weight vs. others in group   */
 380  456          uint64_t        ms_activation_weight;   /* activation weight    */
 381  457  
 382  458          /*
 383  459           * Tracks when a metaslab is selected for loading or allocation.
 384  460           * We use this value to determine how long the metaslab should
 385  461           * stay cached.
 386  462           */
 387  463          uint64_t        ms_selected_txg;
[... 16 lines elided ...]
 404  480           * only difference is that the ms_allocatable_by_size is ordered by
 405  481           * segment sizes.
 406  482           */
 407  483          avl_tree_t      ms_allocatable_by_size;
 408  484          uint64_t        ms_lbas[MAX_LBAS];
 409  485  
 410  486          metaslab_group_t *ms_group;     /* metaslab group               */
 411  487          avl_node_t      ms_group_node;  /* node in metaslab group tree  */
 412  488          txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
 413  489  
      490 +        /* updated every time we are done syncing the metaslab's space map */
      491 +        uint64_t        ms_synced_length;
      492 +
 414  493          boolean_t       ms_new;
 415  494  };
 416  495  
 417  496  #ifdef  __cplusplus
 418  497  }
 419  498  #endif
 420  499  
 421  500  #endif  /* _SYS_METASLAB_IMPL_H */
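
The two new locks encode a simple discipline: syncing writers take
ms_sync_lock and then ms_lock, while space map readers need only
ms_sync_lock.  A minimal sketch of that pattern follows; the
example_* functions are hypothetical and their bodies are elided,
so this illustrates the discipline rather than the actual illumos
code.

static void
example_sync_writer(metaslab_t *msp)
{
	/*
	 * Writers that flush entries to the on-disk space map (as
	 * metaslab_sync() does) take ms_sync_lock first, then ms_lock.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	/* ... append range tree entries to msp->ms_sm ... */

	mutex_exit(&msp->ms_lock);
	mutex_exit(&msp->ms_sync_lock);
}

static void
example_spacemap_reader(metaslab_t *msp)
{
	/*
	 * Readers of the on-disk space map (such as
	 * spa_vdev_remove_thread()) take only ms_sync_lock, which
	 * keeps them coordinated with the writer above without
	 * stalling allocations that run under ms_lock.
	 */
	mutex_enter(&msp->ms_sync_lock);
	/* ... iterate over the entries of msp->ms_sm ... */
	mutex_exit(&msp->ms_sync_lock);
}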
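
The ms_synchist/ms_deferhist comment describes subtracting exact
histogram entries when weighting an unloaded metaslab.  The sketch
below shows the shape of that computation; it is modeled on the idea
described in the comment rather than copied from the actual weight
code, and the example_* name and the final shift are illustrative.

static uint64_t
example_weight_from_spacemap(metaslab_t *msp)
{
	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = { 0 };
	uint64_t weight = 0;

	/*
	 * Gather the exact entries that are in the space map's
	 * histogram but not yet allocatable.
	 */
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		deferspace_histogram[i] += msp->ms_synchist[i];
		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			deferspace_histogram[i] += msp->ms_deferhist[t][i];
	}

	/*
	 * Weight by the largest bucket that still has allocatable
	 * segments; bucket i holds segments of size 2^(i + sm_shift).
	 */
	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		uint64_t count =
		    msp->ms_sm->sm_phys->smp_histogram[i] -
		    deferspace_histogram[i];
		if (count != 0) {
			weight = count << (i + msp->ms_sm->sm_shift);
			break;
		}
	}
	return (weight);
}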
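
Finally, ms_synced_length is what lets metaslab_load() read the space
map without holding ms_sync_lock for the bulk of the work: only the
prefix of the space map that had already been synced is read.  A
minimal sketch, assuming a hypothetical helper
example_sm_load_prefix() in place of the real space map loading
routine:

static int
example_load(metaslab_t *msp)
{
	uint64_t length;
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * Capture the synced length, then drop ms_lock while reading.
	 * Entries past this offset belong to the currently-syncing
	 * txg and are accounted for by ms_synchist/ms_deferhist[].
	 */
	length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	if (msp->ms_sm != NULL) {
		/* hypothetical: load only the synced prefix of ms_sm */
		error = example_sm_load_prefix(msp->ms_sm,
		    msp->ms_allocatable, length);
	}

	mutex_enter(&msp->ms_lock);
	return (error);
}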
    