10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

          --- old/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
          +++ new/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
[... 332 lines elided ...]
 333  333   *
 334  334   * As the space map grows (as a result of the appends) it will
 335  335   * eventually become space-inefficient.  When the metaslab's in-core
 336  336   * free tree is zfs_condense_pct/100 times the size of the minimal
 337  337   * on-disk representation, we rewrite it in its minimized form.  If a
 338  338   * metaslab needs to condense then we must set the ms_condensing flag to
 339  339   * ensure that allocations are not performed on the metaslab that is
 340  340   * being written.
 341  341   */
 342  342  struct metaslab {
      343 +        /*
      344 +         * This is the main lock of the metaslab and its purpose is to
      345 +         * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
      346 +         * metaslab_free_concrete(), etc.] with our various syncing
      347 +         * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
      348 +         *
      349 +         * The lock is also used during some miscellaneous operations like
      350 +         * using the metaslab's histogram for the metaslab group's histogram
      351 +         * aggregation, or marking the metaslab for initialization.
      352 +         */
 343  353          kmutex_t        ms_lock;
      354 +
      355 +        /*
      356 +         * Acquired together with the ms_lock whenever we expect to
      357 +         * write the metaslab's data on-disk (i.e. flushing entries to
      358 +         * the metaslab's space map). It helps coordinate readers of
      359 +         * the metaslab's space map [see spa_vdev_remove_thread()]
      360 +         * with writers [see metaslab_sync()].
      361 +         *
      362 +         * Note that metaslab_load(), even though it is a reader,
      363 +         * uses a completely different mechanism, based on
      364 +         * ms_synced_length, to read the metaslab's space map.
      365 +         * That said, the function still takes the ms_sync_lock
      366 +         * after it has read the ms_sm [see the relevant comment
      367 +         * in metaslab_load() as to why].
      368 +         */
 344  369          kmutex_t        ms_sync_lock;
      370 +
 345  371          kcondvar_t      ms_load_cv;
 346  372          space_map_t     *ms_sm;
 347  373          uint64_t        ms_id;
 348  374          uint64_t        ms_start;
 349  375          uint64_t        ms_size;
 350  376          uint64_t        ms_fragmentation;
 351  377  
 352  378          range_tree_t    *ms_allocating[TXG_SIZE];
 353  379          range_tree_t    *ms_allocatable;
      380 +        uint64_t        ms_allocated_this_txg;
 354  381  
 355  382          /*
 356  383           * The following range trees are accessed only from syncing context.
 357  384           * ms_free*tree only have entries while syncing, and are empty
 358  385           * between syncs.
 359  386           */
 360  387          range_tree_t    *ms_freeing;    /* to free this syncing txg */
 361  388          range_tree_t    *ms_freed;      /* already freed this syncing txg */
 362  389          range_tree_t    *ms_defer[TXG_DEFER_SIZE];
 363  390          range_tree_t    *ms_checkpointing; /* to add to the checkpoint */
[... 4 lines elided ...]
 368  395  
 369  396          uint64_t        ms_initializing; /* leaves initializing this ms */
 370  397  
 371  398          /*
 372  399           * We must always hold the ms_lock when modifying ms_loaded
 373  400           * and ms_loading.
 374  401           */
 375  402          boolean_t       ms_loaded;
 376  403          boolean_t       ms_loading;
 377  404  
      405 +        /*
      406 +         * The following histograms count entries that are in the
      407 +         * metaslab's space map (and its histogram) but are not in
      408 +         * ms_allocatable yet, because they are in ms_freed, ms_freeing,
      409 +         * or ms_defer[].
      410 +         *
      411 +         * When the metaslab is not loaded, its ms_weight needs to
      412 +         * reflect what is allocatable (i.e. what will be part of
      413 +         * ms_allocatable if it is loaded).  The weight is computed from
      414 +         * the spacemap histogram, but that includes ranges that are
      415 +         * not yet allocatable (because they are in ms_freed,
      416 +         * ms_freeing, or ms_defer[]).  Therefore, when calculating the
      417 +         * weight, we need to remove those ranges.
      418 +         *
      419 +         * The ranges in the ms_freed and ms_defer[] range trees are all
      420 +         * present in the spacemap.  However, the spacemap may have
      421 +         * multiple entries to represent a contiguous range, because it
      422 +         * is written across multiple sync passes, but the changes of
      423 +         * all sync passes are consolidated into the range trees.
      424 +         * Adjacent ranges that are freed in different sync passes of
      425 +         * one txg will be represented separately (as 2 or more entries)
      426 +         * in the space map (and its histogram), but these adjacent
      427 +         * ranges will be consolidated (represented as one entry) in the
      428 +         * ms_freed/ms_defer[] range trees (and their histograms).
      429 +         *
      430 +         * When calculating the weight, we cannot simply subtract the
      431 +         * range trees' histograms from the spacemap's histogram,
      432 +         * because the range trees' histograms may have entries in
      433 +         * higher buckets than the spacemap, due to consolidation.
      434 +         * Instead we must subtract the exact entries that were added to
      435 +         * the spacemap's histogram.  ms_synchist and ms_deferhist[]
      436 +         * represent these exact entries, so we can subtract them from
      437 +         * the spacemap's histogram when calculating ms_weight.
      438 +         *
      439 +         * ms_synchist represents the same ranges as ms_freeing +
      440 +         * ms_freed, but without consolidation across sync passes.
      441 +         *
      442 +         * ms_deferhist[i] represents the same ranges as ms_defer[i],
      443 +         * but without consolidation across sync passes.
      444 +         */
      445 +        uint64_t        ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
      446 +        uint64_t        ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
      447 +
      448 +        /*
      449 +         * Tracks the exact amount of allocated space of this metaslab
      450 +         * (and specifically the metaslab's space map) up to the most
      451 +         * recently completed sync pass [see usage in metaslab_sync()].
      452 +         */
      453 +        uint64_t        ms_allocated_space;
 378  454          int64_t         ms_deferspace;  /* sum of ms_defermap[] space   */
 379  455          uint64_t        ms_weight;      /* weight vs. others in group   */
 380  456          uint64_t        ms_activation_weight;   /* activation weight    */
 381  457  
 382  458          /*
 383  459           * Tracks when a metaslab is selected for loading or allocation.
 384  460           * We use this value to determine how long the metaslab should
 385  461           * stay cached.
 386  462           */
 387  463          uint64_t        ms_selected_txg;
[... 16 lines elided ...]
 404  480           * only difference is that the ms_allocatable_by_size is ordered by
 405  481           * segment sizes.
 406  482           */
 407  483          avl_tree_t      ms_allocatable_by_size;
 408  484          uint64_t        ms_lbas[MAX_LBAS];
 409  485  
 410  486          metaslab_group_t *ms_group;     /* metaslab group               */
 411  487          avl_node_t      ms_group_node;  /* node in metaslab group tree  */
 412  488          txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */
 413  489  
      490 +        /* updated every time we are done syncing the metaslab's space map */
      491 +        uint64_t        ms_synced_length;
      492 +
 414  493          boolean_t       ms_new;
 415  494  };
 416  495  
 417  496  #ifdef  __cplusplus
 418  497  }
 419  498  #endif
 420  499  
 421  500  #endif  /* _SYS_METASLAB_IMPL_H */
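
The two new locks encode a simple discipline: syncing writers take
ms_sync_lock and then ms_lock, while space map readers need only
ms_sync_lock.  A minimal sketch of that pattern follows; the
example_* functions are hypothetical and their bodies are elided,
so this illustrates the discipline rather than the actual illumos
code.

static void
example_sync_writer(metaslab_t *msp)
{
	/*
	 * Writers that flush entries to the on-disk space map (as
	 * metaslab_sync() does) take ms_sync_lock first, then ms_lock.
	 */
	mutex_enter(&msp->ms_sync_lock);
	mutex_enter(&msp->ms_lock);

	/* ... append range tree entries to msp->ms_sm ... */

	mutex_exit(&msp->ms_lock);
	mutex_exit(&msp->ms_sync_lock);
}

static void
example_spacemap_reader(metaslab_t *msp)
{
	/*
	 * Readers of the on-disk space map (such as
	 * spa_vdev_remove_thread()) take only ms_sync_lock, which
	 * keeps them coordinated with the writer above without
	 * stalling allocations that run under ms_lock.
	 */
	mutex_enter(&msp->ms_sync_lock);
	/* ... iterate over the entries of msp->ms_sm ... */
	mutex_exit(&msp->ms_sync_lock);
}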
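
The ms_synchist/ms_deferhist comment describes subtracting exact
histogram entries when weighting an unloaded metaslab.  The sketch
below shows the shape of that computation; it is modeled on the idea
described in the comment rather than copied from the actual weight
code, and the example_* name and the final shift are illustrative.

static uint64_t
example_weight_from_spacemap(metaslab_t *msp)
{
	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = { 0 };
	uint64_t weight = 0;

	/*
	 * Gather the exact entries that are in the space map's
	 * histogram but not yet allocatable.
	 */
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		deferspace_histogram[i] += msp->ms_synchist[i];
		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			deferspace_histogram[i] += msp->ms_deferhist[t][i];
	}

	/*
	 * Weight by the largest bucket that still has allocatable
	 * segments; bucket i holds segments of size 2^(i + sm_shift).
	 */
	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		uint64_t count =
		    msp->ms_sm->sm_phys->smp_histogram[i] -
		    deferspace_histogram[i];
		if (count != 0) {
			weight = count << (i + msp->ms_sm->sm_shift);
			break;
		}
	}
	return (weight);
}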
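
Finally, ms_synced_length is what lets metaslab_load() read the space
map without holding ms_sync_lock for the bulk of the work: only the
prefix of the space map that had already been synced is read.  A
minimal sketch, assuming a hypothetical helper
example_sm_load_prefix() in place of the real space map loading
routine:

static int
example_load(metaslab_t *msp)
{
	uint64_t length;
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * Capture the synced length, then drop ms_lock while reading.
	 * Entries past this offset belong to the currently-syncing
	 * txg and are accounted for by ms_synchist/ms_deferhist[].
	 */
	length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	if (msp->ms_sm != NULL) {
		/* hypothetical: load only the synced prefix of ms_sm */
		error = example_sm_load_prefix(msp->ms_sm,
		    msp->ms_allocatable, length);
	}

	mutex_enter(&msp->ms_lock);
	return (error);
}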
    