10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>


 * Each metaslab's space is tracked in a single space map in the MOS,
 * which is only updated in syncing context.  Each time we sync a txg,
 * we append the allocs and frees from that txg to the space map.  The
 * pool space is only updated once all metaslabs have finished syncing.
 *
 * To load the in-core free tree we read the space map from disk.  This
 * object contains a series of alloc and free records that are combined
 * to make up the list of all free segments in this metaslab.  These
 * segments are represented in-core by ms_allocatable and are stored in
 * an AVL tree.
 *
 * As the space map grows (as a result of the appends) it will
 * eventually become space-inefficient.  When the metaslab's in-core
 * free tree is zfs_condense_pct/100 times the size of the minimal
 * on-disk representation, we rewrite it in its minimized form.  If a
 * metaslab needs to condense then we must set the ms_condensing flag to
 * ensure that allocations are not performed on the metaslab that is
 * being written.  (An illustrative sketch of this check appears at the
 * end of this excerpt.)
 */
struct metaslab {
        /*
         * This is the main lock of the metaslab and its purpose is to
         * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
         * metaslab_free_concrete(), etc.] with our various syncing
         * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
         *
         * The lock is also used during some miscellaneous operations like
         * using the metaslab's histogram for the metaslab group's histogram
         * aggregation, or marking the metaslab for initialization.
         */
        kmutex_t        ms_lock;

        /*
         * Acquired together with the ms_lock whenever we expect to
         * write to metaslab data on-disk (i.e. flushing entries to
         * the metaslab's space map).  It helps coordinate readers of
         * the metaslab's space map [see spa_vdev_remove_thread()]
         * with writers [see metaslab_sync()].
         *
         * Note that metaslab_load(), even though a reader, uses
         * a completely different mechanism to deal with the reading
         * of the metaslab's space map based on ms_synced_length.  That
         * said, the function still uses the ms_sync_lock after it
         * has read the ms_sm [see relevant comment in metaslab_load()
         * as to why].
         */
        kmutex_t        ms_sync_lock;

        kcondvar_t      ms_load_cv;     /* signaled when a load completes */
        space_map_t     *ms_sm;         /* on-disk space map */
        uint64_t        ms_id;          /* index within the vdev's metaslabs */
        uint64_t        ms_start;       /* start offset within the vdev */
        uint64_t        ms_size;        /* size of the metaslab's range */
        uint64_t        ms_fragmentation;

        range_tree_t    *ms_allocating[TXG_SIZE];       /* allocs per open txg */
        range_tree_t    *ms_allocatable;        /* free segments when loaded */
        uint64_t        ms_allocated_this_txg;

        /*
         * The following range trees are accessed only from syncing context.
         * The ms_free* trees have entries only while syncing, and are empty
         * between syncs.
         */
        range_tree_t    *ms_freeing;    /* to free this syncing txg */
        range_tree_t    *ms_freed;      /* already freed this syncing txg */
        range_tree_t    *ms_defer[TXG_DEFER_SIZE];
        range_tree_t    *ms_checkpointing; /* to add to the checkpoint */

        boolean_t       ms_condensing;  /* condensing? */
        boolean_t       ms_condense_wanted;
        uint64_t        ms_condense_checked_txg;

        uint64_t        ms_initializing; /* leaves initializing this ms */

        /*
         * We must always hold the ms_lock when modifying ms_loaded
         * and ms_loading.
         */
        boolean_t       ms_loaded;
        boolean_t       ms_loading;

        /*
         * The following histograms count entries that are in the
         * metaslab's space map (and its histogram) but are not in
         * ms_allocatable yet, because they are in ms_freed, ms_freeing,
         * or ms_defer[].
         *
         * When the metaslab is not loaded, its ms_weight needs to
         * reflect what is allocatable (i.e. what will be part of
         * ms_allocatable if it is loaded).  The weight is computed from
         * the spacemap histogram, but that includes ranges that are
         * not yet allocatable (because they are in ms_freed,
         * ms_freeing, or ms_defer[]).  Therefore, when calculating the
         * weight, we need to remove those ranges.
         *
         * The ranges in the ms_freed and ms_defer[] range trees are all
         * present in the spacemap.  However, the spacemap may have
         * multiple entries to represent a contiguous range, because it
         * is written across multiple sync passes, but the changes of
         * all sync passes are consolidated into the range trees.
         * Adjacent ranges that are freed in different sync passes of
         * one txg will be represented separately (as 2 or more entries)
         * in the space map (and its histogram), but these adjacent
         * ranges will be consolidated (represented as one entry) in the
         * ms_freed/ms_defer[] range trees (and their histograms).
         *
         * When calculating the weight, we cannot simply subtract the
         * range trees' histograms from the spacemap's histogram,
         * because the range trees' histograms may have entries in
         * higher buckets than the spacemap, due to consolidation.
         * Instead we must subtract the exact entries that were added to
         * the spacemap's histogram.  ms_synchist and ms_deferhist[]
         * represent these exact entries, so we can subtract them from
         * the spacemap's histogram when calculating ms_weight.
         *
         * ms_synchist represents the same ranges as ms_freeing +
         * ms_freed, but without consolidation across sync passes.
         *
         * ms_deferhist[i] represents the same ranges as ms_defer[i],
         * but without consolidation across sync passes.
         *
         * (An illustrative sketch of this subtraction follows the
         * struct definition.)
         */
        uint64_t        ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
        uint64_t        ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];

        /*
         * Tracks the exact amount of allocated space of this metaslab
         * (and specifically the metaslab's space map) up to the most
         * recently completed sync pass [see usage in metaslab_sync()].
         */
        uint64_t        ms_allocated_space;
        int64_t         ms_deferspace;  /* sum of ms_defer[] space      */
        uint64_t        ms_weight;      /* weight vs. others in group   */
        uint64_t        ms_activation_weight;   /* activation weight    */

        /*
         * Updated whenever the metaslab is selected for loading or
         * allocation.  We use this value to determine how long the
         * metaslab should stay cached.
         */
        uint64_t        ms_selected_txg;

        uint64_t        ms_alloc_txg;   /* last successful alloc (debug only) */
        uint64_t        ms_max_size;    /* maximum allocatable size     */

        /*
         * -1 if it's not active in an allocator, otherwise set to the
         * allocator this metaslab is active for.
         */
        int             ms_allocator;
        boolean_t       ms_primary; /* Only valid if ms_allocator is not -1 */

        /*
         * The metaslab block allocators can optionally use a size-ordered
         * range tree and/or an array of LBAs.  Not all allocators use
         * this functionality.  ms_allocatable_by_size should always
         * contain the same number of segments as ms_allocatable; the
         * only difference is that ms_allocatable_by_size is ordered by
         * segment size.
         */
        avl_tree_t      ms_allocatable_by_size;
        uint64_t        ms_lbas[MAX_LBAS];

        metaslab_group_t *ms_group;     /* metaslab group               */
        avl_node_t      ms_group_node;  /* node in metaslab group tree  */
        txg_node_t      ms_txg_node;    /* per-txg dirty metaslab links */

        /* updated every time we are done syncing the metaslab's space map */
        uint64_t        ms_synced_length;

        boolean_t       ms_new;
};
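
/*
 * A minimal sketch of the ms_synchist/ms_deferhist subtraction described
 * in the struct comment above, assuming direct access to the space map's
 * on-disk histogram (sm_phys->smp_histogram).  The real weight logic
 * lives in metaslab.c; this helper and its name are hypothetical, not
 * the shipped implementation.
 */
static inline void
metaslab_unflushed_histogram_sketch(struct metaslab *msp,
    uint64_t *histogram)
{
        for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
                /* Start from everything the space map counts on disk. */
                histogram[i] = msp->ms_sm->sm_phys->smp_histogram[i];

                /* Drop this txg's frees; they are not yet allocatable. */
                histogram[i] -= MIN(histogram[i], msp->ms_synchist[i]);

                /* Drop deferred frees for the same reason. */
                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
                        histogram[i] -=
                            MIN(histogram[i], msp->ms_deferhist[t][i]);
                }
        }
}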

#ifdef  __cplusplus
}
#endif

#endif  /* _SYS_METASLAB_IMPL_H */
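
As a rough companion to the condensing rule in the block comment at the
top of this excerpt, the check reduces to comparing the space map's
on-disk length against a minimal rewrite of ms_allocatable.  The
following is a minimal sketch: space_map_length() is the real accessor
for the on-disk length and zfs_condense_pct is the real tunable, but the
caller-supplied minimal-size estimate, the helper's name, and its
signature are assumptions; the actual decision is made in metaslab.c.

extern int zfs_condense_pct;    /* tunable from metaslab.c */

/*
 * Sketch: condense once the on-disk space map has grown past
 * zfs_condense_pct/100 times the size of its minimal representation.
 * "minimal_size" stands in for however that minimal form is estimated.
 */
static boolean_t
metaslab_should_condense_sketch(struct metaslab *msp, uint64_t minimal_size)
{
        uint64_t on_disk = space_map_length(msp->ms_sm);

        return (on_disk > minimal_size * zfs_condense_pct / 100);
}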