 * Each metaslab's space is tracked in a single space map in the MOS,
 * which is only updated in syncing context. Each time we sync a txg,
 * we append the allocs and frees from that txg to the space map. The
 * pool space is only updated once all metaslabs have finished syncing.
 *
 * To load the in-core free tree we read the space map from disk. This
 * object contains a series of alloc and free records that are combined
 * to make up the list of all free segments in this metaslab. These
 * segments are represented in-core by ms_allocatable, which is backed
 * by an AVL tree.
 *
 * As the space map grows (as a result of the appends) it will
 * eventually become space-inefficient. When the on-disk space map
 * grows to more than zfs_condense_pct/100 times the size of the
 * minimal on-disk representation of the in-core free tree, we rewrite
 * it in its minimized form. If a metaslab needs to condense then we
 * must set the ms_condensing flag to ensure that allocations are not
 * performed on the metaslab that is being written.
 */
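
/*
 * Illustrative sketch only (not part of this header): roughly how the
 * condense threshold described above could be evaluated. This is a
 * simplified approximation of metaslab_should_condense(); treat the
 * helper names and the default zfs_condense_pct value of 200 as
 * assumptions rather than guarantees.
 *
 *	uint64_t object_size = space_map_length(msp->ms_sm);
 *	uint64_t optimal_size = space_map_estimate_optimal_size(
 *	    msp->ms_sm, msp->ms_allocatable, SM_NO_VDEVID);
 *
 *	// With zfs_condense_pct = 200, condense once the space map is
 *	// more than twice the size of its minimal representation.
 *	boolean_t should_condense =
 *	    (object_size > zfs_condense_pct * optimal_size / 100);
 */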
struct metaslab {
	/*
	 * This is the main lock of the metaslab and its purpose is to
	 * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
	 * metaslab_free_concrete(), etc.] with our various syncing
	 * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
	 *
	 * The lock is also used during some miscellaneous operations like
	 * using the metaslab's histogram for the metaslab group's histogram
	 * aggregation, or marking the metaslab for initialization.
	 */
	kmutex_t ms_lock;

	/*
	 * Acquired together with the ms_lock whenever we expect to
	 * write to metaslab data on-disk (i.e. flushing entries to
	 * the metaslab's space map). It helps coordinate readers of
	 * the metaslab's space map [see spa_vdev_remove_thread()]
	 * with writers [see metaslab_sync()].
	 *
	 * Note that metaslab_load(), even though a reader, uses
	 * a completely different mechanism to deal with the reading
	 * of the metaslab's space map based on ms_synced_length. That
	 * said, the function still uses the ms_sync_lock after it
	 * has read the ms_sm [see relevant comment in metaslab_load()
	 * as to why].
	 */
	kmutex_t ms_sync_lock;
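
	/*
	 * Illustrative sketch only: one lock-ordering pattern consistent
	 * with the comment above, roughly as a syncing-context writer
	 * would use it. This is a simplified assumption, not a statement
	 * of the exact order every caller follows.
	 *
	 *	mutex_enter(&msp->ms_sync_lock);   // coordinate with readers
	 *	mutex_enter(&msp->ms_lock);        // then take the main lock
	 *	// ... append this txg's allocs/frees to msp->ms_sm ...
	 *	mutex_exit(&msp->ms_lock);
	 *	mutex_exit(&msp->ms_sync_lock);
	 */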

	kcondvar_t ms_load_cv;
	space_map_t *ms_sm;
	uint64_t ms_id;
	uint64_t ms_start;
	uint64_t ms_size;
	uint64_t ms_fragmentation;

	range_tree_t *ms_allocating[TXG_SIZE];
	range_tree_t *ms_allocatable;
	uint64_t ms_allocated_this_txg;

	/*
	 * The following range trees are accessed only from syncing context.
	 * ms_freeing and ms_freed only have entries while syncing, and are
	 * empty between syncs.
	 */
	range_tree_t *ms_freeing;	/* to free this syncing txg */
	range_tree_t *ms_freed;		/* already freed this syncing txg */
	range_tree_t *ms_defer[TXG_DEFER_SIZE];
	range_tree_t *ms_checkpointing;	/* to add to the checkpoint */
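
	/*
	 * Illustrative sketch only: a simplified outline of how a freed
	 * range moves through the trees above during syncing, roughly in
	 * the spirit of metaslab_sync() and metaslab_sync_done(). This is
	 * an assumption-laden summary, not the actual code.
	 *
	 *	// While txg N syncs, frees accumulate in ms_freeing, are
	 *	// appended to the space map, and then move to ms_freed:
	 *	range_tree_vacate(msp->ms_freeing,
	 *	    range_tree_add, msp->ms_freed);
	 *
	 *	// When txg N finishes syncing, ms_freed rotates into one of
	 *	// the ms_defer[] trees; only TXG_DEFER_SIZE txgs later are
	 *	// those ranges returned to ms_allocatable, so recently
	 *	// freed blocks are not immediately reallocated.
	 */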

	boolean_t ms_condensing;	/* condensing? */
	boolean_t ms_condense_wanted;
	uint64_t ms_condense_checked_txg;

	uint64_t ms_initializing;	/* leaves initializing this ms */

	/*
	 * We must always hold the ms_lock when modifying ms_loaded
	 * and ms_loading.
	 */
	boolean_t ms_loaded;
	boolean_t ms_loading;
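
	/*
	 * Illustrative sketch only: the load/wait protocol implied by the
	 * comment above, roughly how metaslab_load() and its callers use
	 * ms_loading, ms_loaded and ms_load_cv. Simplified; not the
	 * actual code.
	 *
	 *	mutex_enter(&msp->ms_lock);
	 *	while (msp->ms_loading)		// another thread is loading
	 *		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	 *	if (!msp->ms_loaded) {
	 *		msp->ms_loading = B_TRUE;
	 *		// ... read msp->ms_sm into ms_allocatable ...
	 *		msp->ms_loaded = B_TRUE;
	 *		msp->ms_loading = B_FALSE;
	 *		cv_broadcast(&msp->ms_load_cv);
	 *	}
	 *	mutex_exit(&msp->ms_lock);
	 */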

	/*
	 * The following histograms count entries that are in the
	 * metaslab's space map (and its histogram) but are not in
	 * ms_allocatable yet, because they are in ms_freed, ms_freeing,
	 * or ms_defer[].
	 *
	 * When the metaslab is not loaded, its ms_weight needs to
	 * reflect what is allocatable (i.e. what will be part of
	 * ms_allocatable if it is loaded). The weight is computed from
	 * the spacemap histogram, but that includes ranges that are
	 * not yet allocatable (because they are in ms_freed,
	 * ms_freeing, or ms_defer[]). Therefore, when calculating the
	 * weight, we need to remove those ranges.
	 *
	 * The ranges in the ms_freed and ms_defer[] range trees are all
	 * present in the spacemap. However, the spacemap may have
	 * multiple entries to represent a contiguous range, because it
	 * is written across multiple sync passes, but the changes of
	 * all sync passes are consolidated into the range trees.
	 * Adjacent ranges that are freed in different sync passes of
	 * one txg will be represented separately (as 2 or more entries)
	 * in the space map (and its histogram), but these adjacent
	 * ranges will be consolidated (represented as one entry) in the
	 * ms_freed/ms_defer[] range trees (and their histograms).
	 *
	 * When calculating the weight, we cannot simply subtract the
	 * range trees' histograms from the spacemap's histogram,
	 * because the range trees' histograms may have entries in
	 * higher buckets than the spacemap, due to consolidation.
	 * Instead we must subtract the exact entries that were added to
	 * the spacemap's histogram. ms_synchist and ms_deferhist[]
	 * represent these exact entries, so we can subtract them from
	 * the spacemap's histogram when calculating ms_weight.
	 *
	 * ms_synchist represents the same ranges as ms_freeing +
	 * ms_freed, but without consolidation across sync passes.
	 *
	 * ms_deferhist[i] represents the same ranges as ms_defer[i],
	 * but without consolidation across sync passes.
	 */
	uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
	uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
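
	/*
	 * Illustrative sketch only: one way the histograms above could be
	 * combined when deriving the weight of an unloaded metaslab. This
	 * is a simplified approximation of the weight calculation; the
	 * smp_histogram field and the bucket-size interpretation are
	 * assumptions about the space map implementation.
	 *
	 *	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
	 *		// Start from what the space map reports as free...
	 *		uint64_t count =
	 *		    msp->ms_sm->sm_phys->smp_histogram[i];
	 *
	 *		// ...then subtract the exact entries that are not
	 *		// allocatable yet (ms_freeing/ms_freed/ms_defer[]).
	 *		count -= msp->ms_synchist[i];
	 *		for (int t = 0; t < TXG_DEFER_SIZE; t++)
	 *			count -= msp->ms_deferhist[t][i];
	 *
	 *		// "count" segments in this bucket now contribute
	 *		// to the weight.
	 *	}
	 */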

	/*
	 * Tracks the exact amount of allocated space of this metaslab
	 * (and specifically the metaslab's space map) up to the most
	 * recently completed sync pass [see usage in metaslab_sync()].
	 */
	uint64_t ms_allocated_space;
	int64_t ms_deferspace;	/* sum of ms_defer[] space */
	uint64_t ms_weight;	/* weight vs. others in group */
	uint64_t ms_activation_weight;	/* activation weight */

	/*
	 * Tracks the txg in which the metaslab was last selected for
	 * loading or allocation. We use this value to determine how long
	 * the metaslab should stay cached.
	 */
	uint64_t ms_selected_txg;

	uint64_t ms_alloc_txg;	/* last successful alloc (debug only) */
	uint64_t ms_max_size;	/* maximum allocatable size */

	/*
	 * -1 if it's not active in an allocator, otherwise set to the
	 * allocator this metaslab is active for.
	 */
	int ms_allocator;
	boolean_t ms_primary;	/* Only valid if ms_allocator is not -1 */

	/*
	 * The metaslab block allocators can optionally use a size-ordered
	 * range tree and/or an array of LBAs. Not all allocators use
	 * this functionality. ms_allocatable_by_size should always
	 * contain the same number of segments as ms_allocatable; the
	 * only difference is that ms_allocatable_by_size is ordered by
	 * segment size.
	 */
	avl_tree_t ms_allocatable_by_size;
	uint64_t ms_lbas[MAX_LBAS];
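
	/*
	 * Illustrative sketch only: how an allocator might consult the
	 * size-ordered tree above to find the largest free segment (e.g.
	 * when refreshing ms_max_size). A simplified example under the
	 * assumption that the tree stores range_seg_t entries ordered by
	 * size; not the actual code.
	 *
	 *	range_seg_t *rs = avl_last(&msp->ms_allocatable_by_size);
	 *	uint64_t largest = (rs == NULL) ? 0 :
	 *	    (rs->rs_end - rs->rs_start);
	 */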

	metaslab_group_t *ms_group;	/* metaslab group */
	avl_node_t ms_group_node;	/* node in metaslab group tree */
	txg_node_t ms_txg_node;		/* per-txg dirty metaslab links */

	/* updated every time we are done syncing the metaslab's space map */
	uint64_t ms_synced_length;

	boolean_t ms_new;
};

#ifdef __cplusplus
}
#endif

#endif /* _SYS_METASLAB_IMPL_H */