481 return (1);
482
483 if (m1->ms_weight < m2->ms_weight)
484 return (1);
485 if (m1->ms_weight > m2->ms_weight)
486 return (-1);
487
488 /*
489 * If the weights are identical, use the offset to force uniqueness.
490 */
491 if (m1->ms_start < m2->ms_start)
492 return (-1);
493 if (m1->ms_start > m2->ms_start)
494 return (1);
495
496 ASSERT3P(m1, ==, m2);
497
498 return (0);
499 }
500
501 /*
502 * Verify that the space accounting on disk matches the in-core range_trees.
503 */
504 void
505 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
506 {
507 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
508 uint64_t allocated = 0;
509 uint64_t sm_free_space, msp_free_space;
510
511 ASSERT(MUTEX_HELD(&msp->ms_lock));
512
513 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
514 return;
515
516 /*
517 * We can only verify the metaslab space when we're called
518 * from syncing context with a loaded metaslab that has an allocated
519 * space map. Calling this in non-syncing context does not
520 * provide a consistent view of the metaslab since we're performing
521 * allocations in the future.
522 */
523 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
524 !msp->ms_loaded)
525 return;
526
527 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
528 space_map_alloc_delta(msp->ms_sm);
529
530 /*
531	 * Account for future allocations since we would have already
532	 * deducted that space from the ms_allocatable.
533 */
534 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
535 allocated +=
536 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
537 }
538
539 msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
540 msp->ms_deferspace + range_tree_space(msp->ms_freed);
541
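	/*
	 * Both values describe the metaslab's free space: sm_free_space is
	 * derived from the on-disk space map accounting, while msp_free_space
	 * is rebuilt from the in-core range trees plus pending allocations,
	 * deferred frees, and space freed this txg.
	 */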
542 VERIFY3U(sm_free_space, ==, msp_free_space);
543 }
544
545 /*
546 * ==========================================================================
547 * Metaslab groups
548 * ==========================================================================
549 */
550 /*
551 * Update the allocatable flag and the metaslab group's capacity.
552 * The allocatable flag is set to true if the capacity is below
553 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
554 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
555 * transitions from allocatable to non-allocatable or vice versa then the
556 * metaslab group's class is updated to reflect the transition.
557 */
558 static void
559 metaslab_group_alloc_update(metaslab_group_t *mg)
824
825 void
826 metaslab_group_histogram_verify(metaslab_group_t *mg)
827 {
828 uint64_t *mg_hist;
829 vdev_t *vd = mg->mg_vd;
830 uint64_t ashift = vd->vdev_ashift;
831 int i;
832
833 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
834 return;
835
836 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
837 KM_SLEEP);
838
839 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
840 SPACE_MAP_HISTOGRAM_SIZE + ashift);
841
842 for (int m = 0; m < vd->vdev_ms_count; m++) {
843 metaslab_t *msp = vd->vdev_ms[m];
844
845 /* skip if not active or not a member */
846 if (msp->ms_sm == NULL || msp->ms_group != mg)
847 continue;
848
849 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
850 mg_hist[i + ashift] +=
851 msp->ms_sm->sm_phys->smp_histogram[i];
852 }
853
854	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
855 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
856
857 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
858 }
859
860 static void
861 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
862 {
863 metaslab_class_t *mc = mg->mg_class;
1444
1445 if ((rs->rs_end - rs->rs_start) >= size) {
1446 *cursor = rs->rs_start + size;
1447 return (rs->rs_start);
1448 }
1449 return (-1ULL);
1450 }
1451
1452 static metaslab_ops_t metaslab_ndf_ops = {
1453 metaslab_ndf_alloc
1454 };
1455
1456 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1457
1458 /*
1459 * ==========================================================================
1460 * Metaslabs
1461 * ==========================================================================
1462 */
1463
1464 /*
1465 * Wait for any in-progress metaslab loads to complete.
1466 */
1467 static void
1468 metaslab_load_wait(metaslab_t *msp)
1469 {
1470 ASSERT(MUTEX_HELD(&msp->ms_lock));
1471
1472 while (msp->ms_loading) {
1473 ASSERT(!msp->ms_loaded);
1474 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1475 }
1476 }
1477
1478 static int
1479 metaslab_load_impl(metaslab_t *msp)
1480 {
1481 int error = 0;
1482
1483 ASSERT(MUTEX_HELD(&msp->ms_lock));
1484 ASSERT(msp->ms_loading);
1485
1486 /*
1487 * Nobody else can manipulate a loading metaslab, so it's now safe
1488 * to drop the lock. This way we don't have to hold the lock while
1489 * reading the spacemap from disk.
1490 */
1491 mutex_exit(&msp->ms_lock);
1492
1493 /*
1494 * If the space map has not been allocated yet, then treat
1495 * all the space in the metaslab as free and add it to ms_allocatable.
1496 */
1497 if (msp->ms_sm != NULL) {
1498 error = space_map_load(msp->ms_sm, msp->ms_allocatable,
1499 SM_FREE);
1500 } else {
1501 range_tree_add(msp->ms_allocatable,
1502 msp->ms_start, msp->ms_size);
1503 }
1504
1505 mutex_enter(&msp->ms_lock);
1506
1507 if (error != 0)
1508 return (error);
1509
1510 ASSERT3P(msp->ms_group, !=, NULL);
1511 msp->ms_loaded = B_TRUE;
1512
1513 /*
1514 * If the metaslab already has a spacemap, then we need to
1515 * remove all segments from the defer tree; otherwise, the
1516 * metaslab is completely empty and we can skip this.
1517 */
1518 if (msp->ms_sm != NULL) {
1519 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1520 range_tree_walk(msp->ms_defer[t],
1521 range_tree_remove, msp->ms_allocatable);
1522 }
1523 }
1524 msp->ms_max_size = metaslab_block_maxsize(msp);
1525
1526 return (0);
1527 }
1528
1529 int
1530 metaslab_load(metaslab_t *msp)
1531 {
1532 ASSERT(MUTEX_HELD(&msp->ms_lock));
1533
1534 /*
1535	 * There may be another thread loading the same metaslab; if that's
1536	 * the case, just wait until the other thread is done and return.
1537 */
1538 metaslab_load_wait(msp);
1539 if (msp->ms_loaded)
1540 return (0);
1541 VERIFY(!msp->ms_loading);
1542
1543 msp->ms_loading = B_TRUE;
1544 int error = metaslab_load_impl(msp);
1545 msp->ms_loading = B_FALSE;
1546 cv_broadcast(&msp->ms_load_cv);
1547
1548 return (error);
1549 }
1550
1551 void
1552 metaslab_unload(metaslab_t *msp)
1553 {
1554 ASSERT(MUTEX_HELD(&msp->ms_lock));
1555 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1556 msp->ms_loaded = B_FALSE;
1557 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1558 msp->ms_max_size = 0;
1559 }
1560
1561 static void
1562 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1563 int64_t defer_delta, int64_t space_delta)
1564 {
1565 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1566
1567 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1568 ASSERT(vd->vdev_ms_count != 0);
1569
1570 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1571 vdev_deflated_space(vd, space_delta));
1572 }
1573
1574 int
1575 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1576 metaslab_t **msp)
1577 {
1578 vdev_t *vd = mg->mg_vd;
1579 spa_t *spa = vd->vdev_spa;
1580 objset_t *mos = spa->spa_meta_objset;
1581 metaslab_t *ms;
1582 int error;
1583
1584 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1585 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1586 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1587 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1588
1589 ms->ms_id = id;
1590 ms->ms_start = id << vd->vdev_ms_shift;
1591 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1592 ms->ms_allocator = -1;
1593 ms->ms_new = B_TRUE;
1594
1595 /*
1596 * We only open space map objects that already exist. All others
1597	 * will be opened when we finally allocate an object for them.
1598 */
1599 if (object != 0) {
1600 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1601 ms->ms_size, vd->vdev_ashift);
1602
1603 if (error != 0) {
1604 kmem_free(ms, sizeof (metaslab_t));
1605 return (error);
1606 }
1607
1608 ASSERT(ms->ms_sm != NULL);
1609 }
1610
1611 /*
1612 * We create the main range tree here, but we don't create the
1613 * other range trees until metaslab_sync_done(). This serves
1614 * two purposes: it allows metaslab_sync_done() to detect the
1615 * addition of new space; and for debugging, it ensures that we'd
1616 * data fault on any attempt to use this metaslab before it's ready.
1617 */
1618 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1619 metaslab_group_add(mg, ms);
1620
1621 metaslab_set_fragmentation(ms);
1622
1623 /*
1624 * If we're opening an existing pool (txg == 0) or creating
1625 * a new one (txg == TXG_INITIAL), all space is available now.
1626 * If we're adding space to an existing pool, the new space
1627 * does not become available until after this txg has synced.
1628 * The metaslab's weight will also be initialized when we sync
1629 * out this txg. This ensures that we don't attempt to allocate
1630 * from it before we have initialized it completely.
1631 */
1632 if (txg <= TXG_INITIAL)
1633 metaslab_sync_done(ms, 0);
1634
1635 /*
1636 * If metaslab_debug_load is set and we're initializing a metaslab
1637 * that has an allocated space map object then load the space map
1638 * so that we can verify frees.
1639 */
1640 if (metaslab_debug_load && ms->ms_sm != NULL) {
1641 mutex_enter(&ms->ms_lock);
1642 VERIFY0(metaslab_load(ms));
1643 mutex_exit(&ms->ms_lock);
1644 }
1645
1646 if (txg != 0) {
1647 vdev_dirty(vd, 0, NULL, txg);
1648 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1649 }
1650
1651 *msp = ms;
1652
1653 return (0);
1654 }
1655
1656 void
1657 metaslab_fini(metaslab_t *msp)
1658 {
1659 metaslab_group_t *mg = msp->ms_group;
1660 vdev_t *vd = mg->mg_vd;
1661
1662 metaslab_group_remove(mg, msp);
1663
1664 mutex_enter(&msp->ms_lock);
1665 VERIFY(msp->ms_group == NULL);
1666 metaslab_space_update(vd, mg->mg_class,
1667 -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
1668
1669 space_map_close(msp->ms_sm);
1670
1671 metaslab_unload(msp);
1672
1673 range_tree_destroy(msp->ms_allocatable);
1674 range_tree_destroy(msp->ms_freeing);
1675 range_tree_destroy(msp->ms_freed);
1676
1677 for (int t = 0; t < TXG_SIZE; t++) {
1678 range_tree_destroy(msp->ms_allocating[t]);
1679 }
1680
1681 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1682 range_tree_destroy(msp->ms_defer[t]);
1683 }
1684 ASSERT0(msp->ms_deferspace);
1685
1686 range_tree_destroy(msp->ms_checkpointing);
1687
1688 mutex_exit(&msp->ms_lock);
1689 cv_destroy(&msp->ms_load_cv);
1690 mutex_destroy(&msp->ms_lock);
1691 mutex_destroy(&msp->ms_sync_lock);
1692 ASSERT3U(msp->ms_allocator, ==, -1);
1693
1694 kmem_free(msp, sizeof (metaslab_t));
1695 }
1696
1697 #define FRAGMENTATION_TABLE_SIZE 17
1698
1699 /*
1700 * This table defines a segment size based fragmentation metric that will
1701 * allow each metaslab to derive its own fragmentation value. This is done
1702 * by calculating the space in each bucket of the spacemap histogram and
1703	 * multiplying that by the fragmentation metric in this table. Doing
1704 * this for all buckets and dividing it by the total amount of free
1705 * space in this metaslab (i.e. the total free space in all buckets) gives
1706 * us the fragmentation metric. This means that a high fragmentation metric
1707 * equates to most of the free space being comprised of small segments.
1708 * Conversely, if the metric is low, then most of the free space is in
1709 * large segments. A 10% change in fragmentation equates to approximately
1710 * double the number of segments.
1711 *
1712 * This table defines 0% fragmented space using 16MB segments. Testing has
1713 * shown that segments that are greater than or equal to 16MB do not suffer
1714 * from drastic performance problems. Using this value, we derive the rest
1715 * of the table. Since the fragmentation value is never stored on disk, it
1716 * is possible to change these calculations in the future.
1717 */
1718 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1719 100, /* 512B */
1720 100, /* 1K */
1721 98, /* 2K */
1722 95, /* 4K */
1723 90, /* 8K */
1724 80, /* 16K */
1725 70, /* 32K */
1726 60, /* 64K */
1727 50, /* 128K */
1728 40, /* 256K */
1729 30, /* 512K */
1730 20, /* 1M */
1731 15, /* 2M */
1732 10, /* 4M */
1733 5, /* 8M */
1734 0 /* 16M */
1735 };
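/*
 * For example, a metaslab whose free space is split evenly between 4K
 * segments (factor 95) and 1M segments (factor 20) would report a
 * fragmentation of (50% * 95) + (50% * 20) = 57.5%.
 */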
1736
1737 /*
1738	 * Calculate the metaslab's fragmentation metric and store it in
1739	 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
1740	 * has not been upgraded and does not support this metric. Otherwise,
1741	 * the value should be in the range [0, 100].
1742 */
1743 static void
1744 metaslab_set_fragmentation(metaslab_t *msp)
1745 {
1746 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1747 uint64_t fragmentation = 0;
1748 uint64_t total = 0;
1749 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1750 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1751
1752 if (!feature_enabled) {
1753 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1754 return;
1755 }
1756
1757 /*
1758 * A null space map means that the entire metaslab is free
1759 * and thus is not fragmented.
1760 */
1761 if (msp->ms_sm == NULL) {
1814 }
1815
1816 /*
1817 * Compute a weight -- a selection preference value -- for the given metaslab.
1818 * This is based on the amount of free space, the level of fragmentation,
1819 * the LBA range, and whether the metaslab is loaded.
1820 */
1821 static uint64_t
1822 metaslab_space_weight(metaslab_t *msp)
1823 {
1824 metaslab_group_t *mg = msp->ms_group;
1825 vdev_t *vd = mg->mg_vd;
1826 uint64_t weight, space;
1827
1828 ASSERT(MUTEX_HELD(&msp->ms_lock));
1829 ASSERT(!vd->vdev_removing);
1830
1831 /*
1832 * The baseline weight is the metaslab's free space.
1833 */
1834 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1835
1836 if (metaslab_fragmentation_factor_enabled &&
1837 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1838 /*
1839 * Use the fragmentation information to inversely scale
1840 * down the baseline weight. We need to ensure that we
1841 * don't exclude this metaslab completely when it's 100%
1842 * fragmented. To avoid this we reduce the fragmented value
1843 * by 1.
1844 */
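		/*
		 * For example, a metaslab reported as 100% fragmented keeps
		 * (100 - 99) / 100 = 1% of its free space as weight.
		 */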
1845 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1846
1847 /*
1848 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1849 * this metaslab again. The fragmentation metric may have
1850 * decreased the space to something smaller than
1851 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1852 * so that we can consume any remaining space.
1853 */
1854 if (space > 0 && space < SPA_MINBLOCKSIZE)
1918 continue;
1919
1920 if (segments != 0) {
1921 WEIGHT_SET_COUNT(weight, segments);
1922 WEIGHT_SET_INDEX(weight, i);
1923 WEIGHT_SET_ACTIVE(weight, 0);
1924 break;
1925 }
1926 }
1927 return (weight);
1928 }
1929
1930 /*
1931 * Calculate the weight based on the on-disk histogram. This should only
1932 * be called after a sync pass has completely finished since the on-disk
1933 * information is updated in metaslab_sync().
1934 */
1935 static uint64_t
1936 metaslab_weight_from_spacemap(metaslab_t *msp)
1937 {
1938 uint64_t weight = 0;
1939
1940 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1941 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1942 WEIGHT_SET_COUNT(weight,
1943 msp->ms_sm->sm_phys->smp_histogram[i]);
1944 WEIGHT_SET_INDEX(weight, i +
1945 msp->ms_sm->sm_shift);
1946 WEIGHT_SET_ACTIVE(weight, 0);
1947 break;
1948 }
1949 }
1950 return (weight);
1951 }
1952
1953 /*
1954 * Compute a segment-based weight for the specified metaslab. The weight
1955	 * is determined by the highest bucket in the histogram. The information
1956 * for the highest bucket is encoded into the weight value.
1957 */
1958 static uint64_t
1959 metaslab_segment_weight(metaslab_t *msp)
1960 {
1961 metaslab_group_t *mg = msp->ms_group;
1962 uint64_t weight = 0;
1963 uint8_t shift = mg->mg_vd->vdev_ashift;
1964
1965 ASSERT(MUTEX_HELD(&msp->ms_lock));
1966
1967 /*
1968 * The metaslab is completely free.
1969 */
1970 if (space_map_allocated(msp->ms_sm) == 0) {
1971 int idx = highbit64(msp->ms_size) - 1;
1972 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1973
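		/*
		 * A completely free metaslab is one free segment the size of
		 * the whole metaslab. If that size exceeds the largest bucket
		 * the weight can encode (max_idx), express it instead as
		 * 1 << (idx - max_idx) segments of max_idx-sized space.
		 */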
1974 if (idx < max_idx) {
1975 WEIGHT_SET_COUNT(weight, 1ULL);
1976 WEIGHT_SET_INDEX(weight, idx);
1977 } else {
1978 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1979 WEIGHT_SET_INDEX(weight, max_idx);
1980 }
1981 WEIGHT_SET_ACTIVE(weight, 0);
1982 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1983
1984 return (weight);
1985 }
1986
1987 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1988
1989 /*
1990 * If the metaslab is fully allocated then just make the weight 0.
1991 */
1992 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1993 return (0);
1994 /*
1995 * If the metaslab is already loaded, then use the range tree to
1996 * determine the weight. Otherwise, we rely on the space map information
1997 * to generate the weight.
1998 */
1999 if (msp->ms_loaded) {
2000 weight = metaslab_weight_from_range_tree(msp);
2001 } else {
2002 weight = metaslab_weight_from_spacemap(msp);
2003 }
2004
2005 /*
2006 * If the metaslab was active the last time we calculated its weight
2007 * then keep it active. We want to consume the entire region that
2008 * is associated with this weight.
2009 */
2010 if (msp->ms_activation_weight != 0 && weight != 0)
2011 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2012 return (weight);
2053 uint64_t weight;
2054
2055 ASSERT(MUTEX_HELD(&msp->ms_lock));
2056
2057 /*
2058 * If this vdev is in the process of being removed, there is nothing
2059 * for us to do here.
2060 */
2061 if (vd->vdev_removing)
2062 return (0);
2063
2064 metaslab_set_fragmentation(msp);
2065
2066 /*
2067 * Update the maximum size if the metaslab is loaded. This will
2068 * ensure that we get an accurate maximum size if newly freed space
2069 * has been added back into the free tree.
2070 */
2071 if (msp->ms_loaded)
2072 msp->ms_max_size = metaslab_block_maxsize(msp);
2073
2074 /*
2075 * Segment-based weighting requires space map histogram support.
2076 */
2077 if (zfs_metaslab_segment_weight_enabled &&
2078 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2079 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2080 sizeof (space_map_phys_t))) {
2081 weight = metaslab_segment_weight(msp);
2082 } else {
2083 weight = metaslab_space_weight(msp);
2084 }
2085 return (weight);
2086 }
2087
2088 static int
2089 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2090 int allocator, uint64_t activation_weight)
2091 {
2092 /*
2093 * If we're activating for the claim code, we don't want to actually
2094 * set the metaslab up for a specific allocator.
2095 */
2096 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2097 return (0);
2098 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2099 mg->mg_primaries : mg->mg_secondaries);
2100
2101 ASSERT(MUTEX_HELD(&msp->ms_lock));
2102 mutex_enter(&mg->mg_lock);
2103 if (arr[allocator] != NULL) {
2104 mutex_exit(&mg->mg_lock);
2105 return (EEXIST);
2106 }
2107
2452 ASSERT3P(alloctree, !=, NULL);
2453 ASSERT3P(msp->ms_freeing, !=, NULL);
2454 ASSERT3P(msp->ms_freed, !=, NULL);
2455 ASSERT3P(msp->ms_checkpointing, !=, NULL);
2456
2457 /*
2458 * Normally, we don't want to process a metaslab if there are no
2459 * allocations or frees to perform. However, if the metaslab is being
2460 * forced to condense and it's loaded, we need to let it through.
2461 */
2462 if (range_tree_is_empty(alloctree) &&
2463 range_tree_is_empty(msp->ms_freeing) &&
2464 range_tree_is_empty(msp->ms_checkpointing) &&
2465 !(msp->ms_loaded && msp->ms_condense_wanted))
2466 return;
2467
2468
2469 VERIFY(txg <= spa_final_dirty_txg(spa));
2470
2471 /*
2472 * The only state that can actually be changing concurrently with
2473 * metaslab_sync() is the metaslab's ms_allocatable. No other
2474 * thread can be modifying this txg's alloc, freeing,
2475 * freed, or space_map_phys_t. We drop ms_lock whenever we
2476 * could call into the DMU, because the DMU can call down to us
2477 * (e.g. via zio_free()) at any time.
2478 *
2479 * The spa_vdev_remove_thread() can be reading metaslab state
2480 * concurrently, and it is locked out by the ms_sync_lock. Note
2481 * that the ms_lock is insufficient for this, because it is dropped
2482 * by space_map_write().
2483 */
2484 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2485
2486 if (msp->ms_sm == NULL) {
2487 uint64_t new_object;
2488
2489 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2490 VERIFY3U(new_object, !=, 0);
2491
2492 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2493 msp->ms_start, msp->ms_size, vd->vdev_ashift));
2494 ASSERT(msp->ms_sm != NULL);
2495 }
2496
2497 if (!range_tree_is_empty(msp->ms_checkpointing) &&
2498 vd->vdev_checkpoint_sm == NULL) {
2499 ASSERT(spa_has_checkpoint(spa));
2500
2501 uint64_t new_object = space_map_alloc(mos,
2502 vdev_standard_sm_blksz, tx);
2503 VERIFY3U(new_object, !=, 0);
2504
2505 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2506 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2507 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2508
2509 /*
2510 * We save the space map object as an entry in vdev_top_zap
2511 * so it can be retrieved when the pool is reopened after an
2512 * export or through zdb.
2513 */
2514 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2522 /*
2523 * Note: metaslab_condense() clears the space map's histogram.
2524 * Therefore we must verify and remove this histogram before
2525 * condensing.
2526 */
2527 metaslab_group_histogram_verify(mg);
2528 metaslab_class_histogram_verify(mg->mg_class);
2529 metaslab_group_histogram_remove(mg, msp);
2530
2531 if (msp->ms_loaded && metaslab_should_condense(msp)) {
2532 metaslab_condense(msp, txg, tx);
2533 } else {
2534 mutex_exit(&msp->ms_lock);
2535 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2536 SM_NO_VDEVID, tx);
2537 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2538 SM_NO_VDEVID, tx);
2539 mutex_enter(&msp->ms_lock);
2540 }
2541
2542 if (!range_tree_is_empty(msp->ms_checkpointing)) {
2543 ASSERT(spa_has_checkpoint(spa));
2544 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2545
2546 /*
2547 * Since we are doing writes to disk and the ms_checkpointing
2548 * tree won't be changing during that time, we drop the
2549 * ms_lock while writing to the checkpoint space map.
2550 */
2551 mutex_exit(&msp->ms_lock);
2552 space_map_write(vd->vdev_checkpoint_sm,
2553 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2554 mutex_enter(&msp->ms_lock);
2555 space_map_update(vd->vdev_checkpoint_sm);
2556
2557 spa->spa_checkpoint_info.sci_dspace +=
2558 range_tree_space(msp->ms_checkpointing);
2559 vd->vdev_stat.vs_checkpoint_space +=
2560 range_tree_space(msp->ms_checkpointing);
2561 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2562 -vd->vdev_checkpoint_sm->sm_alloc);
2563
2564 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2565 }
2566
2567 if (msp->ms_loaded) {
2568 /*
2569 * When the space map is loaded, we have an accurate
2570 * histogram in the range tree. This gives us an opportunity
2571 * to bring the space map's histogram up-to-date so we clear
2572 * it first before updating it.
2573 */
2574 space_map_histogram_clear(msp->ms_sm);
2575 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2576
2577 /*
2578 * Since we've cleared the histogram we need to add back
2579 * any free space that has already been processed, plus
2580 * any deferred space. This allows the on-disk histogram
2581 * to accurately reflect all free space even if some space
2582 * is not yet available for allocation (i.e. deferred).
2587 * Add back any deferred free space that has not been
2588 * added back into the in-core free tree yet. This will
2589 * ensure that we don't end up with a space map histogram
2590 * that is completely empty unless the metaslab is fully
2591 * allocated.
2592 */
2593 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2594 space_map_histogram_add(msp->ms_sm,
2595 msp->ms_defer[t], tx);
2596 }
2597 }
2598
2599 /*
2600 * Always add the free space from this sync pass to the space
2601 * map histogram. We want to make sure that the on-disk histogram
2602 * accounts for all free space. If the space map is not loaded,
2603 * then we will lose some accuracy but will correct it the next
2604 * time we load the space map.
2605 */
2606 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2607
2608 metaslab_group_histogram_add(mg, msp);
2609 metaslab_group_histogram_verify(mg);
2610 metaslab_class_histogram_verify(mg->mg_class);
2611
2612 /*
2613 * For sync pass 1, we avoid traversing this txg's free range tree
2614 * and instead will just swap the pointers for freeing and
2615 * freed. We can safely do this since the freed_tree is
2616 * guaranteed to be empty on the initial pass.
2617 */
2618 if (spa_sync_pass(spa) == 1) {
2619 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2620 } else {
2621 range_tree_vacate(msp->ms_freeing,
2622 range_tree_add, msp->ms_freed);
2623 }
2624 range_tree_vacate(alloctree, NULL, NULL);
2625
2626 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2627 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2628 & TXG_MASK]));
2629 ASSERT0(range_tree_space(msp->ms_freeing));
2630 ASSERT0(range_tree_space(msp->ms_checkpointing));
2631
2632 mutex_exit(&msp->ms_lock);
2633
2634 if (object != space_map_object(msp->ms_sm)) {
2635 object = space_map_object(msp->ms_sm);
2636 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2637 msp->ms_id, sizeof (uint64_t), &object, tx);
2638 }
2639 mutex_exit(&msp->ms_sync_lock);
2640 dmu_tx_commit(tx);
2641 }
2642
2643 /*
2681 msp->ms_defer[t] = range_tree_create(NULL, NULL);
2682 }
2683
2684 ASSERT3P(msp->ms_checkpointing, ==, NULL);
2685 msp->ms_checkpointing = range_tree_create(NULL, NULL);
2686
2687 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
2688 }
2689 ASSERT0(range_tree_space(msp->ms_freeing));
2690 ASSERT0(range_tree_space(msp->ms_checkpointing));
2691
2692 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
2693
2694 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2695 metaslab_class_get_alloc(spa_normal_class(spa));
2696 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2697 defer_allowed = B_FALSE;
2698 }
2699
2700 defer_delta = 0;
2701 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2702 if (defer_allowed) {
2703 defer_delta = range_tree_space(msp->ms_freed) -
2704 range_tree_space(*defer_tree);
2705 } else {
2706 defer_delta -= range_tree_space(*defer_tree);
2707 }
2708
2709 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
2710 defer_delta, 0);
2711
2712 /*
2713 * If there's a metaslab_load() in progress, wait for it to complete
2714 * so that we have a consistent view of the in-core space map.
2715 */
2716 metaslab_load_wait(msp);
2717
2718 /*
2719 * Move the frees from the defer_tree back to the free
2720 * range tree (if it's loaded). Swap the freed_tree and
2721 * the defer_tree -- this is safe to do because we've
2722 * just emptied out the defer_tree.
2723 */
2724 range_tree_vacate(*defer_tree,
2725 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
2726 if (defer_allowed) {
2727 range_tree_swap(&msp->ms_freed, defer_tree);
2728 } else {
2729 range_tree_vacate(msp->ms_freed,
2730 msp->ms_loaded ? range_tree_add : NULL,
2731 msp->ms_allocatable);
2732 }
2733 space_map_update(msp->ms_sm);
2734
2735 msp->ms_deferspace += defer_delta;
2736 ASSERT3S(msp->ms_deferspace, >=, 0);
2737 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2738 if (msp->ms_deferspace != 0) {
2739 /*
2740 * Keep syncing this metaslab until all deferred frees
2741 * are back in circulation.
2742 */
2743 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2744 }
2745
2746 if (msp->ms_new) {
2747 msp->ms_new = B_FALSE;
2748 mutex_enter(&mg->mg_lock);
2749 mg->mg_ms_ready++;
2750 mutex_exit(&mg->mg_lock);
2751 }
2752 /*
2753 * Calculate the new weights before unloading any metaslabs.
2754 * This will give us the most accurate weighting.
2755 */
2756 metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2757 (msp->ms_weight & METASLAB_ACTIVE_MASK));
2758
2759 /*
2760 * If the metaslab is loaded and we've not tried to load or allocate
2761 * from it in 'metaslab_unload_delay' txgs, then unload it.
2762 */
2763 if (msp->ms_loaded &&
2764 msp->ms_initializing == 0 &&
2765 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2766 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2767 VERIFY0(range_tree_space(
2768 msp->ms_allocating[(txg + t) & TXG_MASK]));
2769 }
2770 if (msp->ms_allocator != -1) {
2771 metaslab_passivate(msp, msp->ms_weight &
2772 ~METASLAB_ACTIVE_MASK);
2773 }
2774
2775 if (!metaslab_debug_unload)
2776 metaslab_unload(msp);
2777 }
2778
2779 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2780 ASSERT0(range_tree_space(msp->ms_freeing));
2781 ASSERT0(range_tree_space(msp->ms_freed));
2782 ASSERT0(range_tree_space(msp->ms_checkpointing));
2783
2784 mutex_exit(&msp->ms_lock);
2785 }
2786
2787 void
2788 metaslab_sync_reassess(metaslab_group_t *mg)
2789 {
2790 spa_t *spa = mg->mg_class->mc_spa;
2791
2792 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2793 metaslab_group_alloc_update(mg);
2794 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2795
2796 /*
2797 * Preload the next potential metaslabs but only on active
2798 * metaslab groups. We can get into a state where the metaslab
2799	 * is no longer active since we dirty metaslabs as we remove a
2800	 * device, thus potentially making the metaslab group eligible
2801 * for preloading.
2802 */
2803 if (mg->mg_activation_count > 0) {
4019 vdev_t *vd;
4020
4021 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4022 return (SET_ERROR(ENXIO));
4023 }
4024
4025 ASSERT(DVA_IS_VALID(dva));
4026
4027 if (DVA_GET_GANG(dva))
4028 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4029
4030 return (metaslab_claim_impl(vd, offset, size, txg));
4031 }
4032
4033 int
4034 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4035 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4036 zio_alloc_list_t *zal, zio_t *zio, int allocator)
4037 {
4038 dva_t *dva = bp->blk_dva;
4039 dva_t *hintdva = hintbp->blk_dva;
4040 int error = 0;
4041
4042 ASSERT(bp->blk_birth == 0);
4043 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4044
4045 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4046
4047 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
4048 spa_config_exit(spa, SCL_ALLOC, FTAG);
4049 return (SET_ERROR(ENOSPC));
4050 }
4051
4052 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4053 ASSERT(BP_GET_NDVAS(bp) == 0);
4054 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4055 ASSERT3P(zal, !=, NULL);
4056
4057 for (int d = 0; d < ndvas; d++) {
4058 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4059 txg, flags, zal, allocator);
4186 {
4187 metaslab_t *msp;
4188 spa_t *spa = vd->vdev_spa;
4189
4190 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4191 return;
4192
4193 if (vd->vdev_ops->vdev_op_remap != NULL) {
4194 vd->vdev_ops->vdev_op_remap(vd, offset, size,
4195 metaslab_check_free_impl_cb, NULL);
4196 return;
4197 }
4198
4199 ASSERT(vdev_is_concrete(vd));
4200 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4201 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4202
4203 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4204
4205 mutex_enter(&msp->ms_lock);
4206 if (msp->ms_loaded)
4207 range_tree_verify(msp->ms_allocatable, offset, size);
4208
4209 range_tree_verify(msp->ms_freeing, offset, size);
4210 range_tree_verify(msp->ms_checkpointing, offset, size);
4211 range_tree_verify(msp->ms_freed, offset, size);
4212 for (int j = 0; j < TXG_DEFER_SIZE; j++)
4213 range_tree_verify(msp->ms_defer[j], offset, size);
4214 mutex_exit(&msp->ms_lock);
4215 }
4216
4217 void
4218 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4219 {
4220 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4221 return;
4222
4223 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4224 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4225 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4226 vdev_t *vd = vdev_lookup_top(spa, vdev);
4227 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4228 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4229
4230 if (DVA_GET_GANG(&bp->blk_dva[i]))
4231 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4232
4233 ASSERT3P(vd, !=, NULL);
|
481 return (1);
482
483 if (m1->ms_weight < m2->ms_weight)
484 return (1);
485 if (m1->ms_weight > m2->ms_weight)
486 return (-1);
487
488 /*
489 * If the weights are identical, use the offset to force uniqueness.
490 */
491 if (m1->ms_start < m2->ms_start)
492 return (-1);
493 if (m1->ms_start > m2->ms_start)
494 return (1);
495
496 ASSERT3P(m1, ==, m2);
497
498 return (0);
499 }
500
501 uint64_t
502 metaslab_allocated_space(metaslab_t *msp)
503 {
504 return (msp->ms_allocated_space);
505 }
506
507 /*
508 * Verify that the space accounting on disk matches the in-core range_trees.
509 */
510 static void
511 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
512 {
513 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
514 uint64_t allocating = 0;
515 uint64_t sm_free_space, msp_free_space;
516
517 ASSERT(MUTEX_HELD(&msp->ms_lock));
518 ASSERT(!msp->ms_condensing);
519
520 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
521 return;
522
523 /*
524 * We can only verify the metaslab space when we're called
525 * from syncing context with a loaded metaslab that has an
526 * allocated space map. Calling this in non-syncing context
527 * does not provide a consistent view of the metaslab since
528 * we're performing allocations in the future.
529 */
530 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
531 !msp->ms_loaded)
532 return;
533
534 /*
535 * Even though the smp_alloc field can get negative (e.g.
536 * see vdev_checkpoint_sm), that should never be the case
537	 * when it comes to a metaslab's space map.
538 */
539 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
540
541 sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
542
543 /*
544 * Account for future allocations since we would have
545 * already deducted that space from the ms_allocatable.
546 */
547 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
548 allocating +=
549 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
550 }
551
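	/*
	 * The two defer trees (TXG_DEFER_SIZE) together hold exactly the
	 * amount of space tracked by ms_deferspace.
	 */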
552 ASSERT3U(msp->ms_deferspace, ==,
553 range_tree_space(msp->ms_defer[0]) +
554 range_tree_space(msp->ms_defer[1]));
555
556 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
557 msp->ms_deferspace + range_tree_space(msp->ms_freed);
558
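	/*
	 * The on-disk accounting (sm_free_space) and the in-core view
	 * (msp_free_space: allocatable + allocating + deferred + freed)
	 * must agree on the metaslab's free space.
	 */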
559 VERIFY3U(sm_free_space, ==, msp_free_space);
560 }
561
562 /*
563 * ==========================================================================
564 * Metaslab groups
565 * ==========================================================================
566 */
567 /*
568 * Update the allocatable flag and the metaslab group's capacity.
569 * The allocatable flag is set to true if the capacity is below
570 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
571 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
572 * transitions from allocatable to non-allocatable or vice versa then the
573 * metaslab group's class is updated to reflect the transition.
574 */
575 static void
576 metaslab_group_alloc_update(metaslab_group_t *mg)
841
842 void
843 metaslab_group_histogram_verify(metaslab_group_t *mg)
844 {
845 uint64_t *mg_hist;
846 vdev_t *vd = mg->mg_vd;
847 uint64_t ashift = vd->vdev_ashift;
848 int i;
849
850 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
851 return;
852
853 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
854 KM_SLEEP);
855
856 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
857 SPACE_MAP_HISTOGRAM_SIZE + ashift);
858
859 for (int m = 0; m < vd->vdev_ms_count; m++) {
860 metaslab_t *msp = vd->vdev_ms[m];
861 ASSERT(msp != NULL);
862
863 /* skip if not active or not a member */
864 if (msp->ms_sm == NULL || msp->ms_group != mg)
865 continue;
866
867 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
868 mg_hist[i + ashift] +=
869 msp->ms_sm->sm_phys->smp_histogram[i];
870 }
871
872	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
873 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
874
875 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
876 }
877
878 static void
879 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
880 {
881 metaslab_class_t *mc = mg->mg_class;
1462
1463 if ((rs->rs_end - rs->rs_start) >= size) {
1464 *cursor = rs->rs_start + size;
1465 return (rs->rs_start);
1466 }
1467 return (-1ULL);
1468 }
1469
1470 static metaslab_ops_t metaslab_ndf_ops = {
1471 metaslab_ndf_alloc
1472 };
1473
1474 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1475
1476 /*
1477 * ==========================================================================
1478 * Metaslabs
1479 * ==========================================================================
1480 */
1481
1482 static void
1483 metaslab_aux_histograms_clear(metaslab_t *msp)
1484 {
1485 /*
1486 * Auxiliary histograms are only cleared when resetting them,
1487 * which can only happen while the metaslab is loaded.
1488 */
1489 ASSERT(msp->ms_loaded);
1490
1491 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1492 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1493 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1494 }
1495
1496 static void
1497 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1498 range_tree_t *rt)
1499 {
1500 /*
1501 * This is modeled after space_map_histogram_add(), so refer to that
1502 * function for implementation details. We want this to work like
1503 * the space map histogram, and not the range tree histogram, as we
1504 * are essentially constructing a delta that will be later subtracted
1505 * from the space map histogram.
1506 */
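	/*
	 * Range tree buckets larger than the space map histogram can
	 * represent are folded into its last bucket; each such segment is
	 * counted as 2^(i - idx - shift) segments of the last bucket's size,
	 * so the total amount of space is preserved.
	 */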
1507 int idx = 0;
1508 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1509 ASSERT3U(i, >=, idx + shift);
1510 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1511
1512 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1513 ASSERT3U(idx + shift, ==, i);
1514 idx++;
1515 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1516 }
1517 }
1518 }
1519
1520 /*
1521	 * Called at every sync pass in which the metaslab gets synced.
1522	 *
1523	 * The reason is that we want our auxiliary histograms to be updated
1524	 * whenever the metaslab's space map histogram is updated. This way
1525	 * we stay consistent on which parts of the metaslab space map's
1526	 * histogram are currently not available for allocations (e.g. because
1527	 * they are in the defer, freed, and freeing trees).
1528 */
1529 static void
1530 metaslab_aux_histograms_update(metaslab_t *msp)
1531 {
1532 space_map_t *sm = msp->ms_sm;
1533 ASSERT(sm != NULL);
1534
1535 /*
1536 * This is similar to the metaslab's space map histogram updates
1537 * that take place in metaslab_sync(). The only difference is that
1538 * we only care about segments that haven't made it into the
1539 * ms_allocatable tree yet.
1540 */
1541 if (msp->ms_loaded) {
1542 metaslab_aux_histograms_clear(msp);
1543
1544 metaslab_aux_histogram_add(msp->ms_synchist,
1545 sm->sm_shift, msp->ms_freed);
1546
1547 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1548 metaslab_aux_histogram_add(msp->ms_deferhist[t],
1549 sm->sm_shift, msp->ms_defer[t]);
1550 }
1551 }
1552
1553 metaslab_aux_histogram_add(msp->ms_synchist,
1554 sm->sm_shift, msp->ms_freeing);
1555 }
1556
1557 /*
1558 * Called every time we are done syncing (writing to) the metaslab,
1559 * i.e. at the end of each sync pass.
1560 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1561 */
1562 static void
1563 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1564 {
1565 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1566 space_map_t *sm = msp->ms_sm;
1567
1568 if (sm == NULL) {
1569 /*
1570 * We came here from metaslab_init() when creating/opening a
1571 * pool, looking at a metaslab that hasn't had any allocations
1572 * yet.
1573 */
1574 return;
1575 }
1576
1577 /*
1578 * This is similar to the actions that we take for the ms_freed
1579 * and ms_defer trees in metaslab_sync_done().
1580 */
1581 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1582 if (defer_allowed) {
1583 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1584 sizeof (msp->ms_synchist));
1585 } else {
1586 bzero(msp->ms_deferhist[hist_index],
1587 sizeof (msp->ms_deferhist[hist_index]));
1588 }
1589 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1590 }
1591
1592 /*
1593 * Ensure that the metaslab's weight and fragmentation are consistent
1594 * with the contents of the histogram (either the range tree's histogram
1595	 * or the space map's, depending on whether the metaslab is loaded).
1596 */
1597 static void
1598 metaslab_verify_weight_and_frag(metaslab_t *msp)
1599 {
1600 ASSERT(MUTEX_HELD(&msp->ms_lock));
1601
1602 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1603 return;
1604
1605 /* see comment in metaslab_verify_unflushed_changes() */
1606 if (msp->ms_group == NULL)
1607 return;
1608
1609 /*
1610 * Devices being removed always return a weight of 0 and leave
1611 * fragmentation and ms_max_size as is - there is nothing for
1612 * us to verify here.
1613 */
1614 vdev_t *vd = msp->ms_group->mg_vd;
1615 if (vd->vdev_removing)
1616 return;
1617
1618 /*
1619 * If the metaslab is dirty it probably means that we've done
1620 * some allocations or frees that have changed our histograms
1621 * and thus the weight.
1622 */
1623 for (int t = 0; t < TXG_SIZE; t++) {
1624 if (txg_list_member(&vd->vdev_ms_list, msp, t))
1625 return;
1626 }
1627
1628 /*
1629 * This verification checks that our in-memory state is consistent
1630 * with what's on disk. If the pool is read-only then there aren't
1631 * any changes and we just have the initially-loaded state.
1632 */
1633 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1634 return;
1635
1636	/* If the metaslab is loaded, do some extra verification on its in-core tree. */
1637 if (msp->ms_loaded) {
1638 range_tree_stat_verify(msp->ms_allocatable);
1639 VERIFY(space_map_histogram_verify(msp->ms_sm,
1640 msp->ms_allocatable));
1641 }
1642
1643 uint64_t weight = msp->ms_weight;
1644 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1645 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1646 uint64_t frag = msp->ms_fragmentation;
1647 uint64_t max_segsize = msp->ms_max_size;
1648
1649 msp->ms_weight = 0;
1650 msp->ms_fragmentation = 0;
1651 msp->ms_max_size = 0;
1652
1653 /*
1654 * This function is used for verification purposes. Regardless of
1655 * whether metaslab_weight() thinks this metaslab should be active or
1656 * not, we want to ensure that the actual weight (and therefore the
1657 * value of ms_weight) would be the same if it was to be recalculated
1658 * at this point.
1659 */
1660 msp->ms_weight = metaslab_weight(msp) | was_active;
1661
1662 VERIFY3U(max_segsize, ==, msp->ms_max_size);
1663
1664 /*
1665 * If the weight type changed then there is no point in doing
1666 * verification. Revert fields to their original values.
1667 */
1668 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1669 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1670 msp->ms_fragmentation = frag;
1671 msp->ms_weight = weight;
1672 return;
1673 }
1674
1675 VERIFY3U(msp->ms_fragmentation, ==, frag);
1676 VERIFY3U(msp->ms_weight, ==, weight);
1677 }
1678
1679 /*
1680 * Wait for any in-progress metaslab loads to complete.
1681 */
1682 static void
1683 metaslab_load_wait(metaslab_t *msp)
1684 {
1685 ASSERT(MUTEX_HELD(&msp->ms_lock));
1686
1687 while (msp->ms_loading) {
1688 ASSERT(!msp->ms_loaded);
1689 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1690 }
1691 }
1692
1693 static int
1694 metaslab_load_impl(metaslab_t *msp)
1695 {
1696 int error = 0;
1697
1698 ASSERT(MUTEX_HELD(&msp->ms_lock));
1699 ASSERT(msp->ms_loading);
1700 ASSERT(!msp->ms_condensing);
1701
1702 /*
1703 * We temporarily drop the lock to unblock other operations while we
1704 * are reading the space map. Therefore, metaslab_sync() and
1705 * metaslab_sync_done() can run at the same time as we do.
1706 *
1707 * metaslab_sync() can append to the space map while we are loading.
1708 * Therefore we load only entries that existed when we started the
1709 * load. Additionally, metaslab_sync_done() has to wait for the load
1710 * to complete because there are potential races like metaslab_load()
1711 * loading parts of the space map that are currently being appended
1712 * by metaslab_sync(). If we didn't, the ms_allocatable would have
1713 * entries that metaslab_sync_done() would try to re-add later.
1714 *
1715 * That's why before dropping the lock we remember the synced length
1716 * of the metaslab and read up to that point of the space map,
1717 * ignoring entries appended by metaslab_sync() that happen after we
1718 * drop the lock.
1719 */
1720 uint64_t length = msp->ms_synced_length;
1721 mutex_exit(&msp->ms_lock);
1722
1723 if (msp->ms_sm != NULL) {
1724 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1725 SM_FREE, length);
1726 } else {
1727 /*
1728 * The space map has not been allocated yet, so treat
1729 * all the space in the metaslab as free and add it to the
1730 * ms_allocatable tree.
1731 */
1732 range_tree_add(msp->ms_allocatable,
1733 msp->ms_start, msp->ms_size);
1734 }
1735
1736 /*
1737 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
1738 * changing the ms_sm and the metaslab's range trees while we are
1739 * about to use them and populate the ms_allocatable. The ms_lock
1740 * is insufficient for this because metaslab_sync() doesn't hold
1741 * the ms_lock while writing the ms_checkpointing tree to disk.
1742 */
1743 mutex_enter(&msp->ms_sync_lock);
1744 mutex_enter(&msp->ms_lock);
1745 ASSERT(!msp->ms_condensing);
1746
1747 if (error != 0) {
1748 mutex_exit(&msp->ms_sync_lock);
1749 return (error);
1750 }
1751
1752 ASSERT3P(msp->ms_group, !=, NULL);
1753 msp->ms_loaded = B_TRUE;
1754
1755 /*
1756 * The ms_allocatable contains the segments that exist in the
1757 * ms_defer trees [see ms_synced_length]. Thus we need to remove
1758 * them from ms_allocatable as they will be added again in
1759 * metaslab_sync_done().
1760 */
1761 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1762 range_tree_walk(msp->ms_defer[t],
1763 range_tree_remove, msp->ms_allocatable);
1764 }
1765
1766 /*
1767 * Call metaslab_recalculate_weight_and_sort() now that the
1768 * metaslab is loaded so we get the metaslab's real weight.
1769 *
1770 * Unless this metaslab was created with older software and
1771 * has not yet been converted to use segment-based weight, we
1772 * expect the new weight to be better or equal to the weight
1773 * that the metaslab had while it was not loaded. This is
1774 * because the old weight does not take into account the
1775 * consolidation of adjacent segments between TXGs. [see
1776 * comment for ms_synchist and ms_deferhist[] for more info]
1777 */
1778 uint64_t weight = msp->ms_weight;
1779 metaslab_recalculate_weight_and_sort(msp);
1780 if (!WEIGHT_IS_SPACEBASED(weight))
1781 ASSERT3U(weight, <=, msp->ms_weight);
1782 msp->ms_max_size = metaslab_block_maxsize(msp);
1783
1784 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1785 metaslab_verify_space(msp, spa_syncing_txg(spa));
1786 mutex_exit(&msp->ms_sync_lock);
1787
1788 return (0);
1789 }
1790
1791 int
1792 metaslab_load(metaslab_t *msp)
1793 {
1794 ASSERT(MUTEX_HELD(&msp->ms_lock));
1795
1796 /*
1797	 * There may be another thread loading the same metaslab; if that's
1798	 * the case, just wait until the other thread is done and return.
1799 */
1800 metaslab_load_wait(msp);
1801 if (msp->ms_loaded)
1802 return (0);
1803 VERIFY(!msp->ms_loading);
1804 ASSERT(!msp->ms_condensing);
1805
1806 msp->ms_loading = B_TRUE;
1807 int error = metaslab_load_impl(msp);
1808 msp->ms_loading = B_FALSE;
1809 cv_broadcast(&msp->ms_load_cv);
1810
1811 return (error);
1812 }
1813
1814 void
1815 metaslab_unload(metaslab_t *msp)
1816 {
1817 ASSERT(MUTEX_HELD(&msp->ms_lock));
1818
1819 metaslab_verify_weight_and_frag(msp);
1820
1821 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1822 msp->ms_loaded = B_FALSE;
1823
1824 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1825 msp->ms_max_size = 0;
1826
1827 /*
1828 * We explicitly recalculate the metaslab's weight based on its space
1829	 * map (as it is now not loaded). We want unloaded metaslabs to always
1830	 * have their weights calculated from the space map histograms, while
1831	 * loaded ones have it calculated from their in-core range tree
1832	 * [see metaslab_load()]. This way, the weight reflects the information
1833	 * available in-core, whether the metaslab is loaded or not.
1834	 *
1835	 * If ms_group == NULL, we came here from metaslab_fini(),
1836 * at which point it doesn't make sense for us to do the recalculation
1837 * and the sorting.
1838 */
1839 if (msp->ms_group != NULL)
1840 metaslab_recalculate_weight_and_sort(msp);
1841 }
1842
1843 static void
1844 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1845 int64_t defer_delta, int64_t space_delta)
1846 {
1847 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1848
1849 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1850 ASSERT(vd->vdev_ms_count != 0);
1851
1852 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1853 vdev_deflated_space(vd, space_delta));
1854 }
1855
1856 int
1857 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1858 metaslab_t **msp)
1859 {
1860 vdev_t *vd = mg->mg_vd;
1861 spa_t *spa = vd->vdev_spa;
1862 objset_t *mos = spa->spa_meta_objset;
1863 metaslab_t *ms;
1864 int error;
1865
1866 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1867 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1868 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1869 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1870
1871 ms->ms_id = id;
1872 ms->ms_start = id << vd->vdev_ms_shift;
1873 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1874 ms->ms_allocator = -1;
1875 ms->ms_new = B_TRUE;
1876
1877 /*
1878 * We only open space map objects that already exist. All others
1879 * will be opened when we finally allocate an object for it.
1880 *
1881 * Note:
1882 * When called from vdev_expand(), we can't call into the DMU as
1883 * we are holding the spa_config_lock as a writer and we would
1884	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
1885 * that case, the object parameter is zero though, so we won't
1886 * call into the DMU.
1887 */
1888 if (object != 0) {
1889 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1890 ms->ms_size, vd->vdev_ashift);
1891
1892 if (error != 0) {
1893 kmem_free(ms, sizeof (metaslab_t));
1894 return (error);
1895 }
1896
1897 ASSERT(ms->ms_sm != NULL);
1898 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
1899 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
1900 }
1901
1902 /*
1903 * We create the ms_allocatable here, but we don't create the
1904 * other range trees until metaslab_sync_done(). This serves
1905 * two purposes: it allows metaslab_sync_done() to detect the
1906 * addition of new space; and for debugging, it ensures that
1907 * we'd data fault on any attempt to use this metaslab before
1908 * it's ready.
1909 */
1910 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1911 metaslab_group_add(mg, ms);
1912
1913 metaslab_set_fragmentation(ms);
1914
1915 /*
1916 * If we're opening an existing pool (txg == 0) or creating
1917 * a new one (txg == TXG_INITIAL), all space is available now.
1918 * If we're adding space to an existing pool, the new space
1919 * does not become available until after this txg has synced.
1920 * The metaslab's weight will also be initialized when we sync
1921 * out this txg. This ensures that we don't attempt to allocate
1922 * from it before we have initialized it completely.
1923 */
1924 if (txg <= TXG_INITIAL) {
1925 metaslab_sync_done(ms, 0);
1926 metaslab_space_update(vd, mg->mg_class,
1927 metaslab_allocated_space(ms), 0, 0);
1928 }
1929
1930 /*
1931 * If metaslab_debug_load is set and we're initializing a metaslab
1932 * that has an allocated space map object then load the space map
1933 * so that we can verify frees.
1934 */
1935 if (metaslab_debug_load && ms->ms_sm != NULL) {
1936 mutex_enter(&ms->ms_lock);
1937 VERIFY0(metaslab_load(ms));
1938 mutex_exit(&ms->ms_lock);
1939 }
1940
1941 if (txg != 0) {
1942 vdev_dirty(vd, 0, NULL, txg);
1943 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1944 }
1945
1946 *msp = ms;
1947
1948 return (0);
1949 }
1950
1951 void
1952 metaslab_fini(metaslab_t *msp)
1953 {
1954 metaslab_group_t *mg = msp->ms_group;
1955 vdev_t *vd = mg->mg_vd;
1956
1957 metaslab_group_remove(mg, msp);
1958
1959 mutex_enter(&msp->ms_lock);
1960 VERIFY(msp->ms_group == NULL);
1961 metaslab_space_update(vd, mg->mg_class,
1962 -metaslab_allocated_space(msp), 0, -msp->ms_size);
1963
1964 space_map_close(msp->ms_sm);
1965
1966 metaslab_unload(msp);
1967
1968 range_tree_destroy(msp->ms_allocatable);
1969 range_tree_destroy(msp->ms_freeing);
1970 range_tree_destroy(msp->ms_freed);
1971
1972 for (int t = 0; t < TXG_SIZE; t++) {
1973 range_tree_destroy(msp->ms_allocating[t]);
1974 }
1975
1976 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1977 range_tree_destroy(msp->ms_defer[t]);
1978 }
1979 ASSERT0(msp->ms_deferspace);
1980
1981 range_tree_destroy(msp->ms_checkpointing);
1982
1983 for (int t = 0; t < TXG_SIZE; t++)
1984 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
1985
1986 mutex_exit(&msp->ms_lock);
1987 cv_destroy(&msp->ms_load_cv);
1988 mutex_destroy(&msp->ms_lock);
1989 mutex_destroy(&msp->ms_sync_lock);
1990 ASSERT3U(msp->ms_allocator, ==, -1);
1991
1992 kmem_free(msp, sizeof (metaslab_t));
1993 }
1994
1995 #define FRAGMENTATION_TABLE_SIZE 17
1996
1997 /*
1998 * This table defines a segment size based fragmentation metric that will
1999 * allow each metaslab to derive its own fragmentation value. This is done
2000 * by calculating the space in each bucket of the spacemap histogram and
2001 * multiplying that by the fragmentation metric in this table. Doing
2002 * this for all buckets and dividing it by the total amount of free
2003 * space in this metaslab (i.e. the total free space in all buckets) gives
2004 * us the fragmentation metric. This means that a high fragmentation metric
2005 * equates to most of the free space being comprised of small segments.
2006 * Conversely, if the metric is low, then most of the free space is in
2007 * large segments. A 10% change in fragmentation equates to approximately
2008 * double the number of segments.
2009 *
2010 * This table defines 0% fragmented space using 16MB segments. Testing has
2011 * shown that segments that are greater than or equal to 16MB do not suffer
2012 * from drastic performance problems. Using this value, we derive the rest
2013 * of the table. Since the fragmentation value is never stored on disk, it
2014 * is possible to change these calculations in the future.
2015 */
2016 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2017 100, /* 512B */
2018 100, /* 1K */
2019 98, /* 2K */
2020 95, /* 4K */
2021 90, /* 8K */
2022 80, /* 16K */
2023 70, /* 32K */
2024 60, /* 64K */
2025 50, /* 128K */
2026 40, /* 256K */
2027 30, /* 512K */
2028 20, /* 1M */
2029 15, /* 2M */
2030 10, /* 4M */
2031 5, /* 8M */
2032 0 /* 16M */
2033 };
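/*
 * As an example, if half of a metaslab's free space is in 4K segments
 * (factor 95) and half is in 1M segments (factor 20), its fragmentation
 * works out to (95 + 20) / 2 = 57.5%.
 */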
2034
2035 /*
2036 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2037 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2038	 * been upgraded and does not support this metric. Otherwise, the
2039	 * stored value should be in the range [0, 100].
2040 */
2041 static void
2042 metaslab_set_fragmentation(metaslab_t *msp)
2043 {
2044 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2045 uint64_t fragmentation = 0;
2046 uint64_t total = 0;
2047 boolean_t feature_enabled = spa_feature_is_enabled(spa,
2048 SPA_FEATURE_SPACEMAP_HISTOGRAM);
2049
2050 if (!feature_enabled) {
2051 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2052 return;
2053 }
2054
2055 /*
2056 * A null space map means that the entire metaslab is free
2057 * and thus is not fragmented.
2058 */
2059 if (msp->ms_sm == NULL) {
2112 }
2113
2114 /*
2115 * Compute a weight -- a selection preference value -- for the given metaslab.
2116 * This is based on the amount of free space, the level of fragmentation,
2117 * the LBA range, and whether the metaslab is loaded.
2118 */
2119 static uint64_t
2120 metaslab_space_weight(metaslab_t *msp)
2121 {
2122 metaslab_group_t *mg = msp->ms_group;
2123 vdev_t *vd = mg->mg_vd;
2124 uint64_t weight, space;
2125
2126 ASSERT(MUTEX_HELD(&msp->ms_lock));
2127 ASSERT(!vd->vdev_removing);
2128
2129 /*
2130 * The baseline weight is the metaslab's free space.
2131 */
2132 space = msp->ms_size - metaslab_allocated_space(msp);
2133
2134 if (metaslab_fragmentation_factor_enabled &&
2135 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2136 /*
2137 * Use the fragmentation information to inversely scale
2138 * down the baseline weight. We need to ensure that we
2139 * don't exclude this metaslab completely when it's 100%
2140 * fragmented. To avoid this we reduce the fragmentation value
2141 * by 1.
2142 */
2143 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
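/*
 * For example, with ms_fragmentation at its maximum of 100 the
 * multiplier above becomes (100 - 99) / 100, so the baseline weight is
 * scaled down to 1% rather than all the way to zero.
 */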
2144
2145 /*
2146 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2147 * this metaslab again. The fragmentation metric may have
2148 * decreased the space to something smaller than
2149 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2150 * so that we can consume any remaining space.
2151 */
2152 if (space > 0 && space < SPA_MINBLOCKSIZE)
2216 continue;
2217
2218 if (segments != 0) {
2219 WEIGHT_SET_COUNT(weight, segments);
2220 WEIGHT_SET_INDEX(weight, i);
2221 WEIGHT_SET_ACTIVE(weight, 0);
2222 break;
2223 }
2224 }
2225 return (weight);
2226 }
2227
2228 /*
2229 * Calculate the weight based on the on-disk histogram. This should only
2230 * be called after a sync pass has completely finished since the on-disk
2231 * information is updated in metaslab_sync().
2232 */
2233 static uint64_t
2234 metaslab_weight_from_spacemap(metaslab_t *msp)
2235 {
2236 space_map_t *sm = msp->ms_sm;
2237 ASSERT(!msp->ms_loaded);
2238 ASSERT(sm != NULL);
2239 ASSERT3U(space_map_object(sm), !=, 0);
2240 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2241
2242 /*
2243 * Create a joint histogram from all the segments that have made
2244 * it to the metaslab's space map histogram but are not yet
2245 * available for allocation because they are still in the freeing
2246 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2247 * these segments from the space map's histogram to get a more
2248 * accurate weight.
2249 */
2250 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2251 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2252 deferspace_histogram[i] += msp->ms_synchist[i];
2253 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2254 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2255 deferspace_histogram[i] += msp->ms_deferhist[t][i];
2256 }
2257 }
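
/*
 * Hypothetical example: if smp_histogram[10] == 6 and
 * deferspace_histogram[10] == 2, with sm_shift == 9 and every higher
 * bucket empty after the subtraction, the loop below encodes a count
 * of 4 at index 10 + 9 = 19, i.e. four free segments in the 512K-1M
 * range.
 */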
2258
2259 uint64_t weight = 0;
2260 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2261 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2262 deferspace_histogram[i]);
2263 uint64_t count =
2264 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2265 if (count != 0) {
2266 WEIGHT_SET_COUNT(weight, count);
2267 WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
2268 WEIGHT_SET_ACTIVE(weight, 0);
2269 break;
2270 }
2271 }
2272 return (weight);
2273 }
2274
2275 /*
2276 * Compute a segment-based weight for the specified metaslab. The weight
2277 * is determined by the highest bucket in the histogram. The information
2278 * for the highest bucket is encoded into the weight value.
2279 */
2280 static uint64_t
2281 metaslab_segment_weight(metaslab_t *msp)
2282 {
2283 metaslab_group_t *mg = msp->ms_group;
2284 uint64_t weight = 0;
2285 uint8_t shift = mg->mg_vd->vdev_ashift;
2286
2287 ASSERT(MUTEX_HELD(&msp->ms_lock));
2288
2289 /*
2290 * The metaslab is completely free.
2291 */
2292 if (metaslab_allocated_space(msp) == 0) {
2293 int idx = highbit64(msp->ms_size) - 1;
2294 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2295
2296 if (idx < max_idx) {
2297 WEIGHT_SET_COUNT(weight, 1ULL);
2298 WEIGHT_SET_INDEX(weight, idx);
2299 } else {
2300 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2301 WEIGHT_SET_INDEX(weight, max_idx);
2302 }
2303 WEIGHT_SET_ACTIVE(weight, 0);
2304 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
2305
2306 return (weight);
2307 }
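
/*
 * Worked example with hypothetical sizes (assuming an ashift of 9 and
 * SPACE_MAP_HISTOGRAM_SIZE of 32): for an 8GB metaslab,
 * idx = highbit64(2^33) - 1 = 33 and max_idx = 32 + 9 - 1 = 40, so the
 * code above encodes a single free segment of 2^33 bytes. Only a
 * metaslab of 1TB (2^40 bytes) or larger would take the else branch,
 * which folds the excess size into the count instead.
 */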
2308
2309 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2310
2311 /*
2312 * If the metaslab is fully allocated then just make the weight 0.
2313 */
2314 if (metaslab_allocated_space(msp) == msp->ms_size)
2315 return (0);
2316 /*
2317 * If the metaslab is already loaded, then use the range tree to
2318 * determine the weight. Otherwise, we rely on the space map information
2319 * to generate the weight.
2320 */
2321 if (msp->ms_loaded) {
2322 weight = metaslab_weight_from_range_tree(msp);
2323 } else {
2324 weight = metaslab_weight_from_spacemap(msp);
2325 }
2326
2327 /*
2328 * If the metaslab was active the last time we calculated its weight
2329 * then keep it active. We want to consume the entire region that
2330 * is associated with this weight.
2331 */
2332 if (msp->ms_activation_weight != 0 && weight != 0)
2333 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2334 return (weight);
2375 uint64_t weight;
2376
2377 ASSERT(MUTEX_HELD(&msp->ms_lock));
2378
2379 /*
2380 * If this vdev is in the process of being removed, there is nothing
2381 * for us to do here.
2382 */
2383 if (vd->vdev_removing)
2384 return (0);
2385
2386 metaslab_set_fragmentation(msp);
2387
2388 /*
2389 * Update the maximum size if the metaslab is loaded. This will
2390 * ensure that we get an accurate maximum size if newly freed space
2391 * has been added back into the free tree.
2392 */
2393 if (msp->ms_loaded)
2394 msp->ms_max_size = metaslab_block_maxsize(msp);
2395 else
2396 ASSERT0(msp->ms_max_size);
2397
2398 /*
2399 * Segment-based weighting requires space map histogram support.
2400 */
2401 if (zfs_metaslab_segment_weight_enabled &&
2402 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2403 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2404 sizeof (space_map_phys_t))) {
2405 weight = metaslab_segment_weight(msp);
2406 } else {
2407 weight = metaslab_space_weight(msp);
2408 }
2409 return (weight);
2410 }
2411
2412 void
2413 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2414 {
2415 /* Note: we preserve the mask (e.g. indication of primary, etc.). */
2416 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2417 metaslab_group_sort(msp->ms_group, msp,
2418 metaslab_weight(msp) | was_active);
2419 }
2420
2421 static int
2422 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2423 int allocator, uint64_t activation_weight)
2424 {
2425 /*
2426 * If we're activating for the claim code, we don't want to actually
2427 * set the metaslab up for a specific allocator.
2428 */
2429 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2430 return (0);
2431 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2432 mg->mg_primaries : mg->mg_secondaries);
2433
2434 ASSERT(MUTEX_HELD(&msp->ms_lock));
2435 mutex_enter(&mg->mg_lock);
2436 if (arr[allocator] != NULL) {
2437 mutex_exit(&mg->mg_lock);
2438 return (EEXIST);
2439 }
2440
2785 ASSERT3P(alloctree, !=, NULL);
2786 ASSERT3P(msp->ms_freeing, !=, NULL);
2787 ASSERT3P(msp->ms_freed, !=, NULL);
2788 ASSERT3P(msp->ms_checkpointing, !=, NULL);
2789
2790 /*
2791 * Normally, we don't want to process a metaslab if there are no
2792 * allocations or frees to perform. However, if the metaslab is being
2793 * forced to condense and it's loaded, we need to let it through.
2794 */
2795 if (range_tree_is_empty(alloctree) &&
2796 range_tree_is_empty(msp->ms_freeing) &&
2797 range_tree_is_empty(msp->ms_checkpointing) &&
2798 !(msp->ms_loaded && msp->ms_condense_wanted))
2799 return;
2800
2801
2802 VERIFY(txg <= spa_final_dirty_txg(spa));
2803
2804 /*
2805 * The only state that can actually be changing concurrently
2806 * with metaslab_sync() is the metaslab's ms_allocatable. No
2807 * other thread can be modifying this txg's alloc, freeing,
2808 * freed, or space_map_phys_t. We drop ms_lock whenever we
2809 * could call into the DMU, because the DMU can call down to
2810 * us (e.g. via zio_free()) at any time.
2811 *
2812 * The spa_vdev_remove_thread() can be reading metaslab state
2813 * concurrently, and it is locked out by the ms_sync_lock.
2814 * Note that the ms_lock is insufficient for this, because it
2815 * is dropped by space_map_write().
2816 */
2817 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2818
2819 if (msp->ms_sm == NULL) {
2820 uint64_t new_object;
2821
2822 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2823 VERIFY3U(new_object, !=, 0);
2824
2825 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2826 msp->ms_start, msp->ms_size, vd->vdev_ashift));
2827
2828 ASSERT(msp->ms_sm != NULL);
2829 ASSERT0(metaslab_allocated_space(msp));
2830 }
2831
2832 if (!range_tree_is_empty(msp->ms_checkpointing) &&
2833 vd->vdev_checkpoint_sm == NULL) {
2834 ASSERT(spa_has_checkpoint(spa));
2835
2836 uint64_t new_object = space_map_alloc(mos,
2837 vdev_standard_sm_blksz, tx);
2838 VERIFY3U(new_object, !=, 0);
2839
2840 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2841 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2842 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2843
2844 /*
2845 * We save the space map object as an entry in vdev_top_zap
2846 * so it can be retrieved when the pool is reopened after an
2847 * export or through zdb.
2848 */
2849 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2857 /*
2858 * Note: metaslab_condense() clears the space map's histogram.
2859 * Therefore we must verify and remove this histogram before
2860 * condensing.
2861 */
2862 metaslab_group_histogram_verify(mg);
2863 metaslab_class_histogram_verify(mg->mg_class);
2864 metaslab_group_histogram_remove(mg, msp);
2865
2866 if (msp->ms_loaded && metaslab_should_condense(msp)) {
2867 metaslab_condense(msp, txg, tx);
2868 } else {
2869 mutex_exit(&msp->ms_lock);
2870 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2871 SM_NO_VDEVID, tx);
2872 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2873 SM_NO_VDEVID, tx);
2874 mutex_enter(&msp->ms_lock);
2875 }
2876
2877 msp->ms_allocated_space += range_tree_space(alloctree);
2878 ASSERT3U(msp->ms_allocated_space, >=,
2879 range_tree_space(msp->ms_freeing));
2880 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
2881
2882 if (!range_tree_is_empty(msp->ms_checkpointing)) {
2883 ASSERT(spa_has_checkpoint(spa));
2884 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2885
2886 /*
2887 * Since we are doing writes to disk and the ms_checkpointing
2888 * tree won't be changing during that time, we drop the
2889 * ms_lock while writing to the checkpoint space map.
2890 */
2891 mutex_exit(&msp->ms_lock);
2892 space_map_write(vd->vdev_checkpoint_sm,
2893 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2894 mutex_enter(&msp->ms_lock);
2895
2896 spa->spa_checkpoint_info.sci_dspace +=
2897 range_tree_space(msp->ms_checkpointing);
2898 vd->vdev_stat.vs_checkpoint_space +=
2899 range_tree_space(msp->ms_checkpointing);
2900 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2901 -space_map_allocated(vd->vdev_checkpoint_sm));
2902
2903 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2904 }
2905
2906 if (msp->ms_loaded) {
2907 /*
2908 * When the space map is loaded, we have an accurate
2909 * histogram in the range tree. This gives us an opportunity
2910 * to bring the space map's histogram up-to-date, so we clear
2911 * it before updating it.
2912 */
2913 space_map_histogram_clear(msp->ms_sm);
2914 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2915
2916 /*
2917 * Since we've cleared the histogram we need to add back
2918 * any free space that has already been processed, plus
2919 * any deferred space. This allows the on-disk histogram
2920 * to accurately reflect all free space even if some space
2921 * is not yet available for allocation (i.e. deferred).
2922 */
2923 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
2924
2925 /*
2926 * Add back any deferred free space that has not been
2927 * added back into the in-core free tree yet. This will
2928 * ensure that we don't end up with a space map histogram
2929 * that is completely empty unless the metaslab is fully
2930 * allocated.
2931 */
2932 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2933 space_map_histogram_add(msp->ms_sm,
2934 msp->ms_defer[t], tx);
2935 }
2936 }
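
/*
 * When the metaslab is loaded, the space map histogram now reflects
 * ms_allocatable plus the freed and deferred trees; the frees from the
 * current sync pass (ms_freeing) are folded in below.
 */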
2937
2938 /*
2939 * Always add the free space from this sync pass to the space
2940 * map histogram. We want to make sure that the on-disk histogram
2941 * accounts for all free space. If the space map is not loaded,
2942 * then we will lose some accuracy but will correct it the next
2943 * time we load the space map.
2944 */
2945 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2946 metaslab_aux_histograms_update(msp);
2947
2948 metaslab_group_histogram_add(mg, msp);
2949 metaslab_group_histogram_verify(mg);
2950 metaslab_class_histogram_verify(mg->mg_class);
2951
2952 /*
2953 * For sync pass 1, we avoid traversing this txg's free range tree
2954 * and instead will just swap the pointers for freeing and freed.
2955 * We can safely do this since the freed_tree is guaranteed to be
2956 * empty on the initial pass.
2957 */
2958 if (spa_sync_pass(spa) == 1) {
2959 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2960 ASSERT0(msp->ms_allocated_this_txg);
2961 } else {
2962 range_tree_vacate(msp->ms_freeing,
2963 range_tree_add, msp->ms_freed);
2964 }
2965 msp->ms_allocated_this_txg += range_tree_space(alloctree);
2966 range_tree_vacate(alloctree, NULL, NULL);
2967
2968 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2969 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2970 & TXG_MASK]));
2971 ASSERT0(range_tree_space(msp->ms_freeing));
2972 ASSERT0(range_tree_space(msp->ms_checkpointing));
2973
2974 mutex_exit(&msp->ms_lock);
2975
2976 if (object != space_map_object(msp->ms_sm)) {
2977 object = space_map_object(msp->ms_sm);
2978 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2979 msp->ms_id, sizeof (uint64_t), &object, tx);
2980 }
2981 mutex_exit(&msp->ms_sync_lock);
2982 dmu_tx_commit(tx);
2983 }
2984
2985 /*
3023 msp->ms_defer[t] = range_tree_create(NULL, NULL);
3024 }
3025
3026 ASSERT3P(msp->ms_checkpointing, ==, NULL);
3027 msp->ms_checkpointing = range_tree_create(NULL, NULL);
3028
3029 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
3030 }
3031 ASSERT0(range_tree_space(msp->ms_freeing));
3032 ASSERT0(range_tree_space(msp->ms_checkpointing));
3033
3034 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
3035
3036 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
3037 metaslab_class_get_alloc(spa_normal_class(spa));
3038 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
3039 defer_allowed = B_FALSE;
3040 }
3041
3042 defer_delta = 0;
3043 alloc_delta = msp->ms_allocated_this_txg -
3044 range_tree_space(msp->ms_freed);
3045 if (defer_allowed) {
3046 defer_delta = range_tree_space(msp->ms_freed) -
3047 range_tree_space(*defer_tree);
3048 } else {
3049 defer_delta -= range_tree_space(*defer_tree);
3050 }
3051
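/*
 * alloc_delta above is the net change in allocated space this txg
 * (space allocated minus space freed), while defer_delta is the net
 * change in ms_deferspace (newly deferred frees minus the previously
 * deferred frees about to be returned to ms_allocatable). Passing both
 * here keeps the vdev and metaslab class space accounting consistent
 * with the tree manipulation that follows.
 */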
3052 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3053 defer_delta, 0);
3054
3055 /*
3056 * If there's a metaslab_load() in progress, wait for it to complete
3057 * so that we have a consistent view of the in-core space map.
3058 */
3059 metaslab_load_wait(msp);
3060
3061 /*
3062 * Move the frees from the defer_tree back to the free
3063 * range tree (if the metaslab is loaded). Swap the freed_tree and
3064 * the defer_tree -- this is safe to do because we've
3065 * just emptied out the defer_tree.
3066 */
3067 range_tree_vacate(*defer_tree,
3068 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
3069 if (defer_allowed) {
3070 range_tree_swap(&msp->ms_freed, defer_tree);
3071 } else {
3072 range_tree_vacate(msp->ms_freed,
3073 msp->ms_loaded ? range_tree_add : NULL,
3074 msp->ms_allocatable);
3075 }
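
/*
 * To summarize the free pipeline: a free first lands in ms_freeing, is
 * written out and moved to ms_freed in metaslab_sync(), then sits in
 * one of the TXG_DEFER_SIZE defer trees before being returned to
 * ms_allocatable here. Keeping freed space out of circulation for a
 * few txgs is what allows, for example, importing the pool at a
 * slightly older txg without those blocks having already been reused.
 */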
3076
3077 msp->ms_synced_length = space_map_length(msp->ms_sm);
3078
3079 msp->ms_deferspace += defer_delta;
3080 ASSERT3S(msp->ms_deferspace, >=, 0);
3081 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
3082 if (msp->ms_deferspace != 0) {
3083 /*
3084 * Keep syncing this metaslab until all deferred frees
3085 * are back in circulation.
3086 */
3087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3088 }
3089 metaslab_aux_histograms_update_done(msp, defer_allowed);
3090
3091 if (msp->ms_new) {
3092 msp->ms_new = B_FALSE;
3093 mutex_enter(&mg->mg_lock);
3094 mg->mg_ms_ready++;
3095 mutex_exit(&mg->mg_lock);
3096 }
3097
3098 /*
3099 * Re-sort the metaslab within its group now that we've adjusted
3100 * its allocatable space.
3101 */
3102 metaslab_recalculate_weight_and_sort(msp);
3103
3104 /*
3105 * If the metaslab is loaded and we've not tried to load or allocate
3106 * from it in 'metaslab_unload_delay' txgs, then unload it.
3107 */
3108 if (msp->ms_loaded &&
3109 msp->ms_initializing == 0 &&
3110 msp->ms_selected_txg + metaslab_unload_delay < txg) {
3111 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3112 VERIFY0(range_tree_space(
3113 msp->ms_allocating[(txg + t) & TXG_MASK]));
3114 }
3115 if (msp->ms_allocator != -1) {
3116 metaslab_passivate(msp, msp->ms_weight &
3117 ~METASLAB_ACTIVE_MASK);
3118 }
3119
3120 if (!metaslab_debug_unload)
3121 metaslab_unload(msp);
3122 }
3123
3124 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3125 ASSERT0(range_tree_space(msp->ms_freeing));
3126 ASSERT0(range_tree_space(msp->ms_freed));
3127 ASSERT0(range_tree_space(msp->ms_checkpointing));
3128
3129 msp->ms_allocated_this_txg = 0;
3130 mutex_exit(&msp->ms_lock);
3131 }
3132
3133 void
3134 metaslab_sync_reassess(metaslab_group_t *mg)
3135 {
3136 spa_t *spa = mg->mg_class->mc_spa;
3137
3138 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3139 metaslab_group_alloc_update(mg);
3140 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
3141
3142 /*
3143 * Preload the next potential metaslabs but only on active
3144 * metaslab groups. We can get into a state where the metaslab
3145 * is no longer active since we dirty metaslabs as we remove a
3146 * device, thus potentially making the metaslab group eligible
3147 * for preloading.
3148 */
3149 if (mg->mg_activation_count > 0) {
4365 vdev_t *vd;
4366
4367 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4368 return (SET_ERROR(ENXIO));
4369 }
4370
4371 ASSERT(DVA_IS_VALID(dva));
4372
4373 if (DVA_GET_GANG(dva))
4374 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4375
4376 return (metaslab_claim_impl(vd, offset, size, txg));
4377 }
4378
4379 int
4380 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4381 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4382 zio_alloc_list_t *zal, zio_t *zio, int allocator)
4383 {
4384 dva_t *dva = bp->blk_dva;
4385 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
4386 int error = 0;
4387
4388 ASSERT(bp->blk_birth == 0);
4389 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4390
4391 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4392
4393 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
4394 spa_config_exit(spa, SCL_ALLOC, FTAG);
4395 return (SET_ERROR(ENOSPC));
4396 }
4397
4398 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4399 ASSERT(BP_GET_NDVAS(bp) == 0);
4400 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4401 ASSERT3P(zal, !=, NULL);
4402
4403 for (int d = 0; d < ndvas; d++) {
4404 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4405 txg, flags, zal, allocator);
4532 {
4533 metaslab_t *msp;
4534 spa_t *spa = vd->vdev_spa;
4535
4536 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4537 return;
4538
4539 if (vd->vdev_ops->vdev_op_remap != NULL) {
4540 vd->vdev_ops->vdev_op_remap(vd, offset, size,
4541 metaslab_check_free_impl_cb, NULL);
4542 return;
4543 }
4544
4545 ASSERT(vdev_is_concrete(vd));
4546 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4547 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4548
4549 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4550
4551 mutex_enter(&msp->ms_lock);
4552 if (msp->ms_loaded) {
4553 range_tree_verify_not_present(msp->ms_allocatable,
4554 offset, size);
4555 }
4556
4557 range_tree_verify_not_present(msp->ms_freeing, offset, size);
4558 range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
4559 range_tree_verify_not_present(msp->ms_freed, offset, size);
4560 for (int j = 0; j < TXG_DEFER_SIZE; j++)
4561 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
4562 mutex_exit(&msp->ms_lock);
4563 }
4564
4565 void
4566 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4567 {
4568 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4569 return;
4570
4571 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4572 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4573 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4574 vdev_t *vd = vdev_lookup_top(spa, vdev);
4575 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4576 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4577
4578 if (DVA_GET_GANG(&bp->blk_dva[i]))
4579 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4580
4581 ASSERT3P(vd, !=, NULL);