481 return (1);
482
483 if (m1->ms_weight < m2->ms_weight)
484 return (1);
485 if (m1->ms_weight > m2->ms_weight)
486 return (-1);
487
488 /*
489 * If the weights are identical, use the offset to force uniqueness.
490 */
491 if (m1->ms_start < m2->ms_start)
492 return (-1);
493 if (m1->ms_start > m2->ms_start)
494 return (1);
495
496 ASSERT3P(m1, ==, m2);
497
498 return (0);
499 }
500
501 /*
502 * Verify that the space accounting on disk matches the in-core range_trees.
503 */
504 void
505 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
506 {
507 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
508 uint64_t allocated = 0;
509 uint64_t sm_free_space, msp_free_space;
510
511 ASSERT(MUTEX_HELD(&msp->ms_lock));
512
513 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
514 return;
515
516 /*
517 * We can only verify the metaslab space when we're called
518 * from syncing context with a loaded metaslab that has an allocated
519 * space map. Calling this in non-syncing context does not
520 * provide a consistent view of the metaslab since we're performing
521 * allocations in the future.
522 */
523 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
524 !msp->ms_loaded)
525 return;
526
527 sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
528 space_map_alloc_delta(msp->ms_sm);
529
530 /*
531	 * Account for future allocations since we would have already
532	 * deducted that space from the ms_allocatable.
533 */
534 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
535 allocated +=
536 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
537 }
538
539 msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
540 msp->ms_deferspace + range_tree_space(msp->ms_freed);
541
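	/*
	 * Both values describe the metaslab's free space: sm_free_space is
	 * derived from the on-disk space map accounting, while msp_free_space
	 * is rebuilt from the in-core range trees plus pending allocations,
	 * deferred frees, and space freed this txg.
	 */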
542 VERIFY3U(sm_free_space, ==, msp_free_space);
543 }
544
545 /*
546 * ==========================================================================
547 * Metaslab groups
548 * ==========================================================================
549 */
550 /*
551 * Update the allocatable flag and the metaslab group's capacity.
552 * The allocatable flag is set to true if the capacity is below
553 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
554 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
555 * transitions from allocatable to non-allocatable or vice versa then the
556 * metaslab group's class is updated to reflect the transition.
557 */
558 static void
559 metaslab_group_alloc_update(metaslab_group_t *mg)
824
825 void
826 metaslab_group_histogram_verify(metaslab_group_t *mg)
827 {
828 uint64_t *mg_hist;
829 vdev_t *vd = mg->mg_vd;
830 uint64_t ashift = vd->vdev_ashift;
831 int i;
832
833 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
834 return;
835
836 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
837 KM_SLEEP);
838
839 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
840 SPACE_MAP_HISTOGRAM_SIZE + ashift);
841
842 for (int m = 0; m < vd->vdev_ms_count; m++) {
843 metaslab_t *msp = vd->vdev_ms[m];
844
845 /* skip if not active or not a member */
846 if (msp->ms_sm == NULL || msp->ms_group != mg)
847 continue;
848
849 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
850 mg_hist[i + ashift] +=
851 msp->ms_sm->sm_phys->smp_histogram[i];
852 }
853
854	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
855 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
856
857 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
858 }
859
860 static void
861 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
862 {
863 metaslab_class_t *mc = mg->mg_class;
1444
1445 if ((rs->rs_end - rs->rs_start) >= size) {
1446 *cursor = rs->rs_start + size;
1447 return (rs->rs_start);
1448 }
1449 return (-1ULL);
1450 }
1451
1452 static metaslab_ops_t metaslab_ndf_ops = {
1453 metaslab_ndf_alloc
1454 };
1455
1456 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1457
1458 /*
1459 * ==========================================================================
1460 * Metaslabs
1461 * ==========================================================================
1462 */
1463
1464 /*
1465 * Wait for any in-progress metaslab loads to complete.
1466 */
1467 static void
1468 metaslab_load_wait(metaslab_t *msp)
1469 {
1470 ASSERT(MUTEX_HELD(&msp->ms_lock));
1471
1472 while (msp->ms_loading) {
1473 ASSERT(!msp->ms_loaded);
1474 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1475 }
1476 }
1477
1478 static int
1479 metaslab_load_impl(metaslab_t *msp)
1480 {
1481 int error = 0;
1482
1483 ASSERT(MUTEX_HELD(&msp->ms_lock));
1484 ASSERT(msp->ms_loading);
1485
1486 /*
1487 * Nobody else can manipulate a loading metaslab, so it's now safe
1488 * to drop the lock. This way we don't have to hold the lock while
1489 * reading the spacemap from disk.
1490 */
1491 mutex_exit(&msp->ms_lock);
1492
1493 /*
1494 * If the space map has not been allocated yet, then treat
1495 * all the space in the metaslab as free and add it to ms_allocatable.
1496 */
1497 if (msp->ms_sm != NULL) {
1498 error = space_map_load(msp->ms_sm, msp->ms_allocatable,
1499 SM_FREE);
1500 } else {
1501 range_tree_add(msp->ms_allocatable,
1502 msp->ms_start, msp->ms_size);
1503 }
1504
1505 mutex_enter(&msp->ms_lock);
1506
1507 if (error != 0)
1508 return (error);
1509
1510 ASSERT3P(msp->ms_group, !=, NULL);
1511 msp->ms_loaded = B_TRUE;
1512
1513 /*
1514 * If the metaslab already has a spacemap, then we need to
1515 * remove all segments from the defer tree; otherwise, the
1516 * metaslab is completely empty and we can skip this.
1517 */
1518 if (msp->ms_sm != NULL) {
1519 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1520 range_tree_walk(msp->ms_defer[t],
1521 range_tree_remove, msp->ms_allocatable);
1522 }
1523 }
1524 msp->ms_max_size = metaslab_block_maxsize(msp);
1525
1526 return (0);
1527 }
1528
1529 int
1530 metaslab_load(metaslab_t *msp)
1531 {
1532 ASSERT(MUTEX_HELD(&msp->ms_lock));
1533
1534 /*
1535	 * There may be another thread loading the same metaslab; if that's
1536	 * the case, just wait until the other thread is done and return.
1537 */
1538 metaslab_load_wait(msp);
1539 if (msp->ms_loaded)
1540 return (0);
1541 VERIFY(!msp->ms_loading);
1542
1543 msp->ms_loading = B_TRUE;
1544 int error = metaslab_load_impl(msp);
1545 msp->ms_loading = B_FALSE;
1546 cv_broadcast(&msp->ms_load_cv);
1547
1548 return (error);
1549 }
1550
1551 void
1552 metaslab_unload(metaslab_t *msp)
1553 {
1554 ASSERT(MUTEX_HELD(&msp->ms_lock));
1555 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1556 msp->ms_loaded = B_FALSE;
1557 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1558 msp->ms_max_size = 0;
1559 }
1560
1561 static void
1562 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1563 int64_t defer_delta, int64_t space_delta)
1564 {
1565 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1566
1567 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1568 ASSERT(vd->vdev_ms_count != 0);
1569
1570 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1571 vdev_deflated_space(vd, space_delta));
1572 }
1573
1574 int
1575 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1576 metaslab_t **msp)
1577 {
1578 vdev_t *vd = mg->mg_vd;
1579 spa_t *spa = vd->vdev_spa;
1580 objset_t *mos = spa->spa_meta_objset;
1581 metaslab_t *ms;
1582 int error;
1583
1584 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1585 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1586 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1587 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1588
1589 ms->ms_id = id;
1590 ms->ms_start = id << vd->vdev_ms_shift;
1591 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1592 ms->ms_allocator = -1;
1593 ms->ms_new = B_TRUE;
1594
1595 /*
1596 * We only open space map objects that already exist. All others
1597	 * will be opened when we finally allocate an object for them.
1598 */
1599 if (object != 0) {
1600 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1601 ms->ms_size, vd->vdev_ashift);
1602
1603 if (error != 0) {
1604 kmem_free(ms, sizeof (metaslab_t));
1605 return (error);
1606 }
1607
1608 ASSERT(ms->ms_sm != NULL);
1609 }
1610
1611 /*
1612 * We create the main range tree here, but we don't create the
1613 * other range trees until metaslab_sync_done(). This serves
1614 * two purposes: it allows metaslab_sync_done() to detect the
1615 * addition of new space; and for debugging, it ensures that we'd
1616 * data fault on any attempt to use this metaslab before it's ready.
1617 */
1618 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1619 metaslab_group_add(mg, ms);
1620
1621 metaslab_set_fragmentation(ms);
1622
1623 /*
1624 * If we're opening an existing pool (txg == 0) or creating
1625 * a new one (txg == TXG_INITIAL), all space is available now.
1626 * If we're adding space to an existing pool, the new space
1627 * does not become available until after this txg has synced.
1628 * The metaslab's weight will also be initialized when we sync
1629 * out this txg. This ensures that we don't attempt to allocate
1630 * from it before we have initialized it completely.
1631 */
1632 if (txg <= TXG_INITIAL)
1633 metaslab_sync_done(ms, 0);
1634
1635 /*
1636 * If metaslab_debug_load is set and we're initializing a metaslab
1637 * that has an allocated space map object then load the space map
1638 * so that we can verify frees.
1639 */
1640 if (metaslab_debug_load && ms->ms_sm != NULL) {
1641 mutex_enter(&ms->ms_lock);
1642 VERIFY0(metaslab_load(ms));
1643 mutex_exit(&ms->ms_lock);
1644 }
1645
1646 if (txg != 0) {
1647 vdev_dirty(vd, 0, NULL, txg);
1648 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1649 }
1650
1651 *msp = ms;
1652
1653 return (0);
1654 }
1655
1656 void
1657 metaslab_fini(metaslab_t *msp)
1658 {
1659 metaslab_group_t *mg = msp->ms_group;
1660 vdev_t *vd = mg->mg_vd;
1661
1662 metaslab_group_remove(mg, msp);
1663
1664 mutex_enter(&msp->ms_lock);
1665 VERIFY(msp->ms_group == NULL);
1666 metaslab_space_update(vd, mg->mg_class,
1667 -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
1668
1669 space_map_close(msp->ms_sm);
1670
1671 metaslab_unload(msp);
1672
1673 range_tree_destroy(msp->ms_allocatable);
1674 range_tree_destroy(msp->ms_freeing);
1675 range_tree_destroy(msp->ms_freed);
1676
1677 for (int t = 0; t < TXG_SIZE; t++) {
1678 range_tree_destroy(msp->ms_allocating[t]);
1679 }
1680
1681 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1682 range_tree_destroy(msp->ms_defer[t]);
1683 }
1684 ASSERT0(msp->ms_deferspace);
1685
1686 range_tree_destroy(msp->ms_checkpointing);
1687
1688 mutex_exit(&msp->ms_lock);
1689 cv_destroy(&msp->ms_load_cv);
1690 mutex_destroy(&msp->ms_lock);
1691 mutex_destroy(&msp->ms_sync_lock);
1692 ASSERT3U(msp->ms_allocator, ==, -1);
1693
1694 kmem_free(msp, sizeof (metaslab_t));
1695 }
1696
1697 #define FRAGMENTATION_TABLE_SIZE 17
1698
1699 /*
1700 * This table defines a segment size based fragmentation metric that will
1701 * allow each metaslab to derive its own fragmentation value. This is done
1702 * by calculating the space in each bucket of the spacemap histogram and
1703	 * multiplying that by the fragmentation metric in this table. Doing
1704 * this for all buckets and dividing it by the total amount of free
1705 * space in this metaslab (i.e. the total free space in all buckets) gives
1706 * us the fragmentation metric. This means that a high fragmentation metric
1707 * equates to most of the free space being comprised of small segments.
1708 * Conversely, if the metric is low, then most of the free space is in
1709 * large segments. A 10% change in fragmentation equates to approximately
1710 * double the number of segments.
1711 *
1712 * This table defines 0% fragmented space using 16MB segments. Testing has
1713 * shown that segments that are greater than or equal to 16MB do not suffer
1714 * from drastic performance problems. Using this value, we derive the rest
1715 * of the table. Since the fragmentation value is never stored on disk, it
1716 * is possible to change these calculations in the future.
1717 */
1718 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
1719 100, /* 512B */
1720 100, /* 1K */
1721 98, /* 2K */
1722 95, /* 4K */
1723 90, /* 8K */
1724 80, /* 16K */
1725 70, /* 32K */
1726 60, /* 64K */
1727 50, /* 128K */
1728 40, /* 256K */
1729 30, /* 512K */
1730 20, /* 1M */
1731 15, /* 2M */
1732 10, /* 4M */
1733 5, /* 8M */
1734 0 /* 16M */
1735 };
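/*
 * For example, a metaslab whose free space is split evenly between 4K
 * segments (factor 95) and 1M segments (factor 20) would report a
 * fragmentation of (50% * 95) + (50% * 20) = 57.5%.
 */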
1736
1737 /*
1738	 * Calculate the metaslab's fragmentation metric and store it in
1739	 * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
1740	 * has not been upgraded and does not support this metric. Otherwise,
1741	 * the value should be in the range [0, 100].
1742 */
1743 static void
1744 metaslab_set_fragmentation(metaslab_t *msp)
1745 {
1746 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1747 uint64_t fragmentation = 0;
1748 uint64_t total = 0;
1749 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1750 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1751
1752 if (!feature_enabled) {
1753 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1754 return;
1755 }
1756
1757 /*
1758 * A null space map means that the entire metaslab is free
1759 * and thus is not fragmented.
1760 */
1761 if (msp->ms_sm == NULL) {
1814 }
1815
1816 /*
1817 * Compute a weight -- a selection preference value -- for the given metaslab.
1818 * This is based on the amount of free space, the level of fragmentation,
1819 * the LBA range, and whether the metaslab is loaded.
1820 */
1821 static uint64_t
1822 metaslab_space_weight(metaslab_t *msp)
1823 {
1824 metaslab_group_t *mg = msp->ms_group;
1825 vdev_t *vd = mg->mg_vd;
1826 uint64_t weight, space;
1827
1828 ASSERT(MUTEX_HELD(&msp->ms_lock));
1829 ASSERT(!vd->vdev_removing);
1830
1831 /*
1832 * The baseline weight is the metaslab's free space.
1833 */
1834 space = msp->ms_size - space_map_allocated(msp->ms_sm);
1835
1836 if (metaslab_fragmentation_factor_enabled &&
1837 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1838 /*
1839 * Use the fragmentation information to inversely scale
1840 * down the baseline weight. We need to ensure that we
1841 * don't exclude this metaslab completely when it's 100%
1842 * fragmented. To avoid this we reduce the fragmented value
1843 * by 1.
1844 */
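		/*
		 * For example, a metaslab reported as 100% fragmented keeps
		 * (100 - 99) / 100 = 1% of its free space as weight.
		 */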
1845 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
1846
1847 /*
1848 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
1849 * this metaslab again. The fragmentation metric may have
1850 * decreased the space to something smaller than
1851 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
1852 * so that we can consume any remaining space.
1853 */
1854 if (space > 0 && space < SPA_MINBLOCKSIZE)
1918 continue;
1919
1920 if (segments != 0) {
1921 WEIGHT_SET_COUNT(weight, segments);
1922 WEIGHT_SET_INDEX(weight, i);
1923 WEIGHT_SET_ACTIVE(weight, 0);
1924 break;
1925 }
1926 }
1927 return (weight);
1928 }
1929
1930 /*
1931 * Calculate the weight based on the on-disk histogram. This should only
1932 * be called after a sync pass has completely finished since the on-disk
1933 * information is updated in metaslab_sync().
1934 */
1935 static uint64_t
1936 metaslab_weight_from_spacemap(metaslab_t *msp)
1937 {
1938 uint64_t weight = 0;
1939
1940 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1941 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1942 WEIGHT_SET_COUNT(weight,
1943 msp->ms_sm->sm_phys->smp_histogram[i]);
1944 WEIGHT_SET_INDEX(weight, i +
1945 msp->ms_sm->sm_shift);
1946 WEIGHT_SET_ACTIVE(weight, 0);
1947 break;
1948 }
1949 }
1950 return (weight);
1951 }
1952
1953 /*
1954 * Compute a segment-based weight for the specified metaslab. The weight
1955	 * is determined by the highest bucket in the histogram. The information
1956 * for the highest bucket is encoded into the weight value.
1957 */
1958 static uint64_t
1959 metaslab_segment_weight(metaslab_t *msp)
1960 {
1961 metaslab_group_t *mg = msp->ms_group;
1962 uint64_t weight = 0;
1963 uint8_t shift = mg->mg_vd->vdev_ashift;
1964
1965 ASSERT(MUTEX_HELD(&msp->ms_lock));
1966
1967 /*
1968 * The metaslab is completely free.
1969 */
1970 if (space_map_allocated(msp->ms_sm) == 0) {
1971 int idx = highbit64(msp->ms_size) - 1;
1972 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1973
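		/*
		 * A completely free metaslab is one free segment the size of
		 * the whole metaslab. If that size exceeds the largest bucket
		 * the weight can encode (max_idx), express it instead as
		 * 1 << (idx - max_idx) segments of max_idx-sized space.
		 */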
1974 if (idx < max_idx) {
1975 WEIGHT_SET_COUNT(weight, 1ULL);
1976 WEIGHT_SET_INDEX(weight, idx);
1977 } else {
1978 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1979 WEIGHT_SET_INDEX(weight, max_idx);
1980 }
1981 WEIGHT_SET_ACTIVE(weight, 0);
1982 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1983
1984 return (weight);
1985 }
1986
1987 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1988
1989 /*
1990 * If the metaslab is fully allocated then just make the weight 0.
1991 */
1992 if (space_map_allocated(msp->ms_sm) == msp->ms_size)
1993 return (0);
1994 /*
1995 * If the metaslab is already loaded, then use the range tree to
1996 * determine the weight. Otherwise, we rely on the space map information
1997 * to generate the weight.
1998 */
1999 if (msp->ms_loaded) {
2000 weight = metaslab_weight_from_range_tree(msp);
2001 } else {
2002 weight = metaslab_weight_from_spacemap(msp);
2003 }
2004
2005 /*
2006 * If the metaslab was active the last time we calculated its weight
2007 * then keep it active. We want to consume the entire region that
2008 * is associated with this weight.
2009 */
2010 if (msp->ms_activation_weight != 0 && weight != 0)
2011 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2012 return (weight);
2053 uint64_t weight;
2054
2055 ASSERT(MUTEX_HELD(&msp->ms_lock));
2056
2057 /*
2058 * If this vdev is in the process of being removed, there is nothing
2059 * for us to do here.
2060 */
2061 if (vd->vdev_removing)
2062 return (0);
2063
2064 metaslab_set_fragmentation(msp);
2065
2066 /*
2067 * Update the maximum size if the metaslab is loaded. This will
2068 * ensure that we get an accurate maximum size if newly freed space
2069 * has been added back into the free tree.
2070 */
2071 if (msp->ms_loaded)
2072 msp->ms_max_size = metaslab_block_maxsize(msp);
2073
2074 /*
2075 * Segment-based weighting requires space map histogram support.
2076 */
2077 if (zfs_metaslab_segment_weight_enabled &&
2078 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2079 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2080 sizeof (space_map_phys_t))) {
2081 weight = metaslab_segment_weight(msp);
2082 } else {
2083 weight = metaslab_space_weight(msp);
2084 }
2085 return (weight);
2086 }
2087
2088 static int
2089 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2090 int allocator, uint64_t activation_weight)
2091 {
2092 /*
2093 * If we're activating for the claim code, we don't want to actually
2094 * set the metaslab up for a specific allocator.
2095 */
2096 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2097 return (0);
2098 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2099 mg->mg_primaries : mg->mg_secondaries);
2100
2101 ASSERT(MUTEX_HELD(&msp->ms_lock));
2102 mutex_enter(&mg->mg_lock);
2103 if (arr[allocator] != NULL) {
2104 mutex_exit(&mg->mg_lock);
2105 return (EEXIST);
2106 }
2107
2452 ASSERT3P(alloctree, !=, NULL);
2453 ASSERT3P(msp->ms_freeing, !=, NULL);
2454 ASSERT3P(msp->ms_freed, !=, NULL);
2455 ASSERT3P(msp->ms_checkpointing, !=, NULL);
2456
2457 /*
2458 * Normally, we don't want to process a metaslab if there are no
2459 * allocations or frees to perform. However, if the metaslab is being
2460 * forced to condense and it's loaded, we need to let it through.
2461 */
2462 if (range_tree_is_empty(alloctree) &&
2463 range_tree_is_empty(msp->ms_freeing) &&
2464 range_tree_is_empty(msp->ms_checkpointing) &&
2465 !(msp->ms_loaded && msp->ms_condense_wanted))
2466 return;
2467
2468
2469 VERIFY(txg <= spa_final_dirty_txg(spa));
2470
2471 /*
2472 * The only state that can actually be changing concurrently with
2473 * metaslab_sync() is the metaslab's ms_allocatable. No other
2474 * thread can be modifying this txg's alloc, freeing,
2475 * freed, or space_map_phys_t. We drop ms_lock whenever we
2476 * could call into the DMU, because the DMU can call down to us
2477 * (e.g. via zio_free()) at any time.
2478 *
2479 * The spa_vdev_remove_thread() can be reading metaslab state
2480 * concurrently, and it is locked out by the ms_sync_lock. Note
2481 * that the ms_lock is insufficient for this, because it is dropped
2482 * by space_map_write().
2483 */
2484 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2485
2486 if (msp->ms_sm == NULL) {
2487 uint64_t new_object;
2488
2489 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2490 VERIFY3U(new_object, !=, 0);
2491
2492 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2493 msp->ms_start, msp->ms_size, vd->vdev_ashift));
2494 ASSERT(msp->ms_sm != NULL);
2495 }
2496
2497 if (!range_tree_is_empty(msp->ms_checkpointing) &&
2498 vd->vdev_checkpoint_sm == NULL) {
2499 ASSERT(spa_has_checkpoint(spa));
2500
2501 uint64_t new_object = space_map_alloc(mos,
2502 vdev_standard_sm_blksz, tx);
2503 VERIFY3U(new_object, !=, 0);
2504
2505 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2506 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2507 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2508
2509 /*
2510 * We save the space map object as an entry in vdev_top_zap
2511 * so it can be retrieved when the pool is reopened after an
2512 * export or through zdb.
2513 */
2514 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2522 /*
2523 * Note: metaslab_condense() clears the space map's histogram.
2524 * Therefore we must verify and remove this histogram before
2525 * condensing.
2526 */
2527 metaslab_group_histogram_verify(mg);
2528 metaslab_class_histogram_verify(mg->mg_class);
2529 metaslab_group_histogram_remove(mg, msp);
2530
2531 if (msp->ms_loaded && metaslab_should_condense(msp)) {
2532 metaslab_condense(msp, txg, tx);
2533 } else {
2534 mutex_exit(&msp->ms_lock);
2535 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2536 SM_NO_VDEVID, tx);
2537 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2538 SM_NO_VDEVID, tx);
2539 mutex_enter(&msp->ms_lock);
2540 }
2541
2542 if (!range_tree_is_empty(msp->ms_checkpointing)) {
2543 ASSERT(spa_has_checkpoint(spa));
2544 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2545
2546 /*
2547 * Since we are doing writes to disk and the ms_checkpointing
2548 * tree won't be changing during that time, we drop the
2549 * ms_lock while writing to the checkpoint space map.
2550 */
2551 mutex_exit(&msp->ms_lock);
2552 space_map_write(vd->vdev_checkpoint_sm,
2553 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2554 mutex_enter(&msp->ms_lock);
2555 space_map_update(vd->vdev_checkpoint_sm);
2556
2557 spa->spa_checkpoint_info.sci_dspace +=
2558 range_tree_space(msp->ms_checkpointing);
2559 vd->vdev_stat.vs_checkpoint_space +=
2560 range_tree_space(msp->ms_checkpointing);
2561 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2562 -vd->vdev_checkpoint_sm->sm_alloc);
2563
2564 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2565 }
2566
2567 if (msp->ms_loaded) {
2568 /*
2569 * When the space map is loaded, we have an accurate
2570 * histogram in the range tree. This gives us an opportunity
2571 * to bring the space map's histogram up-to-date so we clear
2572 * it first before updating it.
2573 */
2574 space_map_histogram_clear(msp->ms_sm);
2575 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2576
2577 /*
2578 * Since we've cleared the histogram we need to add back
2579 * any free space that has already been processed, plus
2580 * any deferred space. This allows the on-disk histogram
2581 * to accurately reflect all free space even if some space
2582 * is not yet available for allocation (i.e. deferred).
2587 * Add back any deferred free space that has not been
2588 * added back into the in-core free tree yet. This will
2589 * ensure that we don't end up with a space map histogram
2590 * that is completely empty unless the metaslab is fully
2591 * allocated.
2592 */
2593 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2594 space_map_histogram_add(msp->ms_sm,
2595 msp->ms_defer[t], tx);
2596 }
2597 }
2598
2599 /*
2600 * Always add the free space from this sync pass to the space
2601 * map histogram. We want to make sure that the on-disk histogram
2602 * accounts for all free space. If the space map is not loaded,
2603 * then we will lose some accuracy but will correct it the next
2604 * time we load the space map.
2605 */
2606 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2607
2608 metaslab_group_histogram_add(mg, msp);
2609 metaslab_group_histogram_verify(mg);
2610 metaslab_class_histogram_verify(mg->mg_class);
2611
2612 /*
2613 * For sync pass 1, we avoid traversing this txg's free range tree
2614 * and instead will just swap the pointers for freeing and
2615 * freed. We can safely do this since the freed_tree is
2616 * guaranteed to be empty on the initial pass.
2617 */
2618 if (spa_sync_pass(spa) == 1) {
2619 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2620 } else {
2621 range_tree_vacate(msp->ms_freeing,
2622 range_tree_add, msp->ms_freed);
2623 }
2624 range_tree_vacate(alloctree, NULL, NULL);
2625
2626 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2627 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2628 & TXG_MASK]));
2629 ASSERT0(range_tree_space(msp->ms_freeing));
2630 ASSERT0(range_tree_space(msp->ms_checkpointing));
2631
2632 mutex_exit(&msp->ms_lock);
2633
2634 if (object != space_map_object(msp->ms_sm)) {
2635 object = space_map_object(msp->ms_sm);
2636 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2637 msp->ms_id, sizeof (uint64_t), &object, tx);
2638 }
2639 mutex_exit(&msp->ms_sync_lock);
2640 dmu_tx_commit(tx);
2641 }
2642
2643 /*
2681 msp->ms_defer[t] = range_tree_create(NULL, NULL);
2682 }
2683
2684 ASSERT3P(msp->ms_checkpointing, ==, NULL);
2685 msp->ms_checkpointing = range_tree_create(NULL, NULL);
2686
2687 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
2688 }
2689 ASSERT0(range_tree_space(msp->ms_freeing));
2690 ASSERT0(range_tree_space(msp->ms_checkpointing));
2691
2692 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
2693
2694 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2695 metaslab_class_get_alloc(spa_normal_class(spa));
2696 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2697 defer_allowed = B_FALSE;
2698 }
2699
2700 defer_delta = 0;
2701 alloc_delta = space_map_alloc_delta(msp->ms_sm);
2702 if (defer_allowed) {
2703 defer_delta = range_tree_space(msp->ms_freed) -
2704 range_tree_space(*defer_tree);
2705 } else {
2706 defer_delta -= range_tree_space(*defer_tree);
2707 }
2708
2709 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
2710 defer_delta, 0);
2711
2712 /*
2713 * If there's a metaslab_load() in progress, wait for it to complete
2714 * so that we have a consistent view of the in-core space map.
2715 */
2716 metaslab_load_wait(msp);
2717
2718 /*
2719 * Move the frees from the defer_tree back to the free
2720 * range tree (if it's loaded). Swap the freed_tree and
2721 * the defer_tree -- this is safe to do because we've
2722 * just emptied out the defer_tree.
2723 */
2724 range_tree_vacate(*defer_tree,
2725 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
2726 if (defer_allowed) {
2727 range_tree_swap(&msp->ms_freed, defer_tree);
2728 } else {
2729 range_tree_vacate(msp->ms_freed,
2730 msp->ms_loaded ? range_tree_add : NULL,
2731 msp->ms_allocatable);
2732 }
2733 space_map_update(msp->ms_sm);
2734
2735 msp->ms_deferspace += defer_delta;
2736 ASSERT3S(msp->ms_deferspace, >=, 0);
2737 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2738 if (msp->ms_deferspace != 0) {
2739 /*
2740 * Keep syncing this metaslab until all deferred frees
2741 * are back in circulation.
2742 */
2743 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2744 }
2745
2746 if (msp->ms_new) {
2747 msp->ms_new = B_FALSE;
2748 mutex_enter(&mg->mg_lock);
2749 mg->mg_ms_ready++;
2750 mutex_exit(&mg->mg_lock);
2751 }
2752 /*
2753 * Calculate the new weights before unloading any metaslabs.
2754 * This will give us the most accurate weighting.
2755 */
2756 metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2757 (msp->ms_weight & METASLAB_ACTIVE_MASK));
2758
2759 /*
2760 * If the metaslab is loaded and we've not tried to load or allocate
2761 * from it in 'metaslab_unload_delay' txgs, then unload it.
2762 */
2763 if (msp->ms_loaded &&
2764 msp->ms_initializing == 0 &&
2765 msp->ms_selected_txg + metaslab_unload_delay < txg) {
2766 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2767 VERIFY0(range_tree_space(
2768 msp->ms_allocating[(txg + t) & TXG_MASK]));
2769 }
2770 if (msp->ms_allocator != -1) {
2771 metaslab_passivate(msp, msp->ms_weight &
2772 ~METASLAB_ACTIVE_MASK);
2773 }
2774
2775 if (!metaslab_debug_unload)
2776 metaslab_unload(msp);
2777 }
2778
2779 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2780 ASSERT0(range_tree_space(msp->ms_freeing));
2781 ASSERT0(range_tree_space(msp->ms_freed));
2782 ASSERT0(range_tree_space(msp->ms_checkpointing));
2783
2784 mutex_exit(&msp->ms_lock);
2785 }
2786
2787 void
2788 metaslab_sync_reassess(metaslab_group_t *mg)
2789 {
2790 spa_t *spa = mg->mg_class->mc_spa;
2791
2792 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2793 metaslab_group_alloc_update(mg);
2794 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
2795
2796 /*
2797 * Preload the next potential metaslabs but only on active
2798 * metaslab groups. We can get into a state where the metaslab
2799	 * is no longer active since we dirty metaslabs as we remove a
2800	 * device, thus potentially making the metaslab group eligible
2801 * for preloading.
2802 */
2803 if (mg->mg_activation_count > 0) {
4019 vdev_t *vd;
4020
4021 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4022 return (SET_ERROR(ENXIO));
4023 }
4024
4025 ASSERT(DVA_IS_VALID(dva));
4026
4027 if (DVA_GET_GANG(dva))
4028 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4029
4030 return (metaslab_claim_impl(vd, offset, size, txg));
4031 }
4032
4033 int
4034 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4035 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4036 zio_alloc_list_t *zal, zio_t *zio, int allocator)
4037 {
4038 dva_t *dva = bp->blk_dva;
4039 dva_t *hintdva = hintbp->blk_dva;
4040 int error = 0;
4041
4042 ASSERT(bp->blk_birth == 0);
4043 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4044
4045 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4046
4047 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
4048 spa_config_exit(spa, SCL_ALLOC, FTAG);
4049 return (SET_ERROR(ENOSPC));
4050 }
4051
4052 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4053 ASSERT(BP_GET_NDVAS(bp) == 0);
4054 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4055 ASSERT3P(zal, !=, NULL);
4056
4057 for (int d = 0; d < ndvas; d++) {
4058 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4059 txg, flags, zal, allocator);
4186 {
4187 metaslab_t *msp;
4188 spa_t *spa = vd->vdev_spa;
4189
4190 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4191 return;
4192
4193 if (vd->vdev_ops->vdev_op_remap != NULL) {
4194 vd->vdev_ops->vdev_op_remap(vd, offset, size,
4195 metaslab_check_free_impl_cb, NULL);
4196 return;
4197 }
4198
4199 ASSERT(vdev_is_concrete(vd));
4200 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4201 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4202
4203 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4204
4205 mutex_enter(&msp->ms_lock);
4206 if (msp->ms_loaded)
4207 range_tree_verify(msp->ms_allocatable, offset, size);
4208
4209 range_tree_verify(msp->ms_freeing, offset, size);
4210 range_tree_verify(msp->ms_checkpointing, offset, size);
4211 range_tree_verify(msp->ms_freed, offset, size);
4212 for (int j = 0; j < TXG_DEFER_SIZE; j++)
4213 range_tree_verify(msp->ms_defer[j], offset, size);
4214 mutex_exit(&msp->ms_lock);
4215 }
4216
4217 void
4218 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4219 {
4220 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4221 return;
4222
4223 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4224 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4225 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4226 vdev_t *vd = vdev_lookup_top(spa, vdev);
4227 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4228 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4229
4230 if (DVA_GET_GANG(&bp->blk_dva[i]))
4231 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4232
4233 ASSERT3P(vd, !=, NULL);
|
481 return (1);
482
483 if (m1->ms_weight < m2->ms_weight)
484 return (1);
485 if (m1->ms_weight > m2->ms_weight)
486 return (-1);
487
488 /*
489 * If the weights are identical, use the offset to force uniqueness.
490 */
491 if (m1->ms_start < m2->ms_start)
492 return (-1);
493 if (m1->ms_start > m2->ms_start)
494 return (1);
495
496 ASSERT3P(m1, ==, m2);
497
498 return (0);
499 }
500
501 uint64_t
502 metaslab_allocated_space(metaslab_t *msp)
503 {
504 return (msp->ms_allocated_space);
505 }
506
507 /*
508 * Verify that the space accounting on disk matches the in-core range_trees.
509 */
510 static void
511 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
512 {
513 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
514 uint64_t allocating = 0;
515 uint64_t sm_free_space, msp_free_space;
516
517 ASSERT(MUTEX_HELD(&msp->ms_lock));
518 ASSERT(!msp->ms_condensing);
519
520 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
521 return;
522
523 /*
524 * We can only verify the metaslab space when we're called
525 * from syncing context with a loaded metaslab that has an
526 * allocated space map. Calling this in non-syncing context
527 * does not provide a consistent view of the metaslab since
528 * we're performing allocations in the future.
529 */
530 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
531 !msp->ms_loaded)
532 return;
533
534 /*
535 * Even though the smp_alloc field can get negative (e.g.
536 * see vdev_checkpoint_sm), that should never be the case
537	 * when it comes to a metaslab's space map.
538 */
539 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
540
541 sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
542
543 /*
544 * Account for future allocations since we would have
545 * already deducted that space from the ms_allocatable.
546 */
547 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
548 allocating +=
549 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
550 }
551
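	/*
	 * The two defer trees (TXG_DEFER_SIZE) together hold exactly the
	 * amount of space tracked by ms_deferspace.
	 */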
552 ASSERT3U(msp->ms_deferspace, ==,
553 range_tree_space(msp->ms_defer[0]) +
554 range_tree_space(msp->ms_defer[1]));
555
556 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
557 msp->ms_deferspace + range_tree_space(msp->ms_freed);
558
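	/*
	 * The on-disk accounting (sm_free_space) and the in-core view
	 * (msp_free_space: allocatable + allocating + deferred + freed)
	 * must agree on the metaslab's free space.
	 */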
559 VERIFY3U(sm_free_space, ==, msp_free_space);
560 }
561
562 /*
563 * ==========================================================================
564 * Metaslab groups
565 * ==========================================================================
566 */
567 /*
568 * Update the allocatable flag and the metaslab group's capacity.
569 * The allocatable flag is set to true if the capacity is below
570 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
571 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
572 * transitions from allocatable to non-allocatable or vice versa then the
573 * metaslab group's class is updated to reflect the transition.
574 */
575 static void
576 metaslab_group_alloc_update(metaslab_group_t *mg)
841
842 void
843 metaslab_group_histogram_verify(metaslab_group_t *mg)
844 {
845 uint64_t *mg_hist;
846 vdev_t *vd = mg->mg_vd;
847 uint64_t ashift = vd->vdev_ashift;
848 int i;
849
850 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
851 return;
852
853 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
854 KM_SLEEP);
855
856 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
857 SPACE_MAP_HISTOGRAM_SIZE + ashift);
858
859 for (int m = 0; m < vd->vdev_ms_count; m++) {
860 metaslab_t *msp = vd->vdev_ms[m];
861 ASSERT(msp != NULL);
862
863 /* skip if not active or not a member */
864 if (msp->ms_sm == NULL || msp->ms_group != mg)
865 continue;
866
867 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
868 mg_hist[i + ashift] +=
869 msp->ms_sm->sm_phys->smp_histogram[i];
870 }
871
872	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
873 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
874
875 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
876 }
877
878 static void
879 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
880 {
881 metaslab_class_t *mc = mg->mg_class;
1462
1463 if ((rs->rs_end - rs->rs_start) >= size) {
1464 *cursor = rs->rs_start + size;
1465 return (rs->rs_start);
1466 }
1467 return (-1ULL);
1468 }
1469
1470 static metaslab_ops_t metaslab_ndf_ops = {
1471 metaslab_ndf_alloc
1472 };
1473
1474 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1475
1476 /*
1477 * ==========================================================================
1478 * Metaslabs
1479 * ==========================================================================
1480 */
1481
1482 static void
1483 metaslab_aux_histograms_clear(metaslab_t *msp)
1484 {
1485 /*
1486 * Auxiliary histograms are only cleared when resetting them,
1487 * which can only happen while the metaslab is loaded.
1488 */
1489 ASSERT(msp->ms_loaded);
1490
1491 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1492 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1493 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1494 }
1495
1496 static void
1497 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1498 range_tree_t *rt)
1499 {
1500 /*
1501 * This is modeled after space_map_histogram_add(), so refer to that
1502 * function for implementation details. We want this to work like
1503 * the space map histogram, and not the range tree histogram, as we
1504 * are essentially constructing a delta that will be later subtracted
1505 * from the space map histogram.
1506 */
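	/*
	 * Range tree buckets larger than the space map histogram can
	 * represent are folded into its last bucket; each such segment is
	 * counted as 2^(i - idx - shift) segments of the last bucket's size,
	 * so the total amount of space is preserved.
	 */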
1507 int idx = 0;
1508 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1509 ASSERT3U(i, >=, idx + shift);
1510 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1511
1512 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1513 ASSERT3U(idx + shift, ==, i);
1514 idx++;
1515 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1516 }
1517 }
1518 }
1519
1520 /*
1521	 * Called at every sync pass in which the metaslab gets synced.
1522	 *
1523	 * The reason is that we want our auxiliary histograms to be updated
1524	 * whenever the metaslab's space map histogram is updated. This way
1525	 * we stay consistent on which parts of the metaslab space map's
1526	 * histogram are currently not available for allocations (e.g. because
1527	 * they are in the defer, freed, and freeing trees).
1528 */
1529 static void
1530 metaslab_aux_histograms_update(metaslab_t *msp)
1531 {
1532 space_map_t *sm = msp->ms_sm;
1533 ASSERT(sm != NULL);
1534
1535 /*
1536 * This is similar to the metaslab's space map histogram updates
1537 * that take place in metaslab_sync(). The only difference is that
1538 * we only care about segments that haven't made it into the
1539 * ms_allocatable tree yet.
1540 */
1541 if (msp->ms_loaded) {
1542 metaslab_aux_histograms_clear(msp);
1543
1544 metaslab_aux_histogram_add(msp->ms_synchist,
1545 sm->sm_shift, msp->ms_freed);
1546
1547 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1548 metaslab_aux_histogram_add(msp->ms_deferhist[t],
1549 sm->sm_shift, msp->ms_defer[t]);
1550 }
1551 }
1552
1553 metaslab_aux_histogram_add(msp->ms_synchist,
1554 sm->sm_shift, msp->ms_freeing);
1555 }
1556
1557 /*
1558 * Called every time we are done syncing (writing to) the metaslab,
1559 * i.e. at the end of each sync pass.
1560 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1561 */
1562 static void
1563 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1564 {
1565 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1566 space_map_t *sm = msp->ms_sm;
1567
1568 if (sm == NULL) {
1569 /*
1570 * We came here from metaslab_init() when creating/opening a
1571 * pool, looking at a metaslab that hasn't had any allocations
1572 * yet.
1573 */
1574 return;
1575 }
1576
1577 /*
1578 * This is similar to the actions that we take for the ms_freed
1579 * and ms_defer trees in metaslab_sync_done().
1580 */
1581 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1582 if (defer_allowed) {
1583 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1584 sizeof (msp->ms_synchist));
1585 } else {
1586 bzero(msp->ms_deferhist[hist_index],
1587 sizeof (msp->ms_deferhist[hist_index]));
1588 }
1589 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1590 }
1591
1592 /*
1593 * Ensure that the metaslab's weight and fragmentation are consistent
1594 * with the contents of the histogram (either the range tree's histogram
1595	 * or the space map's, depending on whether the metaslab is loaded).
1596 */
1597 static void
1598 metaslab_verify_weight_and_frag(metaslab_t *msp)
1599 {
1600 ASSERT(MUTEX_HELD(&msp->ms_lock));
1601
1602 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1603 return;
1604
1605 /* see comment in metaslab_verify_unflushed_changes() */
1606 if (msp->ms_group == NULL)
1607 return;
1608
1609 /*
1610 * Devices being removed always return a weight of 0 and leave
1611 * fragmentation and ms_max_size as is - there is nothing for
1612 * us to verify here.
1613 */
1614 vdev_t *vd = msp->ms_group->mg_vd;
1615 if (vd->vdev_removing)
1616 return;
1617
1618 /*
1619 * If the metaslab is dirty it probably means that we've done
1620 * some allocations or frees that have changed our histograms
1621 * and thus the weight.
1622 */
1623 for (int t = 0; t < TXG_SIZE; t++) {
1624 if (txg_list_member(&vd->vdev_ms_list, msp, t))
1625 return;
1626 }
1627
1628 /*
1629 * This verification checks that our in-memory state is consistent
1630 * with what's on disk. If the pool is read-only then there aren't
1631 * any changes and we just have the initially-loaded state.
1632 */
1633 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
1634 return;
1635
1636	/* If the metaslab is loaded, do some extra verification on its in-core tree. */
1637 if (msp->ms_loaded) {
1638 range_tree_stat_verify(msp->ms_allocatable);
1639 VERIFY(space_map_histogram_verify(msp->ms_sm,
1640 msp->ms_allocatable));
1641 }
1642
1643 uint64_t weight = msp->ms_weight;
1644 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1645 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
1646 uint64_t frag = msp->ms_fragmentation;
1647 uint64_t max_segsize = msp->ms_max_size;
1648
1649 msp->ms_weight = 0;
1650 msp->ms_fragmentation = 0;
1651 msp->ms_max_size = 0;
1652
1653 /*
1654 * This function is used for verification purposes. Regardless of
1655 * whether metaslab_weight() thinks this metaslab should be active or
1656 * not, we want to ensure that the actual weight (and therefore the
1657 * value of ms_weight) would be the same if it was to be recalculated
1658 * at this point.
1659 */
1660 msp->ms_weight = metaslab_weight(msp) | was_active;
1661
1662 VERIFY3U(max_segsize, ==, msp->ms_max_size);
1663
1664 /*
1665 * If the weight type changed then there is no point in doing
1666 * verification. Revert fields to their original values.
1667 */
1668 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
1669 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
1670 msp->ms_fragmentation = frag;
1671 msp->ms_weight = weight;
1672 return;
1673 }
1674
1675 VERIFY3U(msp->ms_fragmentation, ==, frag);
1676 VERIFY3U(msp->ms_weight, ==, weight);
1677 }
1678
1679 /*
1680 * Wait for any in-progress metaslab loads to complete.
1681 */
1682 static void
1683 metaslab_load_wait(metaslab_t *msp)
1684 {
1685 ASSERT(MUTEX_HELD(&msp->ms_lock));
1686
1687 while (msp->ms_loading) {
1688 ASSERT(!msp->ms_loaded);
1689 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1690 }
1691 }
1692
1693 static int
1694 metaslab_load_impl(metaslab_t *msp)
1695 {
1696 int error = 0;
1697
1698 ASSERT(MUTEX_HELD(&msp->ms_lock));
1699 ASSERT(msp->ms_loading);
1700 ASSERT(!msp->ms_condensing);
1701
1702 /*
1703 * We temporarily drop the lock to unblock other operations while we
1704 * are reading the space map. Therefore, metaslab_sync() and
1705 * metaslab_sync_done() can run at the same time as we do.
1706 *
1707 * metaslab_sync() can append to the space map while we are loading.
1708 * Therefore we load only entries that existed when we started the
1709 * load. Additionally, metaslab_sync_done() has to wait for the load
1710 * to complete because there are potential races like metaslab_load()
1711 * loading parts of the space map that are currently being appended
1712 * by metaslab_sync(). If we didn't, the ms_allocatable would have
1713 * entries that metaslab_sync_done() would try to re-add later.
1714 *
1715 * That's why before dropping the lock we remember the synced length
1716 * of the metaslab and read up to that point of the space map,
1717 * ignoring entries appended by metaslab_sync() that happen after we
1718 * drop the lock.
1719 */
1720 uint64_t length = msp->ms_synced_length;
1721 mutex_exit(&msp->ms_lock);
1722
1723 if (msp->ms_sm != NULL) {
1724 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
1725 SM_FREE, length);
1726 } else {
1727 /*
1728 * The space map has not been allocated yet, so treat
1729 * all the space in the metaslab as free and add it to the
1730 * ms_allocatable tree.
1731 */
1732 range_tree_add(msp->ms_allocatable,
1733 msp->ms_start, msp->ms_size);
1734 }
1735
1736 /*
1737 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
1738 * changing the ms_sm and the metaslab's range trees while we are
1739 * about to use them and populate the ms_allocatable. The ms_lock
1740 * is insufficient for this because metaslab_sync() doesn't hold
1741 * the ms_lock while writing the ms_checkpointing tree to disk.
1742 */
1743 mutex_enter(&msp->ms_sync_lock);
1744 mutex_enter(&msp->ms_lock);
1745 ASSERT(!msp->ms_condensing);
1746
1747 if (error != 0) {
1748 mutex_exit(&msp->ms_sync_lock);
1749 return (error);
1750 }
1751
1752 ASSERT3P(msp->ms_group, !=, NULL);
1753 msp->ms_loaded = B_TRUE;
1754
1755 /*
1756 * The ms_allocatable contains the segments that exist in the
1757 * ms_defer trees [see ms_synced_length]. Thus we need to remove
1758 * them from ms_allocatable as they will be added again in
1759 * metaslab_sync_done().
1760 */
1761 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1762 range_tree_walk(msp->ms_defer[t],
1763 range_tree_remove, msp->ms_allocatable);
1764 }
1765
1766 /*
1767 * Call metaslab_recalculate_weight_and_sort() now that the
1768 * metaslab is loaded so we get the metaslab's real weight.
1769 *
1770 * Unless this metaslab was created with older software and
1771 * has not yet been converted to use segment-based weight, we
1772 * expect the new weight to be better or equal to the weight
1773 * that the metaslab had while it was not loaded. This is
1774 * because the old weight does not take into account the
1775 * consolidation of adjacent segments between TXGs. [see
1776 * comment for ms_synchist and ms_deferhist[] for more info]
1777 */
1778 uint64_t weight = msp->ms_weight;
1779 metaslab_recalculate_weight_and_sort(msp);
1780 if (!WEIGHT_IS_SPACEBASED(weight))
1781 ASSERT3U(weight, <=, msp->ms_weight);
1782 msp->ms_max_size = metaslab_block_maxsize(msp);
1783
1784 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1785 metaslab_verify_space(msp, spa_syncing_txg(spa));
1786 mutex_exit(&msp->ms_sync_lock);
1787
1788 return (0);
1789 }
1790
1791 int
1792 metaslab_load(metaslab_t *msp)
1793 {
1794 ASSERT(MUTEX_HELD(&msp->ms_lock));
1795
1796 /*
1797	 * There may be another thread loading the same metaslab; if that's
1798	 * the case, just wait until the other thread is done and return.
1799 */
1800 metaslab_load_wait(msp);
1801 if (msp->ms_loaded)
1802 return (0);
1803 VERIFY(!msp->ms_loading);
1804 ASSERT(!msp->ms_condensing);
1805
1806 msp->ms_loading = B_TRUE;
1807 int error = metaslab_load_impl(msp);
1808 msp->ms_loading = B_FALSE;
1809 cv_broadcast(&msp->ms_load_cv);
1810
1811 return (error);
1812 }
1813
1814 void
1815 metaslab_unload(metaslab_t *msp)
1816 {
1817 ASSERT(MUTEX_HELD(&msp->ms_lock));
1818
1819 metaslab_verify_weight_and_frag(msp);
1820
1821 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1822 msp->ms_loaded = B_FALSE;
1823
1824 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1825 msp->ms_max_size = 0;
1826
1827 /*
1828 * We explicitly recalculate the metaslab's weight based on its space
1829	 * map (as it is now not loaded). We want unloaded metaslabs to always
1830	 * have their weights calculated from the space map histograms, while
1831	 * loaded ones have it calculated from their in-core range tree
1832	 * [see metaslab_load()]. This way, the weight reflects the information
1833	 * available in-core, whether the metaslab is loaded or not.
1834	 *
1835	 * If ms_group == NULL, we came here from metaslab_fini(),
1836 * at which point it doesn't make sense for us to do the recalculation
1837 * and the sorting.
1838 */
1839 if (msp->ms_group != NULL)
1840 metaslab_recalculate_weight_and_sort(msp);
1841 }
1842
1843 static void
1844 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1845 int64_t defer_delta, int64_t space_delta)
1846 {
1847 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1848
1849 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1850 ASSERT(vd->vdev_ms_count != 0);
1851
1852 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
1853 vdev_deflated_space(vd, space_delta));
1854 }
1855
1856 int
1857 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1858 metaslab_t **msp)
1859 {
1860 vdev_t *vd = mg->mg_vd;
1861 spa_t *spa = vd->vdev_spa;
1862 objset_t *mos = spa->spa_meta_objset;
1863 metaslab_t *ms;
1864 int error;
1865
1866 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1867 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1868 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
1869 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1870
1871 ms->ms_id = id;
1872 ms->ms_start = id << vd->vdev_ms_shift;
1873 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1874 ms->ms_allocator = -1;
1875 ms->ms_new = B_TRUE;
1876
1877 /*
1878 * We only open space map objects that already exist. All others
1879 * will be opened when we finally allocate an object for it.
1880 *
1881 * Note:
1882 * When called from vdev_expand(), we can't call into the DMU as
1883 * we are holding the spa_config_lock as a writer and we would
1884	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
1885 * that case, the object parameter is zero though, so we won't
1886 * call into the DMU.
1887 */
1888 if (object != 0) {
1889 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1890 ms->ms_size, vd->vdev_ashift);
1891
1892 if (error != 0) {
1893 kmem_free(ms, sizeof (metaslab_t));
1894 return (error);
1895 }
1896
1897 ASSERT(ms->ms_sm != NULL);
1898 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
1899 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
1900 }
1901
1902 /*
1903 * We create the ms_allocatable here, but we don't create the
1904 * other range trees until metaslab_sync_done(). This serves
1905 * two purposes: it allows metaslab_sync_done() to detect the
1906 * addition of new space; and for debugging, it ensures that
1907 * we'd data fault on any attempt to use this metaslab before
1908 * it's ready.
1909 */
1910 ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1911 metaslab_group_add(mg, ms);
1912
1913 metaslab_set_fragmentation(ms);
1914
1915 /*
1916 * If we're opening an existing pool (txg == 0) or creating
1917 * a new one (txg == TXG_INITIAL), all space is available now.
1918 * If we're adding space to an existing pool, the new space
1919 * does not become available until after this txg has synced.
1920 * The metaslab's weight will also be initialized when we sync
1921 * out this txg. This ensures that we don't attempt to allocate
1922 * from it before we have initialized it completely.
1923 */
1924 if (txg <= TXG_INITIAL) {
1925 metaslab_sync_done(ms, 0);
1926 metaslab_space_update(vd, mg->mg_class,
1927 metaslab_allocated_space(ms), 0, 0);
1928 }
1929
1930 /*
1931 * If metaslab_debug_load is set and we're initializing a metaslab
1932 * that has an allocated space map object then load the space map
1933 * so that we can verify frees.
1934 */
1935 if (metaslab_debug_load && ms->ms_sm != NULL) {
1936 mutex_enter(&ms->ms_lock);
1937 VERIFY0(metaslab_load(ms));
1938 mutex_exit(&ms->ms_lock);
1939 }
1940
1941 if (txg != 0) {
1942 vdev_dirty(vd, 0, NULL, txg);
1943 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1944 }
1945
1946 *msp = ms;
1947
1948 return (0);
1949 }
1950
1951 void
1952 metaslab_fini(metaslab_t *msp)
1953 {
1954 metaslab_group_t *mg = msp->ms_group;
1955 vdev_t *vd = mg->mg_vd;
1956
1957 metaslab_group_remove(mg, msp);
1958
1959 mutex_enter(&msp->ms_lock);
1960 VERIFY(msp->ms_group == NULL);
1961 metaslab_space_update(vd, mg->mg_class,
1962 -metaslab_allocated_space(msp), 0, -msp->ms_size);
1963
1964 space_map_close(msp->ms_sm);
1965
1966 metaslab_unload(msp);
1967
1968 range_tree_destroy(msp->ms_allocatable);
1969 range_tree_destroy(msp->ms_freeing);
1970 range_tree_destroy(msp->ms_freed);
1971
1972 for (int t = 0; t < TXG_SIZE; t++) {
1973 range_tree_destroy(msp->ms_allocating[t]);
1974 }
1975
1976 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1977 range_tree_destroy(msp->ms_defer[t]);
1978 }
1979 ASSERT0(msp->ms_deferspace);
1980
1981 range_tree_destroy(msp->ms_checkpointing);
1982
1983 for (int t = 0; t < TXG_SIZE; t++)
1984 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
1985
1986 mutex_exit(&msp->ms_lock);
1987 cv_destroy(&msp->ms_load_cv);
1988 mutex_destroy(&msp->ms_lock);
1989 mutex_destroy(&msp->ms_sync_lock);
1990 ASSERT3U(msp->ms_allocator, ==, -1);
1991
1992 kmem_free(msp, sizeof (metaslab_t));
1993 }
1994
1995 #define FRAGMENTATION_TABLE_SIZE 17
1996
1997 /*
1998 * This table defines a segment size based fragmentation metric that will
1999 * allow each metaslab to derive its own fragmentation value. This is done
2000 * by calculating the space in each bucket of the spacemap histogram and
2001 * multiplying that by the fragmentation metric in this table. Doing
2002 * this for all buckets and dividing it by the total amount of free
2003 * space in this metaslab (i.e. the total free space in all buckets) gives
2004 * us the fragmentation metric. This means that a high fragmentation metric
2005 * equates to most of the free space being comprised of small segments.
2006 * Conversely, if the metric is low, then most of the free space is in
2007 * large segments. A 10% change in fragmentation equates to approximately
2008 * double the number of segments.
2009 *
2010 * This table defines 0% fragmented space using 16MB segments. Testing has
2011 * shown that segments that are greater than or equal to 16MB do not suffer
2012 * from drastic performance problems. Using this value, we derive the rest
2013 * of the table. Since the fragmentation value is never stored on disk, it
2014 * is possible to change these calculations in the future.
2015 */
2016 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2017 100, /* 512B */
2018 100, /* 1K */
2019 98, /* 2K */
2020 95, /* 4K */
2021 90, /* 8K */
2022 80, /* 16K */
2023 70, /* 32K */
2024 60, /* 64K */
2025 50, /* 128K */
2026 40, /* 256K */
2027 30, /* 512K */
2028 20, /* 1M */
2029 15, /* 2M */
2030 10, /* 4M */
2031 5, /* 8M */
2032 0 /* 16M */
2033 };
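/*
 * As an example, if half of a metaslab's free space is in 4K segments
 * (factor 95) and half is in 1M segments (factor 20), its fragmentation
 * works out to (95 + 20) / 2 = 57.5%.
 */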
2034
2035 /*
2036 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2037 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2038	 * been upgraded and does not support this metric. Otherwise, the
2039	 * stored value should be in the range [0, 100].
2040 */
2041 static void
2042 metaslab_set_fragmentation(metaslab_t *msp)
2043 {
2044 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2045 uint64_t fragmentation = 0;
2046 uint64_t total = 0;
2047 boolean_t feature_enabled = spa_feature_is_enabled(spa,
2048 SPA_FEATURE_SPACEMAP_HISTOGRAM);
2049
2050 if (!feature_enabled) {
2051 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2052 return;
2053 }
2054
2055 /*
2056 * A null space map means that the entire metaslab is free
2057 * and thus is not fragmented.
2058 */
2059 if (msp->ms_sm == NULL) {
2112 }
2113
2114 /*
2115 * Compute a weight -- a selection preference value -- for the given metaslab.
2116 * This is based on the amount of free space, the level of fragmentation,
2117 * the LBA range, and whether the metaslab is loaded.
2118 */
2119 static uint64_t
2120 metaslab_space_weight(metaslab_t *msp)
2121 {
2122 metaslab_group_t *mg = msp->ms_group;
2123 vdev_t *vd = mg->mg_vd;
2124 uint64_t weight, space;
2125
2126 ASSERT(MUTEX_HELD(&msp->ms_lock));
2127 ASSERT(!vd->vdev_removing);
2128
2129 /*
2130 * The baseline weight is the metaslab's free space.
2131 */
2132 space = msp->ms_size - metaslab_allocated_space(msp);
2133
2134 if (metaslab_fragmentation_factor_enabled &&
2135 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2136 /*
2137 * Use the fragmentation information to inversely scale
2138 * down the baseline weight. We need to ensure that we
2139 * don't exclude this metaslab completely when it's 100%
2140 * fragmented. To avoid this we reduce the fragmentation value
2141 * by 1.
2142 */
2143 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
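/*
 * For example, with ms_fragmentation at its maximum of 100 the
 * multiplier above becomes (100 - 99) / 100, so the baseline weight is
 * scaled down to 1% rather than all the way to zero.
 */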
2144
2145 /*
2146 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2147 * this metaslab again. The fragmentation metric may have
2148 * decreased the space to something smaller than
2149 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2150 * so that we can consume any remaining space.
2151 */
2152 if (space > 0 && space < SPA_MINBLOCKSIZE)
2216 continue;
2217
2218 if (segments != 0) {
2219 WEIGHT_SET_COUNT(weight, segments);
2220 WEIGHT_SET_INDEX(weight, i);
2221 WEIGHT_SET_ACTIVE(weight, 0);
2222 break;
2223 }
2224 }
2225 return (weight);
2226 }
2227
2228 /*
2229 * Calculate the weight based on the on-disk histogram. This should only
2230 * be called after a sync pass has completely finished since the on-disk
2231 * information is updated in metaslab_sync().
2232 */
2233 static uint64_t
2234 metaslab_weight_from_spacemap(metaslab_t *msp)
2235 {
2236 space_map_t *sm = msp->ms_sm;
2237 ASSERT(!msp->ms_loaded);
2238 ASSERT(sm != NULL);
2239 ASSERT3U(space_map_object(sm), !=, 0);
2240 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2241
2242 /*
2243 * Create a joint histogram from all the segments that have made
2244 * it to the metaslab's space map histogram but are not yet
2245 * available for allocation because they are still in the freeing
2246 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2247 * these segments from the space map's histogram to get a more
2248 * accurate weight.
2249 */
2250 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2251 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2252 deferspace_histogram[i] += msp->ms_synchist[i];
2253 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2254 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2255 deferspace_histogram[i] += msp->ms_deferhist[t][i];
2256 }
2257 }
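
/*
 * Hypothetical example: if smp_histogram[10] == 6 and
 * deferspace_histogram[10] == 2, with sm_shift == 9 and every higher
 * bucket empty after the subtraction, the loop below encodes a count
 * of 4 at index 10 + 9 = 19, i.e. four free segments in the 512K-1M
 * range.
 */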
2258
2259 uint64_t weight = 0;
2260 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2261 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2262 deferspace_histogram[i]);
2263 uint64_t count =
2264 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
2265 if (count != 0) {
2266 WEIGHT_SET_COUNT(weight, count);
2267 WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
2268 WEIGHT_SET_ACTIVE(weight, 0);
2269 break;
2270 }
2271 }
2272 return (weight);
2273 }
2274
2275 /*
2276 * Compute a segment-based weight for the specified metaslab. The weight
2277 * is determined by the highest bucket in the histogram. The information
2278 * for the highest bucket is encoded into the weight value.
2279 */
2280 static uint64_t
2281 metaslab_segment_weight(metaslab_t *msp)
2282 {
2283 metaslab_group_t *mg = msp->ms_group;
2284 uint64_t weight = 0;
2285 uint8_t shift = mg->mg_vd->vdev_ashift;
2286
2287 ASSERT(MUTEX_HELD(&msp->ms_lock));
2288
2289 /*
2290 * The metaslab is completely free.
2291 */
2292 if (metaslab_allocated_space(msp) == 0) {
2293 int idx = highbit64(msp->ms_size) - 1;
2294 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2295
2296 if (idx < max_idx) {
2297 WEIGHT_SET_COUNT(weight, 1ULL);
2298 WEIGHT_SET_INDEX(weight, idx);
2299 } else {
2300 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
2301 WEIGHT_SET_INDEX(weight, max_idx);
2302 }
2303 WEIGHT_SET_ACTIVE(weight, 0);
2304 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
2305
2306 return (weight);
2307 }
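
/*
 * Worked example with hypothetical sizes (assuming an ashift of 9 and
 * SPACE_MAP_HISTOGRAM_SIZE of 32): for an 8GB metaslab,
 * idx = highbit64(2^33) - 1 = 33 and max_idx = 32 + 9 - 1 = 40, so the
 * code above encodes a single free segment of 2^33 bytes. Only a
 * metaslab of 1TB (2^40 bytes) or larger would take the else branch,
 * which folds the excess size into the count instead.
 */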
2308
2309 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2310
2311 /*
2312 * If the metaslab is fully allocated then just make the weight 0.
2313 */
2314 if (metaslab_allocated_space(msp) == msp->ms_size)
2315 return (0);
2316 /*
2317 * If the metaslab is already loaded, then use the range tree to
2318 * determine the weight. Otherwise, we rely on the space map information
2319 * to generate the weight.
2320 */
2321 if (msp->ms_loaded) {
2322 weight = metaslab_weight_from_range_tree(msp);
2323 } else {
2324 weight = metaslab_weight_from_spacemap(msp);
2325 }
2326
2327 /*
2328 * If the metaslab was active the last time we calculated its weight
2329 * then keep it active. We want to consume the entire region that
2330 * is associated with this weight.
2331 */
2332 if (msp->ms_activation_weight != 0 && weight != 0)
2333 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
2334 return (weight);
2375 uint64_t weight;
2376
2377 ASSERT(MUTEX_HELD(&msp->ms_lock));
2378
2379 /*
2380 * If this vdev is in the process of being removed, there is nothing
2381 * for us to do here.
2382 */
2383 if (vd->vdev_removing)
2384 return (0);
2385
2386 metaslab_set_fragmentation(msp);
2387
2388 /*
2389 * Update the maximum size if the metaslab is loaded. This will
2390 * ensure that we get an accurate maximum size if newly freed space
2391 * has been added back into the free tree.
2392 */
2393 if (msp->ms_loaded)
2394 msp->ms_max_size = metaslab_block_maxsize(msp);
2395 else
2396 ASSERT0(msp->ms_max_size);
2397
2398 /*
2399 * Segment-based weighting requires space map histogram support.
2400 */
2401 if (zfs_metaslab_segment_weight_enabled &&
2402 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2403 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2404 sizeof (space_map_phys_t))) {
2405 weight = metaslab_segment_weight(msp);
2406 } else {
2407 weight = metaslab_space_weight(msp);
2408 }
2409 return (weight);
2410 }
2411
2412 void
2413 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
2414 {
2415 /* Note: we preserve the mask (e.g. indication of primary, etc.). */
2416 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2417 metaslab_group_sort(msp->ms_group, msp,
2418 metaslab_weight(msp) | was_active);
2419 }
2420
2421 static int
2422 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2423 int allocator, uint64_t activation_weight)
2424 {
2425 /*
2426 * If we're activating for the claim code, we don't want to actually
2427 * set the metaslab up for a specific allocator.
2428 */
2429 if (activation_weight == METASLAB_WEIGHT_CLAIM)
2430 return (0);
2431 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
2432 mg->mg_primaries : mg->mg_secondaries);
2433
2434 ASSERT(MUTEX_HELD(&msp->ms_lock));
2435 mutex_enter(&mg->mg_lock);
2436 if (arr[allocator] != NULL) {
2437 mutex_exit(&mg->mg_lock);
2438 return (EEXIST);
2439 }
2440
2785 ASSERT3P(alloctree, !=, NULL);
2786 ASSERT3P(msp->ms_freeing, !=, NULL);
2787 ASSERT3P(msp->ms_freed, !=, NULL);
2788 ASSERT3P(msp->ms_checkpointing, !=, NULL);
2789
2790 /*
2791 * Normally, we don't want to process a metaslab if there are no
2792 * allocations or frees to perform. However, if the metaslab is being
2793 * forced to condense and it's loaded, we need to let it through.
2794 */
2795 if (range_tree_is_empty(alloctree) &&
2796 range_tree_is_empty(msp->ms_freeing) &&
2797 range_tree_is_empty(msp->ms_checkpointing) &&
2798 !(msp->ms_loaded && msp->ms_condense_wanted))
2799 return;
2800
2801
2802 VERIFY(txg <= spa_final_dirty_txg(spa));
2803
2804 /*
2805 * The only state that can actually be changing concurrently
2806 * with metaslab_sync() is the metaslab's ms_allocatable. No
2807 * other thread can be modifying this txg's alloc, freeing,
2808 * freed, or space_map_phys_t. We drop ms_lock whenever we
2809 * could call into the DMU, because the DMU can call down to
2810 * us (e.g. via zio_free()) at any time.
2811 *
2812 * The spa_vdev_remove_thread() can be reading metaslab state
2813 * concurrently, and it is locked out by the ms_sync_lock.
2814 * Note that the ms_lock is insufficient for this, because it
2815 * is dropped by space_map_write().
2816 */
2817 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2818
2819 if (msp->ms_sm == NULL) {
2820 uint64_t new_object;
2821
2822 new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2823 VERIFY3U(new_object, !=, 0);
2824
2825 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2826 msp->ms_start, msp->ms_size, vd->vdev_ashift));
2827
2828 ASSERT(msp->ms_sm != NULL);
2829 ASSERT0(metaslab_allocated_space(msp));
2830 }
2831
2832 if (!range_tree_is_empty(msp->ms_checkpointing) &&
2833 vd->vdev_checkpoint_sm == NULL) {
2834 ASSERT(spa_has_checkpoint(spa));
2835
2836 uint64_t new_object = space_map_alloc(mos,
2837 vdev_standard_sm_blksz, tx);
2838 VERIFY3U(new_object, !=, 0);
2839
2840 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
2841 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
2842 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2843
2844 /*
2845 * We save the space map object as an entry in vdev_top_zap
2846 * so it can be retrieved when the pool is reopened after an
2847 * export or through zdb.
2848 */
2849 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
2857 /*
2858 * Note: metaslab_condense() clears the space map's histogram.
2859 * Therefore we must verify and remove this histogram before
2860 * condensing.
2861 */
2862 metaslab_group_histogram_verify(mg);
2863 metaslab_class_histogram_verify(mg->mg_class);
2864 metaslab_group_histogram_remove(mg, msp);
2865
2866 if (msp->ms_loaded && metaslab_should_condense(msp)) {
2867 metaslab_condense(msp, txg, tx);
2868 } else {
2869 mutex_exit(&msp->ms_lock);
2870 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2871 SM_NO_VDEVID, tx);
2872 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2873 SM_NO_VDEVID, tx);
2874 mutex_enter(&msp->ms_lock);
2875 }
2876
2877 msp->ms_allocated_space += range_tree_space(alloctree);
2878 ASSERT3U(msp->ms_allocated_space, >=,
2879 range_tree_space(msp->ms_freeing));
2880 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
2881
2882 if (!range_tree_is_empty(msp->ms_checkpointing)) {
2883 ASSERT(spa_has_checkpoint(spa));
2884 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2885
2886 /*
2887 * Since we are doing writes to disk and the ms_checkpointing
2888 * tree won't be changing during that time, we drop the
2889 * ms_lock while writing to the checkpoint space map.
2890 */
2891 mutex_exit(&msp->ms_lock);
2892 space_map_write(vd->vdev_checkpoint_sm,
2893 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2894 mutex_enter(&msp->ms_lock);
2895
2896 spa->spa_checkpoint_info.sci_dspace +=
2897 range_tree_space(msp->ms_checkpointing);
2898 vd->vdev_stat.vs_checkpoint_space +=
2899 range_tree_space(msp->ms_checkpointing);
2900 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2901 -space_map_allocated(vd->vdev_checkpoint_sm));
2902
2903 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2904 }
2905
2906 if (msp->ms_loaded) {
2907 /*
2908 * When the space map is loaded, we have an accurate
2909 * histogram in the range tree. This gives us an opportunity
2910 * to bring the space map's histogram up-to-date, so we clear
2911 * it before updating it.
2912 */
2913 space_map_histogram_clear(msp->ms_sm);
2914 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
2915
2916 /*
2917 * Since we've cleared the histogram we need to add back
2918 * any free space that has already been processed, plus
2919 * any deferred space. This allows the on-disk histogram
2920 * to accurately reflect all free space even if some space
2921 * is not yet available for allocation (i.e. deferred).
2922 */
2923 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
2924
2925 /*
2926 * Add back any deferred free space that has not been
2927 * added back into the in-core free tree yet. This will
2928 * ensure that we don't end up with a space map histogram
2929 * that is completely empty unless the metaslab is fully
2930 * allocated.
2931 */
2932 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2933 space_map_histogram_add(msp->ms_sm,
2934 msp->ms_defer[t], tx);
2935 }
2936 }
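
/*
 * When the metaslab is loaded, the space map histogram now reflects
 * ms_allocatable plus the freed and deferred trees; the frees from the
 * current sync pass (ms_freeing) are folded in below.
 */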
2937
2938 /*
2939 * Always add the free space from this sync pass to the space
2940 * map histogram. We want to make sure that the on-disk histogram
2941 * accounts for all free space. If the space map is not loaded,
2942 * then we will lose some accuracy but will correct it the next
2943 * time we load the space map.
2944 */
2945 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
2946 metaslab_aux_histograms_update(msp);
2947
2948 metaslab_group_histogram_add(mg, msp);
2949 metaslab_group_histogram_verify(mg);
2950 metaslab_class_histogram_verify(mg->mg_class);
2951
2952 /*
2953 * For sync pass 1, we avoid traversing this txg's free range tree
2954 * and instead will just swap the pointers for freeing and freed.
2955 * We can safely do this since the freed_tree is guaranteed to be
2956 * empty on the initial pass.
2957 */
2958 if (spa_sync_pass(spa) == 1) {
2959 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
2960 ASSERT0(msp->ms_allocated_this_txg);
2961 } else {
2962 range_tree_vacate(msp->ms_freeing,
2963 range_tree_add, msp->ms_freed);
2964 }
2965 msp->ms_allocated_this_txg += range_tree_space(alloctree);
2966 range_tree_vacate(alloctree, NULL, NULL);
2967
2968 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2969 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2970 & TXG_MASK]));
2971 ASSERT0(range_tree_space(msp->ms_freeing));
2972 ASSERT0(range_tree_space(msp->ms_checkpointing));
2973
2974 mutex_exit(&msp->ms_lock);
2975
2976 if (object != space_map_object(msp->ms_sm)) {
2977 object = space_map_object(msp->ms_sm);
2978 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2979 msp->ms_id, sizeof (uint64_t), &object, tx);
2980 }
2981 mutex_exit(&msp->ms_sync_lock);
2982 dmu_tx_commit(tx);
2983 }
2984
2985 /*
3023 msp->ms_defer[t] = range_tree_create(NULL, NULL);
3024 }
3025
3026 ASSERT3P(msp->ms_checkpointing, ==, NULL);
3027 msp->ms_checkpointing = range_tree_create(NULL, NULL);
3028
3029 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
3030 }
3031 ASSERT0(range_tree_space(msp->ms_freeing));
3032 ASSERT0(range_tree_space(msp->ms_checkpointing));
3033
3034 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
3035
3036 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
3037 metaslab_class_get_alloc(spa_normal_class(spa));
3038 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
3039 defer_allowed = B_FALSE;
3040 }
3041
3042 defer_delta = 0;
3043 alloc_delta = msp->ms_allocated_this_txg -
3044 range_tree_space(msp->ms_freed);
3045 if (defer_allowed) {
3046 defer_delta = range_tree_space(msp->ms_freed) -
3047 range_tree_space(*defer_tree);
3048 } else {
3049 defer_delta -= range_tree_space(*defer_tree);
3050 }
3051
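/*
 * alloc_delta above is the net change in allocated space this txg
 * (space allocated minus space freed), while defer_delta is the net
 * change in ms_deferspace (newly deferred frees minus the previously
 * deferred frees about to be returned to ms_allocatable). Passing both
 * here keeps the vdev and metaslab class space accounting consistent
 * with the tree manipulation that follows.
 */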
3052 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
3053 defer_delta, 0);
3054
3055 /*
3056 * If there's a metaslab_load() in progress, wait for it to complete
3057 * so that we have a consistent view of the in-core space map.
3058 */
3059 metaslab_load_wait(msp);
3060
3061 /*
3062 * Move the frees from the defer_tree back to the free
3063 * range tree (if the metaslab is loaded). Swap the freed_tree and
3064 * the defer_tree -- this is safe to do because we've
3065 * just emptied out the defer_tree.
3066 */
3067 range_tree_vacate(*defer_tree,
3068 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
3069 if (defer_allowed) {
3070 range_tree_swap(&msp->ms_freed, defer_tree);
3071 } else {
3072 range_tree_vacate(msp->ms_freed,
3073 msp->ms_loaded ? range_tree_add : NULL,
3074 msp->ms_allocatable);
3075 }
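
/*
 * To summarize the free pipeline: a free first lands in ms_freeing, is
 * written out and moved to ms_freed in metaslab_sync(), then sits in
 * one of the TXG_DEFER_SIZE defer trees before being returned to
 * ms_allocatable here. Keeping freed space out of circulation for a
 * few txgs is what allows, for example, importing the pool at a
 * slightly older txg without those blocks having already been reused.
 */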
3076
3077 msp->ms_synced_length = space_map_length(msp->ms_sm);
3078
3079 msp->ms_deferspace += defer_delta;
3080 ASSERT3S(msp->ms_deferspace, >=, 0);
3081 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
3082 if (msp->ms_deferspace != 0) {
3083 /*
3084 * Keep syncing this metaslab until all deferred frees
3085 * are back in circulation.
3086 */
3087 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
3088 }
3089 metaslab_aux_histograms_update_done(msp, defer_allowed);
3090
3091 if (msp->ms_new) {
3092 msp->ms_new = B_FALSE;
3093 mutex_enter(&mg->mg_lock);
3094 mg->mg_ms_ready++;
3095 mutex_exit(&mg->mg_lock);
3096 }
3097
3098 /*
3099 * Re-sort the metaslab within its group now that we've adjusted
3100 * its allocatable space.
3101 */
3102 metaslab_recalculate_weight_and_sort(msp);
3103
3104 /*
3105 * If the metaslab is loaded and we've not tried to load or allocate
3106 * from it in 'metaslab_unload_delay' txgs, then unload it.
3107 */
3108 if (msp->ms_loaded &&
3109 msp->ms_initializing == 0 &&
3110 msp->ms_selected_txg + metaslab_unload_delay < txg) {
3111 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
3112 VERIFY0(range_tree_space(
3113 msp->ms_allocating[(txg + t) & TXG_MASK]));
3114 }
3115 if (msp->ms_allocator != -1) {
3116 metaslab_passivate(msp, msp->ms_weight &
3117 ~METASLAB_ACTIVE_MASK);
3118 }
3119
3120 if (!metaslab_debug_unload)
3121 metaslab_unload(msp);
3122 }
3123
3124 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
3125 ASSERT0(range_tree_space(msp->ms_freeing));
3126 ASSERT0(range_tree_space(msp->ms_freed));
3127 ASSERT0(range_tree_space(msp->ms_checkpointing));
3128
3129 msp->ms_allocated_this_txg = 0;
3130 mutex_exit(&msp->ms_lock);
3131 }
3132
3133 void
3134 metaslab_sync_reassess(metaslab_group_t *mg)
3135 {
3136 spa_t *spa = mg->mg_class->mc_spa;
3137
3138 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3139 metaslab_group_alloc_update(mg);
3140 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
3141
3142 /*
3143 * Preload the next potential metaslabs but only on active
3144 * metaslab groups. We can get into a state where the metaslab
3145 * is no longer active since we dirty metaslabs as we remove a
3146 * device, thus potentially making the metaslab group eligible
3147 * for preloading.
3148 */
3149 if (mg->mg_activation_count > 0) {
4365 vdev_t *vd;
4366
4367 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
4368 return (SET_ERROR(ENXIO));
4369 }
4370
4371 ASSERT(DVA_IS_VALID(dva));
4372
4373 if (DVA_GET_GANG(dva))
4374 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4375
4376 return (metaslab_claim_impl(vd, offset, size, txg));
4377 }
4378
4379 int
4380 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4381 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4382 zio_alloc_list_t *zal, zio_t *zio, int allocator)
4383 {
4384 dva_t *dva = bp->blk_dva;
4385 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
4386 int error = 0;
4387
4388 ASSERT(bp->blk_birth == 0);
4389 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4390
4391 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4392
4393 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
4394 spa_config_exit(spa, SCL_ALLOC, FTAG);
4395 return (SET_ERROR(ENOSPC));
4396 }
4397
4398 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
4399 ASSERT(BP_GET_NDVAS(bp) == 0);
4400 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
4401 ASSERT3P(zal, !=, NULL);
4402
4403 for (int d = 0; d < ndvas; d++) {
4404 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
4405 txg, flags, zal, allocator);
4532 {
4533 metaslab_t *msp;
4534 spa_t *spa = vd->vdev_spa;
4535
4536 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4537 return;
4538
4539 if (vd->vdev_ops->vdev_op_remap != NULL) {
4540 vd->vdev_ops->vdev_op_remap(vd, offset, size,
4541 metaslab_check_free_impl_cb, NULL);
4542 return;
4543 }
4544
4545 ASSERT(vdev_is_concrete(vd));
4546 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4547 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4548
4549 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4550
4551 mutex_enter(&msp->ms_lock);
4552 if (msp->ms_loaded) {
4553 range_tree_verify_not_present(msp->ms_allocatable,
4554 offset, size);
4555 }
4556
4557 range_tree_verify_not_present(msp->ms_freeing, offset, size);
4558 range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
4559 range_tree_verify_not_present(msp->ms_freed, offset, size);
4560 for (int j = 0; j < TXG_DEFER_SIZE; j++)
4561 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
4562 mutex_exit(&msp->ms_lock);
4563 }
4564
4565 void
4566 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4567 {
4568 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4569 return;
4570
4571 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
4572 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
4573 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
4574 vdev_t *vd = vdev_lookup_top(spa, vdev);
4575 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
4576 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
4577
4578 if (DVA_GET_GANG(&bp->blk_dva[i]))
4579 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
4580
4581 ASSERT3P(vd, !=, NULL);