10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

          --- old/usr/src/uts/common/fs/zfs/metaslab.c
          +++ new/usr/src/uts/common/fs/zfs/metaslab.c
↓ open down ↓ 490 lines elided ↑ open up ↑
 491  491          if (m1->ms_start < m2->ms_start)
 492  492                  return (-1);
 493  493          if (m1->ms_start > m2->ms_start)
 494  494                  return (1);
 495  495  
 496  496          ASSERT3P(m1, ==, m2);
 497  497  
 498  498          return (0);
 499  499  }
 500  500  
      501 +uint64_t
      502 +metaslab_allocated_space(metaslab_t *msp)
      503 +{
      504 +        return (msp->ms_allocated_space);
      505 +}
      506 +
 501  507  /*
 502  508   * Verify that the space accounting on disk matches the in-core range_trees.
 503  509   */
 504      -void
      510 +static void
 505  511  metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 506  512  {
 507  513          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 508      -        uint64_t allocated = 0;
      514 +        uint64_t allocating = 0;
 509  515          uint64_t sm_free_space, msp_free_space;
 510  516  
 511  517          ASSERT(MUTEX_HELD(&msp->ms_lock));
      518 +        ASSERT(!msp->ms_condensing);
 512  519  
 513  520          if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 514  521                  return;
 515  522  
 516  523          /*
 517  524           * We can only verify the metaslab space when we're called
 518      -         * from syncing context with a loaded metaslab that has an allocated
 519      -         * space map. Calling this in non-syncing context does not
 520      -         * provide a consistent view of the metaslab since we're performing
 521      -         * allocations in the future.
      525 +         * from syncing context with a loaded metaslab that has an
      526 +         * allocated space map. Calling this in non-syncing context
      527 +         * does not provide a consistent view of the metaslab since
      528 +         * we're performing allocations in the future.
 522  529           */
 523  530          if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 524  531              !msp->ms_loaded)
 525  532                  return;
 526  533  
 527      -        sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
 528      -            space_map_alloc_delta(msp->ms_sm);
      534 +        /*
      535 +         * Even though the smp_alloc field can get negative (e.g.
      536 +         * see vdev_checkpoint_sm), that should never be the case
       537 +         * when it comes to a metaslab's space map.
      538 +         */
      539 +        ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 529  540  
      541 +        sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
      542 +
 530  543          /*
 531      -         * Account for future allocations since we would have already
 532      -         * deducted that space from the ms_freetree.
      544 +         * Account for future allocations since we would have
      545 +         * already deducted that space from the ms_allocatable.
 533  546           */
 534  547          for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 535      -                allocated +=
      548 +                allocating +=
 536  549                      range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 537  550          }
 538  551  
 539      -        msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
      552 +        ASSERT3U(msp->ms_deferspace, ==,
      553 +            range_tree_space(msp->ms_defer[0]) +
      554 +            range_tree_space(msp->ms_defer[1]));
      555 +
      556 +        msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
 540  557              msp->ms_deferspace + range_tree_space(msp->ms_freed);
 541  558  
 542  559          VERIFY3U(sm_free_space, ==, msp_free_space);
 543  560  }
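
The identity checked by the VERIFY above can be spelled out with a small standalone sketch. All byte counts below are invented; only the relationship between the fields mirrors the accounting that metaslab_verify_space() enforces:

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        /*
         * Toy model of the check above: the space-map view of free space
         * (ms_size - metaslab_allocated_space()) must equal the sum of the
         * in-core trees. Every number here is made up for illustration.
         */
        int
        main(void)
        {
                uint64_t ms_size = 1ULL << 30;                  /* 1 GiB metaslab */
                uint64_t allocated_space = 400ULL << 20;        /* synced to the space map */

                uint64_t allocatable = 500ULL << 20;            /* ms_allocatable */
                uint64_t allocating = 60ULL << 20;              /* future-TXG allocations */
                uint64_t deferspace = 40ULL << 20;              /* ms_defer[0] + ms_defer[1] */
                uint64_t freed = 24ULL << 20;                   /* freed this sync pass */

                uint64_t sm_free_space = ms_size - allocated_space;
                uint64_t msp_free_space = allocatable + allocating + deferspace + freed;

                assert(sm_free_space == msp_free_space);
                (void) printf("free space: %llu bytes\n",
                    (unsigned long long)sm_free_space);
                return (0);
        }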
 544  561  
 545  562  /*
 546  563   * ==========================================================================
 547  564   * Metaslab groups
 548  565   * ==========================================================================
 549  566   */
↓ open down ↓ 284 lines elided ↑ open up ↑
 834  851                  return;
 835  852  
 836  853          mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 837  854              KM_SLEEP);
 838  855  
 839  856          ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 840  857              SPACE_MAP_HISTOGRAM_SIZE + ashift);
 841  858  
 842  859          for (int m = 0; m < vd->vdev_ms_count; m++) {
 843  860                  metaslab_t *msp = vd->vdev_ms[m];
      861 +                ASSERT(msp != NULL);
 844  862  
 845  863                  /* skip if not active or not a member */
 846  864                  if (msp->ms_sm == NULL || msp->ms_group != mg)
 847  865                          continue;
 848  866  
 849  867                  for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 850  868                          mg_hist[i + ashift] +=
 851  869                              msp->ms_sm->sm_phys->smp_histogram[i];
 852  870          }
 853  871  
↓ open down ↓ 600 lines elided ↑ open up ↑
1454 1472  };
1455 1473  
1456 1474  metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1457 1475  
1458 1476  /*
1459 1477   * ==========================================================================
1460 1478   * Metaslabs
1461 1479   * ==========================================================================
1462 1480   */
1463 1481  
     1482 +static void
     1483 +metaslab_aux_histograms_clear(metaslab_t *msp)
     1484 +{
     1485 +        /*
     1486 +         * Auxiliary histograms are only cleared when resetting them,
     1487 +         * which can only happen while the metaslab is loaded.
     1488 +         */
     1489 +        ASSERT(msp->ms_loaded);
     1490 +
     1491 +        bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
     1492 +        for (int t = 0; t < TXG_DEFER_SIZE; t++)
     1493 +                bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
     1494 +}
     1495 +
     1496 +static void
     1497 +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     1498 +    range_tree_t *rt)
     1499 +{
     1500 +        /*
     1501 +         * This is modeled after space_map_histogram_add(), so refer to that
     1502 +         * function for implementation details. We want this to work like
     1503 +         * the space map histogram, and not the range tree histogram, as we
     1504 +         * are essentially constructing a delta that will be later subtracted
     1505 +         * from the space map histogram.
     1506 +         */
     1507 +        int idx = 0;
     1508 +        for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
     1509 +                ASSERT3U(i, >=, idx + shift);
     1510 +                histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
     1511 +
     1512 +                if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
     1513 +                        ASSERT3U(idx + shift, ==, i);
     1514 +                        idx++;
     1515 +                        ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
     1516 +                }
     1517 +        }
     1518 +}
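
To make the bucket folding above concrete, here is a minimal standalone sketch. The toy widths (TOY_RT_HIST_SIZE, TOY_SM_HIST_SIZE) stand in for the real RANGE_TREE_HISTOGRAM_SIZE and SPACE_MAP_HISTOGRAM_SIZE constants; the point is only to show how range-tree buckets past the last space-map bucket are converted into an equivalent count of largest-bucket segments:

        #include <stdint.h>
        #include <stdio.h>

        #define TOY_RT_HIST_SIZE        10      /* toy width, not the real constant */
        #define TOY_SM_HIST_SIZE        4       /* toy width, not the real constant */

        /*
         * Fold a toy range-tree histogram into a toy space-map-sized one the
         * same way metaslab_aux_histogram_add() does: once idx hits the last
         * space-map bucket, larger segments are counted as multiple segments
         * of that bucket's size via the left shift.
         */
        static void
        toy_aux_histogram_add(uint64_t *hist, uint64_t shift,
            const uint64_t *rt_hist)
        {
                int idx = 0;
                for (int i = shift; i < TOY_RT_HIST_SIZE; i++) {
                        hist[idx] += rt_hist[i] << (i - idx - shift);
                        if (idx < TOY_SM_HIST_SIZE - 1)
                                idx++;
                }
        }

        int
        main(void)
        {
                uint64_t rt_hist[TOY_RT_HIST_SIZE] = { 0, 0, 0, 5, 2, 1, 0, 1, 0, 0 };
                uint64_t hist[TOY_SM_HIST_SIZE] = { 0 };

                toy_aux_histogram_add(hist, 3, rt_hist);        /* toy shift */
                for (int i = 0; i < TOY_SM_HIST_SIZE; i++)
                        (void) printf("bucket %d: %llu\n", i,
                            (unsigned long long)hist[i]);
                return (0);
        }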
     1519 +
1464 1520  /*
     1521 + * Called at every sync pass that the metaslab gets synced.
     1522 + *
     1523 + * The reason is that we want our auxiliary histograms to be updated
     1524 + * wherever the metaslab's space map histogram is updated. This way
     1525 + * we stay consistent on which parts of the metaslab space map's
      1526 + * histogram are currently not available for allocations (e.g. because
     1527 + * they are in the defer, freed, and freeing trees).
     1528 + */
     1529 +static void
     1530 +metaslab_aux_histograms_update(metaslab_t *msp)
     1531 +{
     1532 +        space_map_t *sm = msp->ms_sm;
     1533 +        ASSERT(sm != NULL);
     1534 +
     1535 +        /*
     1536 +         * This is similar to the metaslab's space map histogram updates
     1537 +         * that take place in metaslab_sync(). The only difference is that
     1538 +         * we only care about segments that haven't made it into the
     1539 +         * ms_allocatable tree yet.
     1540 +         */
     1541 +        if (msp->ms_loaded) {
     1542 +                metaslab_aux_histograms_clear(msp);
     1543 +
     1544 +                metaslab_aux_histogram_add(msp->ms_synchist,
     1545 +                    sm->sm_shift, msp->ms_freed);
     1546 +
     1547 +                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     1548 +                        metaslab_aux_histogram_add(msp->ms_deferhist[t],
     1549 +                            sm->sm_shift, msp->ms_defer[t]);
     1550 +                }
     1551 +        }
     1552 +
     1553 +        metaslab_aux_histogram_add(msp->ms_synchist,
     1554 +            sm->sm_shift, msp->ms_freeing);
     1555 +}
     1556 +
     1557 +/*
     1558 + * Called every time we are done syncing (writing to) the metaslab,
     1559 + * i.e. at the end of each sync pass.
     1560 + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
     1561 + */
     1562 +static void
     1563 +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
     1564 +{
     1565 +        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
     1566 +        space_map_t *sm = msp->ms_sm;
     1567 +
     1568 +        if (sm == NULL) {
     1569 +                /*
     1570 +                 * We came here from metaslab_init() when creating/opening a
     1571 +                 * pool, looking at a metaslab that hasn't had any allocations
     1572 +                 * yet.
     1573 +                 */
     1574 +                return;
     1575 +        }
     1576 +
     1577 +        /*
     1578 +         * This is similar to the actions that we take for the ms_freed
     1579 +         * and ms_defer trees in metaslab_sync_done().
     1580 +         */
     1581 +        uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
     1582 +        if (defer_allowed) {
     1583 +                bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
     1584 +                    sizeof (msp->ms_synchist));
     1585 +        } else {
     1586 +                bzero(msp->ms_deferhist[hist_index],
     1587 +                    sizeof (msp->ms_deferhist[hist_index]));
     1588 +        }
     1589 +        bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
     1590 +}
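
A minimal sketch of the rotation performed above, assuming an invented helper name and a toy histogram width; it only illustrates how the "synced this txg" histogram moves into (or is dropped from) the defer slot keyed by txg % TXG_DEFER_SIZE and is then cleared:

        #include <stdint.h>
        #include <string.h>

        #define TOY_DEFER_SIZE  2       /* stands in for TXG_DEFER_SIZE */
        #define TOY_HIST_SIZE   4       /* toy width, not SPACE_MAP_HISTOGRAM_SIZE */

        /* Rotate the per-txg histogram into the defer slot, or drop it. */
        static void
        toy_histograms_update_done(uint64_t syncing_txg, int defer_allowed,
            uint64_t synchist[TOY_HIST_SIZE],
            uint64_t deferhist[TOY_DEFER_SIZE][TOY_HIST_SIZE])
        {
                uint64_t idx = syncing_txg % TOY_DEFER_SIZE;

                if (defer_allowed)
                        (void) memcpy(deferhist[idx], synchist,
                            TOY_HIST_SIZE * sizeof (uint64_t));
                else
                        (void) memset(deferhist[idx], 0,
                            TOY_HIST_SIZE * sizeof (uint64_t));

                (void) memset(synchist, 0, TOY_HIST_SIZE * sizeof (uint64_t));
        }

        int
        main(void)
        {
                uint64_t synchist[TOY_HIST_SIZE] = { 3, 1, 0, 0 };
                uint64_t deferhist[TOY_DEFER_SIZE][TOY_HIST_SIZE] = { { 0 } };

                toy_histograms_update_done(42, 1, synchist, deferhist);
                return (deferhist[0][0] == 3 ? 0 : 1);
        }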
     1591 +
     1592 +/*
     1593 + * Ensure that the metaslab's weight and fragmentation are consistent
     1594 + * with the contents of the histogram (either the range tree's histogram
      1595 + * or the space map's, depending on whether the metaslab is loaded).
     1596 + */
     1597 +static void
     1598 +metaslab_verify_weight_and_frag(metaslab_t *msp)
     1599 +{
     1600 +        ASSERT(MUTEX_HELD(&msp->ms_lock));
     1601 +
     1602 +        if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
     1603 +                return;
     1604 +
     1605 +        /* see comment in metaslab_verify_unflushed_changes() */
     1606 +        if (msp->ms_group == NULL)
     1607 +                return;
     1608 +
     1609 +        /*
     1610 +         * Devices being removed always return a weight of 0 and leave
     1611 +         * fragmentation and ms_max_size as is - there is nothing for
     1612 +         * us to verify here.
     1613 +         */
     1614 +        vdev_t *vd = msp->ms_group->mg_vd;
     1615 +        if (vd->vdev_removing)
     1616 +                return;
     1617 +
     1618 +        /*
     1619 +         * If the metaslab is dirty it probably means that we've done
     1620 +         * some allocations or frees that have changed our histograms
     1621 +         * and thus the weight.
     1622 +         */
     1623 +        for (int t = 0; t < TXG_SIZE; t++) {
     1624 +                if (txg_list_member(&vd->vdev_ms_list, msp, t))
     1625 +                        return;
     1626 +        }
     1627 +
     1628 +        /*
     1629 +         * This verification checks that our in-memory state is consistent
     1630 +         * with what's on disk. If the pool is read-only then there aren't
     1631 +         * any changes and we just have the initially-loaded state.
     1632 +         */
     1633 +        if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
     1634 +                return;
     1635 +
     1636 +        /* some extra verification for in-core tree if you can */
     1637 +        if (msp->ms_loaded) {
     1638 +                range_tree_stat_verify(msp->ms_allocatable);
     1639 +                VERIFY(space_map_histogram_verify(msp->ms_sm,
     1640 +                    msp->ms_allocatable));
     1641 +        }
     1642 +
     1643 +        uint64_t weight = msp->ms_weight;
     1644 +        uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
     1645 +        boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
     1646 +        uint64_t frag = msp->ms_fragmentation;
     1647 +        uint64_t max_segsize = msp->ms_max_size;
     1648 +
     1649 +        msp->ms_weight = 0;
     1650 +        msp->ms_fragmentation = 0;
     1651 +        msp->ms_max_size = 0;
     1652 +
     1653 +        /*
     1654 +         * This function is used for verification purposes. Regardless of
     1655 +         * whether metaslab_weight() thinks this metaslab should be active or
     1656 +         * not, we want to ensure that the actual weight (and therefore the
     1657 +         * value of ms_weight) would be the same if it was to be recalculated
     1658 +         * at this point.
     1659 +         */
     1660 +        msp->ms_weight = metaslab_weight(msp) | was_active;
     1661 +
     1662 +        VERIFY3U(max_segsize, ==, msp->ms_max_size);
     1663 +
     1664 +        /*
     1665 +         * If the weight type changed then there is no point in doing
     1666 +         * verification. Revert fields to their original values.
     1667 +         */
     1668 +        if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
     1669 +            (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
     1670 +                msp->ms_fragmentation = frag;
     1671 +                msp->ms_weight = weight;
     1672 +                return;
     1673 +        }
     1674 +
     1675 +        VERIFY3U(msp->ms_fragmentation, ==, frag);
     1676 +        VERIFY3U(msp->ms_weight, ==, weight);
     1677 +}
     1678 +
     1679 +/*
1465 1680   * Wait for any in-progress metaslab loads to complete.
1466 1681   */
1467 1682  static void
1468 1683  metaslab_load_wait(metaslab_t *msp)
1469 1684  {
1470 1685          ASSERT(MUTEX_HELD(&msp->ms_lock));
1471 1686  
1472 1687          while (msp->ms_loading) {
1473 1688                  ASSERT(!msp->ms_loaded);
1474 1689                  cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1475 1690          }
1476 1691  }
1477 1692  
1478 1693  static int
1479 1694  metaslab_load_impl(metaslab_t *msp)
1480 1695  {
1481 1696          int error = 0;
1482 1697  
1483 1698          ASSERT(MUTEX_HELD(&msp->ms_lock));
1484 1699          ASSERT(msp->ms_loading);
     1700 +        ASSERT(!msp->ms_condensing);
1485 1701  
1486 1702          /*
1487      -         * Nobody else can manipulate a loading metaslab, so it's now safe
1488      -         * to drop the lock. This way we don't have to hold the lock while
1489      -         * reading the spacemap from disk.
     1703 +         * We temporarily drop the lock to unblock other operations while we
     1704 +         * are reading the space map. Therefore, metaslab_sync() and
     1705 +         * metaslab_sync_done() can run at the same time as we do.
     1706 +         *
     1707 +         * metaslab_sync() can append to the space map while we are loading.
     1708 +         * Therefore we load only entries that existed when we started the
     1709 +         * load. Additionally, metaslab_sync_done() has to wait for the load
     1710 +         * to complete because there are potential races like metaslab_load()
     1711 +         * loading parts of the space map that are currently being appended
     1712 +         * by metaslab_sync(). If we didn't, the ms_allocatable would have
     1713 +         * entries that metaslab_sync_done() would try to re-add later.
     1714 +         *
     1715 +         * That's why before dropping the lock we remember the synced length
     1716 +         * of the metaslab and read up to that point of the space map,
     1717 +         * ignoring entries appended by metaslab_sync() that happen after we
     1718 +         * drop the lock.
1490 1719           */
     1720 +        uint64_t length = msp->ms_synced_length;
1491 1721          mutex_exit(&msp->ms_lock);
1492 1722  
1493      -        /*
1494      -         * If the space map has not been allocated yet, then treat
1495      -         * all the space in the metaslab as free and add it to ms_allocatable.
1496      -         */
1497 1723          if (msp->ms_sm != NULL) {
1498      -                error = space_map_load(msp->ms_sm, msp->ms_allocatable,
1499      -                    SM_FREE);
     1724 +                error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
     1725 +                    SM_FREE, length);
1500 1726          } else {
     1727 +                /*
     1728 +                 * The space map has not been allocated yet, so treat
     1729 +                 * all the space in the metaslab as free and add it to the
     1730 +                 * ms_allocatable tree.
     1731 +                 */
1501 1732                  range_tree_add(msp->ms_allocatable,
1502 1733                      msp->ms_start, msp->ms_size);
1503 1734          }
1504 1735  
     1736 +        /*
     1737 +         * We need to grab the ms_sync_lock to prevent metaslab_sync() from
     1738 +         * changing the ms_sm and the metaslab's range trees while we are
     1739 +         * about to use them and populate the ms_allocatable. The ms_lock
     1740 +         * is insufficient for this because metaslab_sync() doesn't hold
     1741 +         * the ms_lock while writing the ms_checkpointing tree to disk.
     1742 +         */
     1743 +        mutex_enter(&msp->ms_sync_lock);
1505 1744          mutex_enter(&msp->ms_lock);
     1745 +        ASSERT(!msp->ms_condensing);
1506 1746  
1507      -        if (error != 0)
     1747 +        if (error != 0) {
     1748 +                mutex_exit(&msp->ms_sync_lock);
1508 1749                  return (error);
     1750 +        }
1509 1751  
1510 1752          ASSERT3P(msp->ms_group, !=, NULL);
1511 1753          msp->ms_loaded = B_TRUE;
1512 1754  
1513 1755          /*
1514      -         * If the metaslab already has a spacemap, then we need to
1515      -         * remove all segments from the defer tree; otherwise, the
1516      -         * metaslab is completely empty and we can skip this.
     1756 +         * The ms_allocatable contains the segments that exist in the
     1757 +         * ms_defer trees [see ms_synced_length]. Thus we need to remove
     1758 +         * them from ms_allocatable as they will be added again in
     1759 +         * metaslab_sync_done().
1517 1760           */
1518      -        if (msp->ms_sm != NULL) {
1519      -                for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1520      -                        range_tree_walk(msp->ms_defer[t],
1521      -                            range_tree_remove, msp->ms_allocatable);
1522      -                }
     1761 +        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     1762 +                range_tree_walk(msp->ms_defer[t],
     1763 +                    range_tree_remove, msp->ms_allocatable);
1523 1764          }
     1765 +
     1766 +        /*
     1767 +         * Call metaslab_recalculate_weight_and_sort() now that the
     1768 +         * metaslab is loaded so we get the metaslab's real weight.
     1769 +         *
     1770 +         * Unless this metaslab was created with older software and
     1771 +         * has not yet been converted to use segment-based weight, we
     1772 +         * expect the new weight to be better or equal to the weight
     1773 +         * that the metaslab had while it was not loaded. This is
     1774 +         * because the old weight does not take into account the
     1775 +         * consolidation of adjacent segments between TXGs. [see
     1776 +         * comment for ms_synchist and ms_deferhist[] for more info]
     1777 +         */
     1778 +        uint64_t weight = msp->ms_weight;
     1779 +        metaslab_recalculate_weight_and_sort(msp);
     1780 +        if (!WEIGHT_IS_SPACEBASED(weight))
     1781 +                ASSERT3U(weight, <=, msp->ms_weight);
1524 1782          msp->ms_max_size = metaslab_block_maxsize(msp);
1525 1783  
     1784 +        spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
     1785 +        metaslab_verify_space(msp, spa_syncing_txg(spa));
     1786 +        mutex_exit(&msp->ms_sync_lock);
     1787 +
1526 1788          return (0);
1527 1789  }
1528 1790  
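
The "read only up to the synced length" idea described in the big comment inside metaslab_load_impl() can be sketched on a toy append-only log. The toy_log type and its fields are invented; the real code snapshots ms_synced_length under the ms_lock and passes it to space_map_load_length():

        #include <pthread.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Invented stand-in for a space map that a syncer appends to. */
        typedef struct toy_log {
                pthread_mutex_t tl_lock;
                uint64_t        tl_synced_length;       /* entries synced so far */
                uint64_t        tl_entries[64];         /* appended by the syncer */
        } toy_log_t;

        /* Sum only the entries that existed when the load started. */
        static uint64_t
        toy_load(toy_log_t *tl)
        {
                pthread_mutex_lock(&tl->tl_lock);
                uint64_t length = tl->tl_synced_length; /* snapshot under the lock */
                pthread_mutex_unlock(&tl->tl_lock);

                /* [0, length) is stable even if a syncer appends concurrently */
                uint64_t sum = 0;
                for (uint64_t i = 0; i < length; i++)
                        sum += tl->tl_entries[i];
                return (sum);
        }

        int
        main(void)
        {
                toy_log_t tl = { PTHREAD_MUTEX_INITIALIZER, 3, { 10, 20, 30, 40 } };

                (void) printf("loaded %llu\n", (unsigned long long)toy_load(&tl));
                return (0);
        }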
1529 1791  int
1530 1792  metaslab_load(metaslab_t *msp)
1531 1793  {
1532 1794          ASSERT(MUTEX_HELD(&msp->ms_lock));
1533 1795  
1534 1796          /*
1535 1797           * There may be another thread loading the same metaslab, if that's
1536 1798           * the case just wait until the other thread is done and return.
1537 1799           */
1538 1800          metaslab_load_wait(msp);
1539 1801          if (msp->ms_loaded)
1540 1802                  return (0);
1541 1803          VERIFY(!msp->ms_loading);
     1804 +        ASSERT(!msp->ms_condensing);
1542 1805  
1543 1806          msp->ms_loading = B_TRUE;
1544 1807          int error = metaslab_load_impl(msp);
1545 1808          msp->ms_loading = B_FALSE;
1546 1809          cv_broadcast(&msp->ms_load_cv);
1547 1810  
1548 1811          return (error);
1549 1812  }
1550 1813  
1551 1814  void
1552 1815  metaslab_unload(metaslab_t *msp)
1553 1816  {
1554 1817          ASSERT(MUTEX_HELD(&msp->ms_lock));
     1818 +
     1819 +        metaslab_verify_weight_and_frag(msp);
     1820 +
1555 1821          range_tree_vacate(msp->ms_allocatable, NULL, NULL);
1556 1822          msp->ms_loaded = B_FALSE;
     1823 +
1557 1824          msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
1558 1825          msp->ms_max_size = 0;
     1826 +
     1827 +        /*
     1828 +         * We explicitly recalculate the metaslab's weight based on its space
      1829 + * map (as it is now not loaded). We want unloaded metaslabs to always
     1830 +         * have their weights calculated from the space map histograms, while
     1831 +         * loaded ones have it calculated from their in-core range tree
     1832 +         * [see metaslab_load()]. This way, the weight reflects the information
      1833 + * available in-core, whether it is loaded or not.
     1834 +         *
      1835 + * If ms_group == NULL, it means that we came here from metaslab_fini(),
     1836 +         * at which point it doesn't make sense for us to do the recalculation
     1837 +         * and the sorting.
     1838 +         */
     1839 +        if (msp->ms_group != NULL)
     1840 +                metaslab_recalculate_weight_and_sort(msp);
1559 1841  }
1560 1842  
1561 1843  static void
1562 1844  metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
1563 1845      int64_t defer_delta, int64_t space_delta)
1564 1846  {
1565 1847          vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
1566 1848  
1567 1849          ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
1568 1850          ASSERT(vd->vdev_ms_count != 0);
↓ open down ↓ 19 lines elided ↑ open up ↑
1588 1870  
1589 1871          ms->ms_id = id;
1590 1872          ms->ms_start = id << vd->vdev_ms_shift;
1591 1873          ms->ms_size = 1ULL << vd->vdev_ms_shift;
1592 1874          ms->ms_allocator = -1;
1593 1875          ms->ms_new = B_TRUE;
1594 1876  
1595 1877          /*
1596 1878           * We only open space map objects that already exist. All others
1597 1879           * will be opened when we finally allocate an object for it.
     1880 +         *
     1881 +         * Note:
     1882 +         * When called from vdev_expand(), we can't call into the DMU as
     1883 +         * we are holding the spa_config_lock as a writer and we would
      1884 + * deadlock [see relevant comment in vdev_metaslab_init()]. In
     1885 +         * that case, the object parameter is zero though, so we won't
     1886 +         * call into the DMU.
1598 1887           */
1599 1888          if (object != 0) {
1600 1889                  error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1601 1890                      ms->ms_size, vd->vdev_ashift);
1602 1891  
1603 1892                  if (error != 0) {
1604 1893                          kmem_free(ms, sizeof (metaslab_t));
1605 1894                          return (error);
1606 1895                  }
1607 1896  
1608 1897                  ASSERT(ms->ms_sm != NULL);
     1898 +                ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
     1899 +                ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
1609 1900          }
1610 1901  
1611 1902          /*
1612      -         * We create the main range tree here, but we don't create the
     1903 +         * We create the ms_allocatable here, but we don't create the
1613 1904           * other range trees until metaslab_sync_done().  This serves
1614 1905           * two purposes: it allows metaslab_sync_done() to detect the
1615      -         * addition of new space; and for debugging, it ensures that we'd
1616      -         * data fault on any attempt to use this metaslab before it's ready.
     1906 +         * addition of new space; and for debugging, it ensures that
     1907 +         * we'd data fault on any attempt to use this metaslab before
     1908 +         * it's ready.
1617 1909           */
1618 1910          ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
1619 1911          metaslab_group_add(mg, ms);
1620 1912  
1621 1913          metaslab_set_fragmentation(ms);
1622 1914  
1623 1915          /*
1624 1916           * If we're opening an existing pool (txg == 0) or creating
1625 1917           * a new one (txg == TXG_INITIAL), all space is available now.
1626 1918           * If we're adding space to an existing pool, the new space
1627 1919           * does not become available until after this txg has synced.
1628 1920           * The metaslab's weight will also be initialized when we sync
1629 1921           * out this txg. This ensures that we don't attempt to allocate
1630 1922           * from it before we have initialized it completely.
1631 1923           */
1632      -        if (txg <= TXG_INITIAL)
     1924 +        if (txg <= TXG_INITIAL) {
1633 1925                  metaslab_sync_done(ms, 0);
     1926 +                metaslab_space_update(vd, mg->mg_class,
     1927 +                    metaslab_allocated_space(ms), 0, 0);
     1928 +        }
1634 1929  
1635 1930          /*
1636 1931           * If metaslab_debug_load is set and we're initializing a metaslab
1637 1932           * that has an allocated space map object then load the space map
1638 1933           * so that we can verify frees.
1639 1934           */
1640 1935          if (metaslab_debug_load && ms->ms_sm != NULL) {
1641 1936                  mutex_enter(&ms->ms_lock);
1642 1937                  VERIFY0(metaslab_load(ms));
1643 1938                  mutex_exit(&ms->ms_lock);
↓ open down ↓ 13 lines elided ↑ open up ↑
1657 1952  metaslab_fini(metaslab_t *msp)
1658 1953  {
1659 1954          metaslab_group_t *mg = msp->ms_group;
1660 1955          vdev_t *vd = mg->mg_vd;
1661 1956  
1662 1957          metaslab_group_remove(mg, msp);
1663 1958  
1664 1959          mutex_enter(&msp->ms_lock);
1665 1960          VERIFY(msp->ms_group == NULL);
1666 1961          metaslab_space_update(vd, mg->mg_class,
1667      -            -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
     1962 +            -metaslab_allocated_space(msp), 0, -msp->ms_size);
1668 1963  
1669 1964          space_map_close(msp->ms_sm);
1670 1965  
1671 1966          metaslab_unload(msp);
1672 1967  
1673 1968          range_tree_destroy(msp->ms_allocatable);
1674 1969          range_tree_destroy(msp->ms_freeing);
1675 1970          range_tree_destroy(msp->ms_freed);
1676 1971  
1677 1972          for (int t = 0; t < TXG_SIZE; t++) {
1678 1973                  range_tree_destroy(msp->ms_allocating[t]);
1679 1974          }
1680 1975  
1681 1976          for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1682 1977                  range_tree_destroy(msp->ms_defer[t]);
1683 1978          }
1684 1979          ASSERT0(msp->ms_deferspace);
1685 1980  
1686 1981          range_tree_destroy(msp->ms_checkpointing);
1687 1982  
     1983 +        for (int t = 0; t < TXG_SIZE; t++)
     1984 +                ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
     1985 +
1688 1986          mutex_exit(&msp->ms_lock);
1689 1987          cv_destroy(&msp->ms_load_cv);
1690 1988          mutex_destroy(&msp->ms_lock);
1691 1989          mutex_destroy(&msp->ms_sync_lock);
1692 1990          ASSERT3U(msp->ms_allocator, ==, -1);
1693 1991  
1694 1992          kmem_free(msp, sizeof (metaslab_t));
1695 1993  }
1696 1994  
1697 1995  #define FRAGMENTATION_TABLE_SIZE        17
1698 1996  
1699 1997  /*
1700 1998   * This table defines a segment size based fragmentation metric that will
1701 1999   * allow each metaslab to derive its own fragmentation value. This is done
1702 2000   * by calculating the space in each bucket of the spacemap histogram and
1703      - * multiplying that by the fragmetation metric in this table. Doing
     2001 + * multiplying that by the fragmentation metric in this table. Doing
1704 2002   * this for all buckets and dividing it by the total amount of free
1705 2003   * space in this metaslab (i.e. the total free space in all buckets) gives
1706 2004   * us the fragmentation metric. This means that a high fragmentation metric
1707 2005   * equates to most of the free space being comprised of small segments.
1708 2006   * Conversely, if the metric is low, then most of the free space is in
1709 2007   * large segments. A 10% change in fragmentation equates to approximately
1710 2008   * double the number of segments.
1711 2009   *
1712 2010   * This table defines 0% fragmented space using 16MB segments. Testing has
1713 2011   * shown that segments that are greater than or equal to 16MB do not suffer
↓ open down ↓ 14 lines elided ↑ open up ↑
1728 2026          40,     /* 256K */
1729 2027          30,     /* 512K */
1730 2028          20,     /* 1M   */
1731 2029          15,     /* 2M   */
1732 2030          10,     /* 4M   */
1733 2031          5,      /* 8M   */
1734 2032          0       /* 16M  */
1735 2033  };
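
The metric described above is a weighted average of the table's factors over the free-space histogram. The sketch below works one example; the free-space histogram is invented, and the factor column is just the tail of zfs_frag_table shown above (256K through 16M):

        #include <stdint.h>
        #include <stdio.h>

        /* Worked example of the fragmentation metric with invented numbers. */
        int
        main(void)
        {
                uint64_t seg_size[7] = { 256ULL << 10, 512ULL << 10, 1ULL << 20,
                    2ULL << 20, 4ULL << 20, 8ULL << 20, 16ULL << 20 };
                uint64_t seg_count[7] = { 8, 4, 2, 1, 0, 0, 1 };        /* invented */
                uint64_t frag_factor[7] = { 40, 30, 20, 15, 10, 5, 0 };

                uint64_t fragmentation = 0, total = 0;
                for (int i = 0; i < 7; i++) {
                        uint64_t space = seg_count[i] * seg_size[i];
                        fragmentation += space * frag_factor[i];
                        total += space;
                }
                if (total != 0)
                        fragmentation /= total;         /* ends up in [0, 100] */

                (void) printf("fragmentation: %llu%%\n",
                    (unsigned long long)fragmentation);
                return (0);
        }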
1736 2034  
1737 2035  /*
1738      - * Calclate the metaslab's fragmentation metric. A return value
1739      - * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1740      - * not support this metric. Otherwise, the return value should be in the
1741      - * range [0, 100].
     2036 + * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
     2037 + * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
      2038 + * been upgraded and does not support this metric. Otherwise, the
      2039 + * value should be in the range [0, 100].
1742 2040   */
1743 2041  static void
1744 2042  metaslab_set_fragmentation(metaslab_t *msp)
1745 2043  {
1746 2044          spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1747 2045          uint64_t fragmentation = 0;
1748 2046          uint64_t total = 0;
1749 2047          boolean_t feature_enabled = spa_feature_is_enabled(spa,
1750 2048              SPA_FEATURE_SPACEMAP_HISTOGRAM);
1751 2049  
↓ open down ↓ 72 lines elided ↑ open up ↑
1824 2122          metaslab_group_t *mg = msp->ms_group;
1825 2123          vdev_t *vd = mg->mg_vd;
1826 2124          uint64_t weight, space;
1827 2125  
1828 2126          ASSERT(MUTEX_HELD(&msp->ms_lock));
1829 2127          ASSERT(!vd->vdev_removing);
1830 2128  
1831 2129          /*
1832 2130           * The baseline weight is the metaslab's free space.
1833 2131           */
1834      -        space = msp->ms_size - space_map_allocated(msp->ms_sm);
     2132 +        space = msp->ms_size - metaslab_allocated_space(msp);
1835 2133  
1836 2134          if (metaslab_fragmentation_factor_enabled &&
1837 2135              msp->ms_fragmentation != ZFS_FRAG_INVALID) {
1838 2136                  /*
1839 2137                   * Use the fragmentation information to inversely scale
1840 2138                   * down the baseline weight. We need to ensure that we
1841 2139                   * don't exclude this metaslab completely when it's 100%
1842 2140                   * fragmented. To avoid this we reduce the fragmented value
1843 2141                   * by 1.
1844 2142                   */
↓ open down ↓ 83 lines elided ↑ open up ↑
1928 2226  }
1929 2227  
1930 2228  /*
1931 2229   * Calculate the weight based on the on-disk histogram. This should only
1932 2230   * be called after a sync pass has completely finished since the on-disk
1933 2231   * information is updated in metaslab_sync().
1934 2232   */
1935 2233  static uint64_t
1936 2234  metaslab_weight_from_spacemap(metaslab_t *msp)
1937 2235  {
1938      -        uint64_t weight = 0;
     2236 +        space_map_t *sm = msp->ms_sm;
     2237 +        ASSERT(!msp->ms_loaded);
     2238 +        ASSERT(sm != NULL);
     2239 +        ASSERT3U(space_map_object(sm), !=, 0);
     2240 +        ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1939 2241  
     2242 +        /*
     2243 +         * Create a joint histogram from all the segments that have made
     2244 +         * it to the metaslab's space map histogram, that are not yet
     2245 +         * available for allocation because they are still in the freeing
     2246 +         * pipeline (e.g. freeing, freed, and defer trees). Then subtract
     2247 +         * these segments from the space map's histogram to get a more
     2248 +         * accurate weight.
     2249 +         */
     2250 +        uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
     2251 +        for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
     2252 +                deferspace_histogram[i] += msp->ms_synchist[i];
     2253 +        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
     2254 +                for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
     2255 +                        deferspace_histogram[i] += msp->ms_deferhist[t][i];
     2256 +                }
     2257 +        }
     2258 +
     2259 +        uint64_t weight = 0;
1940 2260          for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1941      -                if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1942      -                        WEIGHT_SET_COUNT(weight,
1943      -                            msp->ms_sm->sm_phys->smp_histogram[i]);
1944      -                        WEIGHT_SET_INDEX(weight, i +
1945      -                            msp->ms_sm->sm_shift);
     2261 +                ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
     2262 +                    deferspace_histogram[i]);
     2263 +                uint64_t count =
     2264 +                    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
     2265 +                if (count != 0) {
     2266 +                        WEIGHT_SET_COUNT(weight, count);
     2267 +                        WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
1946 2268                          WEIGHT_SET_ACTIVE(weight, 0);
1947 2269                          break;
1948 2270                  }
1949 2271          }
1950 2272          return (weight);
1951 2273  }
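
The subtraction performed above can be illustrated with invented histogram contents and a toy width; the real code packs the surviving count and index with WEIGHT_SET_COUNT()/WEIGHT_SET_INDEX(), which the sketch only prints:

        #include <stdint.h>
        #include <stdio.h>

        #define TOY_HIST_SIZE   8       /* toy width, not SPACE_MAP_HISTOGRAM_SIZE */

        /*
         * Subtract the segments still in the freeing pipeline (the joint
         * synchist/deferhist view) from the on-disk histogram, then weight
         * by the highest bucket that still has segments truly available.
         */
        int
        main(void)
        {
                uint64_t smp_hist[TOY_HIST_SIZE]   = { 9, 4, 2, 1, 0, 1, 0, 0 };
                uint64_t defer_hist[TOY_HIST_SIZE] = { 1, 0, 0, 0, 0, 1, 0, 0 };
                uint64_t sm_shift = 9;          /* bucket 0 = 512-byte segments */

                for (int i = TOY_HIST_SIZE - 1; i >= 0; i--) {
                        uint64_t count = smp_hist[i] - defer_hist[i];
                        if (count != 0) {
                                (void) printf("count %llu at 2^%llu bytes\n",
                                    (unsigned long long)count,
                                    (unsigned long long)(i + sm_shift));
                                break;
                        }
                }
                return (0);
        }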
1952 2274  
1953 2275  /*
1954 2276   * Compute a segment-based weight for the specified metaslab. The weight
1955 2277   * is determined by highest bucket in the histogram. The information
↓ open down ↓ 4 lines elided ↑ open up ↑
1960 2282  {
1961 2283          metaslab_group_t *mg = msp->ms_group;
1962 2284          uint64_t weight = 0;
1963 2285          uint8_t shift = mg->mg_vd->vdev_ashift;
1964 2286  
1965 2287          ASSERT(MUTEX_HELD(&msp->ms_lock));
1966 2288  
1967 2289          /*
1968 2290           * The metaslab is completely free.
1969 2291           */
1970      -        if (space_map_allocated(msp->ms_sm) == 0) {
     2292 +        if (metaslab_allocated_space(msp) == 0) {
1971 2293                  int idx = highbit64(msp->ms_size) - 1;
1972 2294                  int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
1973 2295  
1974 2296                  if (idx < max_idx) {
1975 2297                          WEIGHT_SET_COUNT(weight, 1ULL);
1976 2298                          WEIGHT_SET_INDEX(weight, idx);
1977 2299                  } else {
1978 2300                          WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
1979 2301                          WEIGHT_SET_INDEX(weight, max_idx);
1980 2302                  }
↓ open down ↓ 1 lines elided ↑ open up ↑
1982 2304                  ASSERT(!WEIGHT_IS_SPACEBASED(weight));
1983 2305  
1984 2306                  return (weight);
1985 2307          }
1986 2308  
1987 2309          ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
1988 2310  
1989 2311          /*
1990 2312           * If the metaslab is fully allocated then just make the weight 0.
1991 2313           */
1992      -        if (space_map_allocated(msp->ms_sm) == msp->ms_size)
     2314 +        if (metaslab_allocated_space(msp) == msp->ms_size)
1993 2315                  return (0);
1994 2316          /*
1995 2317           * If the metaslab is already loaded, then use the range tree to
1996 2318           * determine the weight. Otherwise, we rely on the space map information
1997 2319           * to generate the weight.
1998 2320           */
1999 2321          if (msp->ms_loaded) {
2000 2322                  weight = metaslab_weight_from_range_tree(msp);
2001 2323          } else {
2002 2324                  weight = metaslab_weight_from_spacemap(msp);
↓ open down ↓ 60 lines elided ↑ open up ↑
2063 2385  
2064 2386          metaslab_set_fragmentation(msp);
2065 2387  
2066 2388          /*
2067 2389           * Update the maximum size if the metaslab is loaded. This will
2068 2390           * ensure that we get an accurate maximum size if newly freed space
2069 2391           * has been added back into the free tree.
2070 2392           */
2071 2393          if (msp->ms_loaded)
2072 2394                  msp->ms_max_size = metaslab_block_maxsize(msp);
     2395 +        else
     2396 +                ASSERT0(msp->ms_max_size);
2073 2397  
2074 2398          /*
2075 2399           * Segment-based weighting requires space map histogram support.
2076 2400           */
2077 2401          if (zfs_metaslab_segment_weight_enabled &&
2078 2402              spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
2079 2403              (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
2080 2404              sizeof (space_map_phys_t))) {
2081 2405                  weight = metaslab_segment_weight(msp);
2082 2406          } else {
2083 2407                  weight = metaslab_space_weight(msp);
2084 2408          }
2085 2409          return (weight);
2086 2410  }
2087 2411  
     2412 +void
     2413 +metaslab_recalculate_weight_and_sort(metaslab_t *msp)
     2414 +{
      2415 +        /* note: we preserve the mask (e.g. indication of primary, etc.) */
     2416 +        uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
     2417 +        metaslab_group_sort(msp->ms_group, msp,
     2418 +            metaslab_weight(msp) | was_active);
     2419 +}
     2420 +
2088 2421  static int
2089 2422  metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
2090 2423      int allocator, uint64_t activation_weight)
2091 2424  {
2092 2425          /*
2093 2426           * If we're activating for the claim code, we don't want to actually
2094 2427           * set the metaslab up for a specific allocator.
2095 2428           */
2096 2429          if (activation_weight == METASLAB_WEIGHT_CLAIM)
2097 2430                  return (0);
↓ open down ↓ 364 lines elided ↑ open up ↑
2462 2795          if (range_tree_is_empty(alloctree) &&
2463 2796              range_tree_is_empty(msp->ms_freeing) &&
2464 2797              range_tree_is_empty(msp->ms_checkpointing) &&
2465 2798              !(msp->ms_loaded && msp->ms_condense_wanted))
2466 2799                  return;
2467 2800  
2468 2801  
2469 2802          VERIFY(txg <= spa_final_dirty_txg(spa));
2470 2803  
2471 2804          /*
2472      -         * The only state that can actually be changing concurrently with
2473      -         * metaslab_sync() is the metaslab's ms_allocatable.  No other
2474      -         * thread can be modifying this txg's alloc, freeing,
     2805 +         * The only state that can actually be changing concurrently
     2806 +         * with metaslab_sync() is the metaslab's ms_allocatable. No
     2807 +         * other thread can be modifying this txg's alloc, freeing,
2475 2808           * freed, or space_map_phys_t.  We drop ms_lock whenever we
2476      -         * could call into the DMU, because the DMU can call down to us
2477      -         * (e.g. via zio_free()) at any time.
     2809 +         * could call into the DMU, because the DMU can call down to
     2810 +         * us (e.g. via zio_free()) at any time.
2478 2811           *
2479 2812           * The spa_vdev_remove_thread() can be reading metaslab state
2480      -         * concurrently, and it is locked out by the ms_sync_lock.  Note
2481      -         * that the ms_lock is insufficient for this, because it is dropped
2482      -         * by space_map_write().
     2813 +         * concurrently, and it is locked out by the ms_sync_lock.
     2814 +         * Note that the ms_lock is insufficient for this, because it
     2815 +         * is dropped by space_map_write().
2483 2816           */
2484 2817          tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2485 2818  
2486 2819          if (msp->ms_sm == NULL) {
2487 2820                  uint64_t new_object;
2488 2821  
2489 2822                  new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
2490 2823                  VERIFY3U(new_object, !=, 0);
2491 2824  
2492 2825                  VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2493 2826                      msp->ms_start, msp->ms_size, vd->vdev_ashift));
     2827 +
2494 2828                  ASSERT(msp->ms_sm != NULL);
     2829 +                ASSERT0(metaslab_allocated_space(msp));
2495 2830          }
2496 2831  
2497 2832          if (!range_tree_is_empty(msp->ms_checkpointing) &&
2498 2833              vd->vdev_checkpoint_sm == NULL) {
2499 2834                  ASSERT(spa_has_checkpoint(spa));
2500 2835  
2501 2836                  uint64_t new_object = space_map_alloc(mos,
2502 2837                      vdev_standard_sm_blksz, tx);
2503 2838                  VERIFY3U(new_object, !=, 0);
2504 2839  
↓ open down ↓ 27 lines elided ↑ open up ↑
2532 2867                  metaslab_condense(msp, txg, tx);
2533 2868          } else {
2534 2869                  mutex_exit(&msp->ms_lock);
2535 2870                  space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
2536 2871                      SM_NO_VDEVID, tx);
2537 2872                  space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
2538 2873                      SM_NO_VDEVID, tx);
2539 2874                  mutex_enter(&msp->ms_lock);
2540 2875          }
2541 2876  
     2877 +        msp->ms_allocated_space += range_tree_space(alloctree);
     2878 +        ASSERT3U(msp->ms_allocated_space, >=,
     2879 +            range_tree_space(msp->ms_freeing));
     2880 +        msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
     2881 +
2542 2882          if (!range_tree_is_empty(msp->ms_checkpointing)) {
2543 2883                  ASSERT(spa_has_checkpoint(spa));
2544 2884                  ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
2545 2885  
2546 2886                  /*
2547 2887                   * Since we are doing writes to disk and the ms_checkpointing
2548 2888                   * tree won't be changing during that time, we drop the
2549 2889                   * ms_lock while writing to the checkpoint space map.
2550 2890                   */
2551 2891                  mutex_exit(&msp->ms_lock);
2552 2892                  space_map_write(vd->vdev_checkpoint_sm,
2553 2893                      msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
2554 2894                  mutex_enter(&msp->ms_lock);
2555      -                space_map_update(vd->vdev_checkpoint_sm);
2556 2895  
2557 2896                  spa->spa_checkpoint_info.sci_dspace +=
2558 2897                      range_tree_space(msp->ms_checkpointing);
2559 2898                  vd->vdev_stat.vs_checkpoint_space +=
2560 2899                      range_tree_space(msp->ms_checkpointing);
2561 2900                  ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
2562      -                    -vd->vdev_checkpoint_sm->sm_alloc);
     2901 +                    -space_map_allocated(vd->vdev_checkpoint_sm));
2563 2902  
2564 2903                  range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
2565 2904          }
2566 2905  
2567 2906          if (msp->ms_loaded) {
2568 2907                  /*
2569 2908                   * When the space map is loaded, we have an accurate
2570 2909                   * histogram in the range tree. This gives us an opportunity
2571 2910                   * to bring the space map's histogram up-to-date so we clear
2572 2911                   * it first before updating it.
↓ open down ↓ 24 lines elided ↑ open up ↑
2597 2936          }
2598 2937  
2599 2938          /*
2600 2939           * Always add the free space from this sync pass to the space
2601 2940           * map histogram. We want to make sure that the on-disk histogram
2602 2941           * accounts for all free space. If the space map is not loaded,
2603 2942           * then we will lose some accuracy but will correct it the next
2604 2943           * time we load the space map.
2605 2944           */
2606 2945          space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
     2946 +        metaslab_aux_histograms_update(msp);
2607 2947  
2608 2948          metaslab_group_histogram_add(mg, msp);
2609 2949          metaslab_group_histogram_verify(mg);
2610 2950          metaslab_class_histogram_verify(mg->mg_class);
2611 2951  
2612 2952          /*
2613 2953           * For sync pass 1, we avoid traversing this txg's free range tree
2614      -         * and instead will just swap the pointers for freeing and
2615      -         * freed. We can safely do this since the freed_tree is
2616      -         * guaranteed to be empty on the initial pass.
     2954 +         * and instead will just swap the pointers for freeing and freed.
     2955 +         * We can safely do this since the freed_tree is guaranteed to be
     2956 +         * empty on the initial pass.
2617 2957           */
2618 2958          if (spa_sync_pass(spa) == 1) {
2619 2959                  range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
     2960 +                ASSERT0(msp->ms_allocated_this_txg);
2620 2961          } else {
2621 2962                  range_tree_vacate(msp->ms_freeing,
2622 2963                      range_tree_add, msp->ms_freed);
2623 2964          }
     2965 +        msp->ms_allocated_this_txg += range_tree_space(alloctree);
2624 2966          range_tree_vacate(alloctree, NULL, NULL);
2625 2967  
2626 2968          ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2627 2969          ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
2628 2970              & TXG_MASK]));
2629 2971          ASSERT0(range_tree_space(msp->ms_freeing));
2630 2972          ASSERT0(range_tree_space(msp->ms_checkpointing));
2631 2973  
2632 2974          mutex_exit(&msp->ms_lock);
2633 2975  
↓ open down ↓ 57 lines elided ↑ open up ↑
2691 3033  
2692 3034          defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
2693 3035  
2694 3036          uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
2695 3037              metaslab_class_get_alloc(spa_normal_class(spa));
2696 3038          if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
2697 3039                  defer_allowed = B_FALSE;
2698 3040          }
2699 3041  
2700 3042          defer_delta = 0;
2701      -        alloc_delta = space_map_alloc_delta(msp->ms_sm);
     3043 +        alloc_delta = msp->ms_allocated_this_txg -
     3044 +            range_tree_space(msp->ms_freed);
2702 3045          if (defer_allowed) {
2703 3046                  defer_delta = range_tree_space(msp->ms_freed) -
2704 3047                      range_tree_space(*defer_tree);
2705 3048          } else {
2706 3049                  defer_delta -= range_tree_space(*defer_tree);
2707 3050          }
2708 3051  
2709 3052          metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
2710 3053              defer_delta, 0);
2711 3054  
↓ open down ↓ 11 lines elided ↑ open up ↑
2723 3066           */
2724 3067          range_tree_vacate(*defer_tree,
2725 3068              msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
2726 3069          if (defer_allowed) {
2727 3070                  range_tree_swap(&msp->ms_freed, defer_tree);
2728 3071          } else {
2729 3072                  range_tree_vacate(msp->ms_freed,
2730 3073                      msp->ms_loaded ? range_tree_add : NULL,
2731 3074                      msp->ms_allocatable);
2732 3075          }
2733      -        space_map_update(msp->ms_sm);
2734 3076  
     3077 +        msp->ms_synced_length = space_map_length(msp->ms_sm);
     3078 +
2735 3079          msp->ms_deferspace += defer_delta;
2736 3080          ASSERT3S(msp->ms_deferspace, >=, 0);
2737 3081          ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
2738 3082          if (msp->ms_deferspace != 0) {
2739 3083                  /*
2740 3084                   * Keep syncing this metaslab until all deferred frees
2741 3085                   * are back in circulation.
2742 3086                   */
2743 3087                  vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2744 3088          }
     3089 +        metaslab_aux_histograms_update_done(msp, defer_allowed);
2745 3090  
2746 3091          if (msp->ms_new) {
2747 3092                  msp->ms_new = B_FALSE;
2748 3093                  mutex_enter(&mg->mg_lock);
2749 3094                  mg->mg_ms_ready++;
2750 3095                  mutex_exit(&mg->mg_lock);
2751 3096          }
     3097 +
2752 3098          /*
2753      -         * Calculate the new weights before unloading any metaslabs.
2754      -         * This will give us the most accurate weighting.
     3099 +         * Re-sort metaslab within its group now that we've adjusted
     3100 +         * its allocatable space.
2755 3101           */
2756      -        metaslab_group_sort(mg, msp, metaslab_weight(msp) |
2757      -            (msp->ms_weight & METASLAB_ACTIVE_MASK));
     3102 +        metaslab_recalculate_weight_and_sort(msp);
2758 3103  
2759 3104          /*
2760 3105           * If the metaslab is loaded and we've not tried to load or allocate
2761 3106           * from it in 'metaslab_unload_delay' txgs, then unload it.
2762 3107           */
2763 3108          if (msp->ms_loaded &&
2764 3109              msp->ms_initializing == 0 &&
2765 3110              msp->ms_selected_txg + metaslab_unload_delay < txg) {
2766 3111                  for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
2767 3112                          VERIFY0(range_tree_space(
↓ open down ↓ 6 lines elided ↑ open up ↑
2774 3119  
2775 3120                  if (!metaslab_debug_unload)
2776 3121                          metaslab_unload(msp);
2777 3122          }
2778 3123  
2779 3124          ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
2780 3125          ASSERT0(range_tree_space(msp->ms_freeing));
2781 3126          ASSERT0(range_tree_space(msp->ms_freed));
2782 3127          ASSERT0(range_tree_space(msp->ms_checkpointing));
2783 3128  
     3129 +        msp->ms_allocated_this_txg = 0;
2784 3130          mutex_exit(&msp->ms_lock);
2785 3131  }
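
The new alloc_delta/defer_delta bookkeeping in metaslab_sync_done() above can be followed with a small arithmetic sketch; the per-txg byte counts are invented, the real values come from ms_allocated_this_txg and the range trees:

        #include <assert.h>
        #include <stdint.h>

        /* Toy numbers for one txg of metaslab_sync_done(). */
        int
        main(void)
        {
                int64_t allocated_this_txg = 96 << 20;  /* ms_allocated_this_txg */
                int64_t freed = 32 << 20;               /* range_tree_space(ms_freed) */
                int64_t defer_tree = 16 << 20;          /* space already in *defer_tree */
                int     defer_allowed = 1;

                /* net change in allocated space charged to the vdev */
                int64_t alloc_delta = allocated_this_txg - freed;

                /* net change in deferred-free space */
                int64_t defer_delta;
                if (defer_allowed)
                        defer_delta = freed - defer_tree;       /* freed rotates in */
                else
                        defer_delta = -defer_tree;              /* nothing rotates in */

                assert(alloc_delta == 64 << 20);
                assert(defer_delta == 16 << 20);
                return (0);
        }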
2786 3132  
2787 3133  void
2788 3134  metaslab_sync_reassess(metaslab_group_t *mg)
2789 3135  {
2790 3136          spa_t *spa = mg->mg_class->mc_spa;
2791 3137  
2792 3138          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2793 3139          metaslab_group_alloc_update(mg);
↓ open down ↓ 1235 lines elided ↑ open up ↑
4029 4375  
4030 4376          return (metaslab_claim_impl(vd, offset, size, txg));
4031 4377  }
4032 4378  
4033 4379  int
4034 4380  metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
4035 4381      int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
4036 4382      zio_alloc_list_t *zal, zio_t *zio, int allocator)
4037 4383  {
4038 4384          dva_t *dva = bp->blk_dva;
4039      -        dva_t *hintdva = hintbp->blk_dva;
     4385 +        dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
4040 4386          int error = 0;
4041 4387  
4042 4388          ASSERT(bp->blk_birth == 0);
4043 4389          ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
4044 4390  
4045 4391          spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4046 4392  
4047 4393          if (mc->mc_rotor == NULL) {     /* no vdevs in this class */
4048 4394                  spa_config_exit(spa, SCL_ALLOC, FTAG);
4049 4395                  return (SET_ERROR(ENOSPC));
↓ open down ↓ 146 lines elided ↑ open up ↑
4196 4542                  return;
4197 4543          }
4198 4544  
4199 4545          ASSERT(vdev_is_concrete(vd));
4200 4546          ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
4201 4547          ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
4202 4548  
4203 4549          msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
4204 4550  
4205 4551          mutex_enter(&msp->ms_lock);
4206      -        if (msp->ms_loaded)
4207      -                range_tree_verify(msp->ms_allocatable, offset, size);
     4552 +        if (msp->ms_loaded) {
     4553 +                range_tree_verify_not_present(msp->ms_allocatable,
     4554 +                    offset, size);
     4555 +        }
4208 4556  
4209      -        range_tree_verify(msp->ms_freeing, offset, size);
4210      -        range_tree_verify(msp->ms_checkpointing, offset, size);
4211      -        range_tree_verify(msp->ms_freed, offset, size);
     4557 +        range_tree_verify_not_present(msp->ms_freeing, offset, size);
     4558 +        range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
     4559 +        range_tree_verify_not_present(msp->ms_freed, offset, size);
4212 4560          for (int j = 0; j < TXG_DEFER_SIZE; j++)
4213      -                range_tree_verify(msp->ms_defer[j], offset, size);
     4561 +                range_tree_verify_not_present(msp->ms_defer[j], offset, size);
4214 4562          mutex_exit(&msp->ms_lock);
4215 4563  }
4216 4564  
4217 4565  void
4218 4566  metaslab_check_free(spa_t *spa, const blkptr_t *bp)
4219 4567  {
4220 4568          if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
4221 4569                  return;
4222 4570  
4223 4571          spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
↓ open down ↓ 15 lines elided ↑ open up ↑