10592 misc. metaslab and vdev related ZoL bug fixes
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Giuseppe Di Natale <guss80@gmail.com>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Tony Hutter <hutter2@llnl.gov>
Reviewed by: Kody Kantor <kody.kantor@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>

--- old/usr/src/uts/common/fs/zfs/vdev_removal.c
+++ new/usr/src/uts/common/fs/zfs/vdev_removal.c
[ 275 lines elided ]
 276  276          /*
 277  277           * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
 278  278           * there may be space in the defer tree, which is free, but still
 279  279           * counted in vs_alloc.
 280  280           */
 281  281          for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
 282  282                  metaslab_t *ms = vd->vdev_ms[i];
 283  283                  if (ms->ms_sm == NULL)
 284  284                          continue;
 285  285  
 286      -                /*
 287      -                 * Sync tasks happen before metaslab_sync(), therefore
 288      -                 * smp_alloc and sm_alloc must be the same.
 289      -                 */
 290      -                ASSERT3U(space_map_allocated(ms->ms_sm), ==,
 291      -                    ms->ms_sm->sm_phys->smp_alloc);
 292      -
 293  286                  spa->spa_removing_phys.sr_to_copy +=
 294      -                    space_map_allocated(ms->ms_sm);
      287 +                    metaslab_allocated_space(ms);
 295  288  
 296  289                  /*
 297  290                   * Space which we are freeing this txg does not need to
 298  291                   * be copied.
 299  292                   */
 300  293                  spa->spa_removing_phys.sr_to_copy -=
 301  294                      range_tree_space(ms->ms_freeing);
 302  295  
 303  296                  ASSERT0(range_tree_space(ms->ms_freed));
 304  297                  for (int t = 0; t < TXG_SIZE; t++)
[ 1089 lines elided ]
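The helper that the hunk above switches to, metaslab_allocated_space(), is not itself shown in this webrev. Upstream it is a thin accessor over a per-metaslab counter that metaslab_sync() keeps consistent with the space map's smp_alloc between sync phases, which is why the removed "smp_alloc and sm_alloc must be the same" assertion is no longer needed. A minimal, self-contained sketch, assuming the ms_allocated_space field that the same change set introduces:

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for the metaslab fields this sketch needs. */
typedef struct metaslab {
	uint64_t ms_allocated_space;	/* kept in sync by metaslab_sync() */
} metaslab_t;

/*
 * Sketch of the accessor this hunk calls (assumed shape; the
 * definition is not part of this webrev).
 */
static uint64_t
metaslab_allocated_space(metaslab_t *msp)
{
	return (msp->ms_allocated_space);
}

int
main(void)
{
	metaslab_t ms = { 1ULL << 20 };

	printf("%llu bytes allocated\n",
	    (unsigned long long)metaslab_allocated_space(&ms));
	return (0);
}

Because the counter is valid even at sync-task time, the caller no longer has to prove that the in-core and on-disk space map totals agree before using them.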
1394 1387                  /*
1395 1388                   * If the metaslab has ever been allocated from (ms_sm!=NULL),
1396 1389                   * read the allocated segments from the space map object
1397 1390                   * into svr_allocd_segs. Since we do this while holding
1398 1391                   * svr_lock and ms_sync_lock, concurrent frees (which
1399 1392                   * would have modified the space map) will wait for us
1400 1393                   * to finish loading the spacemap, and then take the
1401 1394                   * appropriate action (see free_from_removing_vdev()).
1402 1395                   */
1403 1396                  if (msp->ms_sm != NULL) {
1404      -                        space_map_t *sm = NULL;
     1397 +                        VERIFY0(space_map_load(msp->ms_sm,
     1398 +                            svr->svr_allocd_segs, SM_ALLOC));
1405 1399  
1406      -                        /*
1407      -                         * We have to open a new space map here, because
1408      -                         * ms_sm's sm_length and sm_alloc may not reflect
1409      -                         * what's in the object contents, if we are in between
1410      -                         * metaslab_sync() and metaslab_sync_done().
1411      -                         */
1412      -                        VERIFY0(space_map_open(&sm,
1413      -                            spa->spa_dsl_pool->dp_meta_objset,
1414      -                            msp->ms_sm->sm_object, msp->ms_sm->sm_start,
1415      -                            msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
1416      -                        space_map_update(sm);
1417      -                        VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
1418      -                            SM_ALLOC));
1419      -                        space_map_close(sm);
1420      -
1421 1400                          range_tree_walk(msp->ms_freeing,
1422 1401                              range_tree_remove, svr->svr_allocd_segs);
1423 1402  
1424 1403                          /*
1425 1404                           * When we are resuming from a paused removal (i.e.
1426 1405                           * when importing a pool with a removal in progress),
1427 1406                           * discard any state that we have already processed.
1428 1407                           */
1429 1408                          range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
1430 1409                  }
[ 173 lines elided ]
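For context on the space_map_load(..., SM_ALLOC) call that replaces the temporary space map above: a space map is an append-only log of ALLOC and FREE records, and loading it replays that log into a range tree so the tree ends up holding exactly the segments still allocated. The old code opened a second space_map_t because the in-core sm_length and sm_alloc could lag the object contents between metaslab_sync() and metaslab_sync_done(); the simplification presumably relies on the same change set making the load read lengths from the on-disk phys block directly. A self-contained sketch of the replay idea, using simplified stand-in types rather than the kernel's space_map_t and range_tree_t:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's record stream. */
typedef enum { SM_REC_ALLOC, SM_REC_FREE } sm_rectype_t;

typedef struct sm_record {
	sm_rectype_t	type;
	uint64_t	start;
	uint64_t	size;
} sm_record_t;

/* Toy "range tree": one coverage flag per 512-byte chunk. */
#define	CHUNKS	64
static uint8_t covered[CHUNKS];

static void
range_add(uint64_t start, uint64_t size)
{
	for (uint64_t c = start / 512; c < (start + size) / 512; c++)
		covered[c] = 1;
}

static void
range_remove(uint64_t start, uint64_t size)
{
	for (uint64_t c = start / 512; c < (start + size) / 512; c++)
		covered[c] = 0;
}

/*
 * Model of space_map_load(sm, rt, SM_ALLOC): replay the log in order,
 * adding ALLOC records and removing FREE records, so the tree ends up
 * describing exactly the segments that are still allocated.
 */
static void
sm_load_alloc(const sm_record_t *log, int nrecs)
{
	for (int i = 0; i < nrecs; i++) {
		if (log[i].type == SM_REC_ALLOC)
			range_add(log[i].start, log[i].size);
		else
			range_remove(log[i].start, log[i].size);
	}
}

int
main(void)
{
	sm_record_t log[] = {
		{ SM_REC_ALLOC, 0, 4096 },	/* allocate [0, 4096)      */
		{ SM_REC_FREE, 1024, 512 },	/* later free [1024, 1536) */
	};

	sm_load_alloc(log, 2);
	assert(covered[0] && !covered[2] && covered[3]);
	printf("allocated segments reconstructed from the log\n");
	return (0);
}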
1604 1583                  /*
1605 1584                   * Assert nothing in flight -- ms_*tree is empty.
1606 1585                   */
1607 1586                  for (int i = 0; i < TXG_SIZE; i++)
1608 1587                          ASSERT0(range_tree_space(msp->ms_allocating[i]));
1609 1588                  for (int i = 0; i < TXG_DEFER_SIZE; i++)
1610 1589                          ASSERT0(range_tree_space(msp->ms_defer[i]));
1611 1590                  ASSERT0(range_tree_space(msp->ms_freed));
1612 1591  
1613 1592                  if (msp->ms_sm != NULL) {
1614      -                        /*
1615      -                         * Assert that the in-core spacemap has the same
1616      -                         * length as the on-disk one, so we can use the
1617      -                         * existing in-core spacemap to load it from disk.
1618      -                         */
1619      -                        ASSERT3U(msp->ms_sm->sm_alloc, ==,
1620      -                            msp->ms_sm->sm_phys->smp_alloc);
1621      -                        ASSERT3U(msp->ms_sm->sm_length, ==,
1622      -                            msp->ms_sm->sm_phys->smp_objsize);
1623      -
1624 1593                          mutex_enter(&svr->svr_lock);
1625 1594                          VERIFY0(space_map_load(msp->ms_sm,
1626 1595                              svr->svr_allocd_segs, SM_ALLOC));
1627 1596                          range_tree_walk(msp->ms_freeing,
1628 1597                              range_tree_remove, svr->svr_allocd_segs);
1629 1598  
1630 1599                          /*
1631 1600                           * Clear everything past what has been synced,
1632 1601                           * because we have not allocated mappings for it yet.
1633 1602                           */
[ 72 lines elided ]
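Both hunks above use the same subtraction idiom: range_tree_walk() visits every segment of ms_freeing and hands each one to range_tree_remove() with svr_allocd_segs as the argument, deleting in-flight frees from the set of segments that still need copying. A self-contained sketch of that callback shape, with simplified, hypothetical types; the kernel callback signature is (void *arg, uint64_t start, uint64_t size):

#include <stdint.h>
#include <stdio.h>

/* Callback shape used by the walk, mirroring range_tree_func_t. */
typedef void walk_func_t(void *arg, uint64_t start, uint64_t size);

typedef struct seg {
	uint64_t start;
	uint64_t size;
} seg_t;

typedef struct seglist {
	seg_t	segs[16];
	int	nsegs;
} seglist_t;

/* Walk every segment, handing each to the callback with its arg. */
static void
seglist_walk(const seglist_t *sl, walk_func_t *func, void *arg)
{
	for (int i = 0; i < sl->nsegs; i++)
		func(arg, sl->segs[i].start, sl->segs[i].size);
}

/* Toy "remove": report what the kernel would delete from the tree. */
static void
seg_remove(void *arg, uint64_t start, uint64_t size)
{
	const char *treename = arg;

	printf("remove [%llu, %llu) from %s\n",
	    (unsigned long long)start,
	    (unsigned long long)(start + size), treename);
}

int
main(void)
{
	/* Segments being freed this txg do not need to be copied. */
	seglist_t freeing = { .segs = { { 0, 512 }, { 8192, 1024 } },
	    .nsegs = 2 };

	seglist_walk(&freeing, seg_remove, "svr_allocd_segs");
	return (0);
}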
1706 1675          if (error == 0) {
1707 1676                  spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
1708 1677                  vdev_t *vd = vdev_lookup_top(spa, vdid);
1709 1678                  metaslab_group_activate(vd->vdev_mg);
1710 1679                  spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
1711 1680          }
1712 1681  
1713 1682          return (error);
1714 1683  }
1715 1684  
1716      -/*
1717      - * Called every sync pass of every txg if there's a svr.
1718      - */
1719 1685  void
1720 1686  svr_sync(spa_t *spa, dmu_tx_t *tx)
1721 1687  {
1722 1688          spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1723 1689          int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
1724 1690  
1725 1691          /*
1726 1692           * This check is necessary so that we do not dirty the
1727 1693           * DIRECTORY_OBJECT via spa_sync_removing_state() when there
1728 1694           * is nothing to do.  Dirtying it every time would prevent us
[ 43 lines elided ]
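The comment shown above refers to a guard in the elided lines that returns early when this txg copied nothing, so an otherwise idle pool can sync to convergence. A hedged, self-contained model of that pattern, with hypothetical names standing in for the svr fields and spa_sync_removing_state():

#include <stdio.h>

#define	TXG_SIZE	4	/* stock ZFS value; TXG_MASK is SIZE-1 */

/* Hypothetical stand-in for spa_vdev_removal_t's per-txg counters. */
struct removal_state {
	unsigned long bytes_done[TXG_SIZE];
};

static void
sync_removing_state(void)
{
	/* In the kernel this would dirty the MOS DIRECTORY_OBJECT. */
	printf("dirtying removal state\n");
}

static void
model_svr_sync(struct removal_state *svr, unsigned long txg)
{
	int txgoff = txg & (TXG_SIZE - 1);

	/*
	 * Model of the guard the comment describes: if this txg copied
	 * no bytes, return before dirtying anything.
	 */
	if (svr->bytes_done[txgoff] == 0)
		return;

	sync_removing_state();
}

int
main(void)
{
	struct removal_state svr = { { 0, 4096, 0, 0 } };

	model_svr_sync(&svr, 10);	/* txg 10 -> txgoff 2: no-op   */
	model_svr_sync(&svr, 9);	/* txg 9  -> txgoff 1: dirties */
	return (0);
}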
1772 1738   */
1773 1739  static int
1774 1740  spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
1775 1741  {
1776 1742          metaslab_group_t *mg = vd->vdev_mg;
1777 1743          spa_t *spa = vd->vdev_spa;
1778 1744          int error = 0;
1779 1745  
1780 1746          ASSERT(vd->vdev_islog);
1781 1747          ASSERT(vd == vd->vdev_top);
     1748 +        ASSERT(MUTEX_HELD(&spa_namespace_lock));
1782 1749  
1783 1750          /*
1784 1751           * Stop allocating from this vdev.
1785 1752           */
1786 1753          metaslab_group_passivate(mg);
1787 1754  
1788 1755          /*
1789 1756           * Wait for the youngest allocations and frees to sync,
1790 1757           * and then wait for the deferral of those frees to finish.
1791 1758           */
1792 1759          spa_vdev_config_exit(spa, NULL,
1793 1760              *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
1794 1761  
1795 1762          /*
1796      -         * Evacuate the device.  We don't hold the config lock as writer
1797      -         * since we need to do I/O but we do keep the
     1763 +         * Evacuate the device.  We don't hold the config lock as
     1764 +         * writer since we need to do I/O but we do keep the
1798 1765           * spa_namespace_lock held.  Once this completes the device
1799 1766           * should no longer have any blocks allocated on it.
1800 1767           */
1801      -        if (vd->vdev_islog) {
1802      -                if (vd->vdev_stat.vs_alloc != 0)
1803      -                        error = spa_reset_logs(spa);
1804      -        }
     1768 +        ASSERT(MUTEX_HELD(&spa_namespace_lock));
     1769 +        if (vd->vdev_stat.vs_alloc != 0)
     1770 +                error = spa_reset_logs(spa);
1805 1771  
1806 1772          *txg = spa_vdev_config_enter(spa);
1807 1773  
1808 1774          if (error != 0) {
1809 1775                  metaslab_group_activate(mg);
1810 1776                  return (error);
1811 1777          }
1812 1778          ASSERT0(vd->vdev_stat.vs_alloc);
1813 1779  
1814 1780          /*
1815 1781           * The evacuation succeeded.  Remove any remaining MOS metadata
1816 1782           * associated with this vdev, and wait for these changes to sync.
1817 1783           */
1818 1784          vd->vdev_removing = B_TRUE;
1819 1785  
1820 1786          vdev_dirty_leaves(vd, VDD_DTL, *txg);
1821 1787          vdev_config_dirty(vd);
1822 1788  
     1789 +        vdev_metaslab_fini(vd);
     1790 +
1823 1791          spa_history_log_internal(spa, "vdev remove", NULL,
1824 1792              "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
1825 1793              (vd->vdev_path != NULL) ? vd->vdev_path : "-");
1826 1794  
1827 1795          /* Make sure these changes are sync'ed */
1828 1796          spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
1829 1797  
1830 1798          /* Stop initializing */
1831 1799          (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
1832 1800  
[ 9 lines elided ]
1842 1810          /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
1843 1811          ASSERT0(vd->vdev_leaf_zap);
1844 1812  
1845 1813          (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1846 1814  
1847 1815          if (list_link_active(&vd->vdev_state_dirty_node))
1848 1816                  vdev_state_clean(vd);
1849 1817          if (list_link_active(&vd->vdev_config_dirty_node))
1850 1818                  vdev_config_clean(vd);
1851 1819  
     1820 +        ASSERT0(vd->vdev_stat.vs_alloc);
     1821 +
1852 1822          /*
1853 1823           * Clean up the vdev namespace.
1854 1824           */
1855 1825          vdev_remove_make_hole_and_free(vd);
1856 1826  
1857 1827          if (ev != NULL)
1858 1828                  spa_event_post(ev);
1859 1829  
1860 1830          return (0);
1861 1831  }
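Two notes on spa_vdev_remove_log() as amended above. First, the wait after metaslab_group_passivate() targets *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE; with the stock values of 3 concurrent txg states and 2 deferred-free txgs, that is five txgs, enough for the youngest allocations and their deferred frees to reach disk. Second, the function follows a passivate / evacuate / reactivate-on-failure shape. A self-contained sketch of that control flow, with hypothetical names standing in for metaslab_group_passivate(), spa_reset_logs(), and metaslab_group_activate():

#include <stdio.h>

/* Hypothetical stand-in for the metaslab group being removed. */
struct group {
	int	active;
	long	alloc;
};

static void
group_passivate(struct group *g)
{
	g->active = 0;	/* stop new allocations from this group */
}

static void
group_activate(struct group *g)
{
	g->active = 1;	/* resume allocations (error recovery path) */
}

/* Stand-in for spa_reset_logs(); returns 0 on success. */
static int
evacuate(struct group *g)
{
	g->alloc = 0;
	return (0);
}

/*
 * Model of the spa_vdev_remove_log() flow: stop allocations first,
 * evacuate only if anything is still allocated, and if evacuation
 * fails, reactivate the group so the device keeps working.
 */
static int
remove_log_device(struct group *g)
{
	int error = 0;

	group_passivate(g);

	if (g->alloc != 0)
		error = evacuate(g);

	if (error != 0) {
		group_activate(g);
		return (error);
	}

	/* Success: proceed to tear down metaslabs and labels. */
	return (0);
}

int
main(void)
{
	struct group g = { 1, 4096 };

	printf("remove: %d, active after: %d, alloc after: %ld\n",
	    remove_log_device(&g), g.active, g.alloc);
	return (0);
}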
[ 324 lines elided ]