266 vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
267 vd->vdev_indirect_births =
268 vdev_indirect_births_open(mos, vic->vic_births_object);
269 spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
270 spa->spa_removing_phys.sr_start_time = gethrestime_sec();
271 spa->spa_removing_phys.sr_end_time = 0;
272 spa->spa_removing_phys.sr_state = DSS_SCANNING;
273 spa->spa_removing_phys.sr_to_copy = 0;
274 spa->spa_removing_phys.sr_copied = 0;
275
276 /*
277 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
278 * there may be space in the defer tree, which is free, but still
279 * counted in vs_alloc.
280 */
281 for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
282 metaslab_t *ms = vd->vdev_ms[i];
283 if (ms->ms_sm == NULL)
284 continue;
285
286 /*
287 * Sync tasks happen before metaslab_sync(), therefore
288 * smp_alloc and sm_alloc must be the same.
289 */
290 ASSERT3U(space_map_allocated(ms->ms_sm), ==,
291 ms->ms_sm->sm_phys->smp_alloc);
292
293 spa->spa_removing_phys.sr_to_copy +=
294 space_map_allocated(ms->ms_sm);
295
296 /*
297 * Space which we are freeing this txg does not need to
298 * be copied.
299 */
300 spa->spa_removing_phys.sr_to_copy -=
301 range_tree_space(ms->ms_freeing);
302
303 ASSERT0(range_tree_space(ms->ms_freed));
304 for (int t = 0; t < TXG_SIZE; t++)
305 ASSERT0(range_tree_space(ms->ms_allocating[t]));
306 }
307
308 /*
309 * Sync tasks are called before metaslab_sync(), so there should
310 * be no already-synced metaslabs in the TXG_CLEAN list.
311 */
312 ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
313
314 spa_sync_removing_state(spa, tx);
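/*
 * A rough sketch, not taken from this file: the sr_to_copy and sr_copied
 * counters initialized above are what an active removal's progress report
 * (e.g. "zpool status") is derived from, roughly:
 *
 *	pct_done = (sr_to_copy == 0) ? 100 : sr_copied * 100 / sr_to_copy;
 *
 * sr_copied is advanced by svr_sync() (further below) as segments are
 * copied off the removing vdev.
 */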
1384 mutex_enter(&msp->ms_sync_lock);
1385 mutex_enter(&msp->ms_lock);
1386
1387 /*
1388 * Assert nothing in flight -- ms_*tree is empty.
1389 */
1390 for (int i = 0; i < TXG_SIZE; i++) {
1391 ASSERT0(range_tree_space(msp->ms_allocating[i]));
1392 }
1393
1394 /*
1395	 * If the metaslab has ever been allocated from (ms_sm != NULL),
1396 * read the allocated segments from the space map object
1397 * into svr_allocd_segs. Since we do this while holding
1398 * svr_lock and ms_sync_lock, concurrent frees (which
1399 * would have modified the space map) will wait for us
1400 * to finish loading the spacemap, and then take the
1401 * appropriate action (see free_from_removing_vdev()).
1402 */
1403 if (msp->ms_sm != NULL) {
1404 space_map_t *sm = NULL;
1405
1406 /*
1407 * We have to open a new space map here, because
1408 * ms_sm's sm_length and sm_alloc may not reflect
1409 * what's in the object contents, if we are in between
1410 * metaslab_sync() and metaslab_sync_done().
1411 */
1412 VERIFY0(space_map_open(&sm,
1413 spa->spa_dsl_pool->dp_meta_objset,
1414 msp->ms_sm->sm_object, msp->ms_sm->sm_start,
1415 msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
1416 space_map_update(sm);
1417 VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
1418 SM_ALLOC));
1419 space_map_close(sm);
1420
1421 range_tree_walk(msp->ms_freeing,
1422 range_tree_remove, svr->svr_allocd_segs);
1423
1424 /*
1425 * When we are resuming from a paused removal (i.e.
1426 * when importing a pool with a removal in progress),
1427 * discard any state that we have already processed.
1428 */
1429 range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
1430 }
1431 mutex_exit(&msp->ms_lock);
1432 mutex_exit(&msp->ms_sync_lock);
1433
1434 vca.vca_msp = msp;
1435 zfs_dbgmsg("copying %llu segments for metaslab %llu",
1436 avl_numnodes(&svr->svr_allocd_segs->rt_root),
1437 msp->ms_id);
1438
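/*
 * The loop below (truncated in this excerpt) drains svr_allocd_segs a
 * chunk at a time, copying each chunk to the remaining vdevs and
 * recording the corresponding indirect-mapping entries -- presumably via
 * spa_vdev_copy_impl() in the full source.
 */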
1439 while (!svr->svr_thread_exit &&
1440 !range_tree_is_empty(svr->svr_allocd_segs)) {
1594 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
1595 metaslab_t *msp = vd->vdev_ms[msi];
1596
1597 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
1598 break;
1599
1600 ASSERT0(range_tree_space(svr->svr_allocd_segs));
1601
1602 mutex_enter(&msp->ms_lock);
1603
1604 /*
1605 * Assert nothing in flight -- ms_*tree is empty.
1606 */
1607 for (int i = 0; i < TXG_SIZE; i++)
1608 ASSERT0(range_tree_space(msp->ms_allocating[i]));
1609 for (int i = 0; i < TXG_DEFER_SIZE; i++)
1610 ASSERT0(range_tree_space(msp->ms_defer[i]));
1611 ASSERT0(range_tree_space(msp->ms_freed));
1612
1613 if (msp->ms_sm != NULL) {
1614 /*
1615 * Assert that the in-core spacemap has the same
1616 * length as the on-disk one, so we can use the
1617 * existing in-core spacemap to load it from disk.
1618 */
1619 ASSERT3U(msp->ms_sm->sm_alloc, ==,
1620 msp->ms_sm->sm_phys->smp_alloc);
1621 ASSERT3U(msp->ms_sm->sm_length, ==,
1622 msp->ms_sm->sm_phys->smp_objsize);
1623
1624 mutex_enter(&svr->svr_lock);
1625 VERIFY0(space_map_load(msp->ms_sm,
1626 svr->svr_allocd_segs, SM_ALLOC));
1627 range_tree_walk(msp->ms_freeing,
1628 range_tree_remove, svr->svr_allocd_segs);
1629
1630 /*
1631 * Clear everything past what has been synced,
1632 * because we have not allocated mappings for it yet.
1633 */
1634 uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
1635 uint64_t sm_end = msp->ms_sm->sm_start +
1636 msp->ms_sm->sm_size;
1637 if (sm_end > syncd)
1638 range_tree_clear(svr->svr_allocd_segs,
1639 syncd, sm_end - syncd);
1640
1641 mutex_exit(&svr->svr_lock);
1642 }
1643 mutex_exit(&msp->ms_lock);
1696
1697 if (spa->spa_vdev_removal == NULL)
1698 return (ENOTACTIVE);
1699
1700 uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
1701
1702 int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
1703 spa_vdev_remove_cancel_sync, NULL, 0,
1704 ZFS_SPACE_CHECK_EXTRA_RESERVED);
1705
1706 if (error == 0) {
1707 spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
1708 vdev_t *vd = vdev_lookup_top(spa, vdid);
1709 metaslab_group_activate(vd->vdev_mg);
1710 spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
1711 }
1712
1713 return (error);
1714 }
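/*
 * Context, as an assumption about the call path rather than something
 * stated here: this cancel handler is what "zpool remove -s <pool>"
 * ultimately reaches.  On success, reactivating the metaslab group above
 * lets the vdev accept new allocations again.
 */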
1715
1716 /*
1717 * Called every sync pass of every txg if there's a svr.
1718 */
1719 void
1720 svr_sync(spa_t *spa, dmu_tx_t *tx)
1721 {
1722 spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1723 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
1724
1725 /*
1726 * This check is necessary so that we do not dirty the
1727 * DIRECTORY_OBJECT via spa_sync_removing_state() when there
1728 * is nothing to do. Dirtying it every time would prevent us
1729 * from syncing-to-convergence.
1730 */
1731 if (svr->svr_bytes_done[txgoff] == 0)
1732 return;
1733
1734 /*
1735 * Update progress accounting.
1736 */
1737 spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
1738 svr->svr_bytes_done[txgoff] = 0;
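/*
 * In the complete function (elided in this excerpt) the refreshed
 * counters are then persisted via spa_sync_removing_state(spa, tx) --
 * the very call the early-return check above exists to avoid when there
 * is nothing to record.
 */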
1762 vdev_config_dirty(rvd);
1763
1764 /*
1765 * Reassess the health of our root vdev.
1766 */
1767 vdev_reopen(rvd);
1768 }
1769
1770 /*
1771 * Remove a log device. The config lock is held for the specified TXG.
1772 */
1773 static int
1774 spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
1775 {
1776 metaslab_group_t *mg = vd->vdev_mg;
1777 spa_t *spa = vd->vdev_spa;
1778 int error = 0;
1779
1780 ASSERT(vd->vdev_islog);
1781 ASSERT(vd == vd->vdev_top);
1782
1783 /*
1784 * Stop allocating from this vdev.
1785 */
1786 metaslab_group_passivate(mg);
1787
1788 /*
1789 * Wait for the youngest allocations and frees to sync,
1790 * and then wait for the deferral of those frees to finish.
1791 */
1792 spa_vdev_config_exit(spa, NULL,
1793 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
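/*
 * With TXG_CONCURRENT_STATES (3) open/quiescing/syncing txgs plus
 * TXG_DEFER_SIZE (2) txgs of deferred frees, this waits roughly five
 * txgs past *txg, by which point any allocation or free that raced with
 * the passivation above has fully settled on disk.
 */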
1794
1795 /*
1796 * Evacuate the device. We don't hold the config lock as writer
1797 * since we need to do I/O but we do keep the
1798 * spa_namespace_lock held. Once this completes the device
1799 * should no longer have any blocks allocated on it.
1800 */
1801 if (vd->vdev_islog) {
1802 if (vd->vdev_stat.vs_alloc != 0)
1803 error = spa_reset_logs(spa);
1804 }
1805
1806 *txg = spa_vdev_config_enter(spa);
1807
1808 if (error != 0) {
1809 metaslab_group_activate(mg);
1810 return (error);
1811 }
1812 ASSERT0(vd->vdev_stat.vs_alloc);
1813
1814 /*
1815 * The evacuation succeeded. Remove any remaining MOS metadata
1816 * associated with this vdev, and wait for these changes to sync.
1817 */
1818 vd->vdev_removing = B_TRUE;
1819
1820 vdev_dirty_leaves(vd, VDD_DTL, *txg);
1821 vdev_config_dirty(vd);
1822
1823 spa_history_log_internal(spa, "vdev remove", NULL,
1824 "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
1825 (vd->vdev_path != NULL) ? vd->vdev_path : "-");
1826
1827 /* Make sure these changes are sync'ed */
1828 spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
1829
1830 /* Stop initializing */
1831 (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
1832
1833 *txg = spa_vdev_config_enter(spa);
1834
1835 sysevent_t *ev = spa_event_create(spa, vd, NULL,
1836 ESC_ZFS_VDEV_REMOVE_DEV);
1837 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1838 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1839
1840 /* The top ZAP should have been destroyed by vdev_remove_empty. */
1841 ASSERT0(vd->vdev_top_zap);
1842 /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
1843 ASSERT0(vd->vdev_leaf_zap);
1844
1845 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1846
1847 if (list_link_active(&vd->vdev_state_dirty_node))
1848 vdev_state_clean(vd);
1849 if (list_link_active(&vd->vdev_config_dirty_node))
1850 vdev_config_clean(vd);
1851
1852 /*
1853 * Clean up the vdev namespace.
1854 */
1855 vdev_remove_make_hole_and_free(vd);
1856
1857 if (ev != NULL)
1858 spa_event_post(ev);
1859
1860 return (0);
1861 }
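/*
 * A reading of the code above rather than an original comment: log-device
 * removal never needs an indirect mapping.  spa_reset_logs() causes the
 * pool's intent-log blocks on this device to be freed (future ZIL writes
 * land elsewhere since the group was passivated), so once vs_alloc drops
 * to zero there is nothing left to copy, unlike the data-vdev removal
 * path earlier in this file.
 */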
1862
1863 static int
1864 spa_vdev_remove_top_check(vdev_t *vd)
1865 {
1866 spa_t *spa = vd->vdev_spa;
1867
1868 if (vd != vd->vdev_top)
1869 return (SET_ERROR(ENOTSUP));
1870
1871 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
266 vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
267 vd->vdev_indirect_births =
268 vdev_indirect_births_open(mos, vic->vic_births_object);
269 spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
270 spa->spa_removing_phys.sr_start_time = gethrestime_sec();
271 spa->spa_removing_phys.sr_end_time = 0;
272 spa->spa_removing_phys.sr_state = DSS_SCANNING;
273 spa->spa_removing_phys.sr_to_copy = 0;
274 spa->spa_removing_phys.sr_copied = 0;
275
276 /*
277 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
278 * there may be space in the defer tree, which is free, but still
279 * counted in vs_alloc.
280 */
281 for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
282 metaslab_t *ms = vd->vdev_ms[i];
283 if (ms->ms_sm == NULL)
284 continue;
285
286 spa->spa_removing_phys.sr_to_copy +=
287 metaslab_allocated_space(ms);
288
289 /*
290 * Space which we are freeing this txg does not need to
291 * be copied.
292 */
293 spa->spa_removing_phys.sr_to_copy -=
294 range_tree_space(ms->ms_freeing);
295
296 ASSERT0(range_tree_space(ms->ms_freed));
297 for (int t = 0; t < TXG_SIZE; t++)
298 ASSERT0(range_tree_space(ms->ms_allocating[t]));
299 }
300
301 /*
302 * Sync tasks are called before metaslab_sync(), so there should
303 * be no already-synced metaslabs in the TXG_CLEAN list.
304 */
305 ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
306
307 spa_sync_removing_state(spa, tx);
1377 mutex_enter(&msp->ms_sync_lock);
1378 mutex_enter(&msp->ms_lock);
1379
1380 /*
1381 * Assert nothing in flight -- ms_*tree is empty.
1382 */
1383 for (int i = 0; i < TXG_SIZE; i++) {
1384 ASSERT0(range_tree_space(msp->ms_allocating[i]));
1385 }
1386
1387 /*
1388	 * If the metaslab has ever been allocated from (ms_sm != NULL),
1389 * read the allocated segments from the space map object
1390 * into svr_allocd_segs. Since we do this while holding
1391 * svr_lock and ms_sync_lock, concurrent frees (which
1392 * would have modified the space map) will wait for us
1393 * to finish loading the spacemap, and then take the
1394 * appropriate action (see free_from_removing_vdev()).
1395 */
1396 if (msp->ms_sm != NULL) {
1397 VERIFY0(space_map_load(msp->ms_sm,
1398 svr->svr_allocd_segs, SM_ALLOC));
1399
1400 range_tree_walk(msp->ms_freeing,
1401 range_tree_remove, svr->svr_allocd_segs);
1402
1403 /*
1404 * When we are resuming from a paused removal (i.e.
1405 * when importing a pool with a removal in progress),
1406 * discard any state that we have already processed.
1407 */
1408 range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
1409 }
1410 mutex_exit(&msp->ms_lock);
1411 mutex_exit(&msp->ms_sync_lock);
1412
1413 vca.vca_msp = msp;
1414 zfs_dbgmsg("copying %llu segments for metaslab %llu",
1415 avl_numnodes(&svr->svr_allocd_segs->rt_root),
1416 msp->ms_id);
1417
1418 while (!svr->svr_thread_exit &&
1419 !range_tree_is_empty(svr->svr_allocd_segs)) {
1573 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
1574 metaslab_t *msp = vd->vdev_ms[msi];
1575
1576 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
1577 break;
1578
1579 ASSERT0(range_tree_space(svr->svr_allocd_segs));
1580
1581 mutex_enter(&msp->ms_lock);
1582
1583 /*
1584 * Assert nothing in flight -- ms_*tree is empty.
1585 */
1586 for (int i = 0; i < TXG_SIZE; i++)
1587 ASSERT0(range_tree_space(msp->ms_allocating[i]));
1588 for (int i = 0; i < TXG_DEFER_SIZE; i++)
1589 ASSERT0(range_tree_space(msp->ms_defer[i]));
1590 ASSERT0(range_tree_space(msp->ms_freed));
1591
1592 if (msp->ms_sm != NULL) {
1593 mutex_enter(&svr->svr_lock);
1594 VERIFY0(space_map_load(msp->ms_sm,
1595 svr->svr_allocd_segs, SM_ALLOC));
1596 range_tree_walk(msp->ms_freeing,
1597 range_tree_remove, svr->svr_allocd_segs);
1598
1599 /*
1600 * Clear everything past what has been synced,
1601 * because we have not allocated mappings for it yet.
1602 */
1603 uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
1604 uint64_t sm_end = msp->ms_sm->sm_start +
1605 msp->ms_sm->sm_size;
1606 if (sm_end > syncd)
1607 range_tree_clear(svr->svr_allocd_segs,
1608 syncd, sm_end - syncd);
1609
1610 mutex_exit(&svr->svr_lock);
1611 }
1612 mutex_exit(&msp->ms_lock);
1665
1666 if (spa->spa_vdev_removal == NULL)
1667 return (ENOTACTIVE);
1668
1669 uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
1670
1671 int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
1672 spa_vdev_remove_cancel_sync, NULL, 0,
1673 ZFS_SPACE_CHECK_EXTRA_RESERVED);
1674
1675 if (error == 0) {
1676 spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
1677 vdev_t *vd = vdev_lookup_top(spa, vdid);
1678 metaslab_group_activate(vd->vdev_mg);
1679 spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
1680 }
1681
1682 return (error);
1683 }
1684
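/*
 * Called every sync pass of every txg if there's a svr.
 */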
1685 void
1686 svr_sync(spa_t *spa, dmu_tx_t *tx)
1687 {
1688 spa_vdev_removal_t *svr = spa->spa_vdev_removal;
1689 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
1690
1691 /*
1692 * This check is necessary so that we do not dirty the
1693 * DIRECTORY_OBJECT via spa_sync_removing_state() when there
1694 * is nothing to do. Dirtying it every time would prevent us
1695 * from syncing-to-convergence.
1696 */
1697 if (svr->svr_bytes_done[txgoff] == 0)
1698 return;
1699
1700 /*
1701 * Update progress accounting.
1702 */
1703 spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
1704 svr->svr_bytes_done[txgoff] = 0;
1728 vdev_config_dirty(rvd);
1729
1730 /*
1731 * Reassess the health of our root vdev.
1732 */
1733 vdev_reopen(rvd);
1734 }
1735
1736 /*
1737 * Remove a log device. The config lock is held for the specified TXG.
1738 */
1739 static int
1740 spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
1741 {
1742 metaslab_group_t *mg = vd->vdev_mg;
1743 spa_t *spa = vd->vdev_spa;
1744 int error = 0;
1745
1746 ASSERT(vd->vdev_islog);
1747 ASSERT(vd == vd->vdev_top);
1748 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1749
1750 /*
1751 * Stop allocating from this vdev.
1752 */
1753 metaslab_group_passivate(mg);
1754
1755 /*
1756 * Wait for the youngest allocations and frees to sync,
1757 * and then wait for the deferral of those frees to finish.
1758 */
1759 spa_vdev_config_exit(spa, NULL,
1760 *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
1761
1762 /*
1763 * Evacuate the device. We don't hold the config lock as
1764 * writer since we need to do I/O but we do keep the
1765 * spa_namespace_lock held. Once this completes the device
1766 * should no longer have any blocks allocated on it.
1767 */
1768 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1769 if (vd->vdev_stat.vs_alloc != 0)
1770 error = spa_reset_logs(spa);
1771
1772 *txg = spa_vdev_config_enter(spa);
1773
1774 if (error != 0) {
1775 metaslab_group_activate(mg);
1776 return (error);
1777 }
1778 ASSERT0(vd->vdev_stat.vs_alloc);
1779
1780 /*
1781 * The evacuation succeeded. Remove any remaining MOS metadata
1782 * associated with this vdev, and wait for these changes to sync.
1783 */
1784 vd->vdev_removing = B_TRUE;
1785
1786 vdev_dirty_leaves(vd, VDD_DTL, *txg);
1787 vdev_config_dirty(vd);
1788
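/*
 * Nothing remains allocated on this device, so tear down its in-core
 * metaslabs as well.
 */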
1789 vdev_metaslab_fini(vd);
1790
1791 spa_history_log_internal(spa, "vdev remove", NULL,
1792 "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
1793 (vd->vdev_path != NULL) ? vd->vdev_path : "-");
1794
1795 /* Make sure these changes are sync'ed */
1796 spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
1797
1798 /* Stop initializing */
1799 (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
1800
1801 *txg = spa_vdev_config_enter(spa);
1802
1803 sysevent_t *ev = spa_event_create(spa, vd, NULL,
1804 ESC_ZFS_VDEV_REMOVE_DEV);
1805 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1806 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1807
1808 /* The top ZAP should have been destroyed by vdev_remove_empty. */
1809 ASSERT0(vd->vdev_top_zap);
1810 /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
1811 ASSERT0(vd->vdev_leaf_zap);
1812
1813 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1814
1815 if (list_link_active(&vd->vdev_state_dirty_node))
1816 vdev_state_clean(vd);
1817 if (list_link_active(&vd->vdev_config_dirty_node))
1818 vdev_config_clean(vd);
1819
1820 ASSERT0(vd->vdev_stat.vs_alloc);
1821
1822 /*
1823 * Clean up the vdev namespace.
1824 */
1825 vdev_remove_make_hole_and_free(vd);
1826
1827 if (ev != NULL)
1828 spa_event_post(ev);
1829
1830 return (0);
1831 }
1832
1833 static int
1834 spa_vdev_remove_top_check(vdev_t *vd)
1835 {
1836 spa_t *spa = vd->vdev_spa;
1837
1838 if (vd != vd->vdev_top)
1839 return (SET_ERROR(ENOTSUP));
1840
1841 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))