Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
*** 18,27 ****
--- 18,28 ----
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
*/
#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
*** 42,51 ****
--- 43,53 ----
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+ #include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
*** 380,438 ****
{
return (arc_read_nolock(pio, spa, bpp, done, private,
priority, zio_flags, arc_flags, zb));
}
- static boolean_t
- bookmark_is_zero(const zbookmark_t *zb)
- {
- return (zb->zb_objset == 0 && zb->zb_object == 0 &&
- zb->zb_level == 0 && zb->zb_blkid == 0);
- }
-
- /* dnp is the dnode for zb1->zb_object */
- static boolean_t
- bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
- {
- uint64_t zb1nextL0, zb2thisobj;
-
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb2->zb_level == 0);
-
- /*
- * A bookmark in the deadlist is considered to be after
- * everything else.
- */
- if (zb2->zb_object == DMU_DEADLIST_OBJECT)
- return (B_TRUE);
-
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
-
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
- if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
- }
-
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
- return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
- }
-
static uint64_t
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
if (dsl_dataset_is_snapshot(ds))
--- 382,391 ----
*** 460,470 ****
return (B_FALSE);
if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */
! if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
if (zb && zb->zb_level != 0)
return (B_FALSE);
--- 413,423 ----
return (B_FALSE);
if (scn->scn_pausing)
return (B_TRUE); /* we're already pausing */
! if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
return (B_FALSE); /* we're resuming */
/* We only know how to resume from level-0 blocks. */
if (zb && zb->zb_level != 0)
return (B_FALSE);
*** 615,631 ****
const zbookmark_t *zb)
{
/*
* We never skip over user/group accounting objects (obj<0)
*/
! if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
(int64_t)zb->zb_object >= 0) {
/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
! if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
* If we found the block we're trying to resume from, or
* we went past it to a different object, zero it out to
--- 568,584 ----
const zbookmark_t *zb)
{
/*
* We never skip over user/group accounting objects (obj<0)
*/
! if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
(int64_t)zb->zb_object >= 0) {
/*
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
! if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
* If we found the block we're trying to resume from, or
* we went past it to a different object, zero it out to
*** 814,839 ****
pbuf, bp);
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return;
- if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
- /*
- * For non-user-accounting blocks, we need to read the
- * new bp (from a deleted snapshot, found in
- * check_existing_xlation). If we used the old bp,
- * pointers inside this block from before we resumed
- * would be untranslated.
- *
- * For user-accounting blocks, we need to read the old
- * bp, because we will apply the entire space delta to
- * it (original untranslated -> translations from
- * deleted snap -> now).
- */
- bp_toread = *bp;
- }
-
if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
&buf) != 0)
return;
/*
--- 767,776 ----
*** 1394,1416 ****
return;
}
zap_cursor_fini(&zc);
}
! static int
! dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
- dsl_scan_t *scn = arg;
uint64_t elapsed_nanosecs;
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
!
! if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
! spa_shutting_down(scn->scn_dp->dp_spa))
return (ERESTART);
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0));
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
--- 1331,1362 ----
return;
}
zap_cursor_fini(&zc);
}
! static boolean_t
! dsl_scan_free_should_pause(dsl_scan_t *scn)
{
uint64_t elapsed_nanosecs;
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
! return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
! spa_shutting_down(scn->scn_dp->dp_spa));
! }
!
! static int
! dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
! {
! dsl_scan_t *scn = arg;
!
! if (!scn->scn_is_bptree ||
! (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
! if (dsl_scan_free_should_pause(scn))
return (ERESTART);
+ }
zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
dmu_tx_get_txg(tx), bp, 0));
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
-bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
*** 1431,1440 ****
--- 1377,1390 ----
return (B_FALSE);
if (scn->scn_phys.scn_state == DSS_SCANNING)
return (B_TRUE);
+ if (spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+ return (B_TRUE);
+ }
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
&used, &comp, &uncomp);
}
return (used != 0);
*** 1477,1494 ****
* any scanning. This ensures that there is no free list when
* we are scanning, so the scan code doesn't have to worry about
* traversing it.
*/
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
! dsl_scan_free_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
! "free_bpobj txg %llu",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
(gethrtime() - scn->scn_sync_start_time) / MICROSEC,
(longlong_t)tx->tx_txg);
scn->scn_visited_this_txg = 0;
--- 1427,1470 ----
* any scanning. This ensures that there is no free list when
* we are scanning, so the scan code doesn't have to worry about
* traversing it.
*/
if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_is_bptree = B_FALSE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
err = bpobj_iterate(&dp->dp_free_bpobj,
! dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+ if (err == 0 && spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+ scn, tx);
+ VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ if (err != 0)
+ return;
+
+ /* disable async destroy feature */
+ spa_feature_decr(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+ ASSERT(!spa_feature_is_active(spa,
+ &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+ VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ }
if (scn->scn_visited_this_txg) {
zfs_dbgmsg("freed %llu blocks in %llums from "
! "free_bpobj/bptree txg %llu",
(longlong_t)scn->scn_visited_this_txg,
(longlong_t)
(gethrtime() - scn->scn_sync_start_time) / MICROSEC,
(longlong_t)tx->tx_txg);
scn->scn_visited_this_txg = 0;
*** 1599,1608 ****
--- 1575,1586 ----
return;
for (i = 0; i < 4; i++) {
int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+ if (t & DMU_OT_NEWTYPE)
+ t = DMU_OT_OTHER;
zfs_blkstat_t *zb = &zab->zab_type[l][t];
int equal;
zb->zb_count++;
zb->zb_asize += BP_GET_ASIZE(bp);