Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

*** 18,27 **** --- 18,28 ---- * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #include <sys/dsl_scan.h> #include <sys/dsl_pool.h> #include <sys/dsl_dataset.h>
*** 42,51 **** --- 43,53 ---- #include <sys/zil_impl.h> #include <sys/zio_checksum.h> #include <sys/ddt.h> #include <sys/sa.h> #include <sys/sa_impl.h> + #include <sys/zfeature.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
*** 380,438 **** { return (arc_read_nolock(pio, spa, bpp, done, private, priority, zio_flags, arc_flags, zb)); } - static boolean_t - bookmark_is_zero(const zbookmark_t *zb) - { - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); - } - - /* dnp is the dnode for zb1->zb_object */ - static boolean_t - bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) - { - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); - } - static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (dsl_dataset_is_snapshot(ds)) --- 382,391 ----
*** 460,470 **** return (B_FALSE); if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ ! if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); --- 413,423 ---- return (B_FALSE); if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ ! if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE);
*** 615,631 **** const zbookmark_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ ! if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ ! if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to --- 568,584 ---- const zbookmark_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ ! if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ ! if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to
*** 814,839 **** pbuf, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; - if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) { - /* - * For non-user-accounting blocks, we need to read the - * new bp (from a deleted snapshot, found in - * check_existing_xlation). If we used the old bp, - * pointers inside this block from before we resumed - * would be untranslated. - * - * For user-accounting blocks, we need to read the old - * bp, because we will apply the entire space delta to - * it (original untranslated -> translations from - * deleted snap -> now). - */ - bp_toread = *bp; - } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx, &buf) != 0) return; /* --- 767,776 ----
*** 1394,1416 **** return; } zap_cursor_fini(&zc); } ! static int ! dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { - dsl_scan_t *scn = arg; uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; ! ! if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || ! spa_shutting_down(scn->scn_dp->dp_spa)) return (ERESTART); zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), --- 1331,1362 ---- return; } zap_cursor_fini(&zc); } ! static boolean_t ! dsl_scan_free_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; ! return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || ! spa_shutting_down(scn->scn_dp->dp_spa)); ! } ! ! static int ! dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) ! { ! dsl_scan_t *scn = arg; ! ! if (!scn->scn_is_bptree || ! (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { ! if (dsl_scan_free_should_pause(scn)) return (ERESTART); + } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
*** 1431,1440 **** --- 1377,1390 ---- return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING) return (B_TRUE); + if (spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + return (B_TRUE); + } if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0);
*** 1477,1494 **** * any scanning. This ensures that there is no free list when * we are scanning, so the scan code doesn't have to worry about * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, ! dsl_scan_free_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " ! "free_bpobj txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, (longlong_t)tx->tx_txg); scn->scn_visited_this_txg = 0; --- 1427,1470 ---- * any scanning. This ensures that there is no free list when * we are scanning, so the scan code doesn't have to worry about * traversing it. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, ! dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err == 0 && spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) { + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, + scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + if (err != 0) + return; + + /* disable async destroy feature */ + spa_feature_decr(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx); + ASSERT(!spa_feature_is_active(spa, + &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])); + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " ! "free_bpobj/bptree txg %llu", (longlong_t)scn->scn_visited_this_txg, (longlong_t) (gethrtime() - scn->scn_sync_start_time) / MICROSEC, (longlong_t)tx->tx_txg); scn->scn_visited_this_txg = 0;
*** 1599,1608 **** --- 1575,1586 ---- return; for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp);