2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

@@ -18,10 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>

@@ -42,10 +43,11 @@
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

@@ -380,59 +382,10 @@
 {
         return (arc_read_nolock(pio, spa, bpp, done, private,
             priority, zio_flags, arc_flags, zb));
 }
 
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-        return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-            zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-        uint64_t zb1nextL0, zb2thisobj;
-
-        ASSERT(zb1->zb_objset == zb2->zb_objset);
-        ASSERT(zb2->zb_level == 0);
-
-        /*
-         * A bookmark in the deadlist is considered to be after
-         * everything else.
-         */
-        if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-                return (B_TRUE);
-
-        /* The objset_phys_t isn't before anything. */
-        if (dnp == NULL)
-                return (B_FALSE);
-
-        zb1nextL0 = (zb1->zb_blkid + 1) <<
-            ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-        zb2thisobj = zb2->zb_object ? zb2->zb_object :
-            zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-        if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-                uint64_t nextobj = zb1nextL0 *
-                    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-                return (nextobj <= zb2thisobj);
-        }
-
-        if (zb1->zb_object < zb2thisobj)
-                return (B_TRUE);
-        if (zb1->zb_object > zb2thisobj)
-                return (B_FALSE);
-        if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-                return (B_FALSE);
-        return (zb1nextL0 <= zb2->zb_blkid);
-}
-
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
         if (dsl_dataset_is_snapshot(ds))

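The two helpers deleted above move into shared code: bookmark_is_zero() becomes the ZB_IS_ZERO macro and bookmark_is_before() becomes zbookmark_is_before(), both used in later hunks. The ordering test hinges on one computation: converting a bookmark at any indirect level into the first level-0 blkid past everything that bookmark covers. A minimal userland sketch of that arithmetic, with SPA_BLKPTRSHIFT written out as its conventional value of 7 (a 128-byte blkptr_t) and the kernel types stubbed:

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_BLKPTRSHIFT 7   /* log2(sizeof (blkptr_t)) */

    /*
     * First level-0 blkid strictly after the subtree rooted at
     * (level, blkid); indblkshift plays the role of dn_indblkshift.
     */
    static uint64_t
    next_l0_blkid(uint64_t blkid, int level, int indblkshift)
    {
            return ((blkid + 1) << (level * (indblkshift - SPA_BLKPTRSHIFT)));
    }

    int
    main(void)
    {
            /* 16K indirect blocks (shift 14) hold 128 block pointers each. */
            printf("%llu\n", (unsigned long long)next_l0_blkid(3, 1, 14));
            /* prints 512: an L1 blkid of 3 covers L0 blkids 384..511 */
            return (0);
    }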
@@ -460,11 +413,11 @@
                 return (B_FALSE);
 
         if (scn->scn_pausing)
                 return (B_TRUE); /* we're already pausing */
 
-        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
+        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
                 return (B_FALSE); /* we're resuming */
 
         /* We only know how to resume from level-0 blocks. */
         if (zb && zb->zb_level != 0)
                 return (B_FALSE);

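ZB_IS_ZERO is a drop-in replacement for the deleted local helper, with unchanged semantics: an all-zero bookmark is the sentinel for "no resume point recorded", which is why a non-zero bookmark in the hunk above means a resume is in progress. A compilable model of the check, using a simplified struct rather than the kernel's zbookmark_t:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct zbookmark {
            uint64_t zb_objset;
            uint64_t zb_object;
            int64_t  zb_level;
            uint64_t zb_blkid;
    } zbookmark_t;

    /* All-zero bookmark: the scan has not recorded a resume point. */
    static int
    zb_is_zero(const zbookmark_t *zb)
    {
            return (zb->zb_objset == 0 && zb->zb_object == 0 &&
                zb->zb_level == 0 && zb->zb_blkid == 0);
    }

    int
    main(void)
    {
            zbookmark_t fresh = { 0 };
            zbookmark_t resumed = { 0, 0, 0, 42 };
            printf("%d %d\n", zb_is_zero(&fresh), zb_is_zero(&resumed));
            return (0);     /* prints "1 0" */
    }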
@@ -615,17 +568,17 @@
     const zbookmark_t *zb)
 {
         /*
          * We never skip over user/group accounting objects (obj<0)
          */
-        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
+        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
             (int64_t)zb->zb_object >= 0) {
                 /*
                  * If we already visited this bp & everything below (in
                  * a prior txg sync), don't bother doing it again.
                  */
-                if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+                if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
                         return (B_TRUE);
 
                 /*
                  * If we found the block we're trying to resume from, or
                  * we went past it to a different object, zero it out to

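The (int64_t) cast in the hunk above is what exempts the user/group accounting objects from being skipped: those pseudo-objects are conventionally defined as wrapped-around negative IDs, so they compare below zero after the cast. A small demonstration (the two #defines mirror their usual dmu.h definitions but are reproduced here from memory):

    #include <stdint.h>
    #include <stdio.h>

    /* Pseudo-object IDs as conventionally defined in dmu.h. */
    #define DMU_USERUSED_OBJECT     (-1ULL)
    #define DMU_GROUPUSED_OBJECT    (-2ULL)

    int
    main(void)
    {
            uint64_t obj = DMU_USERUSED_OBJECT;
            /* Cast to signed: accounting objects sort below object 0. */
            printf("%d\n", (int64_t)obj >= 0);      /* prints 0 */
            return (0);
    }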
@@ -814,26 +767,10 @@
             pbuf, bp);
 
         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
                 return;
 
-        if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-                /*
-                 * For non-user-accounting blocks, we need to read the
-                 * new bp (from a deleted snapshot, found in
-                 * check_existing_xlation).  If we used the old bp,
-                 * pointers inside this block from before we resumed
-                 * would be untranslated.
-                 *
-                 * For user-accounting blocks, we need to read the old
-                 * bp, because we will apply the entire space delta to
-                 * it (original untranslated -> translations from
-                 * deleted snap -> now).
-                 */
-                bp_toread = *bp;
-        }
-
         if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
             &buf) != 0)
                 return;
 
         /*

@@ -1394,23 +1331,32 @@
                         return;
         }
         zap_cursor_fini(&zc);
 }
 
-static int
-dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
 {
-        dsl_scan_t *scn = arg;
         uint64_t elapsed_nanosecs;
 
         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
-
-        if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+        return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
             txg_sync_waiting(scn->scn_dp)) ||
-            spa_shutting_down(scn->scn_dp->dp_spa))
+            spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+        dsl_scan_t *scn = arg;
+
+        if (!scn->scn_is_bptree ||
+            (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+                if (dsl_scan_free_should_pause(scn))
-                return (ERESTART);
+                        return (ERESTART);
+        }
 
         zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
             dmu_tx_get_txg(tx), bp, 0));
         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
             -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),

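Factoring the time check out into dsl_scan_free_should_pause() lets the renamed callback apply it selectively: while walking the async-destroy bptree (scn_is_bptree set), it only pauses at level-0, non-objset blocks, the idea being that those are the only positions the bptree walk can safely resume from. The predicate itself is unchanged; a userland model with the kernel globals reduced to parameters (names illustrative):

    #include <stdint.h>
    #include <stdio.h>

    #define NANOSEC   1000000000ULL
    #define MICROSEC  1000000ULL    /* ns / MICROSEC == milliseconds */

    static int
    free_should_pause(uint64_t elapsed_ns, uint64_t txg_timeout_s,
        uint64_t free_min_time_ms, int sync_waiting, int shutting_down)
    {
            return (elapsed_ns / NANOSEC > txg_timeout_s ||
                (elapsed_ns / MICROSEC > free_min_time_ms && sync_waiting) ||
                shutting_down);
    }

    int
    main(void)
    {
            /* 2s elapsed, 5s txg timeout, 1000ms floor, sync is waiting. */
            printf("%d\n", free_should_pause(2 * NANOSEC, 5, 1000, 1, 0));
            return (0);     /* prints 1: past the floor and sync is waiting */
    }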
@@ -1431,10 +1377,14 @@
                 return (B_FALSE);
 
         if (scn->scn_phys.scn_state == DSS_SCANNING)
                 return (B_TRUE);
 
+        if (spa_feature_is_active(spa,
+            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                return (B_TRUE);
+        }
         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                     &used, &comp, &uncomp);
         }
         return (used != 0);

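With this hunk, "scan work pending" has three sources: an explicit scan in DSS_SCANNING, an async destroy still in flight (visible as a non-zero refcount on the async_destroy feature), and blocks still queued on the free bpobj. A sketch of the combined decision with pool state flattened into a struct (field names are stand-ins, not kernel fields):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct scan_state {
            int      scanning;              /* scn_state == DSS_SCANNING */
            int      async_destroy_active;  /* feature refcount > 0 */
            uint64_t free_bpobj_used;       /* bytes on dp_free_bpobj */
    } scan_state_t;

    static int
    scan_active(const scan_state_t *ss)
    {
            return (ss->scanning || ss->async_destroy_active ||
                ss->free_bpobj_used != 0);
    }

    int
    main(void)
    {
            scan_state_t ss = { 0, 1, 0 };  /* only async destroy pending */
            printf("%d\n", scan_active(&ss));       /* prints 1 */
            return (0);
    }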
@@ -1477,18 +1427,44 @@
          * any scanning.  This ensures that there is no free list when
          * we are scanning, so the scan code doesn't have to worry about
          * traversing it.
          */
         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+                scn->scn_is_bptree = B_FALSE;
                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                     NULL, ZIO_FLAG_MUSTSUCCEED);
                 err = bpobj_iterate(&dp->dp_free_bpobj,
-                    dsl_scan_free_cb, scn, tx);
+                    dsl_scan_free_block_cb, scn, tx);
                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+                if (err == 0 && spa_feature_is_active(spa,
+                    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                        scn->scn_is_bptree = B_TRUE;
+                        scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                            NULL, ZIO_FLAG_MUSTSUCCEED);
+                        err = bptree_iterate(dp->dp_meta_objset,
+                            dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+                            scn, tx);
+                        VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+                        if (err != 0)
+                                return;
+
+                        /* disable async destroy feature */
+                        spa_feature_decr(spa,
+                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+                        ASSERT(!spa_feature_is_active(spa,
+                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+                        VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+                            DMU_POOL_DIRECTORY_OBJECT,
+                            DMU_POOL_BPTREE_OBJ, tx));
+                        VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+                            dp->dp_bptree_obj, tx));
+                        dp->dp_bptree_obj = 0;
+                }
                 if (scn->scn_visited_this_txg) {
                         zfs_dbgmsg("freed %llu blocks in %llums from "
-                            "free_bpobj txg %llu",
+                            "free_bpobj/bptree txg %llu",
                             (longlong_t)scn->scn_visited_this_txg,
                             (longlong_t)
                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
                             (longlong_t)tx->tx_txg);
                         scn->scn_visited_this_txg = 0;

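The sync path now drains work in two phases: the flat free bpobj first, then, if the feature is active, the async-destroy bptree. Only when bptree_iterate() completes without error does it drop the feature refcount and free the bptree object, so an interrupted destroy leaves the feature active and its on-disk state intact for the next txg. A toy model of that resume-on-ERESTART control flow (all names and the ERESTART value are stand-ins):

    #include <stddef.h>
    #include <stdio.h>

    #define ERESTART 85     /* stand-in value; meaning: retry next txg */

    typedef int (free_cb_t)(int bp, void *arg);

    /* Toy bpobj/bptree walk: stops where the callback asks to pause. */
    static int
    iterate(const int *bps, size_t n, size_t *cursor, free_cb_t *cb, void *arg)
    {
            while (*cursor < n) {
                    int err = cb(bps[*cursor], arg);
                    if (err != 0)
                            return (err);   /* position kept for next txg */
                    (*cursor)++;
            }
            return (0);
    }

    static int
    free_cb(int bp, void *arg)
    {
            int *budget = arg;
            if ((*budget)-- == 0)
                    return (ERESTART);      /* out of time this txg */
            printf("freed %d\n", bp);
            return (0);
    }

    int
    main(void)
    {
            const int bps[] = { 10, 11, 12, 13 };
            size_t cursor = 0;
            int txg;

            /* Each "txg" frees at most two blocks, then pauses. */
            for (txg = 0; cursor < 4; txg++) {
                    int budget = 2;
                    (void) iterate(bps, 4, &cursor, free_cb, &budget);
            }
            printf("done in %d txgs\n", txg);       /* prints 2 */
            return (0);
    }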
@@ -1599,10 +1575,12 @@
                 return;
 
         for (i = 0; i < 4; i++) {
                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+                if (t & DMU_OT_NEWTYPE)
+                        t = DMU_OT_OTHER;
                 zfs_blkstat_t *zb = &zab->zab_type[l][t];
                 int equal;
 
                 zb->zb_count++;
                 zb->zb_asize += BP_GET_ASIZE(bp);
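The two lines added above guard the zab_type table: feature-flagged pools can write block pointers whose object type has DMU_OT_NEWTYPE set and thus falls outside the legacy enum that dimensions the table, so such types are lumped into DMU_OT_OTHER for block statistics. The pattern in isolation (constant values are stand-ins; only the flag test matters):

    #include <stdio.h>

    #define DMU_OT_NEWTYPE  0x80    /* flag: not a legacy dmu_object_type_t */
    #define DMU_OT_OTHER    36      /* stand-in legacy "other" bucket */

    int
    main(void)
    {
            int t = DMU_OT_NEWTYPE | 0x04;  /* a new-style type byte */
            if (t & DMU_OT_NEWTYPE)
                    t = DMU_OT_OTHER;       /* clamp to a legacy table index */
            printf("%d\n", t);              /* prints 36 */
            return (0);
    }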