2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

@@ -18,11 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>

@@ -33,10 +33,11 @@
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>

@@ -98,11 +99,11 @@
         ASSERT(dmu_tx_is_syncing(tx));
         /* It could have been compressed away to nothing */
         if (BP_IS_HOLE(bp))
                 return;
         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
-        ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+        ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
         if (ds == NULL) {
                 /*
                  * Account for the meta-objset space in its placeholder
                  * dsl_dir.
                  */

@@ -115,11 +116,11 @@
         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
         mutex_enter(&ds->ds_dir->dd_lock);
         mutex_enter(&ds->ds_lock);
         delta = parent_delta(ds, used);
-        ds->ds_phys->ds_used_bytes += used;
+        ds->ds_phys->ds_referenced_bytes += used;
         ds->ds_phys->ds_compressed_bytes += compressed;
         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
         ds->ds_phys->ds_unique_bytes += used;
         mutex_exit(&ds->ds_lock);
         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,

@@ -209,12 +210,12 @@
                         dsl_dir_transfer_space(ds->ds_dir, used,
                             DD_USED_HEAD, DD_USED_SNAP, tx);
                 }
         }
         mutex_enter(&ds->ds_lock);
-        ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
-        ds->ds_phys->ds_used_bytes -= used;
+        ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
+        ds->ds_phys->ds_referenced_bytes -= used;
         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
         ds->ds_phys->ds_compressed_bytes -= compressed;
         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
         mutex_exit(&ds->ds_lock);

@@ -816,12 +817,12 @@
                 dsl_dataset_t *ohds;
 
                 dsphys->ds_prev_snap_obj = origin->ds_object;
                 dsphys->ds_prev_snap_txg =
                     origin->ds_phys->ds_creation_txg;
-                dsphys->ds_used_bytes =
-                    origin->ds_phys->ds_used_bytes;
+                dsphys->ds_referenced_bytes =
+                    origin->ds_phys->ds_referenced_bytes;
                 dsphys->ds_compressed_bytes =
                     origin->ds_phys->ds_compressed_bytes;
                 dsphys->ds_uncompressed_bytes =
                     origin->ds_phys->ds_uncompressed_bytes;
                 dsphys->ds_bp = origin->ds_phys->ds_bp;

@@ -931,11 +932,10 @@
         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 
         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
             pair = nvlist_next_nvpair(snaps, pair)) {
                 dsl_dataset_t *ds;
-                int err;
 
                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
                 if (err == 0) {
                         struct dsl_ds_destroyarg *dsda;
 

@@ -1080,13 +1080,16 @@
         err = dmu_objset_from_ds(ds, &os);
         if (err)
                 goto out;
 
         /*
-         * remove the objects in open context, so that we won't
-         * have too much to do in syncing context.
+         * If async destruction is not enabled, try to remove all objects
+         * while in the open context so that there is less work to do in
+         * the syncing context.
          */
+        if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
+            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
         for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
             ds->ds_phys->ds_prev_snap_txg)) {
                 /*
                  * Ignore errors, if there is not enough disk space
                  * we will deal with it in dsl_dataset_destroy_sync().

@@ -1093,10 +1096,11 @@
                  */
                 (void) dmu_free_object(os, obj);
         }
         if (err != ESRCH)
                 goto out;
+        }
 
         /*
          * Only the ZIL knows how to free log blocks.
          */
         zil_destroy(dmu_objset_zil(os), B_FALSE);

@@ -1238,19 +1242,19 @@
         uint64_t dlused, dlcomp, dluncomp;
 
         ASSERT(!dsl_dataset_is_snapshot(ds));
 
         if (ds->ds_phys->ds_prev_snap_obj != 0)
-                mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
+                mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
         else
                 mrs_used = 0;
 
         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
         ASSERT3U(dlused, <=, mrs_used);
         ds->ds_phys->ds_unique_bytes =
-            ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
+            ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
 
         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
             SPA_VERSION_UNIQUE_ACCURATE)
                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }

@@ -1604,10 +1608,34 @@
         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
             ds_next->ds_phys->ds_deadlist_obj);
 }
 
+static int
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+        int err;
+        struct killarg ka;
+
+        /*
+         * Free everything that we point to (that's born after
+         * the previous snapshot, if we are a clone)
+         *
+         * NB: this should be very quick, because we already
+         * freed all the objects in open context.
+         */
+        ka.ds = ds;
+        ka.tx = tx;
+        err = traverse_dataset(ds,
+            ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+            kill_blkptr, &ka);
+        ASSERT3U(err, ==, 0);
+        ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+
+        return (err);
+}
+
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
         struct dsl_ds_destroyarg *dsda = arg1;
         dsl_dataset_t *ds = dsda->ds;

@@ -1750,11 +1778,10 @@
                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
                             tx);
                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
                             DD_USED_HEAD, used, comp, uncomp, tx);
-                        dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
 
                         /* Merge our deadlist into next's and free it. */
                         dsl_deadlist_merge(&ds_next->ds_deadlist,
                             ds->ds_phys->ds_deadlist_obj, tx);
                 }

@@ -1826,37 +1853,59 @@
                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
                         }
                 }
                 dsl_dataset_rele(ds_next, FTAG);
         } else {
+                zfeature_info_t *async_destroy =
+                    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
+
                 /*
                  * There's no next snapshot, so this is a head dataset.
                  * Destroy the deadlist.  Unless it's a clone, the
                  * deadlist should be empty.  (If it's a clone, it's
                  * safe to ignore the deadlist contents.)
                  */
-                struct killarg ka;
-
                 dsl_deadlist_close(&ds->ds_deadlist);
                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
                 ds->ds_phys->ds_deadlist_obj = 0;
 
+                if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+                        err = old_synchronous_dataset_destroy(ds, tx);
+                } else {
                 /*
-                 * Free everything that we point to (that's born after
-                 * the previous snapshot, if we are a clone)
-                 *
-                 * NB: this should be very quick, because we already
-                 * freed all the objects in open context.
+                         * Move the bptree into the pool's list of trees to
+                         * clean up and update space accounting information.
                  */
-                ka.ds = ds;
-                ka.tx = tx;
-                err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-                    TRAVERSE_POST, kill_blkptr, &ka);
-                ASSERT3U(err, ==, 0);
+                        uint64_t used, comp, uncomp;
+
+                        ASSERT(err == 0 || err == EBUSY);
+                        if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+                                spa_feature_incr(dp->dp_spa, async_destroy, tx);
+                                dp->dp_bptree_obj = bptree_alloc(
+                                    dp->dp_meta_objset, tx);
+                                VERIFY(zap_add(dp->dp_meta_objset,
+                                    DMU_POOL_DIRECTORY_OBJECT,
+                                    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+                                    &dp->dp_bptree_obj, tx) == 0);
+                        }
+
+                        used = ds->ds_dir->dd_phys->dd_used_bytes;
+                        comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
+                        uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+
                 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-                    ds->ds_phys->ds_unique_bytes == 0);
+                            ds->ds_phys->ds_unique_bytes == used);
 
+                        bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
+                            &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+                            used, comp, uncomp, tx);
+                        dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+                            -used, -comp, -uncomp, tx);
+                        dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+                            used, comp, uncomp, tx);
+                }
+
                 if (ds->ds_prev != NULL) {
                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
                                 VERIFY3U(0, ==, zap_remove_int(mos,
                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
                                     ds->ds_object, tx));

@@ -2042,11 +2091,11 @@
         dsphys->ds_next_snap_obj = ds->ds_object;
         dsphys->ds_num_children = 1;
         dsphys->ds_creation_time = gethrestime_sec();
         dsphys->ds_creation_txg = crtxg;
         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-        dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
+        dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
         dsphys->ds_flags = ds->ds_phys->ds_flags;
         dsphys->ds_bp = ds->ds_phys->ds_bp;
         dmu_buf_rele(dbuf, FTAG);

@@ -2166,14 +2215,26 @@
         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
             zap_cursor_retrieve(&zc, &za) == 0;
             zap_cursor_advance(&zc)) {
                 dsl_dataset_t *clone;
                 char buf[ZFS_MAXNAMELEN];
+                /*
+                 * Even though we hold the dp_config_rwlock, the dataset
+                 * may fail to open, returning ENOENT.  If there is a
+                 * thread concurrently attempting to destroy this
+                 * dataset, it will have the ds_rwlock held for
+                 * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
+                 * dsl_dataset_hold_ref() will fail its
+                 * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
+                 * dp_config_rwlock, and wait for the destroy progress
+                 * and signal ds_exclusive_cv.  If the destroy was
+                 * successful, we will see that
+                 * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
+                 */
                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                    za.za_first_integer, FTAG, &clone) != 0) {
-                        goto fail;
-                }
+                    za.za_first_integer, FTAG, &clone) != 0)
+                        continue;
                 dsl_dir_name(clone->ds_dir, buf);
                 VERIFY(nvlist_add_boolean(val, buf) == 0);
                 dsl_dataset_rele(clone, FTAG);
         }
         zap_cursor_fini(&zc);

@@ -2292,11 +2353,11 @@
 void
 dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-        *refdbytesp = ds->ds_phys->ds_used_bytes;
+        *refdbytesp = ds->ds_phys->ds_referenced_bytes;
         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
         if (ds->ds_quota != 0) {
                 /*

@@ -2629,11 +2690,11 @@
          * Which simplifies to:
          * uN + kN + kN-1 + ... + k1 + k0
          * Note however, if we stop before we reach the ORIGIN we get:
          * uN + kN + kN-1 + ... + kM - uM-1
          */
-        pa->used = origin_ds->ds_phys->ds_used_bytes;
+        pa->used = origin_ds->ds_phys->ds_referenced_bytes;
         pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
         pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
         for (snap = list_head(&pa->shared_snaps); snap;
             snap = list_next(&pa->shared_snaps, snap)) {
                 uint64_t val, dlused, dlcomp, dluncomp;

@@ -2663,11 +2724,11 @@
         /*
          * If we are a clone of a clone then we never reached ORIGIN,
          * so we need to subtract out the clone origin's used space.
          */
         if (pa->origin_origin) {
-                pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
+                pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
                 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
                 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
         }
 
         /* Check that there is enough space here */

@@ -3179,12 +3240,12 @@
                 dsl_deadlist_space(&csa->cds->ds_deadlist,
                     &cdl_used, &cdl_comp, &cdl_uncomp);
                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
                     &odl_used, &odl_comp, &odl_uncomp);
 
-                dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
-                    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
+                dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
+                    (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
                     cdl_uncomp -
                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);

@@ -3209,12 +3270,12 @@
                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
                     DD_USED_HEAD, DD_USED_SNAP, tx);
         }
 
         /* swap ds_*_bytes */
-        SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
-            csa->cds->ds_phys->ds_used_bytes);
+        SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
+            csa->cds->ds_phys->ds_referenced_bytes);
         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
             csa->cds->ds_phys->ds_compressed_bytes);
         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
             csa->cds->ds_phys->ds_uncompressed_bytes);
         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,

@@ -3339,12 +3400,13 @@
          * If they are requesting more space, and our current estimate
          * is over quota, they get to try again unless the actual
          * on-disk is over quota and there are no pending changes (which
          * may free up space for us).
          */
-        if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
-                if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
+        if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+                if (inflight > 0 ||
+                    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
                         error = ERESTART;
                 else
                         error = EDQUOT;
         }
         mutex_exit(&ds->ds_lock);

@@ -3367,11 +3429,11 @@
                 return (err);
 
         if (psa->psa_effective_value == 0)
                 return (0);
 
-        if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
+        if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
             psa->psa_effective_value < ds->ds_reserved)
                 return (ENOSPC);
 
         return (0);
 }

@@ -4121,12 +4183,12 @@
         int err = 0;
         uint64_t snapobj;
         dsl_pool_t *dp = new->ds_dir->dd_pool;
 
         *usedp = 0;
-        *usedp += new->ds_phys->ds_used_bytes;
-        *usedp -= oldsnap->ds_phys->ds_used_bytes;
+        *usedp += new->ds_phys->ds_referenced_bytes;
+        *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
 
         *compp = 0;
         *compp += new->ds_phys->ds_compressed_bytes;
         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
 

@@ -4138,13 +4200,17 @@
         snapobj = new->ds_object;
         while (snapobj != oldsnap->ds_object) {
                 dsl_dataset_t *snap;
                 uint64_t used, comp, uncomp;
 
+                if (snapobj == new->ds_object) {
+                        snap = new;
+                } else {
                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
                 if (err != 0)
                         break;
+                }
 
                 if (snap->ds_phys->ds_prev_snap_txg ==
                     oldsnap->ds_phys->ds_creation_txg) {
                         /*
                          * The blocks in the deadlist can not be born after

@@ -4169,10 +4235,11 @@
                  * If we get to the beginning of the chain of snapshots
                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
                  * was not a snapshot of/before new.
                  */
                 snapobj = snap->ds_phys->ds_prev_snap_obj;
+                if (snap != new)
                 dsl_dataset_rele(snap, FTAG);
                 if (snapobj == 0) {
                         err = EINVAL;
                         break;
                 }