Print this page
NEX-6855 System fails to boot up after a large number of datasets created
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9301 BAD Trap: Double Fault panic on zfs destroy snapshot
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-7641 Impossible to remove special vdev from pool if WBC-ed dataset was removed before disabling WBC
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
2605 want to resume interrupted zfs send
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Reviewed by: Arne Jansen <sensille@gmx.net>
Approved by: Dan McDonald <danmcd@omniti.com>
6047 SPARC boot should support feature@embedded_data
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Approved by: Dan McDonald <danmcd@omniti.com>
5959 clean up per-dataset feature count code
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-4476 WRC: Allow to use write back cache per tree of datasets
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Revert "NEX-4476 WRC: Allow to use write back cache per tree of datasets"
This reverts commit fe97b74444278a6f36fec93179133641296312da.
NEX-4476 WRC: Allow to use write back cache per tree of datasets
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
NEX-3964 It should not be allowed to rename a snapshot that its new name is matched to the prefix of in-kernel autosnapshots (lint)
NEX-3964 It should not be allowed to rename a snapshot that its new name is matched to the prefix of in-kernel autosnapshots
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>

@@ -18,16 +18,18 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  */
 
+#include <sys/autosnap.h>
 #include <sys/zfs_context.h>
 #include <sys/dsl_userhold.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_destroy.h>

@@ -40,10 +42,11 @@
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
+#include <sys/wbc.h>
 #include <sys/zcp.h>
 
 int
 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
 {

@@ -182,81 +185,50 @@
 
 static void
 dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
 {
         objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-        zap_cursor_t zc;
-        zap_attribute_t za;
+        zap_cursor_t *zc;
+        zap_attribute_t *za;
 
         /*
          * If it is the old version, dd_clones doesn't exist so we can't
          * find the clones, but dsl_deadlist_remove_key() is a no-op so it
          * doesn't matter.
          */
         if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
                 return;
+        zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+        za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
-        for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
-            zap_cursor_retrieve(&zc, &za) == 0;
-            zap_cursor_advance(&zc)) {
+        for (zap_cursor_init(zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
+            zap_cursor_retrieve(zc, za) == 0;
+            zap_cursor_advance(zc)) {
                 dsl_dataset_t *clone;
 
                 VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-                    za.za_first_integer, FTAG, &clone));
+                    za->za_first_integer, FTAG, &clone));
                 if (clone->ds_dir->dd_origin_txg > mintxg) {
                         dsl_deadlist_remove_key(&clone->ds_deadlist,
                             mintxg, tx);
-                        if (dsl_dataset_remap_deadlist_exists(clone)) {
-                                dsl_deadlist_remove_key(
-                                    &clone->ds_remap_deadlist, mintxg, tx);
-                        }
                         dsl_dataset_remove_clones_key(clone, mintxg, tx);
                 }
                 dsl_dataset_rele(clone, FTAG);
         }
-        zap_cursor_fini(&zc);
+        zap_cursor_fini(zc);
+        kmem_free(zc, sizeof (zap_cursor_t));
+        kmem_free(za, sizeof (zap_attribute_t));
 }
 
-static void
-dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
-    dmu_tx_t *tx)
-{
-        dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-        /* Move blocks to be obsoleted to pool's obsolete list. */
-        if (dsl_dataset_remap_deadlist_exists(ds_next)) {
-                if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
-                        dsl_pool_create_obsolete_bpobj(dp, tx);
-
-                dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
-                    &dp->dp_obsolete_bpobj,
-                    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
-        }
-
-        /* Merge our deadlist into next's and free it. */
-        if (dsl_dataset_remap_deadlist_exists(ds)) {
-                uint64_t remap_deadlist_object =
-                    dsl_dataset_get_remap_deadlist_object(ds);
-                ASSERT(remap_deadlist_object != 0);
-
-                mutex_enter(&ds_next->ds_remap_deadlist_lock);
-                if (!dsl_dataset_remap_deadlist_exists(ds_next))
-                        dsl_dataset_create_remap_deadlist(ds_next, tx);
-                mutex_exit(&ds_next->ds_remap_deadlist_lock);
-
-                dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
-                    remap_deadlist_object, tx);
-                dsl_dataset_destroy_remap_deadlist(ds, tx);
-        }
-}
-
 void
 dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 {
         int err;
         int after_branch_point = FALSE;
         dsl_pool_t *dp = ds->ds_dir->dd_pool;
+        spa_t *spa = dp->dp_spa;
+        wbc_data_t *wbc_data = spa_get_wbc_data(spa);
         objset_t *mos = dp->dp_meta_objset;
         dsl_dataset_t *ds_prev = NULL;
         uint64_t obj;
 
         ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

@@ -263,10 +235,19 @@
         rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
         ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
         rrw_exit(&ds->ds_bp_rwlock, FTAG);
         ASSERT(refcount_is_zero(&ds->ds_longholds));
 
+        /*
+         * If an edge snapshot of the WBC window is destroyed, the window must
+         * be aborted.
+         */
+        mutex_enter(&wbc_data->wbc_lock);
+        if (dsl_dataset_phys(ds)->ds_creation_txg == wbc_data->wbc_finish_txg)
+                wbc_purge_window(spa, tx);
+        mutex_exit(&wbc_data->wbc_lock);
+
         if (defer &&
             (ds->ds_userrefs > 0 ||
             dsl_dataset_phys(ds)->ds_num_children > 1)) {
                 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
                 dmu_buf_will_dirty(ds->ds_dbuf, tx);

@@ -362,18 +343,15 @@
 
                 /* Merge our deadlist into next's and free it. */
                 dsl_deadlist_merge(&ds_next->ds_deadlist,
                     dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
         }
-
         dsl_deadlist_close(&ds->ds_deadlist);
         dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
         dmu_buf_will_dirty(ds->ds_dbuf, tx);
         dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
-        dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
-
         /* Collapse range in clone heads */
         dsl_dataset_remove_clones_key(ds,
             dsl_dataset_phys(ds)->ds_creation_txg, tx);
 
         if (ds_next->ds_is_snapshot) {

@@ -403,14 +381,10 @@
                 dsl_dataset_t *hds;
                 VERIFY0(dsl_dataset_hold_obj(dp,
                     dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
                 dsl_deadlist_remove_key(&hds->ds_deadlist,
                     dsl_dataset_phys(ds)->ds_creation_txg, tx);
-                if (dsl_dataset_remap_deadlist_exists(hds)) {
-                        dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
-                            dsl_dataset_phys(ds)->ds_creation_txg, tx);
-                }
                 dsl_dataset_rele(hds, FTAG);
 
         } else {
                 ASSERT3P(ds_next->ds_prev, ==, ds);
                 dsl_dataset_rele(ds_next->ds_prev, ds_next);

@@ -506,10 +480,14 @@
 
         int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
         if (error == ENOENT)
                 return;
         ASSERT0(error);
+
+        if (autosnap_check_name(strchr(dsname, '@')))
+                autosnap_exempt_snapshot(dp->dp_spa, dsname);
+
         dsl_destroy_snapshot_sync_impl(ds, defer, tx);
         dsl_dataset_rele(ds, FTAG);
 }
 
 /*

@@ -855,25 +833,35 @@
                 dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
         }
 
         /*
          * Destroy the deadlist.  Unless it's a clone, the
-         * deadlist should be empty since the dataset has no snapshots.
-         * (If it's a clone, it's safe to ignore the deadlist contents
-         * since they are still referenced by the origin snapshot.)
+         * deadlist should be empty.  (If it's a clone, it's
+         * safe to ignore the deadlist contents.)
          */
         dsl_deadlist_close(&ds->ds_deadlist);
         dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
         dmu_buf_will_dirty(ds->ds_dbuf, tx);
         dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
 
-        if (dsl_dataset_remap_deadlist_exists(ds))
-                dsl_dataset_destroy_remap_deadlist(ds, tx);
-
         objset_t *os;
         VERIFY0(dmu_objset_from_ds(ds, &os));
 
+        if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_WBC)) {
+                wbc_process_objset(spa_get_wbc_data(dp->dp_spa), os, B_TRUE);
+
+                /*
+                 * If WBC was activated for this dataset and it is the root
+                 * of a WBC-ed tree of datasets, decrement the WBC feature
+                 * flag refcount so that 'feature@wbc' reports the correct
+                 * status of WBC.
+                 */
+                if (os->os_wbc_root_ds_obj != 0 &&
+                    ds->ds_object == os->os_wbc_root_ds_obj)
+                        spa_feature_decr(os->os_spa, SPA_FEATURE_WBC, tx);
+        }
+
         if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
                 old_synchronous_dataset_destroy(ds, tx);
         } else {
                 /*
                  * Move the bptree into the pool's list of trees to

@@ -1011,10 +999,11 @@
 
         error = spa_open(name, &spa, FTAG);
         if (error != 0)
                 return (error);
         isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+
         spa_close(spa, FTAG);
 
         ddha.ddha_name = name;
 
         if (!isenabled) {

@@ -1048,33 +1037,257 @@
 
         return (dsl_sync_task(name, dsl_destroy_head_check,
             dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
 }
 
-/*
- * Note, this function is used as the callback for dmu_objset_find().  We
- * always return 0 so that we will continue to find and process
- * inconsistent datasets, even if we encounter an error trying to
- * process one of them.
- */
+typedef struct {
+        kmutex_t        lock;
+        list_t list;
+} dsl_inconsistent_walker_cb_t;
+
+typedef struct {
+        char name[ZFS_MAX_DATASET_NAME_LEN];
+        list_node_t node;
+} dsl_inconsistent_node_t;
+
 /* ARGSUSED */
-int
-dsl_destroy_inconsistent(const char *dsname, void *arg)
+static int
+dsl_collect_inconsistent_datasets_cb(dsl_pool_t *dp,
+    dsl_dataset_t *ds, void *arg)
 {
-        objset_t *os;
+        dsl_inconsistent_node_t *ds_node;
+        dsl_inconsistent_walker_cb_t *walker =
+            (dsl_inconsistent_walker_cb_t *)arg;
 
-        if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
-                boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+        if (!DS_IS_INCONSISTENT(ds))
+                return (0);
 
                 /*
                  * If the dataset is inconsistent because a resumable receive
                  * has failed, then do not destroy it.
                  */
-                if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
-                        need_destroy = B_FALSE;
+        if (dsl_dataset_has_resume_receive_state(ds))
+                return (0);
 
-                dmu_objset_rele(os, FTAG);
-                if (need_destroy)
-                        (void) dsl_destroy_head(dsname);
+        ds_node = kmem_alloc(sizeof (dsl_inconsistent_node_t), KM_SLEEP);
+        dsl_dataset_name(ds, ds_node->name);
+
+        mutex_enter(&walker->lock);
+        list_insert_tail(&walker->list, ds_node);
+        mutex_exit(&walker->lock);
+
+        return (0);
+}
+
+/*
+ * Walk in parallel over the entire pool, gather the inconsistent datasets
+ * that have no resumable-receive state, and destroy them.
+ */
+void
+dsl_destroy_inconsistent(dsl_pool_t *dp)
+{
+        dsl_inconsistent_walker_cb_t walker;
+        dsl_inconsistent_node_t *ds_node;
+
+        mutex_init(&walker.lock, NULL, MUTEX_DEFAULT, NULL);
+        list_create(&walker.list, sizeof (dsl_inconsistent_node_t),
+            offsetof(dsl_inconsistent_node_t, node));
+
+        VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+                dsl_collect_inconsistent_datasets_cb,
+            &walker, DS_FIND_CHILDREN));
+
+        while ((ds_node = list_remove_head(&walker.list)) != NULL) {
+                (void) dsl_destroy_head(ds_node->name);
+                kmem_free(ds_node, sizeof (dsl_inconsistent_node_t));
         }
+
+        list_destroy(&walker.list);
+        mutex_destroy(&walker.lock);
+}
+
+typedef struct {
+        const char *from_ds;
+        boolean_t defer;
+} dmu_destroy_atomically_arg_t;
+
+static int
+dsl_destroy_atomically_sync(void *arg, dmu_tx_t *tx)
+{
+        dmu_destroy_atomically_arg_t *ddaa = arg;
+        boolean_t defer = ddaa->defer;
+        dsl_pool_t *dp = dmu_tx_pool(tx);
+        zfs_ds_collector_entry_t *tail;
+        list_t namestack;
+        int err = 0;
+
+        /* do not perform checks in ioctl */
+        if (!dmu_tx_is_syncing(tx))
         return (0);
+
+        ASSERT(dsl_pool_config_held(dp));
+
+        if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY))
+                return (SET_ERROR(ENOTSUP));
+
+        /* It is possible that autosnap watches the DS */
+        if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_WBC)) {
+                objset_t *os = NULL;
+                dsl_dataset_t *ds = NULL;
+
+                err = dsl_dataset_hold(dp, ddaa->from_ds, FTAG, &ds);
+                if (err != 0)
+                        return (err);
+
+                err = dmu_objset_from_ds(ds, &os);
+                if (err != 0) {
+                        dsl_dataset_rele(ds, FTAG);
+                        return (err);
+                }
+
+                if (!dmu_objset_is_snapshot(os)) {
+                        wbc_process_objset(spa_get_wbc_data(dp->dp_spa),
+                            os, B_TRUE);
+                }
+
+                dsl_dataset_rele(ds, FTAG);
+        }
+
+        /* initialize the stack of datasets */
+        list_create(&namestack, sizeof (zfs_ds_collector_entry_t),
+            offsetof(zfs_ds_collector_entry_t, node));
+        tail = dsl_dataset_collector_cache_alloc();
+
+        /* push the head */
+        tail->cookie = 0;
+        tail->cookie_is_snap = B_FALSE;
+        (void) strcpy(tail->name, ddaa->from_ds);
+        list_insert_tail(&namestack, tail);
+
+        /* the head is processed at the very end and after all is done */
+        while (err == 0 && ((tail = list_tail(&namestack)) != NULL)) {
+                zfs_ds_collector_entry_t *el;
+                objset_t *os;
+                dsl_dataset_t *ds;
+                char *p;
+
+                /* init new entry */
+                el = dsl_dataset_collector_cache_alloc();
+                el->cookie = 0;
+                el->cookie_is_snap = B_FALSE;
+                (void) strcpy(el->name, tail->name);
+                p = el->name + strlen(el->name);
+
+                /* hold the current dataset to traverse its children */
+                err = dsl_dataset_hold(dp, tail->name, FTAG, &ds);
+                if (err != 0) {
+                        dsl_dataset_collector_cache_free(el);
+                        break;
+                }
+
+                err  = dmu_objset_from_ds(ds, &os);
+                if (err != 0) {
+                        dsl_dataset_rele(ds, FTAG);
+                        dsl_dataset_collector_cache_free(el);
+                        break;
+                }
+
+                if (dmu_objset_is_snapshot(os)) {
+                        /* traverse clones for snapshots */
+                        err = dmu_clone_list_next(os, MAXNAMELEN,
+                            el->name, NULL, &tail->cookie);
+                } else {
+                        /* for filesystems traverse fs first, then snaps */
+                        if (!tail->cookie_is_snap) {
+                                *p++ = '/';
+                                do {
+                                        *p = '\0';
+                                        err = dmu_dir_list_next(os,
+                                            MAXNAMELEN - (p - el->name),
+                                            p, NULL, &tail->cookie);
+                                } while (err == 0 &&
+                                    dataset_name_hidden(el->name));
+
+                                /* no more fs, move to snapshots */
+                                if (err == ENOENT) {
+                                        *(--p) = '\0';
+                                        tail->cookie_is_snap = 1;
+                                        tail->cookie = 0;
+                                        err = 0;
+                                }
+                        }
+
+                        if (err == 0 && tail->cookie_is_snap) {
+                                *p++ = '@';
+                                *p = '\0';
+                                err = dmu_snapshot_list_next(os,
+                                    MAXNAMELEN - (p - el->name),
+                                    p, NULL, &tail->cookie, NULL);
+                        }
+                }
+
+                if (err == 0) {
+                        /* a child was found, add it and continue */
+                        list_insert_tail(&namestack, el);
+                        dsl_dataset_rele(ds, FTAG);
+                        continue;
+                }
+
+                dsl_dataset_collector_cache_free(el);
+
+                if (err != ENOENT) {
+                        dsl_dataset_rele(ds, FTAG);
+                        break;
+                }
+
+                /*
+                 * There are no more children of the dataset, pop it from stack
+                 * and destroy it
+                 */
+
+                err = 0;
+
+                list_remove(&namestack, tail);
+
+                if (dmu_objset_is_snapshot(os)) {
+                        err = dsl_destroy_snapshot_check_impl(ds, defer);
+                        if (err == 0)
+                                dsl_destroy_snapshot_sync_impl(ds, defer, tx);
+                } else if (strchr(tail->name, '/') != NULL) {
+                        err = dsl_destroy_head_check_impl(ds, 0);
+                        if (err == 0)
+                                dsl_destroy_head_sync_impl(ds, tx);
+                }
+
+                dsl_dataset_rele(ds, FTAG);
+                dsl_dataset_collector_cache_free(tail);
+        }
+
+        if (err != 0) {
+                while ((tail = list_remove_tail(&namestack)) != NULL)
+                        dsl_dataset_collector_cache_free(tail);
+        }
+
+        ASSERT(list_head(&namestack) == NULL);
+
+        list_destroy(&namestack);
+
+        return (err);
+}
+
+/*ARGSUSED*/
+void
+dsl_destroy_atomically_sync_dummy(void *arg, dmu_tx_t *tx)
+{
+}
+
+int
+dsl_destroy_atomically(const char *name, boolean_t defer)
+{
+        dmu_destroy_atomically_arg_t ddaa;
+
+        ddaa.from_ds = name;
+        ddaa.defer = defer;
+
+        return (dsl_sync_task(name, dsl_destroy_atomically_sync,
+            dsl_destroy_atomically_sync_dummy, &ddaa, 0, ZFS_SPACE_CHECK_NONE));
 }