Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>


   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/dsl_pool.h>
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dsl_scan.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/arc.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_context.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/spa_impl.h>
  42 #include <sys/dsl_deadlist.h>


  43 
  44 int zfs_no_write_throttle = 0;  /* NOTE(review): presumably nonzero disables the write throttle -- confirm */
  45 int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  46 int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  47 
  48 uint64_t zfs_write_limit_min = 32 << 20;  /* min write limit is 32MB */
  49 uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  50 uint64_t zfs_write_limit_inflated = 0;  /* NOTE(review): consumers not visible here -- meaning unconfirmed */
  51 uint64_t zfs_write_limit_override = 0;  /* NOTE(review): presumably a manual override of the limit -- confirm */
  52 
  53 kmutex_t zfs_write_limit_lock;  /* NOTE(review): presumably guards the write-limit tunables above -- confirm */
  54 
  55 static pgcnt_t old_physmem = 0;  /* NOTE(review): presumably the last physmem value seen -- confirm */
  56 
  57 int
  58 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  59 {
  60         uint64_t obj;
  61         int err;
  62 


  83         txg_init(dp, txg);
  84 
  85         txg_list_create(&dp->dp_dirty_datasets,
  86             offsetof(dsl_dataset_t, ds_dirty_link));
  87         txg_list_create(&dp->dp_dirty_dirs,
  88             offsetof(dsl_dir_t, dd_dirty_link));
  89         txg_list_create(&dp->dp_sync_tasks,
  90             offsetof(dsl_sync_task_group_t, dstg_node));
  91         list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
  92             offsetof(dsl_dataset_t, ds_synced_link));
  93 
  94         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
  95 
  96         dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
  97             1, 4, 0);
  98 
  99         return (dp);
 100 }
 101 
 102 int
 103 dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 104 {
             /*
              * Build a dsl_pool_t for spa via dsl_pool_open_impl(), then, with
              * dp_config_rwlock held as writer, open the meta objset and the
              * directories/objects recorded in the pool directory ZAP.
              *
              * On success *dpp is set to the new pool and 0 is returned.  On
              * any error the partially opened pool is handed to
              * dsl_pool_close(), *dpp is left untouched and the error is
              * returned.
              */
 105         int err;
 106         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);















 107         dsl_dir_t *dd;
 108         dsl_dataset_t *ds;
 109         uint64_t obj;
 110 
 111         rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 112         err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 113             &dp->dp_meta_objset);
 114         if (err)
 115                 goto out;
 116 

             /* The root dsl_dir's object number is read from the pool directory. */
 117         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 118             DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 119             &dp->dp_root_dir_obj);
 120         if (err)
 121                 goto out;
 122 
 123         err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 124             NULL, dp, &dp->dp_root_dir);
 125         if (err)
 126                 goto out;
 127 
 128         err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 129         if (err)
 130                 goto out;
 131 
             /* The origin dir is only looked up at or above SPA_VERSION_ORIGIN. */
 132         if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
 133                 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 134                 if (err)
 135                         goto out;
                     /*
                      * Hold the origin dir's head dataset only long enough
                      * (tag FTAG) to read ds_prev_snap_obj, which is then held
                      * with tag dp and kept in dp_origin_snap.
                      */
 136                 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 137                     FTAG, &ds);
 138                 if (err == 0) {
 139                         err = dsl_dataset_hold_obj(dp,
 140                             ds->ds_phys->ds_prev_snap_obj, dp,
 141                             &dp->dp_origin_snap);
 142                         dsl_dataset_rele(ds, FTAG);
 143                 }
                     /* dd is closed whether or not the holds worked; err is checked after. */
 144                 dsl_dir_close(dd, dp);
 145                 if (err)
 146                         goto out;
 147         }
 148 
             /* The free dir and free bpobj are only opened at or above SPA_VERSION_DEADLISTS. */
 149         if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 150                 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 151                     &dp->dp_free_dir);
 152                 if (err)
 153                         goto out;
 154 
 155                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 156                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 157                 if (err)
 158                         goto out;
                     /* A bpobj_open() failure trips VERIFY3U instead of being returned. */
 159                 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
 160                     dp->dp_meta_objset, obj));
 161         }
 162 


 163         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,







 164             DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 165             &dp->dp_tmp_userrefs_obj);
             /* A missing DMU_POOL_TMP_USERREFS entry (ENOENT) is not an error. */
 166         if (err == ENOENT)
 167                 err = 0;
 168         if (err)
 169                 goto out;
 170 
 171         err = dsl_scan_init(dp, txg);
 172 
 173 out:
             /* Common exit: drop the config lock, then publish dp or tear it down. */
 174         rw_exit(&dp->dp_config_rwlock);
 175         if (err)
 176                 dsl_pool_close(dp);
 177         else
 178                 *dpp = dp;
 179 
 180         return (err);
 181 }
 182 
 183 void
 184 dsl_pool_close(dsl_pool_t *dp)
 185 {
 186         /* drop our references from dsl_pool_open() */
 187 
 188         /*
 189          * Since we held the origin_snap from "syncing" context (which
 190          * includes pool-opening context), it actually only got a "ref"
 191          * and not a hold, so just drop that here.
 192          */
 193         if (dp->dp_origin_snap)
 194                 dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 195         if (dp->dp_mos_dir)
 196                 dsl_dir_close(dp->dp_mos_dir, dp);
 197         if (dp->dp_free_dir)
 198                 dsl_dir_close(dp->dp_free_dir, dp);
 199         if (dp->dp_root_dir)


 453         objset_t *os;
 454 
 455         while (ds = list_head(&dp->dp_synced_datasets)) {
 456                 list_remove(&dp->dp_synced_datasets, ds);
 457                 os = ds->ds_objset;
 458                 zil_clean(os->os_zil, txg);
 459                 ASSERT(!dmu_objset_is_dirty(os, txg));
 460                 dmu_buf_rele(ds->ds_dbuf, ds);
 461         }
 462         ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 463 }
 464 
 465 /*
 466  * TRUE if the current thread is the tx_sync_thread or if we
 467  * are being called from SPA context during pool initialization.
      *
      * Pool initialization is detected here by spa_get_dsl(dp->dp_spa)
      * returning NULL.  The result is the int value of the boolean
      * expression (nonzero for TRUE, 0 otherwise).
 468  */
 469 int
 470 dsl_pool_sync_context(dsl_pool_t *dp)
 471 {
 472         return (curthread == dp->dp_tx.tx_sync_thread ||
 473             spa_get_dsl(dp->dp_spa) == NULL);
 474 }
 475 
 476 uint64_t
 477 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 478 {
 479         uint64_t space, resv;
 480 
 481         /*
 482          * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 483          * efficiency.
 484          * XXX The intent log is not accounted for, so it must fit
 485          * within this slop.
 486          *
 487          * If we're trying to assess whether it's OK to do a free,
 488          * cut the reservation in half to allow forward progress
 489          * (e.g. make it possible to rm(1) files from a full pool).
 490          */
 491         space = spa_get_dspace(dp->dp_spa);
 492         resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 493         if (netfree)


 771                 htag = strchr(za.za_name, '-');
 772                 *htag = '\0';
 773                 ++htag;
 774                 dsobj = strtonum(za.za_name, NULL);
 775                 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
 776         }
 777         zap_cursor_fini(&zc);
 778 }
 779 
 780 /*
 781  * Create the pool-wide zap object for storing temporary snapshot holds.
      *
      * Asserts that no such object is recorded yet (dp_tmp_userrefs_obj == 0)
      * and that tx is a syncing transaction.  The new object number is stored
      * in dp->dp_tmp_userrefs_obj and added to the pool directory object under
      * DMU_POOL_TMP_USERREFS.
 782  */
 783 void
 784 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 785 {
 786         objset_t *mos = dp->dp_meta_objset;
 787 
 788         ASSERT(dp->dp_tmp_userrefs_obj == 0);
 789         ASSERT(dmu_tx_is_syncing(tx));
 790 
 791         dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
 792             DMU_OT_NONE, 0, tx);
 793 
             /* A zap_add() failure trips VERIFY; no error is returned to the caller. */
 794         VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
 795             sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
 796 }
 797 
 798 static int
 799 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 800     const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
 801 {
 802         objset_t *mos = dp->dp_meta_objset;
 803         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 804         char *name;
 805         int error;
 806 
 807         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 808         ASSERT(dmu_tx_is_syncing(tx));
 809 
 810         /*
 811          * If the pool was created prior to SPA_VERSION_USERREFS, the
 812          * zap object for temporary holds might not exist yet.
 813          */
 814         if (zapobj == 0) {
 815                 if (holding) {




   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/dsl_pool.h>
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dsl_scan.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/arc.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_context.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/spa_impl.h>
  42 #include <sys/dsl_deadlist.h>
  43 #include <sys/bptree.h>
  44 #include <sys/zfeature.h>
  45 
  46 int zfs_no_write_throttle = 0;  /* NOTE(review): presumably nonzero disables the write throttle -- confirm */
  47 int zfs_write_limit_shift = 3;                  /* 1/8th of physical memory */
  48 int zfs_txg_synctime_ms = 1000;         /* target millisecs to sync a txg */
  49 
  50 uint64_t zfs_write_limit_min = 32 << 20;  /* min write limit is 32MB */
  51 uint64_t zfs_write_limit_max = 0;               /* max data payload per txg */
  52 uint64_t zfs_write_limit_inflated = 0;  /* NOTE(review): consumers not visible here -- meaning unconfirmed */
  53 uint64_t zfs_write_limit_override = 0;  /* NOTE(review): presumably a manual override of the limit -- confirm */
  54 
  55 kmutex_t zfs_write_limit_lock;  /* NOTE(review): presumably guards the write-limit tunables above -- confirm */
  56 
  57 static pgcnt_t old_physmem = 0;  /* NOTE(review): presumably the last physmem value seen -- confirm */
  58 
  59 int
  60 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
  61 {
  62         uint64_t obj;
  63         int err;
  64 


  85         txg_init(dp, txg);
  86 
  87         txg_list_create(&dp->dp_dirty_datasets,
  88             offsetof(dsl_dataset_t, ds_dirty_link));
  89         txg_list_create(&dp->dp_dirty_dirs,
  90             offsetof(dsl_dir_t, dd_dirty_link));
  91         txg_list_create(&dp->dp_sync_tasks,
  92             offsetof(dsl_sync_task_group_t, dstg_node));
  93         list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
  94             offsetof(dsl_dataset_t, ds_synced_link));
  95 
  96         mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
  97 
  98         dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
  99             1, 4, 0);
 100 
 101         return (dp);
 102 }
 103 
 104 int
 105 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 106 {
             /*
              * Build a dsl_pool_t for spa via dsl_pool_open_impl() and open
              * its meta objset from dp_meta_rootbp.  On success *dpp is set
              * and 0 is returned; on failure the pool is passed to
              * dsl_pool_close(), *dpp is left untouched and the error from
              * dmu_objset_open_impl() is returned.
              */
 107         int err;
 108         dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 109 
 110         err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 111             &dp->dp_meta_objset);
 112         if (err != 0)
 113                 dsl_pool_close(dp);
 114         else
 115                 *dpp = dp;
 116 
 117         return (err);
 118 }
 119 
 120 int
 121 dsl_pool_open(dsl_pool_t *dp)
 122 {
             /*
              * With dp_config_rwlock held as writer, open the directories and
              * objects recorded in the pool directory ZAP of dp->dp_meta_objset,
              * then initialize scanning.  Returns 0 or the first error hit.
              *
              * dp->dp_meta_objset is read but never set here, so it must
              * already be open on entry.
              * NOTE(review): no dsl_pool_close() is done on failure in this
              * function; presumably the caller tears dp down -- confirm.
              */
 123         int err;
 124         dsl_dir_t *dd;
 125         dsl_dataset_t *ds;
 126         uint64_t obj;
 127 
             /* Debug check: dmu_objset_is_dirty_anywhere() must be false for the MOS. */
 128         ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset));




 129 
 130         rw_enter(&dp->dp_config_rwlock, RW_WRITER);
             /* The root dsl_dir's object number is read from the pool directory. */
 131         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 132             DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 133             &dp->dp_root_dir_obj);
 134         if (err)
 135                 goto out;
 136 
 137         err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 138             NULL, dp, &dp->dp_root_dir);
 139         if (err)
 140                 goto out;
 141 
 142         err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 143         if (err)
 144                 goto out;
 145 
             /* The origin dir is only looked up at or above SPA_VERSION_ORIGIN. */
 146         if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 147                 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 148                 if (err)
 149                         goto out;
                     /*
                      * Hold the origin dir's head dataset only long enough
                      * (tag FTAG) to read ds_prev_snap_obj, which is then held
                      * with tag dp and kept in dp_origin_snap.
                      */
 150                 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
 151                     FTAG, &ds);
 152                 if (err == 0) {
 153                         err = dsl_dataset_hold_obj(dp,
 154                             ds->ds_phys->ds_prev_snap_obj, dp,
 155                             &dp->dp_origin_snap);
 156                         dsl_dataset_rele(ds, FTAG);
 157                 }
                     /* dd is closed whether or not the holds worked; err is checked after. */
 158                 dsl_dir_close(dd, dp);
 159                 if (err)
 160                         goto out;
 161         }
 162 
             /* The free dir and free bpobj are only opened at or above SPA_VERSION_DEADLISTS. */
 163         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 164                 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
 165                     &dp->dp_free_dir);
 166                 if (err)
 167                         goto out;
 168 
 169                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 170                     DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 171                 if (err)
 172                         goto out;
                     /* A bpobj_open() failure trips VERIFY3U instead of being returned. */
 173                 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
 174                     dp->dp_meta_objset, obj));
 175         }
 176 
             /*
              * The bptree object number is only looked up when the
              * async-destroy feature is reported active; in that case any
              * lookup error, including a missing entry, fails the open.
              */
 177         if (spa_feature_is_active(dp->dp_spa,
 178             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
 179                 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 180                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
 181                     &dp->dp_bptree_obj);
 182                 if (err != 0)
 183                         goto out;
 184         }
 185 
 186         err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 187             DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 188             &dp->dp_tmp_userrefs_obj);
             /* A missing DMU_POOL_TMP_USERREFS entry (ENOENT) is not an error. */
 189         if (err == ENOENT)
 190                 err = 0;
 191         if (err)
 192                 goto out;
 193 
 194         err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 195 
 196 out:
             /* Common exit: only the config lock is released here. */
 197         rw_exit(&dp->dp_config_rwlock);





 198         return (err);
 199 }
 200 
 201 void
 202 dsl_pool_close(dsl_pool_t *dp)
 203 {
 204         /* drop our references from dsl_pool_open() */
 205 
 206         /*
 207          * Since we held the origin_snap from "syncing" context (which
 208          * includes pool-opening context), it actually only got a "ref"
 209          * and not a hold, so just drop that here.
 210          */
 211         if (dp->dp_origin_snap)
 212                 dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 213         if (dp->dp_mos_dir)
 214                 dsl_dir_close(dp->dp_mos_dir, dp);
 215         if (dp->dp_free_dir)
 216                 dsl_dir_close(dp->dp_free_dir, dp);
 217         if (dp->dp_root_dir)


 471         objset_t *os;
 472 
 473         while (ds = list_head(&dp->dp_synced_datasets)) {
 474                 list_remove(&dp->dp_synced_datasets, ds);
 475                 os = ds->ds_objset;
 476                 zil_clean(os->os_zil, txg);
 477                 ASSERT(!dmu_objset_is_dirty(os, txg));
 478                 dmu_buf_rele(ds->ds_dbuf, ds);
 479         }
 480         ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 481 }
 482 
 483 /*
 484  * TRUE if the current thread is the tx_sync_thread or if we
 485  * are being called from SPA context during pool initialization.
      *
      * Pool initialization is detected here via
      * spa_is_initializing(dp->dp_spa).  The result is the int value of the
      * boolean expression (nonzero for TRUE, 0 otherwise).
 486  */
 487 int
 488 dsl_pool_sync_context(dsl_pool_t *dp)
 489 {
 490         return (curthread == dp->dp_tx.tx_sync_thread ||
 491             spa_is_initializing(dp->dp_spa));
 492 }
 493 
 494 uint64_t
 495 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
 496 {
 497         uint64_t space, resv;
 498 
 499         /*
 500          * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 501          * efficiency.
 502          * XXX The intent log is not accounted for, so it must fit
 503          * within this slop.
 504          *
 505          * If we're trying to assess whether it's OK to do a free,
 506          * cut the reservation in half to allow forward progress
 507          * (e.g. make it possible to rm(1) files from a full pool).
 508          */
 509         space = spa_get_dspace(dp->dp_spa);
 510         resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 511         if (netfree)


 789                 htag = strchr(za.za_name, '-');
 790                 *htag = '\0';
 791                 ++htag;
 792                 dsobj = strtonum(za.za_name, NULL);
 793                 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
 794         }
 795         zap_cursor_fini(&zc);
 796 }
 797 
 798 /*
 799  * Create the pool-wide zap object for storing temporary snapshot holds.
      *
      * Asserts that no such object is recorded yet (dp_tmp_userrefs_obj == 0)
      * and that tx is a syncing transaction.  The object number returned by
      * zap_create_link() is stored in dp->dp_tmp_userrefs_obj.
      * NOTE(review): zap_create_link() is handed the pool directory object and
      * the DMU_POOL_TMP_USERREFS name; presumably it both creates the ZAP and
      * adds the directory entry -- confirm against its definition.
 800  */
 801 void
 802 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
 803 {
 804         objset_t *mos = dp->dp_meta_objset;
 805 
 806         ASSERT(dp->dp_tmp_userrefs_obj == 0);
 807         ASSERT(dmu_tx_is_syncing(tx));
 808 
 809         dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
 810             DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);



 811 }
 812 
 813 static int
 814 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
 815     const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
 816 {
 817         objset_t *mos = dp->dp_meta_objset;
 818         uint64_t zapobj = dp->dp_tmp_userrefs_obj;
 819         char *name;
 820         int error;
 821 
 822         ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 823         ASSERT(dmu_tx_is_syncing(tx));
 824 
 825         /*
 826          * If the pool was created prior to SPA_VERSION_USERREFS, the
 827          * zap object for temporary holds might not exist yet.
 828          */
 829         if (zapobj == 0) {
 830                 if (holding) {