3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 by Delphix. All rights reserved.
24 */
25
26 #include <sys/dsl_pool.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_prop.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dsl_scan.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/arc.h>
36 #include <sys/zap.h>
37 #include <sys/zio.h>
38 #include <sys/zfs_context.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/spa_impl.h>
42 #include <sys/dsl_deadlist.h>
43
44 int zfs_no_write_throttle = 0;
45 int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
46 int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
47
48 uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
49 uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
50 uint64_t zfs_write_limit_inflated = 0;
51 uint64_t zfs_write_limit_override = 0;
52
53 kmutex_t zfs_write_limit_lock;
54
55 static pgcnt_t old_physmem = 0;
56
57 int
58 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
59 {
60 uint64_t obj;
61 int err;
62
83 txg_init(dp, txg);
84
85 txg_list_create(&dp->dp_dirty_datasets,
86 offsetof(dsl_dataset_t, ds_dirty_link));
87 txg_list_create(&dp->dp_dirty_dirs,
88 offsetof(dsl_dir_t, dd_dirty_link));
89 txg_list_create(&dp->dp_sync_tasks,
90 offsetof(dsl_sync_task_group_t, dstg_node));
91 list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
92 offsetof(dsl_dataset_t, ds_synced_link));
93
94 mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
95
96 dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
97 1, 4, 0);
98
99 return (dp);
100 }
101
102 int
103 dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
104 {
105 int err;
106 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
107 dsl_dir_t *dd;
108 dsl_dataset_t *ds;
109 uint64_t obj;
110
111 rw_enter(&dp->dp_config_rwlock, RW_WRITER);
112 err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
113 &dp->dp_meta_objset);
114 if (err)
115 goto out;
116
117 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
118 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
119 &dp->dp_root_dir_obj);
120 if (err)
121 goto out;
122
123 err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
124 NULL, dp, &dp->dp_root_dir);
125 if (err)
126 goto out;
127
128 err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
129 if (err)
130 goto out;
131
132 if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
133 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
134 if (err)
135 goto out;
136 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
137 FTAG, &ds);
138 if (err == 0) {
139 err = dsl_dataset_hold_obj(dp,
140 ds->ds_phys->ds_prev_snap_obj, dp,
141 &dp->dp_origin_snap);
142 dsl_dataset_rele(ds, FTAG);
143 }
144 dsl_dir_close(dd, dp);
145 if (err)
146 goto out;
147 }
148
149 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
150 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
151 &dp->dp_free_dir);
152 if (err)
153 goto out;
154
155 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
156 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
157 if (err)
158 goto out;
159 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
160 dp->dp_meta_objset, obj));
161 }
162
163 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
164 DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
165 &dp->dp_tmp_userrefs_obj);
166 if (err == ENOENT)
167 err = 0;
168 if (err)
169 goto out;
170
171 err = dsl_scan_init(dp, txg);
172
173 out:
174 rw_exit(&dp->dp_config_rwlock);
175 if (err)
176 dsl_pool_close(dp);
177 else
178 *dpp = dp;
179
180 return (err);
181 }
182
183 void
184 dsl_pool_close(dsl_pool_t *dp)
185 {
186 /* drop our references from dsl_pool_open() */
187
188 /*
189 * Since we held the origin_snap from "syncing" context (which
190 * includes pool-opening context), it actually only got a "ref"
191 * and not a hold, so just drop that here.
192 */
193 if (dp->dp_origin_snap)
194 dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
195 if (dp->dp_mos_dir)
196 dsl_dir_close(dp->dp_mos_dir, dp);
197 if (dp->dp_free_dir)
198 dsl_dir_close(dp->dp_free_dir, dp);
199 if (dp->dp_root_dir)
453 objset_t *os;
454
455 while (ds = list_head(&dp->dp_synced_datasets)) {
456 list_remove(&dp->dp_synced_datasets, ds);
457 os = ds->ds_objset;
458 zil_clean(os->os_zil, txg);
459 ASSERT(!dmu_objset_is_dirty(os, txg));
460 dmu_buf_rele(ds->ds_dbuf, ds);
461 }
462 ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
463 }
464
465 /*
466 * TRUE if the current thread is the tx_sync_thread or if we
467 * are being called from SPA context during pool initialization.
468 */
469 int
470 dsl_pool_sync_context(dsl_pool_t *dp)
471 {
472 return (curthread == dp->dp_tx.tx_sync_thread ||
473 spa_get_dsl(dp->dp_spa) == NULL);
474 }
475
476 uint64_t
477 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
478 {
479 uint64_t space, resv;
480
481 /*
482 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
483 * efficiency.
484 * XXX The intent log is not accounted for, so it must fit
485 * within this slop.
486 *
487 * If we're trying to assess whether it's OK to do a free,
488 * cut the reservation in half to allow forward progress
489 * (e.g. make it possible to rm(1) files from a full pool).
490 */
491 space = spa_get_dspace(dp->dp_spa);
492 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
493 if (netfree)
771 htag = strchr(za.za_name, '-');
772 *htag = '\0';
773 ++htag;
774 dsobj = strtonum(za.za_name, NULL);
775 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
776 }
777 zap_cursor_fini(&zc);
778 }
779
780 /*
781 * Create the pool-wide zap object for storing temporary snapshot holds.
782 */
783 void
784 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
785 {
786 objset_t *mos = dp->dp_meta_objset;
787
788 ASSERT(dp->dp_tmp_userrefs_obj == 0);
789 ASSERT(dmu_tx_is_syncing(tx));
790
791 dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
792 DMU_OT_NONE, 0, tx);
793
794 VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
795 sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
796 }
797
798 static int
799 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
800 const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
801 {
802 objset_t *mos = dp->dp_meta_objset;
803 uint64_t zapobj = dp->dp_tmp_userrefs_obj;
804 char *name;
805 int error;
806
807 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
808 ASSERT(dmu_tx_is_syncing(tx));
809
810 /*
811 * If the pool was created prior to SPA_VERSION_USERREFS, the
812 * zap object for temporary holds might not exist yet.
813 */
814 if (zapobj == 0) {
815 if (holding) {
|
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 */
25
26 #include <sys/dsl_pool.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_prop.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dsl_scan.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/arc.h>
36 #include <sys/zap.h>
37 #include <sys/zio.h>
38 #include <sys/zfs_context.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/spa_impl.h>
42 #include <sys/dsl_deadlist.h>
43 #include <sys/bptree.h>
44 #include <sys/zfeature.h>
45
46 int zfs_no_write_throttle = 0;
47 int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
48 int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
49
50 uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
51 uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
52 uint64_t zfs_write_limit_inflated = 0;
53 uint64_t zfs_write_limit_override = 0;
54
55 kmutex_t zfs_write_limit_lock;
56
57 static pgcnt_t old_physmem = 0;
58
59 int
60 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
61 {
62 uint64_t obj;
63 int err;
64
85 txg_init(dp, txg);
86
87 txg_list_create(&dp->dp_dirty_datasets,
88 offsetof(dsl_dataset_t, ds_dirty_link));
89 txg_list_create(&dp->dp_dirty_dirs,
90 offsetof(dsl_dir_t, dd_dirty_link));
91 txg_list_create(&dp->dp_sync_tasks,
92 offsetof(dsl_sync_task_group_t, dstg_node));
93 list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
94 offsetof(dsl_dataset_t, ds_synced_link));
95
96 mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
97
98 dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
99 1, 4, 0);
100
101 return (dp);
102 }
103
104 int
105 dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
106 {
107 int err;
108 dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
109
110 err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
111 &dp->dp_meta_objset);
112 if (err != 0)
113 dsl_pool_close(dp);
114 else
115 *dpp = dp;
116
117 return (err);
118 }
119
120 int
121 dsl_pool_open(dsl_pool_t *dp)
122 {
123 int err;
124 dsl_dir_t *dd;
125 dsl_dataset_t *ds;
126 uint64_t obj;
127
128 ASSERT(!dmu_objset_is_dirty_anywhere(dp->dp_meta_objset));
129
130 rw_enter(&dp->dp_config_rwlock, RW_WRITER);
131 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
132 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
133 &dp->dp_root_dir_obj);
134 if (err)
135 goto out;
136
137 err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
138 NULL, dp, &dp->dp_root_dir);
139 if (err)
140 goto out;
141
142 err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
143 if (err)
144 goto out;
145
146 if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
147 err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
148 if (err)
149 goto out;
150 err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
151 FTAG, &ds);
152 if (err == 0) {
153 err = dsl_dataset_hold_obj(dp,
154 ds->ds_phys->ds_prev_snap_obj, dp,
155 &dp->dp_origin_snap);
156 dsl_dataset_rele(ds, FTAG);
157 }
158 dsl_dir_close(dd, dp);
159 if (err)
160 goto out;
161 }
162
163 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
164 err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
165 &dp->dp_free_dir);
166 if (err)
167 goto out;
168
169 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
170 DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
171 if (err)
172 goto out;
173 VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
174 dp->dp_meta_objset, obj));
175 }
176
177 if (spa_feature_is_active(dp->dp_spa,
178 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
179 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
180 DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
181 &dp->dp_bptree_obj);
182 if (err != 0)
183 goto out;
184 }
185
186 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
187 DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
188 &dp->dp_tmp_userrefs_obj);
189 if (err == ENOENT)
190 err = 0;
191 if (err)
192 goto out;
193
194 err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
195
196 out:
197 rw_exit(&dp->dp_config_rwlock);
198 return (err);
199 }
200
201 void
202 dsl_pool_close(dsl_pool_t *dp)
203 {
204 /* drop our references from dsl_pool_open() */
205
206 /*
207 * Since we held the origin_snap from "syncing" context (which
208 * includes pool-opening context), it actually only got a "ref"
209 * and not a hold, so just drop that here.
210 */
211 if (dp->dp_origin_snap)
212 dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
213 if (dp->dp_mos_dir)
214 dsl_dir_close(dp->dp_mos_dir, dp);
215 if (dp->dp_free_dir)
216 dsl_dir_close(dp->dp_free_dir, dp);
217 if (dp->dp_root_dir)
471 objset_t *os;
472
473 while (ds = list_head(&dp->dp_synced_datasets)) {
474 list_remove(&dp->dp_synced_datasets, ds);
475 os = ds->ds_objset;
476 zil_clean(os->os_zil, txg);
477 ASSERT(!dmu_objset_is_dirty(os, txg));
478 dmu_buf_rele(ds->ds_dbuf, ds);
479 }
480 ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
481 }
482
483 /*
484 * TRUE if the current thread is the tx_sync_thread or if we
485 * are being called from SPA context during pool initialization.
486 */
487 int
488 dsl_pool_sync_context(dsl_pool_t *dp)
489 {
490 return (curthread == dp->dp_tx.tx_sync_thread ||
491 spa_is_initializing(dp->dp_spa));
492 }
493
494 uint64_t
495 dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
496 {
497 uint64_t space, resv;
498
499 /*
500 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
501 * efficiency.
502 * XXX The intent log is not accounted for, so it must fit
503 * within this slop.
504 *
505 * If we're trying to assess whether it's OK to do a free,
506 * cut the reservation in half to allow forward progress
507 * (e.g. make it possible to rm(1) files from a full pool).
508 */
509 space = spa_get_dspace(dp->dp_spa);
510 resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
511 if (netfree)
789 htag = strchr(za.za_name, '-');
790 *htag = '\0';
791 ++htag;
792 dsobj = strtonum(za.za_name, NULL);
793 (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE);
794 }
795 zap_cursor_fini(&zc);
796 }
797
798 /*
799 * Create the pool-wide zap object for storing temporary snapshot holds.
800 */
801 void
802 dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
803 {
804 objset_t *mos = dp->dp_meta_objset;
805
806 ASSERT(dp->dp_tmp_userrefs_obj == 0);
807 ASSERT(dmu_tx_is_syncing(tx));
808
809 dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
810 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
811 }
812
813 static int
814 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
815 const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
816 {
817 objset_t *mos = dp->dp_meta_objset;
818 uint64_t zapobj = dp->dp_tmp_userrefs_obj;
819 char *name;
820 int error;
821
822 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
823 ASSERT(dmu_tx_is_syncing(tx));
824
825 /*
826 * If the pool was created prior to SPA_VERSION_USERREFS, the
827 * zap object for temporary holds might not exist yet.
828 */
829 if (zapobj == 0) {
830 if (holding) {
|