Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
|
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 24 * Copyright (c) 2013 Martin Matuska. All rights reserved.
25 25 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
26 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 27 */
27 28
28 29 #include <sys/dmu.h>
29 30 #include <sys/dmu_objset.h>
30 31 #include <sys/dmu_tx.h>
31 32 #include <sys/dsl_dataset.h>
32 33 #include <sys/dsl_dir.h>
33 34 #include <sys/dsl_prop.h>
34 35 #include <sys/dsl_synctask.h>
35 36 #include <sys/dsl_deleg.h>
36 37 #include <sys/dmu_impl.h>
37 38 #include <sys/spa.h>
38 39 #include <sys/metaslab.h>
39 40 #include <sys/zap.h>
40 41 #include <sys/zio.h>
41 42 #include <sys/arc.h>
42 43 #include <sys/sunddi.h>
43 44 #include <sys/zfeature.h>
44 45 #include <sys/policy.h>
45 46 #include <sys/zfs_znode.h>
46 47 #include "zfs_namecheck.h"
47 48 #include "zfs_prop.h"
48 49
49 50 /*
50 51 * Filesystem and Snapshot Limits
51 52 * ------------------------------
52 53 *
53 54 * These limits are used to restrict the number of filesystems and/or snapshots
54 55 * that can be created at a given level in the tree or below. A typical
55 56 * use-case is with a delegated dataset where the administrator wants to ensure
56 57 * that a user within the zone is not creating too many additional filesystems
57 58 * or snapshots, even though they're not exceeding their space quota.
58 59 *
59 60 * The filesystem and snapshot counts are stored as extensible properties. This
60 61 * capability is controlled by a feature flag and must be enabled to be used.
61 62 * Once enabled, the feature is not active until the first limit is set. At
62 63 * that point, future operations to create/destroy filesystems or snapshots
63 64 * will validate and update the counts.
64 65 *
65 66 * Because the count properties will not exist before the feature is active,
66 67 * the counts are updated when a limit is first set on an uninitialized
67 68 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
68 69 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
69 70 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
70 71 * snapshot count properties on a node indicate uninitialized counts on that
71 72 * node.) When first setting a limit on an uninitialized node, the code starts
72 73 * at the filesystem with the new limit and descends into all sub-filesystems
73 74 * to add the count properties.
74 75 *
75 76 * In practice this is lightweight since a limit is typically set when the
76 77 * filesystem is created and thus has no children. Once valid, changing the
77 78 * limit value won't require a re-traversal since the counts are already valid.
78 79 * When recursively fixing the counts, if a node with a limit is encountered
79 80 * during the descent, the counts are known to be valid and there is no need to
80 81 * descend into that filesystem's children. The counts on filesystems above the
81 82 * one with the new limit will still be uninitialized, unless a limit is
82 83 * eventually set on one of those filesystems. The counts are always recursively
83 84 * updated when a limit is set on a dataset, unless there is already a limit.
84 85 * When a new limit value is set on a filesystem with an existing limit, it is
85 86 * possible for the new limit to be less than the current count at that level
86 87 * since a user who can change the limit is also allowed to exceed the limit.
87 88 *
88 89 * Once the feature is active, then whenever a filesystem or snapshot is
89 90 * created, the code recurses up the tree, validating the new count against the
90 91 * limit at each initialized level. In practice, most levels will not have a
91 92 * limit set. If there is a limit at any initialized level up the tree, the
92 93 * check must pass or the creation will fail. Likewise, when a filesystem or
93 94 * snapshot is destroyed, the counts are recursively adjusted all the way up
 94  95  * the initialized nodes in the tree. Renaming a filesystem into a different point
95 96 * in the tree will first validate, then update the counts on each branch up to
96 97 * the common ancestor. A receive will also validate the counts and then update
97 98 * them.
98 99 *
99 100 * An exception to the above behavior is that the limit is not enforced if the
100 101 * user has permission to modify the limit. This is primarily so that
101 102 * recursive snapshots in the global zone always work. We want to prevent a
102 103 * denial-of-service in which a lower level delegated dataset could max out its
103 104 * limit and thus block recursive snapshots from being taken in the global zone.
104 105 * Because of this, it is possible for the snapshot count to be over the limit
105 106 * and snapshots taken in the global zone could cause a lower level dataset to
106 107 * hit or exceed its limit. The administrator taking the global zone recursive
107 108 * snapshot should be aware of this side-effect and behave accordingly.
108 109 * For consistency, the filesystem limit is also not enforced if the user can
109 110 * modify the limit.
110 111 *
111 112 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
112 113  * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
113 114 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
114 115 * dsl_dir_init_fs_ss_count().
115 116 *
116 117 * There is a special case when we receive a filesystem that already exists. In
117 118 * this case a temporary clone name of %X is created (see dmu_recv_begin). We
|
↓ open down ↓ |
82 lines elided |
↑ open up ↑ |
118 119 * never update the filesystem counts for temporary clones.
119 120 *
120 121 * Likewise, we do not update the snapshot counts for temporary snapshots,
121 122 * such as those created by zfs diff.
122 123 */
123 124
124 125 extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
125 126
126 127 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
127 128
/*
 * Dbuf-user eviction callback for a dsl_dir_t, registered via
 * dmu_buf_init_user() in dsl_dir_hold_obj().  Invoked when the dir's
 * bonus dbuf drops its user; tears down the in-core dsl_dir_t and drops
 * the instantiate-to-evict holds it took.  Uses the async release
 * routines (dsl_dir_async_rele/spa_async_close) because eviction may
 * run from the dbuf eviction taskq (see issue 5056: avoids deadlock on
 * db_mtx and dn_holds).
 */
static void
dsl_dir_evict(void *dbu)
{
	dsl_dir_t *dd = dbu;
	dsl_pool_t *dp = dd->dd_pool;
	int t;

	/* The dbuf is going away; clear our back-pointer to it. */
	dd->dd_dbuf = NULL;

	/* An evicted dir must not be dirty or holding reservations. */
	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
		dsl_dir_async_rele(dd->dd_parent, dd);

	spa_async_close(dd->dd_pool->dp_spa, dd);

	/*
	 * The props callback list should have been cleaned up by
	 * objset_evict().
	 */
	list_destroy(&dd->dd_prop_cbs);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}
155 157
/*
 * Look up (or instantiate) the dsl_dir_t for object number ddobj, taking
 * a hold for "tag".  If "tail" is non-NULL it is used as the dir's name;
 * otherwise the name is found by searching the parent's child-dir ZAP.
 * On success, 0 is returned and *ddp is set; the caller must release the
 * hold with dsl_dir_rele().  The pool config lock must be held.
 */
int
dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	int err;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err != 0)
		return (err);
	/* If another thread already instantiated this dir, reuse it. */
	dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbuf, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
	}
#endif
	if (dd == NULL) {
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;
		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);

		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_node));

		dsl_dir_snap_cmtime_update(dd);

		/* Hold our parent (if any) and determine our name. */
		if (dsl_dir_phys(dd)->dd_parent_obj) {
			err = dsl_dir_hold_obj(dp,
			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
			    &dd->dd_parent);
			if (err != 0)
				goto errout;
			if (tail) {
#ifdef ZFS_DEBUG
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj, tail,
				    sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
#endif
				(void) strcpy(dd->dd_myname, tail);
			} else {
				/* Reverse-lookup our name by object number. */
				err = zap_value_search(dp->dp_meta_objset,
				    dsl_dir_phys(dd->dd_parent)->
				    dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname);
			}
			if (err != 0)
				goto errout;
		} else {
			/* No parent: this is the pool's root dir. */
			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
			    &origin_bonus);
			if (err != 0)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
		}

		/*
		 * Register dsl_dir_evict() as the dbuf-user eviction
		 * callback.  If another thread won the race to set the
		 * user, discard our copy and use the winner's.
		 */
		dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf);
		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
		if (winner != NULL) {
			if (dd->dd_parent)
				dsl_dir_rele(dd->dd_parent, dd);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa. We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool. We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	/* Undo partial instantiation. */
	if (dd->dd_parent)
		dsl_dir_rele(dd->dd_parent, dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
}
277 280
/*
 * Release a hold on a dsl_dir_t obtained from dsl_dir_hold{,_obj}().
 * Drops the open-to-close spa hold, then the bonus dbuf hold (dropping
 * the last dbuf hold may trigger dsl_dir_evict()).
 */
void
dsl_dir_rele(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
285 288
/*
 * Remove a reference to the given dsl dir that is being asynchronously
 * released. Async releases occur from a taskq performing eviction of
 * dsl datasets and dirs. This process is identical to a normal release
 * with the exception of using the async API for releasing the reference on
 * the spa (spa_async_close() instead of spa_close()), which avoids
 * lock-order issues when called from the eviction path.
 */
void
dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_async_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
303 +
286 304 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
287 305 void
288 306 dsl_dir_name(dsl_dir_t *dd, char *buf)
289 307 {
290 308 if (dd->dd_parent) {
291 309 dsl_dir_name(dd->dd_parent, buf);
292 310 (void) strcat(buf, "/");
293 311 } else {
294 312 buf[0] = '\0';
295 313 }
296 314 if (!MUTEX_HELD(&dd->dd_lock)) {
297 315 /*
298 316 * recursive mutex so that we can use
299 317 * dprintf_dd() with dd_lock held
300 318 */
301 319 mutex_enter(&dd->dd_lock);
302 320 (void) strcat(buf, dd->dd_myname);
303 321 mutex_exit(&dd->dd_lock);
304 322 } else {
305 323 (void) strcat(buf, dd->dd_myname);
306 324 }
307 325 }
308 326
309 327 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
310 328 int
311 329 dsl_dir_namelen(dsl_dir_t *dd)
312 330 {
313 331 int result = 0;
314 332
315 333 if (dd->dd_parent) {
316 334 /* parent's name + 1 for the "/" */
317 335 result = dsl_dir_namelen(dd->dd_parent) + 1;
318 336 }
319 337
320 338 if (!MUTEX_HELD(&dd->dd_lock)) {
321 339 /* see dsl_dir_name */
322 340 mutex_enter(&dd->dd_lock);
323 341 result += strlen(dd->dd_myname);
324 342 mutex_exit(&dd->dd_lock);
325 343 } else {
326 344 result += strlen(dd->dd_myname);
327 345 }
328 346
329 347 return (result);
330 348 }
331 349
332 350 static int
333 351 getcomponent(const char *path, char *component, const char **nextp)
334 352 {
335 353 char *p;
336 354
337 355 if ((path == NULL) || (path[0] == '\0'))
338 356 return (SET_ERROR(ENOENT));
339 357 /* This would be a good place to reserve some namespace... */
340 358 p = strpbrk(path, "/@");
341 359 if (p && (p[1] == '/' || p[1] == '@')) {
342 360 /* two separators in a row */
343 361 return (SET_ERROR(EINVAL));
344 362 }
345 363 if (p == NULL || p == path) {
346 364 /*
347 365 * if the first thing is an @ or /, it had better be an
348 366 * @ and it had better not have any more ats or slashes,
349 367 * and it had better have something after the @.
350 368 */
351 369 if (p != NULL &&
352 370 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
353 371 return (SET_ERROR(EINVAL));
354 372 if (strlen(path) >= MAXNAMELEN)
355 373 return (SET_ERROR(ENAMETOOLONG));
356 374 (void) strcpy(component, path);
357 375 p = NULL;
358 376 } else if (p[0] == '/') {
359 377 if (p - path >= MAXNAMELEN)
360 378 return (SET_ERROR(ENAMETOOLONG));
361 379 (void) strncpy(component, path, p - path);
362 380 component[p - path] = '\0';
363 381 p++;
364 382 } else if (p[0] == '@') {
365 383 /*
366 384 * if the next separator is an @, there better not be
367 385 * any more slashes.
368 386 */
369 387 if (strchr(path, '/'))
370 388 return (SET_ERROR(EINVAL));
371 389 if (p - path >= MAXNAMELEN)
372 390 return (SET_ERROR(ENAMETOOLONG));
373 391 (void) strncpy(component, path, p - path);
374 392 component[p - path] = '\0';
375 393 } else {
376 394 panic("invalid p=%p", (void *)p);
377 395 }
378 396 *nextp = p;
379 397 return (0);
380 398 }
381 399
/*
 * Return the dsl_dir_t, and possibly the last component which couldn't
 * be found in *tail. The name must be in the specified dsl_pool_t. This
 * thread must hold the dp_config_rwlock for the pool. Returns NULL if the
 * path is bogus, or if tail==NULL and we couldn't parse the whole name.
 * (*tail)[0] == '@' means that the last component is a snapshot.
 */
int
dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char buf[MAXNAMELEN];
	const char *spaname, *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	uint64_t ddobj;

	/* The first component must be the pool name. */
	err = getcomponent(name, buf, &next);
	if (err != 0)
		return (err);

	/* Make sure the name is in the specified pool. */
	spaname = spa_name(dp->dp_spa);
	if (strcmp(buf, spaname) != 0)
		return (SET_ERROR(EINVAL));

	ASSERT(dsl_pool_config_held(dp));

	/* Start at the root dir and descend one component at a time. */
	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err != 0) {
		return (err);
	}

	while (next != NULL) {
		dsl_dir_t *child_dd;
		err = getcomponent(next, buf, &nextnext);
		if (err != 0)
			break;
		ASSERT(next[0] != '\0');
		/* An '@' component is a snapshot; stop descending. */
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err != 0) {
			/* Component not found: leave *tailp pointing at it. */
			if (err == ENOENT)
				err = 0;
			break;
		}

		/* Hold the child before releasing the parent. */
		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
		if (err != 0)
			break;
		dsl_dir_rele(dd, tag);
		dd = child_dd;
		next = nextnext;
	}

	if (err != 0) {
		dsl_dir_rele(dd, tag);
		return (err);
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_rele(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		err = SET_ERROR(ENOENT);
	}
	if (tailp != NULL)
		*tailp = next;
	*ddp = dd;
	return (err);
}
464 482
/*
 * If the counts are already initialized for this filesystem and its
 * descendants then do nothing, otherwise initialize the counts.
 *
 * The counts on this filesystem, and those below, may be uninitialized due to
 * either the use of a pre-existing pool which did not support the
 * filesystem/snapshot limit feature, or one in which the feature had not yet
 * been enabled.
 *
 * Recursively descend the filesystem tree and update the filesystem/snapshot
 * counts on each filesystem below, then update the cumulative count on the
 * current filesystem. If the filesystem already has a count set on it,
 * then we know that its counts, and the counts on the filesystems below it,
 * are already correct, so we don't have to update this filesystem.
 */
static void
dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *os = dp->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	dsl_dataset_t *ds;

	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
	ASSERT(dsl_pool_config_held(dp));
	ASSERT(dmu_tx_is_syncing(tx));

	dsl_dir_zapify(dd, tx);

	/*
	 * If the filesystem count has already been initialized then we
	 * don't need to recurse down any further.
	 */
	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
		return;

	/* Heap-allocate the cursor/attribute to keep stack usage down. */
	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/* Iterate my child dirs */
	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t count;

		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
		    &chld_dd));

		/*
		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
		 * temporary datasets.
		 */
		if (chld_dd->dd_myname[0] == '$' ||
		    chld_dd->dd_myname[0] == '%') {
			dsl_dir_rele(chld_dd, FTAG);
			continue;
		}

		my_fs_cnt++;	/* count this child */

		/* Make the child's counts valid, then roll them into ours. */
		dsl_dir_init_fs_ss_count(chld_dd, tx);

		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
		my_fs_cnt += count;
		VERIFY0(zap_lookup(os, chld_dd->dd_object,
		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
		my_ss_cnt += count;

		dsl_dir_rele(chld_dd, FTAG);
	}
	zap_cursor_fini(zc);
	/* Count my snapshots (we counted children's snapshots above) */
	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));

	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		/* Don't count temporary snapshots */
		if (za->za_name[0] != '%')
			my_ss_cnt++;
	}
	zap_cursor_fini(zc);

	dsl_dataset_rele(ds, FTAG);

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

	/* we're in a sync task, update counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
}
565 583
566 584 static int
567 585 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
568 586 {
569 587 char *ddname = (char *)arg;
570 588 dsl_pool_t *dp = dmu_tx_pool(tx);
571 589 dsl_dataset_t *ds;
572 590 dsl_dir_t *dd;
573 591 int error;
574 592
575 593 error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
576 594 if (error != 0)
577 595 return (error);
578 596
579 597 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
580 598 dsl_dataset_rele(ds, FTAG);
581 599 return (SET_ERROR(ENOTSUP));
582 600 }
583 601
584 602 dd = ds->ds_dir;
585 603 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
586 604 dsl_dir_is_zapified(dd) &&
587 605 zap_contains(dp->dp_meta_objset, dd->dd_object,
588 606 DD_FIELD_FILESYSTEM_COUNT) == 0) {
589 607 dsl_dataset_rele(ds, FTAG);
590 608 return (SET_ERROR(EALREADY));
591 609 }
592 610
593 611 dsl_dataset_rele(ds, FTAG);
594 612 return (0);
595 613 }
596 614
/*
 * Sync task: activate the fs/ss limit feature if it is not yet active,
 * then make the on-disk filesystem/snapshot counts valid from the named
 * dataset's dir downward.
 */
static void
dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
{
	char *ddname = (char *)arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	spa_t *spa;

	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));

	spa = dsl_dataset_get_spa(ds);

	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
		/*
		 * Since the feature was not active and we're now setting a
		 * limit, increment the feature-active counter so that the
		 * feature becomes active for the first time.
		 *
		 * We are already in a sync task so we can update the MOS.
		 */
		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
	}

	/*
	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
	 * we need to ensure the counts are correct. Descend down the tree from
	 * this point and update all of the counts to be accurate.
	 */
	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);

	dsl_dataset_rele(ds, FTAG);
}
629 647
630 648 /*
631 649 * Make sure the feature is enabled and activate it if necessary.
632 650 * Since we're setting a limit, ensure the on-disk counts are valid.
633 651 * This is only called by the ioctl path when setting a limit value.
634 652 *
635 653 * We do not need to validate the new limit, since users who can change the
636 654 * limit are also allowed to exceed the limit.
637 655 */
638 656 int
639 657 dsl_dir_activate_fs_ss_limit(const char *ddname)
640 658 {
641 659 int error;
642 660
643 661 error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
644 662 dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
645 663 ZFS_SPACE_CHECK_RESERVED);
646 664
647 665 if (error == EALREADY)
648 666 error = 0;
649 667
650 668 return (error);
651 669 }
652 670
/*
 * Used to determine if the filesystem_limit or snapshot_limit should be
 * enforced. We allow the limit to be exceeded if the user has permission to
 * write the property value. We pass in the creds that we got in the open
 * context since we will always be the GZ root in syncing context. We also have
 * to handle the case where we are allowed to change the limit on the current
 * dataset, but there may be another limit in the tree above.
 *
 * We can never modify these two properties within a non-global zone. In
 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
 * can't use that function since we are already holding the dp_config_rwlock.
 * In addition, we already have the dd and dealing with snapshots is simplified
 * in this code.
 */

typedef enum {
	ENFORCE_ALWAYS,		/* enforce the limit at this level and above */
	ENFORCE_NEVER,		/* the user may exceed limits everywhere */
	ENFORCE_ABOVE		/* skip this level; ancestors still checked */
} enforce_res_t;

static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
{
	enforce_res_t enforce = ENFORCE_ALWAYS;
	uint64_t obj;
	dsl_dataset_t *ds;
	uint64_t zoned;

	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

#ifdef _KERNEL
	/* Limits always apply within a non-global zone. */
	if (crgetzoneid(cr) != GLOBAL_ZONEID)
		return (ENFORCE_ALWAYS);

	/* A caller with full zfs privilege is never limited. */
	if (secpolicy_zfs(cr) == 0)
		return (ENFORCE_NEVER);
#endif

	/* No head dataset (e.g. a hidden dir): enforce by default. */
	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
		return (ENFORCE_ALWAYS);

	ASSERT(dsl_pool_config_held(dd->dd_pool));

	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
		return (ENFORCE_ALWAYS);

	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
		/* Only root can access zoned fs's from the GZ */
		enforce = ENFORCE_ALWAYS;
	} else {
		/* Delegated write permission on the prop: skip this level. */
		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
			enforce = ENFORCE_ABOVE;
	}

	dsl_dataset_rele(ds, FTAG);
	return (enforce);
}
712 730
/*
 * Check if adding additional child filesystem(s) would exceed any filesystem
 * limits or adding additional snapshot(s) would exceed any snapshot limits.
 * The prop argument indicates which limit to check.
 *
 * Note that all filesystem limits up to the root (or the highest
 * initialized) filesystem or the given ancestor must be satisfied.
 */
int
dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
    dsl_dir_t *ancestor, cred_t *cr)
{
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t limit, count;
	char *count_prop;
	enforce_res_t enforce;
	int err = 0;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT);

	/*
	 * If we're allowed to change the limit, don't enforce the limit
	 * e.g. this can happen if a snapshot is taken by an administrative
	 * user in the global zone (i.e. a recursive snapshot by root).
	 * However, we must handle the case of delegated permissions where we
	 * are allowed to change the limit on the current dataset, but there
	 * is another limit in the tree above.
	 */
	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
	if (enforce == ENFORCE_NEVER)
		return (0);

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment
	 * is 0.
	 */
	if (delta == 0)
		return (0);

	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
		/*
		 * We don't enforce the limit for temporary snapshots. This is
		 * indicated by a NULL cred_t argument.
		 */
		if (cr == NULL)
			return (0);

		count_prop = DD_FIELD_SNAPSHOT_COUNT;
	} else {
		count_prop = DD_FIELD_FILESYSTEM_COUNT;
	}

	/*
	 * If an ancestor has been provided, stop checking the limit once we
	 * hit that dir. We need this during rename so that we don't overcount
	 * the check once we recurse up to the common ancestor.
	 */
	if (ancestor == dd)
		return (0);

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know there is no limit here (or above). The counts are
	 * not valid on this node and we know we won't touch this node's counts.
	 */
	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
	    count_prop, sizeof (count), 1, &count) == ENOENT)
		return (0);

	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
	    B_FALSE);
	if (err != 0)
		return (err);

	/*
	 * Is there a limit which we've hit?  (ENFORCE_ABOVE skips the check
	 * at this level but ancestors are still checked below.)
	 */
	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
		return (SET_ERROR(EDQUOT));

	/* Recurse upward: every initialized ancestor must also pass. */
	if (dd->dd_parent != NULL)
		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
		    ancestor, cr);

	return (err);
}
799 817
/*
 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
 * parents. When a new filesystem/snapshot is created, increment the count on
 * all parents, and when a filesystem/snapshot is destroyed, decrement the
 * count.
 */
void
dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
    dmu_tx_t *tx)
{
	int err;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	uint64_t count;

	ASSERT(dsl_pool_config_held(dd->dd_pool));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);

	/*
	 * When we receive an incremental stream into a filesystem that already
	 * exists, a temporary clone is created. We don't count this temporary
	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
	 * $MOS & $ORIGIN) objsets.
	 */
	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
		return;

	/*
	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
	 */
	if (delta == 0)
		return;

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we shouldn't touch this node's counts. An uninitialized count
	 * on the node indicates that either the feature has not yet been
	 * activated or there are no limits on this part of the tree.
	 */
	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
	    prop, sizeof (count), 1, &count)) == ENOENT)
		return;
	/* Any other lookup failure is unexpected in a sync task. */
	VERIFY0(err);

	count += delta;
	/* Use a signed verify to make sure we're not neg. */
	VERIFY3S(count, >=, 0);

	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
	    tx));

	/* Roll up this additional count into our ancestors */
	if (dd->dd_parent != NULL)
		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
}
858 876
/*
 * Create a new dsl_dir object named "name" under parent "pds" (or as the
 * pool's root dir when pds is NULL): allocate the DMU object, link it into
 * the parent's child-dir ZAP (or the pool directory object for the root),
 * initialize its phys struct, and bump the parent's filesystem count.
 * Returns the object number of the new dir.  Syncing context.
 */
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	/* the phys struct lives in the object's bonus buffer */
	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	if (pds) {
		ddphys->dd_parent_obj = pds->dd_object;

		/* update the filesystem counts */
		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
	}
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}
899 917
900 918 boolean_t
901 919 dsl_dir_is_clone(dsl_dir_t *dd)
902 920 {
903 921 return (dsl_dir_phys(dd)->dd_origin_obj &&
904 922 (dd->dd_pool->dp_origin_snap == NULL ||
905 923 dsl_dir_phys(dd)->dd_origin_obj !=
906 924 dd->dd_pool->dp_origin_snap->ds_object));
907 925 }
908 926
/*
 * Fill "nv" with this dir's space-accounting properties (used, quota,
 * reservation, compressratio, logicalused, and the used-space breakdown
 * when available), the filesystem/snapshot counts when the dir has been
 * zapified, and the origin snapshot name when the dir is a clone.
 */
void
dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
{
	mutex_enter(&dd->dd_lock);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
	    dsl_dir_phys(dd)->dd_used_bytes);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
	    dsl_dir_phys(dd)->dd_quota);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
	    dsl_dir_phys(dd)->dd_reserved);
	/* ratio is reported as a percentage; 100 means 1.00x */
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
	    dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
	    dsl_dir_phys(dd)->dd_compressed_bytes));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
	    dsl_dir_phys(dd)->dd_uncompressed_bytes);
	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
	}
	mutex_exit(&dd->dd_lock);

	if (dsl_dir_is_zapified(dd)) {
		uint64_t count;
		objset_t *os = dd->dd_pool->dp_meta_objset;

		/* counts may be absent if they were never initialized */
		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
		    sizeof (count), 1, &count) == 0) {
			dsl_prop_nvlist_add_uint64(nv,
			    ZFS_PROP_FILESYSTEM_COUNT, count);
		}
		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
		    sizeof (count), 1, &count) == 0) {
			dsl_prop_nvlist_add_uint64(nv,
			    ZFS_PROP_SNAPSHOT_COUNT, count);
		}
	}

	if (dsl_dir_is_clone(dd)) {
		dsl_dataset_t *ds;
		char buf[MAXNAMELEN];

		/* report the name of the origin snapshot */
		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
		dsl_dataset_name(ds, buf);
		dsl_dataset_rele(ds, FTAG);
		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
	}
}
965 983
966 984 void
967 985 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
968 986 {
969 987 dsl_pool_t *dp = dd->dd_pool;
970 988
971 989 ASSERT(dsl_dir_phys(dd));
972 990
973 991 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
974 992 /* up the hold count until we can be written out */
975 993 dmu_buf_add_ref(dd->dd_dbuf, dd);
976 994 }
977 995 }
978 996
979 997 static int64_t
980 998 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
981 999 {
982 1000 uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
983 1001 uint64_t new_accounted =
984 1002 MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
985 1003 return (new_accounted - old_accounted);
986 1004 }
987 1005
/*
 * Syncing-context counterpart of dsl_dir_dirty(): clear this txg's
 * space-to-write estimate and drop the bonus-buffer hold that
 * dsl_dir_dirty() took when the dir was first dirtied in this txg.
 */
void
dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	mutex_enter(&dd->dd_lock);
	/* all tempreserves for this txg must have been cleared by now */
	ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
	mutex_exit(&dd->dd_lock);

	/* release the hold from dsl_dir_dirty */
	dmu_buf_rele(dd->dd_dbuf, dd);
}
1003 1021
1004 1022 static uint64_t
1005 1023 dsl_dir_space_towrite(dsl_dir_t *dd)
1006 1024 {
1007 1025 uint64_t space = 0;
1008 1026 int i;
1009 1027
1010 1028 ASSERT(MUTEX_HELD(&dd->dd_lock));
1011 1029
1012 1030 for (i = 0; i < TXG_SIZE; i++) {
1013 1031 space += dd->dd_space_towrite[i&TXG_MASK];
1014 1032 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
1015 1033 }
1016 1034 return (space);
1017 1035 }
1018 1036
/*
 * How much space would dd have available if ancestor had delta applied
 * to it?  If ondiskonly is set, we're only interested in what's
 * on-disk, not estimated pending changes.
 *
 * Recurses to the root so the result reflects the tightest constraint of
 * dd's own quota, every ancestor's quota, and the pool size.
 */
uint64_t
dsl_dir_space_available(dsl_dir_t *dd,
    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
{
	uint64_t parentspace, myspace, quota, used;

	/*
	 * If there are no restrictions otherwise, assume we have
	 * unlimited space available.
	 */
	quota = UINT64_MAX;
	parentspace = UINT64_MAX;

	if (dd->dd_parent != NULL) {
		parentspace = dsl_dir_space_available(dd->dd_parent,
		    ancestor, delta, ondiskonly);
	}

	mutex_enter(&dd->dd_lock);
	if (dsl_dir_phys(dd)->dd_quota != 0)
		quota = dsl_dir_phys(dd)->dd_quota;
	used = dsl_dir_phys(dd)->dd_used_bytes;
	if (!ondiskonly)
		used += dsl_dir_space_towrite(dd);

	/* at the root, the pool size is the ultimate quota */
	if (dd->dd_parent == NULL) {
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
		quota = MIN(quota, poolsize);
	}

	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
		/*
		 * We have some space reserved, in addition to what our
		 * parent gave us.
		 */
		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
	}

	if (dd == ancestor) {
		/* apply the hypothetical delta at this level */
		ASSERT(delta <= 0);
		ASSERT(used >= -delta);
		used += delta;
		if (parentspace != UINT64_MAX)
			parentspace -= delta;
	}

	if (used > quota) {
		/* over quota */
		myspace = 0;
	} else {
		/*
		 * the lesser of the space provided by our parent and
		 * the space left in our quota
		 */
		myspace = MIN(parentspace, quota - used);
	}

	mutex_exit(&dd->dd_lock);

	return (myspace);
}
1085 1103
/*
 * Bookkeeping node for a temporary space reservation.  One is appended to
 * the caller's tr_list for each dsl_dir charged; a node with tr_ds == NULL
 * records an ARC reservation instead (see dsl_dir_tempreserve_clear()).
 */
struct tempreserve {
	list_node_t tr_node;	/* linkage on the caller's tr_list */
	dsl_dir_t *tr_ds;	/* dir charged, or NULL for the ARC */
	uint64_t tr_size;	/* bytes reserved */
};
1091 1109
/*
 * Reserve "asize" bytes against dd for tx's txg, enforcing quotas.  On
 * success a tempreserve record is appended to tr_list and we recurse to the
 * parent for the portion of the reservation that would be charged there
 * (see parent_delta()).  On failure returns EDQUOT, ENOSPC, or ERESTART
 * (retryable); tr_list is left for the caller to clean up.  "first" is set
 * only on the initial (leaf) call, where the dataset's refquota is checked.
 */
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg = tx->tx_txg;
	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
	uint64_t deferred = 0;
	struct tempreserve *tr;
	int retval = EDQUOT;
	int txgidx = txg & TXG_MASK;
	int i;
	uint64_t ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	est_inflight = dsl_dir_space_towrite(dd);
	for (i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, checkrefquota,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error) {
			mutex_exit(&dd->dd_lock);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
		quota = UINT64_MAX;
	else
		quota = dsl_dir_phys(dd)->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		spa_t *spa = dd->dd_pool->dp_spa;
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
		if (poolsize - deferred < quota) {
			quota = poolsize - deferred;
			retval = ENOSPC;
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (used_on_disk + est_inflight >= quota) {
		if (est_inflight > 0 || used_on_disk < quota ||
		    (retval == ENOSPC && used_on_disk < quota + deferred))
			retval = ERESTART;
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
		    used_on_disk>>10, est_inflight>>10,
		    quota>>10, asize>>10, retval);
		mutex_exit(&dd->dd_lock);
		return (SET_ERROR(retval));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txgidx] += asize;

	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	/* record this reservation so it can be undone later */
	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent && parent_rsrv) {
		boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);

		return (dsl_dir_tempreserve_impl(dd->dd_parent,
		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
	} else {
		return (0);
	}
}
1206 1224
/*
 * Reserve space in this dsl_dir, to be used in this tx's txg.
 * After the space has been dirtied (and dsl_dir_willuse_space()
 * has been called), the reservation should be canceled, using
 * dsl_dir_tempreserve_clear().
 *
 * On success, *tr_cookiep is set to the reservation list to later pass to
 * dsl_dir_tempreserve_clear() (NULL when asize is 0).  On failure any
 * partial reservations are cleared before returning the error.
 */
int
dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
{
	int err;
	list_t *tr_list;

	/* nothing to reserve */
	if (asize == 0) {
		*tr_cookiep = NULL;
		return (0);
	}

	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
	list_create(tr_list, sizeof (struct tempreserve),
	    offsetof(struct tempreserve, tr_node));
	ASSERT3S(asize, >, 0);
	ASSERT3S(fsize, >=, 0);

	/* first reserve against the ARC; recorded with tr_ds == NULL */
	err = arc_tempreserve_space(lsize, tx->tx_txg);
	if (err == 0) {
		struct tempreserve *tr;

		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
		tr->tr_size = lsize;
		list_insert_tail(tr_list, tr);
	} else {
		if (err == EAGAIN) {
			/*
			 * If arc_memory_throttle() detected that pageout
			 * is running and we are low on memory, we delay new
			 * non-pageout transactions to give pageout an
			 * advantage.
			 *
			 * It is unfortunate to be delaying while the caller's
			 * locks are held.
			 */
			txg_delay(dd->dd_pool, tx->tx_txg,
			    MSEC2NSEC(10), MSEC2NSEC(10));
			err = SET_ERROR(ERESTART);
		}
	}

	/* then reserve against this dir and its ancestors */
	if (err == 0) {
		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
		    FALSE, asize > usize, tr_list, tx, TRUE);
	}

	if (err != 0)
		dsl_dir_tempreserve_clear(tr_list, tx);
	else
		*tr_cookiep = tr_list;

	return (err);
}
1267 1285
/*
 * Clear a temporary reservation that we previously made with
 * dsl_dir_tempreserve_space().  Walks the tr_list, undoing each dir's
 * dd_tempreserved charge (or the ARC reservation for tr_ds == NULL
 * entries), then frees the list.  A NULL cookie is a no-op.
 */
void
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
{
	int txgidx = tx->tx_txg & TXG_MASK;
	list_t *tr_list = tr_cookie;
	struct tempreserve *tr;

	ASSERT3U(tx->tx_txg, !=, 0);

	if (tr_cookie == NULL)
		return;

	while ((tr = list_head(tr_list)) != NULL) {
		if (tr->tr_ds) {
			mutex_enter(&tr->tr_ds->dd_lock);
			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
			    tr->tr_size);
			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
			mutex_exit(&tr->tr_ds->dd_lock);
		} else {
			/* tr_ds == NULL marks the ARC reservation */
			arc_tempreserve_clear(tr->tr_size);
		}
		list_remove(tr_list, tr);
		kmem_free(tr, sizeof (struct tempreserve));
	}

	kmem_free(tr_list, sizeof (list_t));
}
1300 1318
/*
 * This should be called from open context when we think we're going to write
 * or free space, for example when dirtying data.  Be conservative; it's okay
 * to write less space or free more, but we don't want to write more or free
 * less than the amount specified.
 *
 * The estimate is added to dd_space_towrite for this txg and the
 * above-reservation portion is propagated to the parent.
 */
void
dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
	int64_t parent_space;
	uint64_t est_used;

	mutex_enter(&dd->dd_lock);
	if (space > 0)
		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;

	est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
	parent_space = parent_delta(dd, est_used, space);
	mutex_exit(&dd->dd_lock);

	/* Make sure that we clean up dd_space_to* */
	dsl_dir_dirty(dd, tx);

	/* XXX this is potentially expensive and unnecessary... */
	if (parent_space && dd->dd_parent)
		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
}
1328 1346
/*
 * Call from syncing context when we actually write/free space for this dd.
 * Applies the (signed) deltas to dd's used/compressed/uncompressed byte
 * counts and the per-type used breakdown, then rolls the parent's share of
 * the change up the tree via DD_USED_CHILD / DD_USED_CHILD_RSRV.
 */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	/*
	 * dsl_dataset_set_refreservation_sync_impl() calls this with
	 * dd_lock held, so that it can atomically update
	 * ds->ds_reserved and the dsl_dir accounting, so that
	 * dsl_dataset_check_quota() can see dataset and dir accounting
	 * consistently.
	 */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	if (needlock)
		mutex_enter(&dd->dd_lock);
	/* compute the parent's share before mutating dd_used_bytes */
	accounted_delta =
	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 ||
	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
	dsl_dir_phys(dd)->dd_used_bytes += used;
	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;

	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used > 0 ||
		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
#ifdef DEBUG
		/* the breakdown must always sum to dd_used_bytes */
		dd_used_t t;
		uint64_t u = 0;
		for (t = 0; t < DD_USED_NUM; t++)
			u += dsl_dir_phys(dd)->dd_used_breakdown[t];
		ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    accounted_delta, compressed, uncompressed, tx);
		/* the reservation-covered remainder moves between types */
		dsl_dir_transfer_space(dd->dd_parent,
		    used - accounted_delta,
		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}
1386 1404
/*
 * Move "delta" bytes of dd's used-space breakdown from oldtype to newtype
 * (e.g. DD_USED_CHILD_RSRV -> DD_USED_CHILD); total dd_used_bytes is
 * unchanged.  No-op when delta is 0 or the pool predates the used-space
 * breakdown.  Syncing context.
 */
void
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(oldtype < DD_USED_NUM);
	ASSERT(newtype < DD_USED_NUM);

	if (delta == 0 ||
	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
		return;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	mutex_enter(&dd->dd_lock);
	/* delta may be negative, moving space in the other direction */
	ASSERT(delta > 0 ?
	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
	mutex_exit(&dd->dd_lock);
}
1409 1427
/* Argument block for the quota/reservation set sync tasks below. */
typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;	/* name of the dataset to modify */
	zprop_source_t ddsqra_source;	/* property source */
	uint64_t ddsqra_value;		/* new quota/reservation value */
} dsl_dir_set_qr_arg_t;
1415 1433
/*
 * Sync-task check function for setting the quota property: predict the
 * effective new value and fail with ENOSPC if it would fall below the
 * dir's reservation or current (plus pending) usage.  A quota of 0
 * (unlimited) is always allowed.
 */
static int
dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t towrite, newval;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	error = dsl_prop_predict(ds->ds_dir, "quota",
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	if (newval == 0) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	mutex_enter(&ds->ds_dir->dd_lock);
	/*
	 * If we are doing the preliminary check in open context, and
	 * there are pending changes, then don't fail it, since the
	 * pending changes could under-estimate the amount of space to be
	 * freed up.
	 */
	towrite = dsl_dir_space_towrite(ds->ds_dir);
	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
		error = SET_ERROR(ENOSPC);
	}
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}
1458 1476
/*
 * Sync-task function for setting the quota property: record the property
 * value (via the prop subsystem on newer pools, or just the history log on
 * pools predating SPA_VERSION_RECVD_PROPS) and apply the effective value
 * to dd_quota.
 */
static void
dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		/* re-read the effective value after inheritance is applied */
		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
	}

	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
	mutex_exit(&ds->ds_dir->dd_lock);
	dsl_dataset_rele(ds, FTAG);
}
1488 1506
1489 1507 int
1490 1508 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1491 1509 {
1492 1510 dsl_dir_set_qr_arg_t ddsqra;
1493 1511
1494 1512 ddsqra.ddsqra_name = ddname;
1495 1513 ddsqra.ddsqra_source = source;
1496 1514 ddsqra.ddsqra_value = quota;
1497 1515
1498 1516 return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1499 1517 dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1500 1518 }
1501 1519
/*
 * Sync-task check function for setting the reservation property: fail with
 * ENOSPC if the increase in accounted space would exceed what the parent
 * (or the pool, at the root) can provide, or if the new reservation would
 * exceed the dir's own quota.  Only meaningful in syncing context; the
 * open-context pass always succeeds.
 */
int
dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	dsl_dir_t *dd;
	uint64_t newval, used, avail;
	int error;

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);
	dd = ds->ds_dir;

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	mutex_exit(&dd->dd_lock);

	if (dd->dd_parent) {
		avail = dsl_dir_space_available(dd->dd_parent,
		    NULL, 0, FALSE);
	} else {
		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
	}

	/* only the above-usage portion of the reservation needs new space */
	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
		uint64_t delta = MAX(used, newval) -
		    MAX(used, dsl_dir_phys(dd)->dd_reserved);

		if (delta > avail ||
		    (dsl_dir_phys(dd)->dd_quota > 0 &&
		    newval > dsl_dir_phys(dd)->dd_quota))
			error = SET_ERROR(ENOSPC);
	}

	dsl_dataset_rele(ds, FTAG);
	return (error);
}
1558 1576
/*
 * Apply a new reservation value to dd and charge the change in accounted
 * space (see parent_delta()) to the ancestors as DD_USED_CHILD_RSRV.
 * Takes dd_lock itself; the parent roll-up runs while dd's lock is held,
 * which dsl_dir_diduse_space() tolerates via its needlock check.
 */
void
dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
{
	uint64_t used;
	int64_t delta;

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	mutex_enter(&dd->dd_lock);
	used = dsl_dir_phys(dd)->dd_used_bytes;
	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
	dsl_dir_phys(dd)->dd_reserved = value;

	if (dd->dd_parent != NULL) {
		/* Roll up this additional usage into our ancestors */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
		    delta, 0, 0, tx);
	}
	mutex_exit(&dd->dd_lock);
}
1579 1597
1580 1598
/*
 * Sync-task function for setting the reservation property: record the
 * property value (via the prop subsystem on newer pools, or just the
 * history log on pools predating SPA_VERSION_RECVD_PROPS) and apply the
 * effective value via dsl_dir_set_reservation_sync_impl().
 */
static void
dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	uint64_t newval;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));

	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
		    &ddsqra->ddsqra_value, tx);

		/* re-read the effective value after inheritance is applied */
		VERIFY0(dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
	} else {
		newval = ddsqra->ddsqra_value;
		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
		    (longlong_t)newval);
	}

	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
	dsl_dataset_rele(ds, FTAG);
}
1609 1627
1610 1628 int
1611 1629 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1612 1630 uint64_t reservation)
1613 1631 {
1614 1632 dsl_dir_set_qr_arg_t ddsqra;
1615 1633
1616 1634 ddsqra.ddsqra_name = ddname;
1617 1635 ddsqra.ddsqra_source = source;
1618 1636 ddsqra.ddsqra_value = reservation;
1619 1637
1620 1638 return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1621 1639 dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1622 1640 }
1623 1641
1624 1642 static dsl_dir_t *
1625 1643 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1626 1644 {
1627 1645 for (; ds1; ds1 = ds1->dd_parent) {
1628 1646 dsl_dir_t *dd;
1629 1647 for (dd = ds2; dd; dd = dd->dd_parent) {
1630 1648 if (ds1 == dd)
1631 1649 return (dd);
1632 1650 }
1633 1651 }
1634 1652 return (NULL);
1635 1653 }
1636 1654
1637 1655 /*
1638 1656 * If delta is applied to dd, how much of that delta would be applied to
1639 1657 * ancestor? Syncing context only.
1640 1658 */
1641 1659 static int64_t
1642 1660 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1643 1661 {
1644 1662 if (dd == ancestor)
1645 1663 return (delta);
1646 1664
1647 1665 mutex_enter(&dd->dd_lock);
1648 1666 delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
1649 1667 mutex_exit(&dd->dd_lock);
1650 1668 return (would_change(dd->dd_parent, delta, ancestor));
1651 1669 }
1652 1670
/* Argument block for the dsl_dir rename sync task. */
typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;	/* current full name */
	const char *ddra_newname;	/* desired full name */
	cred_t *ddra_cred;		/* credentials for limit checks */
} dsl_dir_rename_arg_t;
1658 1676
1659 1677 /* ARGSUSED */
1660 1678 static int
1661 1679 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1662 1680 {
1663 1681 int *deltap = arg;
1664 1682 char namebuf[MAXNAMELEN];
1665 1683
1666 1684 dsl_dataset_name(ds, namebuf);
1667 1685
1668 1686 if (strlen(namebuf) + *deltap >= MAXNAMELEN)
1669 1687 return (SET_ERROR(ENAMETOOLONG));
1670 1688 return (0);
1671 1689 }
1672 1690
/*
 * Check half of the dsl_dir rename sync task (paired with
 * dsl_dir_rename_sync via dsl_sync_task in dsl_dir_rename below).
 * Validates that: the source dir exists; the new parent exists and is in
 * the same pool (ENXIO otherwise); the target name is not already taken
 * (EEXIST); if the name is growing, every child dataset/snapshot name
 * still fits (checked via dmu_objset_find_dp/dsl_valid_rename); the
 * rename is not into our own descendant (EINVAL); and, when changing
 * parents, that space and filesystem/snapshot-limit accounting permit
 * the transfer (dsl_dir_transfer_possible).  Returns 0 on success.
 */
1673 1691 static int
1674 1692 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1675 1693 {
1676 1694 	dsl_dir_rename_arg_t *ddra = arg;
1677 1695 	dsl_pool_t *dp = dmu_tx_pool(tx);
1678 1696 	dsl_dir_t *dd, *newparent;
1679 1697 	const char *mynewname;
1680 1698 	int error;
/* delta > 0 means the fully-qualified name is getting longer */
1681 1699 	int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1682 1700
1683 1701 	/* target dir should exist */
1684 1702 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1685 1703 	if (error != 0)
1686 1704 		return (error);
1687 1705
1688 1706 	/* new parent should exist */
1689 1707 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1690 1708 	    &newparent, &mynewname);
1691 1709 	if (error != 0) {
1692 1710 		dsl_dir_rele(dd, FTAG);
1693 1711 		return (error);
1694 1712 	}
1695 1713
1696 1714 	/* can't rename to different pool */
1697 1715 	if (dd->dd_pool != newparent->dd_pool) {
1698 1716 		dsl_dir_rele(newparent, FTAG);
1699 1717 		dsl_dir_rele(dd, FTAG);
1700 1718 		return (SET_ERROR(ENXIO));
1701 1719 	}
1702 1720
1703 1721 	/* new name should not already exist */
1704 1722 	if (mynewname == NULL) {
1705 1723 		dsl_dir_rele(newparent, FTAG);
1706 1724 		dsl_dir_rele(dd, FTAG);
1707 1725 		return (SET_ERROR(EEXIST));
1708 1726 	}
1709 1727
1710 1728 	/* if the name length is growing, validate child name lengths */
1711 1729 	if (delta > 0) {
1712 1730 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1713 1731 		    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1714 1732 		if (error != 0) {
1715 1733 			dsl_dir_rele(newparent, FTAG);
1716 1734 			dsl_dir_rele(dd, FTAG);
1717 1735 			return (error);
1718 1736 		}
1719 1737 	}
1720 1738
/*
 * Only done in the syncing pass: the initialization below dirties
 * on-disk state, which a (possibly retried) open-context check run
 * must not do.
 */
1721 1739 	if (dmu_tx_is_syncing(tx)) {
1722 1740 		if (spa_feature_is_active(dp->dp_spa,
1723 1741 		    SPA_FEATURE_FS_SS_LIMIT)) {
1724 1742 			/*
1725 1743 			 * Although this is the check function and we don't
1726 1744 			 * normally make on-disk changes in check functions,
1727 1745 			 * we need to do that here.
1728 1746 			 *
1729 1747 			 * Ensure this portion of the tree's counts have been
1730 1748 			 * initialized in case the new parent has limits set.
1731 1749 			 */
1732 1750 			dsl_dir_init_fs_ss_count(dd, tx);
1733 1751 		}
1734 1752 	}
1735 1753
1736 1754 	if (newparent != dd->dd_parent) {
1737 1755 		/* is there enough space? */
1738 1756 		uint64_t myspace =
1739 1757 		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
1740 1758 		    dsl_dir_phys(dd)->dd_reserved);
1741 1759 		objset_t *os = dd->dd_pool->dp_meta_objset;
1742 1760 		uint64_t fs_cnt = 0;
1743 1761 		uint64_t ss_cnt = 0;
1744 1762
/*
 * The fs/snapshot counts only exist once the dir has been zapified;
 * ENOENT from zap_lookup is tolerated (counts stay 0).
 */
1745 1763 		if (dsl_dir_is_zapified(dd)) {
1746 1764 			int err;
1747 1765
1748 1766 			err = zap_lookup(os, dd->dd_object,
1749 1767 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1750 1768 			    &fs_cnt);
1751 1769 			if (err != ENOENT && err != 0) {
1752 1770 				dsl_dir_rele(newparent, FTAG);
1753 1771 				dsl_dir_rele(dd, FTAG);
1754 1772 				return (err);
1755 1773 			}
1756 1774
1757 1775 			/*
1758 1776 			 * have to add 1 for the filesystem itself that we're
1759 1777 			 * moving
1760 1778 			 */
1761 1779 			fs_cnt++;
1762 1780
1763 1781 			err = zap_lookup(os, dd->dd_object,
1764 1782 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1765 1783 			    &ss_cnt);
1766 1784 			if (err != ENOENT && err != 0) {
1767 1785 				dsl_dir_rele(newparent, FTAG);
1768 1786 				dsl_dir_rele(dd, FTAG);
1769 1787 				return (err);
1770 1788 			}
1771 1789 		}
1772 1790
1773 1791 		/* no rename into our descendant */
1774 1792 		if (closest_common_ancestor(dd, newparent) == dd) {
1775 1793 			dsl_dir_rele(newparent, FTAG);
1776 1794 			dsl_dir_rele(dd, FTAG);
1777 1795 			return (SET_ERROR(EINVAL));
1778 1796 		}
1779 1797
1780 1798 		error = dsl_dir_transfer_possible(dd->dd_parent,
1781 1799 		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
1782 1800 		if (error != 0) {
1783 1801 			dsl_dir_rele(newparent, FTAG);
1784 1802 			dsl_dir_rele(dd, FTAG);
1785 1803 			return (error);
1786 1804 		}
1787 1805 	}
1788 1806
1789 1807 	dsl_dir_rele(newparent, FTAG);
1790 1808 	dsl_dir_rele(dd, FTAG);
1791 1809 	return (0);
1792 1810 }
1793 1811
/*
 * Sync half of the dsl_dir rename sync task.  The check function has
 * already validated everything, so failures here are programming errors
 * (hence VERIFY0/ASSERT0 throughout).  When the parent changes, this
 * migrates fs/snapshot-limit counts, used space, and unused reservation
 * from the old parent chain to the new one, then moves the child ZAP
 * entry and re-parents the in-core dsl_dir.
 */
1794 1812 static void
1795 1813 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
1796 1814 {
1797 1815 	dsl_dir_rename_arg_t *ddra = arg;
1798 1816 	dsl_pool_t *dp = dmu_tx_pool(tx);
1799 1817 	dsl_dir_t *dd, *newparent;
1800 1818 	const char *mynewname;
1801 1819 	int error;
1802 1820 	objset_t *mos = dp->dp_meta_objset;
1803 1821
1804 1822 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
1805 1823 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
1806 1824 	    &mynewname));
1807 1825
1808 1826 	/* Log this before we change the name. */
1809 1827 	spa_history_log_internal_dd(dd, "rename", tx,
1810 1828 	    "-> %s", ddra->ddra_newname);
1811 1829
1812 1830 	if (newparent != dd->dd_parent) {
1813 1831 		objset_t *os = dd->dd_pool->dp_meta_objset;
1814 1832 		uint64_t fs_cnt = 0;
1815 1833 		uint64_t ss_cnt = 0;
1816 1834
1817 1835 		/*
1818 1836 		 * We already made sure the dd counts were initialized in the
1819 1837 		 * check function.
1820 1838 		 */
1821 1839 		if (spa_feature_is_active(dp->dp_spa,
1822 1840 		    SPA_FEATURE_FS_SS_LIMIT)) {
1823 1841 			VERIFY0(zap_lookup(os, dd->dd_object,
1824 1842 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1825 1843 			    &fs_cnt));
1826 1844 			/* add 1 for the filesystem itself that we're moving */
1827 1845 			fs_cnt++;
1828 1846
1829 1847 			VERIFY0(zap_lookup(os, dd->dd_object,
1830 1848 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1831 1849 			    &ss_cnt));
1832 1850 		}
1833 1851
/* move fs/snapshot counts from the old ancestor chain to the new one */
1834 1852 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
1835 1853 		    DD_FIELD_FILESYSTEM_COUNT, tx);
1836 1854 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
1837 1855 		    DD_FIELD_FILESYSTEM_COUNT, tx);
1838 1856
1839 1857 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
1840 1858 		    DD_FIELD_SNAPSHOT_COUNT, tx);
1841 1859 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
1842 1860 		    DD_FIELD_SNAPSHOT_COUNT, tx);
1843 1861
/* move the space accounting for this subtree likewise */
1844 1862 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1845 1863 		    -dsl_dir_phys(dd)->dd_used_bytes,
1846 1864 		    -dsl_dir_phys(dd)->dd_compressed_bytes,
1847 1865 		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
1848 1866 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
1849 1867 		    dsl_dir_phys(dd)->dd_used_bytes,
1850 1868 		    dsl_dir_phys(dd)->dd_compressed_bytes,
1851 1869 		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
1852 1870
/* any reservation beyond actual usage is also charged to the parent */
1853 1871 		if (dsl_dir_phys(dd)->dd_reserved >
1854 1872 		    dsl_dir_phys(dd)->dd_used_bytes) {
1855 1873 			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
1856 1874 			    dsl_dir_phys(dd)->dd_used_bytes;
1857 1875
1858 1876 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1859 1877 			    -unused_rsrv, 0, 0, tx);
1860 1878 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
1861 1879 			    unused_rsrv, 0, 0, tx);
1862 1880 		}
1863 1881 	}
1864 1882
1865 1883 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1866 1884
1867 1885 	/* remove from old parent zapobj */
1868 1886 	error = zap_remove(mos,
1869 1887 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
1870 1888 	    dd->dd_myname, tx);
1871 1889 	ASSERT0(error);
1872 1890
/* rename in-core, drop the hold on the old parent, re-hold the new one */
1873 1891 	(void) strcpy(dd->dd_myname, mynewname);
1874 1892 	dsl_dir_rele(dd->dd_parent, dd);
1875 1893 	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
1876 1894 	VERIFY0(dsl_dir_hold_obj(dp,
1877 1895 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
1878 1896
1879 1897 	/* add to new parent zapobj */
1880 1898 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
1881 1899 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
1882 1900
/* re-propagate inherited properties now that the ancestry changed */
1883 1901 	dsl_prop_notify_all(dd);
1884 1902
1885 1903 	dsl_dir_rele(newparent, FTAG);
1886 1904 	dsl_dir_rele(dd, FTAG);
1887 1905 }
1888 1906
/*
 * Public entry point: rename the dsl_dir at 'oldname' to 'newname' by
 * running dsl_dir_rename_check/dsl_dir_rename_sync as a synctask.
 * The caller's credentials are captured here (CRED()) so the check
 * function can enforce fs/snapshot limits against the right user.
 * Returns 0 or an errno from the check function.
 */
1889 1907 int
1890 1908 dsl_dir_rename(const char *oldname, const char *newname)
1891 1909 {
1892 1910 	dsl_dir_rename_arg_t ddra;
1893 1911
1894 1912 	ddra.ddra_oldname = oldname;
1895 1913 	ddra.ddra_newname = newname;
1896 1914 	ddra.ddra_cred = CRED();
1897 1915
1898 1916 	return (dsl_sync_task(oldname,
1899 1917 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
1900 1918 	    3, ZFS_SPACE_CHECK_RESERVED));
1901 1919 }
1902 1920
/*
 * Determine whether moving 'space' bytes plus fs_cnt filesystems and
 * ss_cnt snapshots from source dir 'sdd' to target dir 'tdd' is
 * permitted.  Space is evaluated at the closest common ancestor (the
 * point where the transfer actually changes accounting); returns ENOSPC
 * if the target side lacks room, otherwise any error from the
 * filesystem/snapshot limit checks (evaluated for credential 'cr'),
 * or 0 if the transfer may proceed.
 */
1903 1921 int
1904 1922 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
1905 1923     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
1906 1924 {
1907 1925 	dsl_dir_t *ancestor;
1908 1926 	int64_t adelta;
1909 1927 	uint64_t avail;
1910 1928 	int err;
1911 1929
1912 1930 	ancestor = closest_common_ancestor(sdd, tdd);
/* how much the source side frees up, as seen from the ancestor */
1913 1931 	adelta = would_change(sdd, -space, ancestor);
1914 1932 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1915 1933 	if (avail < space)
1916 1934 		return (SET_ERROR(ENOSPC));
1917 1935
1918 1936 	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
1919 1937 	    ancestor, cr);
1920 1938 	if (err != 0)
1921 1939 		return (err);
1922 1940 	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
1923 1941 	    ancestor, cr);
1924 1942 	if (err != 0)
1925 1943 		return (err);
1926 1944
1927 1945 	return (0);
1928 1946 }
1929 1947
/*
 * Return a consistent snapshot of dd_snap_cmtime (the snapshot-dir
 * change time); dd_lock guards the read so a concurrent update
 * cannot be observed half-written.
 */
1930 1948 timestruc_t
1931 1949 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1932 1950 {
1933 1951 	timestruc_t t;
1934 1952
1935 1953 	mutex_enter(&dd->dd_lock);
1936 1954 	t = dd->dd_snap_cmtime;
1937 1955 	mutex_exit(&dd->dd_lock);
1938 1956
1939 1957 	return (t);
1940 1958 }
1941 1959
/*
 * Stamp dd_snap_cmtime with the current wall-clock time, under
 * dd_lock to pair with the locked read in dsl_dir_snap_cmtime().
 */
1942 1960 void
1943 1961 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1944 1962 {
1945 1963 	timestruc_t t;
1946 1964
1947 1965 	gethrestime(&t);
1948 1966 	mutex_enter(&dd->dd_lock);
1949 1967 	dd->dd_snap_cmtime = t;
1950 1968 	mutex_exit(&dd->dd_lock);
1951 1969 }
1952 1970
/*
 * Convert this dsl_dir's MOS object into a ZAP (type DMU_OT_DSL_DIR)
 * so per-dir extra attributes (e.g. fs/snapshot counts) can be stored.
 */
1953 1971 void
1954 1972 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
1955 1973 {
1956 1974 	objset_t *mos = dd->dd_pool->dp_meta_objset;
1957 1975 	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
1958 1976 }
1959 1977
/*
 * Report whether dsl_dir_zapify() has been applied to this dir, by
 * inspecting the object type recorded in its dbuf's object info.
 */
1960 1978 boolean_t
1961 1979 dsl_dir_is_zapified(dsl_dir_t *dd)
1962 1980 {
1963 1981 	dmu_object_info_t doi;
1964 1982
1965 1983 	dmu_object_info_from_db(dd->dd_dbuf, &doi);
1966 1984 	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
1967 1985 }
|
↓ open down ↓ |
1518 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX