2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
--- old/usr/src/uts/common/fs/zfs/spa_misc.c
+++ new/usr/src/uts/common/fs/zfs/spa_misc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright (c) 2011 by Delphix. All rights reserved.
23 + * Copyright (c) 2012 by Delphix. All rights reserved.
24 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 25 */
26 26
27 27 #include <sys/zfs_context.h>
28 28 #include <sys/spa_impl.h>
29 29 #include <sys/zio.h>
30 30 #include <sys/zio_checksum.h>
31 31 #include <sys/zio_compress.h>
32 32 #include <sys/dmu.h>
33 33 #include <sys/dmu_tx.h>
34 34 #include <sys/zap.h>
35 35 #include <sys/zil.h>
36 36 #include <sys/vdev_impl.h>
37 37 #include <sys/metaslab.h>
38 38 #include <sys/uberblock_impl.h>
39 39 #include <sys/txg.h>
40 40 #include <sys/avl.h>
41 41 #include <sys/unique.h>
42 42 #include <sys/dsl_pool.h>
43 43 #include <sys/dsl_dir.h>
44 44 #include <sys/dsl_prop.h>
45 45 #include <sys/dsl_scan.h>
46 46 #include <sys/fs/zfs.h>
47 47 #include <sys/metaslab_impl.h>
48 48 #include <sys/arc.h>
49 49 #include <sys/ddt.h>
50 50 #include "zfs_prop.h"
51 +#include "zfeature_common.h"
51 52
52 53 /*
53 54 * SPA locking
54 55 *
55 56 * There are four basic locks for managing spa_t structures:
56 57 *
57 58 * spa_namespace_lock (global mutex)
58 59 *
59 60 * This lock must be acquired to do any of the following:
60 61 *
61 62 * - Lookup a spa_t by name
62 63 * - Add or remove a spa_t from the namespace
63 64 * - Increase spa_refcount from non-zero
64 65 * - Check if spa_refcount is zero
65 66 * - Rename a spa_t
66 67 * - add/remove/attach/detach devices
67 68 * - Held for the duration of create/destroy/import/export
68 69 *
69 70 * It does not need to handle recursion. A create or destroy may
70 71 * reference objects (files or zvols) in other pools, but by
71 72 * definition they must have an existing reference, and will never need
72 73 * to lookup a spa_t by name.
73 74 *
74 75 * spa_refcount (per-spa refcount_t protected by mutex)
75 76 *
 76  77  * This reference count keeps track of any active users of the spa_t.  The
77 78 * spa_t cannot be destroyed or freed while this is non-zero. Internally,
78 79 * the refcount is never really 'zero' - opening a pool implicitly keeps
79 80 * some references in the DMU. Internally we check against spa_minref, but
80 81 * present the image of a zero/non-zero value to consumers.
81 82 *
82 83 * spa_config_lock[] (per-spa array of rwlocks)
83 84 *
84 85 * This protects the spa_t from config changes, and must be held in
85 86 * the following circumstances:
86 87 *
87 88 * - RW_READER to perform I/O to the spa
88 89 * - RW_WRITER to change the vdev config
89 90 *
90 91 * The locking order is fairly straightforward:
91 92 *
92 93 * spa_namespace_lock -> spa_refcount
93 94 *
94 95 * The namespace lock must be acquired to increase the refcount from 0
95 96 * or to check if it is zero.
96 97 *
97 98 * spa_refcount -> spa_config_lock[]
98 99 *
99 100 * There must be at least one valid reference on the spa_t to acquire
100 101 * the config lock.
101 102 *
102 103 * spa_namespace_lock -> spa_config_lock[]
103 104 *
104 105 * The namespace lock must always be taken before the config lock.
105 106 *
106 107 *
107 108 * The spa_namespace_lock can be acquired directly and is globally visible.
108 109 *
109 110 * The namespace is manipulated using the following functions, all of which
110 111 * require the spa_namespace_lock to be held.
111 112 *
112 113 * spa_lookup() Lookup a spa_t by name.
113 114 *
114 115 * spa_add() Create a new spa_t in the namespace.
115 116 *
116 117 * spa_remove() Remove a spa_t from the namespace. This also
117 118 * frees up any memory associated with the spa_t.
118 119 *
119 120 * spa_next() Returns the next spa_t in the system, or the
120 121 * first if NULL is passed.
121 122 *
122 123 * spa_evict_all() Shutdown and remove all spa_t structures in
123 124 * the system.
124 125 *
125 126 * spa_guid_exists() Determine whether a pool/device guid exists.
126 127 *
127 128 * The spa_refcount is manipulated using the following functions:
128 129 *
129 130 * spa_open_ref() Adds a reference to the given spa_t. Must be
130 131 * called with spa_namespace_lock held if the
131 132 * refcount is currently zero.
132 133 *
133 134 * spa_close() Remove a reference from the spa_t. This will
134 135 * not free the spa_t or remove it from the
135 136 * namespace. No locking is required.
136 137 *
137 138 * spa_refcount_zero() Returns true if the refcount is currently
138 139 * zero. Must be called with spa_namespace_lock
139 140 * held.
140 141 *
141 142 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
142 143 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
143 144 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
144 145 *
145 146 * To read the configuration, it suffices to hold one of these locks as reader.
146 147 * To modify the configuration, you must hold all locks as writer. To modify
147 148 * vdev state without altering the vdev tree's topology (e.g. online/offline),
148 149 * you must hold SCL_STATE and SCL_ZIO as writer.
149 150 *
150 151 * We use these distinct config locks to avoid recursive lock entry.
151 152 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
152 153 * block allocations (SCL_ALLOC), which may require reading space maps
153 154 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
154 155 *
155 156 * The spa config locks cannot be normal rwlocks because we need the
156 157 * ability to hand off ownership. For example, SCL_ZIO is acquired
157 158 * by the issuing thread and later released by an interrupt thread.
158 159 * They do, however, obey the usual write-wanted semantics to prevent
159 160 * writer (i.e. system administrator) starvation.
160 161 *
161 162 * The lock acquisition rules are as follows:
162 163 *
163 164 * SCL_CONFIG
164 165 * Protects changes to the vdev tree topology, such as vdev
165 166 * add/remove/attach/detach. Protects the dirty config list
166 167 * (spa_config_dirty_list) and the set of spares and l2arc devices.
167 168 *
168 169 * SCL_STATE
169 170 * Protects changes to pool state and vdev state, such as vdev
170 171 * online/offline/fault/degrade/clear. Protects the dirty state list
171 172 * (spa_state_dirty_list) and global pool state (spa_state).
172 173 *
173 174 * SCL_ALLOC
174 175 * Protects changes to metaslab groups and classes.
175 176 * Held as reader by metaslab_alloc() and metaslab_claim().
176 177 *
177 178 * SCL_ZIO
178 179 * Held by bp-level zios (those which have no io_vd upon entry)
179 180 * to prevent changes to the vdev tree. The bp-level zio implicitly
180 181 * protects all of its vdev child zios, which do not hold SCL_ZIO.
181 182 *
182 183 * SCL_FREE
183 184 * Protects changes to metaslab groups and classes.
184 185 * Held as reader by metaslab_free(). SCL_FREE is distinct from
185 186 * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
186 187 * blocks in zio_done() while another i/o that holds either
187 188 * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
188 189 *
189 190 * SCL_VDEV
190 191 * Held as reader to prevent changes to the vdev tree during trivial
191 192 * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
192 193 * other locks, and lower than all of them, to ensure that it's safe
193 194 * to acquire regardless of caller context.
194 195 *
195 196 * In addition, the following rules apply:
196 197 *
197 198 * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
198 199 * The lock ordering is SCL_CONFIG > spa_props_lock.
199 200 *
200 201 * (b) I/O operations on leaf vdevs. For any zio operation that takes
201 202 * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 202  203  * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
 203  204  * or zio_write_phys() -- the caller must ensure that the config cannot
204 205 * SCL_STATE as reader suffices for both.
205 206 *
206 207 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
207 208 *
208 209 * spa_vdev_enter() Acquire the namespace lock and the config lock
209 210 * for writing.
210 211 *
211 212 * spa_vdev_exit() Release the config lock, wait for all I/O
212 213 * to complete, sync the updated configs to the
213 214 * cache, and release the namespace lock.
214 215 *
215 216 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
216 217 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
217 218 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
218 219 *
219 - * spa_rename() is also implemented within this file since is requires
220 + * spa_rename() is also implemented within this file since it requires
220 221 * manipulation of the namespace.
221 222 */
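
A minimal sketch of the ordering documented above (spa_namespace_lock -> spa_refcount -> spa_config_lock[]); the helper name and error handling are hypothetical, but the call sequence follows the rules in this comment:

	static void
	example_inspect_config(char *poolname)
	{
		spa_t *spa;

		mutex_enter(&spa_namespace_lock);	/* required to look up by name */
		if ((spa = spa_lookup(poolname)) == NULL) {
			mutex_exit(&spa_namespace_lock);
			return;
		}
		spa_open_ref(spa, FTAG);		/* refcount may be going 0 -> 1 */
		mutex_exit(&spa_namespace_lock);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		/* ... read-only walk of the vdev tree or pool config ... */
		spa_config_exit(spa, SCL_CONFIG, FTAG);

		spa_close(spa, FTAG);			/* no lock needed to drop a ref */
	}
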
222 223
223 224 static avl_tree_t spa_namespace_avl;
224 225 kmutex_t spa_namespace_lock;
225 226 static kcondvar_t spa_namespace_cv;
226 227 static int spa_active_count;
227 228 int spa_max_replication_override = SPA_DVAS_PER_BP;
228 229
229 230 static kmutex_t spa_spare_lock;
230 231 static avl_tree_t spa_spare_avl;
231 232 static kmutex_t spa_l2cache_lock;
232 233 static avl_tree_t spa_l2cache_avl;
233 234
234 235 kmem_cache_t *spa_buffer_pool;
235 236 int spa_mode_global;
236 237
237 238 #ifdef ZFS_DEBUG
238 239 /* Everything except dprintf is on by default in debug builds */
239 240 int zfs_flags = ~ZFS_DEBUG_DPRINTF;
240 241 #else
241 242 int zfs_flags = 0;
242 243 #endif
243 244
244 245 /*
245 246 * zfs_recover can be set to nonzero to attempt to recover from
246 247 * otherwise-fatal errors, typically caused by on-disk corruption. When
247 248 * set, calls to zfs_panic_recover() will turn into warning messages.
248 249 */
249 250 int zfs_recover = 0;
250 251
251 252
252 253 /*
253 254 * ==========================================================================
254 255 * SPA config locking
255 256 * ==========================================================================
256 257 */
257 258 static void
258 259 spa_config_lock_init(spa_t *spa)
259 260 {
260 261 for (int i = 0; i < SCL_LOCKS; i++) {
261 262 spa_config_lock_t *scl = &spa->spa_config_lock[i];
262 263 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
263 264 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
264 265 refcount_create(&scl->scl_count);
265 266 scl->scl_writer = NULL;
266 267 scl->scl_write_wanted = 0;
267 268 }
268 269 }
269 270
270 271 static void
271 272 spa_config_lock_destroy(spa_t *spa)
272 273 {
273 274 for (int i = 0; i < SCL_LOCKS; i++) {
274 275 spa_config_lock_t *scl = &spa->spa_config_lock[i];
275 276 mutex_destroy(&scl->scl_lock);
276 277 cv_destroy(&scl->scl_cv);
277 278 refcount_destroy(&scl->scl_count);
278 279 ASSERT(scl->scl_writer == NULL);
279 280 ASSERT(scl->scl_write_wanted == 0);
280 281 }
281 282 }
282 283
283 284 int
284 285 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
285 286 {
286 287 for (int i = 0; i < SCL_LOCKS; i++) {
287 288 spa_config_lock_t *scl = &spa->spa_config_lock[i];
288 289 if (!(locks & (1 << i)))
289 290 continue;
290 291 mutex_enter(&scl->scl_lock);
291 292 if (rw == RW_READER) {
292 293 if (scl->scl_writer || scl->scl_write_wanted) {
293 294 mutex_exit(&scl->scl_lock);
294 295 spa_config_exit(spa, locks ^ (1 << i), tag);
295 296 return (0);
296 297 }
297 298 } else {
298 299 ASSERT(scl->scl_writer != curthread);
299 300 if (!refcount_is_zero(&scl->scl_count)) {
300 301 mutex_exit(&scl->scl_lock);
301 302 spa_config_exit(spa, locks ^ (1 << i), tag);
302 303 return (0);
303 304 }
304 305 scl->scl_writer = curthread;
305 306 }
306 307 (void) refcount_add(&scl->scl_count, tag);
307 308 mutex_exit(&scl->scl_lock);
308 309 }
309 310 return (1);
310 311 }
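
Note that the non-blocking variant above backs out any of the requested locks it had already taken before returning 0, so a failed try leaves nothing held. A hedged usage sketch (the surrounding policy is hypothetical):

	if (spa_config_tryenter(spa, SCL_VDEV, FTAG, RW_READER)) {
		/* ... cheap, read-only inspection of the vdev tree ... */
		spa_config_exit(spa, SCL_VDEV, FTAG);
	} else {
		/* Contended; nothing is held here, so defer or retry later. */
	}
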
311 312
312 313 void
313 314 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
314 315 {
315 316 int wlocks_held = 0;
316 317
317 318 for (int i = 0; i < SCL_LOCKS; i++) {
318 319 spa_config_lock_t *scl = &spa->spa_config_lock[i];
319 320 if (scl->scl_writer == curthread)
320 321 wlocks_held |= (1 << i);
321 322 if (!(locks & (1 << i)))
322 323 continue;
323 324 mutex_enter(&scl->scl_lock);
324 325 if (rw == RW_READER) {
325 326 while (scl->scl_writer || scl->scl_write_wanted) {
326 327 cv_wait(&scl->scl_cv, &scl->scl_lock);
327 328 }
328 329 } else {
329 330 ASSERT(scl->scl_writer != curthread);
330 331 while (!refcount_is_zero(&scl->scl_count)) {
331 332 scl->scl_write_wanted++;
332 333 cv_wait(&scl->scl_cv, &scl->scl_lock);
333 334 scl->scl_write_wanted--;
334 335 }
335 336 scl->scl_writer = curthread;
336 337 }
337 338 (void) refcount_add(&scl->scl_count, tag);
338 339 mutex_exit(&scl->scl_lock);
339 340 }
340 341 ASSERT(wlocks_held <= locks);
341 342 }
342 343
343 344 void
344 345 spa_config_exit(spa_t *spa, int locks, void *tag)
345 346 {
346 347 for (int i = SCL_LOCKS - 1; i >= 0; i--) {
347 348 spa_config_lock_t *scl = &spa->spa_config_lock[i];
348 349 if (!(locks & (1 << i)))
349 350 continue;
350 351 mutex_enter(&scl->scl_lock);
351 352 ASSERT(!refcount_is_zero(&scl->scl_count));
352 353 if (refcount_remove(&scl->scl_count, tag) == 0) {
353 354 ASSERT(scl->scl_writer == NULL ||
354 355 scl->scl_writer == curthread);
355 356 scl->scl_writer = NULL; /* OK in either case */
356 357 cv_broadcast(&scl->scl_cv);
357 358 }
358 359 mutex_exit(&scl->scl_lock);
359 360 }
360 361 }
361 362
362 363 int
363 364 spa_config_held(spa_t *spa, int locks, krw_t rw)
364 365 {
365 366 int locks_held = 0;
366 367
367 368 for (int i = 0; i < SCL_LOCKS; i++) {
368 369 spa_config_lock_t *scl = &spa->spa_config_lock[i];
369 370 if (!(locks & (1 << i)))
370 371 continue;
371 372 if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
372 373 (rw == RW_WRITER && scl->scl_writer == curthread))
373 374 locks_held |= 1 << i;
374 375 }
375 376
376 377 return (locks_held);
377 378 }
378 379
379 380 /*
380 381 * ==========================================================================
381 382 * SPA namespace functions
382 383 * ==========================================================================
383 384 */
384 385
385 386 /*
386 387 * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
387 388 * Returns NULL if no matching spa_t is found.
388 389 */
389 390 spa_t *
390 391 spa_lookup(const char *name)
391 392 {
392 393 static spa_t search; /* spa_t is large; don't allocate on stack */
393 394 spa_t *spa;
394 395 avl_index_t where;
395 396 char c;
396 397 char *cp;
397 398
398 399 ASSERT(MUTEX_HELD(&spa_namespace_lock));
399 400
400 401 /*
401 402 * If it's a full dataset name, figure out the pool name and
402 403 * just use that.
403 404 */
404 405 cp = strpbrk(name, "/@");
405 406 if (cp) {
406 407 c = *cp;
407 408 *cp = '\0';
408 409 }
409 410
410 411 (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
411 412 spa = avl_find(&spa_namespace_avl, &search, &where);
412 413
413 414 if (cp)
414 415 *cp = c;
415 416
416 417 return (spa);
417 418 }
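
Because the '\0' is poked into the caller's buffer and then restored, the name passed in must be writable. A small hypothetical usage:

	char name[] = "tank/home@yesterday";	/* hypothetical dataset name */
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup(name);			/* resolves to the pool "tank" */
	mutex_exit(&spa_namespace_lock);
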
418 419
419 420 /*
420 421 * Create an uninitialized spa_t with the given name. Requires
421 422 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
422 423 * exist by calling spa_lookup() first.
423 424 */
424 425 spa_t *
425 426 spa_add(const char *name, nvlist_t *config, const char *altroot)
426 427 {
427 428 spa_t *spa;
428 429 spa_config_dirent_t *dp;
429 430
430 431 ASSERT(MUTEX_HELD(&spa_namespace_lock));
431 432
432 433 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
433 434
434 435 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
435 436 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
436 437 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
437 438 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
438 439 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
439 440 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
440 441 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
441 442 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
442 443 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
443 444
444 445 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
445 446 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
446 447 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
447 448 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
448 449
449 450 for (int t = 0; t < TXG_SIZE; t++)
450 451 bplist_create(&spa->spa_free_bplist[t]);
451 452
452 453 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
453 454 spa->spa_state = POOL_STATE_UNINITIALIZED;
454 455 spa->spa_freeze_txg = UINT64_MAX;
455 456 spa->spa_final_txg = UINT64_MAX;
456 457 spa->spa_load_max_txg = UINT64_MAX;
457 458 spa->spa_proc = &p0;
458 459 spa->spa_proc_state = SPA_PROC_NONE;
459 460
460 461 refcount_create(&spa->spa_refcount);
461 462 spa_config_lock_init(spa);
462 463
463 464 avl_add(&spa_namespace_avl, spa);
464 465
465 466 /*
466 467 * Set the alternate root, if there is one.
467 468 */
468 469 if (altroot) {
469 470 spa->spa_root = spa_strdup(altroot);
470 471 spa_active_count++;
471 472 }
472 473
473 474 /*
474 475 * Every pool starts with the default cachefile
475 476 */
476 477 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
477 478 offsetof(spa_config_dirent_t, scd_link));
478 479
479 480 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
480 481 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
481 482 list_insert_head(&spa->spa_config_list, dp);
482 483
483 484 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
484 485 KM_SLEEP) == 0);
485 486
486 - if (config != NULL)
487 + if (config != NULL) {
488 + nvlist_t *features;
489 +
490 + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
491 + &features) == 0) {
492 + VERIFY(nvlist_dup(features, &spa->spa_label_features,
493 + 0) == 0);
494 + }
495 +
487 496 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
497 + }
488 498
499 + if (spa->spa_label_features == NULL) {
500 + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
501 + KM_SLEEP) == 0);
502 + }
503 +
489 504 return (spa);
490 505 }
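
For context on the new ZPOOL_CONFIG_FEATURES_FOR_READ handling above, a hedged sketch of a config nvlist carrying such a list; the feature name is invented, and real configs come from the label/import path rather than being built by hand:

	nvlist_t *config, *features;
	spa_t *spa;

	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	/* Hypothetical read-incompatible feature advertised by the label. */
	VERIFY(nvlist_add_boolean(features, "com.example:some_feature") == 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    features) == 0);

	/* spa_namespace_lock must be held; spa_add() duplicates the list. */
	spa = spa_add("tank", config, NULL);
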
491 506
492 507 /*
493 508 * Removes a spa_t from the namespace, freeing up any memory used. Requires
494 509 * spa_namespace_lock. This is called only after the spa_t has been closed and
495 510 * deactivated.
496 511 */
497 512 void
498 513 spa_remove(spa_t *spa)
499 514 {
500 515 spa_config_dirent_t *dp;
501 516
502 517 ASSERT(MUTEX_HELD(&spa_namespace_lock));
503 518 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
504 519
505 520 nvlist_free(spa->spa_config_splitting);
506 521
507 522 avl_remove(&spa_namespace_avl, spa);
508 523 cv_broadcast(&spa_namespace_cv);
509 524
510 525 if (spa->spa_root) {
511 526 spa_strfree(spa->spa_root);
512 527 spa_active_count--;
513 528 }
514 529
515 530 while ((dp = list_head(&spa->spa_config_list)) != NULL) {
516 531 list_remove(&spa->spa_config_list, dp);
517 532 if (dp->scd_path != NULL)
518 533 spa_strfree(dp->scd_path);
519 534 kmem_free(dp, sizeof (spa_config_dirent_t));
520 535 }
521 536
522 537 list_destroy(&spa->spa_config_list);
523 538
539 + nvlist_free(spa->spa_label_features);
524 540 nvlist_free(spa->spa_load_info);
525 541 spa_config_set(spa, NULL);
526 542
527 543 refcount_destroy(&spa->spa_refcount);
528 544
529 545 spa_config_lock_destroy(spa);
530 546
531 547 for (int t = 0; t < TXG_SIZE; t++)
532 548 bplist_destroy(&spa->spa_free_bplist[t]);
533 549
534 550 cv_destroy(&spa->spa_async_cv);
535 551 cv_destroy(&spa->spa_proc_cv);
536 552 cv_destroy(&spa->spa_scrub_io_cv);
537 553 cv_destroy(&spa->spa_suspend_cv);
538 554
539 555 mutex_destroy(&spa->spa_async_lock);
540 556 mutex_destroy(&spa->spa_errlist_lock);
541 557 mutex_destroy(&spa->spa_errlog_lock);
542 558 mutex_destroy(&spa->spa_history_lock);
543 559 mutex_destroy(&spa->spa_proc_lock);
544 560 mutex_destroy(&spa->spa_props_lock);
545 561 mutex_destroy(&spa->spa_scrub_lock);
546 562 mutex_destroy(&spa->spa_suspend_lock);
547 563 mutex_destroy(&spa->spa_vdev_top_lock);
548 564
549 565 kmem_free(spa, sizeof (spa_t));
550 566 }
551 567
552 568 /*
553 569 * Given a pool, return the next pool in the namespace, or NULL if there is
554 570 * none. If 'prev' is NULL, return the first pool.
555 571 */
556 572 spa_t *
557 573 spa_next(spa_t *prev)
558 574 {
559 575 ASSERT(MUTEX_HELD(&spa_namespace_lock));
560 576
561 577 if (prev)
562 578 return (AVL_NEXT(&spa_namespace_avl, prev));
563 579 else
564 580 return (avl_first(&spa_namespace_avl));
565 581 }
566 582
567 583 /*
568 584 * ==========================================================================
569 585 * SPA refcount functions
570 586 * ==========================================================================
571 587 */
572 588
573 589 /*
574 590 * Add a reference to the given spa_t. Must have at least one reference, or
575 591 * have the namespace lock held.
576 592 */
577 593 void
578 594 spa_open_ref(spa_t *spa, void *tag)
579 595 {
580 596 ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
581 597 MUTEX_HELD(&spa_namespace_lock));
582 598 (void) refcount_add(&spa->spa_refcount, tag);
583 599 }
584 600
585 601 /*
586 602 * Remove a reference to the given spa_t. Must have at least one reference, or
587 603 * have the namespace lock held.
588 604 */
589 605 void
590 606 spa_close(spa_t *spa, void *tag)
591 607 {
592 608 ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
593 609 MUTEX_HELD(&spa_namespace_lock));
594 610 (void) refcount_remove(&spa->spa_refcount, tag);
595 611 }
596 612
597 613 /*
598 614 * Check to see if the spa refcount is zero. Must be called with
599 615 * spa_namespace_lock held. We really compare against spa_minref, which is the
600 616 * number of references acquired when opening a pool
601 617 */
602 618 boolean_t
603 619 spa_refcount_zero(spa_t *spa)
604 620 {
605 621 ASSERT(MUTEX_HELD(&spa_namespace_lock));
606 622
607 623 return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
608 624 }
609 625
610 626 /*
611 627 * ==========================================================================
612 628 * SPA spare and l2cache tracking
613 629 * ==========================================================================
614 630 */
615 631
616 632 /*
617 633 * Hot spares and cache devices are tracked using the same code below,
618 634 * for 'auxiliary' devices.
619 635 */
620 636
621 637 typedef struct spa_aux {
622 638 uint64_t aux_guid;
623 639 uint64_t aux_pool;
624 640 avl_node_t aux_avl;
625 641 int aux_count;
626 642 } spa_aux_t;
627 643
628 644 static int
629 645 spa_aux_compare(const void *a, const void *b)
630 646 {
631 647 const spa_aux_t *sa = a;
632 648 const spa_aux_t *sb = b;
633 649
634 650 if (sa->aux_guid < sb->aux_guid)
635 651 return (-1);
636 652 else if (sa->aux_guid > sb->aux_guid)
637 653 return (1);
638 654 else
639 655 return (0);
640 656 }
641 657
642 658 void
643 659 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
644 660 {
645 661 avl_index_t where;
646 662 spa_aux_t search;
647 663 spa_aux_t *aux;
648 664
649 665 search.aux_guid = vd->vdev_guid;
650 666 if ((aux = avl_find(avl, &search, &where)) != NULL) {
651 667 aux->aux_count++;
652 668 } else {
653 669 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
654 670 aux->aux_guid = vd->vdev_guid;
655 671 aux->aux_count = 1;
656 672 avl_insert(avl, aux, where);
657 673 }
658 674 }
659 675
660 676 void
661 677 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
662 678 {
663 679 spa_aux_t search;
664 680 spa_aux_t *aux;
665 681 avl_index_t where;
666 682
667 683 search.aux_guid = vd->vdev_guid;
668 684 aux = avl_find(avl, &search, &where);
669 685
670 686 ASSERT(aux != NULL);
671 687
672 688 if (--aux->aux_count == 0) {
673 689 avl_remove(avl, aux);
674 690 kmem_free(aux, sizeof (spa_aux_t));
675 691 } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
676 692 aux->aux_pool = 0ULL;
677 693 }
678 694 }
679 695
680 696 boolean_t
681 697 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
682 698 {
683 699 spa_aux_t search, *found;
684 700
685 701 search.aux_guid = guid;
686 702 found = avl_find(avl, &search, NULL);
687 703
688 704 if (pool) {
689 705 if (found)
690 706 *pool = found->aux_pool;
691 707 else
692 708 *pool = 0ULL;
693 709 }
694 710
695 711 if (refcnt) {
696 712 if (found)
697 713 *refcnt = found->aux_count;
698 714 else
699 715 *refcnt = 0;
700 716 }
701 717
702 718 return (found != NULL);
703 719 }
704 720
705 721 void
706 722 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
707 723 {
708 724 spa_aux_t search, *found;
709 725 avl_index_t where;
710 726
711 727 search.aux_guid = vd->vdev_guid;
712 728 found = avl_find(avl, &search, &where);
713 729 ASSERT(found != NULL);
714 730 ASSERT(found->aux_pool == 0ULL);
715 731
716 732 found->aux_pool = spa_guid(vd->vdev_spa);
717 733 }
718 734
719 735 /*
720 736 * Spares are tracked globally due to the following constraints:
721 737 *
722 738 * - A spare may be part of multiple pools.
723 739 * - A spare may be added to a pool even if it's actively in use within
724 740 * another pool.
725 741 * - A spare in use in any pool can only be the source of a replacement if
726 742 * the target is a spare in the same pool.
727 743 *
728 744 * We keep track of all spares on the system through the use of a reference
729 745 * counted AVL tree. When a vdev is added as a spare, or used as a replacement
730 746 * spare, then we bump the reference count in the AVL tree. In addition, we set
731 747 * the 'vdev_isspare' member to indicate that the device is a spare (active or
732 748 * inactive). When a spare is made active (used to replace a device in the
 733  749  * pool), we also keep track of which pool it's been made a part of.
734 750 *
735 751 * The 'spa_spare_lock' protects the AVL tree. These functions are normally
736 752 * called under the spa_namespace lock as part of vdev reconfiguration. The
737 753 * separate spare lock exists for the status query path, which does not need to
738 754 * be completely consistent with respect to other vdev configuration changes.
739 755 */
740 756
741 757 static int
742 758 spa_spare_compare(const void *a, const void *b)
743 759 {
744 760 return (spa_aux_compare(a, b));
745 761 }
746 762
747 763 void
748 764 spa_spare_add(vdev_t *vd)
749 765 {
750 766 mutex_enter(&spa_spare_lock);
751 767 ASSERT(!vd->vdev_isspare);
752 768 spa_aux_add(vd, &spa_spare_avl);
753 769 vd->vdev_isspare = B_TRUE;
754 770 mutex_exit(&spa_spare_lock);
755 771 }
756 772
757 773 void
758 774 spa_spare_remove(vdev_t *vd)
759 775 {
760 776 mutex_enter(&spa_spare_lock);
761 777 ASSERT(vd->vdev_isspare);
762 778 spa_aux_remove(vd, &spa_spare_avl);
763 779 vd->vdev_isspare = B_FALSE;
764 780 mutex_exit(&spa_spare_lock);
765 781 }
766 782
767 783 boolean_t
768 784 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
769 785 {
770 786 boolean_t found;
771 787
772 788 mutex_enter(&spa_spare_lock);
773 789 found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
774 790 mutex_exit(&spa_spare_lock);
775 791
776 792 return (found);
777 793 }
778 794
779 795 void
780 796 spa_spare_activate(vdev_t *vd)
781 797 {
782 798 mutex_enter(&spa_spare_lock);
783 799 ASSERT(vd->vdev_isspare);
784 800 spa_aux_activate(vd, &spa_spare_avl);
785 801 mutex_exit(&spa_spare_lock);
786 802 }
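
A hedged sketch of the status-query path mentioned in the comment above; the variables and what is done with the result are illustrative only:

	uint64_t pool_guid;
	int refcnt;

	/* Takes only spa_spare_lock internally; no namespace lock needed. */
	if (spa_spare_exists(vd->vdev_guid, &pool_guid, &refcnt)) {
		/*
		 * refcnt is the number of pools referencing this spare;
		 * pool_guid is nonzero only while some pool has it active.
		 */
	}
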
787 803
788 804 /*
789 805 * Level 2 ARC devices are tracked globally for the same reasons as spares.
790 806 * Cache devices currently only support one pool per cache device, and so
791 807 * for these devices the aux reference count is currently unused beyond 1.
792 808 */
793 809
794 810 static int
795 811 spa_l2cache_compare(const void *a, const void *b)
796 812 {
797 813 return (spa_aux_compare(a, b));
798 814 }
799 815
800 816 void
801 817 spa_l2cache_add(vdev_t *vd)
802 818 {
803 819 mutex_enter(&spa_l2cache_lock);
804 820 ASSERT(!vd->vdev_isl2cache);
805 821 spa_aux_add(vd, &spa_l2cache_avl);
806 822 vd->vdev_isl2cache = B_TRUE;
807 823 mutex_exit(&spa_l2cache_lock);
808 824 }
809 825
810 826 void
811 827 spa_l2cache_remove(vdev_t *vd)
812 828 {
813 829 mutex_enter(&spa_l2cache_lock);
814 830 ASSERT(vd->vdev_isl2cache);
815 831 spa_aux_remove(vd, &spa_l2cache_avl);
816 832 vd->vdev_isl2cache = B_FALSE;
817 833 mutex_exit(&spa_l2cache_lock);
818 834 }
819 835
820 836 boolean_t
821 837 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
822 838 {
823 839 boolean_t found;
824 840
825 841 mutex_enter(&spa_l2cache_lock);
826 842 found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
827 843 mutex_exit(&spa_l2cache_lock);
828 844
829 845 return (found);
830 846 }
831 847
832 848 void
833 849 spa_l2cache_activate(vdev_t *vd)
834 850 {
835 851 mutex_enter(&spa_l2cache_lock);
836 852 ASSERT(vd->vdev_isl2cache);
837 853 spa_aux_activate(vd, &spa_l2cache_avl);
838 854 mutex_exit(&spa_l2cache_lock);
839 855 }
840 856
841 857 /*
842 858 * ==========================================================================
843 859 * SPA vdev locking
844 860 * ==========================================================================
845 861 */
846 862
847 863 /*
848 864 * Lock the given spa_t for the purpose of adding or removing a vdev.
849 865 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
850 866 * It returns the next transaction group for the spa_t.
851 867 */
852 868 uint64_t
853 869 spa_vdev_enter(spa_t *spa)
854 870 {
855 871 mutex_enter(&spa->spa_vdev_top_lock);
856 872 mutex_enter(&spa_namespace_lock);
857 873 return (spa_vdev_config_enter(spa));
858 874 }
859 875
860 876 /*
861 877 * Internal implementation for spa_vdev_enter(). Used when a vdev
862 878 * operation requires multiple syncs (i.e. removing a device) while
863 879 * keeping the spa_namespace_lock held.
864 880 */
865 881 uint64_t
866 882 spa_vdev_config_enter(spa_t *spa)
867 883 {
868 884 ASSERT(MUTEX_HELD(&spa_namespace_lock));
869 885
870 886 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
871 887
872 888 return (spa_last_synced_txg(spa) + 1);
873 889 }
874 890
875 891 /*
876 892 * Used in combination with spa_vdev_config_enter() to allow the syncing
877 893 * of multiple transactions without releasing the spa_namespace_lock.
878 894 */
879 895 void
880 896 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
881 897 {
882 898 ASSERT(MUTEX_HELD(&spa_namespace_lock));
883 899
884 900 int config_changed = B_FALSE;
885 901
886 902 ASSERT(txg > spa_last_synced_txg(spa));
887 903
888 904 spa->spa_pending_vdev = NULL;
889 905
890 906 /*
891 907 * Reassess the DTLs.
892 908 */
893 909 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
894 910
895 911 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
896 912 config_changed = B_TRUE;
897 913 spa->spa_config_generation++;
898 914 }
899 915
900 916 /*
901 917 * Verify the metaslab classes.
902 918 */
903 919 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
904 920 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
905 921
906 922 spa_config_exit(spa, SCL_ALL, spa);
907 923
908 924 /*
909 925 * Panic the system if the specified tag requires it. This
910 926 * is useful for ensuring that configurations are updated
911 927 * transactionally.
912 928 */
913 929 if (zio_injection_enabled)
914 930 zio_handle_panic_injection(spa, tag, 0);
915 931
916 932 /*
917 933 * Note: this txg_wait_synced() is important because it ensures
918 934 * that there won't be more than one config change per txg.
919 935 * This allows us to use the txg as the generation number.
920 936 */
921 937 if (error == 0)
922 938 txg_wait_synced(spa->spa_dsl_pool, txg);
923 939
924 940 if (vd != NULL) {
925 941 ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
926 942 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
927 943 vdev_free(vd);
928 944 spa_config_exit(spa, SCL_ALL, spa);
929 945 }
930 946
931 947 /*
932 948 * If the config changed, update the config cache.
933 949 */
934 950 if (config_changed)
935 951 spa_config_sync(spa, B_FALSE, B_TRUE);
936 952 }
937 953
938 954 /*
939 955 * Unlock the spa_t after adding or removing a vdev. Besides undoing the
 940  956  * locking of spa_vdev_enter(), we also want to make sure the transactions have
941 957 * synced to disk, and then update the global configuration cache with the new
942 958 * information.
943 959 */
944 960 int
945 961 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
946 962 {
947 963 spa_vdev_config_exit(spa, vd, txg, error, FTAG);
948 964 mutex_exit(&spa_namespace_lock);
949 965 mutex_exit(&spa->spa_vdev_top_lock);
950 966
951 967 return (error);
952 968 }
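
A minimal sketch of the enter/exit pattern these wrappers implement; the reconfiguration step in the middle is a placeholder:

	uint64_t txg;
	int error = 0;

	txg = spa_vdev_enter(spa);	/* namespace lock + SCL_ALL as writer */

	/* ... add or remove a vdev for this txg (hypothetical step) ... */

	/* Drops the locks, waits for txg to sync, updates the config cache. */
	return (spa_vdev_exit(spa, NULL, txg, error));
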
953 969
954 970 /*
955 971 * Lock the given spa_t for the purpose of changing vdev state.
956 972 */
957 973 void
958 974 spa_vdev_state_enter(spa_t *spa, int oplocks)
959 975 {
960 976 int locks = SCL_STATE_ALL | oplocks;
961 977
962 978 /*
 963  979  * Root pools may need to read the underlying devfs filesystem
964 980 * when opening up a vdev. Unfortunately if we're holding the
965 981 * SCL_ZIO lock it will result in a deadlock when we try to issue
966 982 * the read from the root filesystem. Instead we "prefetch"
967 983 * the associated vnodes that we need prior to opening the
968 984 * underlying devices and cache them so that we can prevent
969 985 * any I/O when we are doing the actual open.
970 986 */
971 987 if (spa_is_root(spa)) {
972 988 int low = locks & ~(SCL_ZIO - 1);
973 989 int high = locks & ~low;
974 990
975 991 spa_config_enter(spa, high, spa, RW_WRITER);
976 992 vdev_hold(spa->spa_root_vdev);
977 993 spa_config_enter(spa, low, spa, RW_WRITER);
978 994 } else {
979 995 spa_config_enter(spa, locks, spa, RW_WRITER);
980 996 }
981 997 spa->spa_vdev_locks = locks;
982 998 }
983 999
984 1000 int
985 1001 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
986 1002 {
987 1003 boolean_t config_changed = B_FALSE;
988 1004
989 1005 if (vd != NULL || error == 0)
990 1006 vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
991 1007 0, 0, B_FALSE);
992 1008
993 1009 if (vd != NULL) {
994 1010 vdev_state_dirty(vd->vdev_top);
995 1011 config_changed = B_TRUE;
996 1012 spa->spa_config_generation++;
997 1013 }
998 1014
999 1015 if (spa_is_root(spa))
1000 1016 vdev_rele(spa->spa_root_vdev);
1001 1017
1002 1018 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1003 1019 spa_config_exit(spa, spa->spa_vdev_locks, spa);
1004 1020
1005 1021 /*
1006 1022 * If anything changed, wait for it to sync. This ensures that,
1007 1023 * from the system administrator's perspective, zpool(1M) commands
1008 1024 * are synchronous. This is important for things like zpool offline:
1009 1025 * when the command completes, you expect no further I/O from ZFS.
1010 1026 */
1011 1027 if (vd != NULL)
1012 1028 txg_wait_synced(spa->spa_dsl_pool, 0);
1013 1029
1014 1030 /*
1015 1031 * If the config changed, update the config cache.
1016 1032 */
1017 1033 if (config_changed) {
1018 1034 mutex_enter(&spa_namespace_lock);
1019 1035 spa_config_sync(spa, B_FALSE, B_TRUE);
1020 1036 mutex_exit(&spa_namespace_lock);
1021 1037 }
1022 1038
1023 1039 return (error);
1024 1040 }
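
The state-change wrappers follow the same shape; a hedged sketch of an online/offline-style caller, with the actual state manipulation left as a placeholder:

	spa_vdev_state_enter(spa, SCL_NONE);

	/* ... change the state of vd, e.g. fault or clear it ... */

	/* Waits for the change to sync and refreshes the config cache. */
	return (spa_vdev_state_exit(spa, vd, 0));
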
1025 1041
1026 1042 /*
1027 1043 * ==========================================================================
1028 1044 * Miscellaneous functions
1029 1045 * ==========================================================================
1030 1046 */
1031 1047
1048 +void
1049 +spa_activate_mos_feature(spa_t *spa, const char *feature)
1050 +{
1051 + (void) nvlist_add_boolean(spa->spa_label_features, feature);
1052 + vdev_config_dirty(spa->spa_root_vdev);
1053 +}
1054 +
1055 +void
1056 +spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1057 +{
1058 + (void) nvlist_remove_all(spa->spa_label_features, feature);
1059 + vdev_config_dirty(spa->spa_root_vdev);
1060 +}
1061 +
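
A hedged example of how the new helpers might be called when a read-incompatible feature first comes into use and later goes idle; the feature GUID is invented, and the real callers live in the feature-flag sync code rather than in this file:

	/* Advertise the feature in the on-disk labels once it is in use. */
	spa_activate_mos_feature(spa, "com.example:some_feature");

	/* ... later, when the last consumer of the feature goes away ... */
	spa_deactivate_mos_feature(spa, "com.example:some_feature");
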
1032 1062 /*
1033 1063 * Rename a spa_t.
1034 1064 */
1035 1065 int
1036 1066 spa_rename(const char *name, const char *newname)
1037 1067 {
1038 1068 spa_t *spa;
1039 1069 int err;
1040 1070
1041 1071 /*
1042 1072 * Lookup the spa_t and grab the config lock for writing. We need to
1043 1073 * actually open the pool so that we can sync out the necessary labels.
1044 1074 * It's OK to call spa_open() with the namespace lock held because we
1045 1075 * allow recursive calls for other reasons.
1046 1076 */
1047 1077 mutex_enter(&spa_namespace_lock);
1048 1078 if ((err = spa_open(name, &spa, FTAG)) != 0) {
1049 1079 mutex_exit(&spa_namespace_lock);
1050 1080 return (err);
1051 1081 }
1052 1082
1053 1083 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1054 1084
1055 1085 avl_remove(&spa_namespace_avl, spa);
1056 1086 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1057 1087 avl_add(&spa_namespace_avl, spa);
1058 1088
1059 1089 /*
1060 1090 * Sync all labels to disk with the new names by marking the root vdev
1061 1091 * dirty and waiting for it to sync. It will pick up the new pool name
1062 1092 * during the sync.
1063 1093 */
1064 1094 vdev_config_dirty(spa->spa_root_vdev);
1065 1095
1066 1096 spa_config_exit(spa, SCL_ALL, FTAG);
1067 1097
1068 1098 txg_wait_synced(spa->spa_dsl_pool, 0);
1069 1099
1070 1100 /*
1071 1101 * Sync the updated config cache.
1072 1102 */
1073 1103 spa_config_sync(spa, B_FALSE, B_TRUE);
1074 1104
1075 1105 spa_close(spa, FTAG);
1076 1106
1077 1107 mutex_exit(&spa_namespace_lock);
1078 1108
1079 1109 return (0);
1080 1110 }
1081 1111
1082 1112 /*
1083 1113 * Return the spa_t associated with given pool_guid, if it exists. If
1084 1114 * device_guid is non-zero, determine whether the pool exists *and* contains
1085 1115 * a device with the specified device_guid.
1086 1116 */
1087 1117 spa_t *
1088 1118 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1089 1119 {
1090 1120 spa_t *spa;
1091 1121 avl_tree_t *t = &spa_namespace_avl;
1092 1122
1093 1123 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1094 1124
1095 1125 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1096 1126 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1097 1127 continue;
1098 1128 if (spa->spa_root_vdev == NULL)
1099 1129 continue;
1100 1130 if (spa_guid(spa) == pool_guid) {
1101 1131 if (device_guid == 0)
1102 1132 break;
1103 1133
1104 1134 if (vdev_lookup_by_guid(spa->spa_root_vdev,
1105 1135 device_guid) != NULL)
1106 1136 break;
1107 1137
1108 1138 /*
1109 1139 * Check any devices we may be in the process of adding.
1110 1140 */
1111 1141 if (spa->spa_pending_vdev) {
1112 1142 if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1113 1143 device_guid) != NULL)
1114 1144 break;
1115 1145 }
1116 1146 }
1117 1147 }
1118 1148
1119 1149 return (spa);
1120 1150 }
1121 1151
1122 1152 /*
1123 1153 * Determine whether a pool with the given pool_guid exists.
1124 1154 */
1125 1155 boolean_t
1126 1156 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1127 1157 {
1128 1158 return (spa_by_guid(pool_guid, device_guid) != NULL);
1129 1159 }
1130 1160
1131 1161 char *
1132 1162 spa_strdup(const char *s)
1133 1163 {
1134 1164 size_t len;
1135 1165 char *new;
1136 1166
1137 1167 len = strlen(s);
1138 1168 new = kmem_alloc(len + 1, KM_SLEEP);
1139 1169 bcopy(s, new, len);
1140 1170 new[len] = '\0';
1141 1171
1142 1172 return (new);
1143 1173 }
1144 1174
1145 1175 void
1146 1176 spa_strfree(char *s)
1147 1177 {
1148 1178 kmem_free(s, strlen(s) + 1);
1149 1179 }
1150 1180
1151 1181 uint64_t
1152 1182 spa_get_random(uint64_t range)
1153 1183 {
1154 1184 uint64_t r;
1155 1185
1156 1186 ASSERT(range != 0);
1157 1187
1158 1188 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1159 1189
1160 1190 return (r % range);
1161 1191 }
1162 1192
1163 1193 uint64_t
1164 1194 spa_generate_guid(spa_t *spa)
1165 1195 {
1166 1196 uint64_t guid = spa_get_random(-1ULL);
1167 1197
1168 1198 if (spa != NULL) {
1169 1199 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1170 1200 guid = spa_get_random(-1ULL);
1171 1201 } else {
1172 1202 while (guid == 0 || spa_guid_exists(guid, 0))
1173 1203 guid = spa_get_random(-1ULL);
1174 1204 }
1175 1205
1176 1206 return (guid);
1177 1207 }
1178 1208
1179 1209 void
1180 1210 sprintf_blkptr(char *buf, const blkptr_t *bp)
1181 1211 {
1182 - char *type = NULL;
1212 + char type[256];
1183 1213 char *checksum = NULL;
1184 1214 char *compress = NULL;
1185 1215
1186 1216 if (bp != NULL) {
1187 - type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
1217 + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1218 + dmu_object_byteswap_t bswap =
1219 + DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1220 + (void) snprintf(type, sizeof (type), "bswap %s %s",
1221 + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1222 + "metadata" : "data",
1223 + dmu_ot_byteswap[bswap].ob_name);
1224 + } else {
1225 + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1226 + sizeof (type));
1227 + }
1188 1228 checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1189 1229 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1190 1230 }
1191 1231
1192 1232 SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
1193 1233 }
1194 1234
1195 1235 void
1196 1236 spa_freeze(spa_t *spa)
1197 1237 {
1198 1238 uint64_t freeze_txg = 0;
1199 1239
1200 1240 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1201 1241 if (spa->spa_freeze_txg == UINT64_MAX) {
1202 1242 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1203 1243 spa->spa_freeze_txg = freeze_txg;
1204 1244 }
1205 1245 spa_config_exit(spa, SCL_ALL, FTAG);
1206 1246 if (freeze_txg != 0)
1207 1247 txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1208 1248 }
1209 1249
1210 1250 void
1211 1251 zfs_panic_recover(const char *fmt, ...)
1212 1252 {
1213 1253 va_list adx;
1214 1254
1215 1255 va_start(adx, fmt);
1216 1256 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1217 1257 va_end(adx);
1218 1258 }
1219 1259
1220 1260 /*
1221 1261 * This is a stripped-down version of strtoull, suitable only for converting
 1222  1262  * lowercase hexadecimal numbers that don't overflow.
1223 1263 */
1224 1264 uint64_t
1225 1265 strtonum(const char *str, char **nptr)
1226 1266 {
1227 1267 uint64_t val = 0;
1228 1268 char c;
1229 1269 int digit;
1230 1270
1231 1271 while ((c = *str) != '\0') {
1232 1272 if (c >= '0' && c <= '9')
1233 1273 digit = c - '0';
1234 1274 else if (c >= 'a' && c <= 'f')
1235 1275 digit = 10 + c - 'a';
1236 1276 else
1237 1277 break;
1238 1278
1239 1279 val *= 16;
1240 1280 val += digit;
1241 1281
1242 1282 str++;
1243 1283 }
1244 1284
1245 1285 if (nptr)
1246 1286 *nptr = (char *)str;
1247 1287
1248 1288 return (val);
1249 1289 }
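
For instance (illustrative values only), parsing stops at the first non-hex character and the stop position is reported through nptr:

	char *end;
	uint64_t val = strtonum("1a2f,rest", &end);	/* val == 0x1a2f */
	/* *end == ',' here, pointing at the unparsed remainder. */
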
1250 1290
1251 1291 /*
1252 1292 * ==========================================================================
1253 1293 * Accessor functions
1254 1294 * ==========================================================================
1255 1295 */
1256 1296
1257 1297 boolean_t
1258 1298 spa_shutting_down(spa_t *spa)
1259 1299 {
1260 1300 return (spa->spa_async_suspended);
1261 1301 }
1262 1302
1263 1303 dsl_pool_t *
1264 1304 spa_get_dsl(spa_t *spa)
1265 1305 {
1266 1306 return (spa->spa_dsl_pool);
1267 1307 }
1268 1308
1309 +boolean_t
1310 +spa_is_initializing(spa_t *spa)
1311 +{
1312 + return (spa->spa_is_initializing);
1313 +}
1314 +
1269 1315 blkptr_t *
1270 1316 spa_get_rootblkptr(spa_t *spa)
1271 1317 {
1272 1318 return (&spa->spa_ubsync.ub_rootbp);
1273 1319 }
1274 1320
1275 1321 void
1276 1322 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1277 1323 {
1278 1324 spa->spa_uberblock.ub_rootbp = *bp;
1279 1325 }
1280 1326
1281 1327 void
1282 1328 spa_altroot(spa_t *spa, char *buf, size_t buflen)
1283 1329 {
1284 1330 if (spa->spa_root == NULL)
1285 1331 buf[0] = '\0';
1286 1332 else
1287 1333 (void) strncpy(buf, spa->spa_root, buflen);
1288 1334 }
1289 1335
1290 1336 int
1291 1337 spa_sync_pass(spa_t *spa)
1292 1338 {
1293 1339 return (spa->spa_sync_pass);
1294 1340 }
1295 1341
1296 1342 char *
1297 1343 spa_name(spa_t *spa)
1298 1344 {
1299 1345 return (spa->spa_name);
1300 1346 }
1301 1347
1302 1348 uint64_t
1303 1349 spa_guid(spa_t *spa)
1304 1350 {
1305 1351 /*
1306 1352 * If we fail to parse the config during spa_load(), we can go through
1307 1353 * the error path (which posts an ereport) and end up here with no root
1308 1354 * vdev. We stash the original pool guid in 'spa_config_guid' to handle
1309 1355 * this case.
1310 1356 */
1311 1357 if (spa->spa_root_vdev != NULL)
1312 1358 return (spa->spa_root_vdev->vdev_guid);
1313 1359 else
1314 1360 return (spa->spa_config_guid);
1315 1361 }
1316 1362
1317 1363 uint64_t
1318 1364 spa_load_guid(spa_t *spa)
1319 1365 {
1320 1366 /*
1321 1367 * This is a GUID that exists solely as a reference for the
1322 1368 * purposes of the arc. It is generated at load time, and
1323 1369 * is never written to persistent storage.
1324 1370 */
1325 1371 return (spa->spa_load_guid);
1326 1372 }
1327 1373
1328 1374 uint64_t
1329 1375 spa_last_synced_txg(spa_t *spa)
1330 1376 {
1331 1377 return (spa->spa_ubsync.ub_txg);
1332 1378 }
1333 1379
1334 1380 uint64_t
1335 1381 spa_first_txg(spa_t *spa)
1336 1382 {
1337 1383 return (spa->spa_first_txg);
1338 1384 }
1339 1385
1340 1386 uint64_t
1341 1387 spa_syncing_txg(spa_t *spa)
1342 1388 {
1343 1389 return (spa->spa_syncing_txg);
1344 1390 }
1345 1391
1346 1392 pool_state_t
1347 1393 spa_state(spa_t *spa)
1348 1394 {
1349 1395 return (spa->spa_state);
1350 1396 }
1351 1397
1352 1398 spa_load_state_t
1353 1399 spa_load_state(spa_t *spa)
1354 1400 {
1355 1401 return (spa->spa_load_state);
1356 1402 }
1357 1403
1358 1404 uint64_t
1359 1405 spa_freeze_txg(spa_t *spa)
1360 1406 {
1361 1407 return (spa->spa_freeze_txg);
1362 1408 }
1363 1409
1364 1410 /* ARGSUSED */
1365 1411 uint64_t
1366 1412 spa_get_asize(spa_t *spa, uint64_t lsize)
1367 1413 {
1368 1414 /*
1369 1415 * The worst case is single-sector max-parity RAID-Z blocks, in which
1370 1416 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
1371 1417 * times the size; so just assume that. Add to this the fact that
1372 1418 * we can have up to 3 DVAs per bp, and one more factor of 2 because
1373 1419 * the block may be dittoed with up to 3 DVAs by ddt_sync().
1374 1420 */
1375 1421 return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
1376 1422 }
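
Concretely, with VDEV_RAIDZ_MAXPARITY == 3 and SPA_DVAS_PER_BP == 3 the multiplier works out to (3 + 1) * 3 * 2 = 24, so for example a 128K logical block may reserve up to 3M of allocated space under this worst-case estimate.
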
1377 1423
1378 1424 uint64_t
1379 1425 spa_get_dspace(spa_t *spa)
1380 1426 {
1381 1427 return (spa->spa_dspace);
1382 1428 }
1383 1429
1384 1430 void
1385 1431 spa_update_dspace(spa_t *spa)
1386 1432 {
1387 1433 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1388 1434 ddt_get_dedup_dspace(spa);
1389 1435 }
1390 1436
1391 1437 /*
1392 1438 * Return the failure mode that has been set to this pool. The default
1393 1439 * behavior will be to block all I/Os when a complete failure occurs.
1394 1440 */
1395 1441 uint8_t
1396 1442 spa_get_failmode(spa_t *spa)
1397 1443 {
1398 1444 return (spa->spa_failmode);
1399 1445 }
1400 1446
1401 1447 boolean_t
1402 1448 spa_suspended(spa_t *spa)
1403 1449 {
1404 1450 return (spa->spa_suspended);
1405 1451 }
1406 1452
1407 1453 uint64_t
1408 1454 spa_version(spa_t *spa)
1409 1455 {
1410 1456 return (spa->spa_ubsync.ub_version);
1411 1457 }
1412 1458
1413 1459 boolean_t
1414 1460 spa_deflate(spa_t *spa)
1415 1461 {
1416 1462 return (spa->spa_deflate);
1417 1463 }
1418 1464
1419 1465 metaslab_class_t *
1420 1466 spa_normal_class(spa_t *spa)
1421 1467 {
1422 1468 return (spa->spa_normal_class);
1423 1469 }
1424 1470
1425 1471 metaslab_class_t *
1426 1472 spa_log_class(spa_t *spa)
1427 1473 {
1428 1474 return (spa->spa_log_class);
1429 1475 }
1430 1476
1431 1477 int
1432 1478 spa_max_replication(spa_t *spa)
1433 1479 {
1434 1480 /*
1435 1481 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1436 1482 * handle BPs with more than one DVA allocated. Set our max
1437 1483 * replication level accordingly.
1438 1484 */
1439 1485 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1440 1486 return (1);
1441 1487 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1442 1488 }
1443 1489
1444 1490 int
1445 1491 spa_prev_software_version(spa_t *spa)
1446 1492 {
1447 1493 return (spa->spa_prev_software_version);
1448 1494 }
1449 1495
1450 1496 uint64_t
1451 1497 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1452 1498 {
1453 1499 uint64_t asize = DVA_GET_ASIZE(dva);
1454 1500 uint64_t dsize = asize;
1455 1501
1456 1502 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1457 1503
1458 1504 if (asize != 0 && spa->spa_deflate) {
1459 1505 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
1460 1506 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1461 1507 }
1462 1508
1463 1509 return (dsize);
1464 1510 }
1465 1511
1466 1512 uint64_t
1467 1513 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1468 1514 {
1469 1515 uint64_t dsize = 0;
1470 1516
1471 1517 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
1472 1518 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1473 1519
1474 1520 return (dsize);
1475 1521 }
1476 1522
1477 1523 uint64_t
1478 1524 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
1479 1525 {
1480 1526 uint64_t dsize = 0;
1481 1527
1482 1528 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1483 1529
1484 1530 for (int d = 0; d < SPA_DVAS_PER_BP; d++)
1485 1531 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1486 1532
1487 1533 spa_config_exit(spa, SCL_VDEV, FTAG);
1488 1534
1489 1535 return (dsize);
1490 1536 }
1491 1537
1492 1538 /*
1493 1539 * ==========================================================================
1494 1540 * Initialization and Termination
1495 1541 * ==========================================================================
1496 1542 */
1497 1543
1498 1544 static int
1499 1545 spa_name_compare(const void *a1, const void *a2)
1500 1546 {
1501 1547 const spa_t *s1 = a1;
1502 1548 const spa_t *s2 = a2;
1503 1549 int s;
1504 1550
1505 1551 s = strcmp(s1->spa_name, s2->spa_name);
1506 1552 if (s > 0)
1507 1553 return (1);
1508 1554 if (s < 0)
1509 1555 return (-1);
1510 1556 return (0);
1511 1557 }
1512 1558
1513 1559 int
1514 1560 spa_busy(void)
1515 1561 {
1516 1562 return (spa_active_count);
1517 1563 }
1518 1564
1519 1565 void
1520 1566 spa_boot_init()
1521 1567 {
1522 1568 spa_config_load();
1523 1569 }
1524 1570
1525 1571 void
1526 1572 spa_init(int mode)
1527 1573 {
1528 1574 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1529 1575 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1530 1576 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1531 1577 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1532 1578
1533 1579 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1534 1580 offsetof(spa_t, spa_avl));
1535 1581
1536 1582 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1537 1583 offsetof(spa_aux_t, aux_avl));
1538 1584
1539 1585 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1540 1586 offsetof(spa_aux_t, aux_avl));
1541 1587
1542 1588 spa_mode_global = mode;
1543 1589
1544 1590 refcount_init();
1545 1591 unique_init();
1546 1592 zio_init();
1547 1593 dmu_init();
1548 1594 zil_init();
1549 1595 vdev_cache_stat_init();
1550 1596 zfs_prop_init();
1551 1597 zpool_prop_init();
1598 + zpool_feature_init();
1552 1599 spa_config_load();
1553 1600 l2arc_start();
1554 1601 }
1555 1602
1556 1603 void
1557 1604 spa_fini(void)
1558 1605 {
1559 1606 l2arc_stop();
1560 1607
1561 1608 spa_evict_all();
1562 1609
1563 1610 vdev_cache_stat_fini();
1564 1611 zil_fini();
1565 1612 dmu_fini();
1566 1613 zio_fini();
1567 1614 unique_fini();
1568 1615 refcount_fini();
1569 1616
1570 1617 avl_destroy(&spa_namespace_avl);
1571 1618 avl_destroy(&spa_spare_avl);
1572 1619 avl_destroy(&spa_l2cache_avl);
1573 1620
1574 1621 cv_destroy(&spa_namespace_cv);
1575 1622 mutex_destroy(&spa_namespace_lock);
1576 1623 mutex_destroy(&spa_spare_lock);
1577 1624 mutex_destroy(&spa_l2cache_lock);
1578 1625 }
1579 1626
1580 1627 /*
1581 1628 * Return whether this pool has slogs. No locking needed.
1582 1629 * It's not a problem if the wrong answer is returned as it's only for
1583 1630 * performance and not correctness
1584 1631 */
1585 1632 boolean_t
1586 1633 spa_has_slogs(spa_t *spa)
1587 1634 {
1588 1635 return (spa->spa_log_class->mc_rotor != NULL);
1589 1636 }
1590 1637
1591 1638 spa_log_state_t
1592 1639 spa_get_log_state(spa_t *spa)
1593 1640 {
1594 1641 return (spa->spa_log_state);
1595 1642 }
1596 1643
1597 1644 void
1598 1645 spa_set_log_state(spa_t *spa, spa_log_state_t state)
1599 1646 {
1600 1647 spa->spa_log_state = state;
1601 1648 }
1602 1649
1603 1650 boolean_t
1604 1651 spa_is_root(spa_t *spa)
1605 1652 {
1606 1653 return (spa->spa_is_root);
1607 1654 }
1608 1655
1609 1656 boolean_t
1610 1657 spa_writeable(spa_t *spa)
1611 1658 {
1612 1659 return (!!(spa->spa_mode & FWRITE));
1613 1660 }
1614 1661
1615 1662 int
1616 1663 spa_mode(spa_t *spa)
1617 1664 {
1618 1665 return (spa->spa_mode);
1619 1666 }
1620 1667
1621 1668 uint64_t
1622 1669 spa_bootfs(spa_t *spa)
1623 1670 {
1624 1671 return (spa->spa_bootfs);
1625 1672 }
1626 1673
1627 1674 uint64_t
1628 1675 spa_delegation(spa_t *spa)
1629 1676 {
1630 1677 return (spa->spa_delegation);
1631 1678 }
1632 1679
1633 1680 objset_t *
1634 1681 spa_meta_objset(spa_t *spa)
1635 1682 {
1636 1683 return (spa->spa_meta_objset);
1637 1684 }
1638 1685
1639 1686 enum zio_checksum
1640 1687 spa_dedup_checksum(spa_t *spa)
1641 1688 {
1642 1689 return (spa->spa_dedup_checksum);
1643 1690 }
1644 1691
1645 1692 /*
1646 1693 * Reset pool scan stat per scan pass (or reboot).
1647 1694 */
1648 1695 void
1649 1696 spa_scan_stat_init(spa_t *spa)
1650 1697 {
1651 1698 /* data not stored on disk */
1652 1699 spa->spa_scan_pass_start = gethrestime_sec();
1653 1700 spa->spa_scan_pass_exam = 0;
1654 1701 vdev_scan_stat_init(spa->spa_root_vdev);
1655 1702 }
1656 1703
1657 1704 /*
1658 1705 * Get scan stats for zpool status reports
1659 1706 */
1660 1707 int
1661 1708 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
1662 1709 {
1663 1710 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
1664 1711
1665 1712 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
1666 1713 return (ENOENT);
1667 1714 bzero(ps, sizeof (pool_scan_stat_t));
1668 1715
1669 1716 /* data stored on disk */
1670 1717 ps->pss_func = scn->scn_phys.scn_func;
1671 1718 ps->pss_start_time = scn->scn_phys.scn_start_time;
1672 1719 ps->pss_end_time = scn->scn_phys.scn_end_time;
1673 1720 ps->pss_to_examine = scn->scn_phys.scn_to_examine;
1674 1721 ps->pss_examined = scn->scn_phys.scn_examined;
1675 1722 ps->pss_to_process = scn->scn_phys.scn_to_process;
1676 1723 ps->pss_processed = scn->scn_phys.scn_processed;
1677 1724 ps->pss_errors = scn->scn_phys.scn_errors;
1678 1725 ps->pss_state = scn->scn_phys.scn_state;
1679 1726
1680 1727 /* data not stored on disk */
1681 1728 ps->pss_pass_start = spa->spa_scan_pass_start;
1682 1729 ps->pss_pass_exam = spa->spa_scan_pass_exam;
1683 1730
1684 1731 return (0);
1685 1732 }
1686 1733
1687 1734 boolean_t
1688 1735 spa_debug_enabled(spa_t *spa)
1689 1736 {
1690 1737 return (spa->spa_debug);
1691 1738 }