5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
--- old/usr/src/uts/common/fs/zfs/spa_misc.c
+++ new/usr/src/uts/common/fs/zfs/spa_misc.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 26 */
26 27
27 28 #include <sys/zfs_context.h>
28 29 #include <sys/spa_impl.h>
29 30 #include <sys/spa_boot.h>
30 31 #include <sys/zio.h>
31 32 #include <sys/zio_checksum.h>
32 33 #include <sys/zio_compress.h>
33 34 #include <sys/dmu.h>
34 35 #include <sys/dmu_tx.h>
35 36 #include <sys/zap.h>
36 37 #include <sys/zil.h>
37 38 #include <sys/vdev_impl.h>
38 39 #include <sys/metaslab.h>
39 40 #include <sys/uberblock_impl.h>
40 41 #include <sys/txg.h>
41 42 #include <sys/avl.h>
42 43 #include <sys/unique.h>
43 44 #include <sys/dsl_pool.h>
44 45 #include <sys/dsl_dir.h>
45 46 #include <sys/dsl_prop.h>
46 47 #include <sys/dsl_scan.h>
47 48 #include <sys/fs/zfs.h>
48 49 #include <sys/metaslab_impl.h>
49 50 #include <sys/arc.h>
50 51 #include <sys/ddt.h>
51 52 #include "zfs_prop.h"
52 53 #include "zfeature_common.h"
53 54
54 55 /*
55 56 * SPA locking
56 57 *
57 58 * There are four basic locks for managing spa_t structures:
58 59 *
59 60 * spa_namespace_lock (global mutex)
60 61 *
61 62 * This lock must be acquired to do any of the following:
62 63 *
63 64 * - Lookup a spa_t by name
64 65 * - Add or remove a spa_t from the namespace
65 66 * - Increase spa_refcount from non-zero
66 67 * - Check if spa_refcount is zero
67 68 * - Rename a spa_t
68 69 * - add/remove/attach/detach devices
69 70 * - Held for the duration of create/destroy/import/export
70 71 *
71 72 * It does not need to handle recursion. A create or destroy may
72 73 * reference objects (files or zvols) in other pools, but by
73 74 * definition they must have an existing reference, and will never need
74 75 * to lookup a spa_t by name.
75 76 *
76 77 * spa_refcount (per-spa refcount_t protected by mutex)
77 78 *
78 79 * This reference count keeps track of any active users of the spa_t. The
79 80 * spa_t cannot be destroyed or freed while this is non-zero. Internally,
80 81 * the refcount is never really 'zero' - opening a pool implicitly keeps
81 82 * some references in the DMU. Internally we check against spa_minref, but
82 83 * present the image of a zero/non-zero value to consumers.
83 84 *
84 85 * spa_config_lock[] (per-spa array of rwlocks)
85 86 *
86 87 * This protects the spa_t from config changes, and must be held in
87 88 * the following circumstances:
88 89 *
89 90 * - RW_READER to perform I/O to the spa
90 91 * - RW_WRITER to change the vdev config
91 92 *
92 93 * The locking order is fairly straightforward:
93 94 *
94 95 * spa_namespace_lock -> spa_refcount
95 96 *
96 97 * The namespace lock must be acquired to increase the refcount from 0
97 98 * or to check if it is zero.
98 99 *
99 100 * spa_refcount -> spa_config_lock[]
100 101 *
101 102 * There must be at least one valid reference on the spa_t to acquire
102 103 * the config lock.
103 104 *
104 105 * spa_namespace_lock -> spa_config_lock[]
105 106 *
106 107 * The namespace lock must always be taken before the config lock.
107 108 *
108 109 *
109 110 * The spa_namespace_lock can be acquired directly and is globally visible.
110 111 *
111 112 * The namespace is manipulated using the following functions, all of which
112 113 * require the spa_namespace_lock to be held.
113 114 *
114 115 * spa_lookup() Lookup a spa_t by name.
115 116 *
116 117 * spa_add() Create a new spa_t in the namespace.
117 118 *
118 119 * spa_remove() Remove a spa_t from the namespace. This also
119 120 * frees up any memory associated with the spa_t.
120 121 *
121 122 * spa_next() Returns the next spa_t in the system, or the
122 123 * first if NULL is passed.
123 124 *
124 125 * spa_evict_all() Shutdown and remove all spa_t structures in
125 126 * the system.
126 127 *
127 128 * spa_guid_exists() Determine whether a pool/device guid exists.
128 129 *
129 130 * The spa_refcount is manipulated using the following functions:
130 131 *
131 132 * spa_open_ref() Adds a reference to the given spa_t. Must be
132 133 * called with spa_namespace_lock held if the
133 134 * refcount is currently zero.
134 135 *
135 136 * spa_close() Remove a reference from the spa_t. This will
136 137 * not free the spa_t or remove it from the
137 138 * namespace. No locking is required.
138 139 *
139 140 * spa_refcount_zero() Returns true if the refcount is currently
140 141 * zero. Must be called with spa_namespace_lock
141 142 * held.
142 143 *
143 144 * The spa_config_lock[] is an array of rwlocks, ordered as follows:
144 145 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
145 146 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
146 147 *
147 148 * To read the configuration, it suffices to hold one of these locks as reader.
148 149 * To modify the configuration, you must hold all locks as writer. To modify
149 150 * vdev state without altering the vdev tree's topology (e.g. online/offline),
150 151 * you must hold SCL_STATE and SCL_ZIO as writer.
151 152 *
152 153 * We use these distinct config locks to avoid recursive lock entry.
153 154 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
154 155 * block allocations (SCL_ALLOC), which may require reading space maps
155 156 * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
156 157 *
157 158 * The spa config locks cannot be normal rwlocks because we need the
158 159 * ability to hand off ownership. For example, SCL_ZIO is acquired
159 160 * by the issuing thread and later released by an interrupt thread.
160 161 * They do, however, obey the usual write-wanted semantics to prevent
161 162 * writer (i.e. system administrator) starvation.
162 163 *
163 164 * The lock acquisition rules are as follows:
164 165 *
165 166 * SCL_CONFIG
166 167 * Protects changes to the vdev tree topology, such as vdev
167 168 * add/remove/attach/detach. Protects the dirty config list
168 169 * (spa_config_dirty_list) and the set of spares and l2arc devices.
169 170 *
170 171 * SCL_STATE
171 172 * Protects changes to pool state and vdev state, such as vdev
172 173 * online/offline/fault/degrade/clear. Protects the dirty state list
173 174 * (spa_state_dirty_list) and global pool state (spa_state).
174 175 *
175 176 * SCL_ALLOC
176 177 * Protects changes to metaslab groups and classes.
177 178 * Held as reader by metaslab_alloc() and metaslab_claim().
178 179 *
179 180 * SCL_ZIO
180 181 * Held by bp-level zios (those which have no io_vd upon entry)
181 182 * to prevent changes to the vdev tree. The bp-level zio implicitly
182 183 * protects all of its vdev child zios, which do not hold SCL_ZIO.
183 184 *
184 185 * SCL_FREE
185 186 * Protects changes to metaslab groups and classes.
186 187 * Held as reader by metaslab_free(). SCL_FREE is distinct from
187 188 * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
188 189 * blocks in zio_done() while another i/o that holds either
189 190 * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
190 191 *
191 192 * SCL_VDEV
192 193 * Held as reader to prevent changes to the vdev tree during trivial
193 194 * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
194 195 * other locks, and lower than all of them, to ensure that it's safe
195 196 * to acquire regardless of caller context.
196 197 *
197 198 * In addition, the following rules apply:
198 199 *
199 200 * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
200 201 * The lock ordering is SCL_CONFIG > spa_props_lock.
201 202 *
202 203 * (b) I/O operations on leaf vdevs. For any zio operation that takes
203 204 * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
204 205 * or zio_write_phys() -- the caller must ensure that the config
205 206 * cannot change in the interim, and that the vdev cannot be reopened.
206 207 * SCL_STATE as reader suffices for both.
207 208 *
208 209 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
209 210 *
210 211 * spa_vdev_enter() Acquire the namespace lock and the config lock
211 212 * for writing.
212 213 *
213 214 * spa_vdev_exit() Release the config lock, wait for all I/O
214 215 * to complete, sync the updated configs to the
215 216 * cache, and release the namespace lock.
216 217 *
217 218 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
218 219 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
219 220 * locking is, always, based on spa_namespace_lock and spa_config_lock[].
220 221 *
221 222 * spa_rename() is also implemented within this file since it requires
222 223 * manipulation of the namespace.
223 224 */
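
As a quick orientation, here is a minimal sketch of how a consumer follows the
ordering described above (spa_namespace_lock -> spa_refcount ->
spa_config_lock[]). The pool name, tag, and error handling are illustrative
only and are not part of this change; most callers go through
spa_open()/spa_close() rather than using these primitives directly.

	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup("tank")) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	spa_open_ref(spa, FTAG);	/* namespace lock held, so a "zero" refcount is OK */
	mutex_exit(&spa_namespace_lock);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	/* ... inspect the vdev tree ... */
	spa_config_exit(spa, SCL_VDEV, FTAG);

	spa_close(spa, FTAG);		/* no locking required */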
224 225
225 226 static avl_tree_t spa_namespace_avl;
226 227 kmutex_t spa_namespace_lock;
227 228 static kcondvar_t spa_namespace_cv;
228 229 static int spa_active_count;
229 230 int spa_max_replication_override = SPA_DVAS_PER_BP;
230 231
231 232 static kmutex_t spa_spare_lock;
232 233 static avl_tree_t spa_spare_avl;
233 234 static kmutex_t spa_l2cache_lock;
234 235 static avl_tree_t spa_l2cache_avl;
235 236
236 237 kmem_cache_t *spa_buffer_pool;
237 238 int spa_mode_global;
238 239
239 240 #ifdef ZFS_DEBUG
240 241 /* Everything except dprintf and spa is on by default in debug builds */
241 242 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
242 243 #else
243 244 int zfs_flags = 0;
244 245 #endif
245 246
246 247 /*
247 248 * zfs_recover can be set to nonzero to attempt to recover from
248 249 * otherwise-fatal errors, typically caused by on-disk corruption. When
249 250 * set, calls to zfs_panic_recover() will turn into warning messages.
250 251 * This should only be used as a last resort, as it typically results
251 252 * in leaked space, or worse.
252 253 */
253 254 boolean_t zfs_recover = B_FALSE;
254 255
255 256 /*
256 257 * If destroy encounters an EIO while reading metadata (e.g. indirect
257 258 * blocks), space referenced by the missing metadata can not be freed.
258 259 * Normally this causes the background destroy to become "stalled", as
259 260 * it is unable to make forward progress. While in this stalled state,
260 261 * all remaining space to free from the error-encountering filesystem is
261 262 * "temporarily leaked". Set this flag to cause it to ignore the EIO,
262 263 * permanently leak the space from indirect blocks that can not be read,
263 264 * and continue to free everything else that it can.
264 265 *
265 266 * The default, "stalling" behavior is useful if the storage partially
266 267 * fails (i.e. some but not all i/os fail), and then later recovers. In
267 268 * this case, we will be able to continue pool operations while it is
268 269 * partially failed, and when it recovers, we can continue to free the
269 270 * space, with no leaks. However, note that this case is actually
270 271 * fairly rare.
271 272 *
272 273 * Typically pools either (a) fail completely (but perhaps temporarily,
273 274 * e.g. a top-level vdev going offline), or (b) have localized,
274 275 * permanent errors (e.g. disk returns the wrong data due to bit flip or
275 276 * firmware bug). In case (a), this setting does not matter because the
276 277 * pool will be suspended and the sync thread will not be able to make
277 278 * forward progress regardless. In case (b), because the error is
278 279 * permanent, the best we can do is leak the minimum amount of space,
279 280 * which is what setting this flag will do. Therefore, it is reasonable
280 281 * for this flag to normally be set, but we chose the more conservative
281 282 * approach of not setting it, so that there is no possibility of
282 283 * leaking space in the "partial temporary" failure case.
283 284 */
284 285 boolean_t zfs_free_leak_on_eio = B_FALSE;
285 286
286 287 /*
287 288 * Expiration time in milliseconds. This value has two meanings. First it is
288 289 * used to determine when the spa_deadman() logic should fire. By default the
289 290 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
290 291 * Secondly, the value determines if an I/O is considered "hung". Any I/O that
291 292 * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
292 293 * in a system panic.
293 294 */
294 295 uint64_t zfs_deadman_synctime_ms = 1000000ULL;
295 296
296 297 /*
297 298 * Check time in milliseconds. This defines the frequency at which we check
298 299 * for hung I/O.
299 300 */
300 301 uint64_t zfs_deadman_checktime_ms = 5000ULL;
301 302
302 303 /*
303 304 * Override the zfs deadman behavior via /etc/system. By default the
304 305 * deadman is enabled except on VMware and sparc deployments.
305 306 */
306 307 int zfs_deadman_enabled = -1;
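
For reference, a sketch of the /etc/system syntax for these deadman tunables
(the values shown only illustrate the format; they are not recommendations):

	set zfs:zfs_deadman_enabled = 0
	set zfs:zfs_deadman_synctime_ms = 600000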
307 308
308 309 /*
309 310 * The worst case is single-sector max-parity RAID-Z blocks, in which
310 311 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
311 312 * times the size; so just assume that. Add to this the fact that
312 313 * we can have up to 3 DVAs per bp, and one more factor of 2 because
313 314 * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
314 315 * the worst case is:
315 316 * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
316 317 */
317 318 int spa_asize_inflation = 24;
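
Plugging in the usual constants (VDEV_RAIDZ_MAXPARITY == 3, SPA_DVAS_PER_BP == 3),
the worked-out worst case is (3 + 1) * 3 * 2 == 24, which is the default above.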
318 319
319 320 /*
320 321 * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
321 322 * the pool to be consumed. This ensures that we don't run the pool
322 323 * completely out of space, due to unaccounted changes (e.g. to the MOS).
323 324 * It also limits the worst-case time to allocate space. If we have
324 325 * less than this amount of free space, most ZPL operations (e.g. write,
325 326 * create) will return ENOSPC.
326 327 *
327 328 * Certain operations (e.g. file removal, most administrative actions) can
328 329 * use half the slop space. They will only return ENOSPC if less than half
329 330 * the slop space is free. Typically, once the pool has less than the slop
330 331 * space free, the user will use these operations to free up space in the pool.
331 332 * These are the operations that call dsl_pool_adjustedsize() with the netfree
332 333 * argument set to TRUE.
333 334 *
334 335 * A very restricted set of operations are always permitted, regardless of
335 336 * the amount of free space. These are the operations that call
336 337 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
337 338 * operations result in a net increase in the amount of space used,
338 339 * it is possible to run the pool completely out of space, causing it to
339 340 * be permanently read-only.
340 341 *
341 342 * See also the comments in zfs_space_check_t.
342 343 */
343 344 int spa_slop_shift = 5;
344 345
345 346 /*
346 347 * ==========================================================================
347 348 * SPA config locking
348 349 * ==========================================================================
349 350 */
350 351 static void
351 352 spa_config_lock_init(spa_t *spa)
352 353 {
353 354 for (int i = 0; i < SCL_LOCKS; i++) {
354 355 spa_config_lock_t *scl = &spa->spa_config_lock[i];
355 356 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
356 357 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
357 358 refcount_create_untracked(&scl->scl_count);
358 359 scl->scl_writer = NULL;
359 360 scl->scl_write_wanted = 0;
360 361 }
361 362 }
362 363
363 364 static void
364 365 spa_config_lock_destroy(spa_t *spa)
365 366 {
366 367 for (int i = 0; i < SCL_LOCKS; i++) {
367 368 spa_config_lock_t *scl = &spa->spa_config_lock[i];
368 369 mutex_destroy(&scl->scl_lock);
369 370 cv_destroy(&scl->scl_cv);
370 371 refcount_destroy(&scl->scl_count);
371 372 ASSERT(scl->scl_writer == NULL);
372 373 ASSERT(scl->scl_write_wanted == 0);
373 374 }
374 375 }
375 376
376 377 int
377 378 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
378 379 {
379 380 for (int i = 0; i < SCL_LOCKS; i++) {
380 381 spa_config_lock_t *scl = &spa->spa_config_lock[i];
381 382 if (!(locks & (1 << i)))
382 383 continue;
383 384 mutex_enter(&scl->scl_lock);
384 385 if (rw == RW_READER) {
385 386 if (scl->scl_writer || scl->scl_write_wanted) {
386 387 mutex_exit(&scl->scl_lock);
387 388 spa_config_exit(spa, locks ^ (1 << i), tag);
388 389 return (0);
389 390 }
390 391 } else {
391 392 ASSERT(scl->scl_writer != curthread);
392 393 if (!refcount_is_zero(&scl->scl_count)) {
393 394 mutex_exit(&scl->scl_lock);
394 395 spa_config_exit(spa, locks ^ (1 << i), tag);
395 396 return (0);
396 397 }
397 398 scl->scl_writer = curthread;
398 399 }
399 400 (void) refcount_add(&scl->scl_count, tag);
400 401 mutex_exit(&scl->scl_lock);
401 402 }
402 403 return (1);
403 404 }
404 405
405 406 void
406 407 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw)
407 408 {
408 409 int wlocks_held = 0;
409 410
410 411 ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
411 412
412 413 for (int i = 0; i < SCL_LOCKS; i++) {
413 414 spa_config_lock_t *scl = &spa->spa_config_lock[i];
414 415 if (scl->scl_writer == curthread)
415 416 wlocks_held |= (1 << i);
416 417 if (!(locks & (1 << i)))
417 418 continue;
418 419 mutex_enter(&scl->scl_lock);
419 420 if (rw == RW_READER) {
420 421 while (scl->scl_writer || scl->scl_write_wanted) {
421 422 cv_wait(&scl->scl_cv, &scl->scl_lock);
422 423 }
423 424 } else {
424 425 ASSERT(scl->scl_writer != curthread);
425 426 while (!refcount_is_zero(&scl->scl_count)) {
426 427 scl->scl_write_wanted++;
427 428 cv_wait(&scl->scl_cv, &scl->scl_lock);
428 429 scl->scl_write_wanted--;
429 430 }
430 431 scl->scl_writer = curthread;
431 432 }
432 433 (void) refcount_add(&scl->scl_count, tag);
433 434 mutex_exit(&scl->scl_lock);
434 435 }
435 436 ASSERT(wlocks_held <= locks);
436 437 }
437 438
438 439 void
439 440 spa_config_exit(spa_t *spa, int locks, void *tag)
440 441 {
441 442 for (int i = SCL_LOCKS - 1; i >= 0; i--) {
442 443 spa_config_lock_t *scl = &spa->spa_config_lock[i];
443 444 if (!(locks & (1 << i)))
444 445 continue;
445 446 mutex_enter(&scl->scl_lock);
446 447 ASSERT(!refcount_is_zero(&scl->scl_count));
447 448 if (refcount_remove(&scl->scl_count, tag) == 0) {
448 449 ASSERT(scl->scl_writer == NULL ||
449 450 scl->scl_writer == curthread);
450 451 scl->scl_writer = NULL; /* OK in either case */
451 452 cv_broadcast(&scl->scl_cv);
452 453 }
453 454 mutex_exit(&scl->scl_lock);
454 455 }
455 456 }
456 457
457 458 int
458 459 spa_config_held(spa_t *spa, int locks, krw_t rw)
459 460 {
460 461 int locks_held = 0;
461 462
462 463 for (int i = 0; i < SCL_LOCKS; i++) {
463 464 spa_config_lock_t *scl = &spa->spa_config_lock[i];
464 465 if (!(locks & (1 << i)))
465 466 continue;
466 467 if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) ||
467 468 (rw == RW_WRITER && scl->scl_writer == curthread))
468 469 locks_held |= 1 << i;
469 470 }
470 471
471 472 return (locks_held);
472 473 }
473 474
474 475 /*
475 476 * ==========================================================================
476 477 * SPA namespace functions
477 478 * ==========================================================================
478 479 */
479 480
480 481 /*
481 482 * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
482 483 * Returns NULL if no matching spa_t is found.
483 484 */
484 485 spa_t *
485 486 spa_lookup(const char *name)
486 487 {
487 488 static spa_t search; /* spa_t is large; don't allocate on stack */
488 489 spa_t *spa;
489 490 avl_index_t where;
490 491 char *cp;
491 492
492 493 ASSERT(MUTEX_HELD(&spa_namespace_lock));
493 494
494 495 (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
495 496
496 497 /*
497 498 * If it's a full dataset name, figure out the pool name and
498 499 * just use that.
499 500 */
500 501 cp = strpbrk(search.spa_name, "/@#");
501 502 if (cp != NULL)
502 503 *cp = '\0';
503 504
504 505 spa = avl_find(&spa_namespace_avl, &search, &where);
505 506
506 507 return (spa);
507 508 }
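
For example, looking up a full dataset name resolves to its pool (the name
below is hypothetical); note that spa_lookup() truncates a private copy of the
name, not the caller's string:

	mutex_enter(&spa_namespace_lock);
	spa = spa_lookup("tank/home@yesterday");	/* finds the spa_t for "tank" */
	mutex_exit(&spa_namespace_lock);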
508 509
509 510 /*
510 511 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
511 512 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
512 513 * looking for potentially hung I/Os.
513 514 */
514 515 void
515 516 spa_deadman(void *arg)
516 517 {
517 518 spa_t *spa = arg;
518 519
519 520 /*
520 521 * Disable the deadman timer if the pool is suspended.
521 522 */
522 523 if (spa_suspended(spa)) {
523 524 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
524 525 return;
525 526 }
526 527
527 528 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
528 529 (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
529 530 ++spa->spa_deadman_calls);
530 531 if (zfs_deadman_enabled)
531 532 vdev_deadman(spa->spa_root_vdev);
532 533 }
533 534
534 535 /*
535 536 * Create an uninitialized spa_t with the given name. Requires
536 537 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
537 538 * exist by calling spa_lookup() first.
538 539 */
539 540 spa_t *
540 541 spa_add(const char *name, nvlist_t *config, const char *altroot)
541 542 {
542 543 spa_t *spa;
543 544 spa_config_dirent_t *dp;
544 545 cyc_handler_t hdlr;
545 546 cyc_time_t when;
546 547
547 548 ASSERT(MUTEX_HELD(&spa_namespace_lock));
548 549
549 550 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
550 551
551 552 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
552 553 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
553 554 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
555 + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
554 556 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
555 557 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
556 558 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
557 559 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
558 560 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
559 561 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
560 562 mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
561 563
562 564 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
565 + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
563 566 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
564 567 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
565 568 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
566 569
567 570 for (int t = 0; t < TXG_SIZE; t++)
568 571 bplist_create(&spa->spa_free_bplist[t]);
569 572
570 573 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
571 574 spa->spa_state = POOL_STATE_UNINITIALIZED;
572 575 spa->spa_freeze_txg = UINT64_MAX;
573 576 spa->spa_final_txg = UINT64_MAX;
574 577 spa->spa_load_max_txg = UINT64_MAX;
575 578 spa->spa_proc = &p0;
576 579 spa->spa_proc_state = SPA_PROC_NONE;
577 580
578 581 hdlr.cyh_func = spa_deadman;
579 582 hdlr.cyh_arg = spa;
580 583 hdlr.cyh_level = CY_LOW_LEVEL;
581 584
582 585 spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
583 586
584 587 /*
585 588 * This determines how often we need to check for hung I/Os after
586 589 * the cyclic has already fired. Since checking for hung I/Os is
587 590 * an expensive operation we don't want to check too frequently.
588 591 * Instead wait for 5 seconds before checking again.
589 592 */
590 593 when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
591 594 when.cyt_when = CY_INFINITY;
592 595 mutex_enter(&cpu_lock);
593 596 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
594 597 mutex_exit(&cpu_lock);
595 598
596 599 refcount_create(&spa->spa_refcount);
597 600 spa_config_lock_init(spa);
598 601
599 602 avl_add(&spa_namespace_avl, spa);
600 603
601 604 /*
602 605 * Set the alternate root, if there is one.
603 606 */
604 607 if (altroot) {
605 608 spa->spa_root = spa_strdup(altroot);
606 609 spa_active_count++;
607 610 }
608 611
609 612 /*
610 613 * Every pool starts with the default cachefile
611 614 */
612 615 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
613 616 offsetof(spa_config_dirent_t, scd_link));
614 617
615 618 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
616 619 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
617 620 list_insert_head(&spa->spa_config_list, dp);
618 621
619 622 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
620 623 KM_SLEEP) == 0);
621 624
622 625 if (config != NULL) {
623 626 nvlist_t *features;
624 627
625 628 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
626 629 &features) == 0) {
627 630 VERIFY(nvlist_dup(features, &spa->spa_label_features,
628 631 0) == 0);
629 632 }
630 633
631 634 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
632 635 }
633 636
634 637 if (spa->spa_label_features == NULL) {
635 638 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
636 639 KM_SLEEP) == 0);
637 640 }
638 641
639 642 spa->spa_iokstat = kstat_create("zfs", 0, name,
640 643 "disk", KSTAT_TYPE_IO, 1, 0);
641 644 if (spa->spa_iokstat) {
642 645 spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock;
643 646 kstat_install(spa->spa_iokstat);
644 647 }
645 648
646 649 spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
647 650
648 651 /*
649 652 * As a pool is being created, treat all features as disabled by
650 653 * setting SPA_FEATURE_DISABLED for all entries in the feature
651 654 * refcount cache.
652 655 */
653 656 for (int i = 0; i < SPA_FEATURES; i++) {
654 657 spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
655 658 }
656 659
657 660 return (spa);
658 661 }
659 662
660 663 /*
661 664 * Removes a spa_t from the namespace, freeing up any memory used. Requires
662 665 * spa_namespace_lock. This is called only after the spa_t has been closed and
663 666 * deactivated.
664 667 */
665 668 void
666 669 spa_remove(spa_t *spa)
667 670 {
668 671 spa_config_dirent_t *dp;
669 672
670 673 ASSERT(MUTEX_HELD(&spa_namespace_lock));
671 674 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
675 + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
672 676
673 677 nvlist_free(spa->spa_config_splitting);
674 678
675 679 avl_remove(&spa_namespace_avl, spa);
676 680 cv_broadcast(&spa_namespace_cv);
677 681
678 682 if (spa->spa_root) {
679 683 spa_strfree(spa->spa_root);
680 684 spa_active_count--;
681 685 }
682 686
683 687 while ((dp = list_head(&spa->spa_config_list)) != NULL) {
684 688 list_remove(&spa->spa_config_list, dp);
685 689 if (dp->scd_path != NULL)
686 690 spa_strfree(dp->scd_path);
687 691 kmem_free(dp, sizeof (spa_config_dirent_t));
688 692 }
689 693
690 694 list_destroy(&spa->spa_config_list);
691 695
692 696 nvlist_free(spa->spa_label_features);
693 697 nvlist_free(spa->spa_load_info);
694 698 spa_config_set(spa, NULL);
695 699
696 700 mutex_enter(&cpu_lock);
697 701 if (spa->spa_deadman_cycid != CYCLIC_NONE)
698 702 cyclic_remove(spa->spa_deadman_cycid);
699 703 mutex_exit(&cpu_lock);
700 704 spa->spa_deadman_cycid = CYCLIC_NONE;
701 705
702 706 refcount_destroy(&spa->spa_refcount);
703 707
704 708 spa_config_lock_destroy(spa);
705 709
706 710 kstat_delete(spa->spa_iokstat);
707 711 spa->spa_iokstat = NULL;
708 712
709 713 for (int t = 0; t < TXG_SIZE; t++)
710 714 bplist_destroy(&spa->spa_free_bplist[t]);
711 715
712 716 cv_destroy(&spa->spa_async_cv);
717 + cv_destroy(&spa->spa_evicting_os_cv);
713 718 cv_destroy(&spa->spa_proc_cv);
714 719 cv_destroy(&spa->spa_scrub_io_cv);
715 720 cv_destroy(&spa->spa_suspend_cv);
716 721
717 722 mutex_destroy(&spa->spa_async_lock);
718 723 mutex_destroy(&spa->spa_errlist_lock);
719 724 mutex_destroy(&spa->spa_errlog_lock);
725 + mutex_destroy(&spa->spa_evicting_os_lock);
720 726 mutex_destroy(&spa->spa_history_lock);
721 727 mutex_destroy(&spa->spa_proc_lock);
722 728 mutex_destroy(&spa->spa_props_lock);
723 729 mutex_destroy(&spa->spa_scrub_lock);
724 730 mutex_destroy(&spa->spa_suspend_lock);
725 731 mutex_destroy(&spa->spa_vdev_top_lock);
726 732 mutex_destroy(&spa->spa_iokstat_lock);
727 733
728 734 kmem_free(spa, sizeof (spa_t));
729 735 }
730 736
731 737 /*
732 738 * Given a pool, return the next pool in the namespace, or NULL if there is
733 739 * none. If 'prev' is NULL, return the first pool.
734 740 */
735 741 spa_t *
736 742 spa_next(spa_t *prev)
737 743 {
738 744 ASSERT(MUTEX_HELD(&spa_namespace_lock));
739 745
740 746 if (prev)
741 747 return (AVL_NEXT(&spa_namespace_avl, prev));
742 748 else
743 749 return (avl_first(&spa_namespace_avl));
744 750 }
745 751
746 752 /*
747 753 * ==========================================================================
748 754 * SPA refcount functions
749 755 * ==========================================================================
750 756 */
751 757
752 758 /*
753 759 * Add a reference to the given spa_t. Must have at least one reference, or
754 760 * have the namespace lock held.
755 761 */
756 762 void
757 763 spa_open_ref(spa_t *spa, void *tag)
758 764 {
759 765 ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
760 766 MUTEX_HELD(&spa_namespace_lock));
761 767 (void) refcount_add(&spa->spa_refcount, tag);
762 768 }
763 769
764 770 /*
765 771 * Remove a reference to the given spa_t. Must have at least one reference, or
766 772 * have the namespace lock held.
767 773 */
768 774 void
769 775 spa_close(spa_t *spa, void *tag)
770 776 {
771 777 ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
772 778 MUTEX_HELD(&spa_namespace_lock));
773 779 (void) refcount_remove(&spa->spa_refcount, tag);
774 780 }
775 781
776 782 /*
783 + * Remove a reference to the given spa_t held by a dsl dir that is
784 + * being asynchronously released. Async releases occur from a taskq
785 + * performing eviction of dsl datasets and dirs. The namespace lock
786 + * isn't held and the hold by the object being evicted may contribute to
787 + * spa_minref (e.g. dataset or directory released during pool export),
788 + * so the asserts in spa_close() do not apply.
789 + */
790 +void
791 +spa_async_close(spa_t *spa, void *tag)
792 +{
793 + (void) refcount_remove(&spa->spa_refcount, tag);
794 +}
795 +
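
A sketch of the sort of caller this new entry point is for: an eviction
callback running from a taskq, where the namespace lock is not held and the
hold being dropped may be one of the spa_minref holds. The callback name and
body below are hypothetical; the real consumers are in the dsl dataset/dir
eviction path touched by this change.

	static void
	dsl_dir_evict_async(void *arg)		/* hypothetical taskq callback */
	{
		dsl_dir_t *dd = arg;

		/* spa_close() would trip its asserts here; use the async variant */
		spa_async_close(dd->dd_pool->dp_spa, dd);
	}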
796 +/*
777 797 * Check to see if the spa refcount is zero. Must be called with
778 798 * spa_namespace_lock held. We really compare against spa_minref, which is the
779 799 * number of references acquired when opening a pool
780 800 */
781 801 boolean_t
782 802 spa_refcount_zero(spa_t *spa)
783 803 {
784 804 ASSERT(MUTEX_HELD(&spa_namespace_lock));
785 805
786 806 return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
787 807 }
788 808
789 809 /*
790 810 * ==========================================================================
791 811 * SPA spare and l2cache tracking
792 812 * ==========================================================================
793 813 */
794 814
795 815 /*
796 816 * Hot spares and cache devices are tracked using the same code below,
797 817 * for 'auxiliary' devices.
798 818 */
799 819
800 820 typedef struct spa_aux {
801 821 uint64_t aux_guid;
802 822 uint64_t aux_pool;
803 823 avl_node_t aux_avl;
804 824 int aux_count;
805 825 } spa_aux_t;
806 826
807 827 static int
808 828 spa_aux_compare(const void *a, const void *b)
809 829 {
810 830 const spa_aux_t *sa = a;
811 831 const spa_aux_t *sb = b;
812 832
813 833 if (sa->aux_guid < sb->aux_guid)
814 834 return (-1);
815 835 else if (sa->aux_guid > sb->aux_guid)
816 836 return (1);
817 837 else
818 838 return (0);
819 839 }
820 840
821 841 void
822 842 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
823 843 {
824 844 avl_index_t where;
825 845 spa_aux_t search;
826 846 spa_aux_t *aux;
827 847
828 848 search.aux_guid = vd->vdev_guid;
829 849 if ((aux = avl_find(avl, &search, &where)) != NULL) {
830 850 aux->aux_count++;
831 851 } else {
832 852 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
833 853 aux->aux_guid = vd->vdev_guid;
834 854 aux->aux_count = 1;
835 855 avl_insert(avl, aux, where);
836 856 }
837 857 }
838 858
839 859 void
840 860 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
841 861 {
842 862 spa_aux_t search;
843 863 spa_aux_t *aux;
844 864 avl_index_t where;
845 865
846 866 search.aux_guid = vd->vdev_guid;
847 867 aux = avl_find(avl, &search, &where);
848 868
849 869 ASSERT(aux != NULL);
850 870
851 871 if (--aux->aux_count == 0) {
852 872 avl_remove(avl, aux);
853 873 kmem_free(aux, sizeof (spa_aux_t));
854 874 } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
855 875 aux->aux_pool = 0ULL;
856 876 }
857 877 }
858 878
859 879 boolean_t
860 880 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
861 881 {
862 882 spa_aux_t search, *found;
863 883
864 884 search.aux_guid = guid;
865 885 found = avl_find(avl, &search, NULL);
866 886
867 887 if (pool) {
868 888 if (found)
869 889 *pool = found->aux_pool;
870 890 else
871 891 *pool = 0ULL;
872 892 }
873 893
874 894 if (refcnt) {
875 895 if (found)
876 896 *refcnt = found->aux_count;
877 897 else
878 898 *refcnt = 0;
879 899 }
880 900
881 901 return (found != NULL);
882 902 }
883 903
884 904 void
885 905 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
886 906 {
887 907 spa_aux_t search, *found;
888 908 avl_index_t where;
889 909
890 910 search.aux_guid = vd->vdev_guid;
891 911 found = avl_find(avl, &search, &where);
892 912 ASSERT(found != NULL);
893 913 ASSERT(found->aux_pool == 0ULL);
894 914
895 915 found->aux_pool = spa_guid(vd->vdev_spa);
896 916 }
897 917
898 918 /*
899 919 * Spares are tracked globally due to the following constraints:
900 920 *
901 921 * - A spare may be part of multiple pools.
902 922 * - A spare may be added to a pool even if it's actively in use within
903 923 * another pool.
904 924 * - A spare in use in any pool can only be the source of a replacement if
905 925 * the target is a spare in the same pool.
906 926 *
907 927 * We keep track of all spares on the system through the use of a reference
908 928 * counted AVL tree. When a vdev is added as a spare, or used as a replacement
909 929 * spare, then we bump the reference count in the AVL tree. In addition, we set
910 930 * the 'vdev_isspare' member to indicate that the device is a spare (active or
911 931 * inactive). When a spare is made active (used to replace a device in the
912 932 * pool), we also keep track of which pool it's been made a part of.
913 933 *
914 934 * The 'spa_spare_lock' protects the AVL tree. These functions are normally
915 935 * called under the spa_namespace lock as part of vdev reconfiguration. The
916 936 * separate spare lock exists for the status query path, which does not need to
917 937 * be completely consistent with respect to other vdev configuration changes.
918 938 */
919 939
920 940 static int
921 941 spa_spare_compare(const void *a, const void *b)
922 942 {
923 943 return (spa_aux_compare(a, b));
924 944 }
925 945
926 946 void
927 947 spa_spare_add(vdev_t *vd)
928 948 {
929 949 mutex_enter(&spa_spare_lock);
930 950 ASSERT(!vd->vdev_isspare);
931 951 spa_aux_add(vd, &spa_spare_avl);
932 952 vd->vdev_isspare = B_TRUE;
933 953 mutex_exit(&spa_spare_lock);
934 954 }
935 955
936 956 void
937 957 spa_spare_remove(vdev_t *vd)
938 958 {
939 959 mutex_enter(&spa_spare_lock);
940 960 ASSERT(vd->vdev_isspare);
941 961 spa_aux_remove(vd, &spa_spare_avl);
942 962 vd->vdev_isspare = B_FALSE;
943 963 mutex_exit(&spa_spare_lock);
944 964 }
945 965
946 966 boolean_t
947 967 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
948 968 {
949 969 boolean_t found;
950 970
951 971 mutex_enter(&spa_spare_lock);
952 972 found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
953 973 mutex_exit(&spa_spare_lock);
954 974
955 975 return (found);
956 976 }
957 977
958 978 void
959 979 spa_spare_activate(vdev_t *vd)
960 980 {
961 981 mutex_enter(&spa_spare_lock);
962 982 ASSERT(vd->vdev_isspare);
963 983 spa_aux_activate(vd, &spa_spare_avl);
964 984 mutex_exit(&spa_spare_lock);
965 985 }
966 986
967 987 /*
968 988 * Level 2 ARC devices are tracked globally for the same reasons as spares.
969 989 * Cache devices currently only support one pool per cache device, and so
970 990 * for these devices the aux reference count is currently unused beyond 1.
971 991 */
972 992
973 993 static int
974 994 spa_l2cache_compare(const void *a, const void *b)
975 995 {
976 996 return (spa_aux_compare(a, b));
977 997 }
978 998
979 999 void
980 1000 spa_l2cache_add(vdev_t *vd)
981 1001 {
982 1002 mutex_enter(&spa_l2cache_lock);
983 1003 ASSERT(!vd->vdev_isl2cache);
984 1004 spa_aux_add(vd, &spa_l2cache_avl);
985 1005 vd->vdev_isl2cache = B_TRUE;
986 1006 mutex_exit(&spa_l2cache_lock);
987 1007 }
988 1008
989 1009 void
990 1010 spa_l2cache_remove(vdev_t *vd)
991 1011 {
992 1012 mutex_enter(&spa_l2cache_lock);
993 1013 ASSERT(vd->vdev_isl2cache);
994 1014 spa_aux_remove(vd, &spa_l2cache_avl);
995 1015 vd->vdev_isl2cache = B_FALSE;
996 1016 mutex_exit(&spa_l2cache_lock);
997 1017 }
998 1018
999 1019 boolean_t
1000 1020 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
1001 1021 {
1002 1022 boolean_t found;
1003 1023
1004 1024 mutex_enter(&spa_l2cache_lock);
1005 1025 found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
1006 1026 mutex_exit(&spa_l2cache_lock);
1007 1027
1008 1028 return (found);
1009 1029 }
1010 1030
1011 1031 void
1012 1032 spa_l2cache_activate(vdev_t *vd)
1013 1033 {
1014 1034 mutex_enter(&spa_l2cache_lock);
1015 1035 ASSERT(vd->vdev_isl2cache);
1016 1036 spa_aux_activate(vd, &spa_l2cache_avl);
1017 1037 mutex_exit(&spa_l2cache_lock);
1018 1038 }
1019 1039
1020 1040 /*
1021 1041 * ==========================================================================
1022 1042 * SPA vdev locking
1023 1043 * ==========================================================================
1024 1044 */
1025 1045
1026 1046 /*
1027 1047 * Lock the given spa_t for the purpose of adding or removing a vdev.
1028 1048 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1029 1049 * It returns the next transaction group for the spa_t.
1030 1050 */
1031 1051 uint64_t
1032 1052 spa_vdev_enter(spa_t *spa)
1033 1053 {
1034 1054 mutex_enter(&spa->spa_vdev_top_lock);
1035 1055 mutex_enter(&spa_namespace_lock);
1036 1056 return (spa_vdev_config_enter(spa));
1037 1057 }
1038 1058
1039 1059 /*
1040 1060 * Internal implementation for spa_vdev_enter(). Used when a vdev
1041 1061 * operation requires multiple syncs (i.e. removing a device) while
1042 1062 * keeping the spa_namespace_lock held.
1043 1063 */
1044 1064 uint64_t
1045 1065 spa_vdev_config_enter(spa_t *spa)
1046 1066 {
1047 1067 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1048 1068
1049 1069 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1050 1070
1051 1071 return (spa_last_synced_txg(spa) + 1);
1052 1072 }
1053 1073
1054 1074 /*
1055 1075 * Used in combination with spa_vdev_config_enter() to allow the syncing
1056 1076 * of multiple transactions without releasing the spa_namespace_lock.
1057 1077 */
1058 1078 void
1059 1079 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
1060 1080 {
1061 1081 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1062 1082
1063 1083 int config_changed = B_FALSE;
1064 1084
1065 1085 ASSERT(txg > spa_last_synced_txg(spa));
1066 1086
1067 1087 spa->spa_pending_vdev = NULL;
1068 1088
1069 1089 /*
1070 1090 * Reassess the DTLs.
1071 1091 */
1072 1092 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1073 1093
1074 1094 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1075 1095 config_changed = B_TRUE;
1076 1096 spa->spa_config_generation++;
1077 1097 }
1078 1098
1079 1099 /*
1080 1100 * Verify the metaslab classes.
1081 1101 */
1082 1102 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1083 1103 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1084 1104
1085 1105 spa_config_exit(spa, SCL_ALL, spa);
1086 1106
1087 1107 /*
1088 1108 * Panic the system if the specified tag requires it. This
1089 1109 * is useful for ensuring that configurations are updated
1090 1110 * transactionally.
1091 1111 */
1092 1112 if (zio_injection_enabled)
1093 1113 zio_handle_panic_injection(spa, tag, 0);
1094 1114
1095 1115 /*
1096 1116 * Note: this txg_wait_synced() is important because it ensures
1097 1117 * that there won't be more than one config change per txg.
1098 1118 * This allows us to use the txg as the generation number.
1099 1119 */
1100 1120 if (error == 0)
1101 1121 txg_wait_synced(spa->spa_dsl_pool, txg);
1102 1122
1103 1123 if (vd != NULL) {
1104 1124 ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1105 1125 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1106 1126 vdev_free(vd);
1107 1127 spa_config_exit(spa, SCL_ALL, spa);
1108 1128 }
1109 1129
1110 1130 /*
1111 1131 * If the config changed, update the config cache.
1112 1132 */
1113 1133 if (config_changed)
1114 1134 spa_config_sync(spa, B_FALSE, B_TRUE);
1115 1135 }
1116 1136
1117 1137 /*
1118 1138 * Unlock the spa_t after adding or removing a vdev. Besides undoing the
1119 1139 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1120 1140 * synced to disk, and then update the global configuration cache with the new
1121 1141 * information.
1122 1142 */
1123 1143 int
1124 1144 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1125 1145 {
1126 1146 spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1127 1147 mutex_exit(&spa_namespace_lock);
1128 1148 mutex_exit(&spa->spa_vdev_top_lock);
1129 1149
1130 1150 return (error);
1131 1151 }
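
The usual calling pattern for a vdev configuration change is the following
sketch (the body of the change is elided; error handling is illustrative):

	uint64_t txg = spa_vdev_enter(spa);
	int error = 0;

	/* ... add/remove/attach vdevs for this txg, setting error on failure ... */

	return (spa_vdev_exit(spa, NULL, txg, error));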
1132 1152
1133 1153 /*
1134 1154 * Lock the given spa_t for the purpose of changing vdev state.
1135 1155 */
1136 1156 void
1137 1157 spa_vdev_state_enter(spa_t *spa, int oplocks)
1138 1158 {
1139 1159 int locks = SCL_STATE_ALL | oplocks;
1140 1160
1141 1161 /*
1142 1162 * Root pools may need to read from the underlying devfs filesystem
1143 1163 * when opening up a vdev. Unfortunately if we're holding the
1144 1164 * SCL_ZIO lock it will result in a deadlock when we try to issue
1145 1165 * the read from the root filesystem. Instead we "prefetch"
1146 1166 * the associated vnodes that we need prior to opening the
1147 1167 * underlying devices and cache them so that we can prevent
1148 1168 * any I/O when we are doing the actual open.
1149 1169 */
1150 1170 if (spa_is_root(spa)) {
1151 1171 int low = locks & ~(SCL_ZIO - 1);
1152 1172 int high = locks & ~low;
1153 1173
1154 1174 spa_config_enter(spa, high, spa, RW_WRITER);
1155 1175 vdev_hold(spa->spa_root_vdev);
1156 1176 spa_config_enter(spa, low, spa, RW_WRITER);
1157 1177 } else {
1158 1178 spa_config_enter(spa, locks, spa, RW_WRITER);
1159 1179 }
1160 1180 spa->spa_vdev_locks = locks;
1161 1181 }
1162 1182
1163 1183 int
1164 1184 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
1165 1185 {
1166 1186 boolean_t config_changed = B_FALSE;
1167 1187
1168 1188 if (vd != NULL || error == 0)
1169 1189 vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
1170 1190 0, 0, B_FALSE);
1171 1191
1172 1192 if (vd != NULL) {
1173 1193 vdev_state_dirty(vd->vdev_top);
1174 1194 config_changed = B_TRUE;
1175 1195 spa->spa_config_generation++;
1176 1196 }
1177 1197
1178 1198 if (spa_is_root(spa))
1179 1199 vdev_rele(spa->spa_root_vdev);
1180 1200
1181 1201 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1182 1202 spa_config_exit(spa, spa->spa_vdev_locks, spa);
1183 1203
1184 1204 /*
1185 1205 * If anything changed, wait for it to sync. This ensures that,
1186 1206 * from the system administrator's perspective, zpool(1M) commands
1187 1207 * are synchronous. This is important for things like zpool offline:
1188 1208 * when the command completes, you expect no further I/O from ZFS.
1189 1209 */
1190 1210 if (vd != NULL)
1191 1211 txg_wait_synced(spa->spa_dsl_pool, 0);
1192 1212
1193 1213 /*
1194 1214 * If the config changed, update the config cache.
1195 1215 */
1196 1216 if (config_changed) {
1197 1217 mutex_enter(&spa_namespace_lock);
1198 1218 spa_config_sync(spa, B_FALSE, B_TRUE);
1199 1219 mutex_exit(&spa_namespace_lock);
1200 1220 }
1201 1221
1202 1222 return (error);
1203 1223 }
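
And the matching sketch for a state-only change (e.g. clearing or onlining a
leaf vdev), which takes SCL_STATE_ALL rather than SCL_ALL:

	spa_vdev_state_enter(spa, SCL_NONE);

	/* ... change vd's state ... */

	return (spa_vdev_state_exit(spa, vd, 0));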
1204 1224
1205 1225 /*
1206 1226 * ==========================================================================
1207 1227 * Miscellaneous functions
1208 1228 * ==========================================================================
1209 1229 */
1210 1230
1211 1231 void
1212 1232 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1213 1233 {
1214 1234 if (!nvlist_exists(spa->spa_label_features, feature)) {
1215 1235 fnvlist_add_boolean(spa->spa_label_features, feature);
1216 1236 /*
1217 1237 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1218 1238 * dirty the vdev config because lock SCL_CONFIG is not held.
1219 1239 * Thankfully, in this case we don't need to dirty the config
1220 1240 * because it will be written out anyway when we finish
1221 1241 * creating the pool.
1222 1242 */
1223 1243 if (tx->tx_txg != TXG_INITIAL)
1224 1244 vdev_config_dirty(spa->spa_root_vdev);
1225 1245 }
1226 1246 }
1227 1247
1228 1248 void
1229 1249 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
1230 1250 {
1231 1251 if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
1232 1252 vdev_config_dirty(spa->spa_root_vdev);
1233 1253 }
1234 1254
1235 1255 /*
1236 1256 * Rename a spa_t.
1237 1257 */
1238 1258 int
1239 1259 spa_rename(const char *name, const char *newname)
1240 1260 {
1241 1261 spa_t *spa;
1242 1262 int err;
1243 1263
1244 1264 /*
1245 1265 * Lookup the spa_t and grab the config lock for writing. We need to
1246 1266 * actually open the pool so that we can sync out the necessary labels.
1247 1267 * It's OK to call spa_open() with the namespace lock held because we
1248 1268 * allow recursive calls for other reasons.
1249 1269 */
1250 1270 mutex_enter(&spa_namespace_lock);
1251 1271 if ((err = spa_open(name, &spa, FTAG)) != 0) {
1252 1272 mutex_exit(&spa_namespace_lock);
1253 1273 return (err);
1254 1274 }
1255 1275
1256 1276 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1257 1277
1258 1278 avl_remove(&spa_namespace_avl, spa);
1259 1279 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1260 1280 avl_add(&spa_namespace_avl, spa);
1261 1281
1262 1282 /*
1263 1283 * Sync all labels to disk with the new names by marking the root vdev
1264 1284 * dirty and waiting for it to sync. It will pick up the new pool name
1265 1285 * during the sync.
1266 1286 */
1267 1287 vdev_config_dirty(spa->spa_root_vdev);
1268 1288
1269 1289 spa_config_exit(spa, SCL_ALL, FTAG);
1270 1290
1271 1291 txg_wait_synced(spa->spa_dsl_pool, 0);
1272 1292
1273 1293 /*
1274 1294 * Sync the updated config cache.
1275 1295 */
1276 1296 spa_config_sync(spa, B_FALSE, B_TRUE);
1277 1297
1278 1298 spa_close(spa, FTAG);
1279 1299
1280 1300 mutex_exit(&spa_namespace_lock);
1281 1301
1282 1302 return (0);
1283 1303 }
1284 1304
1285 1305 /*
1286 1306 * Return the spa_t associated with given pool_guid, if it exists. If
1287 1307 * device_guid is non-zero, determine whether the pool exists *and* contains
1288 1308 * a device with the specified device_guid.
1289 1309 */
1290 1310 spa_t *
1291 1311 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1292 1312 {
1293 1313 spa_t *spa;
1294 1314 avl_tree_t *t = &spa_namespace_avl;
1295 1315
1296 1316 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1297 1317
1298 1318 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
1299 1319 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
1300 1320 continue;
1301 1321 if (spa->spa_root_vdev == NULL)
1302 1322 continue;
1303 1323 if (spa_guid(spa) == pool_guid) {
1304 1324 if (device_guid == 0)
1305 1325 break;
1306 1326
1307 1327 if (vdev_lookup_by_guid(spa->spa_root_vdev,
1308 1328 device_guid) != NULL)
1309 1329 break;
1310 1330
1311 1331 /*
1312 1332 * Check any devices we may be in the process of adding.
1313 1333 */
1314 1334 if (spa->spa_pending_vdev) {
1315 1335 if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1316 1336 device_guid) != NULL)
1317 1337 break;
1318 1338 }
1319 1339 }
1320 1340 }
1321 1341
1322 1342 return (spa);
1323 1343 }
1324 1344
1325 1345 /*
1326 1346 * Determine whether a pool with the given pool_guid exists.
1327 1347 */
1328 1348 boolean_t
1329 1349 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1330 1350 {
1331 1351 return (spa_by_guid(pool_guid, device_guid) != NULL);
1332 1352 }
1333 1353
1334 1354 char *
1335 1355 spa_strdup(const char *s)
1336 1356 {
1337 1357 size_t len;
1338 1358 char *new;
1339 1359
1340 1360 len = strlen(s);
1341 1361 new = kmem_alloc(len + 1, KM_SLEEP);
1342 1362 bcopy(s, new, len);
1343 1363 new[len] = '\0';
1344 1364
1345 1365 return (new);
1346 1366 }
1347 1367
1348 1368 void
1349 1369 spa_strfree(char *s)
1350 1370 {
1351 1371 kmem_free(s, strlen(s) + 1);
1352 1372 }
1353 1373
1354 1374 uint64_t
1355 1375 spa_get_random(uint64_t range)
1356 1376 {
1357 1377 uint64_t r;
1358 1378
1359 1379 ASSERT(range != 0);
1360 1380
1361 1381 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
1362 1382
1363 1383 return (r % range);
1364 1384 }
1365 1385
1366 1386 uint64_t
1367 1387 spa_generate_guid(spa_t *spa)
1368 1388 {
1369 1389 uint64_t guid = spa_get_random(-1ULL);
1370 1390
1371 1391 if (spa != NULL) {
1372 1392 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
1373 1393 guid = spa_get_random(-1ULL);
1374 1394 } else {
1375 1395 while (guid == 0 || spa_guid_exists(guid, 0))
1376 1396 guid = spa_get_random(-1ULL);
1377 1397 }
1378 1398
1379 1399 return (guid);
1380 1400 }
1381 1401
1382 1402 void
1383 1403 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
1384 1404 {
1385 1405 char type[256];
1386 1406 char *checksum = NULL;
1387 1407 char *compress = NULL;
1388 1408
1389 1409 if (bp != NULL) {
1390 1410 if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
1391 1411 dmu_object_byteswap_t bswap =
1392 1412 DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
1393 1413 (void) snprintf(type, sizeof (type), "bswap %s %s",
1394 1414 DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
1395 1415 "metadata" : "data",
1396 1416 dmu_ot_byteswap[bswap].ob_name);
1397 1417 } else {
1398 1418 (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
1399 1419 sizeof (type));
1400 1420 }
1401 1421 if (!BP_IS_EMBEDDED(bp)) {
1402 1422 checksum =
1403 1423 zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
1404 1424 }
1405 1425 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
1406 1426 }
1407 1427
1408 1428 SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
1409 1429 compress);
1410 1430 }
1411 1431
1412 1432 void
1413 1433 spa_freeze(spa_t *spa)
1414 1434 {
1415 1435 uint64_t freeze_txg = 0;
1416 1436
1417 1437 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1418 1438 if (spa->spa_freeze_txg == UINT64_MAX) {
1419 1439 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
1420 1440 spa->spa_freeze_txg = freeze_txg;
1421 1441 }
1422 1442 spa_config_exit(spa, SCL_ALL, FTAG);
1423 1443 if (freeze_txg != 0)
1424 1444 txg_wait_synced(spa_get_dsl(spa), freeze_txg);
1425 1445 }
1426 1446
1427 1447 void
1428 1448 zfs_panic_recover(const char *fmt, ...)
1429 1449 {
1430 1450 va_list adx;
1431 1451
1432 1452 va_start(adx, fmt);
1433 1453 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
1434 1454 va_end(adx);
1435 1455 }
1436 1456
1437 1457 /*
1438 1458 * This is a stripped-down version of strtoull, suitable only for converting
1439 1459 * lowercase hexadecimal numbers that don't overflow.
1440 1460 */
1441 1461 uint64_t
1442 1462 strtonum(const char *str, char **nptr)
1443 1463 {
1444 1464 uint64_t val = 0;
1445 1465 char c;
1446 1466 int digit;
1447 1467
1448 1468 while ((c = *str) != '\0') {
1449 1469 if (c >= '0' && c <= '9')
1450 1470 digit = c - '0';
1451 1471 else if (c >= 'a' && c <= 'f')
1452 1472 digit = 10 + c - 'a';
1453 1473 else
1454 1474 break;
1455 1475
1456 1476 val *= 16;
1457 1477 val += digit;
1458 1478
1459 1479 str++;
1460 1480 }
1461 1481
1462 1482 if (nptr)
1463 1483 *nptr = (char *)str;
1464 1484
1465 1485 return (val);
1466 1486 }
1467 1487
1468 1488 /*
1469 1489 * ==========================================================================
1470 1490 * Accessor functions
1471 1491 * ==========================================================================
1472 1492 */
1473 1493
1474 1494 boolean_t
1475 1495 spa_shutting_down(spa_t *spa)
1476 1496 {
1477 1497 return (spa->spa_async_suspended);
1478 1498 }
1479 1499
1480 1500 dsl_pool_t *
1481 1501 spa_get_dsl(spa_t *spa)
1482 1502 {
1483 1503 return (spa->spa_dsl_pool);
1484 1504 }
1485 1505
1486 1506 boolean_t
1487 1507 spa_is_initializing(spa_t *spa)
1488 1508 {
1489 1509 return (spa->spa_is_initializing);
1490 1510 }
1491 1511
1492 1512 blkptr_t *
1493 1513 spa_get_rootblkptr(spa_t *spa)
1494 1514 {
1495 1515 return (&spa->spa_ubsync.ub_rootbp);
1496 1516 }
1497 1517
1498 1518 void
1499 1519 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1500 1520 {
1501 1521 spa->spa_uberblock.ub_rootbp = *bp;
1502 1522 }
1503 1523
1504 1524 void
1505 1525 spa_altroot(spa_t *spa, char *buf, size_t buflen)
1506 1526 {
1507 1527 if (spa->spa_root == NULL)
1508 1528 buf[0] = '\0';
1509 1529 else
1510 1530 (void) strncpy(buf, spa->spa_root, buflen);
1511 1531 }
1512 1532
1513 1533 int
1514 1534 spa_sync_pass(spa_t *spa)
1515 1535 {
1516 1536 return (spa->spa_sync_pass);
1517 1537 }
1518 1538
1519 1539 char *
1520 1540 spa_name(spa_t *spa)
1521 1541 {
1522 1542 return (spa->spa_name);
1523 1543 }
1524 1544
1525 1545 uint64_t
1526 1546 spa_guid(spa_t *spa)
1527 1547 {
1528 1548 dsl_pool_t *dp = spa_get_dsl(spa);
1529 1549 uint64_t guid;
1530 1550
1531 1551 /*
1532 1552 * If we fail to parse the config during spa_load(), we can go through
1533 1553 * the error path (which posts an ereport) and end up here with no root
1534 1554 * vdev. We stash the original pool guid in 'spa_config_guid' to handle
1535 1555 * this case.
1536 1556 */
1537 1557 if (spa->spa_root_vdev == NULL)
1538 1558 return (spa->spa_config_guid);
1539 1559
1540 1560 guid = spa->spa_last_synced_guid != 0 ?
1541 1561 spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
1542 1562
1543 1563 /*
1544 1564 * Return the most recently synced out guid unless we're
1545 1565 * in syncing context.
1546 1566 */
1547 1567 if (dp && dsl_pool_sync_context(dp))
1548 1568 return (spa->spa_root_vdev->vdev_guid);
1549 1569 else
1550 1570 return (guid);
1551 1571 }
1552 1572
1553 1573 uint64_t
1554 1574 spa_load_guid(spa_t *spa)
1555 1575 {
1556 1576 /*
1557 1577 * This is a GUID that exists solely as a reference for the
1558 1578 * purposes of the arc. It is generated at load time, and
1559 1579 * is never written to persistent storage.
1560 1580 */
1561 1581 return (spa->spa_load_guid);
1562 1582 }
1563 1583
1564 1584 uint64_t
1565 1585 spa_last_synced_txg(spa_t *spa)
1566 1586 {
1567 1587 return (spa->spa_ubsync.ub_txg);
1568 1588 }
1569 1589
1570 1590 uint64_t
1571 1591 spa_first_txg(spa_t *spa)
1572 1592 {
1573 1593 return (spa->spa_first_txg);
1574 1594 }
1575 1595
1576 1596 uint64_t
1577 1597 spa_syncing_txg(spa_t *spa)
1578 1598 {
1579 1599 return (spa->spa_syncing_txg);
1580 1600 }
1581 1601
1582 1602 pool_state_t
1583 1603 spa_state(spa_t *spa)
1584 1604 {
1585 1605 return (spa->spa_state);
1586 1606 }
1587 1607
1588 1608 spa_load_state_t
1589 1609 spa_load_state(spa_t *spa)
1590 1610 {
1591 1611 return (spa->spa_load_state);
1592 1612 }
1593 1613
1594 1614 uint64_t
1595 1615 spa_freeze_txg(spa_t *spa)
1596 1616 {
1597 1617 return (spa->spa_freeze_txg);
1598 1618 }
1599 1619
1600 1620 /* ARGSUSED */
1601 1621 uint64_t
1602 1622 spa_get_asize(spa_t *spa, uint64_t lsize)
1603 1623 {
1604 1624 return (lsize * spa_asize_inflation);
1605 1625 }
1606 1626
1607 1627 /*
1608 1628 * Return the amount of slop space in bytes. It is 1/32 of the pool (about 3.1%),
1609 1629 * or at least 32MB.
1610 1630 *
1611 1631 * See the comment above spa_slop_shift for details.
1612 1632 */
1613 1633 uint64_t
1614 1634 spa_get_slop_space(spa_t *spa) {
1615 1635 uint64_t space = spa_get_dspace(spa);
1616 1636 return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1));
1617 1637 }
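/*
 * A worked example of the calculation above (illustrative only), assuming
 * the default spa_slop_shift of 5 (i.e. 1/32) and SPA_MINDEVSIZE of 64MB:
 *
 *	1 TiB pool:	1 TiB >> 5 = 32 GiB of slop space.
 *	512 MiB pool:	512 MiB >> 5 = 16 MiB, which is below the floor of
 *			SPA_MINDEVSIZE >> 1 = 32 MiB, so 32 MiB is returned.
 */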
1618 1638
1619 1639 uint64_t
1620 1640 spa_get_dspace(spa_t *spa)
1621 1641 {
1622 1642 return (spa->spa_dspace);
1623 1643 }
1624 1644
1625 1645 void
1626 1646 spa_update_dspace(spa_t *spa)
1627 1647 {
1628 1648 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1629 1649 ddt_get_dedup_dspace(spa);
1630 1650 }
1631 1651
1632 1652 /*
1633 1653 * Return the failure mode that has been set for this pool. The default
1634 1654 * behavior is to block all I/Os when a complete failure occurs.
1635 1655 */
1636 1656 uint8_t
1637 1657 spa_get_failmode(spa_t *spa)
1638 1658 {
1639 1659 return (spa->spa_failmode);
1640 1660 }
1641 1661
1642 1662 boolean_t
1643 1663 spa_suspended(spa_t *spa)
1644 1664 {
1645 1665 return (spa->spa_suspended);
1646 1666 }
1647 1667
1648 1668 uint64_t
1649 1669 spa_version(spa_t *spa)
1650 1670 {
1651 1671 return (spa->spa_ubsync.ub_version);
1652 1672 }
1653 1673
1654 1674 boolean_t
1655 1675 spa_deflate(spa_t *spa)
1656 1676 {
1657 1677 return (spa->spa_deflate);
1658 1678 }
1659 1679
1660 1680 metaslab_class_t *
1661 1681 spa_normal_class(spa_t *spa)
1662 1682 {
1663 1683 return (spa->spa_normal_class);
1664 1684 }
1665 1685
1666 1686 metaslab_class_t *
1667 1687 spa_log_class(spa_t *spa)
1668 1688 {
1669 1689 return (spa->spa_log_class);
1670 1690 }
1671 1691
1692 +void
1693 +spa_evicting_os_register(spa_t *spa, objset_t *os)
1694 +{
1695 + mutex_enter(&spa->spa_evicting_os_lock);
1696 + list_insert_head(&spa->spa_evicting_os_list, os);
1697 + mutex_exit(&spa->spa_evicting_os_lock);
1698 +}
1699 +
1700 +void
1701 +spa_evicting_os_deregister(spa_t *spa, objset_t *os)
1702 +{
1703 + mutex_enter(&spa->spa_evicting_os_lock);
1704 + list_remove(&spa->spa_evicting_os_list, os);
1705 + cv_broadcast(&spa->spa_evicting_os_cv);
1706 + mutex_exit(&spa->spa_evicting_os_lock);
1707 +}
1708 +
1709 +void
1710 +spa_evicting_os_wait(spa_t *spa)
1711 +{
1712 + mutex_enter(&spa->spa_evicting_os_lock);
1713 + while (!list_is_empty(&spa->spa_evicting_os_list))
1714 + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1715 + mutex_exit(&spa->spa_evicting_os_lock);
1716 +
1717 + dmu_buf_user_evict_wait();
1718 +}
1719 +
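/*
 * A minimal usage sketch for the three helpers above. The callers shown
 * here (example_async_evict, example_spa_teardown) are hypothetical and
 * only illustrate the intended register/deregister/wait lifecycle; they
 * are not part of this change.
 */
static void
example_async_evict(spa_t *spa, objset_t *os)
{
	/*
	 * Pin the objset on the spa's eviction list before handing it to
	 * asynchronous eviction.  The matching deregister call signals
	 * spa_evicting_os_cv and wakes any thread blocked in
	 * spa_evicting_os_wait().
	 */
	spa_evicting_os_register(spa, os);
	/* ... asynchronous eviction of 'os' would run here ... */
	spa_evicting_os_deregister(spa, os);
}

static void
example_spa_teardown(spa_t *spa)
{
	/*
	 * Block until every objset registered above has been deregistered
	 * and, per spa_evicting_os_wait() above, until outstanding dbuf
	 * user evictions have drained, before the spa_t is torn down.
	 */
	spa_evicting_os_wait(spa);
}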
1672 1720 int
1673 1721 spa_max_replication(spa_t *spa)
1674 1722 {
1675 1723 /*
1676 1724 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1677 1725 * handle BPs with more than one DVA allocated. Set our max
1678 1726 * replication level accordingly.
1679 1727 */
1680 1728 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1681 1729 return (1);
1682 1730 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1683 1731 }
1684 1732
1685 1733 int
1686 1734 spa_prev_software_version(spa_t *spa)
1687 1735 {
1688 1736 return (spa->spa_prev_software_version);
1689 1737 }
1690 1738
1691 1739 uint64_t
1692 1740 spa_deadman_synctime(spa_t *spa)
1693 1741 {
1694 1742 return (spa->spa_deadman_synctime);
1695 1743 }
1696 1744
1697 1745 uint64_t
1698 1746 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1699 1747 {
1700 1748 uint64_t asize = DVA_GET_ASIZE(dva);
1701 1749 uint64_t dsize = asize;
1702 1750
1703 1751 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1704 1752
1705 1753 if (asize != 0 && spa->spa_deflate) {
1706 1754 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
1707 1755 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1708 1756 }
1709 1757
1710 1758 return (dsize);
1711 1759 }
1712 1760
1713 1761 uint64_t
1714 1762 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1715 1763 {
1716 1764 uint64_t dsize = 0;
1717 1765
1718 1766 for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1719 1767 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1720 1768
1721 1769 return (dsize);
1722 1770 }
1723 1771
1724 1772 uint64_t
1725 1773 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
1726 1774 {
1727 1775 uint64_t dsize = 0;
1728 1776
1729 1777 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1730 1778
1731 1779 for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1732 1780 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1733 1781
1734 1782 spa_config_exit(spa, SCL_VDEV, FTAG);
1735 1783
1736 1784 return (dsize);
1737 1785 }
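/*
 * A minimal sketch contrasting the two variants above; the caller names
 * are hypothetical.  bp_get_dsize_sync() expects the caller to already
 * hold (at least part of) the config lock as reader, while bp_get_dsize()
 * briefly takes SCL_VDEV itself.
 */
static uint64_t
example_dsize_lock_already_held(spa_t *spa, const blkptr_t *bp)
{
	uint64_t dsize;

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	dsize = bp_get_dsize_sync(spa, bp);	/* reader lock held above */
	spa_config_exit(spa, SCL_VDEV, FTAG);
	return (dsize);
}

static uint64_t
example_dsize_no_lock(spa_t *spa, const blkptr_t *bp)
{
	/* bp_get_dsize() acquires and drops SCL_VDEV internally. */
	return (bp_get_dsize(spa, bp));
}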
1738 1786
1739 1787 /*
1740 1788 * ==========================================================================
1741 1789 * Initialization and Termination
1742 1790 * ==========================================================================
1743 1791 */
1744 1792
1745 1793 static int
1746 1794 spa_name_compare(const void *a1, const void *a2)
1747 1795 {
1748 1796 const spa_t *s1 = a1;
1749 1797 const spa_t *s2 = a2;
1750 1798 int s;
1751 1799
1752 1800 s = strcmp(s1->spa_name, s2->spa_name);
1753 1801 if (s > 0)
1754 1802 return (1);
1755 1803 if (s < 0)
1756 1804 return (-1);
1757 1805 return (0);
1758 1806 }
1759 1807
1760 1808 int
1761 1809 spa_busy(void)
1762 1810 {
1763 1811 return (spa_active_count);
1764 1812 }
1765 1813
1766 1814 void
1767 1815 spa_boot_init()
1768 1816 {
1769 1817 spa_config_load();
1770 1818 }
1771 1819
1772 1820 void
1773 1821 spa_init(int mode)
1774 1822 {
1775 1823 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1776 1824 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1777 1825 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1778 1826 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1779 1827
1780 1828 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1781 1829 offsetof(spa_t, spa_avl));
1782 1830
1783 1831 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1784 1832 offsetof(spa_aux_t, aux_avl));
1785 1833
1786 1834 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1787 1835 offsetof(spa_aux_t, aux_avl));
1788 1836
1789 1837 spa_mode_global = mode;
1790 1838
1791 1839 #ifdef _KERNEL
1792 1840 spa_arch_init();
1793 1841 #else
1794 1842 if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
1795 1843 arc_procfd = open("/proc/self/ctl", O_WRONLY);
1796 1844 if (arc_procfd == -1) {
1797 1845 perror("could not enable watchpoints: "
1798 1846 "opening /proc/self/ctl failed: ");
1799 1847 } else {
1800 1848 arc_watch = B_TRUE;
1801 1849 }
1802 1850 }
1803 1851 #endif
1804 1852
1805 1853 refcount_init();
1806 1854 unique_init();
1807 1855 range_tree_init();
1808 1856 zio_init();
1809 1857 dmu_init();
1810 1858 zil_init();
1811 1859 vdev_cache_stat_init();
1812 1860 zfs_prop_init();
1813 1861 zpool_prop_init();
1814 1862 zpool_feature_init();
1815 1863 spa_config_load();
1816 1864 l2arc_start();
1817 1865 }
1818 1866
1819 1867 void
1820 1868 spa_fini(void)
1821 1869 {
1822 1870 l2arc_stop();
1823 1871
1824 1872 spa_evict_all();
1825 1873
1826 1874 vdev_cache_stat_fini();
1827 1875 zil_fini();
1828 1876 dmu_fini();
1829 1877 zio_fini();
1830 1878 range_tree_fini();
1831 1879 unique_fini();
1832 1880 refcount_fini();
1833 1881
1834 1882 avl_destroy(&spa_namespace_avl);
1835 1883 avl_destroy(&spa_spare_avl);
1836 1884 avl_destroy(&spa_l2cache_avl);
1837 1885
1838 1886 cv_destroy(&spa_namespace_cv);
1839 1887 mutex_destroy(&spa_namespace_lock);
1840 1888 mutex_destroy(&spa_spare_lock);
1841 1889 mutex_destroy(&spa_l2cache_lock);
1842 1890 }
1843 1891
1844 1892 /*
1845 1893 * Return whether this pool has slogs. No locking needed.
1846 1894 * It's not a problem if the wrong answer is returned, as it's only used
1847 1895 * for performance and not for correctness.
1848 1896 */
1849 1897 boolean_t
1850 1898 spa_has_slogs(spa_t *spa)
1851 1899 {
1852 1900 return (spa->spa_log_class->mc_rotor != NULL);
1853 1901 }
1854 1902
1855 1903 spa_log_state_t
1856 1904 spa_get_log_state(spa_t *spa)
1857 1905 {
1858 1906 return (spa->spa_log_state);
1859 1907 }
1860 1908
1861 1909 void
1862 1910 spa_set_log_state(spa_t *spa, spa_log_state_t state)
1863 1911 {
1864 1912 spa->spa_log_state = state;
1865 1913 }
1866 1914
1867 1915 boolean_t
1868 1916 spa_is_root(spa_t *spa)
1869 1917 {
1870 1918 return (spa->spa_is_root);
1871 1919 }
1872 1920
1873 1921 boolean_t
1874 1922 spa_writeable(spa_t *spa)
1875 1923 {
1876 1924 return (!!(spa->spa_mode & FWRITE));
1877 1925 }
1878 1926
1879 1927 /*
1880 1928 * Returns true if there is a pending sync task in any of the current
1881 1929 * syncing txg, the current quiescing txg, or the current open txg.
1882 1930 */
1883 1931 boolean_t
1884 1932 spa_has_pending_synctask(spa_t *spa)
1885 1933 {
1886 1934 return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
1887 1935 }
1888 1936
1889 1937 int
1890 1938 spa_mode(spa_t *spa)
1891 1939 {
1892 1940 return (spa->spa_mode);
1893 1941 }
1894 1942
1895 1943 uint64_t
1896 1944 spa_bootfs(spa_t *spa)
1897 1945 {
1898 1946 return (spa->spa_bootfs);
1899 1947 }
1900 1948
1901 1949 uint64_t
1902 1950 spa_delegation(spa_t *spa)
1903 1951 {
1904 1952 return (spa->spa_delegation);
1905 1953 }
1906 1954
1907 1955 objset_t *
1908 1956 spa_meta_objset(spa_t *spa)
1909 1957 {
1910 1958 return (spa->spa_meta_objset);
1911 1959 }
1912 1960
1913 1961 enum zio_checksum
1914 1962 spa_dedup_checksum(spa_t *spa)
1915 1963 {
1916 1964 return (spa->spa_dedup_checksum);
1917 1965 }
1918 1966
1919 1967 /*
1920 1968 * Reset pool scan stat per scan pass (or reboot).
1921 1969 */
1922 1970 void
1923 1971 spa_scan_stat_init(spa_t *spa)
1924 1972 {
1925 1973 /* data not stored on disk */
1926 1974 spa->spa_scan_pass_start = gethrestime_sec();
1927 1975 spa->spa_scan_pass_exam = 0;
1928 1976 vdev_scan_stat_init(spa->spa_root_vdev);
1929 1977 }
1930 1978
1931 1979 /*
1932 1980 * Get scan stats for zpool status reports
1933 1981 */
1934 1982 int
1935 1983 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
1936 1984 {
1937 1985 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
1938 1986
1939 1987 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
1940 1988 return (SET_ERROR(ENOENT));
1941 1989 bzero(ps, sizeof (pool_scan_stat_t));
1942 1990
1943 1991 /* data stored on disk */
1944 1992 ps->pss_func = scn->scn_phys.scn_func;
1945 1993 ps->pss_start_time = scn->scn_phys.scn_start_time;
1946 1994 ps->pss_end_time = scn->scn_phys.scn_end_time;
1947 1995 ps->pss_to_examine = scn->scn_phys.scn_to_examine;
1948 1996 ps->pss_examined = scn->scn_phys.scn_examined;
1949 1997 ps->pss_to_process = scn->scn_phys.scn_to_process;
1950 1998 ps->pss_processed = scn->scn_phys.scn_processed;
1951 1999 ps->pss_errors = scn->scn_phys.scn_errors;
1952 2000 ps->pss_state = scn->scn_phys.scn_state;
1953 2001
1954 2002 /* data not stored on disk */
1955 2003 ps->pss_pass_start = spa->spa_scan_pass_start;
1956 2004 ps->pss_pass_exam = spa->spa_scan_pass_exam;
1957 2005
1958 2006 return (0);
1959 2007 }
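/*
 * A minimal sketch of a caller of spa_scan_get_stats(); the function name
 * below is hypothetical.  ENOENT simply means that no scrub or resilver
 * has ever been requested on this pool.
 */
static boolean_t
example_pool_is_scanning(spa_t *spa)
{
	pool_scan_stat_t ps;

	if (spa_scan_get_stats(spa, &ps) != 0)
		return (B_FALSE);
	return (ps.pss_state == DSS_SCANNING);
}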
1960 2008
1961 2009 boolean_t
1962 2010 spa_debug_enabled(spa_t *spa)
1963 2011 {
1964 2012 return (spa->spa_debug);
1965 2013 }
1966 2014
1967 2015 int
1968 2016 spa_maxblocksize(spa_t *spa)
1969 2017 {
1970 2018 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
1971 2019 return (SPA_MAXBLOCKSIZE);
1972 2020 else
1973 2021 return (SPA_OLD_MAXBLOCKSIZE);
1974 2022 }