4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2013 Saso Kiselkov. All rights reserved.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright (c) 2017 Datto Inc.
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/spa_impl.h>
33 #include <sys/spa_boot.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/zio_compress.h>
37 #include <sys/dmu.h>
38 #include <sys/dmu_tx.h>
39 #include <sys/zap.h>
40 #include <sys/zil.h>
41 #include <sys/vdev_impl.h>
42 #include <sys/metaslab.h>
43 #include <sys/uberblock_impl.h>
44 #include <sys/txg.h>
45 #include <sys/avl.h>
46 #include <sys/unique.h>
47 #include <sys/dsl_pool.h>
48 #include <sys/dsl_dir.h>
49 #include <sys/dsl_prop.h>
50 #include <sys/dsl_scan.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/metaslab_impl.h>
53 #include <sys/arc.h>
54 #include <sys/ddt.h>
55 #include "zfs_prop.h"
56 #include <sys/zfeature.h>
57
58 /*
59 * SPA locking
60 *
61 * There are four basic locks for managing spa_t structures:
62 *
63 * spa_namespace_lock (global mutex)
64 *
65 * This lock must be acquired to do any of the following:
66 *
67 * - Lookup a spa_t by name
68 * - Add or remove a spa_t from the namespace
69 * - Increase spa_refcount from non-zero
70 * - Check if spa_refcount is zero
71 * - Rename a spa_t
72 * - add/remove/attach/detach devices
73 * - Held for the duration of create/destroy/import/export
74 *
209 * cannot change in the interim, and that the vdev cannot be reopened.
210 * SCL_STATE as reader suffices for both.
211 *
212 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
213 *
214 * spa_vdev_enter() Acquire the namespace lock and the config lock
215 * for writing.
216 *
217 * spa_vdev_exit() Release the config lock, wait for all I/O
218 * to complete, sync the updated configs to the
219 * cache, and release the namespace lock.
220 *
221 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
222 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
223 * locking is always based on spa_namespace_lock and spa_config_lock[].
224 *
225 * spa_rename() is also implemented within this file since it requires
226 * manipulation of the namespace.
227 */
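/*
 * Illustration only (a minimal sketch, not part of this file): typical use
 * of the locks described above. example_read_last_synced_txg() and
 * example_change_vdevs() are hypothetical callers and example_do_change()
 * is a hypothetical helper; everything else is existing SPA API.
 */
#if 0
static uint64_t
example_read_last_synced_txg(spa_t *spa)
{
	uint64_t txg;

	/* Reader hold on SCL_CONFIG: the vdev config cannot change here. */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	txg = spa_last_synced_txg(spa);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	return (txg);
}

static int
example_change_vdevs(spa_t *spa)
{
	/* Namespace lock + SCL_ALL as writer; returns the next txg. */
	uint64_t txg = spa_vdev_enter(spa);
	int error = example_do_change(spa);

	/* Drops the locks, waits for the txg to sync, updates the cache. */
	return (spa_vdev_exit(spa, NULL, txg, error));
}
#endif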
228
229 static avl_tree_t spa_namespace_avl;
230 kmutex_t spa_namespace_lock;
231 static kcondvar_t spa_namespace_cv;
232 static int spa_active_count;
233 int spa_max_replication_override = SPA_DVAS_PER_BP;
234
235 static kmutex_t spa_spare_lock;
236 static avl_tree_t spa_spare_avl;
237 static kmutex_t spa_l2cache_lock;
238 static avl_tree_t spa_l2cache_avl;
239
240 kmem_cache_t *spa_buffer_pool;
241 int spa_mode_global;
242
243 #ifdef ZFS_DEBUG
244 /*
245 * Everything except dprintf, spa, and indirect_remap is on by default
246 * in debug builds.
247 */
248 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP);
249 #else
250 int zfs_flags = 0;
251 #endif
252
253 /*
254 * zfs_recover can be set to nonzero to attempt to recover from
255 * otherwise-fatal errors, typically caused by on-disk corruption. When
256 * set, calls to zfs_panic_recover() will turn into warning messages.
257 * This should only be used as a last resort, as it typically results
258 * in leaked space, or worse.
259 */
260 boolean_t zfs_recover = B_FALSE;
261
262 /*
263 * If destroy encounters an EIO while reading metadata (e.g. indirect
264 * blocks), space referenced by the missing metadata can not be freed.
265 * Normally this causes the background destroy to become "stalled", as
266 * it is unable to make forward progress. While in this stalled state,
267 * all remaining space to free from the error-encountering filesystem is
268 * "temporarily leaked". Set this flag to cause it to ignore the EIO,
269 * permanently leak the space from indirect blocks that can not be read,
270 * and continue to free everything else that it can.
271 *
272 * The default, "stalling" behavior is useful if the storage partially
273 * fails (i.e. some blocks are readable but others are not). In
274 * this case, we will be able to continue pool operations while it is
275 * partially failed, and when it recovers, we can continue to free the
276 * space, with no leaks. However, note that this case is actually
277 * fairly rare.
278 *
279 * Typically pools either (a) fail completely (but perhaps temporarily,
280 * e.g. a top-level vdev going offline), or (b) have localized,
281 * permanent errors (e.g. disk returns the wrong data due to bit flip or
282 * firmware bug). In case (a), this setting does not matter because the
283 * pool will be suspended and the sync thread will not be able to make
284 * forward progress regardless. In case (b), because the error is
285 * permanent, the best we can do is leak the minimum amount of space,
286 * which is what setting this flag will do. Therefore, it is reasonable
287 * for this flag to normally be set, but we chose the more conservative
288 * approach of not setting it, so that there is no possibility of
289 * leaking space in the "partial temporary" failure case.
290 */
291 boolean_t zfs_free_leak_on_eio = B_FALSE;
292
293 /*
294 * Expiration time in milliseconds. This value has two meanings. First, it is
295 * used to determine when the spa_deadman() logic should fire. By default the
296 * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
297 * Second, the value determines if an I/O is considered "hung". Any I/O that
298 * has not completed in zfs_deadman_synctime_ms is considered "hung", resulting
299 * in a system panic.
300 */
301 uint64_t zfs_deadman_synctime_ms = 1000000ULL;
302
303 /*
304 * Check time in milliseconds. This defines the frequency at which we check
305 * for hung I/O.
306 */
307 uint64_t zfs_deadman_checktime_ms = 5000ULL;
308
309 /*
310 * Override the zfs deadman behavior via /etc/system. By default the
311 * deadman is enabled except on VMware and sparc deployments.
312 */
313 int zfs_deadman_enabled = -1;
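
/*
 * Illustration only: on illumos the deadman tunables above can be
 * overridden from /etc/system, e.g. (values are examples, not
 * recommendations):
 *
 *	set zfs:zfs_deadman_enabled = 0
 *	set zfs:zfs_deadman_synctime_ms = 2000000
 */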
314
315 /*
316 * The worst case is single-sector max-parity RAID-Z blocks, in which
317 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
318 * times the size; so just assume that. Add to this the fact that
319 * we can have up to 3 DVAs per bp, and one more factor of 2 because
320 * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
321 * the worst case is:
337 * space free, the user will use these operations to free up space in the pool.
338 * These are the operations that call dsl_pool_adjustedsize() with the netfree
339 * argument set to TRUE.
340 *
341 * A very restricted set of operations are always permitted, regardless of
342 * the amount of free space. These are the operations that call
343 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
344 * operations result in a net increase in the amount of space used,
345 * it is possible to run the pool completely out of space, causing it to
346 * be permanently read-only.
347 *
348 * Note that on very small pools, the slop space will be larger than
349 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
350 * but we never allow it to be more than half the pool size.
351 *
352 * See also the comments in zfs_space_check_t.
353 */
354 int spa_slop_shift = 5;
355 uint64_t spa_min_slop = 128 * 1024 * 1024;
356
357 /*PRINTFLIKE2*/
358 void
359 spa_load_failed(spa_t *spa, const char *fmt, ...)
360 {
361 va_list adx;
362 char buf[256];
363
364 va_start(adx, fmt);
365 (void) vsnprintf(buf, sizeof (buf), fmt, adx);
366 va_end(adx);
367
368 zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
369 spa->spa_trust_config ? "trusted" : "untrusted", buf);
370 }
371
372 /*PRINTFLIKE2*/
373 void
374 spa_load_note(spa_t *spa, const char *fmt, ...)
375 {
376 va_list adx;
377 char buf[256];
378
379 va_start(adx, fmt);
380 (void) vsnprintf(buf, sizeof (buf), fmt, adx);
381 va_end(adx);
382
383 zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
384 spa->spa_trust_config ? "trusted" : "untrusted", buf);
385 }
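
/*
 * Illustration only: both helpers are printf-style, so a hypothetical
 * caller in the load path might do
 *
 *	spa_load_failed(spa, "unable to open rootbp [error=%d]", error);
 *
 * and the formatted message is emitted via zfs_dbgmsg() with the pool name
 * and config trust state prepended, as shown above.
 */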
386
387 /*
388 * ==========================================================================
389 * SPA config locking
390 * ==========================================================================
391 */
392 static void
393 spa_config_lock_init(spa_t *spa)
394 {
395 for (int i = 0; i < SCL_LOCKS; i++) {
396 spa_config_lock_t *scl = &spa->spa_config_lock[i];
397 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
398 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
399 refcount_create_untracked(&scl->scl_count);
400 scl->scl_writer = NULL;
401 scl->scl_write_wanted = 0;
402 }
403 }
404
405 static void
406 spa_config_lock_destroy(spa_t *spa)
459 wlocks_held |= (1 << i);
460 if (!(locks & (1 << i)))
461 continue;
462 mutex_enter(&scl->scl_lock);
463 if (rw == RW_READER) {
464 while (scl->scl_writer || scl->scl_write_wanted) {
465 cv_wait(&scl->scl_cv, &scl->scl_lock);
466 }
467 } else {
468 ASSERT(scl->scl_writer != curthread);
469 while (!refcount_is_zero(&scl->scl_count)) {
470 scl->scl_write_wanted++;
471 cv_wait(&scl->scl_cv, &scl->scl_lock);
472 scl->scl_write_wanted--;
473 }
474 scl->scl_writer = curthread;
475 }
476 (void) refcount_add(&scl->scl_count, tag);
477 mutex_exit(&scl->scl_lock);
478 }
479 ASSERT3U(wlocks_held, <=, locks);
480 }
481
482 void
483 spa_config_exit(spa_t *spa, int locks, void *tag)
484 {
485 for (int i = SCL_LOCKS - 1; i >= 0; i--) {
486 spa_config_lock_t *scl = &spa->spa_config_lock[i];
487 if (!(locks & (1 << i)))
488 continue;
489 mutex_enter(&scl->scl_lock);
490 ASSERT(!refcount_is_zero(&scl->scl_count));
491 if (refcount_remove(&scl->scl_count, tag) == 0) {
492 ASSERT(scl->scl_writer == NULL ||
493 scl->scl_writer == curthread);
494 scl->scl_writer = NULL; /* OK in either case */
495 cv_broadcast(&scl->scl_cv);
496 }
497 mutex_exit(&scl->scl_lock);
498 }
499 }
570
571 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
572 (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
573 ++spa->spa_deadman_calls);
574 if (zfs_deadman_enabled)
575 vdev_deadman(spa->spa_root_vdev);
576 }
577
578 /*
579 * Create an uninitialized spa_t with the given name. Requires
580 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
581 * exist by calling spa_lookup() first.
582 */
583 spa_t *
584 spa_add(const char *name, nvlist_t *config, const char *altroot)
585 {
586 spa_t *spa;
587 spa_config_dirent_t *dp;
588 cyc_handler_t hdlr;
589 cyc_time_t when;
590
591 ASSERT(MUTEX_HELD(&spa_namespace_lock));
592
593 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
594
595 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
596 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
597 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
598 mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
599 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
600 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
601 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
602 mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
603 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
604 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
605 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
606 mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
607 mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
608
609 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
610 cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
611 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
612 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
613 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
614
615 for (int t = 0; t < TXG_SIZE; t++)
616 bplist_create(&spa->spa_free_bplist[t]);
617
618 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
619 spa->spa_state = POOL_STATE_UNINITIALIZED;
620 spa->spa_freeze_txg = UINT64_MAX;
621 spa->spa_final_txg = UINT64_MAX;
622 spa->spa_load_max_txg = UINT64_MAX;
623 spa->spa_proc = &p0;
624 spa->spa_proc_state = SPA_PROC_NONE;
625 spa->spa_trust_config = B_TRUE;
626
627 hdlr.cyh_func = spa_deadman;
628 hdlr.cyh_arg = spa;
629 hdlr.cyh_level = CY_LOW_LEVEL;
630
631 spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
632
633 /*
634 * This determines how often we need to check for hung I/Os after
635 * the cyclic has already fired. Since checking for hung I/Os is
636 * an expensive operation we don't want to check too frequently.
637 * Instead, wait zfs_deadman_checktime_ms (5 seconds by default) before checking again.
638 */
639 when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
640 when.cyt_when = CY_INFINITY;
641 mutex_enter(&cpu_lock);
642 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
643 mutex_exit(&cpu_lock);
644
645 refcount_create(&spa->spa_refcount);
646 spa_config_lock_init(spa);
647
648 avl_add(&spa_namespace_avl, spa);
649
650 /*
651 * Set the alternate root, if there is one.
652 */
653 if (altroot) {
654 spa->spa_root = spa_strdup(altroot);
655 spa_active_count++;
656 }
657
658 avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
659 sizeof (zio_t), offsetof(zio_t, io_alloc_node));
660
661 /*
662 * Every pool starts with the default cachefile
663 */
664 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
665 offsetof(spa_config_dirent_t, scd_link));
666
667 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
668 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
669 list_insert_head(&spa->spa_config_list, dp);
670
671 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
672 KM_SLEEP) == 0);
673
674 if (config != NULL) {
675 nvlist_t *features;
676
677 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
678 &features) == 0) {
679 VERIFY(nvlist_dup(features, &spa->spa_label_features,
680 0) == 0);
681 }
682
683 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
684 }
685
686 if (spa->spa_label_features == NULL) {
687 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
688 KM_SLEEP) == 0);
689 }
690
691 spa->spa_iokstat = kstat_create("zfs", 0, name,
692 "disk", KSTAT_TYPE_IO, 1, 0);
693 if (spa->spa_iokstat) {
694 spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock;
695 kstat_install(spa->spa_iokstat);
696 }
697
698 spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
699
700 spa->spa_min_ashift = INT_MAX;
701 spa->spa_max_ashift = 0;
702
703 /*
704 * As a pool is being created, treat all features as disabled by
705 * setting SPA_FEATURE_DISABLED for all entries in the feature
706 * refcount cache.
707 */
708 for (int i = 0; i < SPA_FEATURES; i++) {
709 spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
710 }
711
712 return (spa);
713 }
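
/*
 * Illustration only (a hedged sketch, not code from this file): the calling
 * convention documented above, assuming the caller supplies name, config,
 * and altroot.
 */
#if 0
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(name) == NULL)
		spa = spa_add(name, config, altroot);
	mutex_exit(&spa_namespace_lock);
#endif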
714
715 /*
716 * Removes a spa_t from the namespace, freeing up any memory used. Requires
717 * spa_namespace_lock. This is called only after the spa_t has been closed and
718 * deactivated.
719 */
720 void
721 spa_remove(spa_t *spa)
726 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
727 ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
728
729 nvlist_free(spa->spa_config_splitting);
730
731 avl_remove(&spa_namespace_avl, spa);
732 cv_broadcast(&spa_namespace_cv);
733
734 if (spa->spa_root) {
735 spa_strfree(spa->spa_root);
736 spa_active_count--;
737 }
738
739 while ((dp = list_head(&spa->spa_config_list)) != NULL) {
740 list_remove(&spa->spa_config_list, dp);
741 if (dp->scd_path != NULL)
742 spa_strfree(dp->scd_path);
743 kmem_free(dp, sizeof (spa_config_dirent_t));
744 }
745
746 avl_destroy(&spa->spa_alloc_tree);
747 list_destroy(&spa->spa_config_list);
748
749 nvlist_free(spa->spa_label_features);
750 nvlist_free(spa->spa_load_info);
751 spa_config_set(spa, NULL);
752
753 mutex_enter(&cpu_lock);
754 if (spa->spa_deadman_cycid != CYCLIC_NONE)
755 cyclic_remove(spa->spa_deadman_cycid);
756 mutex_exit(&cpu_lock);
757 spa->spa_deadman_cycid = CYCLIC_NONE;
758
759 refcount_destroy(&spa->spa_refcount);
760
761 spa_config_lock_destroy(spa);
762
763 kstat_delete(spa->spa_iokstat);
764 spa->spa_iokstat = NULL;
765
766 for (int t = 0; t < TXG_SIZE; t++)
767 bplist_destroy(&spa->spa_free_bplist[t]);
768
769 zio_checksum_templates_free(spa);
770
771 cv_destroy(&spa->spa_async_cv);
772 cv_destroy(&spa->spa_evicting_os_cv);
773 cv_destroy(&spa->spa_proc_cv);
774 cv_destroy(&spa->spa_scrub_io_cv);
775 cv_destroy(&spa->spa_suspend_cv);
776
777 mutex_destroy(&spa->spa_alloc_lock);
778 mutex_destroy(&spa->spa_async_lock);
779 mutex_destroy(&spa->spa_errlist_lock);
780 mutex_destroy(&spa->spa_errlog_lock);
781 mutex_destroy(&spa->spa_evicting_os_lock);
782 mutex_destroy(&spa->spa_history_lock);
783 mutex_destroy(&spa->spa_proc_lock);
784 mutex_destroy(&spa->spa_props_lock);
785 mutex_destroy(&spa->spa_cksum_tmpls_lock);
786 mutex_destroy(&spa->spa_scrub_lock);
787 mutex_destroy(&spa->spa_suspend_lock);
788 mutex_destroy(&spa->spa_vdev_top_lock);
789 mutex_destroy(&spa->spa_iokstat_lock);
790
791 kmem_free(spa, sizeof (spa_t));
792 }
793
794 /*
795 * Given a pool, return the next pool in the namespace, or NULL if there is
796 * none. If 'prev' is NULL, return the first pool.
797 */
798 spa_t *
799 spa_next(spa_t *prev)
800 {
801 ASSERT(MUTEX_HELD(&spa_namespace_lock));
802
803 if (prev)
804 return (AVL_NEXT(&spa_namespace_avl, prev));
805 else
806 return (avl_first(&spa_namespace_avl));
807 }
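
/*
 * Illustration only: walking every imported pool with spa_next() while
 * holding the namespace lock. example_visit_pool() is hypothetical.
 */
#if 0
	mutex_enter(&spa_namespace_lock);
	for (spa_t *s = spa_next(NULL); s != NULL; s = spa_next(s))
		example_visit_pool(s);
	mutex_exit(&spa_namespace_lock);
#endif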
808
809 /*
1093 spa_aux_activate(vd, &spa_l2cache_avl);
1094 mutex_exit(&spa_l2cache_lock);
1095 }
1096
1097 /*
1098 * ==========================================================================
1099 * SPA vdev locking
1100 * ==========================================================================
1101 */
1102
1103 /*
1104 * Lock the given spa_t for the purpose of adding or removing a vdev.
1105 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1106 * It returns the next transaction group for the spa_t.
1107 */
1108 uint64_t
1109 spa_vdev_enter(spa_t *spa)
1110 {
1111 mutex_enter(&spa->spa_vdev_top_lock);
1112 mutex_enter(&spa_namespace_lock);
1113 return (spa_vdev_config_enter(spa));
1114 }
1115
1116 /*
1117 * Internal implementation for spa_vdev_enter(). Used when a vdev
1118 * operation requires multiple syncs (i.e. removing a device) while
1119 * keeping the spa_namespace_lock held.
1120 */
1121 uint64_t
1122 spa_vdev_config_enter(spa_t *spa)
1123 {
1124 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1125
1126 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1127
1128 return (spa_last_synced_txg(spa) + 1);
1129 }
1130
1131 /*
1132 * Used in combination with spa_vdev_config_enter() to allow the syncing
1141
1142 ASSERT(txg > spa_last_synced_txg(spa));
1143
1144 spa->spa_pending_vdev = NULL;
1145
1146 /*
1147 * Reassess the DTLs.
1148 */
1149 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1150
1151 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1152 config_changed = B_TRUE;
1153 spa->spa_config_generation++;
1154 }
1155
1156 /*
1157 * Verify the metaslab classes.
1158 */
1159 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1160 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1161
1162 spa_config_exit(spa, SCL_ALL, spa);
1163
1164 /*
1165 * Panic the system if the specified tag requires it. This
1166 * is useful for ensuring that configurations are updated
1167 * transactionally.
1168 */
1169 if (zio_injection_enabled)
1170 zio_handle_panic_injection(spa, tag, 0);
1171
1172 /*
1173 * Note: this txg_wait_synced() is important because it ensures
1174 * that there won't be more than one config change per txg.
1175 * This allows us to use the txg as the generation number.
1176 */
1177 if (error == 0)
1178 txg_wait_synced(spa->spa_dsl_pool, txg);
1179
1180 if (vd != NULL) {
1181 ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1182 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1183 vdev_free(vd);
1184 spa_config_exit(spa, SCL_ALL, spa);
1185 }
1186
1187 /*
1188 * If the config changed, update the config cache.
1189 */
1190 if (config_changed)
1191 spa_write_cachefile(spa, B_FALSE, B_TRUE);
1192 }
1193
1194 /*
1195 * Unlock the spa_t after adding or removing a vdev. Besides undoing the
1196 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1197 * synced to disk, and then update the global configuration cache with the new
1198 * information.
1199 */
1200 int
1201 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1202 {
1203 spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1204 mutex_exit(&spa_namespace_lock);
1205 mutex_exit(&spa->spa_vdev_top_lock);
1206
1207 return (error);
1208 }
1209
1210 /*
1211 * Lock the given spa_t for the purpose of changing vdev state.
1212 */
1213 void
1214 spa_vdev_state_enter(spa_t *spa, int oplocks)
1215 {
1216 int locks = SCL_STATE_ALL | oplocks;
1217
1218 /*
1219 * Root pools may need to read from the underlying devfs filesystem
1220 * when opening up a vdev. Unfortunately if we're holding the
1221 * SCL_ZIO lock it will result in a deadlock when we try to issue
1222 * the read from the root filesystem. Instead we "prefetch"
1223 * the associated vnodes that we need prior to opening the
1255 if (spa_is_root(spa))
1256 vdev_rele(spa->spa_root_vdev);
1257
1258 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1259 spa_config_exit(spa, spa->spa_vdev_locks, spa);
1260
1261 /*
1262 * If anything changed, wait for it to sync. This ensures that,
1263 * from the system administrator's perspective, zpool(1M) commands
1264 * are synchronous. This is important for things like zpool offline:
1265 * when the command completes, you expect no further I/O from ZFS.
1266 */
1267 if (vd != NULL)
1268 txg_wait_synced(spa->spa_dsl_pool, 0);
1269
1270 /*
1271 * If the config changed, update the config cache.
1272 */
1273 if (config_changed) {
1274 mutex_enter(&spa_namespace_lock);
1275 spa_write_cachefile(spa, B_FALSE, B_TRUE);
1276 mutex_exit(&spa_namespace_lock);
1277 }
1278
1279 return (error);
1280 }
1281
1282 /*
1283 * ==========================================================================
1284 * Miscellaneous functions
1285 * ==========================================================================
1286 */
1287
1288 void
1289 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1290 {
1291 if (!nvlist_exists(spa->spa_label_features, feature)) {
1292 fnvlist_add_boolean(spa->spa_label_features, feature);
1293 /*
1294 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1295 * dirty the vdev config because lock SCL_CONFIG is not held.
1333 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1334
1335 avl_remove(&spa_namespace_avl, spa);
1336 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1337 avl_add(&spa_namespace_avl, spa);
1338
1339 /*
1340 * Sync all labels to disk with the new names by marking the root vdev
1341 * dirty and waiting for it to sync. It will pick up the new pool name
1342 * during the sync.
1343 */
1344 vdev_config_dirty(spa->spa_root_vdev);
1345
1346 spa_config_exit(spa, SCL_ALL, FTAG);
1347
1348 txg_wait_synced(spa->spa_dsl_pool, 0);
1349
1350 /*
1351 * Sync the updated config cache.
1352 */
1353 spa_write_cachefile(spa, B_FALSE, B_TRUE);
1354
1355 spa_close(spa, FTAG);
1356
1357 mutex_exit(&spa_namespace_lock);
1358
1359 return (0);
1360 }
1361
1362 /*
1363 * Return the spa_t associated with given pool_guid, if it exists. If
1364 * device_guid is non-zero, determine whether the pool exists *and* contains
1365 * a device with the specified device_guid.
1366 */
1367 spa_t *
1368 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1369 {
1370 spa_t *spa;
1371 avl_tree_t *t = &spa_namespace_avl;
1372
1373 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1391 if (spa->spa_pending_vdev) {
1392 if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1393 device_guid) != NULL)
1394 break;
1395 }
1396 }
1397 }
1398
1399 return (spa);
1400 }
1401
1402 /*
1403 * Determine whether a pool with the given pool_guid exists.
1404 */
1405 boolean_t
1406 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1407 {
1408 return (spa_by_guid(pool_guid, device_guid) != NULL);
1409 }
1410
1411 char *
1412 spa_strdup(const char *s)
1413 {
1414 size_t len;
1415 char *new;
1416
1417 len = strlen(s);
1418 new = kmem_alloc(len + 1, KM_SLEEP);
1419 bcopy(s, new, len);
1420 new[len] = '\0';
1421
1422 return (new);
1423 }
1424
1425 void
1426 spa_strfree(char *s)
1427 {
1428 kmem_free(s, strlen(s) + 1);
1429 }
1430
1549 */
1550
1551 boolean_t
1552 spa_shutting_down(spa_t *spa)
1553 {
1554 return (spa->spa_async_suspended);
1555 }
1556
1557 dsl_pool_t *
1558 spa_get_dsl(spa_t *spa)
1559 {
1560 return (spa->spa_dsl_pool);
1561 }
1562
1563 boolean_t
1564 spa_is_initializing(spa_t *spa)
1565 {
1566 return (spa->spa_is_initializing);
1567 }
1568
1569 boolean_t
1570 spa_indirect_vdevs_loaded(spa_t *spa)
1571 {
1572 return (spa->spa_indirect_vdevs_loaded);
1573 }
1574
1575 blkptr_t *
1576 spa_get_rootblkptr(spa_t *spa)
1577 {
1578 return (&spa->spa_ubsync.ub_rootbp);
1579 }
1580
1581 void
1582 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1583 {
1584 spa->spa_uberblock.ub_rootbp = *bp;
1585 }
1586
1587 void
1588 spa_altroot(spa_t *spa, char *buf, size_t buflen)
1589 {
1590 if (spa->spa_root == NULL)
1591 buf[0] = '\0';
1592 else
1593 (void) strncpy(buf, spa->spa_root, buflen);
1594 }
1681 spa_load_state_t
1682 spa_load_state(spa_t *spa)
1683 {
1684 return (spa->spa_load_state);
1685 }
1686
1687 uint64_t
1688 spa_freeze_txg(spa_t *spa)
1689 {
1690 return (spa->spa_freeze_txg);
1691 }
1692
1693 /* ARGSUSED */
1694 uint64_t
1695 spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
1696 {
1697 return (lsize * spa_asize_inflation);
1698 }
1699
1700 /*
1701 * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
1702 * or at least 128MB, unless that would cause it to be more than half the
1703 * pool size.
1704 *
1705 * See the comment above spa_slop_shift for details.
1706 */
1707 uint64_t
1708 spa_get_slop_space(spa_t *spa)
1709 {
1710 uint64_t space = spa_get_dspace(spa);
1711 return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
1712 }
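
/*
 * Illustrative worked example (not from the upstream comment), applying the
 * formula above with spa_slop_shift = 5 and spa_min_slop = 128MB:
 *
 *	100GB pool:  MAX(100GB >> 5 = 3.125GB, MIN(50GB, 128MB))  = 3.125GB
 *	  1GB pool:  MAX(  1GB >> 5 =    32MB, MIN(512MB, 128MB)) =   128MB
 *	100MB pool:  MAX(100MB >> 5 = 3.125MB, MIN(50MB, 128MB))  =    50MB
 *
 * i.e. ~3.2% for large pools, at least 128MB for mid-sized pools, and never
 * more than half of a very small pool.
 */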
1713
1714 uint64_t
1715 spa_get_dspace(spa_t *spa)
1716 {
1717 return (spa->spa_dspace);
1718 }
1719
1720 void
1721 spa_update_dspace(spa_t *spa)
1722 {
1723 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1724 ddt_get_dedup_dspace(spa);
1725 if (spa->spa_vdev_removal != NULL) {
1726 /*
1727 * We can't allocate from the removing device, so
1728 * subtract its size. This prevents the DMU/DSL from
1729 * filling up the (now smaller) pool while we are in the
1730 * middle of removing the device.
1731 *
1732 * Note that the DMU/DSL doesn't actually know or care
1733 * how much space is allocated (it does its own tracking
1734 * of how much space has been logically used). So it
1735 * doesn't matter that the data we are moving may be
1736 * allocated twice (on the old device and the new
1737 * device).
1738 */
1739 vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
1740 spa->spa_dspace -= spa_deflate(spa) ?
1741 vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
1742 }
1743 }
1744
1745 /*
1746 * Return the failure mode that has been set for this pool. The default
1747 * behavior will be to block all I/Os when a complete failure occurs.
1748 */
1749 uint8_t
1750 spa_get_failmode(spa_t *spa)
1751 {
1752 return (spa->spa_failmode);
1753 }
1754
1755 boolean_t
1756 spa_suspended(spa_t *spa)
1757 {
1758 return (spa->spa_suspended);
1759 }
1760
1761 uint64_t
1762 spa_version(spa_t *spa)
1763 {
1764 return (spa->spa_ubsync.ub_version);
1765 }
1766
1767 boolean_t
1768 spa_deflate(spa_t *spa)
1769 {
1770 return (spa->spa_deflate);
1771 }
1772
1773 metaslab_class_t *
1774 spa_normal_class(spa_t *spa)
1775 {
1776 return (spa->spa_normal_class);
1777 }
1778
1779 metaslab_class_t *
1780 spa_log_class(spa_t *spa)
1781 {
1782 return (spa->spa_log_class);
1783 }
1784
1785 void
1786 spa_evicting_os_register(spa_t *spa, objset_t *os)
1787 {
1788 mutex_enter(&spa->spa_evicting_os_lock);
1789 list_insert_head(&spa->spa_evicting_os_list, os);
1790 mutex_exit(&spa->spa_evicting_os_lock);
1791 }
1792
1793 void
1794 spa_evicting_os_deregister(spa_t *spa, objset_t *os)
1795 {
1796 mutex_enter(&spa->spa_evicting_os_lock);
1797 list_remove(&spa->spa_evicting_os_list, os);
1798 cv_broadcast(&spa->spa_evicting_os_cv);
1799 mutex_exit(&spa->spa_evicting_os_lock);
1800 }
1801
1802 void
1803 spa_evicting_os_wait(spa_t *spa)
1804 {
1805 mutex_enter(&spa->spa_evicting_os_lock);
1806 while (!list_is_empty(&spa->spa_evicting_os_list))
1807 cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1808 mutex_exit(&spa->spa_evicting_os_lock);
1809
1810 dmu_buf_user_evict_wait();
1811 }
1812
1813 int
1814 spa_max_replication(spa_t *spa)
1815 {
1816 /*
1817 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1818 * handle BPs with more than one DVA allocated. Set our max
1819 * replication level accordingly.
1820 */
1821 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1822 return (1);
1823 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1824 }
1825
1826 int
1827 spa_prev_software_version(spa_t *spa)
1828 {
1829 return (spa->spa_prev_software_version);
1830 }
1831
1832 uint64_t
1833 spa_deadman_synctime(spa_t *spa)
1834 {
1835 return (spa->spa_deadman_synctime);
1836 }
1837
1838 uint64_t
1839 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
1840 {
1841 uint64_t asize = DVA_GET_ASIZE(dva);
1842 uint64_t dsize = asize;
1843
1844 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1845
1846 if (asize != 0 && spa->spa_deflate) {
1847 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
1848 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
1849 }
1850
1851 return (dsize);
1852 }
1853
1854 uint64_t
1855 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
1856 {
1857 uint64_t dsize = 0;
1858
1859 for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1860 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1861
1862 return (dsize);
1863 }
1864
1865 uint64_t
1866 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
1867 {
1868 uint64_t dsize = 0;
1869
1870 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
1871
1872 for (int d = 0; d < BP_GET_NDVAS(bp); d++)
1873 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
1874
1875 spa_config_exit(spa, SCL_VDEV, FTAG);
1876
1877 return (dsize);
1878 }
1879
1880 /*
1881 * ==========================================================================
1882 * Initialization and Termination
1883 * ==========================================================================
1884 */
1885
1886 static int
1887 spa_name_compare(const void *a1, const void *a2)
1888 {
1889 const spa_t *s1 = a1;
1890 const spa_t *s2 = a2;
1891 int s;
1892
1893 s = strcmp(s1->spa_name, s2->spa_name);
1912
1913 void
1914 spa_init(int mode)
1915 {
1916 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
1917 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
1918 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
1919 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
1920
1921 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
1922 offsetof(spa_t, spa_avl));
1923
1924 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
1925 offsetof(spa_aux_t, aux_avl));
1926
1927 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
1928 offsetof(spa_aux_t, aux_avl));
1929
1930 spa_mode_global = mode;
1931
1932 #ifdef _KERNEL
1933 spa_arch_init();
1934 #else
1935 if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
1936 arc_procfd = open("/proc/self/ctl", O_WRONLY);
1937 if (arc_procfd == -1) {
1938 perror("could not enable watchpoints: "
1939 "opening /proc/self/ctl failed: ");
1940 } else {
1941 arc_watch = B_TRUE;
1942 }
1943 }
1944 #endif
1945
1946 refcount_init();
1947 unique_init();
1948 range_tree_init();
1949 metaslab_alloc_trace_init();
1950 zio_init();
1951 dmu_init();
1952 zil_init();
1953 vdev_cache_stat_init();
1954 zfs_prop_init();
1955 zpool_prop_init();
1956 zpool_feature_init();
1957 spa_config_load();
1958 l2arc_start();
1959 }
1960
1961 void
1962 spa_fini(void)
1963 {
1964 l2arc_stop();
1965
1966 spa_evict_all();
1967
1968 vdev_cache_stat_fini();
1969 zil_fini();
1970 dmu_fini();
1971 zio_fini();
1972 metaslab_alloc_trace_fini();
1973 range_tree_fini();
1974 unique_fini();
1975 refcount_fini();
1976
1977 avl_destroy(&spa_namespace_avl);
1978 avl_destroy(&spa_spare_avl);
1979 avl_destroy(&spa_l2cache_avl);
1980
1981 cv_destroy(&spa_namespace_cv);
1982 mutex_destroy(&spa_namespace_lock);
1983 mutex_destroy(&spa_spare_lock);
1984 mutex_destroy(&spa_l2cache_lock);
1985 }
1986
1987 /*
1988 * Return whether this pool has slogs. No locking needed.
1989 * It's not a problem if the wrong answer is returned as it's only for
1990 * performance and not correctness
1991 */
1992 boolean_t
1993 spa_has_slogs(spa_t *spa)
1994 {
1995 return (spa->spa_log_class->mc_rotor != NULL);
1996 }
1999 spa_get_log_state(spa_t *spa)
2000 {
2001 return (spa->spa_log_state);
2002 }
2003
2004 void
2005 spa_set_log_state(spa_t *spa, spa_log_state_t state)
2006 {
2007 spa->spa_log_state = state;
2008 }
2009
2010 boolean_t
2011 spa_is_root(spa_t *spa)
2012 {
2013 return (spa->spa_is_root);
2014 }
2015
2016 boolean_t
2017 spa_writeable(spa_t *spa)
2018 {
2019 return (!!(spa->spa_mode & FWRITE) && spa->spa_trust_config);
2020 }
2021
2022 /*
2023 * Returns true if there is a pending sync task in any of the current
2024 * syncing txg, the current quiescing txg, or the current open txg.
2025 */
2026 boolean_t
2027 spa_has_pending_synctask(spa_t *spa)
2028 {
2029 return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
2030 }
2031
2032 int
2033 spa_mode(spa_t *spa)
2034 {
2035 return (spa->spa_mode);
2036 }
2037
2038 uint64_t
2039 spa_bootfs(spa_t *spa)
2040 {
2041 return (spa->spa_bootfs);
2042 }
2043
2044 uint64_t
2045 spa_delegation(spa_t *spa)
2046 {
2047 return (spa->spa_delegation);
2048 }
2049
2050 objset_t *
2051 spa_meta_objset(spa_t *spa)
2056 enum zio_checksum
2057 spa_dedup_checksum(spa_t *spa)
2058 {
2059 return (spa->spa_dedup_checksum);
2060 }
2061
2062 /*
2063 * Reset pool scan stat per scan pass (or reboot).
2064 */
2065 void
2066 spa_scan_stat_init(spa_t *spa)
2067 {
2068 /* data not stored on disk */
2069 spa->spa_scan_pass_start = gethrestime_sec();
2070 if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
2071 spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
2072 else
2073 spa->spa_scan_pass_scrub_pause = 0;
2074 spa->spa_scan_pass_scrub_spent_paused = 0;
2075 spa->spa_scan_pass_exam = 0;
2076 vdev_scan_stat_init(spa->spa_root_vdev);
2077 }
2078
2079 /*
2080 * Get scan stats for zpool status reports
2081 */
2082 int
2083 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2084 {
2085 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2086
2087 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
2088 return (SET_ERROR(ENOENT));
2089 bzero(ps, sizeof (pool_scan_stat_t));
2090
2091 /* data stored on disk */
2092 ps->pss_func = scn->scn_phys.scn_func;
2093 ps->pss_start_time = scn->scn_phys.scn_start_time;
2094 ps->pss_end_time = scn->scn_phys.scn_end_time;
2095 ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2096 ps->pss_examined = scn->scn_phys.scn_examined;
2097 ps->pss_to_process = scn->scn_phys.scn_to_process;
2098 ps->pss_processed = scn->scn_phys.scn_processed;
2099 ps->pss_errors = scn->scn_phys.scn_errors;
2100 ps->pss_state = scn->scn_phys.scn_state;
2101
2102 /* data not stored on disk */
2103 ps->pss_pass_start = spa->spa_scan_pass_start;
2104 ps->pss_pass_exam = spa->spa_scan_pass_exam;
2105 ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
2106 ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
2107
2108 return (0);
2109 }
2110
2111 boolean_t
2112 spa_debug_enabled(spa_t *spa)
2113 {
2114 return (spa->spa_debug);
2115 }
2116
2117 int
2118 spa_maxblocksize(spa_t *spa)
2119 {
2120 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2121 return (SPA_MAXBLOCKSIZE);
2122 else
2123 return (SPA_OLD_MAXBLOCKSIZE);
2124 }
2125
2126 /*
2127 * Returns the txg in which the last device removal completed. No indirect mappings
2128 * have been added since this txg.
2129 */
2130 uint64_t
2131 spa_get_last_removal_txg(spa_t *spa)
2132 {
2133 uint64_t vdevid;
2134 uint64_t ret = -1ULL;
2135
2136 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2137 /*
2138 * sr_prev_indirect_vdev is only modified while holding all the
2139 * config locks, so it is sufficient to hold SCL_VDEV as reader when
2140 * examining it.
2141 */
2142 vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
2143
2144 while (vdevid != -1ULL) {
2145 vdev_t *vd = vdev_lookup_top(spa, vdevid);
2146 vdev_indirect_births_t *vib = vd->vdev_indirect_births;
2147
2148 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
2149
2150 /*
2151 * If the removal did not remap any data, we don't care.
2152 */
2153 if (vdev_indirect_births_count(vib) != 0) {
2154 ret = vdev_indirect_births_last_entry_txg(vib);
2155 break;
2156 }
2157
2158 vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
2159 }
2160 spa_config_exit(spa, SCL_VDEV, FTAG);
2161
2162 IMPLY(ret != -1ULL,
2163 spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
2164
2165 return (ret);
2166 }
2167
2168 boolean_t
2169 spa_trust_config(spa_t *spa)
2170 {
2171 return (spa->spa_trust_config);
2172 }
2173
2174 uint64_t
2175 spa_missing_tvds_allowed(spa_t *spa)
2176 {
2177 return (spa->spa_missing_tvds_allowed);
2178 }
2179
2180 void
2181 spa_set_missing_tvds(spa_t *spa, uint64_t missing)
2182 {
2183 spa->spa_missing_tvds = missing;
2184 }
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
25 * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2013 Saso Kiselkov. All rights reserved.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright (c) 2017 Datto Inc.
29 */
30
31 #include <sys/zfs_context.h>
32 #include <sys/spa_impl.h>
33 #include <sys/spa_boot.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/zio_compress.h>
37 #include <sys/dmu.h>
38 #include <sys/dmu_tx.h>
39 #include <sys/zap.h>
40 #include <sys/zil.h>
41 #include <sys/vdev_impl.h>
42 #include <sys/metaslab.h>
43 #include <sys/uberblock_impl.h>
44 #include <sys/txg.h>
45 #include <sys/avl.h>
46 #include <sys/unique.h>
47 #include <sys/dsl_pool.h>
48 #include <sys/dsl_dir.h>
49 #include <sys/dsl_prop.h>
50 #include <sys/dsl_scan.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/metaslab_impl.h>
53 #include <sys/arc.h>
54 #include <sys/ddt.h>
55 #include <sys/cos.h>
56 #include "zfs_prop.h"
57 #include <sys/zfeature.h>
58
59 /*
60 * SPA locking
61 *
62 * There are four basic locks for managing spa_t structures:
63 *
64 * spa_namespace_lock (global mutex)
65 *
66 * This lock must be acquired to do any of the following:
67 *
68 * - Lookup a spa_t by name
69 * - Add or remove a spa_t from the namespace
70 * - Increase spa_refcount from non-zero
71 * - Check if spa_refcount is zero
72 * - Rename a spa_t
73 * - add/remove/attach/detach devices
74 * - Held for the duration of create/destroy/import/export
75 *
210 * cannot change in the interim, and that the vdev cannot be reopened.
211 * SCL_STATE as reader suffices for both.
212 *
213 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
214 *
215 * spa_vdev_enter() Acquire the namespace lock and the config lock
216 * for writing.
217 *
218 * spa_vdev_exit() Release the config lock, wait for all I/O
219 * to complete, sync the updated configs to the
220 * cache, and release the namespace lock.
221 *
222 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
223 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
224 * locking is always based on spa_namespace_lock and spa_config_lock[].
225 *
226 * spa_rename() is also implemented within this file since it requires
227 * manipulation of the namespace.
228 */
229
230 struct spa_trimstats {
231 kstat_named_t st_extents; /* # of extents issued to zio */
232 kstat_named_t st_bytes; /* # of bytes issued to zio */
233 kstat_named_t st_extents_skipped; /* # of extents too small */
234 kstat_named_t st_bytes_skipped; /* bytes in extents_skipped */
235 kstat_named_t st_auto_slow; /* trim slow, exts dropped */
236 };
237
238 static avl_tree_t spa_namespace_avl;
239 kmutex_t spa_namespace_lock;
240 static kcondvar_t spa_namespace_cv;
241 static int spa_active_count;
242 int spa_max_replication_override = SPA_DVAS_PER_BP;
243
244 static kmutex_t spa_spare_lock;
245 static avl_tree_t spa_spare_avl;
246 static kmutex_t spa_l2cache_lock;
247 static avl_tree_t spa_l2cache_avl;
248
249 kmem_cache_t *spa_buffer_pool;
250 int spa_mode_global;
251
252 #ifdef ZFS_DEBUG
253 /* Everything except dprintf and spa is on by default in debug builds */
254 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
255 #else
256 int zfs_flags = 0;
257 #endif
258
259 #define ZFS_OBJ_MTX_DEFAULT_SZ 64
260 uint64_t spa_obj_mtx_sz = ZFS_OBJ_MTX_DEFAULT_SZ;
261
262 /*
263 * zfs_recover can be set to nonzero to attempt to recover from
264 * otherwise-fatal errors, typically caused by on-disk corruption. When
265 * set, calls to zfs_panic_recover() will turn into warning messages.
266 * This should only be used as a last resort, as it typically results
267 * in leaked space, or worse.
268 */
269 boolean_t zfs_recover = B_FALSE;
270
271 /*
272 * If destroy encounters an EIO while reading metadata (e.g. indirect
273 * blocks), space referenced by the missing metadata can not be freed.
274 * Normally this causes the background destroy to become "stalled", as
275 * it is unable to make forward progress. While in this stalled state,
276 * all remaining space to free from the error-encountering filesystem is
277 * "temporarily leaked". Set this flag to cause it to ignore the EIO,
278 * permanently leak the space from indirect blocks that can not be read,
279 * and continue to free everything else that it can.
280 *
281 * The default, "stalling" behavior is useful if the storage partially
282 * fails (i.e. some blocks are readable but others are not). In
283 * this case, we will be able to continue pool operations while it is
284 * partially failed, and when it recovers, we can continue to free the
285 * space, with no leaks. However, note that this case is actually
286 * fairly rare.
287 *
288 * Typically pools either (a) fail completely (but perhaps temporarily,
289 * e.g. a top-level vdev going offline), or (b) have localized,
290 * permanent errors (e.g. disk returns the wrong data due to bit flip or
291 * firmware bug). In case (a), this setting does not matter because the
292 * pool will be suspended and the sync thread will not be able to make
293 * forward progress regardless. In case (b), because the error is
294 * permanent, the best we can do is leak the minimum amount of space,
295 * which is what setting this flag will do. Therefore, it is reasonable
296 * for this flag to normally be set, but we chose the more conservative
297 * approach of not setting it, so that there is no possibility of
298 * leaking space in the "partial temporary" failure case.
299 */
300 boolean_t zfs_free_leak_on_eio = B_FALSE;
301
302 /*
303 * alpha for spa_update_latency() rolling average of pool latency, which
304 * is updated on every txg commit.
305 */
306 int64_t zfs_root_latency_alpha = 10;
307
308 /*
309 * Expiration time in milliseconds. This value has two meanings. First, it is
310 * used to determine when the spa_deadman() logic should fire. By default the
311 * spa_deadman() will fire if spa_sync() has not completed in 250 seconds.
312 * Second, the value determines if an I/O is considered "hung". Any I/O that
313 * has not completed in zfs_deadman_synctime_ms is considered "hung", resulting
314 * in a system panic.
315 */
316 uint64_t zfs_deadman_synctime_ms = 250000ULL;
317
318 /*
319 * Check time in milliseconds. This defines the frequency at which we check
320 * for hung I/O.
321 */
322 uint64_t zfs_deadman_checktime_ms = 5000ULL;
323
324 /*
325 * Override the zfs deadman behavior via /etc/system. By default the
326 * deadman is enabled except on VMware and sparc deployments.
327 */
328 int zfs_deadman_enabled = -1;
329
330 /*
331 * The worst case is single-sector max-parity RAID-Z blocks, in which
332 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
333 * times the size; so just assume that. Add to this the fact that
334 * we can have up to 3 DVAs per bp, and one more factor of 2 because
335 * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
336 * the worst case is:
352 * space free, the user will use these operations to free up space in the pool.
353 * These are the operations that call dsl_pool_adjustedsize() with the netfree
354 * argument set to TRUE.
355 *
356 * A very restricted set of operations are always permitted, regardless of
357 * the amount of free space. These are the operations that call
358 * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
359 * operations result in a net increase in the amount of space used,
360 * it is possible to run the pool completely out of space, causing it to
361 * be permanently read-only.
362 *
363 * Note that on very small pools, the slop space will be larger than
364 * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
365 * but we never allow it to be more than half the pool size.
366 *
367 * See also the comments in zfs_space_check_t.
368 */
369 int spa_slop_shift = 5;
370 uint64_t spa_min_slop = 128 * 1024 * 1024;
371
372 static void spa_trimstats_create(spa_t *spa);
373 static void spa_trimstats_destroy(spa_t *spa);
374
375 /*
376 * ==========================================================================
377 * SPA config locking
378 * ==========================================================================
379 */
380 static void
381 spa_config_lock_init(spa_t *spa)
382 {
383 for (int i = 0; i < SCL_LOCKS; i++) {
384 spa_config_lock_t *scl = &spa->spa_config_lock[i];
385 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
386 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
387 refcount_create_untracked(&scl->scl_count);
388 scl->scl_writer = NULL;
389 scl->scl_write_wanted = 0;
390 }
391 }
392
393 static void
394 spa_config_lock_destroy(spa_t *spa)
447 wlocks_held |= (1 << i);
448 if (!(locks & (1 << i)))
449 continue;
450 mutex_enter(&scl->scl_lock);
451 if (rw == RW_READER) {
452 while (scl->scl_writer || scl->scl_write_wanted) {
453 cv_wait(&scl->scl_cv, &scl->scl_lock);
454 }
455 } else {
456 ASSERT(scl->scl_writer != curthread);
457 while (!refcount_is_zero(&scl->scl_count)) {
458 scl->scl_write_wanted++;
459 cv_wait(&scl->scl_cv, &scl->scl_lock);
460 scl->scl_write_wanted--;
461 }
462 scl->scl_writer = curthread;
463 }
464 (void) refcount_add(&scl->scl_count, tag);
465 mutex_exit(&scl->scl_lock);
466 }
467 ASSERT(wlocks_held <= locks);
468 }
469
470 void
471 spa_config_exit(spa_t *spa, int locks, void *tag)
472 {
473 for (int i = SCL_LOCKS - 1; i >= 0; i--) {
474 spa_config_lock_t *scl = &spa->spa_config_lock[i];
475 if (!(locks & (1 << i)))
476 continue;
477 mutex_enter(&scl->scl_lock);
478 ASSERT(!refcount_is_zero(&scl->scl_count));
479 if (refcount_remove(&scl->scl_count, tag) == 0) {
480 ASSERT(scl->scl_writer == NULL ||
481 scl->scl_writer == curthread);
482 scl->scl_writer = NULL; /* OK in either case */
483 cv_broadcast(&scl->scl_cv);
484 }
485 mutex_exit(&scl->scl_lock);
486 }
487 }
558
559 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
560 (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
561 ++spa->spa_deadman_calls);
562 if (zfs_deadman_enabled)
563 vdev_deadman(spa->spa_root_vdev);
564 }
565
566 /*
567 * Create an uninitialized spa_t with the given name. Requires
568 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
569 * exist by calling spa_lookup() first.
570 */
571 spa_t *
572 spa_add(const char *name, nvlist_t *config, const char *altroot)
573 {
574 spa_t *spa;
575 spa_config_dirent_t *dp;
576 cyc_handler_t hdlr;
577 cyc_time_t when;
578 uint64_t guid;
579
580 ASSERT(MUTEX_HELD(&spa_namespace_lock));
581
582 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
583
584 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
585 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
586 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
587 mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
588 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
589 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
590 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
591 mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
592 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
593 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
594 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
595 mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
596 mutex_init(&spa->spa_cos_props_lock, NULL, MUTEX_DEFAULT, NULL);
597 mutex_init(&spa->spa_vdev_props_lock, NULL, MUTEX_DEFAULT, NULL);
598 mutex_init(&spa->spa_perfmon.perfmon_lock, NULL, MUTEX_DEFAULT, NULL);
599
600 mutex_init(&spa->spa_auto_trim_lock, NULL, MUTEX_DEFAULT, NULL);
601 mutex_init(&spa->spa_man_trim_lock, NULL, MUTEX_DEFAULT, NULL);
602
603 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
604 cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
605 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
606 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
607 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
608 cv_init(&spa->spa_auto_trim_done_cv, NULL, CV_DEFAULT, NULL);
609 cv_init(&spa->spa_man_trim_update_cv, NULL, CV_DEFAULT, NULL);
610 cv_init(&spa->spa_man_trim_done_cv, NULL, CV_DEFAULT, NULL);
611
612 for (int t = 0; t < TXG_SIZE; t++)
613 bplist_create(&spa->spa_free_bplist[t]);
614
615 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
616 spa->spa_state = POOL_STATE_UNINITIALIZED;
617 spa->spa_freeze_txg = UINT64_MAX;
618 spa->spa_final_txg = UINT64_MAX;
619 spa->spa_load_max_txg = UINT64_MAX;
620 spa->spa_proc = &p0;
621 spa->spa_proc_state = SPA_PROC_NONE;
622 if (spa_obj_mtx_sz < 1 || spa_obj_mtx_sz > INT_MAX)
623 spa->spa_obj_mtx_sz = ZFS_OBJ_MTX_DEFAULT_SZ;
624 else
625 spa->spa_obj_mtx_sz = spa_obj_mtx_sz;
626
627 /*
628 * Grabbing the guid here is just so that spa_config_guid_exists can
629 * check early on to protect against duplicate imports of the same pool
630 * under different names. If the GUID isn't provided here, we will
631 * let spa generate one later on during spa_load, although in that
632 * case we might not be able to provide the double-import protection.
633 */
634 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0) {
635 spa->spa_config_guid = guid;
636 ASSERT(!spa_config_guid_exists(guid));
637 }
638
639 hdlr.cyh_func = spa_deadman;
640 hdlr.cyh_arg = spa;
641 hdlr.cyh_level = CY_LOW_LEVEL;
642
643 spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
644
645 /*
646 * This determines how often we need to check for hung I/Os after
647 * the cyclic has already fired. Since checking for hung I/Os is
648 * an expensive operation we don't want to check too frequently.
649 * Instead, wait zfs_deadman_checktime_ms (5 seconds by default) before checking again.
650 */
651 when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
652 when.cyt_when = CY_INFINITY;
653 mutex_enter(&cpu_lock);
654 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
655 mutex_exit(&cpu_lock);
656
657 refcount_create(&spa->spa_refcount);
658 spa_config_lock_init(spa);
659
660 avl_add(&spa_namespace_avl, spa);
661
662 /*
663 * Set the alternate root, if there is one.
664 */
665 if (altroot) {
666 spa->spa_root = spa_strdup(altroot);
667 spa_active_count++;
668 }
669
670 /*
671 * Every pool starts with the default cachefile
672 */
673 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
674 offsetof(spa_config_dirent_t, scd_link));
675
676 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
677 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
678 list_insert_head(&spa->spa_config_list, dp);
679
680 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
681 KM_SLEEP) == 0);
682
683 if (config != NULL) {
684 nvlist_t *features;
685
686 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
687 &features) == 0) {
688 VERIFY(nvlist_dup(features, &spa->spa_label_features,
689 0) == 0);
690 }
691
692 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
693 }
694
695 if (spa->spa_label_features == NULL) {
696 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
697 KM_SLEEP) == 0);
698 }
699
700 spa->spa_iokstat = kstat_create("zfs", 0, name,
701 "zfs", KSTAT_TYPE_IO, 1, 0);
702 if (spa->spa_iokstat) {
703 spa->spa_iokstat->ks_lock = &spa->spa_iokstat_lock;
704 kstat_install(spa->spa_iokstat);
705 }
706
707 spa_trimstats_create(spa);
708
709 spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
710
711 autosnap_init(spa);
712
713 spa_cos_init(spa);
714
715 spa_special_init(spa);
716
717 spa->spa_min_ashift = INT_MAX;
718 spa->spa_max_ashift = 0;
719 wbc_init(&spa->spa_wbc, spa);
720
721 /*
722 * As a pool is being created, treat all features as disabled by
723 * setting SPA_FEATURE_DISABLED for all entries in the feature
724 * refcount cache.
725 */
726 for (int i = 0; i < SPA_FEATURES; i++) {
727 spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
728 }
729
730 return (spa);
731 }
732
733 /*
734 * Removes a spa_t from the namespace, freeing up any memory used. Requires
735 * spa_namespace_lock. This is called only after the spa_t has been closed and
736 * deactivated.
737 */
738 void
739 spa_remove(spa_t *spa)
744 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
745 ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
746
747 nvlist_free(spa->spa_config_splitting);
748
749 avl_remove(&spa_namespace_avl, spa);
750 cv_broadcast(&spa_namespace_cv);
751
752 if (spa->spa_root) {
753 spa_strfree(spa->spa_root);
754 spa_active_count--;
755 }
756
757 while ((dp = list_head(&spa->spa_config_list)) != NULL) {
758 list_remove(&spa->spa_config_list, dp);
759 if (dp->scd_path != NULL)
760 spa_strfree(dp->scd_path);
761 kmem_free(dp, sizeof (spa_config_dirent_t));
762 }
763
764 list_destroy(&spa->spa_config_list);
765
766 wbc_fini(&spa->spa_wbc);
767
768 spa_special_fini(spa);
769
770 spa_cos_fini(spa);
771
772 autosnap_fini(spa);
773
774 nvlist_free(spa->spa_label_features);
775 nvlist_free(spa->spa_load_info);
776 spa_config_set(spa, NULL);
777
778 mutex_enter(&cpu_lock);
779 if (spa->spa_deadman_cycid != CYCLIC_NONE)
780 cyclic_remove(spa->spa_deadman_cycid);
781 mutex_exit(&cpu_lock);
782 spa->spa_deadman_cycid = CYCLIC_NONE;
783
784 refcount_destroy(&spa->spa_refcount);
785
786 spa_config_lock_destroy(spa);
787
788 spa_trimstats_destroy(spa);
789
790 kstat_delete(spa->spa_iokstat);
791 spa->spa_iokstat = NULL;
792
793 for (int t = 0; t < TXG_SIZE; t++)
794 bplist_destroy(&spa->spa_free_bplist[t]);
795
796 zio_checksum_templates_free(spa);
797
798 cv_destroy(&spa->spa_async_cv);
799 cv_destroy(&spa->spa_evicting_os_cv);
800 cv_destroy(&spa->spa_proc_cv);
801 cv_destroy(&spa->spa_scrub_io_cv);
802 cv_destroy(&spa->spa_suspend_cv);
803 cv_destroy(&spa->spa_auto_trim_done_cv);
804 cv_destroy(&spa->spa_man_trim_update_cv);
805 cv_destroy(&spa->spa_man_trim_done_cv);
806
807 mutex_destroy(&spa->spa_async_lock);
808 mutex_destroy(&spa->spa_errlist_lock);
809 mutex_destroy(&spa->spa_errlog_lock);
810 mutex_destroy(&spa->spa_evicting_os_lock);
811 mutex_destroy(&spa->spa_history_lock);
812 mutex_destroy(&spa->spa_proc_lock);
813 mutex_destroy(&spa->spa_props_lock);
814 mutex_destroy(&spa->spa_cksum_tmpls_lock);
815 mutex_destroy(&spa->spa_scrub_lock);
816 mutex_destroy(&spa->spa_suspend_lock);
817 mutex_destroy(&spa->spa_vdev_top_lock);
818 mutex_destroy(&spa->spa_iokstat_lock);
819 mutex_destroy(&spa->spa_cos_props_lock);
820 mutex_destroy(&spa->spa_vdev_props_lock);
821 mutex_destroy(&spa->spa_auto_trim_lock);
822 mutex_destroy(&spa->spa_man_trim_lock);
823
824 kmem_free(spa, sizeof (spa_t));
825 }
826
827 /*
828 * Given a pool, return the next pool in the namespace, or NULL if there is
829 * none. If 'prev' is NULL, return the first pool.
830 */
831 spa_t *
832 spa_next(spa_t *prev)
833 {
834 ASSERT(MUTEX_HELD(&spa_namespace_lock));
835
836 if (prev)
837 return (AVL_NEXT(&spa_namespace_avl, prev));
838 else
839 return (avl_first(&spa_namespace_avl));
840 }
841
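/*
 * Usage sketch (added for clarity, not part of the original source):
 * iterating over every pool requires holding spa_namespace_lock for the
 * entire walk, since spa_next() only asserts that the lock is held:
 *
 *	mutex_enter(&spa_namespace_lock);
 *	for (spa_t *spa = spa_next(NULL); spa != NULL;
 *	    spa = spa_next(spa)) {
 *		... inspect spa; do not drop the lock ...
 *	}
 *	mutex_exit(&spa_namespace_lock);
 */
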
842 /*
1126 spa_aux_activate(vd, &spa_l2cache_avl);
1127 mutex_exit(&spa_l2cache_lock);
1128 }
1129
1130 /*
1131 * ==========================================================================
1132 * SPA vdev locking
1133 * ==========================================================================
1134 */
1135
1136 /*
1137 * Lock the given spa_t for the purpose of adding or removing a vdev.
1138 * Grabs the global spa_namespace_lock plus the spa config lock for writing.
1139 * It returns the next transaction group for the spa_t.
1140 */
1141 uint64_t
1142 spa_vdev_enter(spa_t *spa)
1143 {
1144 mutex_enter(&spa->spa_vdev_top_lock);
1145 mutex_enter(&spa_namespace_lock);
1146 mutex_enter(&spa->spa_auto_trim_lock);
1147 mutex_enter(&spa->spa_man_trim_lock);
1148 spa_trim_stop_wait(spa);
1149 return (spa_vdev_config_enter(spa));
1150 }
1151
1152 /*
1153 * Internal implementation for spa_vdev_enter(). Used when a vdev
1154 * operation requires multiple syncs (e.g. removing a device) while
1155 * keeping the spa_namespace_lock held.
1156 */
1157 uint64_t
1158 spa_vdev_config_enter(spa_t *spa)
1159 {
1160 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1161
1162 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1163
1164 return (spa_last_synced_txg(spa) + 1);
1165 }
1166
1167 /*
1168 * Used in combination with spa_vdev_config_enter() to allow the syncing
1177
1178 ASSERT(txg > spa_last_synced_txg(spa));
1179
1180 spa->spa_pending_vdev = NULL;
1181
1182 /*
1183 * Reassess the DTLs.
1184 */
1185 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
1186
1187 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
1188 config_changed = B_TRUE;
1189 spa->spa_config_generation++;
1190 }
1191
1192 /*
1193 * Verify the metaslab classes.
1194 */
1195 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
1196 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
1197 ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
1198
1199 spa_config_exit(spa, SCL_ALL, spa);
1200
1201 /*
1202 * Panic the system if the specified tag requires it. This
1203 * is useful for ensuring that configurations are updated
1204 * transactionally.
1205 */
1206 if (zio_injection_enabled)
1207 zio_handle_panic_injection(spa, tag, 0);
1208
1209 /*
1210 * Note: this txg_wait_synced() is important because it ensures
1211 * that there won't be more than one config change per txg.
1212 * This allows us to use the txg as the generation number.
1213 */
1214 if (error == 0)
1215 txg_wait_synced(spa->spa_dsl_pool, txg);
1216
1217 if (vd != NULL) {
1218 ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
1219 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1220 vdev_free(vd);
1221 spa_config_exit(spa, SCL_ALL, spa);
1222 }
1223
1224 /*
1225 * If the config changed, update the config cache.
1226 */
1227 if (config_changed)
1228 spa_config_sync(spa, B_FALSE, B_TRUE);
1229 }
1230
1231 /*
1232 * Unlock the spa_t after adding or removing a vdev. Besides undoing the
1233 * locking of spa_vdev_enter(), we also want to make sure the transactions have
1234 * synced to disk, and then update the global configuration cache with the new
1235 * information.
1236 */
1237 int
1238 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
1239 {
1240 spa_vdev_config_exit(spa, vd, txg, error, FTAG);
1241 mutex_exit(&spa->spa_man_trim_lock);
1242 mutex_exit(&spa->spa_auto_trim_lock);
1243 mutex_exit(&spa_namespace_lock);
1244 mutex_exit(&spa->spa_vdev_top_lock);
1245
1246 return (error);
1247 }
1248
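/*
 * Illustrative sketch (not part of the original code): a typical vdev
 * reconfiguration brackets its work with the two calls above and funnels
 * its error through spa_vdev_exit(), so the locks taken by spa_vdev_enter()
 * are released on both the success and failure paths. The vdev passed to
 * spa_vdev_exit() (NULL below) is freed by spa_vdev_config_exit() when
 * it is non-NULL:
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *	error = ... modify the vdev tree for this txg ...;
 *	return (spa_vdev_exit(spa, NULL, txg, error));
 */
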
1249 /*
1250 * Lock the given spa_t for the purpose of changing vdev state.
1251 */
1252 void
1253 spa_vdev_state_enter(spa_t *spa, int oplocks)
1254 {
1255 int locks = SCL_STATE_ALL | oplocks;
1256
1257 /*
1258 * Root pools may need to read from the underlying devfs filesystem
1259 * when opening up a vdev. Unfortunately if we're holding the
1260 * SCL_ZIO lock it will result in a deadlock when we try to issue
1261 * the read from the root filesystem. Instead we "prefetch"
1262 * the associated vnodes that we need prior to opening the
1294 if (spa_is_root(spa))
1295 vdev_rele(spa->spa_root_vdev);
1296
1297 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
1298 spa_config_exit(spa, spa->spa_vdev_locks, spa);
1299
1300 /*
1301 * If anything changed, wait for it to sync. This ensures that,
1302 * from the system administrator's perspective, zpool(1M) commands
1303 * are synchronous. This is important for things like zpool offline:
1304 * when the command completes, you expect no further I/O from ZFS.
1305 */
1306 if (vd != NULL)
1307 txg_wait_synced(spa->spa_dsl_pool, 0);
1308
1309 /*
1310 * If the config changed, update the config cache.
1311 */
1312 if (config_changed) {
1313 mutex_enter(&spa_namespace_lock);
1314 spa_config_sync(spa, B_FALSE, B_TRUE);
1315 mutex_exit(&spa_namespace_lock);
1316 }
1317
1318 return (error);
1319 }
1320
1321 /*
1322 * ==========================================================================
1323 * Miscellaneous functions
1324 * ==========================================================================
1325 */
1326
1327 void
1328 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
1329 {
1330 if (!nvlist_exists(spa->spa_label_features, feature)) {
1331 fnvlist_add_boolean(spa->spa_label_features, feature);
1332 /*
1333 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
1334 * dirty the vdev config because the SCL_CONFIG lock is not held.
1372 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1373
1374 avl_remove(&spa_namespace_avl, spa);
1375 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name));
1376 avl_add(&spa_namespace_avl, spa);
1377
1378 /*
1379 * Sync all labels to disk with the new names by marking the root vdev
1380 * dirty and waiting for it to sync. It will pick up the new pool name
1381 * during the sync.
1382 */
1383 vdev_config_dirty(spa->spa_root_vdev);
1384
1385 spa_config_exit(spa, SCL_ALL, FTAG);
1386
1387 txg_wait_synced(spa->spa_dsl_pool, 0);
1388
1389 /*
1390 * Sync the updated config cache.
1391 */
1392 spa_config_sync(spa, B_FALSE, B_TRUE);
1393
1394 spa_close(spa, FTAG);
1395
1396 mutex_exit(&spa_namespace_lock);
1397
1398 return (0);
1399 }
1400
1401 /*
1402 * Return the spa_t associated with given pool_guid, if it exists. If
1403 * device_guid is non-zero, determine whether the pool exists *and* contains
1404 * a device with the specified device_guid.
1405 */
1406 spa_t *
1407 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
1408 {
1409 spa_t *spa;
1410 avl_tree_t *t = &spa_namespace_avl;
1411
1412 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1430 if (spa->spa_pending_vdev) {
1431 if (vdev_lookup_by_guid(spa->spa_pending_vdev,
1432 device_guid) != NULL)
1433 break;
1434 }
1435 }
1436 }
1437
1438 return (spa);
1439 }
1440
1441 /*
1442 * Determine whether a pool with the given pool_guid exists.
1443 */
1444 boolean_t
1445 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
1446 {
1447 return (spa_by_guid(pool_guid, device_guid) != NULL);
1448 }
1449
1450 /*
1451 * Similar to spa_guid_exists, but uses the spa_config_guid and doesn't
1452 * filter the check by pool state (as spa_guid_exists does). This is
1453 * used to protect against attempting to spa_add the same pool (with the
1454 * same pool GUID) under different names. This situation can happen if
1455 * the boot_archive contains an outdated zpool.cache file after a pool
1456 * rename. That would make us import the pool twice, resulting in data
1457 * corruption. Normally the boot_archive shouldn't contain a zpool.cache
1458 * file, but if due to misconfiguration it does, this function serves as
1459 * a failsafe to prevent the double import.
1460 */
1461 boolean_t
1462 spa_config_guid_exists(uint64_t pool_guid)
1463 {
1464 spa_t *spa;
1465
1466 ASSERT(MUTEX_HELD(&spa_namespace_lock));
1467 if (pool_guid == 0)
1468 return (B_FALSE);
1469
1470 for (spa = avl_first(&spa_namespace_avl); spa != NULL;
1471 spa = AVL_NEXT(&spa_namespace_avl, spa)) {
1472 if (spa->spa_config_guid == pool_guid)
1473 return (B_TRUE);
1474 }
1475
1476 return (B_FALSE);
1477 }
1478
1479 char *
1480 spa_strdup(const char *s)
1481 {
1482 size_t len;
1483 char *new;
1484
1485 len = strlen(s);
1486 new = kmem_alloc(len + 1, KM_SLEEP);
1487 bcopy(s, new, len);
1488 new[len] = '\0';
1489
1490 return (new);
1491 }
1492
1493 void
1494 spa_strfree(char *s)
1495 {
1496 kmem_free(s, strlen(s) + 1);
1497 }
1498
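/*
 * Usage note (added for clarity): spa_strdup() allocates strlen() + 1
 * bytes, so the result must be released with spa_strfree(), which derives
 * the allocation size the same way:
 *
 *	char *path = spa_strdup(spa_config_path);
 *	...
 *	spa_strfree(path);
 */
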
1617 */
1618
1619 boolean_t
1620 spa_shutting_down(spa_t *spa)
1621 {
1622 return (spa->spa_async_suspended);
1623 }
1624
1625 dsl_pool_t *
1626 spa_get_dsl(spa_t *spa)
1627 {
1628 return (spa->spa_dsl_pool);
1629 }
1630
1631 boolean_t
1632 spa_is_initializing(spa_t *spa)
1633 {
1634 return (spa->spa_is_initializing);
1635 }
1636
1637 blkptr_t *
1638 spa_get_rootblkptr(spa_t *spa)
1639 {
1640 return (&spa->spa_ubsync.ub_rootbp);
1641 }
1642
1643 void
1644 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
1645 {
1646 spa->spa_uberblock.ub_rootbp = *bp;
1647 }
1648
1649 void
1650 spa_altroot(spa_t *spa, char *buf, size_t buflen)
1651 {
1652 if (spa->spa_root == NULL)
1653 buf[0] = '\0';
1654 else
1655 (void) strncpy(buf, spa->spa_root, buflen);
1656 }
1743 spa_load_state_t
1744 spa_load_state(spa_t *spa)
1745 {
1746 return (spa->spa_load_state);
1747 }
1748
1749 uint64_t
1750 spa_freeze_txg(spa_t *spa)
1751 {
1752 return (spa->spa_freeze_txg);
1753 }
1754
1755 /* ARGSUSED */
1756 uint64_t
1757 spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
1758 {
1759 return (lsize * spa_asize_inflation);
1760 }
1761
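/*
 * Worked example (illustrative, assuming the default spa_asize_inflation
 * of 24): a 128K logical write is charged 128K * 24 = 3M of worst-case
 * allocated space when reserving room for a transaction.
 */
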
1762 /*
1763 * Get either the on-disk (phys == B_TRUE) or the possible in-core DDT size.
1764 */
1765 uint64_t
1766 spa_get_ddts_size(spa_t *spa, boolean_t phys)
1767 {
1768 if (phys)
1769 return (spa->spa_ddt_dsize);
1770
1771 return (spa->spa_ddt_msize);
1772 }
1773
1774 /*
1775 * Check to see if we need to stop DDT growth to stay within the configured limit.
1776 */
1777 boolean_t
1778 spa_enable_dedup_cap(spa_t *spa)
1779 {
1780 if (zfs_ddt_byte_ceiling != 0) {
1781 if (zfs_ddts_msize > zfs_ddt_byte_ceiling) {
1782 /* need to limit DDT to an in core bytecount */
1783 return (B_TRUE);
1784 }
1785 } else if (zfs_ddt_limit_type == DDT_LIMIT_TO_ARC) {
1786 if (zfs_ddts_msize > *arc_ddt_evict_threshold) {
1787 /* need to limit DDT to fit into ARC */
1788 return (B_TRUE);
1789 }
1790 } else if (zfs_ddt_limit_type == DDT_LIMIT_TO_L2ARC) {
1791 if (spa->spa_l2arc_ddt_devs_size != 0) {
1792 if (spa_get_ddts_size(spa, B_TRUE) >
1793 spa->spa_l2arc_ddt_devs_size) {
1794 /* limit DDT to fit into L2ARC DDT dev */
1795 return (B_TRUE);
1796 }
1797 } else if (zfs_ddts_msize > *arc_ddt_evict_threshold) {
1798 /* no L2ARC DDT dev - limit DDT to fit into ARC */
1799 return (B_TRUE);
1800 }
1801 }
1802
1803 return (B_FALSE);
1804 }
1805
1806 /*
1807 * Return the amount of slop space in bytes. It is 1/32 of the pool (~3.1%),
1808 * or at least 128MB, unless that would cause it to be more than half the
1809 * pool size.
1810 *
1811 * See the comment above spa_slop_shift for details.
1812 */
1813 uint64_t
1814 spa_get_slop_space(spa_t *spa)
1815 {
1816 uint64_t space = spa_get_dspace(spa);
1817 return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
1818 }
1819
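/*
 * Worked examples (illustrative, assuming the default spa_slop_shift of 5
 * and spa_min_slop of 128M):
 *
 *	10T pool:	MAX(10T >> 5, MIN(5T, 128M))	= 320G of slop
 *	1G pool:	MAX(1G >> 5, MIN(512M, 128M))	= 128M of slop
 *	100M pool:	MAX(100M >> 5, MIN(50M, 128M))	= 50M of slop
 */
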
1820 uint64_t
1821 spa_get_dspace(spa_t *spa)
1822 {
1823 return (spa->spa_dspace);
1824 }
1825
1826 void
1827 spa_update_dspace(spa_t *spa)
1828 {
1829 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
1830 ddt_get_dedup_dspace(spa);
1831 }
1832
1833 /*
1834 * EXPERIMENTAL
1835 * Use exponential moving average to track root vdev iotime, as well as top
1836 * level vdev iotime.
1837 * The principle: avg_new = avg_prev + (cur - avg_prev) * a / 100; a is
1838 * tuneable. For example, if a = 10 (alpha = 0.1), it takes roughly 1/alpha =
1839 * 10 iterations, or 50 seconds at 5-second txg commit intervals, for the most
1840 * recent samples to account for about 65% of the moving average.
1841 * Currently, the challenge is that we keep track of iotime in cumulative
1842 * nanoseconds since zpool import, both for leaf and top vdevs, so a way of
1843 * getting delta pre/post txg commit is required.
1844 */
1845
1846 void
1847 spa_update_latency(spa_t *spa)
1848 {
1849 vdev_t *rvd = spa->spa_root_vdev;
1850 vdev_stat_t *rvs = &rvd->vdev_stat;
1851 for (int c = 0; c < rvd->vdev_children; c++) {
1852 vdev_t *cvd = rvd->vdev_child[c];
1853 vdev_stat_t *cvs = &cvd->vdev_stat;
1854 mutex_enter(&rvd->vdev_stat_lock);
1855
1856 for (int t = 0; t < ZIO_TYPES; t++) {
1857
1858 /*
1859 * Non-trivial bit here. We update the moving latency
1860 * average for each child vdev separately, but since we
1861 * want the average to settle at the same rate
1862 * regardless of top level vdev count, we effectively
1863 * divide our alpha by number of children of the root
1864 * vdev to account for that.
1865 */
1866 rvs->vs_latency[t] += ((((int64_t)cvs->vs_latency[t] -
1867 (int64_t)rvs->vs_latency[t]) *
1868 (int64_t)zfs_root_latency_alpha) / 100) /
1869 (int64_t)(rvd->vdev_children);
1870 }
1871 mutex_exit(&rvd->vdev_stat_lock);
1872 }
1873 }
1874
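/*
 * Numeric illustration (added for clarity): with zfs_root_latency_alpha
 * set to 10 and a two-child root vdev, a child reporting a current
 * latency of 20ms against a running average of 10ms nudges the root
 * average by ((20ms - 10ms) * 10 / 100) / 2 = 0.5ms on this update.
 */
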
1875
1876 /*
1877 * Return the failure mode that has been set for this pool. The default
1878 * behavior will be to block all I/Os when a complete failure occurs.
1879 */
1880 uint8_t
1881 spa_get_failmode(spa_t *spa)
1882 {
1883 return (spa->spa_failmode);
1884 }
1885
1886 boolean_t
1887 spa_suspended(spa_t *spa)
1888 {
1889 return (spa->spa_suspended);
1890 }
1891
1892 uint64_t
1893 spa_version(spa_t *spa)
1894 {
1895 return (spa->spa_ubsync.ub_version);
1896 }
1897
1898 int
1899 spa_get_obj_mtx_sz(spa_t *spa)
1900 {
1901 return (spa->spa_obj_mtx_sz);
1902 }
1903
1904 boolean_t
1905 spa_deflate(spa_t *spa)
1906 {
1907 return (spa->spa_deflate);
1908 }
1909
1910 metaslab_class_t *
1911 spa_normal_class(spa_t *spa)
1912 {
1913 return (spa->spa_normal_class);
1914 }
1915
1916 metaslab_class_t *
1917 spa_log_class(spa_t *spa)
1918 {
1919 return (spa->spa_log_class);
1920 }
1921
1922 metaslab_class_t *
1923 spa_special_class(spa_t *spa)
1924 {
1925 return (spa->spa_special_class);
1926 }
1927
1928 void
1929 spa_evicting_os_register(spa_t *spa, objset_t *os)
1930 {
1931 mutex_enter(&spa->spa_evicting_os_lock);
1932 list_insert_head(&spa->spa_evicting_os_list, os);
1933 mutex_exit(&spa->spa_evicting_os_lock);
1934 }
1935
1936 void
1937 spa_evicting_os_deregister(spa_t *spa, objset_t *os)
1938 {
1939 mutex_enter(&spa->spa_evicting_os_lock);
1940 list_remove(&spa->spa_evicting_os_list, os);
1941 cv_broadcast(&spa->spa_evicting_os_cv);
1942 mutex_exit(&spa->spa_evicting_os_lock);
1943 }
1944
1945 void
1946 spa_evicting_os_wait(spa_t *spa)
1947 {
1948 mutex_enter(&spa->spa_evicting_os_lock);
1949 while (!list_is_empty(&spa->spa_evicting_os_list))
1950 cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
1951 mutex_exit(&spa->spa_evicting_os_lock);
1952
1953 dmu_buf_user_evict_wait();
1954 }
1955
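/*
 * Note (added for clarity): the three functions above maintain
 * spa_evicting_os_list, the set of objsets whose eviction is still in
 * flight for this spa. spa_evicting_os_wait() blocks until that list
 * drains and then waits for any pending dbuf user evictions, so callers
 * can be sure no evicting objset still references the spa.
 */
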
1956 uint64_t
1957 spa_class_alloc_percentage(metaslab_class_t *mc)
1958 {
1959 uint64_t capacity = mc->mc_space;
1960 uint64_t alloc = mc->mc_alloc;
1961 uint64_t one_percent = capacity / 100;
1962
1963 return (one_percent == 0 ? 0 : alloc / one_percent);
1964 }
1965
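/*
 * Illustrative example: a class with mc_space = 1000G and mc_alloc = 250G
 * reports 250G / (1000G / 100) = 25 (percent). The division is guarded
 * because an empty class (e.g. one with no special vdevs configured) has
 * zero capacity.
 */
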
1966 int
1967 spa_max_replication(spa_t *spa)
1968 {
1969 /*
1970 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
1971 * handle BPs with more than one DVA allocated. Set our max
1972 * replication level accordingly.
1973 */
1974 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
1975 return (1);
1976 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
1977 }
1978
1979 int
1980 spa_prev_software_version(spa_t *spa)
1981 {
1982 return (spa->spa_prev_software_version);
1983 }
1984
1985 uint64_t
1986 spa_deadman_synctime(spa_t *spa)
1987 {
1988 return (spa->spa_deadman_synctime);
1989 }
1990
1991 spa_force_trim_t
1992 spa_get_force_trim(spa_t *spa)
1993 {
1994 return (spa->spa_force_trim);
1995 }
1996
1997 spa_auto_trim_t
1998 spa_get_auto_trim(spa_t *spa)
1999 {
2000 return (spa->spa_auto_trim);
2001 }
2002
2003 uint64_t
2004 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
2005 {
2006 uint64_t asize = DVA_GET_ASIZE(dva);
2007 uint64_t dsize = asize;
2008
2009 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2010
2011 if (asize != 0 && spa->spa_deflate) {
2012 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
2013 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
2014 }
2015
2016 return (dsize);
2017 }
2018
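/*
 * Worked example (illustrative): a plain (non-raidz) top-level vdev has a
 * vdev_deflate_ratio of 512, so an asize of 128K yields
 * dsize = (128K >> SPA_MINBLOCKSHIFT) * 512 = 256 * 512 = 128K, i.e. no
 * deflation. A raidz vdev has a smaller ratio, so the same asize is
 * accounted as a proportionally smaller dsize.
 */
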
2019 /*
2020 * This function walks over all the DVAs of the given BP and
2021 * adds up their sizes.
2022 */
2023 uint64_t
2024 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
2025 {
2026 /*
2027 * A SPECIAL BP has two DVAs, but DVA[0] in this case is a
2028 * temporary DVA, and after migration only DVA[1]
2029 * contains valid data. Therefore, for these BPs we start
2030 * walking from DVA[1].
2031 */
2032 int start_dva = BP_IS_SPECIAL(bp) ? 1 : 0;
2033 uint64_t dsize = 0;
2034
2035 for (int d = start_dva; d < BP_GET_NDVAS(bp); d++) {
2036 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
2037 }
2038
2039 return (dsize);
2040 }
2041
2042 uint64_t
2043 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
2044 {
2045 uint64_t dsize;
2046
2047 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2048
2049 dsize = bp_get_dsize_sync(spa, bp);
2050
2051 spa_config_exit(spa, SCL_VDEV, FTAG);
2052
2053 return (dsize);
2054 }
2055
2056 /*
2057 * ==========================================================================
2058 * Initialization and Termination
2059 * ==========================================================================
2060 */
2061
2062 static int
2063 spa_name_compare(const void *a1, const void *a2)
2064 {
2065 const spa_t *s1 = a1;
2066 const spa_t *s2 = a2;
2067 int s;
2068
2069 s = strcmp(s1->spa_name, s2->spa_name);
2088
2089 void
2090 spa_init(int mode)
2091 {
2092 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
2093 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
2094 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
2095 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
2096
2097 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
2098 offsetof(spa_t, spa_avl));
2099
2100 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
2101 offsetof(spa_aux_t, aux_avl));
2102
2103 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
2104 offsetof(spa_aux_t, aux_avl));
2105
2106 spa_mode_global = mode;
2107
2108 /*
2109 * logevent_max_q_sz from log_sysevent.c gives us an upper bound on
2110 * the number of taskq entries; queueing of sysevents is serialized,
2111 * so there is no need for more than one worker thread.
2112 */
2113 spa_sysevent_taskq = taskq_create("spa_sysevent_tq", 1,
2114 minclsyspri, 1, 5000, TASKQ_DYNAMIC);
2115
2116 #ifdef _KERNEL
2117 spa_arch_init();
2118 #else
2119 if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
2120 arc_procfd = open("/proc/self/ctl", O_WRONLY);
2121 if (arc_procfd == -1) {
2122 perror("could not enable watchpoints: "
2123 "opening /proc/self/ctl failed: ");
2124 } else {
2125 arc_watch = B_TRUE;
2126 }
2127 }
2128 #endif
2129
2130 refcount_init();
2131 unique_init();
2132 range_tree_init();
2133 metaslab_alloc_trace_init();
2134 zio_init();
2135 dmu_init();
2136 zil_init();
2137 vdev_cache_stat_init();
2138 zfs_prop_init();
2139 zpool_prop_init();
2140 zpool_feature_init();
2141 vdev_prop_init();
2142 cos_prop_init();
2143 spa_config_load();
2144 l2arc_start();
2145 ddt_init();
2146 dsl_scan_global_init();
2147 }
2148
2149 void
2150 spa_fini(void)
2151 {
2152 ddt_fini();
2153
2154 l2arc_stop();
2155
2156 spa_evict_all();
2157
2158 vdev_cache_stat_fini();
2159 zil_fini();
2160 dmu_fini();
2161 zio_fini();
2162 metaslab_alloc_trace_fini();
2163 range_tree_fini();
2164 unique_fini();
2165 refcount_fini();
2166
2167 taskq_destroy(spa_sysevent_taskq);
2168
2169 avl_destroy(&spa_namespace_avl);
2170 avl_destroy(&spa_spare_avl);
2171 avl_destroy(&spa_l2cache_avl);
2172
2173 cv_destroy(&spa_namespace_cv);
2174 mutex_destroy(&spa_namespace_lock);
2175 mutex_destroy(&spa_spare_lock);
2176 mutex_destroy(&spa_l2cache_lock);
2177 }
2178
2179 /*
2180 * Return whether this pool has slogs. No locking needed.
2181 * It's not a problem if the wrong answer is returned as it's only for
2182 * performance and not correctness.
2183 */
2184 boolean_t
2185 spa_has_slogs(spa_t *spa)
2186 {
2187 return (spa->spa_log_class->mc_rotor != NULL);
2188 }
2191 spa_get_log_state(spa_t *spa)
2192 {
2193 return (spa->spa_log_state);
2194 }
2195
2196 void
2197 spa_set_log_state(spa_t *spa, spa_log_state_t state)
2198 {
2199 spa->spa_log_state = state;
2200 }
2201
2202 boolean_t
2203 spa_is_root(spa_t *spa)
2204 {
2205 return (spa->spa_is_root);
2206 }
2207
2208 boolean_t
2209 spa_writeable(spa_t *spa)
2210 {
2211 return (!!(spa->spa_mode & FWRITE));
2212 }
2213
2214 /*
2215 * Returns true if there is a pending sync task in any of the current
2216 * syncing txg, the current quiescing txg, or the current open txg.
2217 */
2218 boolean_t
2219 spa_has_pending_synctask(spa_t *spa)
2220 {
2221 return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
2222 }
2223
2224 boolean_t
2225 spa_has_special(spa_t *spa)
2226 {
2227 return (spa->spa_special_class->mc_rotor != NULL);
2228 }
2229
2230 int
2231 spa_mode(spa_t *spa)
2232 {
2233 return (spa->spa_mode);
2234 }
2235
2236 uint64_t
2237 spa_bootfs(spa_t *spa)
2238 {
2239 return (spa->spa_bootfs);
2240 }
2241
2242 uint64_t
2243 spa_delegation(spa_t *spa)
2244 {
2245 return (spa->spa_delegation);
2246 }
2247
2248 objset_t *
2249 spa_meta_objset(spa_t *spa)
2254 enum zio_checksum
2255 spa_dedup_checksum(spa_t *spa)
2256 {
2257 return (spa->spa_dedup_checksum);
2258 }
2259
2260 /*
2261 * Reset pool scan stat per scan pass (or reboot).
2262 */
2263 void
2264 spa_scan_stat_init(spa_t *spa)
2265 {
2266 /* data not stored on disk */
2267 spa->spa_scan_pass_start = gethrestime_sec();
2268 if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
2269 spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
2270 else
2271 spa->spa_scan_pass_scrub_pause = 0;
2272 spa->spa_scan_pass_scrub_spent_paused = 0;
2273 spa->spa_scan_pass_exam = 0;
2274 spa->spa_scan_pass_work = 0;
2275 vdev_scan_stat_init(spa->spa_root_vdev);
2276 }
2277
2278 /*
2279 * Get scan stats for zpool status reports
2280 */
2281 int
2282 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
2283 {
2284 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
2285
2286 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
2287 return (SET_ERROR(ENOENT));
2288 bzero(ps, sizeof (pool_scan_stat_t));
2289
2290 /* data stored on disk */
2291 ps->pss_func = scn->scn_phys.scn_func;
2292 ps->pss_start_time = scn->scn_phys.scn_start_time;
2293 ps->pss_end_time = scn->scn_phys.scn_end_time;
2294 ps->pss_to_examine = scn->scn_phys.scn_to_examine;
2295 ps->pss_examined = scn->scn_phys.scn_examined;
2296 ps->pss_to_process = scn->scn_phys.scn_to_process;
2297 ps->pss_processed = scn->scn_phys.scn_processed;
2298 ps->pss_errors = scn->scn_phys.scn_errors;
2299 ps->pss_state = scn->scn_phys.scn_state;
2300 mutex_enter(&scn->scn_status_lock);
2301 ps->pss_issued = scn->scn_bytes_issued;
2302 mutex_exit(&scn->scn_status_lock);
2303
2304 /* data not stored on disk */
2305 ps->pss_pass_start = spa->spa_scan_pass_start;
2306 ps->pss_pass_exam = spa->spa_scan_pass_exam;
2307 ps->pss_pass_work = spa->spa_scan_pass_work;
2308 ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
2309 ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
2310
2311 return (0);
2312 }
2313
2314 boolean_t
2315 spa_debug_enabled(spa_t *spa)
2316 {
2317 return (spa->spa_debug);
2318 }
2319
2320 int
2321 spa_maxblocksize(spa_t *spa)
2322 {
2323 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
2324 return (SPA_MAXBLOCKSIZE);
2325 else
2326 return (SPA_OLD_MAXBLOCKSIZE);
2327 }
2328
2329 boolean_t
2330 spa_wbc_present(spa_t *spa)
2331 {
2332 return (spa->spa_wbc_mode != WBC_MODE_OFF);
2333 }
2334
2335 boolean_t
2336 spa_wbc_active(spa_t *spa)
2337 {
2338 return (spa->spa_wbc_mode == WBC_MODE_ACTIVE);
2339 }
2340
2341 int
2342 spa_wbc_mode(const char *name)
2343 {
2344 int ret = 0;
2345 spa_t *spa;
2346
2347 mutex_enter(&spa_namespace_lock);
2348 spa = spa_lookup(name);
2349 if (!spa) {
2350 mutex_exit(&spa_namespace_lock);
2351 return (-1);
2352 }
2353
2354 ret = (int)spa->spa_wbc_mode;
2355 mutex_exit(&spa_namespace_lock);
2356 return (ret);
2357 }
2358
2359 struct zfs_autosnap *
2360 spa_get_autosnap(spa_t *spa)
2361 {
2362 return (&spa->spa_autosnap);
2363 }
2364
2365 wbc_data_t *
2366 spa_get_wbc_data(spa_t *spa)
2367 {
2368 return (&spa->spa_wbc);
2369 }
2370
2371 /*
2372 * Creates the trim kstats structure for a spa.
2373 */
2374 static void
2375 spa_trimstats_create(spa_t *spa)
2376 {
2377 /* truncate pool name to accommodate the "_trimstats" suffix */
2378 char short_spa_name[KSTAT_STRLEN - 10];
2379 char name[KSTAT_STRLEN];
2380
2381 ASSERT3P(spa->spa_trimstats, ==, NULL);
2382 ASSERT3P(spa->spa_trimstats_ks, ==, NULL);
2383
2384 (void) snprintf(short_spa_name, sizeof (short_spa_name), "%s",
2385 spa->spa_name);
2386 (void) snprintf(name, sizeof (name), "%s_trimstats", short_spa_name);
2387
2388 spa->spa_trimstats_ks = kstat_create("zfs", 0, name, "misc",
2389 KSTAT_TYPE_NAMED, sizeof (*spa->spa_trimstats) /
2390 sizeof (kstat_named_t), 0);
2391 if (spa->spa_trimstats_ks) {
2392 spa->spa_trimstats = spa->spa_trimstats_ks->ks_data;
2393
2394 #ifdef _KERNEL
2395 kstat_named_init(&spa->spa_trimstats->st_extents,
2396 "extents", KSTAT_DATA_UINT64);
2397 kstat_named_init(&spa->spa_trimstats->st_bytes,
2398 "bytes", KSTAT_DATA_UINT64);
2399 kstat_named_init(&spa->spa_trimstats->st_extents_skipped,
2400 "extents_skipped", KSTAT_DATA_UINT64);
2401 kstat_named_init(&spa->spa_trimstats->st_bytes_skipped,
2402 "bytes_skipped", KSTAT_DATA_UINT64);
2403 kstat_named_init(&spa->spa_trimstats->st_auto_slow,
2404 "auto_slow", KSTAT_DATA_UINT64);
2405 #endif /* _KERNEL */
2406
2407 kstat_install(spa->spa_trimstats_ks);
2408 } else {
2409 cmn_err(CE_NOTE, "!Cannot create trim kstats for pool %s",
2410 spa->spa_name);
2411 }
2412 }
2413
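/*
 * Example (added for clarity): for a pool named "tank" this publishes the
 * kstat zfs:0:tank_trimstats. Pool names longer than KSTAT_STRLEN - 11
 * characters are truncated so that the "_trimstats" suffix still fits.
 */
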
2414 /*
2415 * Destroys the trim kstats for a spa.
2416 */
2417 static void
2418 spa_trimstats_destroy(spa_t *spa)
2419 {
2420 if (spa->spa_trimstats_ks) {
2421 kstat_delete(spa->spa_trimstats_ks);
2422 spa->spa_trimstats = NULL;
2423 spa->spa_trimstats_ks = NULL;
2424 }
2425 }
2426
2427 /*
2428 * Updates the numerical trim kstats for a spa.
2429 */
2430 void
2431 spa_trimstats_update(spa_t *spa, uint64_t extents, uint64_t bytes,
2432 uint64_t extents_skipped, uint64_t bytes_skipped)
2433 {
2434 spa_trimstats_t *st = spa->spa_trimstats;
2435 if (st) {
2436 atomic_add_64(&st->st_extents.value.ui64, extents);
2437 atomic_add_64(&st->st_bytes.value.ui64, bytes);
2438 atomic_add_64(&st->st_extents_skipped.value.ui64,
2439 extents_skipped);
2440 atomic_add_64(&st->st_bytes_skipped.value.ui64,
2441 bytes_skipped);
2442 }
2443 }
2444
2445 /*
2446 * Increments the slow-trim kstat for a spa.
2447 */
2448 void
2449 spa_trimstats_auto_slow_incr(spa_t *spa)
2450 {
2451 spa_trimstats_t *st = spa->spa_trimstats;
2452 if (st)
2453 atomic_inc_64(&st->st_auto_slow.value.ui64);
2454 }
2455
2456 /*
2457 * Creates the taskq used for dispatching auto-trim. This is called only when
2458 * the property is set to `on' or when the pool is loaded (and the autotrim
2459 * property is `on').
2460 */
2461 void
2462 spa_auto_trim_taskq_create(spa_t *spa)
2463 {
2464 char name[MAXPATHLEN];
2465 ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock));
2466 ASSERT(spa->spa_auto_trim_taskq == NULL);
2467 (void) snprintf(name, sizeof (name), "%s_auto_trim", spa->spa_name);
2468 spa->spa_auto_trim_taskq = taskq_create(name, 1, minclsyspri, 1,
2469 spa->spa_root_vdev->vdev_children, TASKQ_DYNAMIC);
2470 VERIFY(spa->spa_auto_trim_taskq != NULL);
2471 }
2472
2473 /*
2474 * Creates the taskq for dispatching manual trim. This taskq is recreated
2475 * each time `zpool trim <poolname>' is issued, and destroyed from an async
2476 * spa request after the run completes.
2477 */
2478 void
2479 spa_man_trim_taskq_create(spa_t *spa)
2480 {
2481 char name[MAXPATHLEN];
2482 ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock));
2483 spa_async_unrequest(spa, SPA_ASYNC_MAN_TRIM_TASKQ_DESTROY);
2484 if (spa->spa_man_trim_taskq != NULL)
2485 /*
2486 * The async taskq destroy has been pre-empted, so just
2487 * return, the taskq is still good to use.
2488 */
2489 return;
2490 (void) snprintf(name, sizeof (name), "%s_man_trim", spa->spa_name);
2491 spa->spa_man_trim_taskq = taskq_create(name, 1, minclsyspri, 1,
2492 spa->spa_root_vdev->vdev_children, TASKQ_DYNAMIC);
2493 VERIFY(spa->spa_man_trim_taskq != NULL);
2494 }
2495
2496 /*
2497 * Destroys the taskq created in spa_auto_trim_taskq_create. The taskq
2498 * is only destroyed when the autotrim property is set to `off'.
2499 */
2500 void
2501 spa_auto_trim_taskq_destroy(spa_t *spa)
2502 {
2503 ASSERT(MUTEX_HELD(&spa->spa_auto_trim_lock));
2504 ASSERT(spa->spa_auto_trim_taskq != NULL);
2505 while (spa->spa_num_auto_trimming != 0)
2506 cv_wait(&spa->spa_auto_trim_done_cv, &spa->spa_auto_trim_lock);
2507 taskq_destroy(spa->spa_auto_trim_taskq);
2508 spa->spa_auto_trim_taskq = NULL;
2509 }
2510
2511 /*
2512 * Destroys the taskq created in spa_man_trim_taskq_create. The taskq is
2513 * destroyed from an async spa request after a manual trim run completes.
2514 * There is a bit of lag between an async request being issued at the
2515 * completion of a trim run and it finally being acted on, which is why this
2516 * function checks whether new manual trimming threads have been re-spawned.
2517 * If they have, we assume the async spa request has been preempted by another
2518 * manual trim request and we back off.
2519 */
2520 void
2521 spa_man_trim_taskq_destroy(spa_t *spa)
2522 {
2523 ASSERT(MUTEX_HELD(&spa->spa_man_trim_lock));
2524 ASSERT(spa->spa_man_trim_taskq != NULL);
2525 if (spa->spa_num_man_trimming != 0)
2526 /* another trim got started before we got here, back off */
2527 return;
2528 taskq_destroy(spa->spa_man_trim_taskq);
2529 spa->spa_man_trim_taskq = NULL;
2530 }
|