1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 #include <sys/zfs_context.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/space_map.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 #include <sys/spa_impl.h>
37 #include <sys/zfeature.h>
38 #include <sys/wbc.h>
39
40 #define GANG_ALLOCATION(flags) \
41 ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
42
43 uint64_t metaslab_aliquot = 512ULL << 10;
44 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
45
46 /*
47 * The in-core space map representation is more compact than its on-disk form.
48 * The zfs_condense_pct determines how much more compact the in-core
49 * space map representation must be before we compact it on-disk.
50 * Values should be greater than or equal to 100.
51 */
52 int zfs_condense_pct = 200;
53
54 /*
55 * Condensing a metaslab is not guaranteed to actually reduce the amount of
56 * space used on disk. In particular, a space map uses data in increments of
57 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
58 * same number of blocks after condensing. Since the goal of condensing is to
59 * reduce the number of IOPs required to read the space map, we only want to
60 * condense when we can be sure we will reduce the number of blocks used by the
61 * space map. Unfortunately, we cannot precisely compute whether or not this is
62 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
63 * we apply the following heuristic: do not condense a spacemap unless the
64 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
65 * blocks.
66 */
67 int zfs_metaslab_condense_block_threshold = 4;
68
69 /*
70 * The zfs_mg_noalloc_threshold defines which metaslab groups should
71 * be eligible for allocation. The value is defined as a percentage of
72 * free space. Metaslab groups that have more free space than
73 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
74 * a metaslab group's free space is less than or equal to the
75 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
76 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
77 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
78 * groups are allowed to accept allocations. Gang blocks are always
79 * eligible to allocate on any metaslab group. The default value of 0 means
80 * no metaslab group will be excluded based on this criterion.
81 */
82 int zfs_mg_noalloc_threshold = 0;
83
/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
92
93 /*
94 * Allow metaslabs to keep their active state as long as their fragmentation
95 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
96 * active metaslab that exceeds this threshold will no longer keep its active
97 * status allowing better metaslabs to be selected.
98 */
99 int zfs_metaslab_fragmentation_threshold = 70;
100
101 /*
102 * When set will load all metaslabs when pool is first opened.
103 */
104 int metaslab_debug_load = 0;
105
106 /*
107 * When set will prevent metaslabs from being unloaded.
108 */
109 int metaslab_debug_unload = 0;
110
/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
118
119 /*
120 * The minimum free space, in percent, which must be available
121 * in a space map to continue allocations in a first-fit fashion.
122 * Once the space map's free space drops below this level we dynamically
123 * switch to using best-fit allocations.
124 */
125 int metaslab_df_free_pct = 4;
126
127 /*
128 * A metaslab is considered "free" if it contains a contiguous
129 * segment which is greater than metaslab_min_alloc_size.
130 */
131 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
132
133 /*
134 * Percentage of all cpus that can be used by the metaslab taskq.
135 */
136 int metaslab_load_pct = 50;
137
138 /*
139 * Determines how many txgs a metaslab may remain loaded without having any
140 * allocations from it. As long as a metaslab continues to be used we will
141 * keep it loaded.
142 */
143 int metaslab_unload_delay = TXG_SIZE * 2;
144
145 /*
146 * Max number of metaslabs per group to preload.
147 */
148 int metaslab_preload_limit = SPA_DVAS_PER_BP;
149
150 /*
151 * Enable/disable preloading of metaslab.
152 */
153 boolean_t metaslab_preload_enabled = B_TRUE;
154
155 /*
156 * Enable/disable fragmentation weighting on metaslabs.
157 */
158 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
159
160 /*
161 * Enable/disable lba weighting (i.e. outer tracks are given preference).
162 */
163 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
164
165 /*
166 * Enable/disable metaslab group biasing.
167 */
168 boolean_t metaslab_bias_enabled = B_TRUE;
169
170 /*
171 * Enable/disable segment-based metaslab selection.
172 */
173 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
174
175 /*
176 * When using segment-based metaslab selection, we will continue
177 * allocating from the active metaslab until we have exhausted
178 * zfs_metaslab_switch_threshold of its buckets.
179 */
180 int zfs_metaslab_switch_threshold = 2;
181
182 /*
183 * Internal switch to enable/disable the metaslab allocation tracing
184 * facility.
185 */
186 boolean_t metaslab_trace_enabled = B_TRUE;
187
/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;
197
198 static uint64_t metaslab_weight(metaslab_t *);
199 static void metaslab_set_fragmentation(metaslab_t *);
200
201 kmem_cache_t *metaslab_alloc_trace_cache;
202
203 /*
204 * Toggle between space-based DVA allocator 0, latency-based 1 or hybrid 2.
205 * A value other than 0, 1 or 2 will be considered 0 (default).
206 */
207 int metaslab_alloc_dva_algorithm = 0;
208
209 /*
210 * How many TXG's worth of updates should be aggregated per TRIM/UNMAP
211 * issued to the underlying vdev. We keep two range trees of extents
212 * (called "trim sets") to be trimmed per metaslab, the `current' and
213 * the `previous' TS. New free's are added to the current TS. Then,
214 * once `zfs_txgs_per_trim' transactions have elapsed, the `current'
215 * TS becomes the `previous' TS and a new, blank TS is created to be
216 * the new `current', which will then start accumulating any new frees.
217 * Once another zfs_txgs_per_trim TXGs have passed, the previous TS's
218 * extents are trimmed, the TS is destroyed and the current TS again
219 * becomes the previous TS.
 * This serves to fulfill two functions: aggregate many small frees
 * into fewer larger trim operations (which should help with devices
 * which do not take so kindly to them) and to allow for disaster
 * recovery (extents won't get trimmed immediately, but instead only
 * after passing this rather long timeout, thus preserving
 * 'zfs import -F' functionality).
226 */
227 unsigned int zfs_txgs_per_trim = 32;
228
229 static void metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size);
230 static void metaslab_trim_add(void *arg, uint64_t offset, uint64_t size);
231
232 static zio_t *metaslab_exec_trim(metaslab_t *msp);
233
234 static metaslab_trimset_t *metaslab_new_trimset(uint64_t txg, kmutex_t *lock);
235 static void metaslab_free_trimset(metaslab_trimset_t *ts);
236 static boolean_t metaslab_check_trim_conflict(metaslab_t *msp,
237 uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit);
238
239 /*
240 * ==========================================================================
241 * Metaslab classes
242 * ==========================================================================
243 */
244 metaslab_class_t *
245 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
246 {
247 metaslab_class_t *mc;
248
249 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
250
251 mutex_init(&mc->mc_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
252 avl_create(&mc->mc_alloc_tree, zio_bookmark_compare,
253 sizeof (zio_t), offsetof(zio_t, io_alloc_node));
254
255 mc->mc_spa = spa;
256 mc->mc_rotor = NULL;
257 mc->mc_ops = ops;
258 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
259 refcount_create_tracked(&mc->mc_alloc_slots);
260
261 return (mc);
262 }
263
/*
 * Tear down a metaslab class. The class must already be quiesced:
 * no groups remain on the rotor and all space accounting has drained
 * back to zero.
 */
void
metaslab_class_destroy(metaslab_class_t *mc)
{
	/* All groups must have been passivated and removed first. */
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	avl_destroy(&mc->mc_alloc_tree);
	mutex_destroy(&mc->mc_alloc_lock);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}
280
/*
 * Walk the circular rotor list and sanity-check each metaslab group's
 * linkage: the group must belong to a top-level vdev and point back at
 * this class. Always returns 0; violations trip ASSERTs (debug builds).
 */
int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	/* An empty rotor (no groups) is trivially valid. */
	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}
306
/*
 * Atomically apply signed deltas to the class-wide space accounting:
 * allocated space, deferred frees, raw space, and deflated space.
 * Callers may pass negative deltas to decrease a counter.
 */
void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}
316
/* Return the amount of space currently allocated from this class. */
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}
322
/* Return the amount of space in deferred (not-yet-applied) frees. */
uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}
328
/* Return the total (raw, undeflated) space managed by this class. */
uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}
334
335 uint64_t
336 metaslab_class_get_dspace(metaslab_class_t *mc)
337 {
338 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
339 }
340
/*
 * Debug-only check (gated on ZFS_DEBUG_HISTOGRAM_VERIFY): verify that
 * the class-wide histogram equals the sum of the histograms of every
 * metaslab group belonging to this class. Panics via VERIFY on mismatch.
 */
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/* Accumulate this group's contribution per bucket. */
		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
376
377 /*
378 * Calculate the metaslab class's fragmentation metric. The metric
379 * is weighted based on the space contribution of each metaslab group.
380 * The return value will be a number between 0 and 100 (inclusive), or
381 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
382 * zfs_frag_table for more information about the metric.
383 */
384 uint64_t
385 metaslab_class_fragmentation(metaslab_class_t *mc)
386 {
387 vdev_t *rvd = mc->mc_spa->spa_root_vdev;
388 uint64_t fragmentation = 0;
389
390 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
391
392 for (int c = 0; c < rvd->vdev_children; c++) {
393 vdev_t *tvd = rvd->vdev_child[c];
394 metaslab_group_t *mg = tvd->vdev_mg;
395
396 /*
397 * Skip any holes, uninitialized top-levels, or
398 * vdevs that are not in this metalab class.
399 */
400 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
401 mg->mg_class != mc) {
402 continue;
403 }
404
405 /*
406 * If a metaslab group does not contain a fragmentation
407 * metric then just bail out.
408 */
409 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
410 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
411 return (ZFS_FRAG_INVALID);
412 }
413
414 /*
415 * Determine how much this metaslab_group is contributing
416 * to the overall pool fragmentation metric.
417 */
418 fragmentation += mg->mg_fragmentation *
419 metaslab_group_get_space(mg);
420 }
421 fragmentation /= metaslab_class_get_space(mc);
422
423 ASSERT3U(fragmentation, <=, 100);
424 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
425 return (fragmentation);
426 }
427
/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/* Skip holes, uninitialized top-levels, and other classes. */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by efi system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		/* Round down to a whole number of metaslabs. */
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}
466
467 static int
468 metaslab_compare(const void *x1, const void *x2)
469 {
470 const metaslab_t *m1 = x1;
471 const metaslab_t *m2 = x2;
472
473 if (m1->ms_weight < m2->ms_weight)
474 return (1);
475 if (m1->ms_weight > m2->ms_weight)
476 return (-1);
477
478 /*
479 * If the weights are identical, use the offset to force uniqueness.
480 */
481 if (m1->ms_start < m2->ms_start)
482 return (-1);
483 if (m1->ms_start > m2->ms_start)
484 return (1);
485
486 ASSERT3P(m1, ==, m2);
487
488 return (0);
489 }
490
/*
 * Verify that the space accounting on disk matches the in-core range_trees.
 */
void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t allocated = 0;
	uint64_t sm_free_space, msp_free_space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* Verification is opt-in via the debug flag. */
	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
		return;

	/*
	 * We can only verify the metaslab space when we're called
	 * from syncing context with a loaded metaslab that has an allocated
	 * space map. Calling this in non-syncing context does not
	 * provide a consistent view of the metaslab since we're performing
	 * allocations in the future.
	 */
	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
	    !msp->ms_loaded)
		return;

	/* Free space per the on-disk space map (plus pending delta). */
	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
	    space_map_alloc_delta(msp->ms_sm);

	/*
	 * Account for future allocations since we would have already
	 * deducted that space from the ms_freetree.
	 */
	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
		allocated +=
		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
	}

	/* Free space per the in-core trees and deferred-free accounting. */
	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
	    msp->ms_deferspace + range_tree_space(msp->ms_freedtree);

	VERIFY3U(sm_free_space, ==, msp_free_space);
}
534
535 /*
536 * ==========================================================================
537 * Metaslab groups
538 * ==========================================================================
539 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is below
 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 *
 * Lock order: mg_lock is taken before mc_lock.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	/* Percentage of free capacity; +1 guards against vs_space == 0. */
	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}
620
/*
 * Allocate and initialize a metaslab group for the given top-level vdev
 * and attach it to the given class. The group starts deactivated
 * (activation count 0) and is not linked into the class rotor until
 * metaslab_group_activate() is called. Also creates the taskq used for
 * asynchronous metaslab loading/preloading.
 */
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	/* Metaslabs are kept sorted by weight (see metaslab_compare). */
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	/* metaslab_load_pct is a percentage of all CPUs (see tunable above). */
	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}
642
/*
 * Free a metaslab group. The group must be unlinked from the rotor
 * (mg_prev/mg_next NULL) and must not be active.
 */
void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	/* The taskq may never have been created for this group. */
	if (mg->mg_taskq)
		taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}
662
/*
 * Activate a metaslab group: bump its activation count and, once it
 * becomes positive, splice the group into the class's circular rotor
 * list and make it the current rotor position.
 */
void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	/* Still net-passivated (e.g. vdev being removed); don't link in. */
	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		/* First group: it forms a one-element circular list. */
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		/* Insert after the current rotor position. */
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}
694
/*
 * Passivate a metaslab group: drop its activation count and, once it
 * reaches zero, drain outstanding taskq work and unlink the group from
 * the class's circular rotor list.
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		/* Never activated (count went negative); nothing to unlink. */
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/* Wait for in-flight metaslab preload/load tasks to finish. */
	taskq_wait(mg->mg_taskq);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		/* Last group in the class: the rotor becomes empty. */
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}
728
729 boolean_t
730 metaslab_group_initialized(metaslab_group_t *mg)
731 {
732 vdev_t *vd = mg->mg_vd;
733 vdev_stat_t *vs = &vd->vdev_stat;
734
735 return (vs->vs_space != 0 && mg->mg_activation_count > 0);
736 }
737
738 uint64_t
739 metaslab_group_get_space(metaslab_group_t *mg)
740 {
741 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
742 }
743
/*
 * Debug-only check (gated on ZFS_DEBUG_HISTOGRAM_VERIFY): verify that the
 * group's histogram equals the sum of its metaslabs' space map histograms.
 * Space map buckets are shifted up by the vdev's ashift when compared
 * against the group's range-tree-sized histogram.
 */
void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	/* The ashift offset must not push buckets past the array end. */
	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		/* Metaslabs without a space map contribute nothing. */
		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}
777
/*
 * Add a metaslab's space map histogram into its group's and class's
 * histograms. Buckets are offset by the vdev's ashift. No-op for
 * metaslabs without a space map.
 */
static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	/* mg_lock protects both mg_histogram and mc_histogram updates here. */
	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
797
/*
 * Subtract a metaslab's space map histogram from its group's and class's
 * histograms (inverse of metaslab_group_histogram_add). The group/class
 * buckets must be at least as large as what is being removed.
 */
void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		/* Underflow here would indicate corrupted accounting. */
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}
822
/*
 * Attach a metaslab to a group: insert it into the weight-sorted AVL
 * tree (with weight 0) and fold its histogram into the group/class
 * totals. The metaslab must not already belong to a group.
 */
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	/* Histogram update requires ms_lock; taken after mg_lock is dropped. */
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}
837
/*
 * Detach a metaslab from its group: subtract its histogram from the
 * group/class totals, then remove it from the group's AVL tree
 * (inverse of metaslab_group_add).
 */
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}
851
/*
 * Re-key a metaslab within its group's weight-sorted AVL tree by
 * removing it, updating its weight, and re-inserting it.
 */
static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}
869
870 /*
871 * Calculate the fragmentation for a given metaslab group. We can use
872 * a simple average here since all metaslabs within the group must have
873 * the same size. The return value will be a value between 0 and 100
874 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this
875 * group have a fragmentation metric.
876 */
877 uint64_t
878 metaslab_group_fragmentation(metaslab_group_t *mg)
879 {
880 vdev_t *vd = mg->mg_vd;
881 uint64_t fragmentation = 0;
882 uint64_t valid_ms = 0;
883
884 for (int m = 0; m < vd->vdev_ms_count; m++) {
885 metaslab_t *msp = vd->vdev_ms[m];
886
887 if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
888 continue;
889
890 valid_ms++;
891 fragmentation += msp->ms_fragmentation;
892 }
893
894 if (valid_ms <= vd->vdev_ms_count / 2)
895 return (ZFS_FRAG_INVALID);
896
897 fragmentation /= valid_ms;
898 ASSERT3U(fragmentation, <=, 100);
899 return (fragmentation);
900 }
901
/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 *
 * Returns B_TRUE if the caller should attempt allocation from this group.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}
999
1000 /*
1001 * ==========================================================================
1002 * Range tree callbacks
1003 * ==========================================================================
1004 */
1005
1006 /*
1007 * Comparison function for the private size-ordered tree. Tree is sorted
1008 * by size, larger sizes at the end of the tree.
1009 */
1010 static int
1011 metaslab_rangesize_compare(const void *x1, const void *x2)
1012 {
1013 const range_seg_t *r1 = x1;
1014 const range_seg_t *r2 = x2;
1015 uint64_t rs_size1 = r1->rs_end - r1->rs_start;
1016 uint64_t rs_size2 = r2->rs_end - r2->rs_start;
1017
1018 if (rs_size1 < rs_size2)
1019 return (-1);
1020 if (rs_size1 > rs_size2)
1021 return (1);
1022
1023 if (r1->rs_start < r2->rs_start)
1024 return (-1);
1025
1026 if (r1->rs_start > r2->rs_start)
1027 return (1);
1028
1029 return (0);
1030 }
1031
1032 /*
1033 * Create any block allocator specific components. The current allocators
1034 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
1035 */
1036 static void
1037 metaslab_rt_create(range_tree_t *rt, void *arg)
1038 {
1039 metaslab_t *msp = arg;
1040
1041 ASSERT3P(rt->rt_arg, ==, msp);
1042 ASSERT(msp->ms_tree == NULL);
1043
1044 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1045 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1046 }
1047
1048 /*
1049 * Destroy the block allocator specific components.
1050 */
1051 static void
1052 metaslab_rt_destroy(range_tree_t *rt, void *arg)
1053 {
1054 metaslab_t *msp = arg;
1055
1056 ASSERT3P(rt->rt_arg, ==, msp);
1057 ASSERT3P(msp->ms_tree, ==, rt);
1058 ASSERT0(avl_numnodes(&msp->ms_size_tree));
1059
1060 avl_destroy(&msp->ms_size_tree);
1061 }
1062
1063 static void
1064 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
1065 {
1066 metaslab_t *msp = arg;
1067
1068 ASSERT3P(rt->rt_arg, ==, msp);
1069 ASSERT3P(msp->ms_tree, ==, rt);
1070 VERIFY(!msp->ms_condensing);
1071 avl_add(&msp->ms_size_tree, rs);
1072 }
1073
1074 static void
1075 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
1076 {
1077 metaslab_t *msp = arg;
1078
1079 ASSERT3P(rt->rt_arg, ==, msp);
1080 ASSERT3P(msp->ms_tree, ==, rt);
1081 VERIFY(!msp->ms_condensing);
1082 avl_remove(&msp->ms_size_tree, rs);
1083 }
1084
1085 static void
1086 metaslab_rt_vacate(range_tree_t *rt, void *arg)
1087 {
1088 metaslab_t *msp = arg;
1089
1090 ASSERT3P(rt->rt_arg, ==, msp);
1091 ASSERT3P(msp->ms_tree, ==, rt);
1092
1093 /*
1094 * Normally one would walk the tree freeing nodes along the way.
1095 * Since the nodes are shared with the range trees we can avoid
1096 * walking all nodes and just reinitialize the avl tree. The nodes
1097 * will be freed by the range tree, so we don't want to free them here.
1098 */
1099 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
1100 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
1101 }
1102
/*
 * Callback vector hooked into the metaslab's ms_tree so the size-sorted
 * AVL tree (ms_size_tree) stays in sync with the range tree. Entries are
 * positional: create, destroy, add, remove, vacate.
 */
static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};
1110
1111 /*
1112 * ==========================================================================
1113 * Common allocator routines
1114 * ==========================================================================
1115 */
1116
1117 /*
1118 * Return the maximum contiguous segment within the metaslab.
1119 */
1120 uint64_t
1121 metaslab_block_maxsize(metaslab_t *msp)
1122 {
1123 avl_tree_t *t = &msp->ms_size_tree;
1124 range_seg_t *rs;
1125
1126 if (t == NULL || (rs = avl_last(t)) == NULL)
1127 return (0ULL);
1128
1129 return (rs->rs_end - rs->rs_start);
1130 }
1131
1132 static range_seg_t *
1133 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
1134 {
1135 range_seg_t *rs, rsearch;
1136 avl_index_t where;
1137
1138 rsearch.rs_start = start;
1139 rsearch.rs_end = start + size;
1140
1141 rs = avl_find(t, &rsearch, &where);
1142 if (rs == NULL) {
1143 rs = avl_nearest(t, where, AVL_AFTER);
1144 }
1145
1146 return (rs);
1147 }
1148
1149 /*
1150 * This is a helper function that can be used by the allocator to find
1151 * a suitable block to allocate. This will search the specified AVL
1152 * tree looking for a block that matches the specified criteria.
1153 */
1154 static uint64_t
1155 metaslab_block_picker(metaslab_t *msp, avl_tree_t *t, uint64_t *cursor,
1156 uint64_t size, uint64_t align)
1157 {
1158 range_seg_t *rs = metaslab_block_find(t, *cursor, size);
1159
1160 for (; rs != NULL; rs = AVL_NEXT(t, rs)) {
1161 uint64_t offset = P2ROUNDUP(rs->rs_start, align);
1162
1163 if (offset + size <= rs->rs_end &&
1164 !metaslab_check_trim_conflict(msp, &offset, size, align,
1165 rs->rs_end)) {
1166 *cursor = offset + size;
1167 return (offset);
1168 }
1169 }
1170
1171 /*
1172 * If we know we've searched the whole map (*cursor == 0), give up.
1173 * Otherwise, reset the cursor to the beginning and try again.
1174 */
1175 if (*cursor == 0)
1176 return (-1ULL);
1177
1178 *cursor = 0;
1179 return (metaslab_block_picker(msp, t, cursor, size, align));
1180 }
1181
1182 /*
1183 * ==========================================================================
1184 * The first-fit block allocator
1185 * ==========================================================================
1186 */
1187 static uint64_t
1188 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
1189 {
1190 /*
1191 * Find the largest power of 2 block size that evenly divides the
1192 * requested size. This is used to try to allocate blocks with similar
1193 * alignment from the same area of the metaslab (i.e. same cursor
1194 * bucket) but it does not guarantee that other allocations sizes
1195 * may exist in the same region.
1196 */
1197 uint64_t align = size & -size;
1198 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1199 avl_tree_t *t = &msp->ms_tree->rt_root;
1200
1201 return (metaslab_block_picker(msp, t, cursor, size, align));
1202 }
1203
/* Ops vector plugging the first-fit allocator into the generic code. */
static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};
1207
1208 /*
1209 * ==========================================================================
1210 * Dynamic block allocator -
1211 * Uses the first fit allocation scheme until space get low and then
1212 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
1213 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
1214 * ==========================================================================
1215 */
1216 static uint64_t
1217 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1218 {
1219 /*
1220 * Find the largest power of 2 block size that evenly divides the
1221 * requested size. This is used to try to allocate blocks with similar
1222 * alignment from the same area of the metaslab (i.e. same cursor
1223 * bucket) but it does not guarantee that other allocations sizes
1224 * may exist in the same region.
1225 */
1226 uint64_t align = size & -size;
1227 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1228 range_tree_t *rt = msp->ms_tree;
1229 avl_tree_t *t = &rt->rt_root;
1230 uint64_t max_size = metaslab_block_maxsize(msp);
1231 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1232
1233 ASSERT(MUTEX_HELD(&msp->ms_lock));
1234 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1235
1236 if (max_size < size)
1237 return (-1ULL);
1238
1239 /*
1240 * If we're running low on space switch to using the size
1241 * sorted AVL tree (best-fit).
1242 */
1243 if (max_size < metaslab_df_alloc_threshold ||
1244 free_pct < metaslab_df_free_pct) {
1245 t = &msp->ms_size_tree;
1246 *cursor = 0;
1247 }
1248
1249 return (metaslab_block_picker(msp, t, cursor, size, 1ULL));
1250 }
1251
/* Ops vector plugging the dynamic-fit allocator into the generic code. */
static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};
1255
1256 /*
1257 * ==========================================================================
1258 * Cursor fit block allocator -
1259 * Select the largest region in the metaslab, set the cursor to the beginning
1260 * of the range and the cursor_end to the end of the range. As allocations
1261 * are made advance the cursor. Continue allocating from the cursor until
1262 * the range is exhausted and then find a new range.
1263 * ==========================================================================
1264 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &msp->ms_size_tree;
	/* ms_lbas[0]/[1] hold the current region's cursor and its end. */
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	/*
	 * If the current region cannot hold the request, walk the
	 * size-sorted tree from largest to smallest looking for a new
	 * one. metaslab_check_trim_conflict() may adjust *cursor
	 * (presumably to step past a region being trimmed — defined
	 * elsewhere); a segment is kept only when the check passes.
	 * Note the loop condition stops at the first too-small segment,
	 * which the post-loop check below also treats as failure.
	 */
	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;
		for (rs = avl_last(&msp->ms_size_tree);
		    rs != NULL && rs->rs_end - rs->rs_start >= size;
		    rs = AVL_PREV(&msp->ms_size_tree, rs)) {
			*cursor = rs->rs_start;
			*cursor_end = rs->rs_end;
			if (!metaslab_check_trim_conflict(msp, cursor, size,
			    1, *cursor_end)) {
				/* segment appears to be acceptable */
				break;
			}
		}
		if (rs == NULL || rs->rs_end - rs->rs_start < size)
			return (-1ULL);
	}

	/* Allocate from the cursor and advance it past the region. */
	offset = *cursor;
	*cursor += size;

	return (offset);
}
1301
/* Ops vector plugging the cursor-fit allocator into the generic code. */
static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};
1305
1306 /*
1307 * ==========================================================================
1308 * New dynamic fit allocator -
1309 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1310 * contiguous blocks. If no region is found then just use the largest segment
1311 * that remains.
1312 * ==========================================================================
1313 */
1314
1315 /*
1316 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1317 * to request from the allocator.
1318 */
1319 uint64_t metaslab_ndf_clump_shift = 4;
1320
1321 static uint64_t
1322 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1323 {
1324 avl_tree_t *t = &msp->ms_tree->rt_root;
1325 avl_index_t where;
1326 range_seg_t *rs, rsearch;
1327 uint64_t hbit = highbit64(size);
1328 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1329 uint64_t max_size = metaslab_block_maxsize(msp);
1330 /* mutable copy for adjustment by metaslab_check_trim_conflict */
1331 uint64_t adjustable_start;
1332
1333 ASSERT(MUTEX_HELD(&msp->ms_lock));
1334 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
1335
1336 if (max_size < size)
1337 return (-1ULL);
1338
1339 rsearch.rs_start = *cursor;
1340 rsearch.rs_end = *cursor + size;
1341
1342 rs = avl_find(t, &rsearch, &where);
1343 if (rs != NULL)
1344 adjustable_start = rs->rs_start;
1345 if (rs == NULL || rs->rs_end - adjustable_start < size ||
1346 metaslab_check_trim_conflict(msp, &adjustable_start, size, 1,
1347 rs->rs_end)) {
1348 /* segment not usable, try the largest remaining one */
1349 t = &msp->ms_size_tree;
1350
1351 rsearch.rs_start = 0;
1352 rsearch.rs_end = MIN(max_size,
1353 1ULL << (hbit + metaslab_ndf_clump_shift));
1354 rs = avl_find(t, &rsearch, &where);
1355 if (rs == NULL)
1356 rs = avl_nearest(t, where, AVL_AFTER);
1357 ASSERT(rs != NULL);
1358 adjustable_start = rs->rs_start;
1359 if (rs->rs_end - adjustable_start < size ||
1360 metaslab_check_trim_conflict(msp, &adjustable_start,
1361 size, 1, rs->rs_end)) {
1362 /* even largest remaining segment not usable */
1363 return (-1ULL);
1364 }
1365 }
1366
1367 *cursor = adjustable_start + size;
1368 return (*cursor);
1369 }
1370
/* Ops vector plugging the new-dynamic-fit allocator into the generic code. */
static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};
1374
1375 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1376
1377 /*
1378 * ==========================================================================
1379 * Metaslabs
1380 * ==========================================================================
1381 */
1382
1383 /*
1384 * Wait for any in-progress metaslab loads to complete.
1385 */
void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * metaslab_load() broadcasts ms_load_cv when it finishes. Since
	 * ms_lock is dropped while sleeping, re-check ms_loading on every
	 * wakeup.
	 */
	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}
1396
/*
 * Load the metaslab's free space into the in-core ms_tree. Returns 0 on
 * success or the error from space_map_load(). Called with ms_lock held;
 * concurrent callers must wait via metaslab_load_wait().
 */
int
metaslab_load(metaslab_t *msp)
{
	int error = 0;
	boolean_t success = B_FALSE;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	/* Publish that a load is in progress (see metaslab_load_wait()). */
	msp->ms_loading = B_TRUE;

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to the
	 * ms_tree.
	 */
	if (msp->ms_sm != NULL)
		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	success = (error == 0);
	msp->ms_loading = B_FALSE;

	if (success) {
		ASSERT3P(msp->ms_group, !=, NULL);
		msp->ms_loaded = B_TRUE;

		/*
		 * Space in the defer trees is not yet allocatable: remove
		 * those ranges from the free tree, and pass them to
		 * metaslab_trim_remove (presumably dropping them from trim
		 * tracking — defined elsewhere).
		 */
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defertree[t],
			    range_tree_remove, msp->ms_tree);
			range_tree_walk(msp->ms_defertree[t],
			    metaslab_trim_remove, msp);
		}
		/* Cache the largest free segment for quick should-allocate checks. */
		msp->ms_max_size = metaslab_block_maxsize(msp);
	}
	/* Wake anyone blocked in metaslab_load_wait(). */
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}
1437
/*
 * Discard the metaslab's in-core free space, mark it unloaded and
 * inactive, and reset the cached max segment size. Called with ms_lock
 * held.
 */
void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_tree, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
	msp->ms_max_size = 0;
}
1447
1448 int
1449 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
1450 metaslab_t **msp)
1451 {
1452 vdev_t *vd = mg->mg_vd;
1453 objset_t *mos = vd->vdev_spa->spa_meta_objset;
1454 metaslab_t *ms;
1455 int error;
1456
1457 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
1458 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
1459 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
1460 cv_init(&ms->ms_trim_cv, NULL, CV_DEFAULT, NULL);
1461 ms->ms_id = id;
1462 ms->ms_start = id << vd->vdev_ms_shift;
1463 ms->ms_size = 1ULL << vd->vdev_ms_shift;
1464
1465 /*
1466 * We only open space map objects that already exist. All others
1467 * will be opened when we finally allocate an object for it.
1468 */
1469 if (object != 0) {
1470 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
1471 ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
1472
1473 if (error != 0) {
1474 kmem_free(ms, sizeof (metaslab_t));
1475 return (error);
1476 }
1477
1478 ASSERT(ms->ms_sm != NULL);
1479 }
1480
1481 ms->ms_cur_ts = metaslab_new_trimset(0, &ms->ms_lock);
1482
1483 /*
1484 * We create the main range tree here, but we don't create the
1485 * other range trees until metaslab_sync_done(). This serves
1486 * two purposes: it allows metaslab_sync_done() to detect the
1487 * addition of new space; and for debugging, it ensures that we'd
1488 * data fault on any attempt to use this metaslab before it's ready.
1489 */
1490 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
1491 metaslab_group_add(mg, ms);
1492
1493 metaslab_set_fragmentation(ms);
1494
1495 /*
1496 * If we're opening an existing pool (txg == 0) or creating
1497 * a new one (txg == TXG_INITIAL), all space is available now.
1498 * If we're adding space to an existing pool, the new space
1499 * does not become available until after this txg has synced.
1500 * The metaslab's weight will also be initialized when we sync
1501 * out this txg. This ensures that we don't attempt to allocate
1502 * from it before we have initialized it completely.
1503 */
1504 if (txg <= TXG_INITIAL)
1505 metaslab_sync_done(ms, 0);
1506
1507 /*
1508 * If metaslab_debug_load is set and we're initializing a metaslab
1509 * that has an allocated space map object then load the its space
1510 * map so that can verify frees.
1511 */
1512 if (metaslab_debug_load && ms->ms_sm != NULL) {
1513 mutex_enter(&ms->ms_lock);
1514 VERIFY0(metaslab_load(ms));
1515 mutex_exit(&ms->ms_lock);
1516 }
1517
1518 if (txg != 0) {
1519 vdev_dirty(vd, 0, NULL, txg);
1520 vdev_dirty(vd, VDD_METASLAB, ms, txg);
1521 }
1522
1523 *msp = ms;
1524
1525 return (0);
1526 }
1527
/*
 * Tear down and free a metaslab: detach it from its group, release its
 * space accounting, destroy all in-core trees and trim state, and free
 * the structure. The destruction order below is deliberate.
 */
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	/* Detach from the group first; this clears ms_group. */
	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);
	VERIFY(msp->ms_group == NULL);
	/* Return this metaslab's space accounting to the vdev. */
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	/* Empty ms_tree before destroying it and the other range trees. */
	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);
	range_tree_destroy(msp->ms_freeingtree);
	range_tree_destroy(msp->ms_freedtree);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_alloctree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defertree[t]);
	}

	/* No trim may be in flight on this metaslab at this point. */
	metaslab_free_trimset(msp->ms_cur_ts);
	if (msp->ms_prev_ts)
		metaslab_free_trimset(msp->ms_prev_ts);
	ASSERT3P(msp->ms_trimming_ts, ==, NULL);

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	cv_destroy(&msp->ms_trim_cv);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}
1568
#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
/* Entry i covers segments of size 2^(SPA_MINBLOCKSHIFT + i). */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};
1608
1609 /*
1610 * Calclate the metaslab's fragmentation metric. A return value
1611 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
1612 * not support this metric. Otherwise, the return value should be in the
1613 * range [0, 100].
1614 */
1615 static void
1616 metaslab_set_fragmentation(metaslab_t *msp)
1617 {
1618 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1619 uint64_t fragmentation = 0;
1620 uint64_t total = 0;
1621 boolean_t feature_enabled = spa_feature_is_enabled(spa,
1622 SPA_FEATURE_SPACEMAP_HISTOGRAM);
1623
1624 if (!feature_enabled) {
1625 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1626 return;
1627 }
1628
1629 /*
1630 * A null space map means that the entire metaslab is free
1631 * and thus is not fragmented.
1632 */
1633 if (msp->ms_sm == NULL) {
1634 msp->ms_fragmentation = 0;
1635 return;
1636 }
1637
1638 /*
1639 * If this metaslab's space map has not been upgraded, flag it
1640 * so that we upgrade next time we encounter it.
1641 */
1642 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
1643 uint64_t txg = spa_syncing_txg(spa);
1644 vdev_t *vd = msp->ms_group->mg_vd;
1645
1646 /*
1647 * If we've reached the final dirty txg, then we must
1648 * be shutting down the pool. We don't want to dirty
1649 * any data past this point so skip setting the condense
1650 * flag. We can retry this action the next time the pool
1651 * is imported.
1652 */
1653 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
1654 msp->ms_condense_wanted = B_TRUE;
1655 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1656 spa_dbgmsg(spa, "txg %llu, requesting force condense: "
1657 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
1658 vd->vdev_id);
1659 }
1660 msp->ms_fragmentation = ZFS_FRAG_INVALID;
1661 return;
1662 }
1663
1664 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
1665 uint64_t space = 0;
1666 uint8_t shift = msp->ms_sm->sm_shift;
1667
1668 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
1669 FRAGMENTATION_TABLE_SIZE - 1);
1670
1671 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1672 continue;
1673
1674 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
1675 total += space;
1676
1677 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
1678 fragmentation += space * zfs_frag_table[idx];
1679 }
1680
1681 if (total > 0)
1682 fragmentation /= total;
1683 ASSERT3U(fragmentation, <=, 100);
1684
1685 msp->ms_fragmentation = fragmentation;
1686 }
1687
1688 /*
1689 * Compute a weight -- a selection preference value -- for the given metaslab.
1690 * This is based on the amount of free space, the level of fragmentation,
1691 * the LBA range, and whether the metaslab is loaded.
1692 */
static uint64_t
metaslab_space_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!vd->vdev_removing);

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceed our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	/* Tag the result as a space-based weight (see WEIGHT_IS_SPACEBASED). */
	WEIGHT_SET_SPACEBASED(weight);
	return (weight);
}
1759
1760 /*
1761 * Return the weight of the specified metaslab, according to the segment-based
1762 * weighting algorithm. The metaslab must be loaded. This function can
1763 * be called within a sync pass since it relies only on the metaslab's
1764 * range tree which is always accurate when the metaslab is loaded.
1765 */
static uint64_t
metaslab_weight_from_range_tree(metaslab_t *msp)
{
	uint64_t weight = 0;
	uint32_t segments = 0;

	ASSERT(msp->ms_loaded);

	/*
	 * Walk the histogram from the largest bucket down. The running
	 * count is doubled at each step so counts from larger buckets are
	 * expressed as equivalents of the current bucket size before the
	 * current bucket's own count is added.
	 */
	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
	    i--) {
		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		segments <<= 1;
		segments += msp->ms_tree->rt_histogram[i];

		/*
		 * The range tree provides more precision than the space map
		 * and must be downgraded so that all values fit within the
		 * space map's histogram. This allows us to compare loaded
		 * vs. unloaded metaslabs to determine which metaslab is
		 * considered "best".
		 */
		if (i > max_idx)
			continue;

		/* Encode the first (largest) non-empty bucket and stop. */
		if (segments != 0) {
			WEIGHT_SET_COUNT(weight, segments);
			WEIGHT_SET_INDEX(weight, i);
			WEIGHT_SET_ACTIVE(weight, 0);
			break;
		}
	}
	return (weight);
}
1801
1802 /*
1803 * Calculate the weight based on the on-disk histogram. This should only
1804 * be called after a sync pass has completely finished since the on-disk
1805 * information is updated in metaslab_sync().
1806 */
1807 static uint64_t
1808 metaslab_weight_from_spacemap(metaslab_t *msp)
1809 {
1810 uint64_t weight = 0;
1811
1812 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
1813 if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
1814 WEIGHT_SET_COUNT(weight,
1815 msp->ms_sm->sm_phys->smp_histogram[i]);
1816 WEIGHT_SET_INDEX(weight, i +
1817 msp->ms_sm->sm_shift);
1818 WEIGHT_SET_ACTIVE(weight, 0);
1819 break;
1820 }
1821 }
1822 return (weight);
1823 }
1824
1825 /*
1826 * Compute a segment-based weight for the specified metaslab. The weight
1827 * is determined by highest bucket in the histogram. The information
1828 * for the highest bucket is encoded into the weight value.
1829 */
static uint64_t
metaslab_segment_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	uint64_t weight = 0;
	uint8_t shift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The metaslab is completely free.
	 */
	if (space_map_allocated(msp->ms_sm) == 0) {
		int idx = highbit64(msp->ms_size) - 1;
		/* Largest index the space map histogram can represent. */
		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;

		if (idx < max_idx) {
			WEIGHT_SET_COUNT(weight, 1ULL);
			WEIGHT_SET_INDEX(weight, idx);
		} else {
			/*
			 * The whole-metaslab segment exceeds the largest
			 * representable bucket; fold the excess size into
			 * the count at the maximum index instead.
			 */
			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
			WEIGHT_SET_INDEX(weight, max_idx);
		}
		WEIGHT_SET_ACTIVE(weight, 0);
		ASSERT(!WEIGHT_IS_SPACEBASED(weight));

		return (weight);
	}

	/* Segment weighting requires an upgraded (full-sized) space map. */
	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));

	/*
	 * If the metaslab is fully allocated then just make the weight 0.
	 */
	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
		return (0);
	/*
	 * If the metaslab is already loaded, then use the range tree to
	 * determine the weight. Otherwise, we rely on the space map information
	 * to generate the weight.
	 */
	if (msp->ms_loaded) {
		weight = metaslab_weight_from_range_tree(msp);
	} else {
		weight = metaslab_weight_from_spacemap(msp);
	}

	/*
	 * If the metaslab was active the last time we calculated its weight
	 * then keep it active. We want to consume the entire region that
	 * is associated with this weight.
	 */
	if (msp->ms_activation_weight != 0 && weight != 0)
		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
	return (weight);
}
1886
1887 /*
1888 * Determine if we should attempt to allocate from this metaslab. If the
1889 * metaslab has a maximum size then we can quickly determine if the desired
1890 * allocation size can be satisfied. Otherwise, if we're using segment-based
1891 * weighting then we can determine the maximum allocation that this metaslab
1892 * can accommodate based on the index encoded in the weight. If we're using
1893 * space-based weights then rely on the entire weight (excluding the weight
1894 * type bit).
1895 */
1896 boolean_t
1897 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
1898 {
1899 boolean_t should_allocate;
1900
1901 if (msp->ms_max_size != 0)
1902 return (msp->ms_max_size >= asize);
1903
1904 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
1905 /*
1906 * The metaslab segment weight indicates segments in the
1907 * range [2^i, 2^(i+1)), where i is the index in the weight.
1908 * Since the asize might be in the middle of the range, we
1909 * should attempt the allocation if asize < 2^(i+1).
1910 */
1911 should_allocate = (asize <
1912 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
1913 } else {
1914 should_allocate = (asize <=
1915 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
1916 }
1917 return (should_allocate);
1918 }
1919
/*
 * Compute the metaslab's weight, preferring segment-based weighting when
 * the spacemap-histogram feature is enabled and the space map is either
 * absent or upgraded, and falling back to space-based weighting
 * otherwise. Refreshes ms_fragmentation and ms_max_size as a side
 * effect. Called with ms_lock held.
 */
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	uint64_t weight;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * This vdev is in the process of being removed so there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing) {
		ASSERT0(space_map_allocated(msp->ms_sm));
		ASSERT0(vd->vdev_ms_shift);
		return (0);
	}

	metaslab_set_fragmentation(msp);

	/*
	 * Update the maximum size if the metaslab is loaded. This will
	 * ensure that we get an accurate maximum size if newly freed space
	 * has been added back into the free tree.
	 */
	if (msp->ms_loaded)
		msp->ms_max_size = metaslab_block_maxsize(msp);

	/*
	 * Segment-based weighting requires space map histogram support.
	 */
	if (zfs_metaslab_segment_weight_enabled &&
	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
	    sizeof (space_map_phys_t))) {
		weight = metaslab_segment_weight(msp);
	} else {
		weight = metaslab_space_weight(msp);
	}
	return (weight);
}
1962
/*
 * Activate the metaslab for allocation: make sure it is loaded and tag
 * its weight with the given activation bit. Returns 0 on success or the
 * error from metaslab_load(). Called with ms_lock held.
 */
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		/* Don't race a load already in progress. */
		metaslab_load_wait(msp);
		if (!msp->ms_loaded) {
			int error = metaslab_load(msp);
			if (error) {
				/* Demote to weight 0 so it sorts last. */
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
		}

		/*
		 * Remember the weight at activation time; it is used by
		 * metaslab_segment_may_passivate() and cleared again in
		 * metaslab_passivate().
		 */
		msp->ms_activation_weight = msp->ms_weight;
		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}
1987
/*
 * Deactivate the metaslab: clear its activation weight and re-sort it in
 * its group with the given (activation-bit-free) weight.
 */
static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;

	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE ||
	    range_tree_space(msp->ms_tree) == 0);
	ASSERT0(weight & METASLAB_ACTIVE_MASK);

	msp->ms_activation_weight = 0;
	metaslab_group_sort(msp->ms_group, msp, weight);
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
2006
/*
 * Segment-based metaslabs are activated once and remain active until
 * we either fail an allocation attempt (similar to space-based metaslabs)
 * or have exhausted the free space in zfs_metaslab_switch_threshold
 * buckets since the metaslab was activated. This function checks to see
 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
 * metaslab and passivates it proactively. This will allow us to select a
 * metaslab with a larger contiguous region, if any remain within this
 * metaslab group. If we're in sync pass > 1, then we continue using this
 * metaslab so that we don't dirty more blocks and cause more sync passes.
 */
void
metaslab_segment_may_passivate(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	/* Only segment-based metaslabs in sync pass 1 are eligible. */
	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
		return;

	/*
	 * Since we are in the middle of a sync pass, the most accurate
	 * information that is accessible to us is the in-core range tree
	 * histogram; calculate the new weight based on that information.
	 */
	uint64_t weight = metaslab_weight_from_range_tree(msp);
	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
	int current_idx = WEIGHT_GET_INDEX(weight);

	/*
	 * Passivate once we have dropped zfs_metaslab_switch_threshold
	 * histogram buckets below the weight at activation time.
	 */
	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
		metaslab_passivate(msp, weight);
}
2038
/*
 * Taskq callback: load a metaslab's space map into memory ahead of time
 * and refresh its selected txg so it is not immediately unloaded again.
 */
static void
metaslab_preload(void *arg)
{
	metaslab_t *msp = arg;
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	/* mg_lock must not be held: metaslab_load can block for I/O. */
	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));

	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);
	if (!msp->ms_loaded)
		(void) metaslab_load(msp);
	msp->ms_selected_txg = spa_syncing_txg(spa);
	mutex_exit(&msp->ms_lock);
}
2054
/*
 * Dispatch asynchronous preloads for the highest-weighted metaslabs in
 * this group (up to metaslab_preload_limit), plus any metaslab with a
 * pending forced condense. If preloading is disabled or the pool is
 * shutting down, drain any outstanding preload tasks instead.
 */
static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
		taskq_wait(mg->mg_taskq);
		return;
	}

	mutex_enter(&mg->mg_lock);
	/*
	 * Load the next potential metaslabs
	 */
	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
		/*
		 * We preload only the maximum number of metaslabs specified
		 * by metaslab_preload_limit. If a metaslab is being forced
		 * to condense then we preload it too. This will ensure
		 * that force condensing happens in the next txg.
		 */
		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
			continue;
		}

		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
		    msp, TQ_SLEEP) != NULL);
	}
	mutex_exit(&mg->mg_lock);
}
2088
/*
 * Determine if the space map's on-disk footprint is past our tolerance
 * for inefficiency. We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the free space range tree representation
 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
 */
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
	space_map_t *sm = msp->ms_sm;
	range_seg_t *rs;
	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
	dmu_object_info_t doi;
	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(msp->ms_loaded);

	/*
	 * Use the ms_size_tree range tree, which is ordered by size, to
	 * obtain the largest segment in the free tree. We always condense
	 * metaslabs that are empty and metaslabs for which a condense
	 * request has been made.
	 */
	rs = avl_last(&msp->ms_size_tree);
	if (rs == NULL || msp->ms_condense_wanted)
		return (B_TRUE);

	/*
	 * Calculate the number of 64-bit entries this segment would
	 * require when written to disk. If this single segment would be
	 * larger on-disk than the entire current on-disk structure, then
	 * clearly condensing will increase the on-disk structure size.
	 * (A run is capped at SM_RUN_MAX, so a long segment may need
	 * multiple entries.)
	 */
	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
	entries = size / (MIN(size, SM_RUN_MAX));
	segsz = entries * sizeof (uint64_t);

	/* Best-case estimate: one 64-bit entry per in-core segment. */
	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
	object_size = space_map_length(msp->ms_sm);

	dmu_object_info_from_db(sm->sm_dbuf, &doi);
	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);

	return (segsz <= object_size &&
	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
	    object_size > zfs_metaslab_condense_block_threshold * record_size);
}
2162
/*
 * Condense the on-disk space map representation to its minimized form.
 * The minimized form consists of a small number of allocations followed by
 * the entries of the free range tree.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);

	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
	    msp->ms_group->mg_vd->vdev_spa->spa_name,
	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
	    msp->ms_condense_wanted ? "TRUE" : "FALSE");

	msp->ms_condense_wanted = B_FALSE;

	/*
	 * Create a range tree that is 100% allocated. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these trees to
	 * have a small number of nodes.
	 */
	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	 * Remove what's been freed in this txg from the condense_tree.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freeingtree.
	 */
	range_tree_walk(msp->ms_freeingtree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defertree[t],
		    range_tree_remove, condense_tree);
	}

	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its content. Set the
	 * metaslab's ms_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_tree as all other range trees use per txg
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	/* Drop ms_lock across the DMU call to avoid lock inversion. */
	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free tree can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_tree. Instead we sync out two trees, a cheap
	 * allocation only tree followed by the in-core free tree. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_write(sm, condense_tree, SM_ALLOC, tx);
	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);

	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
	msp->ms_condensing = B_FALSE;
}
2248
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
	dmu_tx_t *tx;
	uint64_t object = space_map_object(msp->ms_sm);

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (msp->ms_freeingtree == NULL) {
		ASSERT3P(alloctree, ==, NULL);
		mutex_exit(&msp->ms_lock);
		return;
	}

	ASSERT3P(alloctree, !=, NULL);
	ASSERT3P(msp->ms_freeingtree, !=, NULL);
	ASSERT3P(msp->ms_freedtree, !=, NULL);

	/*
	 * Normally, we don't want to process a metaslab if there
	 * are no allocations or frees to perform. However, if the metaslab
	 * is being forced to condense and it's loaded, we need to let it
	 * through.
	 */
	if (range_tree_space(alloctree) == 0 &&
	    range_tree_space(msp->ms_freeingtree) == 0 &&
	    !(msp->ms_loaded && msp->ms_condense_wanted)) {
		mutex_exit(&msp->ms_lock);
		return;
	}

	VERIFY(txg <= spa_final_dirty_txg(spa));

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_tree. No other thread can
	 * be modifying this txg's alloctree, freeingtree, freedtree, or
	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
	 * space map ASSERTs. We drop it whenever we call into the DMU,
	 * because the DMU can call down to us (e.g. via zio_free()) at
	 * any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/* Lazily create the space map object the first time we sync. */
	if (msp->ms_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
		    &msp->ms_lock));
		ASSERT(msp->ms_sm != NULL);
	}

	/*
	 * Note: metaslab_condense() clears the space map's histogram.
	 * Therefore we must verify and remove this histogram before
	 * condensing.
	 */
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	metaslab_group_histogram_remove(mg, msp);

	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
		space_map_write(msp->ms_sm, msp->ms_freeingtree, SM_FREE, tx);
	}

	if (msp->ms_loaded) {
		/*
		 * When the space map is loaded, we have an accurate
		 * histogram in the range tree. This gives us an opportunity
		 * to bring the space map's histogram up-to-date so we clear
		 * it first before updating it.
		 */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);

		/*
		 * Since we've cleared the histogram we need to add back
		 * any free space that has already been processed, plus
		 * any deferred space. This allows the on-disk histogram
		 * to accurately reflect all free space even if some space
		 * is not yet available for allocation (i.e. deferred).
		 */
		space_map_histogram_add(msp->ms_sm, msp->ms_freedtree, tx);

		/*
		 * Add back any deferred free space that has not been
		 * added back into the in-core free tree yet. This will
		 * ensure that we don't end up with a space map histogram
		 * that is completely empty unless the metaslab is fully
		 * allocated.
		 */
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			space_map_histogram_add(msp->ms_sm,
			    msp->ms_defertree[t], tx);
		}
	}

	/*
	 * Always add the free space from this sync pass to the space
	 * map histogram. We want to make sure that the on-disk histogram
	 * accounts for all free space. If the space map is not loaded,
	 * then we will lose some accuracy but will correct it the next
	 * time we load the space map.
	 */
	space_map_histogram_add(msp->ms_sm, msp->ms_freeingtree, tx);

	metaslab_group_histogram_add(mg, msp);
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	/*
	 * For sync pass 1, we avoid traversing this txg's free range tree
	 * and instead will just swap the pointers for freeingtree and
	 * freedtree. We can safely do this since the freed_tree is
	 * guaranteed to be empty on the initial pass.
	 */
	if (spa_sync_pass(spa) == 1) {
		range_tree_swap(&msp->ms_freeingtree, &msp->ms_freedtree);
	} else {
		range_tree_vacate(msp->ms_freeingtree,
		    range_tree_add, msp->ms_freedtree);
	}
	range_tree_vacate(alloctree, NULL, NULL);

	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freeingtree));

	mutex_exit(&msp->ms_lock);

	/*
	 * If the space map object changed (e.g. it was just created),
	 * persist the new object number in the vdev's metaslab array.
	 */
	if (object != space_map_object(msp->ms_sm)) {
		object = space_map_object(msp->ms_sm);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    msp->ms_id, sizeof (uint64_t), &object, tx);
	}
	dmu_tx_commit(tx);
}
2409
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;
	boolean_t defer_allowed = B_TRUE;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * range trees and add its capacity to the vdev.
	 */
	if (msp->ms_freedtree == NULL) {
		for (int t = 0; t < TXG_SIZE; t++) {
			ASSERT(msp->ms_alloctree[t] == NULL);

			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
			    &msp->ms_lock);
		}

		ASSERT3P(msp->ms_freeingtree, ==, NULL);
		msp->ms_freeingtree = range_tree_create(NULL, msp,
		    &msp->ms_lock);

		ASSERT3P(msp->ms_freedtree, ==, NULL);
		msp->ms_freedtree = range_tree_create(NULL, msp,
		    &msp->ms_lock);

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			ASSERT(msp->ms_defertree[t] == NULL);

			msp->ms_defertree[t] = range_tree_create(NULL, msp,
			    &msp->ms_lock);
		}

		vdev_space_update(vd, 0, 0, msp->ms_size);
	}

	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];

	/*
	 * If the pool is low on space, stop deferring frees so that the
	 * space becomes available for allocation immediately.
	 */
	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
	    metaslab_class_get_alloc(spa_normal_class(spa));
	if (free_space <= spa_get_slop_space(spa)) {
		defer_allowed = B_FALSE;
	}

	defer_delta = 0;
	alloc_delta = space_map_alloc_delta(msp->ms_sm);
	if (defer_allowed) {
		defer_delta = range_tree_space(msp->ms_freedtree) -
		    range_tree_space(*defer_tree);
	} else {
		defer_delta -= range_tree_space(*defer_tree);
	}

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	/*
	 * If there's a metaslab_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 */
	metaslab_load_wait(msp);

	/*
	 * Move the frees from the defer_tree back to the free
	 * range tree (if it's loaded). Swap the freed_tree and the
	 * defer_tree -- this is safe to do because we've just emptied out
	 * the defer_tree.
	 */
	if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON &&
	    !vd->vdev_man_trimming) {
		/* Queue the newly usable extents for automatic TRIM. */
		range_tree_walk(*defer_tree, metaslab_trim_add, msp);
		if (!defer_allowed) {
			range_tree_walk(msp->ms_freedtree, metaslab_trim_add,
			    msp);
		}
	}
	range_tree_vacate(*defer_tree,
	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	if (defer_allowed) {
		range_tree_swap(&msp->ms_freedtree, defer_tree);
	} else {
		range_tree_vacate(msp->ms_freedtree,
		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	}

	space_map_update(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * Calculate the new weights before unloading any metaslabs.
	 * This will give us the most accurate weighting.
	 */
	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	/*
	 * If the metaslab is loaded and we've not tried to load or allocate
	 * from it in 'metaslab_unload_delay' txgs, then unload it.
	 */
	if (msp->ms_loaded &&
	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
			VERIFY0(range_tree_space(
			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
		}

		if (!metaslab_debug_unload)
			metaslab_unload(msp);
	}

	mutex_exit(&msp->ms_lock);
}
2542
/*
 * Re-evaluate a metaslab group after a sync: refresh its allocatability
 * and fragmentation metrics, then kick off preloads of likely candidates.
 */
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	metaslab_group_alloc_update(mg);
	mg->mg_fragmentation = metaslab_group_fragmentation(mg);

	/*
	 * Preload the next potential metaslabs
	 */
	metaslab_group_preload(mg);
}
2554
2555 static uint64_t
2556 metaslab_distance(metaslab_t *msp, dva_t *dva)
2557 {
2558 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
2559 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
2560 uint64_t start = msp->ms_id;
2561
2562 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
2563 return (1ULL << 63);
2564
2565 if (offset < start)
2566 return ((start - offset) << ms_shift);
2567 if (offset > start)
2568 return ((offset - start) << ms_shift);
2569 return (0);
2570 }
2571
/*
 * ==========================================================================
 * Metaslab allocation tracing facility
 * ==========================================================================
 */
/* kstat exposing allocation-trace statistics; NULL if creation failed. */
kstat_t *metaslab_trace_ksp;
/* Count of times the per-zio trace list overflowed its maximum length. */
kstat_named_t metaslab_trace_over_limit;
2579
/*
 * Set up the allocation tracing facility: create the kmem cache for
 * trace entries and install the overflow-counter kstat. Called once
 * during module/pool initialization.
 */
void
metaslab_alloc_trace_init(void)
{
	ASSERT(metaslab_alloc_trace_cache == NULL);
	metaslab_alloc_trace_cache = kmem_cache_create(
	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
	/* kstat_create can fail; tracing still works without the kstat. */
	if (metaslab_trace_ksp != NULL) {
		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
		kstat_named_init(&metaslab_trace_over_limit,
		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
		kstat_install(metaslab_trace_ksp);
	}
}
2596
/*
 * Tear down the allocation tracing facility: delete the kstat (if it was
 * created) and destroy the trace-entry kmem cache.
 */
void
metaslab_alloc_trace_fini(void)
{
	if (metaslab_trace_ksp != NULL) {
		kstat_delete(metaslab_trace_ksp);
		metaslab_trace_ksp = NULL;
	}
	kmem_cache_destroy(metaslab_alloc_trace_cache);
	metaslab_alloc_trace_cache = NULL;
}
2607
/*
 * Add an allocation trace element to the allocation tracing list.
 */
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
{
	if (!metaslab_trace_enabled)
		return;

	/*
	 * When the tracing list reaches its maximum we remove
	 * the second element in the list before adding a new one.
	 * By removing the second element we preserve the original
	 * entry as a clue to what allocation steps have already been
	 * performed.
	 */
	if (zal->zal_size == metaslab_trace_max_entries) {
		metaslab_alloc_trace_t *mat_next;
#ifdef DEBUG
		panic("too many entries in allocation list");
#endif
		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
		zal->zal_size--;
		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
		list_remove(&zal->zal_list, mat_next);
		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
	}

	metaslab_alloc_trace_t *mat =
	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
	list_link_init(&mat->mat_list_node);
	mat->mat_mg = mg;
	mat->mat_msp = msp;
	mat->mat_size = psize;
	mat->mat_dva_id = dva_id;
	/* offset may also encode a TRACE_* failure reason. */
	mat->mat_offset = offset;
	mat->mat_weight = 0;

	if (msp != NULL)
		mat->mat_weight = msp->ms_weight;

	/*
	 * The list is part of the zio so locking is not required. Only
	 * a single thread will perform allocations for a given zio.
	 */
	list_insert_tail(&zal->zal_list, mat);
	zal->zal_size++;

	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
}
2659
/*
 * Initialize a zio's (empty) allocation tracing list.
 */
void
metaslab_trace_init(zio_alloc_list_t *zal)
{
	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
	    offsetof(metaslab_alloc_trace_t, mat_list_node));
	zal->zal_size = 0;
}
2667
2668 void
2669 metaslab_trace_fini(zio_alloc_list_t *zal)
2670 {
2671 metaslab_alloc_trace_t *mat;
2672
2673 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
2674 kmem_cache_free(metaslab_alloc_trace_cache, mat);
2675 list_destroy(&zal->zal_list);
2676 zal->zal_size = 0;
2677 }
2678
2679 /*
2680 * ==========================================================================
2681 * Metaslab block operations
2682 * ==========================================================================
2683 */
2684
2685 static void
2686 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
2687 {
2688 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2689 flags & METASLAB_DONT_THROTTLE)
2690 return;
2691
2692 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2693 if (!mg->mg_class->mc_alloc_throttle_enabled)
2694 return;
2695
2696 (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
2697 }
2698
2699 void
2700 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
2701 {
2702 if (!(flags & METASLAB_ASYNC_ALLOC) ||
2703 flags & METASLAB_DONT_THROTTLE)
2704 return;
2705
2706 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
2707 if (!mg->mg_class->mc_alloc_throttle_enabled)
2708 return;
2709
2710 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
2711 }
2712
/*
 * Debug-only check that none of the block pointer's DVAs still hold a
 * throttling reference on their metaslab group's allocation queue.
 */
void
metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
{
#ifdef ZFS_DEBUG
	const dva_t *dva = bp->blk_dva;
	int ndvas = BP_GET_NDVAS(bp);

	for (int d = 0; d < ndvas; d++) {
		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
	}
#endif
}
2727
/*
 * Allocate 'size' bytes from this metaslab using the class's allocator
 * ops. On success, removes the extent from the free tree and the trim
 * candidate set, records it in this txg's alloctree, and returns the
 * offset; returns -1ULL on failure. Caller must hold ms_lock.
 */
static uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;
	metaslab_class_t *mc = msp->ms_group->mg_class;

	VERIFY(!msp->ms_condensing);

	start = mc->mc_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		metaslab_group_t *mg = msp->ms_group;
		vdev_t *vd = mg->mg_vd;

		/* Offset and size must be aligned to the vdev's ashift. */
		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
		range_tree_remove(rt, start, size);
		metaslab_trim_remove(msp, start, size);

		/* First allocation in this txg dirties the metaslab. */
		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);

		/* Track the last successful allocation */
		msp->ms_alloc_txg = txg;
		metaslab_verify_space(msp, txg);
	}

	/*
	 * Now that we've attempted the allocation we need to update the
	 * metaslab's maximum block size since it may have changed.
	 */
	msp->ms_max_size = metaslab_block_maxsize(msp);
	return (start);
}
2765
/*
 * Walk this group's metaslabs in descending weight order looking for one
 * that can satisfy an allocation of 'asize' bytes, activate it, and
 * allocate from it. Returns the allocated offset or -1ULL if no metaslab
 * in the group could satisfy the request. A stack-independent search key
 * ('search') lets us resume the AVL walk just past the last candidate
 * after dropping mg_lock between attempts.
 */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
    int flags)
{
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	/*
	 * If an earlier DVA already lives on this vdev, this is a
	 * secondary allocation for the same block.
	 */
	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
	search->ms_weight = UINT64_MAX;
	search->ms_start = 0;
	for (;;) {
		boolean_t was_active;
		boolean_t pass_primary = B_TRUE;
		avl_tree_t *t = &mg->mg_metaslab_tree;
		avl_index_t idx;

		mutex_enter(&mg->mg_lock);

		/*
		 * Find the metaslab with the highest weight that is less
		 * than what we've already tried. In the common case, this
		 * means that we will examine each metaslab at most once.
		 * Note that concurrent callers could reorder metaslabs
		 * by activation/passivation once we have dropped the mg_lock.
		 * If a metaslab is activated by another thread, and we fail
		 * to allocate from the metaslab we have selected, we may
		 * not try the newly-activated metaslab, and instead activate
		 * another metaslab. This is not optimal, but generally
		 * does not cause any problems (a possible exception being
		 * if every metaslab is completely full except for the
		 * newly-activated metaslab which we fail to examine).
		 */
		msp = avl_find(t, search, &idx);
		if (msp == NULL)
			msp = avl_nearest(t, idx, AVL_AFTER);
		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {

			if (!metaslab_should_allocate(msp, asize)) {
				metaslab_trace_add(zal, mg, msp, asize, d,
				    TRACE_TOO_SMALL);
				continue;
			}

			/*
			 * If the selected metaslab is condensing, skip it.
			 */
			if (msp->ms_condensing)
				continue;

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (flags & METASLAB_USE_WEIGHT_SECONDARY) {
				/*
				 * Caller explicitly wants a secondary
				 * metaslab: skip the first (primary)
				 * candidate and take the next one.
				 */
				if (!pass_primary) {
					DTRACE_PROBE(metaslab_use_secondary);
					activation_weight =
					    METASLAB_WEIGHT_SECONDARY;
					break;
				}

				pass_primary = B_FALSE;
			} else {
				if (activation_weight ==
				    METASLAB_WEIGHT_PRIMARY)
					break;

				/*
				 * Secondary DVA: require this metaslab to be
				 * far enough from all previous DVAs. Empty
				 * metaslabs get a larger target distance.
				 */
				target_distance = min_distance +
				    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
				    min_distance >> 1);

				for (i = 0; i < d; i++)
					if (metaslab_distance(msp, &dva[i]) <
					    target_distance)
						break;
				if (i == d)
					break;
			}
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL) {
			kmem_free(search, sizeof (*search));
			return (-1ULL);
		}
		search->ms_weight = msp->ms_weight;
		search->ms_start = msp->ms_start + 1;

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock. We check the
		 * active status first to see if we need to reselect
		 * a new metaslab.
		 */
		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		msp->ms_selected_txg = txg;

		/*
		 * Now that we have the lock, recheck to see if we should
		 * continue to use this metaslab for this allocation. The
		 * metaslab is now loaded so metaslab_should_allocate() can
		 * accurately determine if the allocation attempt should
		 * proceed.
		 */
		if (!metaslab_should_allocate(msp, asize)) {
			/* Passivate this metaslab and select a new one. */
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_TOO_SMALL);
			goto next;
		}

		/*
		 * If this metaslab is currently condensing then pick again as
		 * we can't manipulate this metaslab until it's committed
		 * to disk.
		 */
		if (msp->ms_condensing) {
			metaslab_trace_add(zal, mg, msp, asize, d,
			    TRACE_CONDENSING);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		offset = metaslab_block_alloc(msp, asize, txg);
		metaslab_trace_add(zal, mg, msp, asize, d, offset);

		if (offset != -1ULL) {
			/* Proactively passivate the metaslab, if needed */
			metaslab_segment_may_passivate(msp);
			break;
		}
next:
		ASSERT(msp->ms_loaded);

		/*
		 * We were unable to allocate from this metaslab so determine
		 * a new weight for this metaslab. Now that we have loaded
		 * the metaslab we can provide a better hint to the metaslab
		 * selector.
		 *
		 * For space-based metaslabs, we use the maximum block size.
		 * This information is only available when the metaslab
		 * is loaded and is more accurate than the generic free
		 * space weight that was calculated by metaslab_weight().
		 * This information allows us to quickly compare the maximum
		 * available allocation in the metaslab to the allocation
		 * size being requested.
		 *
		 * For segment-based metaslabs, determine the new weight
		 * based on the highest bucket in the range tree. We
		 * explicitly use the loaded segment weight (i.e. the range
		 * tree histogram) since it contains the space that is
		 * currently available for allocation and is accurate
		 * even within a sync pass.
		 */
		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
			uint64_t weight = metaslab_block_maxsize(msp);
			WEIGHT_SET_SPACEBASED(weight);
			metaslab_passivate(msp, weight);
		} else {
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
		}

		/*
		 * We have just failed an allocation attempt, check
		 * that metaslab_should_allocate() agrees. Otherwise,
		 * we may end up in an infinite loop retrying the same
		 * metaslab.
		 */
		ASSERT(!metaslab_should_allocate(msp, asize));
		mutex_exit(&msp->ms_lock);
	}
	mutex_exit(&msp->ms_lock);
	kmem_free(search, sizeof (*search));
	return (offset);
}
2971
2972 static uint64_t
2973 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
2974 uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva,
2975 int d, int flags)
2976 {
2977 uint64_t offset;
2978 ASSERT(mg->mg_initialized);
2979
2980 offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
2981 min_distance, dva, d, flags);
2982
2983 mutex_enter(&mg->mg_lock);
2984 if (offset == -1ULL) {
2985 mg->mg_failed_allocations++;
2986 metaslab_trace_add(zal, mg, NULL, asize, d,
2987 TRACE_GROUP_FAILURE);
2988 if (asize == SPA_GANGBLOCKSIZE) {
2989 /*
2990 * This metaslab group was unable to allocate
2991 * the minimum gang block size so it must be out of
2992 * space. We must notify the allocation throttle
2993 * to start skipping allocation attempts to this
2994 * metaslab group until more space becomes available.
2995 * Note: this failure cannot be caused by the
2996 * allocation throttle since the allocation throttle
2997 * is only responsible for skipping devices and
2998 * not failing block allocations.
2999 */
3000 mg->mg_no_free_space = B_TRUE;
3001 }
3002 }
3003 mg->mg_allocations++;
3004 mutex_exit(&mg->mg_lock);
3005 return (offset);
3006 }
3007
3008 /*
3009 * If we have to write a ditto block (i.e. more than one DVA for a given BP)
3010 * on the same vdev as an existing DVA of this BP, then try to allocate it
3011 * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
3012 * existing DVAs.
3013 */
3014 int ditto_same_vdev_distance_shift = 3;
3015
3016 /*
3017 * Allocate a block for the specified i/o.
3018 */
3019 static int
3020 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
3021 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
3022 zio_alloc_list_t *zal)
3023 {
3024 metaslab_group_t *mg, *rotor;
3025 vdev_t *vd;
3026 boolean_t try_hard = B_FALSE;
3027
3028 ASSERT(!DVA_IS_VALID(&dva[d]));
3029
3030 /*
3031 * For testing, make some blocks above a certain size be gang blocks.
3032 */
3033 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
3034 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
3035 return (SET_ERROR(ENOSPC));
3036 }
3037
3038 /*
3039 * Start at the rotor and loop through all mgs until we find something.
3040 * Note that there's no locking on mc_rotor or mc_aliquot because
3041 * nothing actually breaks if we miss a few updates -- we just won't
3042 * allocate quite as evenly. It all balances out over time.
3043 *
3044 * If we are doing ditto or log blocks, try to spread them across
3045 * consecutive vdevs. If we're forced to reuse a vdev before we've
3046 * allocated all of our ditto blocks, then try and spread them out on
3047 * that vdev as much as possible. If it turns out to not be possible,
3048 * gradually lower our standards until anything becomes acceptable.
3049 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
3050 * gives us hope of containing our fault domains to something we're
3051 * able to reason about. Otherwise, any two top-level vdev failures
3052 * will guarantee the loss of data. With consecutive allocation,
3053 * only two adjacent top-level vdev failures will result in data loss.
3054 *
3055 * If we are doing gang blocks (hintdva is non-NULL), try to keep
3056 * ourselves on the same vdev as our gang block header. That
3057 * way, we can hope for locality in vdev_cache, plus it makes our
3058 * fault domains something tractable.
3059 */
3060 if (hintdva) {
3061 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
3062
3063 /*
3064 * It's possible the vdev we're using as the hint no
3065 * longer exists (i.e. removed). Consult the rotor when
3066 * all else fails.
3067 */
3068 if (vd != NULL) {
3069 mg = vd->vdev_mg;
3070
3071 if (flags & METASLAB_HINTBP_AVOID &&
3072 mg->mg_next != NULL)
3073 mg = mg->mg_next;
3074 } else {
3075 mg = mc->mc_rotor;
3076 }
3077 } else if (d != 0) {
3078 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
3079 mg = vd->vdev_mg->mg_next;
3080 } else {
3081 mg = mc->mc_rotor;
3082 }
3083
3084 /*
3085 * If the hint put us into the wrong metaslab class, or into a
3086 * metaslab group that has been passivated, just follow the rotor.
3087 */
3088 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
3089 mg = mc->mc_rotor;
3090
3091 rotor = mg;
3092 top:
3093 do {
3094 boolean_t allocatable;
3095
3096 ASSERT(mg->mg_activation_count == 1);
3097 vd = mg->mg_vd;
3098
3099 /*
3100 * Don't allocate from faulted devices.
3101 */
3102 if (try_hard) {
3103 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
3104 allocatable = vdev_allocatable(vd);
3105 spa_config_exit(spa, SCL_ZIO, FTAG);
3106 } else {
3107 allocatable = vdev_allocatable(vd);
3108 }
3109
3110 /*
3111 * Determine if the selected metaslab group is eligible
3112 * for allocations. If we're ganging then don't allow
3113 * this metaslab group to skip allocations since that would
3114 * inadvertently return ENOSPC and suspend the pool
3115 * even though space is still available.
3116 */
3117 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
3118 allocatable = metaslab_group_allocatable(mg, rotor,
3119 psize);
3120 }
3121
3122 if (!allocatable) {
3123 metaslab_trace_add(zal, mg, NULL, psize, d,
3124 TRACE_NOT_ALLOCATABLE);
3125 goto next;
3126 }
3127
3128 ASSERT(mg->mg_initialized);
3129
3130 /*
3131 * Avoid writing single-copy data to a failing,
3132 * non-redundant vdev, unless we've already tried all
3133 * other vdevs.
3134 */
3135 if ((vd->vdev_stat.vs_write_errors > 0 ||
3136 vd->vdev_state < VDEV_STATE_HEALTHY) &&
3137 d == 0 && !try_hard && vd->vdev_children == 0) {
3138 metaslab_trace_add(zal, mg, NULL, psize, d,
3139 TRACE_VDEV_ERROR);
3140 goto next;
3141 }
3142
3143 ASSERT(mg->mg_class == mc);
3144
3145 /*
3146 * If we don't need to try hard, then require that the
3147 * block be 1/8th of the device away from any other DVAs
3148 * in this BP. If we are trying hard, allow any offset
3149 * to be used (distance=0).
3150 */
3151 uint64_t distance = 0;
3152 if (!try_hard) {
3153 distance = vd->vdev_asize >>
3154 ditto_same_vdev_distance_shift;
3155 if (distance <= (1ULL << vd->vdev_ms_shift))
3156 distance = 0;
3157 }
3158
3159 uint64_t asize = vdev_psize_to_asize(vd, psize);
3160 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
3161
3162 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
3163 distance, dva, d, flags);
3164
3165 if (offset != -1ULL) {
3166 /*
3167 * If we've just selected this metaslab group,
3168 * figure out whether the corresponding vdev is
3169 * over- or under-used relative to the pool,
3170 * and set an allocation bias to even it out.
3171 */
3172 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
3173 vdev_stat_t *vs = &vd->vdev_stat;
3174 vdev_stat_t *pvs = &vd->vdev_parent->vdev_stat;
3175 int64_t vu, cu, vu_io;
3176
3177 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
3178 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
3179 vu_io =
3180 (((vs->vs_iotime[ZIO_TYPE_WRITE] * 100) /
3181 (pvs->vs_iotime[ZIO_TYPE_WRITE] + 1)) *
3182 (vd->vdev_parent->vdev_children)) - 100;
3183
3184 /*
3185 * Calculate how much more or less we should
3186 * try to allocate from this device during
3187 * this iteration around the rotor.
3188 * For example, if a device is 80% full
3189 * and the pool is 20% full then we should
3190 * reduce allocations by 60% on this device.
3191 *
3192 * mg_bias = (20 - 80) * 512K / 100 = -307K
3193 *
3194 * This reduces allocations by 307K for this
3195 * iteration.
3196 */
3197 mg->mg_bias = ((cu - vu) *
3198 (int64_t)mg->mg_aliquot) / 100;
3199
3200 /*
3201 * Experiment: space-based DVA allocator 0,
3202 * latency-based 1 or hybrid 2.
3203 */
3204 switch (metaslab_alloc_dva_algorithm) {
3205 case 1:
3206 mg->mg_bias =
3207 (vu_io * (int64_t)mg->mg_aliquot) /
3208 100;
3209 break;
3210 case 2:
3211 mg->mg_bias =
3212 ((((cu - vu) + vu_io) / 2) *
3213 (int64_t)mg->mg_aliquot) / 100;
3214 break;
3215 default:
3216 break;
3217 }
3218 } else if (!metaslab_bias_enabled) {
3219 mg->mg_bias = 0;
3220 }
3221
3222 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
3223 mg->mg_aliquot + mg->mg_bias) {
3224 mc->mc_rotor = mg->mg_next;
3225 mc->mc_aliquot = 0;
3226 }
3227
3228 DVA_SET_VDEV(&dva[d], vd->vdev_id);
3229 DVA_SET_OFFSET(&dva[d], offset);
3230 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
3231 DVA_SET_ASIZE(&dva[d], asize);
3232 DTRACE_PROBE3(alloc_dva_probe, uint64_t, vd->vdev_id,
3233 uint64_t, offset, uint64_t, psize);
3234
3235 return (0);
3236 }
3237 next:
3238 mc->mc_rotor = mg->mg_next;
3239 mc->mc_aliquot = 0;
3240 } while ((mg = mg->mg_next) != rotor);
3241
3242 /*
3243 * If we haven't tried hard, do so now.
3244 */
3245 if (!try_hard) {
3246 try_hard = B_TRUE;
3247 goto top;
3248 }
3249
3250 bzero(&dva[d], sizeof (dva_t));
3251
3252 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
3253 return (SET_ERROR(ENOSPC));
3254 }
3255
3256 /*
3257 * Free the block represented by DVA in the context of the specified
3258 * transaction group.
3259 */
3260 void
3261 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
3262 {
3263 uint64_t vdev = DVA_GET_VDEV(dva);
3264 uint64_t offset = DVA_GET_OFFSET(dva);
3265 uint64_t size = DVA_GET_ASIZE(dva);
3266 vdev_t *vd;
3267 metaslab_t *msp;
3268
3269 DTRACE_PROBE3(free_dva_probe, uint64_t, vdev,
3270 uint64_t, offset, uint64_t, size);
3271
3272 ASSERT(DVA_IS_VALID(dva));
3273
3274 if (txg > spa_freeze_txg(spa))
3275 return;
3276
3277 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3278 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
3279 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
3280 (u_longlong_t)vdev, (u_longlong_t)offset);
3281 ASSERT(0);
3282 return;
3283 }
3284
3285 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3286
3287 if (DVA_GET_GANG(dva))
3288 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3289
3290 mutex_enter(&msp->ms_lock);
3291
3292 if (now) {
3293 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
3294 offset, size);
3295
3296 VERIFY(!msp->ms_condensing);
3297 VERIFY3U(offset, >=, msp->ms_start);
3298 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
3299 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
3300 msp->ms_size);
3301 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3302 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3303 range_tree_add(msp->ms_tree, offset, size);
3304 if (spa_get_auto_trim(spa) == SPA_AUTO_TRIM_ON &&
3305 !vd->vdev_man_trimming)
3306 metaslab_trim_add(msp, offset, size);
3307 msp->ms_max_size = metaslab_block_maxsize(msp);
3308 } else {
3309 VERIFY3U(txg, ==, spa->spa_syncing_txg);
3310 if (range_tree_space(msp->ms_freeingtree) == 0)
3311 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3312 range_tree_add(msp->ms_freeingtree, offset, size);
3313 }
3314
3315 mutex_exit(&msp->ms_lock);
3316 }
3317
3318 /*
3319 * Intent log support: upon opening the pool after a crash, notify the SPA
3320 * of blocks that the intent log has allocated for immediate write, but
3321 * which are still considered free by the SPA because the last transaction
3322 * group didn't commit yet.
3323 */
3324 static int
3325 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
3326 {
3327 uint64_t vdev = DVA_GET_VDEV(dva);
3328 uint64_t offset = DVA_GET_OFFSET(dva);
3329 uint64_t size = DVA_GET_ASIZE(dva);
3330 vdev_t *vd;
3331 metaslab_t *msp;
3332 int error = 0;
3333
3334 ASSERT(DVA_IS_VALID(dva));
3335
3336 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
3337 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
3338 return (SET_ERROR(ENXIO));
3339
3340 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3341
3342 if (DVA_GET_GANG(dva))
3343 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
3344
3345 mutex_enter(&msp->ms_lock);
3346
3347 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
3348 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
3349
3350 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
3351 error = SET_ERROR(ENOENT);
3352
3353 if (error || txg == 0) { /* txg == 0 indicates dry run */
3354 mutex_exit(&msp->ms_lock);
3355 return (error);
3356 }
3357
3358 VERIFY(!msp->ms_condensing);
3359 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
3360 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
3361 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
3362 range_tree_remove(msp->ms_tree, offset, size);
3363 metaslab_trim_remove(msp, offset, size);
3364
3365 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
3366 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
3367 vdev_dirty(vd, VDD_METASLAB, msp, txg);
3368 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
3369 }
3370
3371 mutex_exit(&msp->ms_lock);
3372
3373 return (0);
3374 }
3375
3376 /*
3377 * Reserve some allocation slots. The reservation system must be called
3378 * before we call into the allocator. If there aren't any available slots
3379 * then the I/O will be throttled until an I/O completes and its slots are
3380 * freed up. The function returns true if it was successful in placing
3381 * the reservation.
3382 */
3383 boolean_t
3384 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
3385 int flags)
3386 {
3387 uint64_t available_slots = 0;
3388 boolean_t slot_reserved = B_FALSE;
3389
3390 ASSERT(mc->mc_alloc_throttle_enabled);
3391 mutex_enter(&mc->mc_lock);
3392
3393 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
3394 if (reserved_slots < mc->mc_alloc_max_slots)
3395 available_slots = mc->mc_alloc_max_slots - reserved_slots;
3396
3397 if (slots <= available_slots || GANG_ALLOCATION(flags)) {
3398 /*
3399 * We reserve the slots individually so that we can unreserve
3400 * them individually when an I/O completes.
3401 */
3402 for (int d = 0; d < slots; d++) {
3403 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
3404 }
3405 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
3406 slot_reserved = B_TRUE;
3407 }
3408
3409 mutex_exit(&mc->mc_lock);
3410 return (slot_reserved);
3411 }
3412
3413 void
3414 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
3415 {
3416 ASSERT(mc->mc_alloc_throttle_enabled);
3417 mutex_enter(&mc->mc_lock);
3418 for (int d = 0; d < slots; d++) {
3419 (void) refcount_remove(&mc->mc_alloc_slots, zio);
3420 }
3421 mutex_exit(&mc->mc_lock);
3422 }
3423
3424 int
3425 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
3426 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
3427 zio_alloc_list_t *zal, zio_t *zio)
3428 {
3429 dva_t *dva = bp->blk_dva;
3430 dva_t *hintdva = hintbp->blk_dva;
3431 int error = 0;
3432
3433 ASSERT(bp->blk_birth == 0);
3434 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
3435
3436 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3437
3438 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
3439 spa_config_exit(spa, SCL_ALLOC, FTAG);
3440 return (SET_ERROR(ENOSPC));
3441 }
3442
3443 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
3444 ASSERT(BP_GET_NDVAS(bp) == 0);
3445 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
3446 ASSERT3P(zal, !=, NULL);
3447
3448 if (mc == spa_special_class(spa) && !BP_IS_METADATA(bp) &&
3449 !(flags & (METASLAB_GANG_HEADER)) &&
3450 !(spa->spa_meta_policy.spa_small_data_to_special &&
3451 psize <= spa->spa_meta_policy.spa_small_data_to_special)) {
3452 error = metaslab_alloc_dva(spa, spa_normal_class(spa),
3453 psize, &dva[WBC_NORMAL_DVA], 0, NULL, txg,
3454 flags | METASLAB_USE_WEIGHT_SECONDARY, zal);
3455 if (error == 0) {
3456 error = metaslab_alloc_dva(spa, mc, psize,
3457 &dva[WBC_SPECIAL_DVA], 0, NULL, txg, flags, zal);
3458 if (error != 0) {
3459 error = 0;
3460 /*
3461 * Change the place of NORMAL and cleanup the
3462 * second DVA. After that this BP is just a
3463 * regular BP with one DVA
3464 *
3465 * This operation is valid only if:
3466 * WBC_SPECIAL_DVA is dva[0]
3467 * WBC_NORMAL_DVA is dva[1]
3468 *
3469 * see wbc.h
3470 */
3471 bcopy(&dva[WBC_NORMAL_DVA],
3472 &dva[WBC_SPECIAL_DVA], sizeof (dva_t));
3473 bzero(&dva[WBC_NORMAL_DVA], sizeof (dva_t));
3474
3475 /*
3476 * Allocation of special DVA has failed,
3477 * so this BP will be a regular BP and need
3478 * to update the metaslab group's queue depth
3479 * based on the newly allocated dva.
3480 */
3481 metaslab_group_alloc_increment(spa,
3482 DVA_GET_VDEV(&dva[0]), zio, flags);
3483 } else {
3484 BP_SET_SPECIAL(bp, 1);
3485 }
3486 } else {
3487 spa_config_exit(spa, SCL_ALLOC, FTAG);
3488 return (error);
3489 }
3490 } else {
3491 for (int d = 0; d < ndvas; d++) {
3492 error = metaslab_alloc_dva(spa, mc, psize, dva, d,
3493 hintdva, txg, flags, zal);
3494 if (error != 0) {
3495 for (d--; d >= 0; d--) {
3496 metaslab_free_dva(spa, &dva[d],
3497 txg, B_TRUE);
3498 metaslab_group_alloc_decrement(spa,
3499 DVA_GET_VDEV(&dva[d]), zio, flags);
3500 bzero(&dva[d], sizeof (dva_t));
3501 }
3502 spa_config_exit(spa, SCL_ALLOC, FTAG);
3503 return (error);
3504 } else {
3505 /*
3506 * Update the metaslab group's queue depth
3507 * based on the newly allocated dva.
3508 */
3509 metaslab_group_alloc_increment(spa,
3510 DVA_GET_VDEV(&dva[d]), zio, flags);
3511 }
3512 }
3513 ASSERT(BP_GET_NDVAS(bp) == ndvas);
3514 }
3515 ASSERT(error == 0);
3516
3517 spa_config_exit(spa, SCL_ALLOC, FTAG);
3518
3519 BP_SET_BIRTH(bp, txg, txg);
3520
3521 return (0);
3522 }
3523
3524 void
3525 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
3526 {
3527 const dva_t *dva = bp->blk_dva;
3528 int ndvas = BP_GET_NDVAS(bp);
3529
3530 ASSERT(!BP_IS_HOLE(bp));
3531 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
3532
3533 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
3534
3535 if (BP_IS_SPECIAL(bp)) {
3536 int start_dva;
3537 wbc_data_t *wbc_data = spa_get_wbc_data(spa);
3538
3539 mutex_enter(&wbc_data->wbc_lock);
3540 start_dva = wbc_first_valid_dva(bp, wbc_data, B_TRUE);
3541 mutex_exit(&wbc_data->wbc_lock);
3542
3543 /*
3544 * Actual freeing should not be locked as
3545 * the block is already exempted from WBC
3546 * trees, and thus will not be moved
3547 */
3548 metaslab_free_dva(spa, &dva[WBC_NORMAL_DVA], txg, now);
3549 if (start_dva == 0) {
3550 metaslab_free_dva(spa, &dva[WBC_SPECIAL_DVA],
3551 txg, now);
3552 }
3553 } else {
3554 for (int d = 0; d < ndvas; d++)
3555 metaslab_free_dva(spa, &dva[d], txg, now);
3556 }
3557
3558 spa_config_exit(spa, SCL_FREE, FTAG);
3559 }
3560
3561 int
3562 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
3563 {
3564 const dva_t *dva = bp->blk_dva;
3565 int ndvas = BP_GET_NDVAS(bp);
3566 int error = 0;
3567
3568 ASSERT(!BP_IS_HOLE(bp));
3569
3570 if (txg != 0) {
3571 /*
3572 * First do a dry run to make sure all DVAs are claimable,
3573 * so we don't have to unwind from partial failures below.
3574 */
3575 if ((error = metaslab_claim(spa, bp, 0)) != 0)
3576 return (error);
3577 }
3578
3579 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
3580
3581 if (BP_IS_SPECIAL(bp)) {
3582 int start_dva;
3583 wbc_data_t *wbc_data = spa_get_wbc_data(spa);
3584
3585 mutex_enter(&wbc_data->wbc_lock);
3586 start_dva = wbc_first_valid_dva(bp, wbc_data, B_FALSE);
3587
3588 /*
3589 * Actual claiming should be under lock for WBC blocks. It must
3590 * be done to ensure zdb will not fail. The only other user of
3591 * the claiming is ZIL whose blocks can not be WBC ones, and
3592 * thus the lock will not be held for them.
3593 */
3594 error = metaslab_claim_dva(spa,
3595 &dva[WBC_NORMAL_DVA], txg);
3596 if (error == 0 && start_dva == 0) {
3597 error = metaslab_claim_dva(spa,
3598 &dva[WBC_SPECIAL_DVA], txg);
3599 }
3600
3601 mutex_exit(&wbc_data->wbc_lock);
3602 } else {
3603 for (int d = 0; d < ndvas; d++)
3604 if ((error = metaslab_claim_dva(spa,
3605 &dva[d], txg)) != 0)
3606 break;
3607 }
3608
3609 spa_config_exit(spa, SCL_ALLOC, FTAG);
3610
3611 ASSERT(error == 0 || txg == 0);
3612
3613 return (error);
3614 }
3615
3616 void
3617 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
3618 {
3619 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
3620 return;
3621
3622 if (BP_IS_SPECIAL(bp)) {
3623 /* Do not check frees for WBC blocks */
3624 return;
3625 }
3626
3627 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
3628 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
3629 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
3630 vdev_t *vd = vdev_lookup_top(spa, vdev);
3631 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
3632 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
3633 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
3634
3635 if (msp->ms_loaded) {
3636 range_tree_verify(msp->ms_tree, offset, size);
3637 range_tree_verify(msp->ms_cur_ts->ts_tree,
3638 offset, size);
3639 if (msp->ms_prev_ts != NULL) {
3640 range_tree_verify(msp->ms_prev_ts->ts_tree,
3641 offset, size);
3642 }
3643 }
3644
3645 range_tree_verify(msp->ms_freeingtree, offset, size);
3646 range_tree_verify(msp->ms_freedtree, offset, size);
3647 for (int j = 0; j < TXG_DEFER_SIZE; j++)
3648 range_tree_verify(msp->ms_defertree[j], offset, size);
3649 }
3650 spa_config_exit(spa, SCL_VDEV, FTAG);
3651 }
3652
3653 /*
3654 * Trims all free space in the metaslab. Returns the root TRIM zio (that the
3655 * caller should zio_wait() for) and the amount of space in the metaslab that
3656 * has been scheduled for trimming in the `delta' return argument.
3657 */
3658 zio_t *
3659 metaslab_trim_all(metaslab_t *msp, uint64_t *delta)
3660 {
3661 boolean_t was_loaded;
3662 uint64_t trimmed_space;
3663 zio_t *trim_io;
3664
3665 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3666
3667 mutex_enter(&msp->ms_lock);
3668
3669 while (msp->ms_loading)
3670 metaslab_load_wait(msp);
3671 /* If we loaded the metaslab, unload it when we're done. */
3672 was_loaded = msp->ms_loaded;
3673 if (!was_loaded) {
3674 if (metaslab_load(msp) != 0) {
3675 mutex_exit(&msp->ms_lock);
3676 return (0);
3677 }
3678 }
3679 /* Flush out any scheduled extents and add everything in ms_tree. */
3680 range_tree_vacate(msp->ms_cur_ts->ts_tree, NULL, NULL);
3681 range_tree_walk(msp->ms_tree, metaslab_trim_add, msp);
3682
3683 /* Force this trim to take place ASAP. */
3684 if (msp->ms_prev_ts != NULL)
3685 metaslab_free_trimset(msp->ms_prev_ts);
3686 msp->ms_prev_ts = msp->ms_cur_ts;
3687 msp->ms_cur_ts = metaslab_new_trimset(0, &msp->ms_lock);
3688 trimmed_space = range_tree_space(msp->ms_tree);
3689 if (!was_loaded)
3690 metaslab_unload(msp);
3691
3692 trim_io = metaslab_exec_trim(msp);
3693 mutex_exit(&msp->ms_lock);
3694 *delta = trimmed_space;
3695
3696 return (trim_io);
3697 }
3698
3699 /*
3700 * Notifies the trimsets in a metaslab that an extent has been allocated.
3701 * This removes the segment from the queues of extents awaiting to be trimmed.
3702 */
3703 static void
3704 metaslab_trim_remove(void *arg, uint64_t offset, uint64_t size)
3705 {
3706 metaslab_t *msp = arg;
3707
3708 range_tree_remove_overlap(msp->ms_cur_ts->ts_tree, offset, size);
3709 if (msp->ms_prev_ts != NULL) {
3710 range_tree_remove_overlap(msp->ms_prev_ts->ts_tree, offset,
3711 size);
3712 }
3713 }
3714
3715 /*
3716 * Notifies the trimsets in a metaslab that an extent has been freed.
3717 * This adds the segment to the currently open queue of extents awaiting
3718 * to be trimmed.
3719 */
3720 static void
3721 metaslab_trim_add(void *arg, uint64_t offset, uint64_t size)
3722 {
3723 metaslab_t *msp = arg;
3724 ASSERT(msp->ms_cur_ts != NULL);
3725 range_tree_add(msp->ms_cur_ts->ts_tree, offset, size);
3726 }
3727
3728 /*
3729 * Does a metaslab's automatic trim operation processing. This must be
3730 * called from metaslab_sync, with the txg number of the txg. This function
3731 * issues trims in intervals as dictated by the zfs_txgs_per_trim tunable.
3732 */
3733 void
3734 metaslab_auto_trim(metaslab_t *msp, uint64_t txg)
3735 {
3736 /* for atomicity */
3737 uint64_t txgs_per_trim = zfs_txgs_per_trim;
3738
3739 ASSERT(!MUTEX_HELD(&msp->ms_lock));
3740 mutex_enter(&msp->ms_lock);
3741
3742 /*
3743 * Since we typically have hundreds of metaslabs per vdev, but we only
3744 * trim them once every zfs_txgs_per_trim txgs, it'd be best if we
3745 * could sequence the TRIM commands from all metaslabs so that they
3746 * don't all always pound the device in the same txg. We do so by
3747 * artificially inflating the birth txg of the first trim set by a
3748 * sequence number derived from the metaslab's starting offset
3749 * (modulo zfs_txgs_per_trim). Thus, for the default 200 metaslabs and
3750 * 32 txgs per trim, we'll only be trimming ~6.25 metaslabs per txg.
3751 *
3752 * If we detect that the txg has advanced too far ahead of ts_birth,
3753 * it means our birth txg is out of lockstep. Recompute it by
3754 * rounding down to the nearest zfs_txgs_per_trim multiple and adding
3755 * our metaslab id modulo zfs_txgs_per_trim.
3756 */
3757 if (txg > msp->ms_cur_ts->ts_birth + txgs_per_trim) {
3758 msp->ms_cur_ts->ts_birth = (txg / txgs_per_trim) *
3759 txgs_per_trim + (msp->ms_id % txgs_per_trim);
3760 }
3761
3762 /* Time to swap out the current and previous trimsets */
3763 if (txg == msp->ms_cur_ts->ts_birth + txgs_per_trim) {
3764 if (msp->ms_prev_ts != NULL) {
3765 if (msp->ms_trimming_ts != NULL) {
3766 spa_t *spa = msp->ms_group->mg_class->mc_spa;
3767 /*
3768 * The previous trim run is still ongoing, so
3769 * the device is reacting slowly to our trim
3770 * requests. Drop this trimset, so as not to
3771 * back the device up with trim requests.
3772 */
3773 spa_trimstats_auto_slow_incr(spa);
3774 metaslab_free_trimset(msp->ms_prev_ts);
3775 } else if (msp->ms_group->mg_vd->vdev_man_trimming) {
3776 /*
3777 * If a manual trim is ongoing, we want to
3778 * inhibit autotrim temporarily so it doesn't
3779 * slow down the manual trim.
3780 */
3781 metaslab_free_trimset(msp->ms_prev_ts);
3782 } else {
3783 /*
3784 * Trim out aged extents on the vdevs - these
3785 * are safe to be destroyed now. We'll keep
3786 * the trimset around to deny allocations from
3787 * these regions while the trims are ongoing.
3788 */
3789 zio_nowait(metaslab_exec_trim(msp));
3790 }
3791 }
3792 msp->ms_prev_ts = msp->ms_cur_ts;
3793 msp->ms_cur_ts = metaslab_new_trimset(txg, &msp->ms_lock);
3794 }
3795 mutex_exit(&msp->ms_lock);
3796 }
3797
3798 static void
3799 metaslab_trim_done(zio_t *zio)
3800 {
3801 metaslab_t *msp = zio->io_private;
3802 boolean_t held;
3803
3804 ASSERT(msp != NULL);
3805 ASSERT(msp->ms_trimming_ts != NULL);
3806 held = MUTEX_HELD(&msp->ms_lock);
3807 if (!held)
3808 mutex_enter(&msp->ms_lock);
3809 metaslab_free_trimset(msp->ms_trimming_ts);
3810 msp->ms_trimming_ts = NULL;
3811 cv_signal(&msp->ms_trim_cv);
3812 if (!held)
3813 mutex_exit(&msp->ms_lock);
3814 }
3815
3816 /*
3817 * Executes a zio_trim on a range tree holding freed extents in the metaslab.
3818 */
3819 static zio_t *
3820 metaslab_exec_trim(metaslab_t *msp)
3821 {
3822 metaslab_group_t *mg = msp->ms_group;
3823 spa_t *spa = mg->mg_class->mc_spa;
3824 vdev_t *vd = mg->mg_vd;
3825 range_tree_t *trim_tree;
3826 zio_t *zio;
3827
3828 ASSERT(MUTEX_HELD(&msp->ms_lock));
3829
3830 /* wait for a preceding trim to finish */
3831 while (msp->ms_trimming_ts != NULL)
3832 cv_wait(&msp->ms_trim_cv, &msp->ms_lock);
3833 msp->ms_trimming_ts = msp->ms_prev_ts;
3834 msp->ms_prev_ts = NULL;
3835 trim_tree = msp->ms_trimming_ts->ts_tree;
3836 #ifdef DEBUG
3837 if (msp->ms_loaded) {
3838 for (range_seg_t *rs = avl_first(&trim_tree->rt_root);
3839 rs != NULL; rs = AVL_NEXT(&trim_tree->rt_root, rs)) {
3840 if (!range_tree_contains(msp->ms_tree,
3841 rs->rs_start, rs->rs_end - rs->rs_start)) {
3842 panic("trimming allocated region; mss=%p",
3843 (void*)rs);
3844 }
3845 }
3846 }
3847 #endif
3848
3849 /* Nothing to trim */
3850 if (range_tree_space(trim_tree) == 0) {
3851 metaslab_free_trimset(msp->ms_trimming_ts);
3852 msp->ms_trimming_ts = 0;
3853 return (zio_root(spa, NULL, NULL, 0));
3854 }
3855 zio = zio_trim(spa, vd, trim_tree, metaslab_trim_done, msp, 0,
3856 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
3857 ZIO_FLAG_CONFIG_WRITER, msp);
3858
3859 return (zio);
3860 }
3861
3862 /*
3863 * Allocates and initializes a new trimset structure. The `txg' argument
3864 * indicates when this trimset was born and `lock' indicates the lock to
3865 * link to the range tree.
3866 */
3867 static metaslab_trimset_t *
3868 metaslab_new_trimset(uint64_t txg, kmutex_t *lock)
3869 {
3870 metaslab_trimset_t *ts;
3871
3872 ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
3873 ts->ts_birth = txg;
3874 ts->ts_tree = range_tree_create(NULL, NULL, lock);
3875
3876 return (ts);
3877 }
3878
3879 /*
3880 * Destroys and frees a trim set previously allocated by metaslab_new_trimset.
3881 */
3882 static void
3883 metaslab_free_trimset(metaslab_trimset_t *ts)
3884 {
3885 range_tree_vacate(ts->ts_tree, NULL, NULL);
3886 range_tree_destroy(ts->ts_tree);
3887 kmem_free(ts, sizeof (*ts));
3888 }
3889
3890 /*
3891 * Checks whether an allocation conflicts with an ongoing trim operation in
3892 * the given metaslab. This function takes a segment starting at `*offset'
3893 * of `size' and checks whether it hits any region in the metaslab currently
3894 * being trimmed. If yes, it tries to adjust the allocation to the end of
3895 * the region being trimmed (P2ROUNDUP aligned by `align'), but only up to
3896 * `limit' (no part of the allocation is allowed to go past this point).
3897 *
3898 * Returns B_FALSE if either the original allocation wasn't in conflict, or
3899 * the conflict could be resolved by adjusting the value stored in `offset'
3900 * such that the whole allocation still fits below `limit'. Returns B_TRUE
3901 * if the allocation conflict couldn't be resolved.
3902 */
3903 static boolean_t metaslab_check_trim_conflict(metaslab_t *msp,
3904 uint64_t *offset, uint64_t size, uint64_t align, uint64_t limit)
3905 {
3906 uint64_t new_offset;
3907
3908 if (msp->ms_trimming_ts == NULL)
3909 /* no trim conflict, original offset is OK */
3910 return (B_FALSE);
3911
3912 new_offset = P2ROUNDUP(range_tree_find_gap(msp->ms_trimming_ts->ts_tree,
3913 *offset, size), align);
3914 if (new_offset != *offset && new_offset + size > limit)
3915 /* trim conflict and adjustment not possible */
3916 return (B_TRUE);
3917
3918 /* trim conflict, but adjusted offset still within limit */
3919 *offset = new_offset;
3920 return (B_FALSE);
3921 }