8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
28 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
29 */
30
31 #ifndef _SYS_METASLAB_IMPL_H
32 #define _SYS_METASLAB_IMPL_H
33
34 #include <sys/metaslab.h>
35 #include <sys/space_map.h>
36 #include <sys/range_tree.h>
37 #include <sys/vdev.h>
38 #include <sys/txg.h>
39 #include <sys/avl.h>
40
41 #ifdef __cplusplus
42 extern "C" {
43 #endif
44
45 /*
46 * Metaslab allocation tracing record.
47 */
48 typedef struct metaslab_alloc_trace {
172 * an asynchronous zio wants to perform an allocation it must
173 * first reserve the number of blocks that it wants to allocate.
174 * If there aren't sufficient slots available for the pending zio
175 * then that I/O is throttled until more slots free up. The current
176 * number of reserved allocations is maintained by the mc_alloc_slots
177 * refcount. The mc_alloc_max_slots value determines the maximum
178 * number of allocations that the system allows. Gang blocks are
179 * allowed to reserve slots even if we've reached the maximum
180 * number of allocations allowed.
181 */
182 uint64_t mc_alloc_max_slots;
183 refcount_t mc_alloc_slots;
184
185 uint64_t mc_alloc_groups; /* # of allocatable groups */
186
187 uint64_t mc_alloc; /* total allocated space */
188 uint64_t mc_deferred; /* total deferred frees */
189 uint64_t mc_space; /* total space (alloc + free) */
190 uint64_t mc_dspace; /* total deflated space */
191 uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
192
193 kmutex_t mc_alloc_lock;
194 avl_tree_t mc_alloc_tree;
195 };
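
/*
 * Illustrative sketch only, not the allocator's actual entry point: how a
 * caller might reserve allocation slots against mc_alloc_max_slots using
 * the refcount interfaces from <sys/refcount.h>. The real reservation and
 * throttling logic lives in metaslab.c and also records the reserving zio
 * as the refcount holder; the helper name below is hypothetical.
 */
static boolean_t
example_alloc_reserve(metaslab_class_t *mc, int slots, boolean_t is_gang)
{
        int64_t reserved = refcount_count(&mc->mc_alloc_slots);
        int i;

        /* Gang blocks may reserve slots even past the configured maximum. */
        if (is_gang || reserved + slots <= (int64_t)mc->mc_alloc_max_slots) {
                for (i = 0; i < slots; i++)
                        (void) refcount_add(&mc->mc_alloc_slots, NULL);
                return (B_TRUE);
        }
        return (B_FALSE);       /* caller must throttle until slots free up */
}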
196
197 /*
198 * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
199 * of a top-level vdev. They are linked together to form a circular linked
200 * list and can belong to only one metaslab class. Metaslab groups may become
201 * ineligible for allocations for a number of reasons such as limited free
202 * space, fragmentation, or going offline. When this happens the allocator will
203 * simply find the next metaslab group in the linked list and attempt
204 * to allocate from that group instead.
205 */
206 struct metaslab_group {
207 kmutex_t mg_lock;
208 avl_tree_t mg_metaslab_tree;
209 uint64_t mg_aliquot;
210 boolean_t mg_allocatable; /* can we allocate? */
211
212 /*
213 * A metaslab group is considered to be initialized only after
214 * we have updated the MOS config and added the space to the pool.
233 * can occur when gang blocks are required or when other groups
234 * are unable to handle their share of allocations.
235 */
236 uint64_t mg_max_alloc_queue_depth;
237 refcount_t mg_alloc_queue_depth;
238
239 /*
240 * A metaslab group that can no longer allocate the minimum block
241 * size will set mg_no_free_space. Once a metaslab group is out
242 * of space, its share of work must be distributed to other
243 * groups.
244 */
245 boolean_t mg_no_free_space;
246
247 uint64_t mg_allocations;
248 uint64_t mg_failed_allocations;
249 uint64_t mg_fragmentation;
250 uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
251 };
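
/*
 * Illustrative sketch, not the actual allocator: the rotor-style walk over
 * the circular list of groups described above. It assumes the mg_next
 * linkage and a starting rotor group, both of which come from the elided
 * portions of metaslab_group and metaslab_class; the helper name is
 * hypothetical.
 */
static metaslab_group_t *
example_find_allocatable_group(metaslab_group_t *rotor)
{
        metaslab_group_t *mg = rotor;

        do {
                /* Skip groups that are ineligible or out of usable space. */
                if (mg->mg_allocatable && !mg->mg_no_free_space)
                        return (mg);
                mg = mg->mg_next;
        } while (mg != rotor);

        return (NULL);          /* no group can satisfy this allocation */
}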
252
253 typedef struct {
254 uint64_t ts_birth; /* TXG at which this trimset starts */
255 range_tree_t *ts_tree; /* tree of extents in the trimset */
256 } metaslab_trimset_t;
257
258 /*
259 * This value defines the number of elements in the ms_lbas array. The value
260 * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
261 * This is the equivalent of highbit(UINT64_MAX).
262 */
263 #define MAX_LBAS 64
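
/*
 * Small sketch of the bucketing MAX_LBAS permits: highbit64() (from
 * <sys/sysmacros.h>) returns the 1-based index of the highest set bit, so
 * highbit64(UINT64_MAX) == 64 and every nonzero 64-bit size maps to one of
 * MAX_LBAS power-of-2 buckets. The helper name is hypothetical.
 */
static inline int
example_lba_bucket(uint64_t size)
{
        return (highbit64(size) - 1);   /* 0 .. MAX_LBAS - 1 for size != 0 */
}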
264
265 /*
266 * Each metaslab maintains a set of in-core trees to track metaslab
267 * operations. The in-core free tree (ms_tree) contains the list of
268 * free segments which are eligible for allocation. As blocks are
269 * allocated, the allocated segments are removed from the ms_tree and
270 * added to a per txg allocation tree (ms_alloctree). This allows us to
271 * process all allocations in syncing context where it is safe to update
272 * the on-disk space maps. Frees are also processed in syncing context.
273 * Most frees are generated from syncing context, and those that are not
274 * are held in the spa_free_bplist for processing in syncing context.
275 * An additional set of in-core trees is maintained to track deferred
276 * frees (ms_defertree). Once a block is freed it will move from the
277 * ms_freedtree to the ms_defertree. A deferred free means that a block
278 * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
279 * transaction groups later. For example, a block that is freed in txg
280 * 50 will not be available for reallocation until txg 52 (50 +
281 * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
282 * A pool can safely be rolled back TXG_DEFER_SIZE transaction
283 * groups with the guarantee that no block has been reallocated.
284 *
285 * The simplified transition diagram looks like this:
286 *
287 *
288 * ALLOCATE
289 * |
290 * V
291 * free segment (ms_tree) -----> ms_alloctree[4] ----> (write to space map)
292 * ^
293 * | ms_freeingtree <--- FREE
294 * | |
295 * | v
296 * | ms_freedtree
302 * which is only updated in syncing context. Each time we sync a txg,
303 * we append the allocs and frees from that txg to the space map. The
304 * pool space is only updated once all metaslabs have finished syncing.
305 *
306 * To load the in-core free tree we read the space map from disk. This
307 * object contains a series of alloc and free records that are combined
308 * to make up the list of all free segments in this metaslab. These
309 * segments are represented in-core by the ms_tree and are stored in an
310 * AVL tree.
311 *
312 * As the space map grows (as a result of the appends) it will
313 * eventually become space-inefficient. When the metaslab's in-core
314 * free tree is zfs_condense_pct/100 times the size of the minimal
315 * on-disk representation, we rewrite it in its minimized form. If a
316 * metaslab needs to condense then we must set the ms_condensing flag to
317 * ensure that allocations are not performed on the metaslab that is
318 * being written.
319 */
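
/*
 * Hedged sketch of the defer rotation described above, modeled on what
 * sync completion does in metaslab.c: space deferred TXG_DEFER_SIZE txgs
 * ago becomes allocatable again, and this txg's committed frees begin
 * their deferral. The helper name is hypothetical, the sketch ignores the
 * case where deferred frees are disabled, and in real code it would follow
 * the struct metaslab definition below.
 */
static void
example_defer_rotate(metaslab_t *msp, uint64_t txg)
{
        range_tree_t **defer = &msp->ms_defertree[txg % TXG_DEFER_SIZE];

        /* Return space deferred TXG_DEFER_SIZE txgs ago to the free tree. */
        range_tree_vacate(*defer, range_tree_add, msp->ms_tree);

        /* This txg's frees, now on disk, start their deferral period. */
        range_tree_swap(&msp->ms_freedtree, defer);
}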
320 struct metaslab {
321 kmutex_t ms_lock;
322 kcondvar_t ms_load_cv;
323 space_map_t *ms_sm;
324 uint64_t ms_id;
325 uint64_t ms_start;
326 uint64_t ms_size;
327 uint64_t ms_fragmentation;
328
329 range_tree_t *ms_alloctree[TXG_SIZE];
330 range_tree_t *ms_tree;
331
332 metaslab_trimset_t *ms_cur_ts; /* currently prepared trims */
333 metaslab_trimset_t *ms_prev_ts; /* previous (aging) trims */
334 kcondvar_t ms_trim_cv; /* signaled when a trim completes */
335 metaslab_trimset_t *ms_trimming_ts; /* trims currently being issued */
336
337 /*
338 * The following range trees are accessed only from syncing context.
339 * ms_free*tree only have entries while syncing, and are empty
340 * between syncs.
341 */
342 range_tree_t *ms_freeingtree; /* to free this syncing txg */
343 range_tree_t *ms_freedtree; /* already freed this syncing txg */
344 range_tree_t *ms_defertree[TXG_DEFER_SIZE];
345
346 boolean_t ms_condensing; /* condensing? */
347 boolean_t ms_condense_wanted;
348
349 /*
350 * We must hold both ms_lock and ms_group->mg_lock in order to
351 * modify ms_loaded.
352 */
353 boolean_t ms_loaded;
354 boolean_t ms_loading;
355
356 int64_t ms_deferspace; /* sum of ms_defertree[] space */