/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation via range_tree_add to the range tree can:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via range_tree_remove can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the range_tree_walk
 * or range_tree_vacate functions.
 *
 * To gain visibility into the individual segment operations that the
 * range tree performs "under the hood", you can specify a set of
 * callbacks by passing a range_tree_ops_t structure to the
 * range_tree_create and range_tree_create_custom functions.
 * Any callbacks that are non-NULL are then called at the appropriate
 * times.
 *
 * It is possible to store additional custom information with each
 * segment. This is typically useful when using one or more of the
 * operation callbacks mentioned above. To do so, use the
 * range_tree_create_custom function to create your range tree and
 * pass a custom kmem cache allocator. This allocator must allocate
 * objects of at least sizeof (range_seg_t). The range tree will use
 * the start of that object as a range_seg_t to keep its internal
 * data structures and you can use the remainder of the object to
 * store arbitrary additional fields as necessary.
 */
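
/*
 * Example of basic usage (an illustrative sketch only; `lock' and the
 * offsets below are hypothetical and not part of this file):
 *
 *	kmutex_t lock;
 *	range_tree_t *rt;
 *
 *	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
 *	mutex_enter(&lock);
 *	rt = range_tree_create(NULL, NULL, &lock);
 *
 *	range_tree_add(rt, 0x1000, 0x1000);	creates [0x1000, 0x2000)
 *	range_tree_add(rt, 0x2000, 0x1000);	extends it to [0x1000, 0x3000)
 *	range_tree_remove(rt, 0x1800, 0x400);	splits it, punching a hole
 *	ASSERT3U(range_tree_space(rt), ==, 0x1c00);
 *
 *	range_tree_vacate(rt, NULL, NULL);	empties the tree
 *	range_tree_destroy(rt);
 *	mutex_exit(&lock);
 */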

kmem_cache_t *range_seg_cache;

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

/*
 * Sets the inter-segment gap. When segments are subsequently added, any
 * existing adjacent segment that is less than or equal to `gap' apart
 * from the new segment is joined with it, in effect 'bridging' the gap.
 */
void
range_tree_set_gap(range_tree_t *rt, uint64_t gap)
{
	ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_gap = gap;
}
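
/*
 * For example (an illustrative sketch; `rt' is assumed to be an otherwise
 * empty range tree whose lock is held by the caller):
 *
 *	range_tree_set_gap(rt, 0x100);
 *	range_tree_add(rt, 0x1000, 0x200);	creates [0x1000, 0x1200)
 *	range_tree_add(rt, 0x1280, 0x200);	the 0x80 gap is <= 0x100, so
 *						both ranges are bridged into
 *						the single segment
 *						[0x1000, 0x1480)
 */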

/*
 * Changes out the lock used by the range tree. Useful when you are moving
 * the range tree between containing structures without having to recreate
 * it. Both the old and new locks must be held by the caller.
 */
void
range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
{
	ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
	rt->rt_lock = lp;
}

static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

/*
 * NOTE: caller is responsible for all locking.
 */
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;

	if (r1->rs_start < r2->rs_start) {
		if (r1->rs_end > r2->rs_start)
			return (0);
		return (-1);
	}
	if (r1->rs_start > r2->rs_start) {
		if (r1->rs_start < r2->rs_end)
			return (0);
		return (1);
	}
	return (0);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_lock = lp;
	rt->rt_ops = ops;
	rt->rt_arg = arg;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

void
range_tree_add_fill(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size, gap_extra = 0;
	boolean_t merge_before, merge_after;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end &&
	    rt->rt_gap == 0) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	if (rt->rt_gap != 0) {
		if (rs != NULL) {
			if (rs->rs_start <= start && rs->rs_end >= end) {
				if (rt->rt_ops != NULL &&
				    rt->rt_ops->rtop_remove != NULL)
					rt->rt_ops->rtop_remove(rt, rs,
					    rt->rt_arg);
				rs->rs_fill += fill;
				if (rt->rt_ops != NULL &&
				    rt->rt_ops->rtop_add != NULL) {
					rt->rt_ops->rtop_add(rt, rs,
					    rt->rt_arg);
				}
				return;
			}
			ASSERT0(fill);
			if (rs->rs_start < start) {
				ASSERT3U(end, >, rs->rs_end);
				range_tree_add(rt, rs->rs_end, end -
				    rs->rs_end);
			} else {
				ASSERT3U(rs->rs_start, >, start);
				range_tree_add(rt, start, rs->rs_start - start);
			}
			return;
		}
	} else {
		/* Make sure we don't overlap with either of our neighbors */
		VERIFY(rs == NULL);
	}

	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL &&
	    start - rs_before->rs_end <= rt->rt_gap);
	merge_after = (rs_after != NULL &&
	    rs_after->rs_start - end <= rt->rt_gap);
	if (rt->rt_gap != 0) {
		if (merge_before)
			gap_extra += start - rs_before->rs_end;
		if (merge_after)
			gap_extra += rs_after->rs_start - end;
	}

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = rs_before->rs_start;
		rs_after->rs_fill += rs_before->rs_fill + fill;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_end = end;
		rs_before->rs_fill += fill;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = start;
		rs_after->rs_fill += fill;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		rs->rs_start = start;
		rs->rs_end = end;
		rs->rs_fill = fill;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size + gap_extra;
}

void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_add_fill(arg, start, size, 0);
}

static void
range_tree_remove_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill,
    uint64_t fill_left, boolean_t partial_overlap)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY3U(size, !=, 0);
	if (!partial_overlap) {
		VERIFY3U(size, <=, rt->rt_space);
	} else {
		VERIFY0(fill);
		VERIFY0(fill_left);
	}

	rsearch.rs_start = start;
	rsearch.rs_end = end;

	while ((rs = avl_find(&rt->rt_root, &rsearch, &where)) != NULL ||
	    !partial_overlap) {
		uint64_t overlap_sz;

		if (partial_overlap) {
			if (rs->rs_start <= start && rs->rs_end >= end)
				overlap_sz = size;
			else if (rs->rs_start > start && rs->rs_end < end)
				overlap_sz = rs->rs_end - rs->rs_start;
			else if (rs->rs_end < end)
				overlap_sz = rs->rs_end - start;
			else /* rs->rs_start > start */
				overlap_sz = end - rs->rs_start;
		} else {
			/* Make sure we completely overlapped with someone */
			if (rs == NULL) {
				zfs_panic_recover("zfs: freeing free segment "
				    "(offset=%llu size=%llu)",
				    (longlong_t)start, (longlong_t)size);
				return;
			}
			VERIFY3U(rs->rs_start, <=, start);
			VERIFY3U(rs->rs_end, >=, end);
			overlap_sz = size;
		}

		left_over = (rs->rs_start < start);
		right_over = (rs->rs_end > end);

		range_tree_stat_decr(rt, rs);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		if (left_over && right_over) {
			newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
			newseg->rs_start = end;
			newseg->rs_end = rs->rs_end;
			ASSERT3U(rs->rs_fill, >=, (fill + fill_left));
			newseg->rs_fill = rs->rs_fill - (fill + fill_left);
			range_tree_stat_incr(rt, newseg);

			rs->rs_end = start;
			rs->rs_fill = fill_left;

			avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
			if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
				rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
		} else if (left_over) {
			rs->rs_end = start;
			ASSERT3U(rs->rs_fill, >=, fill);
			rs->rs_fill -= fill;
		} else if (right_over) {
			rs->rs_start = end;
			ASSERT3U(rs->rs_fill, >=, fill);
			rs->rs_fill -= fill;
		} else {
			ASSERT3U(rs->rs_fill, ==, fill);
			ASSERT(fill == 0 || !partial_overlap);
			avl_remove(&rt->rt_root, rs);
			kmem_cache_free(range_seg_cache, rs);
			rs = NULL;
		}

		if (rs != NULL) {
			range_tree_stat_incr(rt, rs);

			if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
				rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
		}

		rt->rt_space -= overlap_sz;
		if (!partial_overlap) {
			/*
			 * There can't be any more segments overlapping with
			 * us, so no sense in performing an extra search.
			 */
			break;
		}
	}
}

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, 0, 0, B_FALSE);
}

void
range_tree_remove_overlap(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, 0, 0, B_TRUE);
}

void
range_tree_remove_fill(void *arg, uint64_t start, uint64_t size,
    uint64_t fill, uint64_t remain_left)
{
	range_tree_remove_impl(arg, start, size, fill, remain_left, B_FALSE);
}

static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	avl_index_t where;
	range_seg_t rsearch;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, &where));
}

void *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

/*
 * Given an extent start offset and size, looks through the provided
 * range tree and finds a suitable start offset (at or above `start') such
 * that the requested extent _doesn't_ overlap with any range segment in
 * the range tree.
 */
uint64_t
range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;
	while ((rs = range_tree_find_impl(rt, start, size)) != NULL)
		start = rs->rs_end;
	return (start);
}
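
/*
 * For example (an illustrative sketch; assume the tree holds the single
 * segment [0x1000, 0x2000) and its lock is held by the caller):
 *
 *	range_tree_find_gap(rt, 0x0, 0x800);	returns 0x0: no overlap
 *	range_tree_find_gap(rt, 0x800, 0x1000);	[0x800, 0x1800) overlaps the
 *						segment, so the search resumes
 *						at its end and returns 0x2000
 */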

void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs;

	mutex_enter(rt->rt_lock);
	rs = range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("freeing free block; rs=%p", (void *)rs);
	mutex_exit(rt->rt_lock);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}

/*
 * Ensure that this range is not in the tree by removing any portions of
 * it that are currently present.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);
		range_tree_remove(rt, free_start, free_end - free_start);
	}
}
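
/*
 * For example (an illustrative sketch; assume the tree holds the segments
 * [0x1000, 0x2000) and [0x3000, 0x4000) and its lock is held by the caller):
 *
 *	range_tree_clear(rt, 0x1800, 0x2000);	clears [0x1800, 0x3800),
 *						trimming the segments to
 *						[0x1000, 0x1800) and
 *						[0x3800, 0x4000)
 */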

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

range_seg_t *
range_tree_first(range_tree_t *rt)
{
	ASSERT(MUTEX_HELD(rt->rt_lock));
	return (avl_first(&rt->rt_root));
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}