/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>

/*
 * Range trees are tree-based data structures that can be used to
 * track free space or generally any space allocation information.
 * A range tree keeps track of individual segments and automatically
 * provides facilities such as adjacent extent merging and extent
 * splitting in response to range add/remove requests.
 *
 * A range tree starts out completely empty, with no segments in it.
 * Adding an allocation to the range tree via range_tree_add() can either:
 * 1) create a new extent
 * 2) extend an adjacent extent
 * 3) merge two adjacent extents
 * Conversely, removing an allocation via range_tree_remove() can:
 * 1) completely remove an extent
 * 2) shorten an extent (if the allocation was near one of its ends)
 * 3) split an extent into two extents, in effect punching a hole
 *
 * A range tree is also capable of 'bridging' gaps when adding
 * allocations. This is useful for cases when close proximity of
 * allocations is an important detail that needs to be represented
 * in the range tree. See range_tree_set_gap(). The default behavior
 * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
 *
 * In order to traverse a range tree, use either the range_tree_walk
 * or range_tree_vacate functions.
 *
 * To obtain more detailed information on the individual segment
 * operations that the range tree performs "under the hood", you can
 * specify a set of callbacks by passing a range_tree_ops_t structure
 * to the range_tree_create and range_tree_create_custom functions.
 * Any callbacks that are non-NULL are then called at the appropriate
 * times.
 *
 * It is possible to store additional custom information with each
 * segment. This is typically useful when using one or more of the
 * operation callbacks mentioned above. To do so, use the
 * range_tree_create_custom function to create your range tree and
 * pass a custom kmem cache allocator. This allocator must allocate
 * objects of at least sizeof (range_seg_t). The range tree will use
 * the start of that object as a range_seg_t to keep its internal
 * data structures and you can use the remainder of the object to
 * store arbitrary additional fields as necessary.
 */
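
/*
 * Illustrative sketch of basic usage, assuming a caller-provided
 * kmutex_t and no operation callbacks. The names `lock' and `rt' are
 * hypothetical; this example lives in the comment only and is not
 * compiled as part of this file.
 *
 *	kmutex_t lock;
 *	range_tree_t *rt;
 *
 *	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
 *	mutex_enter(&lock);
 *	rt = range_tree_create(NULL, NULL, &lock);
 *	range_tree_add(rt, 0x1000, 0x1000);
 *	range_tree_add(rt, 0x2000, 0x1000);
 *	range_tree_remove(rt, 0x1800, 0x400);
 *	range_tree_vacate(rt, NULL, NULL);
 *	range_tree_destroy(rt);
 *	mutex_exit(&lock);
 *	mutex_destroy(&lock);
 *
 * The two adds merge into the single segment [0x1000, 0x3000), and the
 * remove then splits that segment in two, in effect punching a hole.
 */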

kmem_cache_t *range_seg_cache;

void
range_tree_init(void)
{
	ASSERT(range_seg_cache == NULL);
	range_seg_cache = kmem_cache_create("range_seg_cache",
	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
range_tree_fini(void)
{
	kmem_cache_destroy(range_seg_cache);
	range_seg_cache = NULL;
}

void
range_tree_stat_verify(range_tree_t *rt)
{
	range_seg_t *rs;
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
	int i;

	for (rs = avl_first(&rt->rt_root); rs != NULL;
	    rs = AVL_NEXT(&rt->rt_root, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;
		int idx = highbit64(size) - 1;

		hist[idx]++;
		ASSERT3U(hist[idx], !=, 0);
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (hist[i] != rt->rt_histogram[i]) {
			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
			    i, hist, hist[i], rt->rt_histogram[i]);
		}
		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
	}
}

/*
 * Sets the inter-segment gap. When segments are subsequently added, any
 * adjacent segment that is no more than `gap' apart is joined with the
 * new segment, in effect 'bridging' the gap.
 */
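/*
 * For example (an illustrative sketch with hypothetical offsets): after
 * range_tree_set_gap(rt, 0x100), the two calls
 *
 *	range_tree_add(rt, 0x1000, 0x1000);
 *	range_tree_add(rt, 0x2080, 0x1000);
 *
 * produce the single segment [0x1000, 0x3080), because the 0x80-byte
 * gap between them is within the configured limit and is bridged. The
 * bridged bytes are also counted in range_tree_space().
 */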
void
range_tree_set_gap(range_tree_t *rt, uint64_t gap)
{
	ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_gap = gap;
}

/*
 * Changes out the lock used by the range tree. Useful when you are moving
 * the range tree between containing structures without having to recreate
 * it. Both the old and new locks must be held by the caller.
 */
void
range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
{
	ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
	rt->rt_lock = lp;
}

static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	rt->rt_histogram[idx]++;
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
}

static void
range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
{
	uint64_t size = rs->rs_end - rs->rs_start;
	int idx = highbit64(size) - 1;

	ASSERT(size != 0);
	ASSERT3U(idx, <,
	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));

	ASSERT(MUTEX_HELD(rt->rt_lock));
	ASSERT3U(rt->rt_histogram[idx], !=, 0);
	rt->rt_histogram[idx]--;
}

/*
 * NOTE: caller is responsible for all locking.
 */
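/*
 * Overlapping segments compare as equal, so an avl_find() with a search
 * segment of [start, end) returns some segment that overlaps that range,
 * not necessarily one that contains it. For example, a search for
 * [10, 20) would match an existing segment [15, 30).
 */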
static int
range_tree_seg_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;

	if (r1->rs_start < r2->rs_start) {
		if (r1->rs_end > r2->rs_start)
			return (0);
		return (-1);
	}
	if (r1->rs_start > r2->rs_start) {
		if (r1->rs_start < r2->rs_end)
			return (0);
		return (1);
	}
	return (0);
}

range_tree_t *
range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);

	avl_create(&rt->rt_root, range_tree_seg_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));

	rt->rt_lock = lp;
	rt->rt_ops = ops;
	rt->rt_arg = arg;

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
		rt->rt_ops->rtop_create(rt, rt->rt_arg);

	return (rt);
}

void
range_tree_destroy(range_tree_t *rt)
{
	VERIFY0(rt->rt_space);

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

	avl_destroy(&rt->rt_root);
	kmem_free(rt, sizeof (*rt));
}

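/*
 * Adds the segment [start, start + size) to the tree, merging it with
 * adjacent segments (and, if a gap has been configured via
 * range_tree_set_gap(), with segments within `gap' of it). `fill' is
 * added to the resulting segment's rs_fill and is only meaningful to
 * callers that track per-segment fill; plain range_tree_add() passes 0.
 */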
void
range_tree_add_fill(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs_before, *rs_after, *rs;
	uint64_t end = start + size, gap_extra = 0;
	boolean_t merge_before, merge_after;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	rs = avl_find(&rt->rt_root, &rsearch, &where);

	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end &&
	    rt->rt_gap == 0) {
		zfs_panic_recover("zfs: allocating allocated segment "
		    "(offset=%llu size=%llu)\n",
		    (longlong_t)start, (longlong_t)size);
		return;
	}

	if (rt->rt_gap != 0) {
		if (rs != NULL) {
			if (rs->rs_start <= start && rs->rs_end >= end) {
				if (rt->rt_ops != NULL &&
				    rt->rt_ops->rtop_remove != NULL)
					rt->rt_ops->rtop_remove(rt, rs,
					    rt->rt_arg);
				rs->rs_fill += fill;
				if (rt->rt_ops != NULL &&
				    rt->rt_ops->rtop_add != NULL) {
					rt->rt_ops->rtop_add(rt, rs,
					    rt->rt_arg);
				}
				return;
			}
			ASSERT0(fill);
			if (rs->rs_start < start) {
				ASSERT3U(end, >, rs->rs_end);
				range_tree_add(rt, rs->rs_end, end -
				    rs->rs_end);
			} else {
				ASSERT3U(rs->rs_start, >, start);
				range_tree_add(rt, start, rs->rs_start - start);
			}
			return;
		}
	} else {
		/* Make sure we don't overlap with either of our neighbors */
		VERIFY(rs == NULL);
	}

	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

	merge_before = (rs_before != NULL &&
	    start - rs_before->rs_end <= rt->rt_gap);
	merge_after = (rs_after != NULL &&
	    rs_after->rs_start - end <= rt->rt_gap);
	if (rt->rt_gap != 0) {
		if (merge_before)
			gap_extra += start - rs_before->rs_end;
		if (merge_after)
			gap_extra += rs_after->rs_start - end;
	}

	if (merge_before && merge_after) {
		avl_remove(&rt->rt_root, rs_before);
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
		}

		range_tree_stat_decr(rt, rs_before);
		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = rs_before->rs_start;
		rs_after->rs_fill += rs_before->rs_fill + fill;
		kmem_cache_free(range_seg_cache, rs_before);
		rs = rs_after;
	} else if (merge_before) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);

		range_tree_stat_decr(rt, rs_before);

		rs_before->rs_end = end;
		rs_before->rs_fill += fill;
		rs = rs_before;
	} else if (merge_after) {
		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);

		range_tree_stat_decr(rt, rs_after);

		rs_after->rs_start = start;
		rs_after->rs_fill += fill;
		rs = rs_after;
	} else {
		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
		rs->rs_start = start;
		rs->rs_end = end;
		rs->rs_fill = fill;
		avl_insert(&rt->rt_root, rs, where);
	}

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);

	range_tree_stat_incr(rt, rs);
	rt->rt_space += size + gap_extra;
}

void
range_tree_add(void *arg, uint64_t start, uint64_t size)
{
	range_tree_add_fill(arg, start, size, 0);
}

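/*
 * Removes [start, start + size) from the tree. With partial_overlap
 * set, the range may span any mix of allocated and free space and each
 * overlapping portion is trimmed away; otherwise the range must lie
 * entirely within a single existing segment. `fill' is the amount of
 * fill being removed and `fill_left' is the fill credited to the
 * left-hand remainder when a segment is split in two; both must be
 * zero when partial_overlap is set.
 */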
static void
range_tree_remove_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill,
    uint64_t fill_left, boolean_t partial_overlap)
{
	range_tree_t *rt = arg;
	avl_index_t where;
	range_seg_t rsearch, *rs, *newseg;
	uint64_t end = start + size;
	boolean_t left_over, right_over;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY3U(size, !=, 0);
	if (!partial_overlap) {
		VERIFY3U(size, <=, rt->rt_space);
	} else {
		VERIFY0(fill);
		VERIFY0(fill_left);
	}

	rsearch.rs_start = start;
	rsearch.rs_end = end;

	while ((rs = avl_find(&rt->rt_root, &rsearch, &where)) != NULL ||
	    !partial_overlap) {
		uint64_t overlap_sz;

		if (partial_overlap) {
			if (rs->rs_start <= start && rs->rs_end >= end)
				overlap_sz = size;
			else if (rs->rs_start > start && rs->rs_end < end)
				overlap_sz = rs->rs_end - rs->rs_start;
			else if (rs->rs_end < end)
				overlap_sz = rs->rs_end - start;
			else	/* rs->rs_start > start */
				overlap_sz = end - rs->rs_start;
		} else {
			/* Make sure we completely overlapped with someone */
			if (rs == NULL) {
				zfs_panic_recover("zfs: freeing free segment "
				    "(offset=%llu size=%llu)",
				    (longlong_t)start, (longlong_t)size);
				return;
			}
			VERIFY3U(rs->rs_start, <=, start);
			VERIFY3U(rs->rs_end, >=, end);
			overlap_sz = size;
		}

		left_over = (rs->rs_start < start);
		right_over = (rs->rs_end > end);

		range_tree_stat_decr(rt, rs);

		if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
			rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);

		if (left_over && right_over) {
			newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
			newseg->rs_start = end;
			newseg->rs_end = rs->rs_end;
			ASSERT3U(rs->rs_fill, >=, (fill + fill_left));
			newseg->rs_fill = rs->rs_fill - (fill + fill_left);
			range_tree_stat_incr(rt, newseg);

			rs->rs_end = start;
			rs->rs_fill = fill_left;

			avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
			if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
				rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
		} else if (left_over) {
			rs->rs_end = start;
			ASSERT3U(rs->rs_fill, >=, fill);
			rs->rs_fill -= fill;
		} else if (right_over) {
			rs->rs_start = end;
			ASSERT3U(rs->rs_fill, >=, fill);
			rs->rs_fill -= fill;
		} else {
			ASSERT3U(rs->rs_fill, ==, fill);
			ASSERT(fill == 0 || !partial_overlap);
			avl_remove(&rt->rt_root, rs);
			kmem_cache_free(range_seg_cache, rs);
			rs = NULL;
		}

		if (rs != NULL) {
			range_tree_stat_incr(rt, rs);

			if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
				rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
		}

		rt->rt_space -= overlap_sz;
		if (!partial_overlap) {
			/*
			 * There can't be any more segments overlapping with
			 * us, so no sense in performing an extra search.
			 */
			break;
		}
	}
}

void
range_tree_remove(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, 0, 0, B_FALSE);
}

void
range_tree_remove_overlap(void *arg, uint64_t start, uint64_t size)
{
	range_tree_remove_impl(arg, start, size, 0, 0, B_TRUE);
}

void
range_tree_remove_fill(void *arg, uint64_t start, uint64_t size,
    uint64_t fill, uint64_t remain_left)
{
	range_tree_remove_impl(arg, start, size, fill, remain_left, B_FALSE);
}

static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
	avl_index_t where;
	range_seg_t rsearch;
	uint64_t end = start + size;

	ASSERT(MUTEX_HELD(rt->rt_lock));
	VERIFY(size != 0);

	rsearch.rs_start = start;
	rsearch.rs_end = end;
	return (avl_find(&rt->rt_root, &rsearch, &where));
}

void *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs = range_tree_find_impl(rt, start, size);
	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
		return (rs);
	return (NULL);
}

/*
 * Given an extent start offset and size, looks through the range tree
 * and finds a suitable start offset (greater than or equal to `start')
 * such that the requested extent does _not_ overlap with any segment
 * in the tree.
 */
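/*
 * For example (an illustrative sketch with hypothetical offsets): if
 * the tree contains the segments [0x1000, 0x2000) and [0x3000, 0x3800),
 * then
 *
 *	range_tree_find_gap(rt, 0x1800, 0x200);
 *
 * skips past the first segment and returns 0x2000, since the extent
 * [0x2000, 0x2200) does not overlap any segment in the tree.
 */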
uint64_t
range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;
	while ((rs = range_tree_find_impl(rt, start, size)) != NULL)
		start = rs->rs_end;
	return (start);
}

void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
	range_seg_t *rs;

	mutex_enter(rt->rt_lock);
	rs = range_tree_find(rt, off, size);
	if (rs != NULL)
		panic("freeing free block; rs=%p", (void *)rs);
	mutex_exit(rt->rt_lock);
}

boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
	return (range_tree_find(rt, start, size) != NULL);
}

/*
 * Ensures that the given range is not in the tree: any portion of it
 * that is currently present is removed. It is fine if none, part, or
 * all of the range is already in the tree.
 */
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
	range_seg_t *rs;

	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
		uint64_t free_start = MAX(rs->rs_start, start);
		uint64_t free_end = MIN(rs->rs_end, start + size);
		range_tree_remove(rt, free_start, free_end - free_start);
	}
}

void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
	range_tree_t *rt;

	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
	ASSERT0(range_tree_space(*rtdst));
	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));

	rt = *rtsrc;
	*rtsrc = *rtdst;
	*rtdst = rt;
}

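/*
 * Removes every segment from the tree, invoking `func' (if non-NULL) on
 * each one before it is freed. A hedged usage sketch, assuming a
 * hypothetical callback `dump_seg' and an existing tree `rt' whose lock
 * is held (documentation only, not compiled here):
 *
 *	static void
 *	dump_seg(void *arg, uint64_t start, uint64_t size)
 *	{
 *		zfs_dbgmsg("seg [%llu, %llu)", (longlong_t)start,
 *		    (longlong_t)(start + size));
 *	}
 *
 *	range_tree_vacate(rt, dump_seg, NULL);
 */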
void
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;
	void *cookie = NULL;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);

	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
		if (func != NULL)
			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
		kmem_cache_free(range_seg_cache, rs);
	}

	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
	rt->rt_space = 0;
}

void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(rt->rt_lock));

	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}

range_seg_t *
range_tree_first(range_tree_t *rt)
{
	ASSERT(MUTEX_HELD(rt->rt_lock));
	return (avl_first(&rt->rt_root));
}

uint64_t
range_tree_space(range_tree_t *rt)
{
	return (rt->rt_space);
}