NEX-10069 ZFS_READONLY is a little too strict (fix test lint)
NEX-9553 Move ss_fill gap logic from scan algorithm into range_tree.c
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9908 System crashes when a user starts scrub for a pool with enabled WBC
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9562 Attaching a vdev while resilver/scrub is running causes panic.
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-6088 ZFS scrub/resilver take excessively long due to issuing lots of random IO
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h

@@ -21,20 +21,64 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/range_tree.h>
 
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk
+ * or range_tree_vacate functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create and range_tree_create_custom functions.
+ * Any callbacks that are non-NULL are then called at the appropriate
+ * times.
+ *
+ * It is possible to store additional custom information with each
+ * segment. This is typically useful when using one or more of the
+ * operation callbacks mentioned above. To do so, use the
+ * range_tree_create_custom function to create your range tree and
+ * pass a custom kmem cache allocator. This allocator must allocate
+ * objects of at least sizeof (range_seg_t). The range tree will use
+ * the start of that object as a range_seg_t to keep its internal
+ * data structures and you can use the remainder of the object to
+ * store arbitrary additional fields as necessary.
+ */
+
 kmem_cache_t *range_seg_cache;
 
 void
 range_tree_init(void)
 {

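The comment block introduced above describes the interface at a high level. The following is a minimal usage sketch, not part of the change itself: the caller context, offsets and assertions are illustrative assumptions, while the functions and their signatures are the ones defined in this file (note the new kmutex_t argument to range_tree_create, and that range_tree_init() must already have set up range_seg_cache).

static void
range_tree_usage_sketch(void)   /* hypothetical caller, illustration only */
{
        kmutex_t lock;
        range_tree_t *rt;

        mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_enter(&lock);

        /* No operation callbacks and no custom segment cache. */
        rt = range_tree_create(NULL, NULL, &lock);

        range_tree_add(rt, 0x1000, 0x1000);     /* new extent [0x1000, 0x2000) */
        range_tree_add(rt, 0x2000, 0x1000);     /* extends it to [0x1000, 0x3000) */
        ASSERT3U(range_tree_space(rt), ==, 0x2000);

        /* Removing from the middle splits the extent in two. */
        range_tree_remove(rt, 0x1800, 0x400);
        ASSERT3U(range_tree_space(rt), ==, 0x1c00);

        range_tree_vacate(rt, NULL, NULL);      /* must be empty before destroy */
        range_tree_destroy(rt);

        mutex_exit(&lock);
        mutex_destroy(&lock);
}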
@@ -73,10 +117,34 @@
                 }
                 VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
         }
 }
 
+/*
+ * Sets the inter-segment gap. When a segment is subsequently added, the
+ * tree checks whether an adjacent segment is less than or equal to `gap'
+ * apart; if it is, the two segments are joined, in effect 'bridging' the gap.
+ */
+void
+range_tree_set_gap(range_tree_t *rt, uint64_t gap)
+{
+        ASSERT(MUTEX_HELD(rt->rt_lock));
+        rt->rt_gap = gap;
+}
+
+/*
+ * Changes out the lock used by the range tree. Useful when you are moving
+ * the range tree between containing structures without having to recreate
+ * it. Both the old and new locks must be held by the caller.
+ */
+void
+range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
+{
+        ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
+        rt->rt_lock = lp;
+}
+
 static void
 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
 {
         uint64_t size = rs->rs_end - rs->rs_start;
         int idx = highbit64(size) - 1;

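A short sketch of the gap bridging behavior that range_tree_set_gap() enables; `rt' is assumed to be a freshly created, empty range tree whose lock is held, and the offsets are illustrative:

        /* Join segments that land at most 512 bytes apart. */
        range_tree_set_gap(rt, 512);

        range_tree_add(rt, 0x10000, 0x1000);    /* [0x10000, 0x11000) */
        range_tree_add(rt, 0x11100, 0x1000);    /* 0x100 <= 512, so joined */

        /* One segment [0x10000, 0x12100); the bridged 0x100 bytes are counted. */
        ASSERT3U(range_tree_space(rt), ==, 0x2100);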
@@ -83,10 +151,11 @@
 
         ASSERT(size != 0);
         ASSERT3U(idx, <,
             sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
         rt->rt_histogram[idx]++;
         ASSERT3U(rt->rt_histogram[idx], !=, 0);
 }
 
 static void

@@ -97,10 +166,11 @@
 
         ASSERT(size != 0);
         ASSERT3U(idx, <,
             sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
         ASSERT3U(rt->rt_histogram[idx], !=, 0);
         rt->rt_histogram[idx]--;
 }
 
 /*

@@ -124,23 +194,22 @@
         }
         return (0);
 }
 
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
 {
-        range_tree_t *rt;
+        range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
 
-        rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
-
         avl_create(&rt->rt_root, range_tree_seg_compare,
             sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
 
+        rt->rt_lock = lp;
         rt->rt_ops = ops;
         rt->rt_arg = arg;
 
-        if (rt->rt_ops != NULL)
+        if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
                 rt->rt_ops->rtop_create(rt, rt->rt_arg);
 
         return (rt);
 }
 

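Since range_tree_create() (and the other entry points below) now check each callback individually for NULL, a consumer can register only the callbacks it cares about. A sketch under that assumption; the callback names and bodies are illustrative, while the range_tree_ops_t field names are the ones used throughout this file:

static void
my_index_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
        /* e.g. mirror the segment into a size-sorted secondary index */
}

static void
my_index_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
        /* drop the segment from the secondary index */
}

static range_tree_ops_t my_index_ops = {
        .rtop_create    = NULL,
        .rtop_destroy   = NULL,
        .rtop_add       = my_index_add,
        .rtop_remove    = my_index_remove,
        .rtop_vacate    = NULL,
};

/* ... later, with the lock held: rt = range_tree_create(&my_index_ops, NULL, &lock); */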
@@ -147,187 +216,311 @@
 void
 range_tree_destroy(range_tree_t *rt)
 {
         VERIFY0(rt->rt_space);
 
-        if (rt->rt_ops != NULL)
+        if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
                 rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
         avl_destroy(&rt->rt_root);
         kmem_free(rt, sizeof (*rt));
 }
 
 void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_add_fill(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
         range_tree_t *rt = arg;
         avl_index_t where;
         range_seg_t rsearch, *rs_before, *rs_after, *rs;
-        uint64_t end = start + size;
+        uint64_t end = start + size, gap_extra = 0;
         boolean_t merge_before, merge_after;
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
         VERIFY(size != 0);
 
         rsearch.rs_start = start;
         rsearch.rs_end = end;
         rs = avl_find(&rt->rt_root, &rsearch, &where);
 
-        if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+        if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end &&
+            rt->rt_gap == 0) {
                 zfs_panic_recover("zfs: allocating allocated segment"
                     "(offset=%llu size=%llu)\n",
                     (longlong_t)start, (longlong_t)size);
                 return;
         }
 
+        if (rt->rt_gap != 0) {
+                if (rs != NULL) {
+                        if (rs->rs_start <= start && rs->rs_end >= end) {
+                                if (rt->rt_ops != NULL &&
+                                    rt->rt_ops->rtop_remove != NULL)
+                                        rt->rt_ops->rtop_remove(rt, rs,
+                                            rt->rt_arg);
+                                rs->rs_fill += fill;
+                                if (rt->rt_ops != NULL &&
+                                    rt->rt_ops->rtop_add != NULL) {
+                                        rt->rt_ops->rtop_add(rt, rs,
+                                            rt->rt_arg);
+                                }
+                                return;
+                        }
+                        ASSERT0(fill);
+                        if (rs->rs_start < start) {
+                                ASSERT3U(end, >, rs->rs_end);
+                                range_tree_add(rt, rs->rs_end, end -
+                                    rs->rs_end);
+                        } else {
+                                ASSERT3U(rs->rs_start, >, start);
+                                range_tree_add(rt, start, rs->rs_start - start);
+                        }
+                        return;
+                }
+        } else {
         /* Make sure we don't overlap with either of our neighbors */
         VERIFY(rs == NULL);
+        }
 
         rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
         rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
 
-        merge_before = (rs_before != NULL && rs_before->rs_end == start);
-        merge_after = (rs_after != NULL && rs_after->rs_start == end);
+        merge_before = (rs_before != NULL &&
+            start - rs_before->rs_end <= rt->rt_gap);
+        merge_after = (rs_after != NULL &&
+            rs_after->rs_start - end <= rt->rt_gap);
+        if (rt->rt_gap != 0) {
+                if (merge_before)
+                        gap_extra += start - rs_before->rs_end;
+                if (merge_after)
+                        gap_extra += rs_after->rs_start - end;
+        }
 
         if (merge_before && merge_after) {
                 avl_remove(&rt->rt_root, rs_before);
-                if (rt->rt_ops != NULL) {
+                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
                         rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
                         rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
                 }
 
                 range_tree_stat_decr(rt, rs_before);
                 range_tree_stat_decr(rt, rs_after);
 
                 rs_after->rs_start = rs_before->rs_start;
+                rs_after->rs_fill += rs_before->rs_fill + fill;
                 kmem_cache_free(range_seg_cache, rs_before);
                 rs = rs_after;
         } else if (merge_before) {
-                if (rt->rt_ops != NULL)
+                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                         rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 
                 range_tree_stat_decr(rt, rs_before);
 
                 rs_before->rs_end = end;
+                rs_before->rs_fill += fill;
                 rs = rs_before;
         } else if (merge_after) {
-                if (rt->rt_ops != NULL)
+                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                         rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 
                 range_tree_stat_decr(rt, rs_after);
 
                 rs_after->rs_start = start;
+                rs_after->rs_fill += fill;
                 rs = rs_after;
         } else {
                 rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
                 rs->rs_start = start;
                 rs->rs_end = end;
+                rs->rs_fill = fill;
                 avl_insert(&rt->rt_root, rs, where);
         }
 
-        if (rt->rt_ops != NULL)
+        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                 rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 
         range_tree_stat_incr(rt, rs);
-        rt->rt_space += size;
+        rt->rt_space += size + gap_extra;
 }
 
 void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
 {
+        range_tree_add_fill(arg, start, size, 0);
+}
+
+static void
+range_tree_remove_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill,
+    uint64_t fill_left, boolean_t partial_overlap)
+{
         range_tree_t *rt = arg;
         avl_index_t where;
         range_seg_t rsearch, *rs, *newseg;
         uint64_t end = start + size;
         boolean_t left_over, right_over;
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
         VERIFY3U(size, !=, 0);
+        if (!partial_overlap) {
         VERIFY3U(size, <=, rt->rt_space);
+        } else {
+                VERIFY0(fill);
+                VERIFY0(fill_left);
+        }
 
         rsearch.rs_start = start;
         rsearch.rs_end = end;
-        rs = avl_find(&rt->rt_root, &rsearch, &where);
 
-        /* Make sure we completely overlap with someone */
+        while ((rs = avl_find(&rt->rt_root, &rsearch, &where)) != NULL ||
+            !partial_overlap) {
+                uint64_t overlap_sz;
+
+                if (partial_overlap) {
+                        if (rs->rs_start <= start && rs->rs_end >= end)
+                                overlap_sz = size;
+                        else if (rs->rs_start > start && rs->rs_end < end)
+                                overlap_sz = rs->rs_end - rs->rs_start;
+                        else if (rs->rs_end < end)
+                                overlap_sz = rs->rs_end - start;
+                        else    /* rs->rs_start > start */
+                                overlap_sz = end - rs->rs_start;
+                } else {
+                        /* Make sure we completely overlapped with someone */
         if (rs == NULL) {
                 zfs_panic_recover("zfs: freeing free segment "
                     "(offset=%llu size=%llu)",
                     (longlong_t)start, (longlong_t)size);
                 return;
         }
         VERIFY3U(rs->rs_start, <=, start);
         VERIFY3U(rs->rs_end, >=, end);
+                        overlap_sz = size;
+                }
 
-        left_over = (rs->rs_start != start);
-        right_over = (rs->rs_end != end);
+                left_over = (rs->rs_start < start);
+                right_over = (rs->rs_end > end);
 
         range_tree_stat_decr(rt, rs);
 
-        if (rt->rt_ops != NULL)
+                if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                 rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
         if (left_over && right_over) {
                 newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
                 newseg->rs_start = end;
                 newseg->rs_end = rs->rs_end;
+                        ASSERT3U(rs->rs_fill, >=, (fill + fill_left));
+                        newseg->rs_fill = rs->rs_fill - (fill + fill_left);
                 range_tree_stat_incr(rt, newseg);
 
                 rs->rs_end = start;
+                        rs->rs_fill = fill_left;
 
                 avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
-                if (rt->rt_ops != NULL)
+                        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                         rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
         } else if (left_over) {
                 rs->rs_end = start;
+                        ASSERT3U(rs->rs_fill, >=, fill);
+                        rs->rs_fill -= fill;
         } else if (right_over) {
                 rs->rs_start = end;
+                        ASSERT3U(rs->rs_fill, >=, fill);
+                        rs->rs_fill -= fill;
         } else {
+                        ASSERT3U(rs->rs_fill, ==, fill);
+                        ASSERT(fill == 0 || !partial_overlap);
                 avl_remove(&rt->rt_root, rs);
                 kmem_cache_free(range_seg_cache, rs);
                 rs = NULL;
         }
 
         if (rs != NULL) {
                 range_tree_stat_incr(rt, rs);
 
-                if (rt->rt_ops != NULL)
+                        if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                         rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
         }
 
-        rt->rt_space -= size;
+                rt->rt_space -= overlap_sz;
+                if (!partial_overlap) {
+                        /*
+                         * There can't be any more segments overlapping with
+                         * us, so no sense in performing an extra search.
+                         */
+                        break;
+                }
+        }
 }
 
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+        range_tree_remove_impl(arg, start, size, 0, 0, B_FALSE);
+}
+
+void
+range_tree_remove_overlap(void *arg, uint64_t start, uint64_t size)
+{
+        range_tree_remove_impl(arg, start, size, 0, 0, B_TRUE);
+}
+
+void
+range_tree_remove_fill(void *arg, uint64_t start, uint64_t size,
+    uint64_t fill, uint64_t remain_left)
+{
+        range_tree_remove_impl(arg, start, size, fill, remain_left, B_FALSE);
+}
+
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
         avl_index_t where;
         range_seg_t rsearch;
         uint64_t end = start + size;
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
         VERIFY(size != 0);
 
         rsearch.rs_start = start;
         rsearch.rs_end = end;
         return (avl_find(&rt->rt_root, &rsearch, &where));
 }
 
-static range_seg_t *
+void *
 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 {
         range_seg_t *rs = range_tree_find_impl(rt, start, size);
         if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
                 return (rs);
         return (NULL);
 }
 
+/*
+ * Given an extent start offset and size, looks through the provided range
+ * tree and finds a suitable start offset (at or after `start') such that
+ * the requested extent _doesn't_ overlap with any range segment in the
+ * range tree.
+ */
+uint64_t
+range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+        range_seg_t *rs;
+        while ((rs = range_tree_find_impl(rt, start, size)) != NULL)
+                start = rs->rs_end;
+        return (start);
+}
+
 void
 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
 {
         range_seg_t *rs;
 
+        mutex_enter(rt->rt_lock);
         rs = range_tree_find(rt, off, size);
         if (rs != NULL)
                 panic("freeing free block; rs=%p", (void *)rs);
+        mutex_exit(rt->rt_lock);
 }
 
 boolean_t
 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
 {

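A sketch of the new removal and lookup variants; `rt' is assumed empty with its lock held, and the offsets are illustrative. range_tree_remove_overlap() trims whatever portions of the requested range are actually present rather than requiring a single covering segment, and range_tree_find_gap() walks forward from `start' until the requested extent no longer overlaps anything:

        range_tree_add(rt, 0x0000, 0x1000);     /* [0x0000, 0x1000) */
        range_tree_add(rt, 0x2000, 0x1000);     /* [0x2000, 0x3000) */

        /*
         * A plain range_tree_remove() of [0x800, 0x2800) would fail its
         * VERIFYs, since no single segment covers that range.  The overlap
         * variant trims both segments instead.
         */
        range_tree_remove_overlap(rt, 0x800, 0x2000);
        /* Left with [0x0000, 0x0800) and [0x2800, 0x3000). */
        ASSERT3U(range_tree_space(rt), ==, 0x1000);

        /* First offset at or after 0 where a 0x1000-byte extent fits. */
        ASSERT3U(range_tree_find_gap(rt, 0x0, 0x1000), ==, 0x800);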
@@ -341,13 +534,10 @@
 void
 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
 {
         range_seg_t *rs;
 
-        if (size == 0)
-                return;
-
         while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
                 uint64_t free_start = MAX(rs->rs_start, start);
                 uint64_t free_end = MIN(rs->rs_end, start + size);
                 range_tree_remove(rt, free_start, free_end - free_start);
         }

@@ -356,10 +546,11 @@
 void
 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
 {
         range_tree_t *rt;
 
+        ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
         ASSERT0(range_tree_space(*rtdst));
         ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
 
         rt = *rtsrc;
         *rtsrc = *rtdst;

@@ -370,12 +561,13 @@
 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
 {
         range_seg_t *rs;
         void *cookie = NULL;
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
 
-        if (rt->rt_ops != NULL)
+        if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
                 rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
 
         while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
                 if (func != NULL)
                         func(arg, rs->rs_start, rs->rs_end - rs->rs_start);

@@ -389,15 +581,23 @@
 void
 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
 {
         range_seg_t *rs;
 
+        ASSERT(MUTEX_HELD(rt->rt_lock));
 
         for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
                 func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
 }
 
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+        ASSERT(MUTEX_HELD(rt->rt_lock));
+        return (avl_first(&rt->rt_root));
+}
+
 uint64_t
 range_tree_space(range_tree_t *rt)
 {
         return (rt->rt_space);
 }
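Finally, a traversal sketch with the lock held; print_seg() is an illustrative callback and zfs_dbgmsg() is used only for demonstration. range_tree_walk() visits every segment in offset order, while the new range_tree_first() peeks at the lowest segment without iterating:

static void
print_seg(void *arg, uint64_t start, uint64_t size)
{
        zfs_dbgmsg("seg [%llu, %llu)", (u_longlong_t)start,
            (u_longlong_t)(start + size));
}

/* ... with rt->rt_lock held: */
        range_tree_walk(rt, print_seg, NULL);

        range_seg_t *first = range_tree_first(rt);
        if (first != NULL)
                zfs_dbgmsg("lowest segment starts at %llu",
                    (u_longlong_t)first->rs_start);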