NEX-10069 ZFS_READONLY is a little too strict (fix test lint)
NEX-9553 Move ss_fill gap logic from scan algorithm into range_tree.c
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-9908 System crashes when a user starts scrub for a pool with enabled WBC
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9562 Attaching a vdev while resilver/scrub is running causes panic.
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-6088 ZFS scrub/resilver take excessively long due to issuing lots of random IO
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
usr/src/uts/common/io/scsi/targets/sd.c
usr/src/uts/common/sys/scsi/targets/sddef.h
@@ -21,20 +21,64 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/dnode.h>
#include <sys/zio.h>
#include <sys/range_tree.h>
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add can:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk
+ * or range_tree_vacate functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create and range_tree_create_custom functions.
+ * Any callbacks that are non-NULL are then called at the appropriate
+ * times.
+ *
+ * It is possible to store additional custom information with each
+ * segment. This is typically useful when using one or more of the
+ * operation callbacks mentioned above. To do so, use the
+ * range_tree_create_custom function to create your range tree and
+ * pass a custom kmem cache allocator. This allocator must allocate
+ * objects of at least sizeof (range_seg_t). The range tree will use
+ * the start of that object as a range_seg_t to keep its internal
+ * data structures and you can use the remainder of the object to
+ * store arbitrary additional fields as necessary.
+ */
+
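To make the interface above concrete, here is a minimal usage sketch against the signatures introduced by this change (note that range_tree_create() now takes the caller's lock). The lock, the offsets, and the print_seg/range_tree_example names are illustrative only:

static void
print_seg(void *arg, uint64_t start, uint64_t size)
{
	cmn_err(CE_NOTE, "segment [%llu, %llu)",
	    (u_longlong_t)start, (u_longlong_t)(start + size));
}

static void
range_tree_example(kmutex_t *lock)
{
	range_tree_t *rt;

	mutex_enter(lock);
	rt = range_tree_create(NULL, NULL, lock);	/* no op callbacks */

	range_tree_add(rt, 0x1000, 0x1000);	/* new extent: [0x1000, 0x2000) */
	range_tree_add(rt, 0x2000, 0x1000);	/* merges into [0x1000, 0x3000) */
	range_tree_remove(rt, 0x1800, 0x800);	/* punches a hole, leaving */
						/* [0x1000, 0x1800) and [0x2000, 0x3000) */

	range_tree_walk(rt, print_seg, NULL);	/* visits both remaining segments */

	range_tree_vacate(rt, NULL, NULL);	/* tree must be empty before destroy */
	range_tree_destroy(rt);
	mutex_exit(lock);
}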
kmem_cache_t *range_seg_cache;
void
range_tree_init(void)
{
@@ -73,10 +117,34 @@
}
VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
}
}
+/*
+ * Sets the inter-segment gap. When a segment is subsequently added, the
+ * tree checks whether an adjacent segment is less than or equal to `gap'
+ * apart; if it is, the two are joined together, in effect 'bridging'
+ * the gap.
+ */
+void
+range_tree_set_gap(range_tree_t *rt, uint64_t gap)
+{
+ ASSERT(MUTEX_HELD(rt->rt_lock));
+ rt->rt_gap = gap;
+}
+
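A short sketch of the bridging behavior (the 512-byte gap and the offsets are hypothetical): with the gap set, two segments that sit 512 bytes apart are joined, and rt_space grows by the bridged bytes as well:

static void
gap_example(range_tree_t *rt, kmutex_t *lock)
{
	mutex_enter(lock);
	range_tree_set_gap(rt, 512);
	range_tree_add(rt, 0, 4096);	/* segment [0, 4096) */
	range_tree_add(rt, 4608, 4096);	/* 512 bytes away: bridged into the */
					/* single segment [0, 8704) */
	mutex_exit(lock);
}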
+/*
+ * Changes out the lock used by the range tree. Useful when you are moving
+ * the range tree between containing structures without having to recreate
+ * it. Both the old and new locks must be held by the caller.
+ */
+void
+range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
+{
+ ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
+ rt->rt_lock = lp;
+}
+
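For instance, a caller rehoming a range tree into a new containing structure might do the following (function and lock names hypothetical; lock ordering is the caller's concern):

static void
rehome_tree(range_tree_t *rt, kmutex_t *old_lock, kmutex_t *new_lock)
{
	mutex_enter(old_lock);
	mutex_enter(new_lock);
	range_tree_set_lock(rt, new_lock);	/* rt is now guarded by new_lock */
	mutex_exit(old_lock);
	mutex_exit(new_lock);
}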
static void
range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
{
uint64_t size = rs->rs_end - rs->rs_start;
int idx = highbit64(size) - 1;
@@ -83,10 +151,11 @@
ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+ ASSERT(MUTEX_HELD(rt->rt_lock));
rt->rt_histogram[idx]++;
ASSERT3U(rt->rt_histogram[idx], !=, 0);
}
static void
@@ -97,10 +166,11 @@
ASSERT(size != 0);
ASSERT3U(idx, <,
sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+ ASSERT(MUTEX_HELD(rt->rt_lock));
ASSERT3U(rt->rt_histogram[idx], !=, 0);
rt->rt_histogram[idx]--;
}
/*
@@ -124,23 +194,22 @@
}
return (0);
}
range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg)
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
{
- range_tree_t *rt;
+ range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
- rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
-
avl_create(&rt->rt_root, range_tree_seg_compare,
sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+ rt->rt_lock = lp;
rt->rt_ops = ops;
rt->rt_arg = arg;
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
rt->rt_ops->rtop_create(rt, rt->rt_arg);
return (rt);
}
@@ -147,187 +216,311 @@
void
range_tree_destroy(range_tree_t *rt)
{
VERIFY0(rt->rt_space);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
avl_destroy(&rt->rt_root);
kmem_free(rt, sizeof (*rt));
}
void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_add_fill(void *arg, uint64_t start, uint64_t size, uint64_t fill)
{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs_before, *rs_after, *rs;
- uint64_t end = start + size;
+ uint64_t end = start + size, gap_extra = 0;
boolean_t merge_before, merge_after;
+ ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
rs = avl_find(&rt->rt_root, &rsearch, &where);
- if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+ if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end &&
+ rt->rt_gap == 0) {
zfs_panic_recover("zfs: allocating allocated segment"
"(offset=%llu size=%llu)\n",
(longlong_t)start, (longlong_t)size);
return;
}
+ if (rt->rt_gap != 0) {
+ if (rs != NULL) {
+ if (rs->rs_start <= start && rs->rs_end >= end) {
+ if (rt->rt_ops != NULL &&
+ rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs,
+ rt->rt_arg);
+ rs->rs_fill += fill;
+ if (rt->rt_ops != NULL &&
+ rt->rt_ops->rtop_add != NULL) {
+ rt->rt_ops->rtop_add(rt, rs,
+ rt->rt_arg);
+ }
+ return;
+ }
+ ASSERT0(fill);
+ if (rs->rs_start < start) {
+ ASSERT3U(end, >, rs->rs_end);
+ range_tree_add(rt, rs->rs_end, end -
+ rs->rs_end);
+ } else {
+ ASSERT3U(rs->rs_start, >, start);
+ range_tree_add(rt, start, rs->rs_start - start);
+ }
+ return;
+ }
+ } else {
/* Make sure we don't overlap with either of our neighbors */
VERIFY(rs == NULL);
+ }
rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
- merge_before = (rs_before != NULL && rs_before->rs_end == start);
- merge_after = (rs_after != NULL && rs_after->rs_start == end);
+ merge_before = (rs_before != NULL &&
+ start - rs_before->rs_end <= rt->rt_gap);
+ merge_after = (rs_after != NULL &&
+ rs_after->rs_start - end <= rt->rt_gap);
+ if (rt->rt_gap != 0) {
+ if (merge_before)
+ gap_extra += start - rs_before->rs_end;
+ if (merge_after)
+ gap_extra += rs_after->rs_start - end;
+ }
if (merge_before && merge_after) {
avl_remove(&rt->rt_root, rs_before);
- if (rt->rt_ops != NULL) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
}
range_tree_stat_decr(rt, rs_before);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = rs_before->rs_start;
+ rs_after->rs_fill += rs_before->rs_fill + fill;
kmem_cache_free(range_seg_cache, rs_before);
rs = rs_after;
} else if (merge_before) {
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
range_tree_stat_decr(rt, rs_before);
rs_before->rs_end = end;
+ rs_before->rs_fill += fill;
rs = rs_before;
} else if (merge_after) {
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
range_tree_stat_decr(rt, rs_after);
rs_after->rs_start = start;
+ rs_after->rs_fill += fill;
rs = rs_after;
} else {
rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
rs->rs_start = start;
rs->rs_end = end;
+ rs->rs_fill = fill;
avl_insert(&rt->rt_root, rs, where);
}
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
range_tree_stat_incr(rt, rs);
- rt->rt_space += size;
+ rt->rt_space += size + gap_extra;
}
void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
{
+ range_tree_add_fill(arg, start, size, 0);
+}
+
+static void
+range_tree_remove_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill,
+ uint64_t fill_left, boolean_t partial_overlap)
+{
range_tree_t *rt = arg;
avl_index_t where;
range_seg_t rsearch, *rs, *newseg;
uint64_t end = start + size;
boolean_t left_over, right_over;
+ ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY3U(size, !=, 0);
+ if (!partial_overlap) {
VERIFY3U(size, <=, rt->rt_space);
+ } else {
+ VERIFY0(fill);
+ VERIFY0(fill_left);
+ }
rsearch.rs_start = start;
rsearch.rs_end = end;
- rs = avl_find(&rt->rt_root, &rsearch, &where);
- /* Make sure we completely overlap with someone */
+ while ((rs = avl_find(&rt->rt_root, &rsearch, &where)) != NULL ||
+ !partial_overlap) {
+ uint64_t overlap_sz;
+
+ if (partial_overlap) {
+ if (rs->rs_start <= start && rs->rs_end >= end)
+ overlap_sz = size;
+ else if (rs->rs_start > start && rs->rs_end < end)
+ overlap_sz = rs->rs_end - rs->rs_start;
+ else if (rs->rs_end < end)
+ overlap_sz = rs->rs_end - start;
+ else /* rs->rs_start > start */
+ overlap_sz = end - rs->rs_start;
+ } else {
+ /* Make sure we completely overlapped with someone */
if (rs == NULL) {
zfs_panic_recover("zfs: freeing free segment "
"(offset=%llu size=%llu)",
(longlong_t)start, (longlong_t)size);
return;
}
VERIFY3U(rs->rs_start, <=, start);
VERIFY3U(rs->rs_end, >=, end);
+ overlap_sz = size;
+ }
- left_over = (rs->rs_start != start);
- right_over = (rs->rs_end != end);
+ left_over = (rs->rs_start < start);
+ right_over = (rs->rs_end > end);
range_tree_stat_decr(rt, rs);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
if (left_over && right_over) {
newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
newseg->rs_start = end;
newseg->rs_end = rs->rs_end;
+ ASSERT3U(rs->rs_fill, >=, (fill + fill_left));
+ newseg->rs_fill = rs->rs_fill - (fill + fill_left);
range_tree_stat_incr(rt, newseg);
rs->rs_end = start;
+ rs->rs_fill = fill_left;
avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
} else if (left_over) {
rs->rs_end = start;
+ ASSERT3U(rs->rs_fill, >=, fill);
+ rs->rs_fill -= fill;
} else if (right_over) {
rs->rs_start = end;
+ ASSERT3U(rs->rs_fill, >=, fill);
+ rs->rs_fill -= fill;
} else {
+ ASSERT3U(rs->rs_fill, ==, fill);
+ ASSERT(fill == 0 || !partial_overlap);
avl_remove(&rt->rt_root, rs);
kmem_cache_free(range_seg_cache, rs);
rs = NULL;
}
if (rs != NULL) {
range_tree_stat_incr(rt, rs);
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
}
- rt->rt_space -= size;
+ rt->rt_space -= overlap_sz;
+ if (!partial_overlap) {
+ /*
+ * There can't be any more segments overlapping with
+ * us, so no sense in performing an extra search.
+ */
+ break;
+ }
+ }
}
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, 0, 0, B_FALSE);
+}
+
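+/*
+ * Like range_tree_remove, but tolerates partial overlaps: every portion
+ * of [start, start + size) that is actually present in the tree is
+ * removed, and portions that are absent are ignored.
+ */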
+void
+range_tree_remove_overlap(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, 0, 0, B_TRUE);
+}
+
+void
+range_tree_remove_fill(void *arg, uint64_t start, uint64_t size,
+ uint64_t fill, uint64_t remain_left)
+{
+ range_tree_remove_impl(arg, start, size, fill, remain_left, B_FALSE);
+}
+
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
avl_index_t where;
range_seg_t rsearch;
uint64_t end = start + size;
+ ASSERT(MUTEX_HELD(rt->rt_lock));
VERIFY(size != 0);
rsearch.rs_start = start;
rsearch.rs_end = end;
return (avl_find(&rt->rt_root, &rsearch, &where));
}
-static range_seg_t *
+void *
range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs = range_tree_find_impl(rt, start, size);
if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
return (rs);
return (NULL);
}
+/*
+ * Given an extent start offset and size, searches the provided range
+ * tree for the first start offset (at or after `start') at which the
+ * requested extent _doesn't_ overlap any range segment in the tree.
+ */
+uint64_t
+range_tree_find_gap(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs;
+ while ((rs = range_tree_find_impl(rt, start, size)) != NULL)
+ start = rs->rs_end;
+ return (start);
+}
+
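An illustrative use (tree contents hypothetical): if the tree holds segments [0, 4096) and [4096, 16384), asking for an 8192-byte extent starting at offset 0 skips past both and returns 16384. The lock passed in must be the tree's own lock, since range_tree_find_impl asserts it is held:

static uint64_t
first_fit(range_tree_t *rt, kmutex_t *lock, uint64_t start, uint64_t size)
{
	uint64_t off;

	mutex_enter(lock);
	off = range_tree_find_gap(rt, start, size);
	mutex_exit(lock);
	return (off);
}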
void
range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
{
range_seg_t *rs;
+ mutex_enter(rt->rt_lock);
rs = range_tree_find(rt, off, size);
if (rs != NULL)
panic("freeing free block; rs=%p", (void *)rs);
+ mutex_exit(rt->rt_lock);
}
boolean_t
range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
{
@@ -341,13 +534,10 @@
void
range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
{
range_seg_t *rs;
- if (size == 0)
- return;
-
while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
uint64_t free_start = MAX(rs->rs_start, start);
uint64_t free_end = MIN(rs->rs_end, start + size);
range_tree_remove(rt, free_start, free_end - free_start);
}
@@ -356,10 +546,11 @@
void
range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
{
range_tree_t *rt;
+ ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
ASSERT0(range_tree_space(*rtdst));
ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
rt = *rtsrc;
*rtsrc = *rtdst;
@@ -370,12 +561,13 @@
range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
void *cookie = NULL;
+ ASSERT(MUTEX_HELD(rt->rt_lock));
- if (rt->rt_ops != NULL)
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
if (func != NULL)
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
@@ -389,15 +581,23 @@
void
range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
{
range_seg_t *rs;
+ ASSERT(MUTEX_HELD(rt->rt_lock));
for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
}
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+ ASSERT(MUTEX_HELD(rt->rt_lock));
+ return (avl_first(&rt->rt_root));
+}
+
uint64_t
range_tree_space(range_tree_t *rt)
{
return (rt->rt_space);
}