Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
@@ -18,11 +18,11 @@
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
@@ -33,13 +33,12 @@
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
-static int free_range_compar(const void *node1, const void *node2);
-
static kmem_cache_t *dnode_cache;
/*
* Define DNODE_STATS to turn on statistic gathering. By default, it is only
* turned on when DEBUG is also defined.
*/
@@ -88,13 +87,11 @@
bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
list_link_init(&dn->dn_dirty_link[i]);
- avl_create(&dn->dn_ranges[i], free_range_compar,
- sizeof (free_range_t),
- offsetof(struct free_range, fr_node));
+ dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
@@ -138,11 +135,11 @@
refcount_destroy(&dn->dn_tx_holds);
ASSERT(!list_link_active(&dn->dn_link));
for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- avl_destroy(&dn->dn_ranges[i]);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
ASSERT0(dn->dn_next_indblkshift[i]);
ASSERT0(dn->dn_next_bonustype[i]);
@@ -311,23 +308,10 @@
dnode_byteswap(buf);
buf++;
}
}
-static int
-free_range_compar(const void *node1, const void *node2)
-{
- const free_range_t *rp1 = node1;
- const free_range_t *rp2 = node2;
-
- if (rp1->fr_blkid < rp2->fr_blkid)
- return (-1);
- else if (rp1->fr_blkid > rp2->fr_blkid)
- return (1);
- else return (0);
-}
-
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
@@ -372,11 +356,11 @@
ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
dn->dn_datablksz = size;
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
@@ -528,11 +512,11 @@
ASSERT0(dn->dn_next_bonustype[i]);
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_blksz[i]);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}
dn->dn_type = ot;
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
@@ -692,11 +676,12 @@
sizeof (odn->dn_next_blksz));
for (i = 0; i < TXG_SIZE; i++) {
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
- bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
ndn->dn_dirtyctx = odn->dn_dirtyctx;
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
@@ -755,12 +740,11 @@
*/
for (i = 0; i < TXG_SIZE; i++) {
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
- odn->dn_ranges[i].avl_root = NULL;
- odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_free_ranges[i] = NULL;
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
odn->dn_rm_spillblk[i] = 0;
odn->dn_next_bonuslen[i] = 0;
@@ -1460,63 +1444,10 @@
if (have_read)
rw_downgrade(&dn->dn_struct_rwlock);
}
void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
- avl_index_t where;
- free_range_t *rp;
- free_range_t rp_tofind;
- uint64_t endblk = blkid + nblks;
-
- ASSERT(MUTEX_HELD(&dn->dn_mtx));
- ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- rp_tofind.fr_blkid = blkid;
- rp = avl_find(tree, &rp_tofind, &where);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_BEFORE);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_AFTER);
-
- while (rp && (rp->fr_blkid <= blkid + nblks)) {
- uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
- free_range_t *nrp = AVL_NEXT(tree, rp);
-
- if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
- /* clear this entire range */
- avl_remove(tree, rp);
- kmem_free(rp, sizeof (free_range_t));
- } else if (blkid <= rp->fr_blkid &&
- endblk > rp->fr_blkid && endblk < fr_endblk) {
- /* clear the beginning of this range */
- rp->fr_blkid = endblk;
- rp->fr_nblks = fr_endblk - endblk;
- } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
- endblk >= fr_endblk) {
- /* clear the end of this range */
- rp->fr_nblks = blkid - rp->fr_blkid;
- } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
- /* clear a chunk out of this range */
- free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
- new_rp->fr_blkid = endblk;
- new_rp->fr_nblks = fr_endblk - endblk;
- avl_insert_here(tree, new_rp, rp, AVL_AFTER);
- rp->fr_nblks = blkid - rp->fr_blkid;
- }
- /* there may be no overlap */
- rp = nrp;
- }
-}
-
-void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
uint64_t blkoff, blkid, nblks;
int blksz, blkshift, head, tail;
@@ -1662,26 +1593,19 @@
/*
* Add this range to the dnode range list.
* We will finish up this free operation in the syncing phase.
*/
mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, blkid, nblks, tx);
- {
- free_range_t *rp, *found;
- avl_index_t where;
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
- /* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
- rp->fr_blkid = blkid;
- rp->fr_nblks = nblks;
- found = avl_find(tree, rp, &where);
- ASSERT(found == NULL);
- avl_insert(tree, rp, where);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] =
+ range_tree_create(NULL, NULL, &dn->dn_mtx);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
blkid, nblks, tx->tx_txg);
- }
mutex_exit(&dn->dn_mtx);
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
@@ -1705,11 +1629,10 @@
/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
- free_range_t range_tofind;
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
if (blkid == DMU_BONUS_BLKID)
return (FALSE);
@@ -1725,26 +1648,16 @@
return (TRUE);
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));
- range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
- free_range_t *range_found;
- avl_index_t idx;
-
- range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
- if (range_found) {
- ASSERT(range_found->fr_nblks > 0);
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
break;
}
- range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
- if (range_found &&
- range_found->fr_blkid + range_found->fr_nblks > blkid)
- break;
- }
mutex_exit(&dn->dn_mtx);
return (i < TXG_SIZE);
}
/* call from syncing context when we actually write/free space for this dnode */