Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

@@ -18,11 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>

@@ -33,13 +33,12 @@
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
 
-static int free_range_compar(const void *node1, const void *node2);
-
 static kmem_cache_t *dnode_cache;
 /*
  * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  * turned on when DEBUG is also defined.
  */

@@ -88,13 +87,11 @@
         bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
         bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
         for (i = 0; i < TXG_SIZE; i++) {
                 list_link_init(&dn->dn_dirty_link[i]);
-                avl_create(&dn->dn_ranges[i], free_range_compar,
-                    sizeof (free_range_t),
-                    offsetof(struct free_range, fr_node));
+                dn->dn_free_ranges[i] = NULL;
                 list_create(&dn->dn_dirty_records[i],
                     sizeof (dbuf_dirty_record_t),
                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
         }
 

@@ -138,11 +135,11 @@
         refcount_destroy(&dn->dn_tx_holds);
         ASSERT(!list_link_active(&dn->dn_link));
 
         for (i = 0; i < TXG_SIZE; i++) {
                 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-                avl_destroy(&dn->dn_ranges[i]);
+                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
                 list_destroy(&dn->dn_dirty_records[i]);
                 ASSERT0(dn->dn_next_nblkptr[i]);
                 ASSERT0(dn->dn_next_nlevels[i]);
                 ASSERT0(dn->dn_next_indblkshift[i]);
                 ASSERT0(dn->dn_next_bonustype[i]);

@@ -311,23 +308,10 @@
                 dnode_byteswap(buf);
                 buf++;
         }
 }
 
-static int
-free_range_compar(const void *node1, const void *node2)
-{
-        const free_range_t *rp1 = node1;
-        const free_range_t *rp2 = node2;
-
-        if (rp1->fr_blkid < rp2->fr_blkid)
-                return (-1);
-        else if (rp1->fr_blkid > rp2->fr_blkid)
-                return (1);
-        else return (0);
-}
-
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
         ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 

@@ -372,11 +356,11 @@
         ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
         ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
             1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
         dn->dn_datablksz = size;
         dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
-        dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+        dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)

@@ -528,11 +512,11 @@
                 ASSERT0(dn->dn_next_bonustype[i]);
                 ASSERT0(dn->dn_rm_spillblk[i]);
                 ASSERT0(dn->dn_next_blksz[i]);
                 ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
                 ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
-                ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
+                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
         }
 
         dn->dn_type = ot;
         dnode_setdblksz(dn, blocksize);
         dn->dn_indblkshift = ibs;

@@ -692,11 +676,12 @@
             sizeof (odn->dn_next_blksz));
         for (i = 0; i < TXG_SIZE; i++) {
                 list_move_tail(&ndn->dn_dirty_records[i],
                     &odn->dn_dirty_records[i]);
         }
-        bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+        bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+            sizeof (odn->dn_free_ranges));
         ndn->dn_allocated_txg = odn->dn_allocated_txg;
         ndn->dn_free_txg = odn->dn_free_txg;
         ndn->dn_assigned_txg = odn->dn_assigned_txg;
         ndn->dn_dirtyctx = odn->dn_dirtyctx;
         ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;

@@ -755,12 +740,11 @@
          */
         for (i = 0; i < TXG_SIZE; i++) {
                 list_create(&odn->dn_dirty_records[i],
                     sizeof (dbuf_dirty_record_t),
                     offsetof(dbuf_dirty_record_t, dr_dirty_node));
-                odn->dn_ranges[i].avl_root = NULL;
-                odn->dn_ranges[i].avl_numnodes = 0;
+                odn->dn_free_ranges[i] = NULL;
                 odn->dn_next_nlevels[i] = 0;
                 odn->dn_next_indblkshift[i] = 0;
                 odn->dn_next_bonustype[i] = 0;
                 odn->dn_rm_spillblk[i] = 0;
                 odn->dn_next_bonuslen[i] = 0;

@@ -1460,63 +1444,10 @@
         if (have_read)
                 rw_downgrade(&dn->dn_struct_rwlock);
 }
 
 void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
-{
-        avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-        avl_index_t where;
-        free_range_t *rp;
-        free_range_t rp_tofind;
-        uint64_t endblk = blkid + nblks;
-
-        ASSERT(MUTEX_HELD(&dn->dn_mtx));
-        ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
-        dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-            blkid, nblks, tx->tx_txg);
-        rp_tofind.fr_blkid = blkid;
-        rp = avl_find(tree, &rp_tofind, &where);
-        if (rp == NULL)
-                rp = avl_nearest(tree, where, AVL_BEFORE);
-        if (rp == NULL)
-                rp = avl_nearest(tree, where, AVL_AFTER);
-
-        while (rp && (rp->fr_blkid <= blkid + nblks)) {
-                uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
-                free_range_t *nrp = AVL_NEXT(tree, rp);
-
-                if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
-                        /* clear this entire range */
-                        avl_remove(tree, rp);
-                        kmem_free(rp, sizeof (free_range_t));
-                } else if (blkid <= rp->fr_blkid &&
-                    endblk > rp->fr_blkid && endblk < fr_endblk) {
-                        /* clear the beginning of this range */
-                        rp->fr_blkid = endblk;
-                        rp->fr_nblks = fr_endblk - endblk;
-                } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
-                    endblk >= fr_endblk) {
-                        /* clear the end of this range */
-                        rp->fr_nblks = blkid - rp->fr_blkid;
-                } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
-                        /* clear a chunk out of this range */
-                        free_range_t *new_rp =
-                            kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
-                        new_rp->fr_blkid = endblk;
-                        new_rp->fr_nblks = fr_endblk - endblk;
-                        avl_insert_here(tree, new_rp, rp, AVL_AFTER);
-                        rp->fr_nblks = blkid - rp->fr_blkid;
-                }
-                /* there may be no overlap */
-                rp = nrp;
-        }
-}
-
-void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
         dmu_buf_impl_t *db;
         uint64_t blkoff, blkid, nblks;
         int blksz, blkshift, head, tail;

@@ -1662,26 +1593,19 @@
         /*
          * Add this range to the dnode range list.
          * We will finish up this free operation in the syncing phase.
          */
         mutex_enter(&dn->dn_mtx);
-        dnode_clear_range(dn, blkid, nblks, tx);
-        {
-                free_range_t *rp, *found;
-                avl_index_t where;
-                avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
-                /* Add new range to dn_ranges */
-                rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-                rp->fr_blkid = blkid;
-                rp->fr_nblks = nblks;
-                found = avl_find(tree, rp, &where);
-                ASSERT(found == NULL);
-                avl_insert(tree, rp, where);
+        int txgoff = tx->tx_txg & TXG_MASK;
+        if (dn->dn_free_ranges[txgoff] == NULL) {
+                dn->dn_free_ranges[txgoff] =
+                    range_tree_create(NULL, NULL, &dn->dn_mtx);
+        }
+        range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+        range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
                 dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
                     blkid, nblks, tx->tx_txg);
-        }
         mutex_exit(&dn->dn_mtx);
 
         dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
         dnode_setdirty(dn, tx);
 out:

@@ -1705,11 +1629,10 @@
 
 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
-        free_range_t range_tofind;
         void *dp = spa_get_dsl(dn->dn_objset->os_spa);
         int i;
 
         if (blkid == DMU_BONUS_BLKID)
                 return (FALSE);

@@ -1725,26 +1648,16 @@
                 return (TRUE);
 
         if (blkid == DMU_SPILL_BLKID)
                 return (dnode_spill_freed(dn));
 
-        range_tofind.fr_blkid = blkid;
         mutex_enter(&dn->dn_mtx);
         for (i = 0; i < TXG_SIZE; i++) {
-                free_range_t *range_found;
-                avl_index_t idx;
-
-                range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
-                if (range_found) {
-                        ASSERT(range_found->fr_nblks > 0);
+                if (dn->dn_free_ranges[i] != NULL &&
+                    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
                         break;
                 }
-                range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
-                if (range_found &&
-                    range_found->fr_blkid + range_found->fr_nblks > blkid)
-                        break;
-        }
         mutex_exit(&dn->dn_mtx);
         return (i < TXG_SIZE);
 }
 
 /* call from syncing context when we actually write/free space for this dnode */