Print this page
4374 dn_free_ranges should use range_tree_t
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com
Reviewed by: Garrett D'Amore <garrett@damore.org>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dnode.c
          +++ new/usr/src/uts/common/fs/zfs/dnode.c
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright (c) 2013 by Delphix. All rights reserved.
       23 + * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   */
  25   25  
  26   26  #include <sys/zfs_context.h>
  27   27  #include <sys/dbuf.h>
  28   28  #include <sys/dnode.h>
  29   29  #include <sys/dmu.h>
  30   30  #include <sys/dmu_impl.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/dmu_objset.h>
  33   33  #include <sys/dsl_dir.h>
  34   34  #include <sys/dsl_dataset.h>
  35   35  #include <sys/spa.h>
  36   36  #include <sys/zio.h>
  37   37  #include <sys/dmu_zfetch.h>
       38 +#include <sys/range_tree.h>
  38   39  
  39      -static int free_range_compar(const void *node1, const void *node2);
  40      -
  41   40  static kmem_cache_t *dnode_cache;
  42   41  /*
  43   42   * Define DNODE_STATS to turn on statistic gathering. By default, it is only
  44   43   * turned on when DEBUG is also defined.
  45   44   */
  46   45  #ifdef  DEBUG
  47   46  #define DNODE_STATS
  48   47  #endif  /* DEBUG */
  49   48  
  50   49  #ifdef  DNODE_STATS
↓ open down ↓ 32 lines elided ↑ open up ↑
  83   82          bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
  84   83          bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
  85   84          bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
  86   85          bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
  87   86          bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
  88   87          bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
  89   88          bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
  90   89  
  91   90          for (i = 0; i < TXG_SIZE; i++) {
  92   91                  list_link_init(&dn->dn_dirty_link[i]);
  93      -                avl_create(&dn->dn_ranges[i], free_range_compar,
  94      -                    sizeof (free_range_t),
  95      -                    offsetof(struct free_range, fr_node));
       92 +                dn->dn_free_ranges[i] = NULL;
  96   93                  list_create(&dn->dn_dirty_records[i],
  97   94                      sizeof (dbuf_dirty_record_t),
  98   95                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
  99   96          }
 100   97  
 101   98          dn->dn_allocated_txg = 0;
 102   99          dn->dn_free_txg = 0;
 103  100          dn->dn_assigned_txg = 0;
 104  101          dn->dn_dirtyctx = 0;
 105  102          dn->dn_dirtyctx_firstset = NULL;
↓ open down ↓ 27 lines elided ↑ open up ↑
 133  130          rw_destroy(&dn->dn_struct_rwlock);
 134  131          mutex_destroy(&dn->dn_mtx);
 135  132          mutex_destroy(&dn->dn_dbufs_mtx);
 136  133          cv_destroy(&dn->dn_notxholds);
 137  134          refcount_destroy(&dn->dn_holds);
 138  135          refcount_destroy(&dn->dn_tx_holds);
 139  136          ASSERT(!list_link_active(&dn->dn_link));
 140  137  
 141  138          for (i = 0; i < TXG_SIZE; i++) {
 142  139                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 143      -                avl_destroy(&dn->dn_ranges[i]);
      140 +                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 144  141                  list_destroy(&dn->dn_dirty_records[i]);
 145  142                  ASSERT0(dn->dn_next_nblkptr[i]);
 146  143                  ASSERT0(dn->dn_next_nlevels[i]);
 147  144                  ASSERT0(dn->dn_next_indblkshift[i]);
 148  145                  ASSERT0(dn->dn_next_bonustype[i]);
 149  146                  ASSERT0(dn->dn_rm_spillblk[i]);
 150  147                  ASSERT0(dn->dn_next_bonuslen[i]);
 151  148                  ASSERT0(dn->dn_next_blksz[i]);
 152  149          }
 153  150  
↓ open down ↓ 152 lines elided ↑ open up ↑
 306  303          ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
 307  304          ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 308  305  
 309  306          size >>= DNODE_SHIFT;
 310  307          for (i = 0; i < size; i++) {
 311  308                  dnode_byteswap(buf);
 312  309                  buf++;
 313  310          }
 314  311  }
 315  312  
 316      -static int
 317      -free_range_compar(const void *node1, const void *node2)
 318      -{
 319      -        const free_range_t *rp1 = node1;
 320      -        const free_range_t *rp2 = node2;
 321      -
 322      -        if (rp1->fr_blkid < rp2->fr_blkid)
 323      -                return (-1);
 324      -        else if (rp1->fr_blkid > rp2->fr_blkid)
 325      -                return (1);
 326      -        else return (0);
 327      -}
 328      -
 329  313  void
 330  314  dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 331  315  {
 332  316          ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
 333  317  
 334  318          dnode_setdirty(dn, tx);
 335  319          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 336  320          ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
 337  321              (dn->dn_nblkptr-1) * sizeof (blkptr_t));
 338  322          dn->dn_bonuslen = newsize;
↓ open down ↓ 28 lines elided ↑ open up ↑
 367  351  static void
 368  352  dnode_setdblksz(dnode_t *dn, int size)
 369  353  {
 370  354          ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 371  355          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 372  356          ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 373  357          ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 374  358              1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 375  359          dn->dn_datablksz = size;
 376  360          dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 377      -        dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
      361 +        dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 378  362  }
 379  363  
 380  364  static dnode_t *
 381  365  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 382  366      uint64_t object, dnode_handle_t *dnh)
 383  367  {
 384  368          dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 385  369  
 386  370          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 387  371          dn->dn_moved = 0;
↓ open down ↓ 135 lines elided ↑ open up ↑
 523  507          for (i = 0; i < TXG_SIZE; i++) {
 524  508                  ASSERT0(dn->dn_next_nblkptr[i]);
 525  509                  ASSERT0(dn->dn_next_nlevels[i]);
 526  510                  ASSERT0(dn->dn_next_indblkshift[i]);
 527  511                  ASSERT0(dn->dn_next_bonuslen[i]);
 528  512                  ASSERT0(dn->dn_next_bonustype[i]);
 529  513                  ASSERT0(dn->dn_rm_spillblk[i]);
 530  514                  ASSERT0(dn->dn_next_blksz[i]);
 531  515                  ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 532  516                  ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 533      -                ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
      517 +                ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 534  518          }
 535  519  
 536  520          dn->dn_type = ot;
 537  521          dnode_setdblksz(dn, blocksize);
 538  522          dn->dn_indblkshift = ibs;
 539  523          dn->dn_nlevels = 1;
 540  524          if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
 541  525                  dn->dn_nblkptr = 1;
 542  526          else
 543  527                  dn->dn_nblkptr = 1 +
↓ open down ↓ 143 lines elided ↑ open up ↑
 687  671          bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
 688  672              sizeof (odn->dn_rm_spillblk));
 689  673          bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
 690  674              sizeof (odn->dn_next_bonuslen));
 691  675          bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
 692  676              sizeof (odn->dn_next_blksz));
 693  677          for (i = 0; i < TXG_SIZE; i++) {
 694  678                  list_move_tail(&ndn->dn_dirty_records[i],
 695  679                      &odn->dn_dirty_records[i]);
 696  680          }
 697      -        bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
      681 +        bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
      682 +            sizeof (odn->dn_free_ranges));
 698  683          ndn->dn_allocated_txg = odn->dn_allocated_txg;
 699  684          ndn->dn_free_txg = odn->dn_free_txg;
 700  685          ndn->dn_assigned_txg = odn->dn_assigned_txg;
 701  686          ndn->dn_dirtyctx = odn->dn_dirtyctx;
 702  687          ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
 703  688          ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
 704  689          refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
 705  690          ASSERT(list_is_empty(&ndn->dn_dbufs));
 706  691          list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
 707  692          ndn->dn_dbufs_count = odn->dn_dbufs_count;
↓ open down ↓ 42 lines elided ↑ open up ↑
 750  735           */
 751  736          POINTER_INVALIDATE(&odn->dn_objset);
 752  737  
 753  738          /*
 754  739           * Satisfy the destructor.
 755  740           */
 756  741          for (i = 0; i < TXG_SIZE; i++) {
 757  742                  list_create(&odn->dn_dirty_records[i],
 758  743                      sizeof (dbuf_dirty_record_t),
 759  744                      offsetof(dbuf_dirty_record_t, dr_dirty_node));
 760      -                odn->dn_ranges[i].avl_root = NULL;
 761      -                odn->dn_ranges[i].avl_numnodes = 0;
      745 +                odn->dn_free_ranges[i] = NULL;
 762  746                  odn->dn_next_nlevels[i] = 0;
 763  747                  odn->dn_next_indblkshift[i] = 0;
 764  748                  odn->dn_next_bonustype[i] = 0;
 765  749                  odn->dn_rm_spillblk[i] = 0;
 766  750                  odn->dn_next_bonuslen[i] = 0;
 767  751                  odn->dn_next_blksz[i] = 0;
 768  752          }
 769  753          odn->dn_allocated_txg = 0;
 770  754          odn->dn_free_txg = 0;
 771  755          odn->dn_assigned_txg = 0;
↓ open down ↓ 683 lines elided ↑ open up ↑
1455 1439                  mutex_exit(&new->dt.di.dr_mtx);
1456 1440                  mutex_exit(&dn->dn_mtx);
1457 1441          }
1458 1442  
1459 1443  out:
1460 1444          if (have_read)
1461 1445                  rw_downgrade(&dn->dn_struct_rwlock);
1462 1446  }
1463 1447  
1464 1448  void
1465      -dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
1466      -{
1467      -        avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1468      -        avl_index_t where;
1469      -        free_range_t *rp;
1470      -        free_range_t rp_tofind;
1471      -        uint64_t endblk = blkid + nblks;
1472      -
1473      -        ASSERT(MUTEX_HELD(&dn->dn_mtx));
1474      -        ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
1475      -
1476      -        dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1477      -            blkid, nblks, tx->tx_txg);
1478      -        rp_tofind.fr_blkid = blkid;
1479      -        rp = avl_find(tree, &rp_tofind, &where);
1480      -        if (rp == NULL)
1481      -                rp = avl_nearest(tree, where, AVL_BEFORE);
1482      -        if (rp == NULL)
1483      -                rp = avl_nearest(tree, where, AVL_AFTER);
1484      -
1485      -        while (rp && (rp->fr_blkid <= blkid + nblks)) {
1486      -                uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
1487      -                free_range_t *nrp = AVL_NEXT(tree, rp);
1488      -
1489      -                if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
1490      -                        /* clear this entire range */
1491      -                        avl_remove(tree, rp);
1492      -                        kmem_free(rp, sizeof (free_range_t));
1493      -                } else if (blkid <= rp->fr_blkid &&
1494      -                    endblk > rp->fr_blkid && endblk < fr_endblk) {
1495      -                        /* clear the beginning of this range */
1496      -                        rp->fr_blkid = endblk;
1497      -                        rp->fr_nblks = fr_endblk - endblk;
1498      -                } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
1499      -                    endblk >= fr_endblk) {
1500      -                        /* clear the end of this range */
1501      -                        rp->fr_nblks = blkid - rp->fr_blkid;
1502      -                } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
1503      -                        /* clear a chunk out of this range */
1504      -                        free_range_t *new_rp =
1505      -                            kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1506      -
1507      -                        new_rp->fr_blkid = endblk;
1508      -                        new_rp->fr_nblks = fr_endblk - endblk;
1509      -                        avl_insert_here(tree, new_rp, rp, AVL_AFTER);
1510      -                        rp->fr_nblks = blkid - rp->fr_blkid;
1511      -                }
1512      -                /* there may be no overlap */
1513      -                rp = nrp;
1514      -        }
1515      -}
1516      -
1517      -void
1518 1449  dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
1519 1450  {
1520 1451          dmu_buf_impl_t *db;
1521 1452          uint64_t blkoff, blkid, nblks;
1522 1453          int blksz, blkshift, head, tail;
1523 1454          int trunc = FALSE;
1524 1455          int epbs;
1525 1456  
1526 1457          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1527 1458          blksz = dn->dn_datablksz;
↓ open down ↓ 129 lines elided ↑ open up ↑
1657 1588                          dbuf_rele(db, FTAG);
1658 1589                  }
1659 1590          }
1660 1591  
1661 1592  done:
1662 1593          /*
1663 1594           * Add this range to the dnode range list.
1664 1595           * We will finish up this free operation in the syncing phase.
1665 1596           */
1666 1597          mutex_enter(&dn->dn_mtx);
1667      -        dnode_clear_range(dn, blkid, nblks, tx);
1668      -        {
1669      -                free_range_t *rp, *found;
1670      -                avl_index_t where;
1671      -                avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
1672      -
1673      -                /* Add new range to dn_ranges */
1674      -                rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
1675      -                rp->fr_blkid = blkid;
1676      -                rp->fr_nblks = nblks;
1677      -                found = avl_find(tree, rp, &where);
1678      -                ASSERT(found == NULL);
1679      -                avl_insert(tree, rp, where);
1680      -                dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
1681      -                    blkid, nblks, tx->tx_txg);
     1598 +        int txgoff = tx->tx_txg & TXG_MASK;
     1599 +        if (dn->dn_free_ranges[txgoff] == NULL) {
     1600 +                dn->dn_free_ranges[txgoff] =
     1601 +                    range_tree_create(NULL, NULL, &dn->dn_mtx);
1682 1602          }
     1603 +        range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
     1604 +        range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
     1605 +        dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
     1606 +            blkid, nblks, tx->tx_txg);
1683 1607          mutex_exit(&dn->dn_mtx);
1684 1608  
1685 1609          dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
1686 1610          dnode_setdirty(dn, tx);
1687 1611  out:
1688 1612  
1689 1613          rw_exit(&dn->dn_struct_rwlock);
1690 1614  }
1691 1615  
1692 1616  static boolean_t
↓ open down ↓ 7 lines elided ↑ open up ↑
1700 1624                          break;
1701 1625          }
1702 1626          mutex_exit(&dn->dn_mtx);
1703 1627          return (i < TXG_SIZE);
1704 1628  }
1705 1629  
1706 1630  /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
1707 1631  uint64_t
1708 1632  dnode_block_freed(dnode_t *dn, uint64_t blkid)
1709 1633  {
1710      -        free_range_t range_tofind;
1711 1634          void *dp = spa_get_dsl(dn->dn_objset->os_spa);
1712 1635          int i;
1713 1636  
1714 1637          if (blkid == DMU_BONUS_BLKID)
1715 1638                  return (FALSE);
1716 1639  
1717 1640          /*
1718 1641           * If we're in the process of opening the pool, dp will not be
1719 1642           * set yet, but there shouldn't be anything dirty.
1720 1643           */
1721 1644          if (dp == NULL)
1722 1645                  return (FALSE);
1723 1646  
1724 1647          if (dn->dn_free_txg)
1725 1648                  return (TRUE);
1726 1649  
1727 1650          if (blkid == DMU_SPILL_BLKID)
1728 1651                  return (dnode_spill_freed(dn));
1729 1652  
1730      -        range_tofind.fr_blkid = blkid;
1731 1653          mutex_enter(&dn->dn_mtx);
1732 1654          for (i = 0; i < TXG_SIZE; i++) {
1733      -                free_range_t *range_found;
1734      -                avl_index_t idx;
1735      -
1736      -                range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
1737      -                if (range_found) {
1738      -                        ASSERT(range_found->fr_nblks > 0);
     1655 +                if (dn->dn_free_ranges[i] != NULL &&
     1656 +                    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
1739 1657                          break;
1740      -                }
1741      -                range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
1742      -                if (range_found &&
1743      -                    range_found->fr_blkid + range_found->fr_nblks > blkid)
1744      -                        break;
1745 1658          }
1746 1659          mutex_exit(&dn->dn_mtx);
1747 1660          return (i < TXG_SIZE);
1748 1661  }
1749 1662  
1750 1663  /* call from syncing context when we actually write/free space for this dnode */
1751 1664  void
1752 1665  dnode_diduse_space(dnode_t *dn, int64_t delta)
1753 1666  {
1754 1667          uint64_t space;
↓ open down ↓ 243 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX