Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
        
*** 18,27 ****
--- 18,28 ----
   *
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright (c) 2012 by Delphix. All rights reserved.
   */
  
  #include <sys/dsl_scan.h>
  #include <sys/dsl_pool.h>
  #include <sys/dsl_dataset.h>
*** 42,51 ****
--- 43,53 ----
  #include <sys/zil_impl.h>
  #include <sys/zio_checksum.h>
  #include <sys/ddt.h>
  #include <sys/sa.h>
  #include <sys/sa_impl.h>
+ #include <sys/zfeature.h>
  #ifdef _KERNEL
  #include <sys/zfs_vfsops.h>
  #endif
  
  typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
*** 380,438 ****
  {
          return (arc_read_nolock(pio, spa, bpp, done, private,
              priority, zio_flags, arc_flags, zb));
  }
  
- static boolean_t
- bookmark_is_zero(const zbookmark_t *zb)
- {       /* B_TRUE only if objset, object, level and blkid are all 0 */
-         return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-             zb->zb_level == 0 && zb->zb_blkid == 0);
- }
- 
- /*
-  * Report whether the block named by zb1, along with everything beneath
-  * it, comes before the level-0 bookmark zb2.  Both bookmarks must be in
-  * the same objset and zb2 must be at level 0 (both asserted).
-  * dnp is the dnode for zb1->zb_object.
-  *
-  * The checks are made in this order:
-  *   1. zb2 is in DMU_DEADLIST_OBJECT: always B_TRUE.
-  *   2. dnp is NULL (the objset_phys_t): always B_FALSE.
-  *   3. Otherwise positions are compared.  zb1nextL0 is (zb1's blkid + 1)
-  *      shifted left by zb1's level times
-  *      (dn_indblkshift - SPA_BLKPTRSHIFT), i.e. the first level-0 blkid
-  *      past the range zb1 covers.  zb2thisobj is zb2's object number or,
-  *      when zb2->zb_object is 0, zb2's blkid shifted left by
-  *      (DNODE_BLOCK_SHIFT - DNODE_SHIFT).  When zb1 is in the meta-dnode,
-  *      zb1nextL0 is first turned into an object number (multiplied by
-  *      dn_datablkszsec << SPA_MINBLOCKSHIFT, apparently the data block
-  *      size in bytes, then shifted right by DNODE_SHIFT) and compared
-  *      against zb2thisobj.  Otherwise object numbers are compared first
-  *      and, within the same object, zb1nextL0 against zb2's blkid.
-  */
- static boolean_t
- bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
-     const zbookmark_t *zb2)
- {
-         uint64_t zb1nextL0, zb2thisobj;
- 
-         ASSERT(zb1->zb_objset == zb2->zb_objset);
-         ASSERT(zb2->zb_level == 0);
- 
-         /*
-          * A bookmark in the deadlist is considered to be after
-          * everything else.
-          */
-         if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-                 return (B_TRUE);
- 
-         /* The objset_phys_t isn't before anything. */
-         if (dnp == NULL)
-                 return (B_FALSE);
- 
-         zb1nextL0 = (zb1->zb_blkid + 1) <<
-             ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
- 
-         zb2thisobj = zb2->zb_object ? zb2->zb_object :
-             zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
- 
-         if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-                 uint64_t nextobj = zb1nextL0 *
-                     (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-                 return (nextobj <= zb2thisobj);
-         }
- 
-         if (zb1->zb_object < zb2thisobj)
-                 return (B_TRUE);
-         if (zb1->zb_object > zb2thisobj)
-                 return (B_FALSE);
-         if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-                 return (B_FALSE);
-         return (zb1nextL0 <= zb2->zb_blkid);
- }
- 
  static uint64_t
  dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
  {
          uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
          if (dsl_dataset_is_snapshot(ds))
--- 382,391 ----
*** 460,470 ****
                  return (B_FALSE);
  
          if (scn->scn_pausing)
                  return (B_TRUE); /* we're already pausing */
  
!         if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
                  return (B_FALSE); /* we're resuming */
  
          /* We only know how to resume from level-0 blocks. */
          if (zb && zb->zb_level != 0)
                  return (B_FALSE);
--- 413,423 ----
                  return (B_FALSE);
  
          if (scn->scn_pausing)
                  return (B_TRUE); /* we're already pausing */
  
!         if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
                  return (B_FALSE); /* we're resuming */
  
          /* We only know how to resume from level-0 blocks. */
          if (zb && zb->zb_level != 0)
                  return (B_FALSE);
*** 615,631 ****
      const zbookmark_t *zb)
  {
          /*
           * We never skip over user/group accounting objects (obj<0)
           */
!         if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
              (int64_t)zb->zb_object >= 0) {
                  /*
                   * If we already visited this bp & everything below (in
                   * a prior txg sync), don't bother doing it again.
                   */
!                 if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
                          return (B_TRUE);
  
                  /*
                   * If we found the block we're trying to resume from, or
                   * we went past it to a different object, zero it out to
--- 568,584 ----
      const zbookmark_t *zb)
  {
          /*
           * We never skip over user/group accounting objects (obj<0)
           */
!         if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
              (int64_t)zb->zb_object >= 0) {
                  /*
                   * If we already visited this bp & everything below (in
                   * a prior txg sync), don't bother doing it again.
                   */
!                 if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
                          return (B_TRUE);
  
                  /*
                   * If we found the block we're trying to resume from, or
                   * we went past it to a different object, zero it out to
*** 814,839 ****
              pbuf, bp);
  
          if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
                  return;
  
-         if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
-                 /*
-                  * For non-user-accounting blocks, we need to read the
-                  * new bp (from a deleted snapshot, found in
-                  * check_existing_xlation).  If we used the old bp,
-                  * pointers inside this block from before we resumed
-                  * would be untranslated.
-                  *
-                  * For user-accounting blocks, we need to read the old
-                  * bp, because we will apply the entire space delta to
-                  * it (original untranslated -> translations from
-                  * deleted snap -> now).
-                  */
-                 bp_toread = *bp;
-         }
- 
          if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
              &buf) != 0)
                  return;
  
          /*
--- 767,776 ----
*** 1394,1416 ****
                          return;
          }
          zap_cursor_fini(&zc);
  }
  
! static int
! dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
  {
-         dsl_scan_t *scn = arg;
          uint64_t elapsed_nanosecs;
  
          elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
! 
!         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
              (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
              txg_sync_waiting(scn->scn_dp)) ||
!             spa_shutting_down(scn->scn_dp->dp_spa))
                  return (ERESTART);
  
          zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
              dmu_tx_get_txg(tx), bp, 0));
          dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
              -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
--- 1331,1362 ----
                          return;
          }
          zap_cursor_fini(&zc);
  }
  
! static boolean_t
! dsl_scan_free_should_pause(dsl_scan_t *scn)
  {
          uint64_t elapsed_nanosecs;
  
          /*
           * Returns B_TRUE when any one of these holds:
           *   - the time since scn_sync_start_time, divided by NANOSEC,
           *     exceeds zfs_txg_timeout;
           *   - that time, divided by MICROSEC, exceeds
           *     zfs_free_min_time_ms and txg_sync_waiting() is true;
           *   - spa_shutting_down() is true for the pool's spa.
           * NOTE(review): assumes NANOSEC and MICROSEC are the usual
           * per-second counts, so the two quotients are seconds and
           * milliseconds respectively -- confirm.
           */
          elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
!         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
              (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
              txg_sync_waiting(scn->scn_dp)) ||
!             spa_shutting_down(scn->scn_dp->dp_spa));
! }
! 
! static int
! dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
! {
!         dsl_scan_t *scn = arg;
! 
!         if (!scn->scn_is_bptree ||
!             (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
!                 if (dsl_scan_free_should_pause(scn))
                          return (ERESTART);
+         }
  
          zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
              dmu_tx_get_txg(tx), bp, 0));
          dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
              -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
*** 1431,1440 ****
--- 1377,1390 ----
                  return (B_FALSE);
  
          if (scn->scn_phys.scn_state == DSS_SCANNING)
                  return (B_TRUE);
  
+         if (spa_feature_is_active(spa,
+             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                 return (B_TRUE);
+         }
          if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                  (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
                      &used, &comp, &uncomp);
          }
          return (used != 0);
*** 1477,1494 ****
           * any scanning.  This ensures that there is no free list when
           * we are scanning, so the scan code doesn't have to worry about
           * traversing it.
           */
          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                  scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                      NULL, ZIO_FLAG_MUSTSUCCEED);
                  err = bpobj_iterate(&dp->dp_free_bpobj,
!                     dsl_scan_free_cb, scn, tx);
                  VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
                  if (scn->scn_visited_this_txg) {
                          zfs_dbgmsg("freed %llu blocks in %llums from "
!                             "free_bpobj txg %llu",
                              (longlong_t)scn->scn_visited_this_txg,
                              (longlong_t)
                              (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
                              (longlong_t)tx->tx_txg);
                          scn->scn_visited_this_txg = 0;
--- 1427,1470 ----
           * any scanning.  This ensures that there is no free list when
           * we are scanning, so the scan code doesn't have to worry about
           * traversing it.
           */
          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+                 scn->scn_is_bptree = B_FALSE;
                  scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                      NULL, ZIO_FLAG_MUSTSUCCEED);
                  err = bpobj_iterate(&dp->dp_free_bpobj,
!                     dsl_scan_free_block_cb, scn, tx);
                  VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+ 
+                 if (err == 0 && spa_feature_is_active(spa,
+                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+                         scn->scn_is_bptree = B_TRUE;
+                         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                             NULL, ZIO_FLAG_MUSTSUCCEED);
+                         err = bptree_iterate(dp->dp_meta_objset,
+                             dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
+                             scn, tx);
+                         VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+                         if (err != 0)
+                                 return;
+ 
+                         /* disable async destroy feature */
+                         spa_feature_decr(spa,
+                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
+                         ASSERT(!spa_feature_is_active(spa,
+                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
+                         VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+                             DMU_POOL_DIRECTORY_OBJECT,
+                             DMU_POOL_BPTREE_OBJ, tx));
+                         VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
+                             dp->dp_bptree_obj, tx));
+                         dp->dp_bptree_obj = 0;
+                 }
                  if (scn->scn_visited_this_txg) {
                          zfs_dbgmsg("freed %llu blocks in %llums from "
!                             "free_bpobj/bptree txg %llu",
                              (longlong_t)scn->scn_visited_this_txg,
                              (longlong_t)
                              (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
                              (longlong_t)tx->tx_txg);
                          scn->scn_visited_this_txg = 0;
*** 1599,1608 ****
--- 1575,1586 ----
                  return;
  
          for (i = 0; i < 4; i++) {
                  int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                  int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+                 if (t & DMU_OT_NEWTYPE)
+                         t = DMU_OT_OTHER;
                  zfs_blkstat_t *zb = &zab->zab_type[l][t];
                  int equal;
  
                  zb->zb_count++;
                  zb->zb_asize += BP_GET_ASIZE(bp);