Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
        
*** 18,27 ****
--- 18,28 ----
   *
   * CDDL HEADER END
   */
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+  * Copyright (c) 2012 by Delphix. All rights reserved.
   */
  
  #include <sys/zfs_context.h>
  #include <sys/dmu_objset.h>
  #include <sys/dmu_traverse.h>
*** 51,60 ****
--- 52,62 ----
  /*
   * Per-traversal state: filled in by traverse_impl() and handed to the
   * visit routines (e.g. traverse_visitbp()).
   */
  typedef struct traverse_data {
          spa_t *td_spa;              /* passed to td_func */
          uint64_t td_objset;         /* objset given to traverse_impl() */
          blkptr_t *td_rootbp;        /* rootbp given to traverse_impl() */
          uint64_t td_min_txg;        /* txg_start from traverse_impl() */
+         zbookmark_t *td_resume;     /* resume bookmark, or NULL */
          int td_flags;               /* TRAVERSE_* flag bits */
          prefetch_data_t *td_pfd;    /* prefetch state */
          blkptr_cb_t *td_func;       /* callback invoked on visited bps */
          void *td_arg;               /* last argument to td_func */
  } traverse_data_t;
*** 126,146 ****
              claim_txg);
  
          zil_free(zilog);
  }
  
  static int
  traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
      arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
  {
          zbookmark_t czb;
          int err = 0, lasterr = 0;
          arc_buf_t *buf = NULL;
          prefetch_data_t *pd = td->td_pfd;
          boolean_t hard = td->td_flags & TRAVERSE_HARD;
  
!         if (bp->blk_birth == 0) {
                  err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
                      td->td_arg);
                  return (err);
          }
  
--- 128,208 ----
              claim_txg);
  
          zil_free(zilog);
  }
  
+ typedef enum resume_skip {      /* verdicts of resume_skip_check() */
+         RESUME_SKIP_ALL,        /* already visited: skip bp and all below */
+         RESUME_SKIP_NONE,       /* visit bp and its children as usual */
+         RESUME_SKIP_CHILDREN    /* visit bp itself but not its children */
+ } resume_skip_t;
+ 
+ /*
+  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+  * the block indicated by zb does not need to be visited at all. Returns
+  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+  * resume point. This indicates that this block should be visited but not its
+  * children (since they must have been visited in a previous traversal).
+  * Otherwise returns RESUME_SKIP_NONE.
+  *
+  * Side effect: when zb matches the resume bookmark exactly, *td->td_resume
+  * is zeroed. With a NULL or zeroed bookmark the outer test below fails, so
+  * every later call returns RESUME_SKIP_NONE and the rest of the traversal
+  * proceeds normally.
+  *
+  * In a traversal without TRAVERSE_POST, reaching the resume point yields
+  * RESUME_SKIP_NONE: the block and its children are visited.
+  */
+ static resume_skip_t
+ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+     const zbookmark_t *zb)
+ {
+         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+                 /*
+                  * If we already visited this bp & everything below,
+                  * don't bother doing it again.
+                  */
+                 if (zbookmark_is_before(dnp, zb, td->td_resume))
+                         return (RESUME_SKIP_ALL);
+ 
+                 /*
+                  * If we found the block we're trying to resume from, zero
+                  * the bookmark out to indicate that we have resumed.
+                  *
+                  * A block that was not skipped above is expected to have an
+                  * object number no greater than the bookmark's; this is
+                  * asserted before the exact comparison.
+                  */
+                 ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
+                 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+                         bzero(td->td_resume, sizeof (*zb));
+                         if (td->td_flags & TRAVERSE_POST)
+                                 return (RESUME_SKIP_CHILDREN);
+                 }
+         }
+         return (RESUME_SKIP_NONE);
+ }
+ 
+ static void
+ traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
+ {
+         ASSERT(td->td_resume != NULL);  /* caller must check this first */
+         ASSERT3U(zb->zb_level, ==, 0);  /* pause point must be level 0 */
+         bcopy(zb, td->td_resume, sizeof (*td->td_resume)); /* resume at zb */
+ }
+ 
  static int
  traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
      arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
  {
          zbookmark_t czb;
          int err = 0, lasterr = 0;
          arc_buf_t *buf = NULL;
          prefetch_data_t *pd = td->td_pfd;
          boolean_t hard = td->td_flags & TRAVERSE_HARD;
+         boolean_t pause = B_FALSE;
  
!         switch (resume_skip_check(td, dnp, zb)) {
!         case RESUME_SKIP_ALL:
!                 return (0);
!         case RESUME_SKIP_CHILDREN:
!                 goto post;
!         case RESUME_SKIP_NONE:
!                 break;
!         default:
!                 ASSERT(0);
!         }
! 
!         if (BP_IS_HOLE(bp)) {
                  err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
                      td->td_arg);
                  return (err);
          }
  
*** 162,173 ****
          if (td->td_flags & TRAVERSE_PRE) {
                  err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
                      td->td_arg);
                  if (err == TRAVERSE_VISIT_NO_CHILDREN)
                          return (0);
!                 if (err)
!                         return (err);
          }
  
          if (BP_GET_LEVEL(bp) > 0) {
                  uint32_t flags = ARC_WAIT;
                  int i;
--- 224,237 ----
          if (td->td_flags & TRAVERSE_PRE) {
                  err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
                      td->td_arg);
                  if (err == TRAVERSE_VISIT_NO_CHILDREN)
                          return (0);
!                 if (err == ERESTART)
!                         pause = B_TRUE; /* handle pausing at a common point */
!                 if (err != 0)
!                         goto post;
          }
  
          if (BP_GET_LEVEL(bp) > 0) {
                  uint32_t flags = ARC_WAIT;
                  int i;
*** 251,265 ****
--- 315,338 ----
          }
  
          if (buf)
                  (void) arc_buf_remove_ref(buf, &buf);
  
+ post:
          if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
                  err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
                      td->td_arg);
+                 if (err == ERESTART)
+                         pause = B_TRUE;
          }
  
+         if (pause && td->td_resume != NULL) {
+                 ASSERT3U(err, ==, ERESTART);
+                 ASSERT(!hard);
+                 traverse_pause(td, zb);
+         }
+ 
          return (err != 0 ? err : lasterr);
  }
  
  static int
  traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
*** 351,372 ****
  /*
   * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
   * in syncing context).
   */
  static int
! traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
!     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
  {
          traverse_data_t td;
          prefetch_data_t pd = { 0 };
          zbookmark_t czb;
          int err;
  
          td.td_spa = spa;
!         td.td_objset = ds ? ds->ds_object : 0;
          td.td_rootbp = rootbp;
          td.td_min_txg = txg_start;
          td.td_func = func;
          td.td_arg = arg;
          td.td_pfd = &pd;
          td.td_flags = flags;
  
--- 424,450 ----
  /*
   * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
   * in syncing context).
   */
  static int
! traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
!     uint64_t txg_start, zbookmark_t *resume, int flags,
!     blkptr_cb_t func, void *arg)
  {
          traverse_data_t td;
          prefetch_data_t pd = { 0 };
          zbookmark_t czb;
          int err;
  
+         ASSERT(ds == NULL || objset == ds->ds_object);
+         ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+ 
          td.td_spa = spa;
!         td.td_objset = objset;
          td.td_rootbp = rootbp;
          td.td_min_txg = txg_start;
+         td.td_resume = resume;
          td.td_func = func;
          td.td_arg = arg;
          td.td_pfd = &pd;
          td.td_flags = flags;
  
*** 414,427 ****
   */
  int
  traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
      blkptr_cb_t func, void *arg)
  {
!         return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
!             &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
  }
  
  /*
   * NB: pool must not be changing on-disk (eg, from zdb or sync context).
   */
  int
  traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
--- 492,514 ----
   */
  int
  traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
      blkptr_cb_t func, void *arg)
  {
          /*
           * The resume bookmark passed down is NULL, so a traversal started
           * here never skips blocks as "already visited" and has nowhere to
           * record a pause position.
           */
!         return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
!             &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
  }
  
+ int
+ traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+     uint64_t txg_start, zbookmark_t *resume, int flags,
+     blkptr_cb_t func, void *arg)    /* resume is forwarded unchanged */
+ {
+         return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, /* no ds */
+             blkptr, txg_start, resume, flags, func, arg));
+ }
+ 
  /*
   * NB: pool must not be changing on-disk (eg, from zdb or sync context).
   */
  int
  traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
*** 432,443 ****
          dsl_pool_t *dp = spa_get_dsl(spa);
          objset_t *mos = dp->dp_meta_objset;
          boolean_t hard = (flags & TRAVERSE_HARD);
  
          /* visit the MOS */
!         err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
!             txg_start, flags, func, arg);
          if (err)
                  return (err);
  
          /* visit each dataset */
          for (obj = 1; err == 0 || (err != ESRCH && hard);
--- 519,530 ----
          dsl_pool_t *dp = spa_get_dsl(spa);
          objset_t *mos = dp->dp_meta_objset;
          boolean_t hard = (flags & TRAVERSE_HARD);
  
          /* visit the MOS */
!         err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
!             txg_start, NULL, flags, func, arg);
          if (err)
                  return (err);
  
          /* visit each dataset */
          for (obj = 1; err == 0 || (err != ESRCH && hard);