1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dmu_traverse.h>
  29 #include <sys/dsl_dataset.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/dsl_pool.h>
  32 #include <sys/dnode.h>
  33 #include <sys/spa.h>
  34 #include <sys/zio.h>
  35 #include <sys/dmu_impl.h>
  36 #include <sys/sa.h>
  37 #include <sys/sa_impl.h>
  38 #include <sys/callb.h>
  39 #include <sys/zfeature.h>
  40 
  41 int zfs_pd_blks_max = 100;
  42 
  43 typedef struct prefetch_data {
  44         kmutex_t pd_mtx;
  45         kcondvar_t pd_cv;
  46         int pd_blks_max;
  47         int pd_blks_fetched;
  48         int pd_flags;
  49         boolean_t pd_cancel;
  50         boolean_t pd_exited;
  51 } prefetch_data_t;
  52 
  53 typedef struct traverse_data {
  54         spa_t *td_spa;
  55         uint64_t td_objset;
  56         blkptr_t *td_rootbp;
  57         uint64_t td_min_txg;
  58         zbookmark_phys_t *td_resume;
  59         int td_flags;
  60         prefetch_data_t *td_pfd;
  61         boolean_t td_paused;
  62         uint64_t td_hole_birth_enabled_txg;
  63         blkptr_cb_t *td_func;
  64         void *td_arg;
  65 } traverse_data_t;
  66 
  67 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
  68     uint64_t objset, uint64_t object);
  69 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
  70     uint64_t objset, uint64_t object);
  71 
  72 static int
  73 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
  74 {
  75         traverse_data_t *td = arg;
  76         zbookmark_phys_t zb;
  77 
  78         if (BP_IS_HOLE(bp))
  79                 return (0);
  80 
  81         if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
  82                 return (0);
  83 
  84         SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
  85             bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
  86 
  87         (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
  88 
  89         return (0);
  90 }
  91 
  92 static int
  93 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
  94 {
  95         traverse_data_t *td = arg;
  96 
  97         if (lrc->lrc_txtype == TX_WRITE) {
  98                 lr_write_t *lr = (lr_write_t *)lrc;
  99                 blkptr_t *bp = &lr->lr_blkptr;
 100                 zbookmark_phys_t zb;
 101 
 102                 if (BP_IS_HOLE(bp))
 103                         return (0);
 104 
 105                 if (claim_txg == 0 || bp->blk_birth < claim_txg)
 106                         return (0);
 107 
 108                 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
 109                     ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 110 
 111                 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
 112                     td->td_arg);
 113         }
 114         return (0);
 115 }
 116 
 117 static void
 118 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 119 {
 120         uint64_t claim_txg = zh->zh_claim_txg;
 121         zilog_t *zilog;
 122 
 123         /*
 124          * We only want to visit blocks that have been claimed but not yet
 125          * replayed; plus, in read-only mode, blocks that are already stable.
 126          */
 127         if (claim_txg == 0 && spa_writeable(td->td_spa))
 128                 return;
 129 
 130         zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 131 
 132         (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 133             claim_txg);
 134 
 135         zil_free(zilog);
 136 }
 137 
 138 typedef enum resume_skip {
 139         RESUME_SKIP_ALL,
 140         RESUME_SKIP_NONE,
 141         RESUME_SKIP_CHILDREN
 142 } resume_skip_t;
 143 
 144 /*
 145  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 146  * the block indicated by zb does not need to be visited at all. Returns
 147  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 148  * resume point. This indicates that this block should be visited but not its
 149  * children (since they must have been visited in a previous traversal).
 150  * Otherwise returns RESUME_SKIP_NONE.
 151  */
 152 static resume_skip_t
 153 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
 154     const zbookmark_phys_t *zb)
 155 {
 156         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
 157                 /*
 158                  * If we already visited this bp & everything below,
 159                  * don't bother doing it again.
 160                  */
 161                 if (zbookmark_is_before(dnp, zb, td->td_resume))
 162                         return (RESUME_SKIP_ALL);
 163 
 164                 /*
 165                  * If we found the block we're trying to resume from, zero
 166                  * the bookmark out to indicate that we have resumed.
 167                  */
 168                 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 169                         bzero(td->td_resume, sizeof (*zb));
 170                         if (td->td_flags & TRAVERSE_POST)
 171                                 return (RESUME_SKIP_CHILDREN);
 172                 }
 173         }
 174         return (RESUME_SKIP_NONE);
 175 }
 176 
 177 static void
 178 traverse_prefetch_metadata(traverse_data_t *td,
 179     const blkptr_t *bp, const zbookmark_phys_t *zb)
 180 {
 181         arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 182 
 183         if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 184                 return;
 185         /*
 186          * If we are in the process of resuming, don't prefetch, because
 187          * some children will not be needed (and in fact may have already
 188          * been freed).
 189          */
 190         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
 191                 return;
 192         if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
 193                 return;
 194         if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 195                 return;
 196 
 197         (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 198             ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 199 }
 200 
 201 static boolean_t
 202 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 203 {
 204         ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 205         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
 206             BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 207                 return (B_FALSE);
 208         return (B_TRUE);
 209 }
 210 
 211 static int
 212 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 213     const blkptr_t *bp, const zbookmark_phys_t *zb)
 214 {
 215         zbookmark_phys_t czb;
 216         int err = 0;
 217         arc_buf_t *buf = NULL;
 218         prefetch_data_t *pd = td->td_pfd;
 219         boolean_t hard = td->td_flags & TRAVERSE_HARD;
 220 
 221         switch (resume_skip_check(td, dnp, zb)) {
 222         case RESUME_SKIP_ALL:
 223                 return (0);
 224         case RESUME_SKIP_CHILDREN:
 225                 goto post;
 226         case RESUME_SKIP_NONE:
 227                 break;
 228         default:
 229                 ASSERT(0);
 230         }
 231 
 232         if (bp->blk_birth == 0) {
 233                 /*
 234                  * Since this block has a birth time of 0 it must be a
 235                  * hole created before the SPA_FEATURE_HOLE_BIRTH
 236                  * feature was enabled.  If SPA_FEATURE_HOLE_BIRTH
 237                  * was enabled before the min_txg for this traveral we
 238                  * know the hole must have been created before the
 239                  * min_txg for this traveral, so we can skip it. If
 240                  * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg
 241                  * for this traveral we cannot tell if the hole was
 242                  * created before or after the min_txg for this
 243                  * traversal, so we cannot skip it.
 244                  */
 245                 if (td->td_hole_birth_enabled_txg < td->td_min_txg)
 246                         return (0);
 247         } else if (bp->blk_birth <= td->td_min_txg) {
 248                 return (0);
 249         }
 250 
 251         if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
 252                 mutex_enter(&pd->pd_mtx);
 253                 ASSERT(pd->pd_blks_fetched >= 0);
 254                 while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
 255                         cv_wait(&pd->pd_cv, &pd->pd_mtx);
 256                 pd->pd_blks_fetched--;
 257                 cv_broadcast(&pd->pd_cv);
 258                 mutex_exit(&pd->pd_mtx);
 259         }
 260 
 261         if (BP_IS_HOLE(bp)) {
 262                 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 263                 if (err != 0)
 264                         goto post;
 265                 return (0);
 266         }
 267 
 268         if (td->td_flags & TRAVERSE_PRE) {
 269                 err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 270                     td->td_arg);
 271                 if (err == TRAVERSE_VISIT_NO_CHILDREN)
 272                         return (0);
 273                 if (err != 0)
 274                         goto post;
 275         }
 276 
 277         if (BP_GET_LEVEL(bp) > 0) {
 278                 arc_flags_t flags = ARC_FLAG_WAIT;
 279                 int i;
 280                 blkptr_t *cbp;
 281                 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 282 
 283                 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 284                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 285                 if (err != 0)
 286                         goto post;
 287                 cbp = buf->b_data;
 288 
 289                 for (i = 0; i < epb; i++) {
 290                         SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 291                             zb->zb_level - 1,
 292                             zb->zb_blkid * epb + i);
 293                         traverse_prefetch_metadata(td, &cbp[i], &czb);
 294                 }
 295 
 296                 /* recursively visitbp() blocks below this */
 297                 for (i = 0; i < epb; i++) {
 298                         SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 299                             zb->zb_level - 1,
 300                             zb->zb_blkid * epb + i);
 301                         err = traverse_visitbp(td, dnp, &cbp[i], &czb);
 302                         if (err != 0)
 303                                 break;
 304                 }
 305         } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 306                 arc_flags_t flags = ARC_FLAG_WAIT;
 307                 int i;
 308                 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 309 
 310                 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 311                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 312                 if (err != 0)
 313                         goto post;
 314                 dnp = buf->b_data;
 315 
 316                 for (i = 0; i < epb; i++) {
 317                         prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
 318                             zb->zb_blkid * epb + i);
 319                 }
 320 
 321                 /* recursively visitbp() blocks below this */
 322                 for (i = 0; i < epb; i++) {
 323                         err = traverse_dnode(td, &dnp[i], zb->zb_objset,
 324                             zb->zb_blkid * epb + i);
 325                         if (err != 0)
 326                                 break;
 327                 }
 328         } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 329                 arc_flags_t flags = ARC_FLAG_WAIT;
 330                 objset_phys_t *osp;
 331                 dnode_phys_t *dnp;
 332 
 333                 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 334                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 335                 if (err != 0)
 336                         goto post;
 337 
 338                 osp = buf->b_data;
 339                 dnp = &osp->os_meta_dnode;
 340                 prefetch_dnode_metadata(td, dnp, zb->zb_objset,
 341                     DMU_META_DNODE_OBJECT);
 342                 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 343                         prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
 344                             zb->zb_objset, DMU_GROUPUSED_OBJECT);
 345                         prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 346                             zb->zb_objset, DMU_USERUSED_OBJECT);
 347                 }
 348 
 349                 err = traverse_dnode(td, dnp, zb->zb_objset,
 350                     DMU_META_DNODE_OBJECT);
 351                 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 352                         dnp = &osp->os_groupused_dnode;
 353                         err = traverse_dnode(td, dnp, zb->zb_objset,
 354                             DMU_GROUPUSED_OBJECT);
 355                 }
 356                 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 357                         dnp = &osp->os_userused_dnode;
 358                         err = traverse_dnode(td, dnp, zb->zb_objset,
 359                             DMU_USERUSED_OBJECT);
 360                 }
 361         }
 362 
 363         if (buf)
 364                 (void) arc_buf_remove_ref(buf, &buf);
 365 
 366 post:
 367         if (err == 0 && (td->td_flags & TRAVERSE_POST))
 368                 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 369 
 370         if (hard && (err == EIO || err == ECKSUM)) {
 371                 /*
 372                  * Ignore this disk error as requested by the HARD flag,
 373                  * and continue traversal.
 374                  */
 375                 err = 0;
 376         }
 377 
 378         /*
 379          * If we are stopping here, set td_resume.
 380          */
 381         if (td->td_resume != NULL && err != 0 && !td->td_paused) {
 382                 td->td_resume->zb_objset = zb->zb_objset;
 383                 td->td_resume->zb_object = zb->zb_object;
 384                 td->td_resume->zb_level = 0;
 385                 /*
 386                  * If we have stopped on an indirect block (e.g. due to
 387                  * i/o error), we have not visited anything below it.
 388                  * Set the bookmark to the first level-0 block that we need
 389                  * to visit.  This way, the resuming code does not need to
 390                  * deal with resuming from indirect blocks.
 391                  */
 392                 td->td_resume->zb_blkid = zb->zb_blkid <<
 393                     (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
 394                 td->td_paused = B_TRUE;
 395         }
 396 
 397         return (err);
 398 }
 399 
 400 static void
 401 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 402     uint64_t objset, uint64_t object)
 403 {
 404         int j;
 405         zbookmark_phys_t czb;
 406 
 407         for (j = 0; j < dnp->dn_nblkptr; j++) {
 408                 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 409                 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
 410         }
 411 
 412         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 413                 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 414                 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
 415         }
 416 }
 417 
 418 static int
 419 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 420     uint64_t objset, uint64_t object)
 421 {
 422         int j, err = 0;
 423         zbookmark_phys_t czb;
 424 
 425         for (j = 0; j < dnp->dn_nblkptr; j++) {
 426                 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 427                 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
 428                 if (err != 0)
 429                         break;
 430         }
 431 
 432         if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 433                 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 434                 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
 435         }
 436         return (err);
 437 }
 438 
 439 /* ARGSUSED */
 440 static int
 441 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 442     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 443 {
 444         prefetch_data_t *pfd = arg;
 445         arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 446 
 447         ASSERT(pfd->pd_blks_fetched >= 0);
 448         if (pfd->pd_cancel)
 449                 return (SET_ERROR(EINTR));
 450 
 451         if (!prefetch_needed(pfd, bp))
 452                 return (0);
 453 
 454         mutex_enter(&pfd->pd_mtx);
 455         while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
 456                 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
 457         pfd->pd_blks_fetched++;
 458         cv_broadcast(&pfd->pd_cv);
 459         mutex_exit(&pfd->pd_mtx);
 460 
 461         (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 462             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 463 
 464         return (0);
 465 }
 466 
 467 static void
 468 traverse_prefetch_thread(void *arg)
 469 {
 470         traverse_data_t *td_main = arg;
 471         traverse_data_t td = *td_main;
 472         zbookmark_phys_t czb;
 473 
 474         td.td_func = traverse_prefetcher;
 475         td.td_arg = td_main->td_pfd;
 476         td.td_pfd = NULL;
 477 
 478         SET_BOOKMARK(&czb, td.td_objset,
 479             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 480         (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 481 
 482         mutex_enter(&td_main->td_pfd->pd_mtx);
 483         td_main->td_pfd->pd_exited = B_TRUE;
 484         cv_broadcast(&td_main->td_pfd->pd_cv);
 485         mutex_exit(&td_main->td_pfd->pd_mtx);
 486 }
 487 
 488 /*
 489  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 490  * in syncing context).
 491  */
 492 static int
 493 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 494     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
 495     blkptr_cb_t func, void *arg)
 496 {
 497         traverse_data_t td;
 498         prefetch_data_t pd = { 0 };
 499         zbookmark_phys_t czb;
 500         int err;
 501 
 502         ASSERT(ds == NULL || objset == ds->ds_object);
 503         ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 504 
 505         /*
 506          * The data prefetching mechanism (the prefetch thread) is incompatible
 507          * with resuming from a bookmark.
 508          */
 509         ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
 510 
 511         td.td_spa = spa;
 512         td.td_objset = objset;
 513         td.td_rootbp = rootbp;
 514         td.td_min_txg = txg_start;
 515         td.td_resume = resume;
 516         td.td_func = func;
 517         td.td_arg = arg;
 518         td.td_pfd = &pd;
 519         td.td_flags = flags;
 520         td.td_paused = B_FALSE;
 521 
 522         if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 523                 VERIFY(spa_feature_enabled_txg(spa,
 524                     SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
 525         } else {
 526                 td.td_hole_birth_enabled_txg = 0;
 527         }
 528 
 529         pd.pd_blks_max = zfs_pd_blks_max;
 530         pd.pd_flags = flags;
 531         mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 532         cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 533 
 534         /* See comment on ZIL traversal in dsl_scan_visitds. */
 535         if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
 536                 arc_flags_t flags = ARC_FLAG_WAIT;
 537                 objset_phys_t *osp;
 538                 arc_buf_t *buf;
 539 
 540                 err = arc_read(NULL, td.td_spa, rootbp,
 541                     arc_getbuf_func, &buf,
 542                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
 543                 if (err != 0)
 544                         return (err);
 545 
 546                 osp = buf->b_data;
 547                 traverse_zil(&td, &osp->os_zil_header);
 548                 (void) arc_buf_remove_ref(buf, &buf);
 549         }
 550 
 551         if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 552             0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
 553             &td, TQ_NOQUEUE))
 554                 pd.pd_exited = B_TRUE;
 555 
 556         SET_BOOKMARK(&czb, td.td_objset,
 557             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 558         err = traverse_visitbp(&td, NULL, rootbp, &czb);
 559 
 560         mutex_enter(&pd.pd_mtx);
 561         pd.pd_cancel = B_TRUE;
 562         cv_broadcast(&pd.pd_cv);
 563         while (!pd.pd_exited)
 564                 cv_wait(&pd.pd_cv, &pd.pd_mtx);
 565         mutex_exit(&pd.pd_mtx);
 566 
 567         mutex_destroy(&pd.pd_mtx);
 568         cv_destroy(&pd.pd_cv);
 569 
 570         return (err);
 571 }
 572 
 573 /*
 574  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 575  * in syncing context).
 576  */
 577 int
 578 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
 579     blkptr_cb_t func, void *arg)
 580 {
 581         return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
 582             &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
 583 }
 584 
 585 int
 586 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
 587     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
 588     blkptr_cb_t func, void *arg)
 589 {
 590         return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
 591             blkptr, txg_start, resume, flags, func, arg));
 592 }
 593 
 594 /*
 595  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 596  */
 597 int
 598 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
 599     blkptr_cb_t func, void *arg)
 600 {
 601         int err;
 602         uint64_t obj;
 603         dsl_pool_t *dp = spa_get_dsl(spa);
 604         objset_t *mos = dp->dp_meta_objset;
 605         boolean_t hard = (flags & TRAVERSE_HARD);
 606 
 607         /* visit the MOS */
 608         err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 609             txg_start, NULL, flags, func, arg);
 610         if (err != 0)
 611                 return (err);
 612 
 613         /* visit each dataset */
 614         for (obj = 1; err == 0;
 615             err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
 616                 dmu_object_info_t doi;
 617 
 618                 err = dmu_object_info(mos, obj, &doi);
 619                 if (err != 0) {
 620                         if (hard)
 621                                 continue;
 622                         break;
 623                 }
 624 
 625                 if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 626                         dsl_dataset_t *ds;
 627                         uint64_t txg = txg_start;
 628 
 629                         dsl_pool_config_enter(dp, FTAG);
 630                         err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 631                         dsl_pool_config_exit(dp, FTAG);
 632                         if (err != 0) {
 633                                 if (hard)
 634                                         continue;
 635                                 break;
 636                         }
 637                         if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
 638                                 txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 639                         err = traverse_dataset(ds, txg, flags, func, arg);
 640                         dsl_dataset_rele(ds, FTAG);
 641                         if (err != 0)
 642                                 break;
 643                 }
 644         }
 645         if (err == ESRCH)
 646                 err = 0;
 647         return (err);
 648 }