1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/zfs_context.h>
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dmu_traverse.h>
  29 #include <sys/dsl_dataset.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/dsl_pool.h>
  32 #include <sys/dnode.h>
  33 #include <sys/spa.h>
  34 #include <sys/zio.h>
  35 #include <sys/dmu_impl.h>
  36 #include <sys/sa.h>
  37 #include <sys/sa_impl.h>
  38 #include <sys/callb.h>
  39 #include <sys/zfeature.h>
  40 
/*
 * Upper bound on the number of bytes the data prefetcher may have
 * outstanding: traverse_prefetcher() blocks while pd_bytes_fetched is at or
 * above this value.
 */
int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;    /* 50MB */

/*
 * When B_TRUE, traverse_visitbp() never applies its "skip holes with a birth
 * time of 0" shortcut, so every such hole is visited.
 */
boolean_t send_holes_without_birth_time = B_TRUE;
  43 
/*
 * State shared between a traversal and its data-prefetch thread.
 */
typedef struct prefetch_data {
        kmutex_t pd_mtx;                /* held while updating the fields below */
        kcondvar_t pd_cv;               /* broadcast when those fields change */
        int32_t pd_bytes_fetched;       /* bytes prefetched, not yet consumed */
        int pd_flags;                   /* copy of the traversal's TRAVERSE_* flags */
        boolean_t pd_cancel;            /* set by traverse_impl() to stop prefetching */
        boolean_t pd_exited;            /* prefetch thread finished or never started */
        zbookmark_phys_t pd_resume;     /* prefetcher's own copy of the resume point */
} prefetch_data_t;
  53 
/*
 * Per-traversal state handed down through the recursive visit functions.
 */
typedef struct traverse_data {
        spa_t *td_spa;                  /* pool being traversed */
        uint64_t td_objset;             /* objset id used when building bookmarks */
        blkptr_t *td_rootbp;            /* block pointer the traversal starts from */
        uint64_t td_min_txg;            /* only blocks born after this txg are visited */
        zbookmark_phys_t *td_resume;    /* resume point (in/out); may be NULL */
        int td_flags;                   /* TRAVERSE_* flags */
        prefetch_data_t *td_pfd;        /* prefetch state; NULL in the prefetcher's copy */
        boolean_t td_paused;            /* td_resume has already been recorded */
        /* txg at which SPA_FEATURE_HOLE_BIRTH was enabled, else UINT64_MAX */
        uint64_t td_hole_birth_enabled_txg;
        blkptr_cb_t *td_func;           /* callback invoked for each visited block */
        void *td_arg;                   /* opaque argument passed to td_func */
        boolean_t td_realloc_possible;  /* object numbers may have been reused */
} traverse_data_t;
  68 
/*
 * Forward declarations: traverse_visitbp() calls both of these before their
 * definitions, and traverse_dnode() in turn calls traverse_visitbp().
 */
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);
  73 
  74 static int
  75 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
  76 {
  77         traverse_data_t *td = arg;
  78         zbookmark_phys_t zb;
  79 
  80         if (BP_IS_HOLE(bp))
  81                 return (0);
  82 
  83         if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
  84                 return (0);
  85 
  86         SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
  87             bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
  88 
  89         (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
  90 
  91         return (0);
  92 }
  93 
  94 static int
  95 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
  96 {
  97         traverse_data_t *td = arg;
  98 
  99         if (lrc->lrc_txtype == TX_WRITE) {
 100                 lr_write_t *lr = (lr_write_t *)lrc;
 101                 blkptr_t *bp = &lr->lr_blkptr;
 102                 zbookmark_phys_t zb;
 103 
 104                 if (BP_IS_HOLE(bp))
 105                         return (0);
 106 
 107                 if (claim_txg == 0 || bp->blk_birth < claim_txg)
 108                         return (0);
 109 
 110                 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
 111                     ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 112 
 113                 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
 114                     td->td_arg);
 115         }
 116         return (0);
 117 }
 118 
 119 static void
 120 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 121 {
 122         uint64_t claim_txg = zh->zh_claim_txg;
 123         zilog_t *zilog;
 124 
 125         /*
 126          * We only want to visit blocks that have been claimed but not yet
 127          * replayed; plus, in read-only mode, blocks that are already stable.
 128          */
 129         if (claim_txg == 0 && spa_writeable(td->td_spa))
 130                 return;
 131 
 132         zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 133 
 134         (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 135             claim_txg);
 136 
 137         zil_free(zilog);
 138 }
 139 
/*
 * Result of resume_skip_check(): how much of a block's subtree still has to
 * be visited when a traversal is being resumed.
 */
typedef enum resume_skip {
        RESUME_SKIP_ALL,        /* skip the block and everything below it */
        RESUME_SKIP_NONE,       /* visit the block and its children normally */
        RESUME_SKIP_CHILDREN    /* visit the block itself but not its children */
} resume_skip_t;
 145 
 146 /*
 147  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 148  * the block indicated by zb does not need to be visited at all. Returns
 149  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 150  * resume point. This indicates that this block should be visited but not its
 151  * children (since they must have been visited in a previous traversal).
 152  * Otherwise returns RESUME_SKIP_NONE.
 153  */
 154 static resume_skip_t
 155 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
 156     const zbookmark_phys_t *zb)
 157 {
 158         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
 159                 /*
 160                  * If we already visited this bp & everything below,
 161                  * don't bother doing it again.
 162                  */
 163                 if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
 164                         return (RESUME_SKIP_ALL);
 165 
 166                 /*
 167                  * If we found the block we're trying to resume from, zero
 168                  * the bookmark out to indicate that we have resumed.
 169                  */
 170                 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
 171                         bzero(td->td_resume, sizeof (*zb));
 172                         if (td->td_flags & TRAVERSE_POST)
 173                                 return (RESUME_SKIP_CHILDREN);
 174                 }
 175         }
 176         return (RESUME_SKIP_NONE);
 177 }
 178 
 179 static void
 180 traverse_prefetch_metadata(traverse_data_t *td,
 181     const blkptr_t *bp, const zbookmark_phys_t *zb)
 182 {
 183         arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 184 
 185         if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 186                 return;
 187         /*
 188          * If we are in the process of resuming, don't prefetch, because
 189          * some children will not be needed (and in fact may have already
 190          * been freed).
 191          */
 192         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
 193                 return;
 194         if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
 195                 return;
 196         if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 197                 return;
 198 
 199         (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 200             ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 201 }
 202 
 203 static boolean_t
 204 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 205 {
 206         ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 207         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
 208             BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 209                 return (B_FALSE);
 210         return (B_TRUE);
 211 }
 212 
/*
 * Visit the block pointer bp, identified by bookmark zb, and everything
 * reachable below it.
 *
 *   td   traversal state
 *   dnp  dnode that bp belongs to; NULL when visiting the root bp
 *   bp   block pointer to visit
 *   zb   bookmark describing bp's position
 *
 * The callback td_func is invoked before the children when TRAVERSE_PRE is
 * set, after them when TRAVERSE_POST is set, and unconditionally for holes.
 * Indirect blocks, dnode blocks and objset blocks are read and their
 * contents visited recursively.
 *
 * Returns 0 on success or when the block is skipped, otherwise the error
 * from the callback, from arc_read() or from a child visit.  With
 * TRAVERSE_HARD, EIO and ECKSUM are converted to 0.  When a non-zero error
 * is returned and td_resume is set, the position to resume from is stored
 * in *td_resume (once per traversal, guarded by td_paused).
 */
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
        zbookmark_phys_t czb;
        int err = 0;
        arc_buf_t *buf = NULL;
        prefetch_data_t *pd = td->td_pfd;
        boolean_t hard = td->td_flags & TRAVERSE_HARD;

        /* Honor an in-progress resume before looking at the block itself. */
        switch (resume_skip_check(td, dnp, zb)) {
        case RESUME_SKIP_ALL:
                return (0);
        case RESUME_SKIP_CHILDREN:
                goto post;
        case RESUME_SKIP_NONE:
                break;
        default:
                ASSERT(0);
        }

        if (bp->blk_birth == 0) {
                /*
                 * Since this block has a birth time of 0 it must be one of
                 * two things: a hole created before the
                 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
                 * which has always been a hole in an object.
                 *
                 * If a file is written sparsely, then the unwritten parts of
                 * the file were "always holes" -- that is, they have been
                 * holes since this object was allocated.  However, we (and
                 * our callers) can not necessarily tell when an object was
                 * allocated.  Therefore, if it's possible that this object
                 * was freed and then its object number reused, we need to
                 * visit all the holes with birth==0.
                 *
                 * If it isn't possible that the object number was reused,
                 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
                 * all the blocks we will visit as part of this traversal,
                 * then this hole must have always existed, so we can skip
                 * it.  We visit blocks born after (exclusive) td_min_txg.
                 *
                 * Note that the meta-dnode cannot be reallocated.
                 */
                if (!send_holes_without_birth_time &&
                    (!td->td_realloc_possible ||
                    zb->zb_object == DMU_META_DNODE_OBJECT) &&
                    td->td_hole_birth_enabled_txg <= td->td_min_txg)
                        return (0);
        } else if (bp->blk_birth <= td->td_min_txg) {
                /* Born at or before the starting txg: out of scope. */
                return (0);
        }

        /*
         * Throttle against the prefetch thread: wait until it has issued at
         * least this block's logical size (or has exited), then deduct that
         * size from its outstanding count and wake it up.
         */
        if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
                uint64_t size = BP_GET_LSIZE(bp);
                mutex_enter(&pd->pd_mtx);
                ASSERT(pd->pd_bytes_fetched >= 0);
                while (pd->pd_bytes_fetched < size && !pd->pd_exited)
                        cv_wait(&pd->pd_cv, &pd->pd_mtx);
                pd->pd_bytes_fetched -= size;
                cv_broadcast(&pd->pd_cv);
                mutex_exit(&pd->pd_mtx);
        }

        /*
         * Holes have no children; report them once, regardless of the
         * PRE/POST flags.  On error fall into the common error handling
         * (the POST callback there is skipped because err != 0).
         */
        if (BP_IS_HOLE(bp)) {
                err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
                if (err != 0)
                        goto post;
                return (0);
        }

        if (td->td_flags & TRAVERSE_PRE) {
                err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
                    td->td_arg);
                /* The callback asked us not to descend; that is not an error. */
                if (err == TRAVERSE_VISIT_NO_CHILDREN)
                        return (0);
                if (err != 0)
                        goto post;
        }

        if (BP_GET_LEVEL(bp) > 0) {
                /* Indirect block: an array of epb child block pointers. */
                arc_flags_t flags = ARC_FLAG_WAIT;
                int i;
                blkptr_t *cbp;
                int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
                        goto post;
                cbp = buf->b_data;

                /* Start metadata prefetch for all children before recursing. */
                for (i = 0; i < epb; i++) {
                        SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
                            zb->zb_level - 1,
                            zb->zb_blkid * epb + i);
                        traverse_prefetch_metadata(td, &cbp[i], &czb);
                }

                /* recursively visitbp() blocks below this */
                for (i = 0; i < epb; i++) {
                        SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
                            zb->zb_level - 1,
                            zb->zb_blkid * epb + i);
                        err = traverse_visitbp(td, dnp, &cbp[i], &czb);
                        if (err != 0)
                                break;
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
                /* Level-0 block of dnodes: epb dnodes, one object each. */
                arc_flags_t flags = ARC_FLAG_WAIT;
                int i;
                int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
                        goto post;
                dnode_phys_t *child_dnp = buf->b_data;

                /* The object number is derived from the block id and slot. */
                for (i = 0; i < epb; i++) {
                        prefetch_dnode_metadata(td, &child_dnp[i],
                            zb->zb_objset, zb->zb_blkid * epb + i);
                }

                /* recursively visitbp() blocks below this */
                for (i = 0; i < epb; i++) {
                        err = traverse_dnode(td, &child_dnp[i],
                            zb->zb_objset, zb->zb_blkid * epb + i);
                        if (err != 0)
                                break;
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                /* Objset block: visit the meta-dnode and accounting dnodes. */
                arc_flags_t flags = ARC_FLAG_WAIT;

                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
                        goto post;

                objset_phys_t *osp = buf->b_data;
                prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
                /*
                 * See the block comment above for the goal of this variable.
                 * If the maxblkid of the meta-dnode is 0, then we know that
                 * we've never had more than DNODES_PER_BLOCK objects in the
                 * dataset, which means we can't have reused any object ids.
                 */
                if (osp->os_meta_dnode.dn_maxblkid == 0)
                        td->td_realloc_possible = B_FALSE;

                /*
                 * The group/user accounting dnodes are only touched when the
                 * buffer is large enough to hold a full objset_phys_t.
                 */
                if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
                            zb->zb_objset, DMU_GROUPUSED_OBJECT);
                        prefetch_dnode_metadata(td, &osp->os_userused_dnode,
                            zb->zb_objset, DMU_USERUSED_OBJECT);
                }

                err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        err = traverse_dnode(td, &osp->os_groupused_dnode,
                            zb->zb_objset, DMU_GROUPUSED_OBJECT);
                }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        err = traverse_dnode(td, &osp->os_userused_dnode,
                            zb->zb_objset, DMU_USERUSED_OBJECT);
                }
        }

        /* Release the buffer read above, if any, now that we are done. */
        if (buf)
                arc_buf_destroy(buf, &buf);

post:
        /* POST callback runs only if everything so far succeeded. */
        if (err == 0 && (td->td_flags & TRAVERSE_POST))
                err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

        if (hard && (err == EIO || err == ECKSUM)) {
                /*
                 * Ignore this disk error as requested by the HARD flag,
                 * and continue traversal.
                 */
                err = 0;
        }

        /*
         * If we are stopping here, set td_resume.
         */
        if (td->td_resume != NULL && err != 0 && !td->td_paused) {
                td->td_resume->zb_objset = zb->zb_objset;
                td->td_resume->zb_object = zb->zb_object;
                td->td_resume->zb_level = 0;
                /*
                 * If we have stopped on an indirect block (e.g. due to
                 * i/o error), we have not visited anything below it.
                 * Set the bookmark to the first level-0 block that we need
                 * to visit.  This way, the resuming code does not need to
                 * deal with resuming from indirect blocks.
                 *
                 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
                 * to dereference it.
                 */
                td->td_resume->zb_blkid = zb->zb_blkid;
                if (zb->zb_level > 0) {
                        td->td_resume->zb_blkid <<= zb->zb_level *
                            (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
                }
                /* Callers further up the recursion must not overwrite this. */
                td->td_paused = B_TRUE;
        }

        return (err);
}
 425 
 426 static void
 427 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 428     uint64_t objset, uint64_t object)
 429 {
 430         int j;
 431         zbookmark_phys_t czb;
 432 
 433         for (j = 0; j < dnp->dn_nblkptr; j++) {
 434                 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 435                 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
 436         }
 437 
 438         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 439                 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 440                 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
 441         }
 442 }
 443 
 444 static int
 445 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
 446     uint64_t objset, uint64_t object)
 447 {
 448         int j, err = 0;
 449         zbookmark_phys_t czb;
 450 
 451         if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
 452             object < td->td_resume->zb_object)
 453                 return (0);
 454 
 455         if (td->td_flags & TRAVERSE_PRE) {
 456                 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
 457                     ZB_DNODE_BLKID);
 458                 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
 459                     td->td_arg);
 460                 if (err == TRAVERSE_VISIT_NO_CHILDREN)
 461                         return (0);
 462                 if (err != 0)
 463                         return (err);
 464         }
 465 
 466         for (j = 0; j < dnp->dn_nblkptr; j++) {
 467                 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 468                 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
 469                 if (err != 0)
 470                         break;
 471         }
 472 
 473         if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 474                 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 475                 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
 476         }
 477 
 478         if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
 479                 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
 480                     ZB_DNODE_BLKID);
 481                 err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
 482                     td->td_arg);
 483                 if (err == TRAVERSE_VISIT_NO_CHILDREN)
 484                         return (0);
 485                 if (err != 0)
 486                         return (err);
 487         }
 488         return (err);
 489 }
 490 
 491 /* ARGSUSED */
 492 static int
 493 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 494     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 495 {
 496         prefetch_data_t *pfd = arg;
 497         arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 498 
 499         ASSERT(pfd->pd_bytes_fetched >= 0);
 500         if (bp == NULL)
 501                 return (0);
 502         if (pfd->pd_cancel)
 503                 return (SET_ERROR(EINTR));
 504 
 505         if (!prefetch_needed(pfd, bp))
 506                 return (0);
 507 
 508         mutex_enter(&pfd->pd_mtx);
 509         while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 510                 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
 511         pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 512         cv_broadcast(&pfd->pd_cv);
 513         mutex_exit(&pfd->pd_mtx);
 514 
 515         (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 516             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 517 
 518         return (0);
 519 }
 520 
 521 static void
 522 traverse_prefetch_thread(void *arg)
 523 {
 524         traverse_data_t *td_main = arg;
 525         traverse_data_t td = *td_main;
 526         zbookmark_phys_t czb;
 527 
 528         td.td_func = traverse_prefetcher;
 529         td.td_arg = td_main->td_pfd;
 530         td.td_pfd = NULL;
 531         td.td_resume = &td_main->td_pfd->pd_resume;
 532 
 533         SET_BOOKMARK(&czb, td.td_objset,
 534             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 535         (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 536 
 537         mutex_enter(&td_main->td_pfd->pd_mtx);
 538         td_main->td_pfd->pd_exited = B_TRUE;
 539         cv_broadcast(&td_main->td_pfd->pd_cv);
 540         mutex_exit(&td_main->td_pfd->pd_mtx);
 541 }
 542 
 543 /*
 544  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 545  * in syncing context).
 546  */
 547 static int
 548 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 549     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
 550     blkptr_cb_t func, void *arg)
 551 {
 552         traverse_data_t td;
 553         prefetch_data_t pd = { 0 };
 554         zbookmark_phys_t czb;
 555         int err;
 556 
 557         ASSERT(ds == NULL || objset == ds->ds_object);
 558         ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 559 
 560         td.td_spa = spa;
 561         td.td_objset = objset;
 562         td.td_rootbp = rootbp;
 563         td.td_min_txg = txg_start;
 564         td.td_resume = resume;
 565         td.td_func = func;
 566         td.td_arg = arg;
 567         td.td_pfd = &pd;
 568         td.td_flags = flags;
 569         td.td_paused = B_FALSE;
 570         td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 571 
 572         if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 573                 VERIFY(spa_feature_enabled_txg(spa,
 574                     SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
 575         } else {
 576                 td.td_hole_birth_enabled_txg = UINT64_MAX;
 577         }
 578 
 579         pd.pd_flags = flags;
 580         if (resume != NULL)
 581                 pd.pd_resume = *resume;
 582         mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 583         cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 584 
 585         /* See comment on ZIL traversal in dsl_scan_visitds. */
 586         if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 587                 arc_flags_t flags = ARC_FLAG_WAIT;
 588                 objset_phys_t *osp;
 589                 arc_buf_t *buf;
 590 
 591                 err = arc_read(NULL, td.td_spa, rootbp,
 592                     arc_getbuf_func, &buf,
 593                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
 594                 if (err != 0)
 595                         return (err);
 596 
 597                 osp = buf->b_data;
 598                 traverse_zil(&td, &osp->os_zil_header);
 599                 arc_buf_destroy(buf, &buf);
 600         }
 601 
 602         if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 603             0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
 604             &td, TQ_NOQUEUE))
 605                 pd.pd_exited = B_TRUE;
 606 
 607         SET_BOOKMARK(&czb, td.td_objset,
 608             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 609         err = traverse_visitbp(&td, NULL, rootbp, &czb);
 610 
 611         mutex_enter(&pd.pd_mtx);
 612         pd.pd_cancel = B_TRUE;
 613         cv_broadcast(&pd.pd_cv);
 614         while (!pd.pd_exited)
 615                 cv_wait(&pd.pd_cv, &pd.pd_mtx);
 616         mutex_exit(&pd.pd_mtx);
 617 
 618         mutex_destroy(&pd.pd_mtx);
 619         cv_destroy(&pd.pd_cv);
 620 
 621         return (err);
 622 }
 623 
 624 /*
 625  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 626  * in syncing context).
 627  */
 628 int
 629 traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
 630     zbookmark_phys_t *resume,
 631     int flags, blkptr_cb_t func, void *arg)
 632 {
 633         return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
 634             &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
 635 }
 636 
 637 int
 638 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
 639     int flags, blkptr_cb_t func, void *arg)
 640 {
 641         return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
 642 }
 643 
 644 int
 645 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
 646     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
 647     blkptr_cb_t func, void *arg)
 648 {
 649         return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
 650             blkptr, txg_start, resume, flags, func, arg));
 651 }
 652 
 653 /*
 654  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 655  */
 656 int
 657 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
 658     blkptr_cb_t func, void *arg)
 659 {
 660         int err;
 661         dsl_pool_t *dp = spa_get_dsl(spa);
 662         objset_t *mos = dp->dp_meta_objset;
 663         boolean_t hard = (flags & TRAVERSE_HARD);
 664 
 665         /* visit the MOS */
 666         err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 667             txg_start, NULL, flags, func, arg);
 668         if (err != 0)
 669                 return (err);
 670 
 671         /* visit each dataset */
 672         for (uint64_t obj = 1; err == 0;
 673             err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
 674                 dmu_object_info_t doi;
 675 
 676                 err = dmu_object_info(mos, obj, &doi);
 677                 if (err != 0) {
 678                         if (hard)
 679                                 continue;
 680                         break;
 681                 }
 682 
 683                 if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 684                         dsl_dataset_t *ds;
 685                         uint64_t txg = txg_start;
 686 
 687                         dsl_pool_config_enter(dp, FTAG);
 688                         err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 689                         dsl_pool_config_exit(dp, FTAG);
 690                         if (err != 0) {
 691                                 if (hard)
 692                                         continue;
 693                                 break;
 694                         }
 695                         if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
 696                                 txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 697                         err = traverse_dataset(ds, txg, flags, func, arg);
 698                         dsl_dataset_rele(ds, FTAG);
 699                         if (err != 0)
 700                                 break;
 701                 }
 702         }
 703         if (err == ESRCH)
 704                 err = 0;
 705         return (err);
 706 }