1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  */
  26 
  27 #include <sys/zfs_context.h>
  28 #include <sys/dmu_objset.h>
  29 #include <sys/dmu_traverse.h>
  30 #include <sys/dsl_dataset.h>
  31 #include <sys/dsl_dir.h>
  32 #include <sys/dsl_pool.h>
  33 #include <sys/dnode.h>
  34 #include <sys/spa.h>
  35 #include <sys/zio.h>
  36 #include <sys/dmu_impl.h>
  37 #include <sys/sa.h>
  38 #include <sys/sa_impl.h>
  39 #include <sys/callb.h>
  40 #include <sys/zfeature.h>
  41 
/*
 * Tunable: maximum number of bytes of prefetched-but-not-yet-consumed data
 * the prefetch thread may have outstanding (see traverse_prefetcher()).
 */
int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
/*
 * Tunable: when set, holes with birth time 0 are always visited; see the
 * block comment in traverse_visitbp() for the object-number-reuse issue
 * this works around.
 */
boolean_t send_holes_without_birth_time = B_TRUE;
  44 
/*
 * State shared between the main traversal thread and the prefetch thread.
 */
typedef struct prefetch_data {
	kmutex_t pd_mtx;		/* protects the fields below */
	kcondvar_t pd_cv;		/* signaled when fields below change */
	int32_t pd_bytes_fetched;	/* bytes prefetched, not yet consumed */
	int pd_flags;			/* copy of the TRAVERSE_* flags */
	boolean_t pd_cancel;		/* main thread asks prefetcher to stop */
	boolean_t pd_exited;		/* prefetch thread has terminated */
	zbookmark_phys_t pd_resume;	/* resume point for the prefetcher */
} prefetch_data_t;
  54 
/*
 * Per-traversal state; the prefetch thread operates on a private copy.
 */
typedef struct traverse_data {
	spa_t *td_spa;			/* pool being traversed */
	uint64_t td_objset;		/* objset number used in bookmarks */
	blkptr_t *td_rootbp;		/* root bp of this traversal */
	uint64_t td_min_txg;		/* visit blocks born after this txg */
	uint64_t td_max_txg;		/* visit blocks born before this txg */
	zbookmark_phys_t *td_resume;	/* resume bookmark, or NULL */
	int td_flags;			/* TRAVERSE_* flags */
	prefetch_data_t *td_pfd;	/* prefetch state (NULL in prefetcher) */
	boolean_t td_paused;		/* stopped; td_resume records where */
	uint64_t td_hole_birth_enabled_txg; /* txg hole_birth became active */
	blkptr_cb_t *td_func;		/* callback invoked for each bp */
	void *td_arg;			/* opaque argument for td_func */
	boolean_t td_realloc_possible;	/* object numbers may have been reused */
} traverse_data_t;
  70 
/* Forward declarations: both walk one dnode's block pointers (and spill). */
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);
  75 
  76 static int
  77 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
  78 {
  79         traverse_data_t *td = arg;
  80         zbookmark_phys_t zb;
  81 
  82         if (BP_IS_HOLE(bp))
  83                 return (0);
  84 
  85         if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
  86                 return (0);
  87 
  88         SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
  89             bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
  90 
  91         (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
  92 
  93         return (0);
  94 }
  95 
  96 static int
  97 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
  98 {
  99         traverse_data_t *td = arg;
 100 
 101         if (lrc->lrc_txtype == TX_WRITE) {
 102                 lr_write_t *lr = (lr_write_t *)lrc;
 103                 blkptr_t *bp = &lr->lr_blkptr;
 104                 zbookmark_phys_t zb;
 105 
 106                 if (BP_IS_HOLE(bp))
 107                         return (0);
 108 
 109                 if (claim_txg == 0 || bp->blk_birth < claim_txg)
 110                         return (0);
 111 
 112                 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
 113                     ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 114 
 115                 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
 116                     td->td_arg);
 117         }
 118         return (0);
 119 }
 120 
 121 static void
 122 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 123 {
 124         uint64_t claim_txg = zh->zh_claim_txg;
 125         zilog_t *zilog;
 126 
 127         /*
 128          * We only want to visit blocks that have been claimed but not yet
 129          * replayed; plus, in read-only mode, blocks that are already stable.
 130          */
 131         if (claim_txg == 0 && spa_writeable(td->td_spa))
 132                 return;
 133 
 134         zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 135 
 136         (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 137             claim_txg);
 138 
 139         zil_free(zilog);
 140 }
 141 
typedef enum resume_skip {
	RESUME_SKIP_ALL,	/* skip this block and all of its children */
	RESUME_SKIP_NONE,	/* visit this block normally */
	RESUME_SKIP_CHILDREN	/* visit this block but not its children */
} resume_skip_t;
 147 
 148 /*
 149  * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 150  * the block indicated by zb does not need to be visited at all. Returns
 151  * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 152  * resume point. This indicates that this block should be visited but not its
 153  * children (since they must have been visited in a previous traversal).
 154  * Otherwise returns RESUME_SKIP_NONE.
 155  */
static resume_skip_t
resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		/*
		 * If we found the block we're trying to resume from, zero
		 * the bookmark out to indicate that we have resumed.
		 * (Note this mutates *td->td_resume as a side effect.)
		 */
		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			bzero(td->td_resume, sizeof (*zb));
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	/* Not resuming, or this block still needs a full visit. */
	return (RESUME_SKIP_NONE);
}
 180 
 181 static void
 182 traverse_prefetch_metadata(traverse_data_t *td,
 183     const blkptr_t *bp, const zbookmark_phys_t *zb)
 184 {
 185         arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 186 
 187         if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 188                 return;
 189         /*
 190          * If we are in the process of resuming, don't prefetch, because
 191          * some children will not be needed (and in fact may have already
 192          * been freed).
 193          */
 194         if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
 195                 return;
 196         if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg ||
 197             bp->blk_birth >= td->td_max_txg)
 198                 return;
 199         if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 200                 return;
 201 
 202         (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 203             ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 204 }
 205 
 206 static boolean_t
 207 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 208 {
 209         ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 210         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
 211             BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 212                 return (B_FALSE);
 213         return (B_TRUE);
 214 }
 215 
/*
 * Recursively visit bp and everything below it, invoking td->td_func
 * according to the TRAVERSE_PRE/TRAVERSE_POST flags.  Returns 0 on
 * success; a nonzero callback or I/O error stops the walk and, when a
 * resume bookmark was supplied, records in td->td_resume where to pick
 * the traversal back up.
 */
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	zbookmark_phys_t czb;
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	/* Honor any pending resume bookmark before doing real work. */
	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    (td->td_hole_birth_enabled_txg <= td->td_min_txg ||
		    td->td_hole_birth_enabled_txg > td->td_max_txg))
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg ||
	    bp->blk_birth >= td->td_max_txg) {
		/* Born outside the (td_min_txg, td_max_txg) window. */
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		/*
		 * Throttle against the prefetch thread: wait until it has
		 * accounted for this block in pd_bytes_fetched before
		 * consuming those bytes here.
		 */
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp)) {
		/* A hole has no children; just invoke the callback once. */
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		/* handle pausing at a common point */
		if (err == ERESTART)
			td->td_paused = B_TRUE;
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		/* Indirect block: read it and recurse into each child bp. */
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		cbp = buf->b_data;

		/* Prefetch all children before visiting any of them. */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		/* Dnode block: recurse into each dnode it contains. */
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		dnode_phys_t *child_dnp = buf->b_data;

		for (i = 0; i < epb; i++) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			err = traverse_dnode(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		/* Objset block: walk the meta-dnode and accounting dnodes. */
		arc_flags_t flags = ARC_FLAG_WAIT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;

		objset_phys_t *osp = buf->b_data;
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		/*
		 * Only touch the user/group-used dnodes when the on-disk
		 * objset block is large enough to actually contain them.
		 */
		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			err = traverse_dnode(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
		}
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			err = traverse_dnode(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	/* Post-order callback runs only if the subtree walk succeeded. */
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if (hard && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit.  This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	/* if we walked over all bp bookmark must be cleared */
	if (!err && !td->td_paused && td->td_resume != NULL &&
	    bp == td->td_rootbp && td->td_pfd != NULL) {
		bzero(td->td_resume, sizeof (*td->td_resume));
	}

	return (err);
}
 439 
 440 static void
 441 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 442     uint64_t objset, uint64_t object)
 443 {
 444         int j;
 445         zbookmark_phys_t czb;
 446 
 447         for (j = 0; j < dnp->dn_nblkptr; j++) {
 448                 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 449                 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
 450         }
 451 
 452         if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 453                 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 454                 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
 455         }
 456 }
 457 
/*
 * Visit all block pointers (and the spill block, if any) of one dnode,
 * invoking the whole-dnode pre/post callbacks around the walk as
 * requested by td_flags.
 */
static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	/*
	 * When resuming, objects below the bookmark have already been
	 * visited.  The meta-dnode is always walked since it spans all
	 * objects.
	 */
	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		/* Whole-dnode callback (ZB_DNODE_* marker bookmark). */
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	/* Walk each top-level block pointer of the dnode. */
	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	/* Then the spill block, if this dnode has one. */
	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		/* Whole-dnode callback after all children were visited. */
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}
	return (err);
}
 504 
 505 /* ARGSUSED */
 506 static int
 507 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 508     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 509 {
 510         prefetch_data_t *pfd = arg;
 511         arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 512 
 513         ASSERT(pfd->pd_bytes_fetched >= 0);
 514         if (bp == NULL)
 515                 return (0);
 516         if (pfd->pd_cancel)
 517                 return (SET_ERROR(EINTR));
 518 
 519         if (!prefetch_needed(pfd, bp))
 520                 return (0);
 521 
 522         mutex_enter(&pfd->pd_mtx);
 523         while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 524                 cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
 525         pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 526         cv_broadcast(&pfd->pd_cv);
 527         mutex_exit(&pfd->pd_mtx);
 528 
 529         (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 530             ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 531 
 532         return (0);
 533 }
 534 
 535 static void
 536 traverse_prefetch_thread(void *arg)
 537 {
 538         traverse_data_t *td_main = arg;
 539         traverse_data_t td = *td_main;
 540         zbookmark_phys_t czb;
 541 
 542         td.td_func = traverse_prefetcher;
 543         td.td_arg = td_main->td_pfd;
 544         td.td_pfd = NULL;
 545         td.td_resume = &td_main->td_pfd->pd_resume;
 546 
 547         SET_BOOKMARK(&czb, td.td_objset,
 548             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 549         (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 550 
 551         mutex_enter(&td_main->td_pfd->pd_mtx);
 552         td_main->td_pfd->pd_exited = B_TRUE;
 553         cv_broadcast(&td_main->td_pfd->pd_cv);
 554         mutex_exit(&td_main->td_pfd->pd_mtx);
 555 }
 556 
 557 /*
 558  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 559  * in syncing context).
 560  */
 561 static int
 562 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 563     uint64_t txg_start, uint64_t txg_finish, zbookmark_phys_t *resume,
 564     int flags, blkptr_cb_t func, void *arg)
 565 {
 566         traverse_data_t td;
 567         prefetch_data_t pd = { 0 };
 568         zbookmark_phys_t czb;
 569         int err;
 570 
 571         ASSERT(ds == NULL || objset == ds->ds_object);
 572         ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 573 
 574         td.td_spa = spa;
 575         td.td_objset = objset;
 576         td.td_rootbp = rootbp;
 577         td.td_min_txg = txg_start;
 578         td.td_max_txg = txg_finish;
 579         td.td_resume = resume;
 580         td.td_func = func;
 581         td.td_arg = arg;
 582         td.td_pfd = &pd;
 583         td.td_flags = flags;
 584         td.td_paused = B_FALSE;
 585         td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 586 
 587         if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 588                 VERIFY(spa_feature_enabled_txg(spa,
 589                     SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
 590         } else {
 591                 td.td_hole_birth_enabled_txg = UINT64_MAX;
 592         }
 593 
 594         pd.pd_flags = flags;
 595         if (resume != NULL)
 596                 pd.pd_resume = *resume;
 597         mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 598         cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 599 
 600         /* See comment on ZIL traversal in dsl_scan_visitds. */
 601         if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 602                 arc_flags_t flags = ARC_FLAG_WAIT;
 603                 objset_phys_t *osp;
 604                 arc_buf_t *buf;
 605 
 606                 err = arc_read(NULL, td.td_spa, rootbp,
 607                     arc_getbuf_func, &buf,
 608                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
 609                 if (err != 0)
 610                         return (err);
 611 
 612                 osp = buf->b_data;
 613                 traverse_zil(&td, &osp->os_zil_header);
 614                 arc_buf_destroy(buf, &buf);
 615         }
 616 
 617         if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 618             0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
 619             &td, TQ_NOQUEUE))
 620                 pd.pd_exited = B_TRUE;
 621 
 622         SET_BOOKMARK(&czb, td.td_objset,
 623             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 624         err = traverse_visitbp(&td, NULL, rootbp, &czb);
 625 
 626         mutex_enter(&pd.pd_mtx);
 627         pd.pd_cancel = B_TRUE;
 628         cv_broadcast(&pd.pd_cv);
 629         while (!pd.pd_exited)
 630                 cv_wait(&pd.pd_cv, &pd.pd_mtx);
 631         mutex_exit(&pd.pd_mtx);
 632 
 633         mutex_destroy(&pd.pd_mtx);
 634         cv_destroy(&pd.pd_cv);
 635 
 636         return (err);
 637 }
 638 
 639 /*
 640  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 641  * in syncing context).
 642  */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	/*
	 * Traverse ds from txg_start with no upper txg bound, optionally
	 * resuming from (and updating) *resume.
	 */
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, UINT64_MAX, resume, flags,
	    func, arg));
}
 652 
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	/* Convenience wrapper: a non-resumable dataset traversal. */
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}
 659 
int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	/*
	 * Traverse a dataset whose DSL metadata is gone: only its root bp
	 * remains, so bookmarks use the reserved ZB_DESTROYED_OBJSET id.
	 */
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, UINT64_MAX, resume, flags, func, arg));
}
 668 
 669 /*
 670  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 671  */
int
traverse_pool(spa_t *spa, uint64_t txg_start, uint64_t txg_finish, int flags,
    blkptr_cb_t func, void *arg, zbookmark_phys_t *zb)
{
	int err = 0, lasterr = 0;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS (skipped when resuming into a specific dataset) */
	if (!zb || (zb->zb_objset == 0 && zb->zb_object == 0)) {
		err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
		    txg_start, txg_finish, NULL, flags, func, arg);
		if (err != 0)
			return (err);
	}

	/* visit each dataset */
	for (uint64_t obj = (zb && !ZB_IS_ZERO(zb))? zb->zb_objset : 1;
	    err == 0 || (err != ESRCH && hard);
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			/* with TRAVERSE_HARD, skip over unreadable objects */
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			objset_t *os;
			boolean_t os_is_snapshot = B_FALSE;
			uint64_t txg = txg_start;
			uint64_t ctxg;		/* dataset creation txg */
			uint64_t max_txg = txg_finish;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}

			dsl_pool_config_enter(dp, FTAG);
			err = dmu_objset_from_ds(ds, &os);
			if (err == 0)
				os_is_snapshot = dmu_objset_is_snapshot(os);

			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				dsl_dataset_rele(ds, FTAG);
				if (hard)
					continue;
				break;
			}
			ctxg = dsl_dataset_phys(ds)->ds_creation_txg;

			/* a txg-bounded traverse walks over snapshots only */
			if (max_txg != UINT64_MAX && !os_is_snapshot) {
				dsl_dataset_rele(ds, FTAG);
				continue;
			}
			/* skip datasets created at/after the upper bound */
			if (max_txg != UINT64_MAX && ctxg >= max_txg) {
				dsl_dataset_rele(ds, FTAG);
				continue;
			}
			/* skip snapshots created at or before txg_start */
			if (os_is_snapshot && ctxg <= txg_start) {
				dsl_dataset_rele(ds, FTAG);
				continue;
			}
			/* don't revisit blocks owned by an earlier snapshot */
			if (max_txg == UINT64_MAX &&
			    dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			if (txg > max_txg)
				max_txg = txg;
			err = traverse_impl(spa, ds, ds->ds_object,
			    &dsl_dataset_phys(ds)->ds_bp,
			    txg, max_txg, zb, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0) {
				if (!hard)
					return (err);
				lasterr = err;
			}
			/* when resuming into one dataset, stop after it */
			if (zb && !ZB_IS_ZERO(zb))
				break;
		}
	}
	if (err == ESRCH) {
		/* zero bookmark means we are done */
		if (zb)
			bzero(zb, sizeof (*zb));
		err = 0;
	}
	return (err != 0 ? err : lasterr);
}