3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 #include <sys/dsl_scan.h>
  26 #include <sys/dsl_pool.h>
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dsl_prop.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_synctask.h>
  31 #include <sys/dnode.h>
  32 #include <sys/dmu_tx.h>
  33 #include <sys/dmu_objset.h>
  34 #include <sys/arc.h>
  35 #include <sys/zap.h>
  36 #include <sys/zio.h>
  37 #include <sys/zfs_context.h>
  38 #include <sys/fs/zfs.h>
  39 #include <sys/zfs_znode.h>
  40 #include <sys/spa_impl.h>
  41 #include <sys/vdev_impl.h>
  42 #include <sys/zil_impl.h>
  43 #include <sys/zio_checksum.h>
  44 #include <sys/ddt.h>
  45 #include <sys/sa.h>
  46 #include <sys/sa_impl.h>
  47 #ifdef _KERNEL
  48 #include <sys/zfs_vfsops.h>
  49 #endif
  50 
/* Callback signature used by the scan traversal code. */
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
static scan_cb_t dsl_scan_remove_cb;
static dsl_syncfunc_t dsl_scan_cancel_sync;
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);

/* Tunables throttling scan I/O relative to normal pool traffic. */
int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
int zfs_resilver_delay = 2;		/* number of ticks to delay resilver */
int zfs_scrub_delay = 4;		/* number of ticks to delay scrub */
int zfs_scan_idle = 50;			/* idle window in clock ticks */

/* Minimum time budgets for each class of scan work per txg sync. */
int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
 
 
 365 }
 366 
 367 int
 368 dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
 369     arc_done_func_t *done, void *private, int priority, int zio_flags,
 370     uint32_t *arc_flags, const zbookmark_t *zb)
 371 {
 372         return (arc_read(pio, spa, bpp, pbuf, done, private,
 373             priority, zio_flags, arc_flags, zb));
 374 }
 375 
 376 int
 377 dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
 378     arc_done_func_t *done, void *private, int priority, int zio_flags,
 379     uint32_t *arc_flags, const zbookmark_t *zb)
 380 {
 381         return (arc_read_nolock(pio, spa, bpp, done, private,
 382             priority, zio_flags, arc_flags, zb));
 383 }
 384 
 385 static boolean_t
 386 bookmark_is_zero(const zbookmark_t *zb)
 387 {
 388         return (zb->zb_objset == 0 && zb->zb_object == 0 &&
 389             zb->zb_level == 0 && zb->zb_blkid == 0);
 390 }
 391 
/*
 * Return B_TRUE if bookmark zb1 (and everything it covers) is strictly
 * before level-0 bookmark zb2 in traversal order.  dnp is the dnode
 * for zb1->zb_object.
 */
static boolean_t
bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
    const zbookmark_t *zb2)
{
	uint64_t zb1nextL0, zb2thisobj;

	ASSERT(zb1->zb_objset == zb2->zb_objset);
	ASSERT(zb2->zb_level == 0);

	/*
	 * A bookmark in the deadlist is considered to be after
	 * everything else.
	 */
	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
		return (B_TRUE);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/* First level-0 blkid of zb1's object that zb1 does NOT cover. */
	zb1nextL0 = (zb1->zb_blkid + 1) <<
	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));

	/*
	 * Object zb2 refers to; for a meta-dnode bookmark (zb_object
	 * == 0) derive the object number from the blkid.
	 */
	zb2thisobj = zb2->zb_object ? zb2->zb_object :
	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		/* Translate zb1's meta-dnode position to an object number. */
		uint64_t nextobj = zb1nextL0 *
		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
		return (nextobj <= zb2thisobj);
	}

	if (zb1->zb_object < zb2thisobj)
		return (B_TRUE);
	if (zb1->zb_object > zb2thisobj)
		return (B_FALSE);
	/* Same object; the meta-dnode itself is never "before". */
	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
		return (B_FALSE);
	return (zb1nextL0 <= zb2->zb_blkid);
}
 433 
 434 static uint64_t
 435 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 436 {
 437         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 438         if (dsl_dataset_is_snapshot(ds))
 439                 return (MIN(smt, ds->ds_phys->ds_creation_txg));
 440         return (smt);
 441 }
 442 
/*
 * Persist the in-core scan state (scn_phys) to the MOS pool directory
 * ZAP under DMU_POOL_SCAN in this tx.
 */
static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}
 451 
 452 static boolean_t
 453 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
 454 {
 455         uint64_t elapsed_nanosecs;
 456         int mintime;
 457 
 458         /* we never skip user/group accounting objects */
 459         if (zb && (int64_t)zb->zb_object < 0)
 460                 return (B_FALSE);
 461 
 462         if (scn->scn_pausing)
 463                 return (B_TRUE); /* we're already pausing */
 464 
 465         if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
 466                 return (B_FALSE); /* we're resuming */
 467 
 468         /* We only know how to resume from level-0 blocks. */
 469         if (zb && zb->zb_level != 0)
 470                 return (B_FALSE);
 471 
 472         mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 473             zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 474         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 475         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 476             (elapsed_nanosecs / MICROSEC > mintime &&
 477             txg_sync_waiting(scn->scn_dp)) ||
 478             spa_shutting_down(scn->scn_dp->dp_spa)) {
 479                 if (zb) {
 480                         dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
 481                             (longlong_t)zb->zb_objset,
 482                             (longlong_t)zb->zb_object,
 483                             (longlong_t)zb->zb_level,
 484                             (longlong_t)zb->zb_blkid);
 485                         scn->scn_phys.scn_bookmark = *zb;
 
 600                 return;
 601 
 602         SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
 603 
 604         /*
 605          * XXX need to make sure all of these arc_read() prefetches are
 606          * done before setting xlateall (similar to dsl_read())
 607          */
 608         (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 609             buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 610             ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 611 }
 612 
/*
 * When resuming a paused scan, decide whether the block at "zb" was
 * already visited in a prior txg sync and can be skipped (return
 * B_TRUE).  Side effect: once the saved bookmark is reached or
 * passed, it is zeroed so that pause checks are re-enabled.
 */
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}
 
 799         if (dsl_scan_check_pause(scn, zb))
 800                 return;
 801 
 802         if (dsl_scan_check_resume(scn, dnp, zb))
 803                 return;
 804 
 805         if (bp->blk_birth == 0)
 806                 return;
 807 
 808         scn->scn_visited_this_txg++;
 809 
 810         dprintf_bp(bp,
 811             "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
 812             ds, ds ? ds->ds_object : 0,
 813             zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 814             pbuf, bp);
 815 
 816         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 817                 return;
 818 
 819         if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
 820                 /*
 821                  * For non-user-accounting blocks, we need to read the
 822                  * new bp (from a deleted snapshot, found in
 823                  * check_existing_xlation).  If we used the old bp,
 824                  * pointers inside this block from before we resumed
 825                  * would be untranslated.
 826                  *
 827                  * For user-accounting blocks, we need to read the old
 828                  * bp, because we will apply the entire space delta to
 829                  * it (original untranslated -> translations from
 830                  * deleted snap -> now).
 831                  */
 832                 bp_toread = *bp;
 833         }
 834 
 835         if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
 836             &buf) != 0)
 837                 return;
 838 
 839         /*
 840          * If dsl_scan_ddt() has aready visited this block, it will have
 841          * already done any translations or scrubbing, so don't call the
 842          * callback again.
 843          */
 844         if (ddt_class_contains(dp->dp_spa,
 845             scn->scn_phys.scn_ddt_class_max, bp)) {
 846                 ASSERT(buf == NULL);
 847                 return;
 848         }
 849 
 850         /*
 851          * If this block is from the future (after cur_max_txg), then we
 852          * are doing this on behalf of a deleted snapshot, and we will
 853          * revisit the future block on the next pass of this dataset.
 854          * Don't scan it now unless we need to because something
 
1379                 if (za.za_first_integer != 0) {
1380                         scn->scn_phys.scn_cur_min_txg =
1381                             MAX(scn->scn_phys.scn_min_txg,
1382                             za.za_first_integer);
1383                 } else {
1384                         scn->scn_phys.scn_cur_min_txg =
1385                             MAX(scn->scn_phys.scn_min_txg,
1386                             ds->ds_phys->ds_prev_snap_txg);
1387                 }
1388                 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1389                 dsl_dataset_rele(ds, FTAG);
1390 
1391                 dsl_scan_visitds(scn, dsobj, tx);
1392                 zap_cursor_fini(&zc);
1393                 if (scn->scn_pausing)
1394                         return;
1395         }
1396         zap_cursor_fini(&zc);
1397 }
1398 
1399 static int
1400 dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1401 {
1402         dsl_scan_t *scn = arg;
1403         uint64_t elapsed_nanosecs;
1404 
1405         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1406 
1407         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1408             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1409             txg_sync_waiting(scn->scn_dp)) ||
1410             spa_shutting_down(scn->scn_dp->dp_spa))
1411                 return (ERESTART);
1412 
1413         zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1414             dmu_tx_get_txg(tx), bp, 0));
1415         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1416             -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1417             -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1418         scn->scn_visited_this_txg++;
1419         return (0);
1420 }
1421 
1422 boolean_t
1423 dsl_scan_active(dsl_scan_t *scn)
1424 {
1425         spa_t *spa = scn->scn_dp->dp_spa;
1426         uint64_t used = 0, comp, uncomp;
1427 
1428         if (spa->spa_load_state != SPA_LOAD_NONE)
1429                 return (B_FALSE);
1430         if (spa_shutting_down(spa))
1431                 return (B_FALSE);
1432 
1433         if (scn->scn_phys.scn_state == DSS_SCANNING)
1434                 return (B_TRUE);
1435 
1436         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1437                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1438                     &used, &comp, &uncomp);
1439         }
1440         return (used != 0);
1441 }
1442 
1443 void
1444 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1445 {
1446         dsl_scan_t *scn = dp->dp_scan;
1447         spa_t *spa = dp->dp_spa;
1448         int err;
1449 
1450         /*
1451          * Check for scn_restart_txg before checking spa_load_state, so
1452          * that we can restart an old-style scan while the pool is being
1453          * imported (see dsl_scan_init).
1454          */
1455         if (scn->scn_restart_txg != 0 &&
 
1462                     func, tx->tx_txg);
1463                 dsl_scan_setup_sync(scn, &func, tx);
1464         }
1465 
1466         if (!dsl_scan_active(scn) ||
1467             spa_sync_pass(dp->dp_spa) > 1)
1468                 return;
1469 
1470         scn->scn_visited_this_txg = 0;
1471         scn->scn_pausing = B_FALSE;
1472         scn->scn_sync_start_time = gethrtime();
1473         spa->spa_scrub_active = B_TRUE;
1474 
1475         /*
1476          * First process the free list.  If we pause the free, don't do
1477          * any scanning.  This ensures that there is no free list when
1478          * we are scanning, so the scan code doesn't have to worry about
1479          * traversing it.
1480          */
1481         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1482                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1483                     NULL, ZIO_FLAG_MUSTSUCCEED);
1484                 err = bpobj_iterate(&dp->dp_free_bpobj,
1485                     dsl_scan_free_cb, scn, tx);
1486                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1487                 if (scn->scn_visited_this_txg) {
1488                         zfs_dbgmsg("freed %llu blocks in %llums from "
1489                             "free_bpobj txg %llu",
1490                             (longlong_t)scn->scn_visited_this_txg,
1491                             (longlong_t)
1492                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1493                             (longlong_t)tx->tx_txg);
1494                         scn->scn_visited_this_txg = 0;
1495                         /*
1496                          * Re-sync the ddt so that we can further modify
1497                          * it when doing bprewrite.
1498                          */
1499                         ddt_sync(spa, tx->tx_txg);
1500                 }
1501                 if (err == ERESTART)
1502                         return;
1503         }
1504 
1505         if (scn->scn_phys.scn_state != DSS_SCANNING)
1506                 return;
1507 
1508         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1509             scn->scn_phys.scn_ddt_class_max) {
 
1584 
1585 /*
1586  * scrub consumers
1587  */
1588 
1589 static void
1590 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1591 {
1592         int i;
1593 
1594         /*
1595          * If we resume after a reboot, zab will be NULL; don't record
1596          * incomplete stats in that case.
1597          */
1598         if (zab == NULL)
1599                 return;
1600 
1601         for (i = 0; i < 4; i++) {
1602                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1603                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1604                 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1605                 int equal;
1606 
1607                 zb->zb_count++;
1608                 zb->zb_asize += BP_GET_ASIZE(bp);
1609                 zb->zb_lsize += BP_GET_LSIZE(bp);
1610                 zb->zb_psize += BP_GET_PSIZE(bp);
1611                 zb->zb_gangs += BP_COUNT_GANG(bp);
1612 
1613                 switch (BP_GET_NDVAS(bp)) {
1614                 case 2:
1615                         if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1616                             DVA_GET_VDEV(&bp->blk_dva[1]))
1617                                 zb->zb_ditto_2_of_2_samevdev++;
1618                         break;
1619                 case 3:
1620                         equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1621                             DVA_GET_VDEV(&bp->blk_dva[1])) +
1622                             (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1623                             DVA_GET_VDEV(&bp->blk_dva[2])) +
 
 | 
 
 
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/dsl_scan.h>
  27 #include <sys/dsl_pool.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_prop.h>
  30 #include <sys/dsl_dir.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/dnode.h>
  33 #include <sys/dmu_tx.h>
  34 #include <sys/dmu_objset.h>
  35 #include <sys/arc.h>
  36 #include <sys/zap.h>
  37 #include <sys/zio.h>
  38 #include <sys/zfs_context.h>
  39 #include <sys/fs/zfs.h>
  40 #include <sys/zfs_znode.h>
  41 #include <sys/spa_impl.h>
  42 #include <sys/vdev_impl.h>
  43 #include <sys/zil_impl.h>
  44 #include <sys/zio_checksum.h>
  45 #include <sys/ddt.h>
  46 #include <sys/sa.h>
  47 #include <sys/sa_impl.h>
  48 #include <sys/zfeature.h>
  49 #ifdef _KERNEL
  50 #include <sys/zfs_vfsops.h>
  51 #endif
  52 
/* Callback signature used by the scan traversal code. */
typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);

static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
static scan_cb_t dsl_scan_remove_cb;
static dsl_syncfunc_t dsl_scan_cancel_sync;
static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);

/* Tunables throttling scan I/O relative to normal pool traffic. */
int zfs_top_maxinflight = 32;		/* maximum I/Os per top-level */
int zfs_resilver_delay = 2;		/* number of ticks to delay resilver */
int zfs_scrub_delay = 4;		/* number of ticks to delay scrub */
int zfs_scan_idle = 50;			/* idle window in clock ticks */

/* Minimum time budgets for each class of scan work per txg sync. */
int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
 
 
 367 }
 368 
 369 int
 370 dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
 371     arc_done_func_t *done, void *private, int priority, int zio_flags,
 372     uint32_t *arc_flags, const zbookmark_t *zb)
 373 {
 374         return (arc_read(pio, spa, bpp, pbuf, done, private,
 375             priority, zio_flags, arc_flags, zb));
 376 }
 377 
 378 int
 379 dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
 380     arc_done_func_t *done, void *private, int priority, int zio_flags,
 381     uint32_t *arc_flags, const zbookmark_t *zb)
 382 {
 383         return (arc_read_nolock(pio, spa, bpp, done, private,
 384             priority, zio_flags, arc_flags, zb));
 385 }
 386 
 387 static uint64_t
 388 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 389 {
 390         uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 391         if (dsl_dataset_is_snapshot(ds))
 392                 return (MIN(smt, ds->ds_phys->ds_creation_txg));
 393         return (smt);
 394 }
 395 
/*
 * Persist the in-core scan state (scn_phys) to the MOS pool directory
 * ZAP under DMU_POOL_SCAN in this tx.
 */
static void
dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
{
	VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
	    &scn->scn_phys, tx));
}
 404 
 405 static boolean_t
 406 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
 407 {
 408         uint64_t elapsed_nanosecs;
 409         int mintime;
 410 
 411         /* we never skip user/group accounting objects */
 412         if (zb && (int64_t)zb->zb_object < 0)
 413                 return (B_FALSE);
 414 
 415         if (scn->scn_pausing)
 416                 return (B_TRUE); /* we're already pausing */
 417 
 418         if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 419                 return (B_FALSE); /* we're resuming */
 420 
 421         /* We only know how to resume from level-0 blocks. */
 422         if (zb && zb->zb_level != 0)
 423                 return (B_FALSE);
 424 
 425         mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 426             zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 427         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 428         if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
 429             (elapsed_nanosecs / MICROSEC > mintime &&
 430             txg_sync_waiting(scn->scn_dp)) ||
 431             spa_shutting_down(scn->scn_dp->dp_spa)) {
 432                 if (zb) {
 433                         dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
 434                             (longlong_t)zb->zb_objset,
 435                             (longlong_t)zb->zb_object,
 436                             (longlong_t)zb->zb_level,
 437                             (longlong_t)zb->zb_blkid);
 438                         scn->scn_phys.scn_bookmark = *zb;
 
 553                 return;
 554 
 555         SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
 556 
 557         /*
 558          * XXX need to make sure all of these arc_read() prefetches are
 559          * done before setting xlateall (similar to dsl_read())
 560          */
 561         (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 562             buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 563             ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 564 }
 565 
/*
 * When resuming a paused scan, decide whether the block at "zb" was
 * already visited in a prior txg sync and can be skipped (return
 * B_TRUE).  Side effect: once the saved bookmark is reached or
 * passed, it is zeroed so that pause checks are re-enabled.
 */
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
    const zbookmark_t *zb)
{
	/*
	 * We never skip over user/group accounting objects (obj<0)
	 */
	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
	    (int64_t)zb->zb_object >= 0) {
		/*
		 * If we already visited this bp & everything below (in
		 * a prior txg sync), don't bother doing it again.
		 */
		if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
			return (B_TRUE);

		/*
		 * If we found the block we're trying to resume from, or
		 * we went past it to a different object, zero it out to
		 * indicate that it's OK to start checking for pausing
		 * again.
		 */
		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
			dprintf("resuming at %llx/%llx/%llx/%llx\n",
			    (longlong_t)zb->zb_objset,
			    (longlong_t)zb->zb_object,
			    (longlong_t)zb->zb_level,
			    (longlong_t)zb->zb_blkid);
			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
		}
	}
	return (B_FALSE);
}
 
 752         if (dsl_scan_check_pause(scn, zb))
 753                 return;
 754 
 755         if (dsl_scan_check_resume(scn, dnp, zb))
 756                 return;
 757 
 758         if (bp->blk_birth == 0)
 759                 return;
 760 
 761         scn->scn_visited_this_txg++;
 762 
 763         dprintf_bp(bp,
 764             "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
 765             ds, ds ? ds->ds_object : 0,
 766             zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 767             pbuf, bp);
 768 
 769         if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 770                 return;
 771 
 772         if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
 773             &buf) != 0)
 774                 return;
 775 
 776         /*
 777          * If dsl_scan_ddt() has aready visited this block, it will have
 778          * already done any translations or scrubbing, so don't call the
 779          * callback again.
 780          */
 781         if (ddt_class_contains(dp->dp_spa,
 782             scn->scn_phys.scn_ddt_class_max, bp)) {
 783                 ASSERT(buf == NULL);
 784                 return;
 785         }
 786 
 787         /*
 788          * If this block is from the future (after cur_max_txg), then we
 789          * are doing this on behalf of a deleted snapshot, and we will
 790          * revisit the future block on the next pass of this dataset.
 791          * Don't scan it now unless we need to because something
 
1316                 if (za.za_first_integer != 0) {
1317                         scn->scn_phys.scn_cur_min_txg =
1318                             MAX(scn->scn_phys.scn_min_txg,
1319                             za.za_first_integer);
1320                 } else {
1321                         scn->scn_phys.scn_cur_min_txg =
1322                             MAX(scn->scn_phys.scn_min_txg,
1323                             ds->ds_phys->ds_prev_snap_txg);
1324                 }
1325                 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1326                 dsl_dataset_rele(ds, FTAG);
1327 
1328                 dsl_scan_visitds(scn, dsobj, tx);
1329                 zap_cursor_fini(&zc);
1330                 if (scn->scn_pausing)
1331                         return;
1332         }
1333         zap_cursor_fini(&zc);
1334 }
1335 
1336 static boolean_t
1337 dsl_scan_free_should_pause(dsl_scan_t *scn)
1338 {
1339         uint64_t elapsed_nanosecs;
1340 
1341         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1342         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1343             (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1344             txg_sync_waiting(scn->scn_dp)) ||
1345             spa_shutting_down(scn->scn_dp->dp_spa));
1346 }
1347 
/*
 * Callback invoked for each block pointer visited by the free-list
 * traversals in dsl_scan_sync(): issues the free and updates space
 * accounting.  Returns ERESTART to suspend the traversal for this
 * txg.
 */
static int
dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_scan_t *scn = arg;

	/*
	 * A bptree (async-destroy) traversal may only be suspended at
	 * level-0, non-objset blocks; a regular bpobj traversal may
	 * pause anywhere.
	 */
	if (!scn->scn_is_bptree ||
	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
		if (dsl_scan_free_should_pause(scn))
			return (ERESTART);
	}

	/* Issue the free asynchronously under the scan's root zio. */
	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
	    dmu_tx_get_txg(tx), bp, 0));
	/* Deduct the freed space from the free dir's accounting. */
	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
	scn->scn_visited_this_txg++;
	return (0);
}
1367 
1368 boolean_t
1369 dsl_scan_active(dsl_scan_t *scn)
1370 {
1371         spa_t *spa = scn->scn_dp->dp_spa;
1372         uint64_t used = 0, comp, uncomp;
1373 
1374         if (spa->spa_load_state != SPA_LOAD_NONE)
1375                 return (B_FALSE);
1376         if (spa_shutting_down(spa))
1377                 return (B_FALSE);
1378 
1379         if (scn->scn_phys.scn_state == DSS_SCANNING)
1380                 return (B_TRUE);
1381 
1382         if (spa_feature_is_active(spa,
1383             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1384                 return (B_TRUE);
1385         }
1386         if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1387                 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1388                     &used, &comp, &uncomp);
1389         }
1390         return (used != 0);
1391 }
1392 
1393 void
1394 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1395 {
1396         dsl_scan_t *scn = dp->dp_scan;
1397         spa_t *spa = dp->dp_spa;
1398         int err;
1399 
1400         /*
1401          * Check for scn_restart_txg before checking spa_load_state, so
1402          * that we can restart an old-style scan while the pool is being
1403          * imported (see dsl_scan_init).
1404          */
1405         if (scn->scn_restart_txg != 0 &&
 
1412                     func, tx->tx_txg);
1413                 dsl_scan_setup_sync(scn, &func, tx);
1414         }
1415 
1416         if (!dsl_scan_active(scn) ||
1417             spa_sync_pass(dp->dp_spa) > 1)
1418                 return;
1419 
1420         scn->scn_visited_this_txg = 0;
1421         scn->scn_pausing = B_FALSE;
1422         scn->scn_sync_start_time = gethrtime();
1423         spa->spa_scrub_active = B_TRUE;
1424 
1425         /*
1426          * First process the free list.  If we pause the free, don't do
1427          * any scanning.  This ensures that there is no free list when
1428          * we are scanning, so the scan code doesn't have to worry about
1429          * traversing it.
1430          */
1431         if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1432                 scn->scn_is_bptree = B_FALSE;
1433                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1434                     NULL, ZIO_FLAG_MUSTSUCCEED);
1435                 err = bpobj_iterate(&dp->dp_free_bpobj,
1436                     dsl_scan_free_block_cb, scn, tx);
1437                 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1438 
1439                 if (err == 0 && spa_feature_is_active(spa,
1440                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1441                         scn->scn_is_bptree = B_TRUE;
1442                         scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1443                             NULL, ZIO_FLAG_MUSTSUCCEED);
1444                         err = bptree_iterate(dp->dp_meta_objset,
1445                             dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
1446                             scn, tx);
1447                         VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1448                         if (err != 0)
1449                                 return;
1450 
1451                         /* disable async destroy feature */
1452                         spa_feature_decr(spa,
1453                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
1454                         ASSERT(!spa_feature_is_active(spa,
1455                             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
1456                         VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
1457                             DMU_POOL_DIRECTORY_OBJECT,
1458                             DMU_POOL_BPTREE_OBJ, tx));
1459                         VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
1460                             dp->dp_bptree_obj, tx));
1461                         dp->dp_bptree_obj = 0;
1462                 }
1463                 if (scn->scn_visited_this_txg) {
1464                         zfs_dbgmsg("freed %llu blocks in %llums from "
1465                             "free_bpobj/bptree txg %llu",
1466                             (longlong_t)scn->scn_visited_this_txg,
1467                             (longlong_t)
1468                             (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1469                             (longlong_t)tx->tx_txg);
1470                         scn->scn_visited_this_txg = 0;
1471                         /*
1472                          * Re-sync the ddt so that we can further modify
1473                          * it when doing bprewrite.
1474                          */
1475                         ddt_sync(spa, tx->tx_txg);
1476                 }
1477                 if (err == ERESTART)
1478                         return;
1479         }
1480 
1481         if (scn->scn_phys.scn_state != DSS_SCANNING)
1482                 return;
1483 
1484         if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1485             scn->scn_phys.scn_ddt_class_max) {
 
1560 
1561 /*
1562  * scrub consumers
1563  */
1564 
1565 static void
1566 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1567 {
1568         int i;
1569 
1570         /*
1571          * If we resume after a reboot, zab will be NULL; don't record
1572          * incomplete stats in that case.
1573          */
1574         if (zab == NULL)
1575                 return;
1576 
1577         for (i = 0; i < 4; i++) {
1578                 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1579                 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1580                 if (t & DMU_OT_NEWTYPE)
1581                         t = DMU_OT_OTHER;
1582                 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1583                 int equal;
1584 
1585                 zb->zb_count++;
1586                 zb->zb_asize += BP_GET_ASIZE(bp);
1587                 zb->zb_lsize += BP_GET_LSIZE(bp);
1588                 zb->zb_psize += BP_GET_PSIZE(bp);
1589                 zb->zb_gangs += BP_COUNT_GANG(bp);
1590 
1591                 switch (BP_GET_NDVAS(bp)) {
1592                 case 2:
1593                         if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1594                             DVA_GET_VDEV(&bp->blk_dva[1]))
1595                                 zb->zb_ditto_2_of_2_samevdev++;
1596                         break;
1597                 case 3:
1598                         equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1599                             DVA_GET_VDEV(&bp->blk_dva[1])) +
1600                             (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1601                             DVA_GET_VDEV(&bp->blk_dva[2])) +
 
 |