3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/dsl_scan.h>
26 #include <sys/dsl_pool.h>
27 #include <sys/dsl_dataset.h>
28 #include <sys/dsl_prop.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_synctask.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/dmu_objset.h>
34 #include <sys/arc.h>
35 #include <sys/zap.h>
36 #include <sys/zio.h>
37 #include <sys/zfs_context.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/zfs_znode.h>
40 #include <sys/spa_impl.h>
41 #include <sys/vdev_impl.h>
42 #include <sys/zil_impl.h>
43 #include <sys/zio_checksum.h>
44 #include <sys/ddt.h>
45 #include <sys/sa.h>
46 #include <sys/sa_impl.h>
47 #ifdef _KERNEL
48 #include <sys/zfs_vfsops.h>
49 #endif
50
51 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
52
53 static scan_cb_t dsl_scan_defrag_cb;
54 static scan_cb_t dsl_scan_scrub_cb;
55 static scan_cb_t dsl_scan_remove_cb;
56 static dsl_syncfunc_t dsl_scan_cancel_sync;
57 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
58
59 int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
60 int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
61 int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
62 int zfs_scan_idle = 50; /* idle window in clock ticks */
63
64 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
65 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
66 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
365 }
366
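/* Thin wrappers that forward dsl-layer reads to the ARC. */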
367 int
368 dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
369 arc_done_func_t *done, void *private, int priority, int zio_flags,
370 uint32_t *arc_flags, const zbookmark_t *zb)
371 {
372 return (arc_read(pio, spa, bpp, pbuf, done, private,
373 priority, zio_flags, arc_flags, zb));
374 }
375
376 int
377 dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
378 arc_done_func_t *done, void *private, int priority, int zio_flags,
379 uint32_t *arc_flags, const zbookmark_t *zb)
380 {
381 return (arc_read_nolock(pio, spa, bpp, done, private,
382 priority, zio_flags, arc_flags, zb));
383 }
384
385 static boolean_t
386 bookmark_is_zero(const zbookmark_t *zb)
387 {
388 return (zb->zb_objset == 0 && zb->zb_object == 0 &&
389 zb->zb_level == 0 && zb->zb_blkid == 0);
390 }
391
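/*
 * Return B_TRUE if the block described by zb1 (and everything below it)
 * sorts before the level-0 bookmark zb2, i.e. if a traversal that paused
 * at zb2 has already fully visited zb1's subtree.
 */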
392 /* dnp is the dnode for zb1->zb_object */
393 static boolean_t
394 bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
395 const zbookmark_t *zb2)
396 {
397 uint64_t zb1nextL0, zb2thisobj;
398
399 ASSERT(zb1->zb_objset == zb2->zb_objset);
400 ASSERT(zb2->zb_level == 0);
401
402 /*
403 * A bookmark in the deadlist is considered to be after
404 * everything else.
405 */
406 if (zb2->zb_object == DMU_DEADLIST_OBJECT)
407 return (B_TRUE);
408
409 /* The objset_phys_t isn't before anything. */
410 if (dnp == NULL)
411 return (B_FALSE);
412
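/*
 * A level-n block covers 2^(n * (indblkshift - SPA_BLKPTRSHIFT)) level-0
 * blocks, so zb1nextL0 is the first level-0 blkid past the range covered
 * by zb1's block (e.g. with 16K indirect blocks, indblkshift 14, each L1
 * block covers 128 L0 blocks).
 */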
413 zb1nextL0 = (zb1->zb_blkid + 1) <<
414 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
415
416 zb2thisobj = zb2->zb_object ? zb2->zb_object :
417 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
418
419 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
420 uint64_t nextobj = zb1nextL0 *
421 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
422 return (nextobj <= zb2thisobj);
423 }
424
425 if (zb1->zb_object < zb2thisobj)
426 return (B_TRUE);
427 if (zb1->zb_object > zb2thisobj)
428 return (B_FALSE);
429 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
430 return (B_FALSE);
431 return (zb1nextL0 <= zb2->zb_blkid);
432 }
433
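/*
 * Upper bound on the txgs this scan will visit in the given dataset:
 * the scan-wide scn_max_txg, further clamped to the creation txg for
 * snapshots.
 */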
434 static uint64_t
435 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
436 {
437 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
438 if (dsl_dataset_is_snapshot(ds))
439 return (MIN(smt, ds->ds_phys->ds_creation_txg));
440 return (smt);
441 }
442
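/*
 * Persist the in-core scan state (scn_phys) to the DMU_POOL_SCAN entry
 * of the pool directory ZAP so it survives export and reboot.
 */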
443 static void
444 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
445 {
446 VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
447 DMU_POOL_DIRECTORY_OBJECT,
448 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
449 &scn->scn_phys, tx));
450 }
451
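/*
 * Decide whether to pause the scan for this txg: pause once we have
 * exceeded zfs_txg_timeout, exceeded the per-scan minimum time while a
 * txg sync is waiting, or the pool is shutting down.  Note that
 * elapsed_nanosecs / MICROSEC is a millisecond value, which is what the
 * *_min_time_ms tunables are compared against.
 */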
452 static boolean_t
453 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
454 {
455 uint64_t elapsed_nanosecs;
456 int mintime;
457
458 /* we never skip user/group accounting objects */
459 if (zb && (int64_t)zb->zb_object < 0)
460 return (B_FALSE);
461
462 if (scn->scn_pausing)
463 return (B_TRUE); /* we're already pausing */
464
465 if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
466 return (B_FALSE); /* we're resuming */
467
468 /* We only know how to resume from level-0 blocks. */
469 if (zb && zb->zb_level != 0)
470 return (B_FALSE);
471
472 mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
473 zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
474 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
475 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
476 (elapsed_nanosecs / MICROSEC > mintime &&
477 txg_sync_waiting(scn->scn_dp)) ||
478 spa_shutting_down(scn->scn_dp->dp_spa)) {
479 if (zb) {
480 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
481 (longlong_t)zb->zb_objset,
482 (longlong_t)zb->zb_object,
483 (longlong_t)zb->zb_level,
484 (longlong_t)zb->zb_blkid);
485 scn->scn_phys.scn_bookmark = *zb;
600 return;
601
602 SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
603
604 /*
605 * XXX need to make sure all of these arc_read() prefetches are
606 * done before setting xlateall (similar to dsl_read())
607 */
608 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
609 buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
610 ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
611 }
612
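/*
 * Return B_TRUE if this block can be skipped because a previous sync
 * pass already visited it (its bookmark sorts before the saved resume
 * bookmark).  Once we reach or pass the saved bookmark it is cleared so
 * that pause checks can take effect again.
 */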
613 static boolean_t
614 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
615 const zbookmark_t *zb)
616 {
617 /*
618 * We never skip over user/group accounting objects (obj<0)
619 */
620 if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
621 (int64_t)zb->zb_object >= 0) {
622 /*
623 * If we already visited this bp & everything below (in
624 * a prior txg sync), don't bother doing it again.
625 */
626 if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
627 return (B_TRUE);
628
629 /*
630 * If we found the block we're trying to resume from, or
631 * we went past it to a different object, zero it out to
632 * indicate that it's OK to start checking for pausing
633 * again.
634 */
635 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
636 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
637 dprintf("resuming at %llx/%llx/%llx/%llx\n",
638 (longlong_t)zb->zb_objset,
639 (longlong_t)zb->zb_object,
640 (longlong_t)zb->zb_level,
641 (longlong_t)zb->zb_blkid);
642 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
643 }
644 }
645 return (B_FALSE);
646 }
799 if (dsl_scan_check_pause(scn, zb))
800 return;
801
802 if (dsl_scan_check_resume(scn, dnp, zb))
803 return;
804
805 if (bp->blk_birth == 0)
806 return;
807
808 scn->scn_visited_this_txg++;
809
810 dprintf_bp(bp,
811 "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
812 ds, ds ? ds->ds_object : 0,
813 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
814 pbuf, bp);
815
816 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
817 return;
818
819 if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
820 /*
821 * For non-user-accounting blocks, we need to read the
822 * new bp (from a deleted snapshot, found in
823 * check_existing_xlation). If we used the old bp,
824 * pointers inside this block from before we resumed
825 * would be untranslated.
826 *
827 * For user-accounting blocks, we need to read the old
828 * bp, because we will apply the entire space delta to
829 * it (original untranslated -> translations from
830 * deleted snap -> now).
831 */
832 bp_toread = *bp;
833 }
834
835 if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
836 &buf) != 0)
837 return;
838
839 /*
840 * If dsl_scan_ddt() has already visited this block, it will have
841 * already done any translations or scrubbing, so don't call the
842 * callback again.
843 */
844 if (ddt_class_contains(dp->dp_spa,
845 scn->scn_phys.scn_ddt_class_max, bp)) {
846 ASSERT(buf == NULL);
847 return;
848 }
849
850 /*
851 * If this block is from the future (after cur_max_txg), then we
852 * are doing this on behalf of a deleted snapshot, and we will
853 * revisit the future block on the next pass of this dataset.
854 * Don't scan it now unless we need to because something
1379 if (za.za_first_integer != 0) {
1380 scn->scn_phys.scn_cur_min_txg =
1381 MAX(scn->scn_phys.scn_min_txg,
1382 za.za_first_integer);
1383 } else {
1384 scn->scn_phys.scn_cur_min_txg =
1385 MAX(scn->scn_phys.scn_min_txg,
1386 ds->ds_phys->ds_prev_snap_txg);
1387 }
1388 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1389 dsl_dataset_rele(ds, FTAG);
1390
1391 dsl_scan_visitds(scn, dsobj, tx);
1392 zap_cursor_fini(&zc);
1393 if (scn->scn_pausing)
1394 return;
1395 }
1396 zap_cursor_fini(&zc);
1397 }
1398
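/*
 * Callback for bpobj_iterate(): free one block from the pool's free
 * list.  Return ERESTART once this txg's time budget is spent or the
 * pool is shutting down; otherwise issue an asynchronous free and
 * remove the block's space from dp_free_dir's accounting.
 */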
1399 static int
1400 dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1401 {
1402 dsl_scan_t *scn = arg;
1403 uint64_t elapsed_nanosecs;
1404
1405 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1406
1407 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1408 (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1409 txg_sync_waiting(scn->scn_dp)) ||
1410 spa_shutting_down(scn->scn_dp->dp_spa))
1411 return (ERESTART);
1412
1413 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1414 dmu_tx_get_txg(tx), bp, 0));
1415 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1416 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1417 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1418 scn->scn_visited_this_txg++;
1419 return (0);
1420 }
1421
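/*
 * Report whether scan-related work is outstanding: B_FALSE while the
 * pool is loading or shutting down, B_TRUE if a scan is in the
 * DSS_SCANNING state or the free bpobj still holds space to be freed.
 */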
1422 boolean_t
1423 dsl_scan_active(dsl_scan_t *scn)
1424 {
1425 spa_t *spa = scn->scn_dp->dp_spa;
1426 uint64_t used = 0, comp, uncomp;
1427
1428 if (spa->spa_load_state != SPA_LOAD_NONE)
1429 return (B_FALSE);
1430 if (spa_shutting_down(spa))
1431 return (B_FALSE);
1432
1433 if (scn->scn_phys.scn_state == DSS_SCANNING)
1434 return (B_TRUE);
1435
1436 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1437 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1438 &used, &comp, &uncomp);
1439 }
1440 return (used != 0);
1441 }
1442
1443 void
1444 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1445 {
1446 dsl_scan_t *scn = dp->dp_scan;
1447 spa_t *spa = dp->dp_spa;
1448 int err;
1449
1450 /*
1451 * Check for scn_restart_txg before checking spa_load_state, so
1452 * that we can restart an old-style scan while the pool is being
1453 * imported (see dsl_scan_init).
1454 */
1455 if (scn->scn_restart_txg != 0 &&
1462 func, tx->tx_txg);
1463 dsl_scan_setup_sync(scn, &func, tx);
1464 }
1465
1466 if (!dsl_scan_active(scn) ||
1467 spa_sync_pass(dp->dp_spa) > 1)
1468 return;
1469
1470 scn->scn_visited_this_txg = 0;
1471 scn->scn_pausing = B_FALSE;
1472 scn->scn_sync_start_time = gethrtime();
1473 spa->spa_scrub_active = B_TRUE;
1474
1475 /*
1476 * First process the free list. If we pause the free, don't do
1477 * any scanning. This ensures that there is no free list when
1478 * we are scanning, so the scan code doesn't have to worry about
1479 * traversing it.
1480 */
1481 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1482 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1483 NULL, ZIO_FLAG_MUSTSUCCEED);
1484 err = bpobj_iterate(&dp->dp_free_bpobj,
1485 dsl_scan_free_cb, scn, tx);
1486 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1487 if (scn->scn_visited_this_txg) {
1488 zfs_dbgmsg("freed %llu blocks in %llums from "
1489 "free_bpobj txg %llu",
1490 (longlong_t)scn->scn_visited_this_txg,
1491 (longlong_t)
1492 (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1493 (longlong_t)tx->tx_txg);
1494 scn->scn_visited_this_txg = 0;
1495 /*
1496 * Re-sync the ddt so that we can further modify
1497 * it when doing bprewrite.
1498 */
1499 ddt_sync(spa, tx->tx_txg);
1500 }
1501 if (err == ERESTART)
1502 return;
1503 }
1504
1505 if (scn->scn_phys.scn_state != DSS_SCANNING)
1506 return;
1507
1508 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1509 scn->scn_phys.scn_ddt_class_max) {
1584
1585 /*
1586 * scrub consumers
1587 */
1588
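/*
 * Accumulate per-level and per-type block statistics: each block is
 * counted four times, once for each combination of {its level, all
 * levels} and {its type, all types}.  The ditto counters track extra
 * DVAs that landed on the same vdev.
 */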
1589 static void
1590 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1591 {
1592 int i;
1593
1594 /*
1595 * If we resume after a reboot, zab will be NULL; don't record
1596 * incomplete stats in that case.
1597 */
1598 if (zab == NULL)
1599 return;
1600
1601 for (i = 0; i < 4; i++) {
1602 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1603 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
1604 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1605 int equal;
1606
1607 zb->zb_count++;
1608 zb->zb_asize += BP_GET_ASIZE(bp);
1609 zb->zb_lsize += BP_GET_LSIZE(bp);
1610 zb->zb_psize += BP_GET_PSIZE(bp);
1611 zb->zb_gangs += BP_COUNT_GANG(bp);
1612
1613 switch (BP_GET_NDVAS(bp)) {
1614 case 2:
1615 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1616 DVA_GET_VDEV(&bp->blk_dva[1]))
1617 zb->zb_ditto_2_of_2_samevdev++;
1618 break;
1619 case 3:
1620 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1621 DVA_GET_VDEV(&bp->blk_dva[1])) +
1622 (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1623 DVA_GET_VDEV(&bp->blk_dva[2])) +
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 */
25
26 #include <sys/dsl_scan.h>
27 #include <sys/dsl_pool.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_prop.h>
30 #include <sys/dsl_dir.h>
31 #include <sys/dsl_synctask.h>
32 #include <sys/dnode.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/arc.h>
36 #include <sys/zap.h>
37 #include <sys/zio.h>
38 #include <sys/zfs_context.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/zfs_znode.h>
41 #include <sys/spa_impl.h>
42 #include <sys/vdev_impl.h>
43 #include <sys/zil_impl.h>
44 #include <sys/zio_checksum.h>
45 #include <sys/ddt.h>
46 #include <sys/sa.h>
47 #include <sys/sa_impl.h>
48 #include <sys/zfeature.h>
49 #ifdef _KERNEL
50 #include <sys/zfs_vfsops.h>
51 #endif
52
53 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
54
55 static scan_cb_t dsl_scan_defrag_cb;
56 static scan_cb_t dsl_scan_scrub_cb;
57 static scan_cb_t dsl_scan_remove_cb;
58 static dsl_syncfunc_t dsl_scan_cancel_sync;
59 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
60
61 int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
62 int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
63 int zfs_scrub_delay = 4; /* number of ticks to delay scrub */
64 int zfs_scan_idle = 50; /* idle window in clock ticks */
65
66 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
67 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
68 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
367 }
368
369 int
370 dsl_read(zio_t *pio, spa_t *spa, const blkptr_t *bpp, arc_buf_t *pbuf,
371 arc_done_func_t *done, void *private, int priority, int zio_flags,
372 uint32_t *arc_flags, const zbookmark_t *zb)
373 {
374 return (arc_read(pio, spa, bpp, pbuf, done, private,
375 priority, zio_flags, arc_flags, zb));
376 }
377
378 int
379 dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
380 arc_done_func_t *done, void *private, int priority, int zio_flags,
381 uint32_t *arc_flags, const zbookmark_t *zb)
382 {
383 return (arc_read_nolock(pio, spa, bpp, done, private,
384 priority, zio_flags, arc_flags, zb));
385 }
386
387 static uint64_t
388 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
389 {
390 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
391 if (dsl_dataset_is_snapshot(ds))
392 return (MIN(smt, ds->ds_phys->ds_creation_txg));
393 return (smt);
394 }
395
396 static void
397 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
398 {
399 VERIFY(0 == zap_update(scn->scn_dp->dp_meta_objset,
400 DMU_POOL_DIRECTORY_OBJECT,
401 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
402 &scn->scn_phys, tx));
403 }
404
405 static boolean_t
406 dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
407 {
408 uint64_t elapsed_nanosecs;
409 int mintime;
410
411 /* we never skip user/group accounting objects */
412 if (zb && (int64_t)zb->zb_object < 0)
413 return (B_FALSE);
414
415 if (scn->scn_pausing)
416 return (B_TRUE); /* we're already pausing */
417
418 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
419 return (B_FALSE); /* we're resuming */
420
421 /* We only know how to resume from level-0 blocks. */
422 if (zb && zb->zb_level != 0)
423 return (B_FALSE);
424
425 mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
426 zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
427 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
428 if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
429 (elapsed_nanosecs / MICROSEC > mintime &&
430 txg_sync_waiting(scn->scn_dp)) ||
431 spa_shutting_down(scn->scn_dp->dp_spa)) {
432 if (zb) {
433 dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
434 (longlong_t)zb->zb_objset,
435 (longlong_t)zb->zb_object,
436 (longlong_t)zb->zb_level,
437 (longlong_t)zb->zb_blkid);
438 scn->scn_phys.scn_bookmark = *zb;
553 return;
554
555 SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
556
557 /*
558 * XXX need to make sure all of these arc_read() prefetches are
559 * done before setting xlateall (similar to dsl_read())
560 */
561 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
562 buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
563 ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
564 }
565
566 static boolean_t
567 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
568 const zbookmark_t *zb)
569 {
570 /*
571 * We never skip over user/group accounting objects (obj<0)
572 */
573 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
574 (int64_t)zb->zb_object >= 0) {
575 /*
576 * If we already visited this bp & everything below (in
577 * a prior txg sync), don't bother doing it again.
578 */
579 if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
580 return (B_TRUE);
581
582 /*
583 * If we found the block we're trying to resume from, or
584 * we went past it to a different object, zero it out to
585 * indicate that it's OK to start checking for pausing
586 * again.
587 */
588 if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
589 zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
590 dprintf("resuming at %llx/%llx/%llx/%llx\n",
591 (longlong_t)zb->zb_objset,
592 (longlong_t)zb->zb_object,
593 (longlong_t)zb->zb_level,
594 (longlong_t)zb->zb_blkid);
595 bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
596 }
597 }
598 return (B_FALSE);
599 }
752 if (dsl_scan_check_pause(scn, zb))
753 return;
754
755 if (dsl_scan_check_resume(scn, dnp, zb))
756 return;
757
758 if (bp->blk_birth == 0)
759 return;
760
761 scn->scn_visited_this_txg++;
762
763 dprintf_bp(bp,
764 "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
765 ds, ds ? ds->ds_object : 0,
766 zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
767 pbuf, bp);
768
769 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
770 return;
771
772 if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
773 &buf) != 0)
774 return;
775
776 /*
777 * If dsl_scan_ddt() has already visited this block, it will have
778 * already done any translations or scrubbing, so don't call the
779 * callback again.
780 */
781 if (ddt_class_contains(dp->dp_spa,
782 scn->scn_phys.scn_ddt_class_max, bp)) {
783 ASSERT(buf == NULL);
784 return;
785 }
786
787 /*
788 * If this block is from the future (after cur_max_txg), then we
789 * are doing this on behalf of a deleted snapshot, and we will
790 * revisit the future block on the next pass of this dataset.
791 * Don't scan it now unless we need to because something
1316 if (za.za_first_integer != 0) {
1317 scn->scn_phys.scn_cur_min_txg =
1318 MAX(scn->scn_phys.scn_min_txg,
1319 za.za_first_integer);
1320 } else {
1321 scn->scn_phys.scn_cur_min_txg =
1322 MAX(scn->scn_phys.scn_min_txg,
1323 ds->ds_phys->ds_prev_snap_txg);
1324 }
1325 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
1326 dsl_dataset_rele(ds, FTAG);
1327
1328 dsl_scan_visitds(scn, dsobj, tx);
1329 zap_cursor_fini(&zc);
1330 if (scn->scn_pausing)
1331 return;
1332 }
1333 zap_cursor_fini(&zc);
1334 }
1335
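/*
 * Return B_TRUE once the free work for this txg should stop: we have
 * exceeded zfs_txg_timeout, exceeded zfs_free_min_time_ms while a txg
 * sync is waiting, or the pool is shutting down.
 */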
1336 static boolean_t
1337 dsl_scan_free_should_pause(dsl_scan_t *scn)
1338 {
1339 uint64_t elapsed_nanosecs;
1340
1341 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1342 return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1343 (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1344 txg_sync_waiting(scn->scn_dp)) ||
1345 spa_shutting_down(scn->scn_dp->dp_spa));
1346 }
1347
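/*
 * Callback for both bpobj_iterate() and bptree_iterate(): free one
 * block.  When walking the async-destroy bptree (scn_is_bptree), only
 * level-0 non-objset blocks are considered as pause points.  Return
 * ERESTART to stop and resume in a later txg; otherwise issue an async
 * free and update dp_free_dir's accounting.
 */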
1348 static int
1349 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1350 {
1351 dsl_scan_t *scn = arg;
1352
1353 if (!scn->scn_is_bptree ||
1354 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
1355 if (dsl_scan_free_should_pause(scn))
1356 return (ERESTART);
1357 }
1358
1359 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1360 dmu_tx_get_txg(tx), bp, 0));
1361 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1362 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1363 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1364 scn->scn_visited_this_txg++;
1365 return (0);
1366 }
1367
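/*
 * Report whether scan-related work is outstanding: a scrub/resilver in
 * the DSS_SCANNING state, queued async_destroy work, or space remaining
 * on the free bpobj.
 */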
1368 boolean_t
1369 dsl_scan_active(dsl_scan_t *scn)
1370 {
1371 spa_t *spa = scn->scn_dp->dp_spa;
1372 uint64_t used = 0, comp, uncomp;
1373
1374 if (spa->spa_load_state != SPA_LOAD_NONE)
1375 return (B_FALSE);
1376 if (spa_shutting_down(spa))
1377 return (B_FALSE);
1378
1379 if (scn->scn_phys.scn_state == DSS_SCANNING)
1380 return (B_TRUE);
1381
1382 if (spa_feature_is_active(spa,
1383 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1384 return (B_TRUE);
1385 }
1386 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1387 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1388 &used, &comp, &uncomp);
1389 }
1390 return (used != 0);
1391 }
1392
1393 void
1394 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1395 {
1396 dsl_scan_t *scn = dp->dp_scan;
1397 spa_t *spa = dp->dp_spa;
1398 int err;
1399
1400 /*
1401 * Check for scn_restart_txg before checking spa_load_state, so
1402 * that we can restart an old-style scan while the pool is being
1403 * imported (see dsl_scan_init).
1404 */
1405 if (scn->scn_restart_txg != 0 &&
1412 func, tx->tx_txg);
1413 dsl_scan_setup_sync(scn, &func, tx);
1414 }
1415
1416 if (!dsl_scan_active(scn) ||
1417 spa_sync_pass(dp->dp_spa) > 1)
1418 return;
1419
1420 scn->scn_visited_this_txg = 0;
1421 scn->scn_pausing = B_FALSE;
1422 scn->scn_sync_start_time = gethrtime();
1423 spa->spa_scrub_active = B_TRUE;
1424
1425 /*
1426 * First process the free list. If we pause the free, don't do
1427 * any scanning. This ensures that there is no free list when
1428 * we are scanning, so the scan code doesn't have to worry about
1429 * traversing it.
1430 */
1431 if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1432 scn->scn_is_bptree = B_FALSE;
1433 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1434 NULL, ZIO_FLAG_MUSTSUCCEED);
1435 err = bpobj_iterate(&dp->dp_free_bpobj,
1436 dsl_scan_free_block_cb, scn, tx);
1437 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1438
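/*
 * If the async_destroy feature is active, continue freeing from the
 * bptree of destroyed datasets once the free bpobj has drained; when
 * the bptree itself is empty, remove it and deactivate the feature.
 */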
1439 if (err == 0 && spa_feature_is_active(spa,
1440 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1441 scn->scn_is_bptree = B_TRUE;
1442 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1443 NULL, ZIO_FLAG_MUSTSUCCEED);
1444 err = bptree_iterate(dp->dp_meta_objset,
1445 dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
1446 scn, tx);
1447 VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
1448 if (err != 0)
1449 return;
1450
1451 /* disable async destroy feature */
1452 spa_feature_decr(spa,
1453 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
1454 ASSERT(!spa_feature_is_active(spa,
1455 &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
1456 VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
1457 DMU_POOL_DIRECTORY_OBJECT,
1458 DMU_POOL_BPTREE_OBJ, tx));
1459 VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
1460 dp->dp_bptree_obj, tx));
1461 dp->dp_bptree_obj = 0;
1462 }
1463 if (scn->scn_visited_this_txg) {
1464 zfs_dbgmsg("freed %llu blocks in %llums from "
1465 "free_bpobj/bptree txg %llu",
1466 (longlong_t)scn->scn_visited_this_txg,
1467 (longlong_t)
1468 (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1469 (longlong_t)tx->tx_txg);
1470 scn->scn_visited_this_txg = 0;
1471 /*
1472 * Re-sync the ddt so that we can further modify
1473 * it when doing bprewrite.
1474 */
1475 ddt_sync(spa, tx->tx_txg);
1476 }
1477 if (err == ERESTART)
1478 return;
1479 }
1480
1481 if (scn->scn_phys.scn_state != DSS_SCANNING)
1482 return;
1483
1484 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
1485 scn->scn_phys.scn_ddt_class_max) {
1560
1561 /*
1562 * scrub consumers
1563 */
1564
1565 static void
1566 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
1567 {
1568 int i;
1569
1570 /*
1571 * If we resume after a reboot, zab will be NULL; don't record
1572 * incomplete stats in that case.
1573 */
1574 if (zab == NULL)
1575 return;
1576
1577 for (i = 0; i < 4; i++) {
1578 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1579 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
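/*
 * Fold new-style (DMU_OT_NEWTYPE) object types into DMU_OT_OTHER so
 * that the index stays within the legacy zab_type[] table.
 */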
1580 if (t & DMU_OT_NEWTYPE)
1581 t = DMU_OT_OTHER;
1582 zfs_blkstat_t *zb = &zab->zab_type[l][t];
1583 int equal;
1584
1585 zb->zb_count++;
1586 zb->zb_asize += BP_GET_ASIZE(bp);
1587 zb->zb_lsize += BP_GET_LSIZE(bp);
1588 zb->zb_psize += BP_GET_PSIZE(bp);
1589 zb->zb_gangs += BP_COUNT_GANG(bp);
1590
1591 switch (BP_GET_NDVAS(bp)) {
1592 case 2:
1593 if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1594 DVA_GET_VDEV(&bp->blk_dva[1]))
1595 zb->zb_ditto_2_of_2_samevdev++;
1596 break;
1597 case 3:
1598 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1599 DVA_GET_VDEV(&bp->blk_dva[1])) +
1600 (DVA_GET_VDEV(&bp->blk_dva[0]) ==
1601 DVA_GET_VDEV(&bp->blk_dva[2])) +