Print this page
2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dsl_scan.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_scan.c
↓ open down ↓ 12 lines elided ↑ open up ↑
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright (c) 2012 by Delphix. All rights reserved.
  23   24   */
  24   25  
  25   26  #include <sys/dsl_scan.h>
  26   27  #include <sys/dsl_pool.h>
  27   28  #include <sys/dsl_dataset.h>
  28   29  #include <sys/dsl_prop.h>
  29   30  #include <sys/dsl_dir.h>
  30   31  #include <sys/dsl_synctask.h>
  31   32  #include <sys/dnode.h>
  32   33  #include <sys/dmu_tx.h>
↓ open down ↓ 4 lines elided ↑ open up ↑
  37   38  #include <sys/zfs_context.h>
  38   39  #include <sys/fs/zfs.h>
  39   40  #include <sys/zfs_znode.h>
  40   41  #include <sys/spa_impl.h>
  41   42  #include <sys/vdev_impl.h>
  42   43  #include <sys/zil_impl.h>
  43   44  #include <sys/zio_checksum.h>
  44   45  #include <sys/ddt.h>
  45   46  #include <sys/sa.h>
  46   47  #include <sys/sa_impl.h>
       48 +#include <sys/zfeature.h>
  47   49  #ifdef _KERNEL
  48   50  #include <sys/zfs_vfsops.h>
  49   51  #endif
  50   52  
  51   53  typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
  52   54  
  53   55  static scan_cb_t dsl_scan_defrag_cb;
  54   56  static scan_cb_t dsl_scan_scrub_cb;
  55   57  static scan_cb_t dsl_scan_remove_cb;
  56   58  static dsl_syncfunc_t dsl_scan_cancel_sync;
↓ open down ↓ 318 lines elided ↑ open up ↑
 375  377  
 376  378  int
 377  379  dsl_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bpp,
 378  380      arc_done_func_t *done, void *private, int priority, int zio_flags,
 379  381      uint32_t *arc_flags, const zbookmark_t *zb)
 380  382  {
 381  383          return (arc_read_nolock(pio, spa, bpp, done, private,
 382  384              priority, zio_flags, arc_flags, zb));
 383  385  }
 384  386  
 385      -static boolean_t
 386      -bookmark_is_zero(const zbookmark_t *zb)
 387      -{
 388      -        return (zb->zb_objset == 0 && zb->zb_object == 0 &&
 389      -            zb->zb_level == 0 && zb->zb_blkid == 0);
 390      -}
 391      -
 392      -/* dnp is the dnode for zb1->zb_object */
 393      -static boolean_t
 394      -bookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
 395      -    const zbookmark_t *zb2)
 396      -{
 397      -        uint64_t zb1nextL0, zb2thisobj;
 398      -
 399      -        ASSERT(zb1->zb_objset == zb2->zb_objset);
 400      -        ASSERT(zb2->zb_level == 0);
 401      -
 402      -        /*
 403      -         * A bookmark in the deadlist is considered to be after
 404      -         * everything else.
 405      -         */
 406      -        if (zb2->zb_object == DMU_DEADLIST_OBJECT)
 407      -                return (B_TRUE);
 408      -
 409      -        /* The objset_phys_t isn't before anything. */
 410      -        if (dnp == NULL)
 411      -                return (B_FALSE);
 412      -
 413      -        zb1nextL0 = (zb1->zb_blkid + 1) <<
 414      -            ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
 415      -
 416      -        zb2thisobj = zb2->zb_object ? zb2->zb_object :
 417      -            zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
 418      -
 419      -        if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
 420      -                uint64_t nextobj = zb1nextL0 *
 421      -                    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
 422      -                return (nextobj <= zb2thisobj);
 423      -        }
 424      -
 425      -        if (zb1->zb_object < zb2thisobj)
 426      -                return (B_TRUE);
 427      -        if (zb1->zb_object > zb2thisobj)
 428      -                return (B_FALSE);
 429      -        if (zb2->zb_object == DMU_META_DNODE_OBJECT)
 430      -                return (B_FALSE);
 431      -        return (zb1nextL0 <= zb2->zb_blkid);
 432      -}
 433      -
 434  387  static uint64_t
 435  388  dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 436  389  {
 437  390          uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
 438  391          if (dsl_dataset_is_snapshot(ds))
 439  392                  return (MIN(smt, ds->ds_phys->ds_creation_txg));
 440  393          return (smt);
 441  394  }
 442  395  
 443  396  static void
↓ open down ↓ 11 lines elided ↑ open up ↑
 455  408          uint64_t elapsed_nanosecs;
 456  409          int mintime;
 457  410  
 458  411          /* we never skip user/group accounting objects */
 459  412          if (zb && (int64_t)zb->zb_object < 0)
 460  413                  return (B_FALSE);
 461  414  
 462  415          if (scn->scn_pausing)
 463  416                  return (B_TRUE); /* we're already pausing */
 464  417  
 465      -        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark))
      418 +        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 466  419                  return (B_FALSE); /* we're resuming */
 467  420  
 468  421          /* We only know how to resume from level-0 blocks. */
 469  422          if (zb && zb->zb_level != 0)
 470  423                  return (B_FALSE);
 471  424  
 472  425          mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 473  426              zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
 474  427          elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
 475  428          if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
↓ open down ↓ 134 lines elided ↑ open up ↑
 610  563              ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
 611  564  }
 612  565  
 613  566  static boolean_t
 614  567  dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
 615  568      const zbookmark_t *zb)
 616  569  {
 617  570          /*
 618  571           * We never skip over user/group accounting objects (obj<0)
 619  572           */
 620      -        if (!bookmark_is_zero(&scn->scn_phys.scn_bookmark) &&
      573 +        if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
 621  574              (int64_t)zb->zb_object >= 0) {
 622  575                  /*
 623  576                   * If we already visited this bp & everything below (in
 624  577                   * a prior txg sync), don't bother doing it again.
 625  578                   */
 626      -                if (bookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
      579 +                if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
 627  580                          return (B_TRUE);
 628  581  
 629  582                  /*
 630  583                   * If we found the block we're trying to resume from, or
 631  584                   * we went past it to a different object, zero it out to
 632  585                   * indicate that it's OK to start checking for pausing
 633  586                   * again.
 634  587                   */
 635  588                  if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
 636  589                      zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
↓ open down ↓ 172 lines elided ↑ open up ↑
 809  762  
 810  763          dprintf_bp(bp,
 811  764              "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
 812  765              ds, ds ? ds->ds_object : 0,
 813  766              zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
 814  767              pbuf, bp);
 815  768  
 816  769          if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
 817  770                  return;
 818  771  
 819      -        if (BP_GET_TYPE(bp) != DMU_OT_USERGROUP_USED) {
 820      -                /*
 821      -                 * For non-user-accounting blocks, we need to read the
 822      -                 * new bp (from a deleted snapshot, found in
 823      -                 * check_existing_xlation).  If we used the old bp,
 824      -                 * pointers inside this block from before we resumed
 825      -                 * would be untranslated.
 826      -                 *
 827      -                 * For user-accounting blocks, we need to read the old
 828      -                 * bp, because we will apply the entire space delta to
 829      -                 * it (original untranslated -> translations from
 830      -                 * deleted snap -> now).
 831      -                 */
 832      -                bp_toread = *bp;
 833      -        }
 834      -
 835  772          if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
 836  773              &buf) != 0)
 837  774                  return;
 838  775  
 839  776          /*
 840  777           * If dsl_scan_ddt() has already visited this block, it will have
 841  778           * already done any translations or scrubbing, so don't call the
 842  779           * callback again.
 843  780           */
 844  781          if (ddt_class_contains(dp->dp_spa,
↓ open down ↓ 544 lines elided ↑ open up ↑
1389 1326                  dsl_dataset_rele(ds, FTAG);
1390 1327  
1391 1328                  dsl_scan_visitds(scn, dsobj, tx);
1392 1329                  zap_cursor_fini(&zc);
1393 1330                  if (scn->scn_pausing)
1394 1331                          return;
1395 1332          }
1396 1333          zap_cursor_fini(&zc);
1397 1334  }
1398 1335  
1399      -static int
1400      -dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
     1336 +static boolean_t
     1337 +dsl_scan_free_should_pause(dsl_scan_t *scn)
1401 1338  {
1402      -        dsl_scan_t *scn = arg;
1403 1339          uint64_t elapsed_nanosecs;
1404 1340  
1405 1341          elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
1406      -
1407      -        if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
     1342 +        return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1408 1343              (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1409 1344              txg_sync_waiting(scn->scn_dp)) ||
1410      -            spa_shutting_down(scn->scn_dp->dp_spa))
1411      -                return (ERESTART);
     1345 +            spa_shutting_down(scn->scn_dp->dp_spa));
     1346 +}
1412 1347  
     1348 +static int
     1349 +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
     1350 +{
     1351 +        dsl_scan_t *scn = arg;
     1352 +
     1353 +        if (!scn->scn_is_bptree ||
     1354 +            (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
     1355 +                if (dsl_scan_free_should_pause(scn))
     1356 +                        return (ERESTART);
     1357 +        }
     1358 +
1413 1359          zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
1414 1360              dmu_tx_get_txg(tx), bp, 0));
1415 1361          dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
1416 1362              -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
1417 1363              -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
1418 1364          scn->scn_visited_this_txg++;
1419 1365          return (0);
1420 1366  }
1421 1367  
1422 1368  boolean_t
↓ open down ↓ 3 lines elided ↑ open up ↑
1426 1372          uint64_t used = 0, comp, uncomp;
1427 1373  
1428 1374          if (spa->spa_load_state != SPA_LOAD_NONE)
1429 1375                  return (B_FALSE);
1430 1376          if (spa_shutting_down(spa))
1431 1377                  return (B_FALSE);
1432 1378  
1433 1379          if (scn->scn_phys.scn_state == DSS_SCANNING)
1434 1380                  return (B_TRUE);
1435 1381  
     1382 +        if (spa_feature_is_active(spa,
     1383 +            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
     1384 +                return (B_TRUE);
     1385 +        }
1436 1386          if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
1437 1387                  (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
1438 1388                      &used, &comp, &uncomp);
1439 1389          }
1440 1390          return (used != 0);
1441 1391  }
1442 1392  
1443 1393  void
1444 1394  dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
1445 1395  {
↓ open down ↓ 26 lines elided ↑ open up ↑
1472 1422          scn->scn_sync_start_time = gethrtime();
1473 1423          spa->spa_scrub_active = B_TRUE;
1474 1424  
1475 1425          /*
1476 1426           * First process the free list.  If we pause the free, don't do
1477 1427           * any scanning.  This ensures that there is no free list when
1478 1428           * we are scanning, so the scan code doesn't have to worry about
1479 1429           * traversing it.
1480 1430           */
1481 1431          if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
     1432 +                scn->scn_is_bptree = B_FALSE;
1482 1433                  scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
1483 1434                      NULL, ZIO_FLAG_MUSTSUCCEED);
1484 1435                  err = bpobj_iterate(&dp->dp_free_bpobj,
1485      -                    dsl_scan_free_cb, scn, tx);
     1436 +                    dsl_scan_free_block_cb, scn, tx);
1486 1437                  VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
     1438 +
     1439 +                if (err == 0 && spa_feature_is_active(spa,
     1440 +                    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
     1441 +                        scn->scn_is_bptree = B_TRUE;
     1442 +                        scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
     1443 +                            NULL, ZIO_FLAG_MUSTSUCCEED);
     1444 +                        err = bptree_iterate(dp->dp_meta_objset,
     1445 +                            dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
     1446 +                            scn, tx);
     1447 +                        VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
     1448 +                        if (err != 0)
     1449 +                                return;
     1450 +
     1451 +                        /* disable async destroy feature */
     1452 +                        spa_feature_decr(spa,
     1453 +                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY], tx);
     1454 +                        ASSERT(!spa_feature_is_active(spa,
     1455 +                            &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]));
     1456 +                        VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
     1457 +                            DMU_POOL_DIRECTORY_OBJECT,
     1458 +                            DMU_POOL_BPTREE_OBJ, tx));
     1459 +                        VERIFY3U(0, ==, bptree_free(dp->dp_meta_objset,
     1460 +                            dp->dp_bptree_obj, tx));
     1461 +                        dp->dp_bptree_obj = 0;
     1462 +                }
1487 1463                  if (scn->scn_visited_this_txg) {
1488 1464                          zfs_dbgmsg("freed %llu blocks in %llums from "
1489      -                            "free_bpobj txg %llu",
     1465 +                            "free_bpobj/bptree txg %llu",
1490 1466                              (longlong_t)scn->scn_visited_this_txg,
1491 1467                              (longlong_t)
1492 1468                              (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1493 1469                              (longlong_t)tx->tx_txg);
1494 1470                          scn->scn_visited_this_txg = 0;
1495 1471                          /*
1496 1472                           * Re-sync the ddt so that we can further modify
1497 1473                           * it when doing bprewrite.
1498 1474                           */
1499 1475                          ddt_sync(spa, tx->tx_txg);
↓ open down ↓ 94 lines elided ↑ open up ↑
1594 1570          /*
1595 1571           * If we resume after a reboot, zab will be NULL; don't record
1596 1572           * incomplete stats in that case.
1597 1573           */
1598 1574          if (zab == NULL)
1599 1575                  return;
1600 1576  
1601 1577          for (i = 0; i < 4; i++) {
1602 1578                  int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
1603 1579                  int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
     1580 +                if (t & DMU_OT_NEWTYPE)
     1581 +                        t = DMU_OT_OTHER;
1604 1582                  zfs_blkstat_t *zb = &zab->zab_type[l][t];
1605 1583                  int equal;
1606 1584  
1607 1585                  zb->zb_count++;
1608 1586                  zb->zb_asize += BP_GET_ASIZE(bp);
1609 1587                  zb->zb_lsize += BP_GET_LSIZE(bp);
1610 1588                  zb->zb_psize += BP_GET_PSIZE(bp);
1611 1589                  zb->zb_gangs += BP_COUNT_GANG(bp);
1612 1590  
1613 1591                  switch (BP_GET_NDVAS(bp)) {
↓ open down ↓ 153 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX