2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>

usr/src/uts/common/fs/zfs/dsl_dataset.c (old version)

   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 by Delphix. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_prop.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/dmu_traverse.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/zio.h>
  37 #include <sys/zap.h>
  38 #include <sys/unique.h>
  39 #include <sys/zfs_context.h>
  40 #include <sys/zfs_ioctl.h>
  41 #include <sys/spa.h>
  42 #include <sys/zfs_znode.h>
  43 #include <sys/zfs_onexit.h>
  44 #include <sys/zvol.h>
  45 #include <sys/dsl_scan.h>
  46 #include <sys/dsl_deadlist.h>
  47 
  48 static char *dsl_reaper = "the grim reaper";
  49 
  50 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
  51 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
  52 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  53 
  54 #define SWITCH64(x, y) \
  55         { \
  56                 uint64_t __tmp = (x); \
  57                 (x) = (y); \


  83 
  84         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  85         return (new_bytes - old_bytes);
  86 }
  87 
  88 void
  89 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  90 {
  91         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  92         int compressed = BP_GET_PSIZE(bp);
  93         int uncompressed = BP_GET_UCSIZE(bp);
  94         int64_t delta;
  95 
  96         dprintf_bp(bp, "ds=%p", ds);
  97 
  98         ASSERT(dmu_tx_is_syncing(tx));
  99         /* It could have been compressed away to nothing */
 100         if (BP_IS_HOLE(bp))
 101                 return;
 102         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 103         ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
 104         if (ds == NULL) {
 105                 /*
 106                  * Account for the meta-objset space in its placeholder
 107                  * dsl_dir.
 108                  */
 109                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 110                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 111                     used, compressed, uncompressed, tx);
 112                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 113                 return;
 114         }
 115         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 116 
 117         mutex_enter(&ds->ds_dir->dd_lock);
 118         mutex_enter(&ds->ds_lock);
 119         delta = parent_delta(ds, used);
 120         ds->ds_phys->ds_used_bytes += used;
 121         ds->ds_phys->ds_compressed_bytes += compressed;
 122         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 123         ds->ds_phys->ds_unique_bytes += used;
 124         mutex_exit(&ds->ds_lock);
 125         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 126             compressed, uncompressed, tx);
 127         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 128             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 129         mutex_exit(&ds->ds_dir->dd_lock);
 130 }
 131 
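/*
 * A worked example of the accounting above, with assumed numbers: a
 * newly born block with a 1 MB on-disk size adds 1 MB to both
 * ds_used_bytes and ds_unique_bytes.  If an unconsumed refreservation
 * absorbs part of the growth, parent_delta() might report only 256 KB
 * of new charge to the parent, in which case DD_USED_HEAD is charged
 * the 256 KB and the remaining 768 KB is moved from DD_USED_REFRSRV
 * to DD_USED_HEAD rather than being charged twice.
 */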
 132 int
 133 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 134     boolean_t async)
 135 {
 136         if (BP_IS_HOLE(bp))
 137                 return (0);
 138 
 139         ASSERT(dmu_tx_is_syncing(tx));
 140         ASSERT(bp->blk_birth <= tx->tx_txg);


 194                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 195                 }
 196                 ASSERT3U(ds->ds_prev->ds_object, ==,
 197                     ds->ds_phys->ds_prev_snap_obj);
 198                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 199                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 200                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 201                     ds->ds_object && bp->blk_birth >
 202                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 203                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 204                         mutex_enter(&ds->ds_prev->ds_lock);
 205                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
 206                         mutex_exit(&ds->ds_prev->ds_lock);
 207                 }
 208                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 209                         dsl_dir_transfer_space(ds->ds_dir, used,
 210                             DD_USED_HEAD, DD_USED_SNAP, tx);
 211                 }
 212         }
 213         mutex_enter(&ds->ds_lock);
 214         ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 215         ds->ds_phys->ds_used_bytes -= used;
 216         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 217         ds->ds_phys->ds_compressed_bytes -= compressed;
 218         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 219         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 220         mutex_exit(&ds->ds_lock);
 221 
 222         return (used);
 223 }
 224 
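/*
 * In brief, the two cases dsl_dataset_block_kill() handles above: a
 * block born after the previous snapshot was never visible to any
 * snapshot and can be freed immediately, while an older block may
 * still be referenced by a snapshot and so goes on this dataset's
 * deadlist instead.  A dead block born after the previous snapshot's
 * own previous snapshot is referenced only by that snapshot from now
 * on, which is why ds_prev's ds_unique_bytes grows above.
 */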
 225 uint64_t
 226 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 227 {
 228         uint64_t trysnap = 0;
 229 
 230         if (ds == NULL)
 231                 return (0);
 232         /*
 233          * The snapshot creation could fail, but that would cause an
 234          * incorrect FALSE return, which would only result in an
 235          * overestimation of the amount of space that an operation would


 801         bzero(dsphys, sizeof (dsl_dataset_phys_t));
 802         dsphys->ds_dir_obj = dd->dd_object;
 803         dsphys->ds_flags = flags;
 804         dsphys->ds_fsid_guid = unique_create();
 805         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 806             sizeof (dsphys->ds_guid));
 807         dsphys->ds_snapnames_zapobj =
 808             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 809             DMU_OT_NONE, 0, tx);
 810         dsphys->ds_creation_time = gethrestime_sec();
 811         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 812 
 813         if (origin == NULL) {
 814                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 815         } else {
 816                 dsl_dataset_t *ohds;
 817 
 818                 dsphys->ds_prev_snap_obj = origin->ds_object;
 819                 dsphys->ds_prev_snap_txg =
 820                     origin->ds_phys->ds_creation_txg;
 821                 dsphys->ds_used_bytes =
 822                     origin->ds_phys->ds_used_bytes;
 823                 dsphys->ds_compressed_bytes =
 824                     origin->ds_phys->ds_compressed_bytes;
 825                 dsphys->ds_uncompressed_bytes =
 826                     origin->ds_phys->ds_uncompressed_bytes;
 827                 dsphys->ds_bp = origin->ds_phys->ds_bp;
 828                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
 829 
 830                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
 831                 origin->ds_phys->ds_num_children++;
 832 
 833                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 834                     origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 835                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 836                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 837                 dsl_dataset_rele(ohds, FTAG);
 838 
 839                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 840                         if (origin->ds_phys->ds_next_clones_obj == 0) {
 841                                 origin->ds_phys->ds_next_clones_obj =
 842                                     zap_create(mos,


 916 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
 917 {
 918         int err;
 919         dsl_sync_task_t *dst;
 920         spa_t *spa;
 921         nvpair_t *pair;
 922         dsl_sync_task_group_t *dstg;
 923 
 924         pair = nvlist_next_nvpair(snaps, NULL);
 925         if (pair == NULL)
 926                 return (0);
 927 
 928         err = spa_open(nvpair_name(pair), &spa, FTAG);
 929         if (err)
 930                 return (err);
 931         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 932 
 933         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 934             pair = nvlist_next_nvpair(snaps, pair)) {
 935                 dsl_dataset_t *ds;
 936                 int err;
 937 
 938                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 939                 if (err == 0) {
 940                         struct dsl_ds_destroyarg *dsda;
 941 
 942                         dsl_dataset_make_exclusive(ds, dstg);
 943                         dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 944                             KM_SLEEP);
 945                         dsda->ds = ds;
 946                         dsda->defer = defer;
 947                         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 948                             dsl_dataset_destroy_sync, dsda, dstg, 0);
 949                 } else if (err == ENOENT) {
 950                         err = 0;
 951                 } else {
 952                         (void) strcpy(failed, nvpair_name(pair));
 953                         break;
 954                 }
 955         }
 956 


1065         }
1066 
1067         dd = ds->ds_dir;
1068         dummy_ds.ds_dir = dd;
1069         dummy_ds.ds_object = ds->ds_object;
1070 
1071         /*
1072          * Check for errors and mark this ds as inconsistent, in
1073          * case we crash while freeing the objects.
1074          */
1075         err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
1076             dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1077         if (err)
1078                 goto out;
1079 
1080         err = dmu_objset_from_ds(ds, &os);
1081         if (err)
1082                 goto out;
1083 
1084         /*
1085          * remove the objects in open context, so that we won't
1086          * have too much to do in syncing context.
1087          */
1088         for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1089             ds->ds_phys->ds_prev_snap_txg)) {
1090                 /*
1091                  * Ignore errors, if there is not enough disk space
1092                  * we will deal with it in dsl_dataset_destroy_sync().
1093                  */
1094                 (void) dmu_free_object(os, obj);
1095         }
1096         if (err != ESRCH)
1097                 goto out;
1098 
1099         /*
1100          * Only the ZIL knows how to free log blocks.
1101          */
1102         zil_destroy(dmu_objset_zil(os), B_FALSE);
1103 
1104         /*
1105          * Sync out all in-flight IO.
1106          */
1107         txg_wait_synced(dd->dd_pool, 0);
1108 
1109         /*
1110          * If we managed to free all the objects in open
1111          * context, the user space accounting should be zero.
1112          */
1113         if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1114             dmu_objset_userused_enabled(os)) {
1115                 uint64_t count;
1116 
1117                 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||


1223         }
1224 }
1225 
1226 /*
1227  * The unique space in the head dataset can be calculated by subtracting
1228  * the space used in the most recent snapshot, that is still being used
1229  * in this file system, from the space currently in use.  To figure out
1230  * the space in the most recent snapshot still in use, we need to take
1231  * the total space used in the snapshot and subtract out the space that
1232  * has been freed up since the snapshot was taken.
1233  */
1234 static void
1235 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1236 {
1237         uint64_t mrs_used;
1238         uint64_t dlused, dlcomp, dluncomp;
1239 
1240         ASSERT(!dsl_dataset_is_snapshot(ds));
1241 
1242         if (ds->ds_phys->ds_prev_snap_obj != 0)
1243                 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
1244         else
1245                 mrs_used = 0;
1246 
1247         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1248 
1249         ASSERT3U(dlused, <=, mrs_used);
1250         ds->ds_phys->ds_unique_bytes =
1251             ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
1252 
1253         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1254             SPA_VERSION_UNIQUE_ACCURATE)
1255                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1256 }
1257 
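/*
 * A worked instance of the formula above, with assumed numbers: if
 * the head uses 10 GB (ds_used_bytes), the most recent snapshot used
 * 8 GB when it was taken (mrs_used), and 3 GB of the snapshot's data
 * has since been freed from the head (dlused, the deadlist space),
 * then the snapshot still shares 8 - 3 = 5 GB with the head, so
 *
 *	ds_unique_bytes = 10 GB - (8 GB - 3 GB) = 5 GB.
 */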
1258 struct killarg {
1259         dsl_dataset_t *ds;
1260         dmu_tx_t *tx;
1261 };
1262 
1263 /* ARGSUSED */
1264 static int
1265 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1266     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1267 {
1268         struct killarg *ka = arg;
1269         dmu_tx_t *tx = ka->tx;
1270 
1271         if (bp == NULL)


1589         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1590         VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1591             process_old_cb, &poa, tx));
1592         VERIFY3U(zio_wait(poa.pio), ==, 0);
1593         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1594 
1595         /* change snapused */
1596         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1597             -poa.used, -poa.comp, -poa.uncomp, tx);
1598 
1599         /* swap next's deadlist to our deadlist */
1600         dsl_deadlist_close(&ds->ds_deadlist);
1601         dsl_deadlist_close(&ds_next->ds_deadlist);
1602         SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1603             ds->ds_phys->ds_deadlist_obj);
1604         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1605         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1606             ds_next->ds_phys->ds_deadlist_obj);
1607 }
1608 


1609 void
1610 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1611 {
1612         struct dsl_ds_destroyarg *dsda = arg1;
1613         dsl_dataset_t *ds = dsda->ds;
1614         int err;
1615         int after_branch_point = FALSE;
1616         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1617         objset_t *mos = dp->dp_meta_objset;
1618         dsl_dataset_t *ds_prev = NULL;
1619         boolean_t wont_destroy;
1620         uint64_t obj;
1621 
1622         wont_destroy = (dsda->defer &&
1623             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1624 
1625         ASSERT(ds->ds_owner || wont_destroy);
1626         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1627         ASSERT(ds->ds_prev == NULL ||
1628             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);


1735                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1736                                     ds_prev->ds_phys->ds_prev_snap_txg,
1737                                     ds->ds_phys->ds_prev_snap_txg,
1738                                     &used, &comp, &uncomp);
1739                                 ds_prev->ds_phys->ds_unique_bytes += used;
1740                         }
1741 
1742                         /* Adjust snapused. */
1743                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1744                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1745                             &used, &comp, &uncomp);
1746                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1747                             -used, -comp, -uncomp, tx);
1748 
1749                         /* Move blocks to be freed to pool's free list. */
1750                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1751                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1752                             tx);
1753                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1754                             DD_USED_HEAD, used, comp, uncomp, tx);
1755                         dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
1756 
1757                         /* Merge our deadlist into next's and free it. */
1758                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1759                             ds->ds_phys->ds_deadlist_obj, tx);
1760                 }
1761                 dsl_deadlist_close(&ds->ds_deadlist);
1762                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1763 
1764                 /* Collapse range in clone heads */
1765                 dsl_dataset_remove_clones_key(ds,
1766                     ds->ds_phys->ds_creation_txg, tx);
1767 
1768                 if (dsl_dataset_is_snapshot(ds_next)) {
1769                         dsl_dataset_t *ds_nextnext;
1770 
1771                         /*
1772                          * Update next's unique to include blocks which
1773                          * were previously shared by only this snapshot
1774                          * and it.  Those blocks will be born after the
1775                          * prev snap and before this snap, and will have


1811 
1812                         /*
1813                          * Reduce the amount of our unconsumed refreservation
1814                          * being charged to our parent by the amount of
1815                          * new unique data we have gained.
1816                          */
1817                         if (old_unique < ds_next->ds_reserved) {
1818                                 int64_t mrsdelta;
1819                                 uint64_t new_unique =
1820                                     ds_next->ds_phys->ds_unique_bytes;
1821 
1822                                 ASSERT(old_unique <= new_unique);
1823                                 mrsdelta = MIN(new_unique - old_unique,
1824                                     ds_next->ds_reserved - old_unique);
1825                                 dsl_dir_diduse_space(ds->ds_dir,
1826                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1827                         }
1828                 }
1829                 dsl_dataset_rele(ds_next, FTAG);
1830         } else {
1831                 /*
1832                  * There's no next snapshot, so this is a head dataset.
1833                  * Destroy the deadlist.  Unless it's a clone, the
1834                  * deadlist should be empty.  (If it's a clone, it's
1835                  * safe to ignore the deadlist contents.)
1836                  */
1837                 struct killarg ka;
1838 
1839                 dsl_deadlist_close(&ds->ds_deadlist);
1840                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1841                 ds->ds_phys->ds_deadlist_obj = 0;
1842 
1843                 /*
1844                  * Free everything that we point to (that's born after
1845                  * the previous snapshot, if we are a clone)
1846                  *
1847                  * NB: this should be very quick, because we already
1848                  * freed all the objects in open context.
1849                  */
1850                 ka.ds = ds;
1851                 ka.tx = tx;
1852                 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
1853                     TRAVERSE_POST, kill_blkptr, &ka);
1854                 ASSERT3U(err, ==, 0);
1855                 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1856                     ds->ds_phys->ds_unique_bytes == 0);
1857 
1858                 if (ds->ds_prev != NULL) {
1859                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1860                                 VERIFY3U(0, ==, zap_remove_int(mos,
1861                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1862                                     ds->ds_object, tx));
1863                         }
1864                         dsl_dataset_rele(ds->ds_prev, ds);
1865                         ds->ds_prev = ds_prev = NULL;
1866                 }
1867         }
1868 
1869         /*
1870          * This must be done after the dsl_traverse(), because it will
1871          * re-open the objset.
1872          */
1873         if (ds->ds_objset) {
1874                 dmu_objset_evict(ds->ds_objset);
1875                 ds->ds_objset = NULL;
1876         }
1877 


2027         else
2028                 crtxg = tx->tx_txg;
2029 
2030         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2031             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2032         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2033         dmu_buf_will_dirty(dbuf, tx);
2034         dsphys = dbuf->db_data;
2035         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2036         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2037         dsphys->ds_fsid_guid = unique_create();
2038         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2039             sizeof (dsphys->ds_guid));
2040         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2041         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2042         dsphys->ds_next_snap_obj = ds->ds_object;
2043         dsphys->ds_num_children = 1;
2044         dsphys->ds_creation_time = gethrestime_sec();
2045         dsphys->ds_creation_txg = crtxg;
2046         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2047         dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
2048         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2049         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2050         dsphys->ds_flags = ds->ds_phys->ds_flags;
2051         dsphys->ds_bp = ds->ds_phys->ds_bp;
2052         dmu_buf_rele(dbuf, FTAG);
2053 
2054         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2055         if (ds->ds_prev) {
2056                 uint64_t next_clones_obj =
2057                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2058                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2059                     ds->ds_object ||
2060                     ds->ds_prev->ds_phys->ds_num_children > 1);
2061                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2062                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2063                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2064                             ds->ds_prev->ds_phys->ds_creation_txg);
2065                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2066                 } else if (next_clones_obj != 0) {
2067                         remove_from_next_clones(ds->ds_prev,


2151         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2152         VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2153 
2154         /*
2155          * There may be missing entries in ds_next_clones_obj
2156          * due to a bug in a previous version of the code.
2157          * Only trust it if it has the right number of entries.
2158          */
2159         if (ds->ds_phys->ds_next_clones_obj != 0) {
2160                 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2161                     &count));
2162         }
2163         if (count != ds->ds_phys->ds_num_children - 1) {
2164                 goto fail;
2165         }
2166         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2167             zap_cursor_retrieve(&zc, &za) == 0;
2168             zap_cursor_advance(&zc)) {
2169                 dsl_dataset_t *clone;
2170                 char buf[ZFS_MAXNAMELEN];
2171                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2172                     za.za_first_integer, FTAG, &clone) != 0) {
2173                         goto fail;
2174                 }
2175                 dsl_dir_name(clone->ds_dir, buf);
2176                 VERIFY(nvlist_add_boolean(val, buf) == 0);
2177                 dsl_dataset_rele(clone, FTAG);
2178         }
2179         zap_cursor_fini(&zc);
2180         VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2181         VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2182             propval) == 0);
2183 fail:
2184         nvlist_free(val);
2185         nvlist_free(propval);
2186         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2187 }
2188 
2189 void
2190 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2191 {
2192         uint64_t refd, avail, uobjs, aobjs, ratio;
2193 
2194         dsl_dir_stats(ds->ds_dir, nv);


2277                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2278                 dsl_dataset_name(ods, stat->dds_origin);
2279                 dsl_dataset_drop_ref(ods, FTAG);
2280         } else {
2281                 stat->dds_origin[0] = '\0';
2282         }
2283         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2284 }
2285 
2286 uint64_t
2287 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2288 {
2289         return (ds->ds_fsid_guid);
2290 }
2291 
2292 void
2293 dsl_dataset_space(dsl_dataset_t *ds,
2294     uint64_t *refdbytesp, uint64_t *availbytesp,
2295     uint64_t *usedobjsp, uint64_t *availobjsp)
2296 {
2297         *refdbytesp = ds->ds_phys->ds_used_bytes;
2298         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2299         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2300                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2301         if (ds->ds_quota != 0) {
2302                 /*
2303                  * Adjust available bytes according to refquota
2304                  */
2305                 if (*refdbytesp < ds->ds_quota)
2306                         *availbytesp = MIN(*availbytesp,
2307                             ds->ds_quota - *refdbytesp);
2308                 else
2309                         *availbytesp = 0;
2310         }
2311         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2312         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2313 }
2314 
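/*
 * An example of the refquota clamp above (assumed numbers): with
 * refdbytes = 7 GB, a 10 GB refquota, and 20 GB of pool space
 * available, *availbytesp is clamped to MIN(20 GB, 10 GB - 7 GB) =
 * 3 GB; once refdbytes reaches the quota, available space is
 * reported as zero no matter how much the pool could still provide.
 */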
2315 boolean_t
2316 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2317 {


2614         ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2615         dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2616             origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2617             &pa->unique, &unused, &unused);
2618 
2619         /*
2620          * Walk the snapshots that we are moving
2621          *
2622          * Compute space to transfer.  Consider the incremental changes
2623          * to used for each snapshot:
2624          * (my used) = (prev's used) + (blocks born) - (blocks killed)
2625          * So each snapshot gave birth to:
2626          * (blocks born) = (my used) - (prev's used) + (blocks killed)
2627          * So a sequence would look like:
2628          * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
2629          * Which simplifies to:
2630          * uN + kN + kN-1 + ... + k1 + k0
2631          * Note however, if we stop before we reach the ORIGIN we get:
2632          * uN + kN + kN-1 + ... + kM - uM-1
2633          */
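/*
 * Making the telescoping concrete with assumed values for three
 * snapshots: u0 = 1 GB, k0 = 0; u1 = 3 GB, k1 = 1 GB; u2 = 4 GB,
 * k2 = 2 GB.  Summing the incremental births term by term gives
 * (4 - 3 + 2) + (3 - 1 + 1) + (1 - 0 + 0) = 7 GB, the same as the
 * simplified form uN + kN + ... + k0 = 4 + 2 + 1 + 0 = 7 GB.
 */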
2634         pa->used = origin_ds->ds_phys->ds_used_bytes;
2635         pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2636         pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2637         for (snap = list_head(&pa->shared_snaps); snap;
2638             snap = list_next(&pa->shared_snaps, snap)) {
2639                 uint64_t val, dlused, dlcomp, dluncomp;
2640                 dsl_dataset_t *ds = snap->ds;
2641 
2642                 /* Check that the snapshot name does not conflict */
2643                 VERIFY(0 == dsl_dataset_get_snapname(ds));
2644                 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2645                 if (err == 0) {
2646                         err = EEXIST;
2647                         goto out;
2648                 }
2649                 if (err != ENOENT)
2650                         goto out;
2651 
2652                 /* The very first snapshot does not have a deadlist */
2653                 if (ds->ds_phys->ds_prev_snap_obj == 0)
2654                         continue;
2655 
2656                 dsl_deadlist_space(&ds->ds_deadlist,
2657                     &dlused, &dlcomp, &dluncomp);
2658                 pa->used += dlused;
2659                 pa->comp += dlcomp;
2660                 pa->uncomp += dluncomp;
2661         }
2662 
2663         /*
2664          * If we are a clone of a clone then we never reached ORIGIN,
2665          * so we need to subtract out the clone origin's used space.
2666          */
2667         if (pa->origin_origin) {
2668                 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
2669                 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2670                 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2671         }
2672 
2673         /* Check that there is enough space here */
2674         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2675             pa->used);
2676         if (err)
2677                 return (err);
2678 
2679         /*
2680          * Compute the amounts of space that will be used by snapshots
2681          * after the promotion (for both origin and clone).  For each,
2682          * it is the amount of space that will be on all of their
2683          * deadlists (that was not born before their new origin).
2684          */
2685         if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2686                 uint64_t space;
2687 
2688                 /*


3164                 blkptr_t tmp;
3165                 tmp = csa->ohds->ds_phys->ds_bp;
3166                 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3167                 csa->cds->ds_phys->ds_bp = tmp;
3168         }
3169 
3170         /* set dd_*_bytes */
3171         {
3172                 int64_t dused, dcomp, duncomp;
3173                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3174                 uint64_t odl_used, odl_comp, odl_uncomp;
3175 
3176                 ASSERT3U(csa->cds->ds_dir->dd_phys->
3177                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3178 
3179                 dsl_deadlist_space(&csa->cds->ds_deadlist,
3180                     &cdl_used, &cdl_comp, &cdl_uncomp);
3181                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3182                     &odl_used, &odl_comp, &odl_uncomp);
3183 
3184                 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
3185                     (csa->ohds->ds_phys->ds_used_bytes + odl_used);
3186                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3187                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3188                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3189                     cdl_uncomp -
3190                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3191 
3192                 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3193                     dused, dcomp, duncomp, tx);
3194                 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3195                     -dused, -dcomp, -duncomp, tx);
3196 
3197                 /*
3198                  * The difference in the space used by snapshots is the
3199                  * difference in snapshot space due to the head's
3200                  * deadlist (since that's the only thing that's
3201                  * changing that affects the snapused).
3202                  */
3203                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3204                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3205                     &cdl_used, &cdl_comp, &cdl_uncomp);
3206                 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3207                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3208                     &odl_used, &odl_comp, &odl_uncomp);
3209                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3210                     DD_USED_HEAD, DD_USED_SNAP, tx);
3211         }
3212 
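/*
 * Illustrative numbers for the snapused transfer above: if the clone
 * deadlist the origin head receives in the swap holds cdl_used = 2 GB
 * born after dd_origin_txg, while the deadlist it gives up held
 * odl_used = 5 GB, then cdl_used - odl_used = -3 GB, i.e. 3 GB of
 * charge moves from DD_USED_SNAP back to DD_USED_HEAD.
 */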
3213         /* swap ds_*_bytes */
3214         SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
3215             csa->cds->ds_phys->ds_used_bytes);
3216         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3217             csa->cds->ds_phys->ds_compressed_bytes);
3218         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3219             csa->cds->ds_phys->ds_uncompressed_bytes);
3220         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3221             csa->cds->ds_phys->ds_unique_bytes);
3222 
3223         /* apply any parent delta for change in unconsumed refreservation */
3224         dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3225             csa->unused_refres_delta, 0, 0, tx);
3226 
3227         /*
3228          * Swap deadlists.
3229          */
3230         dsl_deadlist_close(&csa->cds->ds_deadlist);
3231         dsl_deadlist_close(&csa->ohds->ds_deadlist);
3232         SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3233             csa->cds->ds_phys->ds_deadlist_obj);
3234         dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3235             csa->cds->ds_phys->ds_deadlist_obj);


3324          * Make a space adjustment for reserved bytes.
3325          */
3326         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3327                 ASSERT3U(*used, >=,
3328                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3329                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3330                 *ref_rsrv =
3331                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3332         }
3333 
3334         if (!check_quota || ds->ds_quota == 0) {
3335                 mutex_exit(&ds->ds_lock);
3336                 return (0);
3337         }
3338         /*
3339          * If they are requesting more space, and our current estimate
3340          * is over quota, they get to try again unless the actual
3341          * on-disk is over quota and there are no pending changes (which
3342          * may free up space for us).
3343          */
3344         if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
3345                 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
3346                         error = ERESTART;
3347                 else
3348                         error = EDQUOT;
3349         }
3350         mutex_exit(&ds->ds_lock);
3351 
3352         return (error);
3353 }
3354 
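/*
 * The two over-quota outcomes above, with assumed numbers and a
 * 10 GB refquota: 9.8 GB on disk plus 0.5 GB in flight estimates
 * 10.3 GB, over quota, but inflight > 0 means pending changes could
 * still bring us back under, so the caller gets ERESTART and retries;
 * 10.2 GB on disk with nothing in flight cannot improve, so the
 * caller gets EDQUOT.
 */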
3355 /* ARGSUSED */
3356 static int
3357 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3358 {
3359         dsl_dataset_t *ds = arg1;
3360         dsl_prop_setarg_t *psa = arg2;
3361         int err;
3362 
3363         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3364                 return (ENOTSUP);
3365 
3366         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3367                 return (err);
3368 
3369         if (psa->psa_effective_value == 0)
3370                 return (0);
3371 
3372         if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
3373             psa->psa_effective_value < ds->ds_reserved)
3374                 return (ENOSPC);
3375 
3376         return (0);
3377 }
3378 
3379 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3380 
3381 void
3382 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3383 {
3384         dsl_dataset_t *ds = arg1;
3385         dsl_prop_setarg_t *psa = arg2;
3386         uint64_t effective_value = psa->psa_effective_value;
3387 
3388         dsl_prop_set_sync(ds, psa, tx);
3389         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3390 
3391         if (ds->ds_quota != effective_value) {
3392                 dmu_buf_will_dirty(ds->ds_dbuf, tx);


4106  * The written space is calculated by considering two components:  First, we
4107  * ignore any freed space, and calculate the written as new's used space
4108  * minus old's used space.  Next, we add in the amount of space that was freed
4109  * between the two snapshots, thus reducing new's used space relative to old's.
4110  * Specifically, this is the space that was born before old->ds_creation_txg,
4111  * and freed before new (ie. on new's deadlist or a previous deadlist).
4112  *
4113  * space freed                         [---------------------]
4114  * snapshots                       ---O-------O--------O-------O------
4115  *                                         oldsnap            new
4116  */
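/*
 * A worked example of the two components above (assumed numbers): if
 * new references 12 GB and oldsnap referenced 10 GB, the first
 * component is 2 GB.  If another 3 GB that was born before oldsnap's
 * creation txg has since been freed (and so sits on the deadlists
 * walked below), the space written between the snapshots is
 * 2 GB + 3 GB = 5 GB.
 */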
4117 int
4118 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4119     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4120 {
4121         int err = 0;
4122         uint64_t snapobj;
4123         dsl_pool_t *dp = new->ds_dir->dd_pool;
4124 
4125         *usedp = 0;
4126         *usedp += new->ds_phys->ds_used_bytes;
4127         *usedp -= oldsnap->ds_phys->ds_used_bytes;
4128 
4129         *compp = 0;
4130         *compp += new->ds_phys->ds_compressed_bytes;
4131         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4132 
4133         *uncompp = 0;
4134         *uncompp += new->ds_phys->ds_uncompressed_bytes;
4135         *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4136 
4137         rw_enter(&dp->dp_config_rwlock, RW_READER);
4138         snapobj = new->ds_object;
4139         while (snapobj != oldsnap->ds_object) {
4140                 dsl_dataset_t *snap;
4141                 uint64_t used, comp, uncomp;
4142 
4143                 err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4144                 if (err != 0)
4145                         break;
4146 
4147                 if (snap->ds_phys->ds_prev_snap_txg ==
4148                     oldsnap->ds_phys->ds_creation_txg) {
4149                         /*
4150                          * The blocks in the deadlist can not be born after
4151                          * ds_prev_snap_txg, so get the whole deadlist space,
4152                          * which is more efficient (especially for old-format
4153                          * deadlists).  Unfortunately the deadlist code
4154                          * doesn't have enough information to make this
4155                          * optimization itself.
4156                          */
4157                         dsl_deadlist_space(&snap->ds_deadlist,
4158                             &used, &comp, &uncomp);
4159                 } else {
4160                         dsl_deadlist_space_range(&snap->ds_deadlist,
4161                             0, oldsnap->ds_phys->ds_creation_txg,
4162                             &used, &comp, &uncomp);
4163                 }
4164                 *usedp += used;
4165                 *compp += comp;
4166                 *uncompp += uncomp;
4167 
4168                 /*
4169                  * If we get to the beginning of the chain of snapshots
4170                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4171                  * was not a snapshot of/before new.
4172                  */
4173                 snapobj = snap->ds_phys->ds_prev_snap_obj;
4174                 dsl_dataset_rele(snap, FTAG);
4175                 if (snapobj == 0) {
4176                         err = EINVAL;
4177                         break;
4178                 }
4179 
4180         }
4181         rw_exit(&dp->dp_config_rwlock);
4182         return (err);
4183 }
4184 
4185 /*
4186  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4187  * lastsnap, and all snapshots in between are deleted.
4188  *
4189  * blocks that would be freed            [---------------------------]
4190  * snapshots                       ---O-------O--------O-------O--------O
4191  *                                        firstsnap        lastsnap
4192  *
4193  * This is the set of blocks that were born after the snap before firstsnap,


usr/src/uts/common/fs/zfs/dsl_dataset.c (new version)

   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/dmu_objset.h>
  28 #include <sys/dsl_dataset.h>
  29 #include <sys/dsl_dir.h>
  30 #include <sys/dsl_prop.h>
  31 #include <sys/dsl_synctask.h>
  32 #include <sys/dmu_traverse.h>
  33 #include <sys/dmu_impl.h>
  34 #include <sys/dmu_tx.h>
  35 #include <sys/arc.h>
  36 #include <sys/zio.h>
  37 #include <sys/zap.h>
  38 #include <sys/zfeature.h>
  39 #include <sys/unique.h>
  40 #include <sys/zfs_context.h>
  41 #include <sys/zfs_ioctl.h>
  42 #include <sys/spa.h>
  43 #include <sys/zfs_znode.h>
  44 #include <sys/zfs_onexit.h>
  45 #include <sys/zvol.h>
  46 #include <sys/dsl_scan.h>
  47 #include <sys/dsl_deadlist.h>
  48 
  49 static char *dsl_reaper = "the grim reaper";
  50 
  51 static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
  52 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
  53 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
  54 
  55 #define SWITCH64(x, y) \
  56         { \
  57                 uint64_t __tmp = (x); \
  58                 (x) = (y); \


  84 
  85         ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
  86         return (new_bytes - old_bytes);
  87 }
  88 
  89 void
  90 dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
  91 {
  92         int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
  93         int compressed = BP_GET_PSIZE(bp);
  94         int uncompressed = BP_GET_UCSIZE(bp);
  95         int64_t delta;
  96 
  97         dprintf_bp(bp, "ds=%p", ds);
  98 
  99         ASSERT(dmu_tx_is_syncing(tx));
 100         /* It could have been compressed away to nothing */
 101         if (BP_IS_HOLE(bp))
 102                 return;
 103         ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
 104         ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 105         if (ds == NULL) {
 106                 /*
 107                  * Account for the meta-objset space in its placeholder
 108                  * dsl_dir.
 109                  */
 110                 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
 111                 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
 112                     used, compressed, uncompressed, tx);
 113                 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
 114                 return;
 115         }
 116         dmu_buf_will_dirty(ds->ds_dbuf, tx);
 117 
 118         mutex_enter(&ds->ds_dir->dd_lock);
 119         mutex_enter(&ds->ds_lock);
 120         delta = parent_delta(ds, used);
 121         ds->ds_phys->ds_referenced_bytes += used;
 122         ds->ds_phys->ds_compressed_bytes += compressed;
 123         ds->ds_phys->ds_uncompressed_bytes += uncompressed;
 124         ds->ds_phys->ds_unique_bytes += used;
 125         mutex_exit(&ds->ds_lock);
 126         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 127             compressed, uncompressed, tx);
 128         dsl_dir_transfer_space(ds->ds_dir, used - delta,
 129             DD_USED_REFRSRV, DD_USED_HEAD, tx);
 130         mutex_exit(&ds->ds_dir->dd_lock);
 131 }
 132 
 133 int
 134 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 135     boolean_t async)
 136 {
 137         if (BP_IS_HOLE(bp))
 138                 return (0);
 139 
 140         ASSERT(dmu_tx_is_syncing(tx));
 141         ASSERT(bp->blk_birth <= tx->tx_txg);


 195                         dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 196                 }
 197                 ASSERT3U(ds->ds_prev->ds_object, ==,
 198                     ds->ds_phys->ds_prev_snap_obj);
 199                 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
 200                 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
 201                 if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
 202                     ds->ds_object && bp->blk_birth >
 203                     ds->ds_prev->ds_phys->ds_prev_snap_txg) {
 204                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 205                         mutex_enter(&ds->ds_prev->ds_lock);
 206                         ds->ds_prev->ds_phys->ds_unique_bytes += used;
 207                         mutex_exit(&ds->ds_prev->ds_lock);
 208                 }
 209                 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 210                         dsl_dir_transfer_space(ds->ds_dir, used,
 211                             DD_USED_HEAD, DD_USED_SNAP, tx);
 212                 }
 213         }
 214         mutex_enter(&ds->ds_lock);
 215         ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
 216         ds->ds_phys->ds_referenced_bytes -= used;
 217         ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
 218         ds->ds_phys->ds_compressed_bytes -= compressed;
 219         ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
 220         ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
 221         mutex_exit(&ds->ds_lock);
 222 
 223         return (used);
 224 }
 225 
 226 uint64_t
 227 dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
 228 {
 229         uint64_t trysnap = 0;
 230 
 231         if (ds == NULL)
 232                 return (0);
 233         /*
 234          * The snapshot creation could fail, but that would cause an
 235          * incorrect FALSE return, which would only result in an
 236          * overestimation of the amount of space that an operation would


 802         bzero(dsphys, sizeof (dsl_dataset_phys_t));
 803         dsphys->ds_dir_obj = dd->dd_object;
 804         dsphys->ds_flags = flags;
 805         dsphys->ds_fsid_guid = unique_create();
 806         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
 807             sizeof (dsphys->ds_guid));
 808         dsphys->ds_snapnames_zapobj =
 809             zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 810             DMU_OT_NONE, 0, tx);
 811         dsphys->ds_creation_time = gethrestime_sec();
 812         dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 813 
 814         if (origin == NULL) {
 815                 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
 816         } else {
 817                 dsl_dataset_t *ohds;
 818 
 819                 dsphys->ds_prev_snap_obj = origin->ds_object;
 820                 dsphys->ds_prev_snap_txg =
 821                     origin->ds_phys->ds_creation_txg;
 822                 dsphys->ds_referenced_bytes =
 823                     origin->ds_phys->ds_referenced_bytes;
 824                 dsphys->ds_compressed_bytes =
 825                     origin->ds_phys->ds_compressed_bytes;
 826                 dsphys->ds_uncompressed_bytes =
 827                     origin->ds_phys->ds_uncompressed_bytes;
 828                 dsphys->ds_bp = origin->ds_phys->ds_bp;
 829                 dsphys->ds_flags |= origin->ds_phys->ds_flags;
 830 
 831                 dmu_buf_will_dirty(origin->ds_dbuf, tx);
 832                 origin->ds_phys->ds_num_children++;
 833 
 834                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
 835                     origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
 836                 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
 837                     dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
 838                 dsl_dataset_rele(ohds, FTAG);
 839 
 840                 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 841                         if (origin->ds_phys->ds_next_clones_obj == 0) {
 842                                 origin->ds_phys->ds_next_clones_obj =
 843                                     zap_create(mos,


 917 dmu_snapshots_destroy_nvl(nvlist_t *snaps, boolean_t defer, char *failed)
 918 {
 919         int err;
 920         dsl_sync_task_t *dst;
 921         spa_t *spa;
 922         nvpair_t *pair;
 923         dsl_sync_task_group_t *dstg;
 924 
 925         pair = nvlist_next_nvpair(snaps, NULL);
 926         if (pair == NULL)
 927                 return (0);
 928 
 929         err = spa_open(nvpair_name(pair), &spa, FTAG);
 930         if (err)
 931                 return (err);
 932         dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
 933 
 934         for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 935             pair = nvlist_next_nvpair(snaps, pair)) {
 936                 dsl_dataset_t *ds;
 937 
 938                 err = dsl_dataset_own(nvpair_name(pair), B_TRUE, dstg, &ds);
 939                 if (err == 0) {
 940                         struct dsl_ds_destroyarg *dsda;
 941 
 942                         dsl_dataset_make_exclusive(ds, dstg);
 943                         dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg),
 944                             KM_SLEEP);
 945                         dsda->ds = ds;
 946                         dsda->defer = defer;
 947                         dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
 948                             dsl_dataset_destroy_sync, dsda, dstg, 0);
 949                 } else if (err == ENOENT) {
 950                         err = 0;
 951                 } else {
 952                         (void) strcpy(failed, nvpair_name(pair));
 953                         break;
 954                 }
 955         }
 956 


1065         }
1066 
1067         dd = ds->ds_dir;
1068         dummy_ds.ds_dir = dd;
1069         dummy_ds.ds_object = ds->ds_object;
1070 
1071         /*
1072          * Check for errors and mark this ds as inconsistent, in
1073          * case we crash while freeing the objects.
1074          */
1075         err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
1076             dsl_dataset_destroy_begin_sync, ds, NULL, 0);
1077         if (err)
1078                 goto out;
1079 
1080         err = dmu_objset_from_ds(ds, &os);
1081         if (err)
1082                 goto out;
1083 
1084         /*
1085          * If async destruction is not enabled try to remove all objects
1086          * while in the open context so that there is less work to do in
1087          * the syncing context.
1088          */
1089         if (!spa_feature_is_enabled(dsl_dataset_get_spa(ds),
1090             &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1091                 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
1092                     ds->ds_phys->ds_prev_snap_txg)) {
1093                         /*
1094                          * Ignore errors, if there is not enough disk space
1095                          * we will deal with it in dsl_dataset_destroy_sync().
1096                          */
1097                         (void) dmu_free_object(os, obj);
1098                 }
1099                 if (err != ESRCH)
1100                         goto out;
1101         }
1102 
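/*
 * When the pool's async_destroy feature is enabled (on feature-flag
 * pools, e.g. "zpool set feature@async_destroy=enabled <pool>"), the
 * open-context loop above is skipped entirely and the dataset's
 * blocks are instead reclaimed incrementally in syncing context after
 * the destroy returns, so destroying a large file system no longer
 * has to touch every object up front.
 */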
1103         /*
1104          * Only the ZIL knows how to free log blocks.
1105          */
1106         zil_destroy(dmu_objset_zil(os), B_FALSE);
1107 
1108         /*
1109          * Sync out all in-flight IO.
1110          */
1111         txg_wait_synced(dd->dd_pool, 0);
1112 
1113         /*
1114          * If we managed to free all the objects in open
1115          * context, the user space accounting should be zero.
1116          */
1117         if (ds->ds_phys->ds_bp.blk_fill == 0 &&
1118             dmu_objset_userused_enabled(os)) {
1119                 uint64_t count;
1120 
1121                 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||


1227         }
1228 }
1229 
1230 /*
1231  * The unique space in the head dataset can be calculated by subtracting
1232  * the space used in the most recent snapshot, that is still being used
1233  * in this file system, from the space currently in use.  To figure out
1234  * the space in the most recent snapshot still in use, we need to take
1235  * the total space used in the snapshot and subtract out the space that
1236  * has been freed up since the snapshot was taken.
1237  */
1238 static void
1239 dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
1240 {
1241         uint64_t mrs_used;
1242         uint64_t dlused, dlcomp, dluncomp;
1243 
1244         ASSERT(!dsl_dataset_is_snapshot(ds));
1245 
1246         if (ds->ds_phys->ds_prev_snap_obj != 0)
1247                 mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
1248         else
1249                 mrs_used = 0;
1250 
1251         dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
1252 
1253         ASSERT3U(dlused, <=, mrs_used);
1254         ds->ds_phys->ds_unique_bytes =
1255             ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
1256 
1257         if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
1258             SPA_VERSION_UNIQUE_ACCURATE)
1259                 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1260 }
1261 
1262 struct killarg {
1263         dsl_dataset_t *ds;
1264         dmu_tx_t *tx;
1265 };
1266 
1267 /* ARGSUSED */
1268 static int
1269 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
1270     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1271 {
1272         struct killarg *ka = arg;
1273         dmu_tx_t *tx = ka->tx;
1274 
1275         if (bp == NULL)


1593         poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
1594         VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
1595             process_old_cb, &poa, tx));
1596         VERIFY3U(zio_wait(poa.pio), ==, 0);
1597         ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
1598 
1599         /* change snapused */
1600         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1601             -poa.used, -poa.comp, -poa.uncomp, tx);
1602 
1603         /* swap next's deadlist to our deadlist */
1604         dsl_deadlist_close(&ds->ds_deadlist);
1605         dsl_deadlist_close(&ds_next->ds_deadlist);
1606         SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
1607             ds->ds_phys->ds_deadlist_obj);
1608         dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1609         dsl_deadlist_open(&ds_next->ds_deadlist, mos,
1610             ds_next->ds_phys->ds_deadlist_obj);
1611 }
1612 
1613 static int
1614 old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
1615 {
1616         int err;
1617         struct killarg ka;
1618 
1619         /*
1620          * Free everything that we point to (that's born after
1621          * the previous snapshot, if we are a clone)
1622          *
1623          * NB: this should be very quick, because we already
1624          * freed all the objects in open context.
1625          */
1626         ka.ds = ds;
1627         ka.tx = tx;
1628         err = traverse_dataset(ds,
1629             ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
1630             kill_blkptr, &ka);
1631         ASSERT3U(err, ==, 0);
1632         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
1633 
1634         return (err);
1635 }
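For context, traverse_dataset() visits each block pointer born after the
given txg and hands it to a callback with the signature shown at
kill_blkptr() above.  A hedged sketch of a callback that merely counts live
blocks (count_blkptr and struct countarg are illustrative, not part of this
change):

struct countarg {
        uint64_t count;
};

/* ARGSUSED */
static int
count_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
        struct countarg *ca = arg;

        /*
         * The callback may be invoked with a NULL bp, which
         * kill_blkptr() above also guards against.
         */
        if (bp != NULL && !BP_IS_HOLE(bp))
                ca->count++;
        return (0);
}

It would be driven the same way as above, e.g.
traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
count_blkptr, &ca).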
1636 
1637 void
1638 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
1639 {
1640         struct dsl_ds_destroyarg *dsda = arg1;
1641         dsl_dataset_t *ds = dsda->ds;
1642         int err;
1643         int after_branch_point = FALSE;
1644         dsl_pool_t *dp = ds->ds_dir->dd_pool;
1645         objset_t *mos = dp->dp_meta_objset;
1646         dsl_dataset_t *ds_prev = NULL;
1647         boolean_t wont_destroy;
1648         uint64_t obj;
1649 
1650         wont_destroy = (dsda->defer &&
1651             (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1));
1652 
1653         ASSERT(ds->ds_owner || wont_destroy);
1654         ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
1655         ASSERT(ds->ds_prev == NULL ||
1656             ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);


1763                                 dsl_deadlist_space_range(&ds_next->ds_deadlist,
1764                                     ds_prev->ds_phys->ds_prev_snap_txg,
1765                                     ds->ds_phys->ds_prev_snap_txg,
1766                                     &used, &comp, &uncomp);
1767                                 ds_prev->ds_phys->ds_unique_bytes += used;
1768                         }
1769 
1770                         /* Adjust snapused. */
1771                         dsl_deadlist_space_range(&ds_next->ds_deadlist,
1772                             ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1773                             &used, &comp, &uncomp);
1774                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
1775                             -used, -comp, -uncomp, tx);
1776 
1777                         /* Move blocks to be freed to pool's free list. */
1778                         dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
1779                             &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
1780                             tx);
1781                         dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
1782                             DD_USED_HEAD, used, comp, uncomp, tx);

1783 
1784                         /* Merge our deadlist into next's and free it. */
1785                         dsl_deadlist_merge(&ds_next->ds_deadlist,
1786                             ds->ds_phys->ds_deadlist_obj, tx);
1787                 }
1788                 dsl_deadlist_close(&ds->ds_deadlist);
1789                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1790 
1791                 /* Collapse range in clone heads */
1792                 dsl_dataset_remove_clones_key(ds,
1793                     ds->ds_phys->ds_creation_txg, tx);
1794 
1795                 if (dsl_dataset_is_snapshot(ds_next)) {
1796                         dsl_dataset_t *ds_nextnext;
1797 
1798                         /*
1799                          * Update next's unique to include blocks which
1800                          * were previously shared by only this snapshot
1801                          * and it.  Those blocks will be born after the
1802                          * prev snap and before this snap, and will have


1838 
1839                         /*
1840                          * Reduce the amount of our unconsumed refreservation
1841                          * being charged to our parent by the amount of
1842                          * new unique data we have gained.
1843                          */
1844                         if (old_unique < ds_next->ds_reserved) {
1845                                 int64_t mrsdelta;
1846                                 uint64_t new_unique =
1847                                     ds_next->ds_phys->ds_unique_bytes;
1848 
1849                                 ASSERT(old_unique <= new_unique);
1850                                 mrsdelta = MIN(new_unique - old_unique,
1851                                     ds_next->ds_reserved - old_unique);
1852                                 dsl_dir_diduse_space(ds->ds_dir,
1853                                     DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
1854                         }
1855                 }
1856                 dsl_dataset_rele(ds_next, FTAG);
1857         } else {
1858                 zfeature_info_t *async_destroy =
1859                     &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
1860 
1861                 /*
1862                  * There's no next snapshot, so this is a head dataset.
1863                  * Destroy the deadlist.  Unless it's a clone, the
1864                  * deadlist should be empty.  (If it's a clone, it's
1865                  * safe to ignore the deadlist contents.)
1866                  */


1867                 dsl_deadlist_close(&ds->ds_deadlist);
1868                 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
1869                 ds->ds_phys->ds_deadlist_obj = 0;
1870 
1871                 if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
1872                         err = old_synchronous_dataset_destroy(ds, tx);
1873                 } else {
1874                         /*
1875                          * Move the bptree into the pool's list of trees to
1876                          * clean up, and update space accounting information.



1877                          */
1878                         uint64_t used, comp, uncomp;
1879 
1880                         ASSERT(err == 0 || err == EBUSY);
1881                         if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
1882                                 spa_feature_incr(dp->dp_spa, async_destroy, tx);
1883                                 dp->dp_bptree_obj = bptree_alloc(
1884                                     dp->dp_meta_objset, tx);
1885                                 VERIFY(zap_add(dp->dp_meta_objset,
1886                                     DMU_POOL_DIRECTORY_OBJECT,
1887                                     DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
1888                                     &dp->dp_bptree_obj, tx) == 0);
1889                         }
1890 
1891                         used = ds->ds_dir->dd_phys->dd_used_bytes;
1892                         comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
1893                         uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
1894 
1895                         ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
1896                             ds->ds_phys->ds_unique_bytes == used);
1897 
1898                         bptree_add(dp->dp_meta_objset, dp->dp_bptree_obj,
1899                             &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
1900                             used, comp, uncomp, tx);
1901                         dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
1902                             -used, -comp, -uncomp, tx);
1903                         dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
1904                             used, comp, uncomp, tx);
1905                 }
1906 
1907                 if (ds->ds_prev != NULL) {
1908                         if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
1909                                 VERIFY3U(0, ==, zap_remove_int(mos,
1910                                     ds->ds_prev->ds_dir->dd_phys->dd_clones,
1911                                     ds->ds_object, tx));
1912                         }
1913                         dsl_dataset_rele(ds->ds_prev, ds);
1914                         ds->ds_prev = ds_prev = NULL;
1915                 }
1916         }
1917 
1918         /*
1919          * This must be done after the traverse_dataset(), because it will
1920          * re-open the objset.
1921          */
1922         if (ds->ds_objset) {
1923                 dmu_objset_evict(ds->ds_objset);
1924                 ds->ds_objset = NULL;
1925         }
1926 


2076         else
2077                 crtxg = tx->tx_txg;
2078 
2079         dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
2080             DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
2081         VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
2082         dmu_buf_will_dirty(dbuf, tx);
2083         dsphys = dbuf->db_data;
2084         bzero(dsphys, sizeof (dsl_dataset_phys_t));
2085         dsphys->ds_dir_obj = ds->ds_dir->dd_object;
2086         dsphys->ds_fsid_guid = unique_create();
2087         (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
2088             sizeof (dsphys->ds_guid));
2089         dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
2090         dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
2091         dsphys->ds_next_snap_obj = ds->ds_object;
2092         dsphys->ds_num_children = 1;
2093         dsphys->ds_creation_time = gethrestime_sec();
2094         dsphys->ds_creation_txg = crtxg;
2095         dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
2096         dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
2097         dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
2098         dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
2099         dsphys->ds_flags = ds->ds_phys->ds_flags;
2100         dsphys->ds_bp = ds->ds_phys->ds_bp;
2101         dmu_buf_rele(dbuf, FTAG);
2102 
2103         ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
2104         if (ds->ds_prev) {
2105                 uint64_t next_clones_obj =
2106                     ds->ds_prev->ds_phys->ds_next_clones_obj;
2107                 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
2108                     ds->ds_object ||
2109                     ds->ds_prev->ds_phys->ds_num_children > 1);
2110                 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
2111                         dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
2112                         ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
2113                             ds->ds_prev->ds_phys->ds_creation_txg);
2114                         ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
2115                 } else if (next_clones_obj != 0) {
2116                         remove_from_next_clones(ds->ds_prev,


2200         VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2201         VERIFY(nvlist_alloc(&val, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2202 
2203         /*
2204          * There may be missing entries in ds_next_clones_obj
2205          * due to a bug in a previous version of the code.
2206          * Only trust it if it has the right number of entries.
2207          */
2208         if (ds->ds_phys->ds_next_clones_obj != 0) {
2209                 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
2210                     &count));
2211         }
2212         if (count != ds->ds_phys->ds_num_children - 1) {
2213                 goto fail;
2214         }
2215         for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
2216             zap_cursor_retrieve(&zc, &za) == 0;
2217             zap_cursor_advance(&zc)) {
2218                 dsl_dataset_t *clone;
2219                 char buf[ZFS_MAXNAMELEN];
2220                 /*
2221                  * Even though we hold the dp_config_rwlock, the dataset
2222                  * may fail to open, returning ENOENT.  If there is a
2223                  * thread concurrently attempting to destroy this
2224                  * dataset, it will have the ds_rwlock held for
2225                  * RW_WRITER.  Our call to dsl_dataset_hold_obj() ->
2226                  * dsl_dataset_hold_ref() will fail its
2227                  * rw_tryenter(&ds->ds_rwlock, RW_READER), drop the
2228                  * dp_config_rwlock, and wait for the destroy progress
2229          * dp_config_rwlock, and wait for the destroy to progress
2230                  * successful, we will see that
2231                  * DSL_DATASET_IS_DESTROYED(), and return ENOENT.
2232                  */
2233                 if (dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
2234                     za.za_first_integer, FTAG, &clone) != 0)
2235                         continue;

2236                 dsl_dir_name(clone->ds_dir, buf);
2237                 VERIFY(nvlist_add_boolean(val, buf) == 0);
2238                 dsl_dataset_rele(clone, FTAG);
2239         }
2240         zap_cursor_fini(&zc);
2241         VERIFY(nvlist_add_nvlist(propval, ZPROP_VALUE, val) == 0);
2242         VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
2243             propval) == 0);
2244 fail:
2245         nvlist_free(val);
2246         nvlist_free(propval);
2247         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2248 }
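The loop above is the standard ZAP cursor idiom, which recurs throughout
this file.  A minimal self-contained sketch, where os and zapobj are
hypothetical stand-ins for an objset and a ZAP object:

        zap_cursor_t zc;
        zap_attribute_t za;

        for (zap_cursor_init(&zc, os, zapobj);
            zap_cursor_retrieve(&zc, &za) == 0;
            zap_cursor_advance(&zc)) {
                /* za.za_name holds the key; za.za_first_integer the value */
        }
        zap_cursor_fini(&zc);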
2249 
2250 void
2251 dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
2252 {
2253         uint64_t refd, avail, uobjs, aobjs, ratio;
2254 
2255         dsl_dir_stats(ds->ds_dir, nv);


2338                     ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
2339                 dsl_dataset_name(ods, stat->dds_origin);
2340                 dsl_dataset_drop_ref(ods, FTAG);
2341         } else {
2342                 stat->dds_origin[0] = '\0';
2343         }
2344         rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
2345 }
2346 
2347 uint64_t
2348 dsl_dataset_fsid_guid(dsl_dataset_t *ds)
2349 {
2350         return (ds->ds_fsid_guid);
2351 }
2352 
2353 void
2354 dsl_dataset_space(dsl_dataset_t *ds,
2355     uint64_t *refdbytesp, uint64_t *availbytesp,
2356     uint64_t *usedobjsp, uint64_t *availobjsp)
2357 {
2358         *refdbytesp = ds->ds_phys->ds_referenced_bytes;
2359         *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
2360         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
2361                 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
2362         if (ds->ds_quota != 0) {
2363                 /*
2364                  * Adjust available bytes according to refquota
2365                  */
2366                 if (*refdbytesp < ds->ds_quota)
2367                         *availbytesp = MIN(*availbytesp,
2368                             ds->ds_quota - *refdbytesp);
2369                 else
2370                         *availbytesp = 0;
2371         }
2372         *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
2373         *availobjsp = DN_MAX_OBJECT - *usedobjsp;
2374 }
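A worked example of the refquota clamp above, with illustrative numbers only:

/*
 * Suppose ds_referenced_bytes = 6G, ds_quota (refquota) = 10G, and
 * dsl_dir_space_available() reports 50G:
 *
 *   *availbytesp = MIN(50G, 10G - 6G) = 4G
 *
 * Once referenced reaches or exceeds the 10G quota, *availbytesp drops
 * to 0 regardless of pool free space.
 */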
2375 
2376 boolean_t
2377 dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
2378 {


2675         ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2676         dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2677             origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2678             &pa->unique, &unused, &unused);
2679 
2680         /*
2681          * Walk the snapshots that we are moving
2682          *
2683          * Compute space to transfer.  Consider the incremental changes
2684          * to used for each snapshot:
2685          * (my used) = (prev's used) + (blocks born) - (blocks killed)
2686          * So each snapshot gave birth to:
2687          * (blocks born) = (my used) - (prev's used) + (blocks killed)
2688          * So a sequence would look like:
2689          * (u(N) - u(N-1) + k(N)) + ... + (u(1) - u(0) + k(1)) + (u(0) - 0 + k(0))
2690          * Which simplifies to:
2691          * u(N) + k(N) + k(N-1) + ... + k(1) + k(0)
2692          * Note however, if we stop before we reach the ORIGIN we get:
2693          * u(N) + k(N) + k(N-1) + ... + k(M) - u(M-1)
2694          */
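        /*
         * Hypothetical instance of the telescoping sum above (made-up
         * sizes): three snapshots with u(0)=1G,k(0)=0; u(1)=3G,k(1)=1G;
         * u(2)=2G,k(2)=2G give (2G-3G+2G) + (3G-1G+1G) + (1G-0+0) =
         * 1G + 3G + 1G = 5G, matching u(2)+k(2)+k(1)+k(0) =
         * 2G + 2G + 1G + 0 = 5G.
         */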
2695         pa->used = origin_ds->ds_phys->ds_referenced_bytes;
2696         pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
2697         pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
2698         for (snap = list_head(&pa->shared_snaps); snap;
2699             snap = list_next(&pa->shared_snaps, snap)) {
2700                 uint64_t val, dlused, dlcomp, dluncomp;
2701                 dsl_dataset_t *ds = snap->ds;
2702 
2703                 /* Check that the snapshot name does not conflict */
2704                 VERIFY(0 == dsl_dataset_get_snapname(ds));
2705                 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
2706                 if (err == 0) {
2707                         err = EEXIST;
2708                         goto out;
2709                 }
2710                 if (err != ENOENT)
2711                         goto out;
2712 
2713                 /* The very first snapshot does not have a deadlist */
2714                 if (ds->ds_phys->ds_prev_snap_obj == 0)
2715                         continue;
2716 
2717                 dsl_deadlist_space(&ds->ds_deadlist,
2718                     &dlused, &dlcomp, &dluncomp);
2719                 pa->used += dlused;
2720                 pa->comp += dlcomp;
2721                 pa->uncomp += dluncomp;
2722         }
2723 
2724         /*
2725          * If we are a clone of a clone then we never reached ORIGIN,
2726          * so we need to subtract out the clone origin's used space.
2727          */
2728         if (pa->origin_origin) {
2729                 pa->used -= pa->origin_origin->ds_phys->ds_referenced_bytes;
2730                 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
2731                 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
2732         }
2733 
2734         /* Check that there is enough space here */
2735         err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2736             pa->used);
2737         if (err)
2738                 return (err);
2739 
2740         /*
2741          * Compute the amounts of space that will be used by snapshots
2742          * after the promotion (for both origin and clone).  For each,
2743          * it is the amount of space that will be on all of their
2744          * deadlists (that was not born before their new origin).
2745          */
2746         if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2747                 uint64_t space;
2748 
2749                 /*


3225                 blkptr_t tmp;
3226                 tmp = csa->ohds->ds_phys->ds_bp;
3227                 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
3228                 csa->cds->ds_phys->ds_bp = tmp;
3229         }
3230 
3231         /* set dd_*_bytes */
3232         {
3233                 int64_t dused, dcomp, duncomp;
3234                 uint64_t cdl_used, cdl_comp, cdl_uncomp;
3235                 uint64_t odl_used, odl_comp, odl_uncomp;
3236 
3237                 ASSERT3U(csa->cds->ds_dir->dd_phys->
3238                     dd_used_breakdown[DD_USED_SNAP], ==, 0);
3239 
3240                 dsl_deadlist_space(&csa->cds->ds_deadlist,
3241                     &cdl_used, &cdl_comp, &cdl_uncomp);
3242                 dsl_deadlist_space(&csa->ohds->ds_deadlist,
3243                     &odl_used, &odl_comp, &odl_uncomp);
3244 
3245                 dused = csa->cds->ds_phys->ds_referenced_bytes + cdl_used -
3246                     (csa->ohds->ds_phys->ds_referenced_bytes + odl_used);
3247                 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
3248                     (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
3249                 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
3250                     cdl_uncomp -
3251                     (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
3252 
3253                 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
3254                     dused, dcomp, duncomp, tx);
3255                 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
3256                     -dused, -dcomp, -duncomp, tx);
3257 
3258                 /*
3259                  * The difference in the space used by snapshots is the
3260                  * difference in snapshot space due to the head's
3261                  * deadlist (since that's the only thing that's
3262                  * changing that affects the snapused).
3263                  */
3264                 dsl_deadlist_space_range(&csa->cds->ds_deadlist,
3265                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3266                     &cdl_used, &cdl_comp, &cdl_uncomp);
3267                 dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
3268                     csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
3269                     &odl_used, &odl_comp, &odl_uncomp);
3270                 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
3271                     DD_USED_HEAD, DD_USED_SNAP, tx);
3272         }
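        /*
         * Illustrative deltas (made-up sizes): if the clone references
         * 7G with 1G on its deadlist, and the origin head references 5G
         * with 2G on its deadlist, then dused = (7G + 1G) - (5G + 2G) =
         * 1G is charged to the head's dsl_dir and refunded from the
         * clone's.
         */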
3273 
3274         /* swap ds_*_bytes */
3275         SWITCH64(csa->ohds->ds_phys->ds_referenced_bytes,
3276             csa->cds->ds_phys->ds_referenced_bytes);
3277         SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
3278             csa->cds->ds_phys->ds_compressed_bytes);
3279         SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
3280             csa->cds->ds_phys->ds_uncompressed_bytes);
3281         SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
3282             csa->cds->ds_phys->ds_unique_bytes);
3283 
3284         /* apply any parent delta for change in unconsumed refreservation */
3285         dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
3286             csa->unused_refres_delta, 0, 0, tx);
3287 
3288         /*
3289          * Swap deadlists.
3290          */
3291         dsl_deadlist_close(&csa->cds->ds_deadlist);
3292         dsl_deadlist_close(&csa->ohds->ds_deadlist);
3293         SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
3294             csa->cds->ds_phys->ds_deadlist_obj);
3295         dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
3296             csa->cds->ds_phys->ds_deadlist_obj);


3385          * Make a space adjustment for reserved bytes.
3386          */
3387         if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
3388                 ASSERT3U(*used, >=,
3389                     ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3390                 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
3391                 *ref_rsrv =
3392                     asize - MIN(asize, parent_delta(ds, asize + inflight));
3393         }
3394 
3395         if (!check_quota || ds->ds_quota == 0) {
3396                 mutex_exit(&ds->ds_lock);
3397                 return (0);
3398         }
3399         /*
3400          * If they are requesting more space, and our current estimate
3401          * is over quota, they get to try again unless the actual
3402          * on-disk usage is over quota and there are no pending changes (which
3403          * may free up space for us).
3404          */
3405         if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
3406                 if (inflight > 0 ||
3407                     ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
3408                         error = ERESTART;
3409                 else
3410                         error = EDQUOT;
3411         }
3412         mutex_exit(&ds->ds_lock);
3413 
3414         return (error);
3415 }
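The over-quota branch above distinguishes a retryable state from a hard
failure; a sketch of the decision, using the same variables:

/*
 * When referenced + inflight >= quota:
 *   inflight > 0       -> ERESTART (writes in flight; retry)
 *   referenced < quota -> ERESTART (pending frees may make room)
 *   otherwise          -> EDQUOT  (on-disk usage itself is over quota)
 */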
3416 
3417 /* ARGSUSED */
3418 static int
3419 dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
3420 {
3421         dsl_dataset_t *ds = arg1;
3422         dsl_prop_setarg_t *psa = arg2;
3423         int err;
3424 
3425         if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
3426                 return (ENOTSUP);
3427 
3428         if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
3429                 return (err);
3430 
3431         if (psa->psa_effective_value == 0)
3432                 return (0);
3433 
3434         if (psa->psa_effective_value < ds->ds_phys->ds_referenced_bytes ||
3435             psa->psa_effective_value < ds->ds_reserved)
3436                 return (ENOSPC);
3437 
3438         return (0);
3439 }
3440 
3441 extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *);
3442 
3443 void
3444 dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
3445 {
3446         dsl_dataset_t *ds = arg1;
3447         dsl_prop_setarg_t *psa = arg2;
3448         uint64_t effective_value = psa->psa_effective_value;
3449 
3450         dsl_prop_set_sync(ds, psa, tx);
3451         DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
3452 
3453         if (ds->ds_quota != effective_value) {
3454                 dmu_buf_will_dirty(ds->ds_dbuf, tx);


4168  * The written space is calculated by considering two components:  First, we
4169  * ignore any freed space, and calculate the space written as new's used space
4170  * minus old's used space.  Next, we add in the amount of space that was freed
4171  * between the two snapshots, thus reducing new's used space relative to old's.
4172  * Specifically, this is the space that was born before old->ds_creation_txg,
4173  * and freed before new (i.e. on new's deadlist or a previous deadlist).
4174  *
4175  * space freed                         [---------------------]
4176  * snapshots                       ---O-------O--------O-------O------
4177  *                                         oldsnap            new
4178  */
4179 int
4180 dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
4181     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
4182 {
4183         int err = 0;
4184         uint64_t snapobj;
4185         dsl_pool_t *dp = new->ds_dir->dd_pool;
4186 
4187         *usedp = 0;
4188         *usedp += new->ds_phys->ds_referenced_bytes;
4189         *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
4190 
4191         *compp = 0;
4192         *compp += new->ds_phys->ds_compressed_bytes;
4193         *compp -= oldsnap->ds_phys->ds_compressed_bytes;
4194 
4195         *uncompp = 0;
4196         *uncompp += new->ds_phys->ds_uncompressed_bytes;
4197         *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
4198 
4199         rw_enter(&dp->dp_config_rwlock, RW_READER);
4200         snapobj = new->ds_object;
4201         while (snapobj != oldsnap->ds_object) {
4202                 dsl_dataset_t *snap;
4203                 uint64_t used, comp, uncomp;
4204 
4205                 if (snapobj == new->ds_object) {
4206                         snap = new;
4207                 } else {
4208                         err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
4209                         if (err != 0)
4210                                 break;
4211                 }
4212 
4213                 if (snap->ds_phys->ds_prev_snap_txg ==
4214                     oldsnap->ds_phys->ds_creation_txg) {
4215                         /*
4216                          * The blocks in the deadlist cannot be born after
4217                          * ds_prev_snap_txg, so get the whole deadlist space,
4218                          * which is more efficient (especially for old-format
4219                          * deadlists).  Unfortunately the deadlist code
4220                          * doesn't have enough information to make this
4221                          * optimization itself.
4222                          */
4223                         dsl_deadlist_space(&snap->ds_deadlist,
4224                             &used, &comp, &uncomp);
4225                 } else {
4226                         dsl_deadlist_space_range(&snap->ds_deadlist,
4227                             0, oldsnap->ds_phys->ds_creation_txg,
4228                             &used, &comp, &uncomp);
4229                 }
4230                 *usedp += used;
4231                 *compp += comp;
4232                 *uncompp += uncomp;
4233 
4234                 /*
4235                  * If we get to the beginning of the chain of snapshots
4236                  * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
4237                  * was not a snapshot of/before new.
4238                  */
4239                 snapobj = snap->ds_phys->ds_prev_snap_obj;
4240                 if (snap != new)
4241                         dsl_dataset_rele(snap, FTAG);
4242                 if (snapobj == 0) {
4243                         err = EINVAL;
4244                         break;
4245                 }
4246 
4247         }
4248         rw_exit(&dp->dp_config_rwlock);
4249         return (err);
4250 }
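A hypothetical walk of dsl_dataset_space_written() above, with made-up sizes:

/*
 * oldsnap references 5G and new references 8G, so *usedp starts at
 * 8G - 5G = 3G.  Walking the chain back from new toward oldsnap,
 * suppose 1G of blocks born before oldsnap's ds_creation_txg sit on
 * new's deadlist; the loop adds that, giving *usedp = 3G + 1G = 4G
 * written between the two snapshots.
 */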
4251 
4252 /*
4253  * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
4254  * lastsnap, and all snapshots in between are deleted.
4255  *
4256  * blocks that would be freed            [---------------------------]
4257  * snapshots                       ---O-------O--------O-------O--------O
4258  *                                        firstsnap        lastsnap
4259  *
4260  * This is the set of blocks that were born after the snap before firstsnap,