Print this page
NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
 Conflicts:
  usr/src/uts/common/fs/zfs/dbuf.c
  usr/src/uts/common/fs/zfs/dmu.c
  usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9200 Improve the scalability of attribute locking in zfs_zget
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9436 Rate limiting controls (was QoS) per ZFS dataset, updates from demo
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-8972 Async-delete side-effect that may cause unmount EBUSY
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8852 Quality-of-Service (QoS) controls per NFS share
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5085 implement async delete for large files
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3762 Appliance crashes with a NULL pointer dereference during a zpool export when a zfs_vn_rele_taskq thread attempts to check a bogus rwlock from rw_write_held
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
6160 /usr/lib/fs/zfs/bootinstall should use bootadm
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Adam Števko <adam.stevko@gmail.com>
Reviewed by: Josef Sipek <jeffpc@josefsipek.net>
Approved by: Richard Lowe <richlowe@richlowe.net>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (NULL is not an int)
6171 dsl_prop_unregister() slows down dataset eviction.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3485 Deferred deletes causing loss of service for NFS clients on cluster failover
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-2965 4.0.3-FP2: deferred deletes causing RSF import failure during fail-over of service
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
re #13253 rb4328 ssh: openssl version checking needs updating
re #11441 rb4292 panic in apic_record_rdt_entry on VMware hardware version 9
re #12619, rb4287 Deadlocked zfs txg processing in dsl_sync_task_group_sync()
re #13204 rb4280 zfs receive/rollback deadlock
re #6815 rb1758 need WORM in nza-kernel (4.0)


  28 /* Portions Copyright 2010 Robert Milkowski */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/kmem.h>
  35 #include <sys/pathname.h>
  36 #include <sys/vnode.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vfs_opreg.h>
  39 #include <sys/mntent.h>
  40 #include <sys/mount.h>
  41 #include <sys/cmn_err.h>
  42 #include "fs/fs_subr.h"
  43 #include <sys/zfs_znode.h>
  44 #include <sys/zfs_dir.h>
  45 #include <sys/zil.h>
  46 #include <sys/fs/zfs.h>
  47 #include <sys/dmu.h>

  48 #include <sys/dsl_prop.h>
  49 #include <sys/dsl_dataset.h>
  50 #include <sys/dsl_deleg.h>
  51 #include <sys/spa.h>
  52 #include <sys/zap.h>
  53 #include <sys/sa.h>
  54 #include <sys/sa_impl.h>
  55 #include <sys/varargs.h>
  56 #include <sys/policy.h>
  57 #include <sys/atomic.h>
  58 #include <sys/mkdev.h>
  59 #include <sys/modctl.h>
  60 #include <sys/refstr.h>
  61 #include <sys/zfs_ioctl.h>
  62 #include <sys/zfs_ctldir.h>
  63 #include <sys/zfs_fuid.h>
  64 #include <sys/bootconf.h>
  65 #include <sys/sunddi.h>
  66 #include <sys/dnlc.h>
  67 #include <sys/dmu_objset.h>


 379 
 380         zfsvfs->z_vscan = newval;
 381 }
 382 
 383 static void
 384 acl_mode_changed_cb(void *arg, uint64_t newval)
 385 {
             /*
              * Property-change callback registered for ZFS_PROP_ACLMODE in
              * zfs_register_callbacks().  'arg' is the zfsvfs_t passed at
              * registration; 'newval' is cached in its z_acl_mode field.
              */
 386         zfsvfs_t *zfsvfs = arg;
 387 
 388         zfsvfs->z_acl_mode = newval;
 389 }
 390 
 391 static void
 392 acl_inherit_changed_cb(void *arg, uint64_t newval)
 393 {
             /*
              * Property-change callback registered for ZFS_PROP_ACLINHERIT in
              * zfs_register_callbacks().  'arg' is the zfsvfs_t passed at
              * registration; 'newval' is cached in its z_acl_inherit field.
              */
 394         zfsvfs_t *zfsvfs = arg;
 395 
 396         zfsvfs->z_acl_inherit = newval;
 397 }
 398 










 399 static int
 400 zfs_register_callbacks(vfs_t *vfsp)
 401 {
 402         struct dsl_dataset *ds = NULL;
 403         objset_t *os = NULL;
 404         zfsvfs_t *zfsvfs = NULL;
 405         uint64_t nbmand;
 406         boolean_t readonly = B_FALSE;
 407         boolean_t do_readonly = B_FALSE;
 408         boolean_t setuid = B_FALSE;
 409         boolean_t do_setuid = B_FALSE;
 410         boolean_t exec = B_FALSE;
 411         boolean_t do_exec = B_FALSE;
 412         boolean_t devices = B_FALSE;
 413         boolean_t do_devices = B_FALSE;
 414         boolean_t xattr = B_FALSE;
 415         boolean_t do_xattr = B_FALSE;
 416         boolean_t atime = B_FALSE;
 417         boolean_t do_atime = B_FALSE;
 418         int error = 0;


 516             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 517         error = error ? error : dsl_prop_register(ds,
 518             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 519         error = error ? error : dsl_prop_register(ds,
 520             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 521         error = error ? error : dsl_prop_register(ds,
 522             zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 523         error = error ? error : dsl_prop_register(ds,
 524             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 525         error = error ? error : dsl_prop_register(ds,
 526             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 527         error = error ? error : dsl_prop_register(ds,
 528             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 529         error = error ? error : dsl_prop_register(ds,
 530             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 531         error = error ? error : dsl_prop_register(ds,
 532             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 533             zfsvfs);
 534         error = error ? error : dsl_prop_register(ds,
 535             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);



 536         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 537         if (error)
 538                 goto unregister;
 539 
 540         /*
 541          * Invoke our callbacks to restore temporary mount options.
 542          */
 543         if (do_readonly)
 544                 readonly_changed_cb(zfsvfs, readonly);
 545         if (do_setuid)
 546                 setuid_changed_cb(zfsvfs, setuid);
 547         if (do_exec)
 548                 exec_changed_cb(zfsvfs, exec);
 549         if (do_devices)
 550                 devices_changed_cb(zfsvfs, devices);
 551         if (do_xattr)
 552                 xattr_changed_cb(zfsvfs, xattr);
 553         if (do_atime)
 554                 atime_changed_cb(zfsvfs, atime);
 555 


 976          */
 977 
 978         error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 979         if (error != 0) {
 980                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 981                 return (error);
 982         }
 983 
 984         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 985         if (error != 0) {
 986                 dmu_objset_disown(os, zfsvfs);
 987         }
 988         return (error);
 989 }
 990 
 991 
 992 int
 993 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 994 {
             /*
              * Finish constructing a caller-allocated zfsvfs_t for objset 'os':
              * initialize its locks and znode list, then run zfsvfs_init().
              *
              * On success *zfvp is set to 'zfsvfs' and 0 is returned.  On
              * failure *zfvp is set to NULL, 'zfsvfs' is freed here (the caller
              * must not free it again) and the zfsvfs_init() error is returned.
              */
 995         int error;

 996 
             /* Not attached to a vfs yet; a zfsvfs is its own parent by default. */
 997         zfsvfs->z_vfs = NULL;
 998         zfsvfs->z_parent = zfsvfs;
 999 
1000         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1001         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1002         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1003             offsetof(znode_t, z_link_node));
1004         rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1005         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1006         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1007         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)


1008                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);


1009 
1010         error = zfsvfs_init(zfsvfs, os);
1011         if (error != 0) {
                     /*
                      * NOTE(review): the structure is freed without destroying
                      * the locks and list initialized above -- confirm that
                      * this is intentional.
                      */
1012                 *zfvp = NULL;

1013                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1014                 return (error);
1015         }
1016 
1017         *zfvp = zfsvfs;
1018         return (0);
1019 }
1020 
1021 static int
1022 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1023 {
             /*
              * Bring a zfsvfs into service: register the property callbacks,
              * open the ZIL and, when 'mounting' is true, drain the unlinked
              * set (unless the vfs is read-only) and replay or destroy the
              * intent log.  Finally publish the zfsvfs as the objset's user
              * pointer.
              *
              * Returns the error from zfs_register_callbacks(), otherwise 0.
              */
1024         int error;
1025 
1026         error = zfs_register_callbacks(zfsvfs->z_vfs);
1027         if (error)
1028                 return (error);
1029 
1030         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1031 
1032         /*
1033          * If we are not mounting (i.e., online recv), then we don't
1034          * have to worry about replaying the log as we blocked all
1035          * operations out since we closed the ZIL.
1036          */
1037         if (mounting) {
1038                 boolean_t readonly;
1039 
1040                 /*
1041                  * During replay we remove the read only flag to
1042                  * allow replays to succeed.
1043                  */
                     /* 'readonly' holds the masked VFS_RDONLY bit, not just 0/1. */
1044                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1045                 if (readonly != 0)
1046                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1047                 else
1048                         zfs_unlinked_drain(zfsvfs);

1049 
1050                 /*
1051                  * Parse and replay the intent log.
1052                  *
1053                  * Because of ziltest, this must be done after
1054                  * zfs_unlinked_drain().  (Further note: ziltest
1055                  * doesn't use readonly mounts, where
1056                  * zfs_unlinked_drain() isn't called.)  This is because
1057                  * ziltest causes spa_sync() to think it's committed,
1058                  * but actually it is not, so the intent log contains
1059                  * many txg's worth of changes.
1060                  *
1061                  * In particular, if object N is in the unlinked set in
1062                  * the last txg to actually sync, then it could be
1063                  * actually freed in a later txg and then reallocated
1064                  * in a yet later txg.  This would write a "create
1065                  * object N" record to the intent log.  Normally, this
1066                  * would be fine because the spa_sync() would have
1067                  * written out the fact that object N is free, before
1068                  * we could write the "create object N" intent log
1069                  * record.
1070                  *
1071                  * But when we are in ziltest mode, we advance the "open
1072                  * txg" without actually spa_sync()-ing the changes to
1073                  * disk.  So we would see that object N is still
1074                  * allocated and in the unlinked set, and there is an
1075                  * intent log record saying to allocate it.
1076                  */
1077                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1078                         if (zil_replay_disable) {
1079                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1080                         } else {
                                     /* z_replay is set only for the duration of the replay. */
1081                                 zfsvfs->z_replay = B_TRUE;
1082                                 zil_replay(zfsvfs->z_os, zfsvfs,
1083                                     zfs_replay_vector);
1084                                 zfsvfs->z_replay = B_FALSE;
1085                         }
1086                 }
1087                 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */



1088         }
1089 
1090         /*
1091          * Set the objset user_ptr to track its zfsvfs.
1092          */
1093         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1094         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1095         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1096 
1097         return (0);
1098 }
1099 
1100 void
1101 zfsvfs_free(zfsvfs_t *zfsvfs)
1102 {
             /*
              * Destroy a zfsvfs_t: release its FUID state, destroy the locks
              * and znode list that zfsvfs_create_impl() initialized, and free
              * the structure.  'zfsvfs' must not be used after this returns.
              */
1103         int i;
1104         extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1105 
1106         /*
1107          * This is a barrier to prevent the filesystem from going away in
1108          * zfs_znode_move() until we can safely ensure that the filesystem is
1109          * not unmounted. We consider the filesystem valid before the barrier
1110          * and invalid after the barrier.
1111          */
             /* Enter-then-exit: the lock is taken only to wait, not held. */
1112         rw_enter(&zfsvfs_lock, RW_READER);
1113         rw_exit(&zfsvfs_lock);
1114 


1115         zfs_fuid_destroy(zfsvfs);
1116 


1117         mutex_destroy(&zfsvfs->z_znodes_lock);
1118         mutex_destroy(&zfsvfs->z_lock);
1119         list_destroy(&zfsvfs->z_all_znodes);
1120         rrm_destroy(&zfsvfs->z_teardown_lock);
1121         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1122         rw_destroy(&zfsvfs->z_fuid_lock);
1123         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1124                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);



1125         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1126 }
1127 
1128 static void
1129 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1130 {
             /*
              * Recompute z_use_fuids and z_use_sa from the current z_version
              * and z_os.  If a vfs is attached (z_vfs is non-NULL), the VFS
              * features listed below are all set or all cleared to match
              * z_use_fuids.
              */
1131         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1132         if (zfsvfs->z_vfs) {
1133                 if (zfsvfs->z_use_fuids) {
1134                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1135                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1136                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1137                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1138                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1139                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1140                 } else {
1141                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1142                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1143                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1144                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1145                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1146                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1147                 }
1148         }
             /* Updated whether or not a vfs is attached. */
1149         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1150 }
1151 
1152 static int
1153 zfs_domount(vfs_t *vfsp, char *osname)
1154 {
1155         dev_t mount_dev;
1156         uint64_t recordsize, fsid_guid;
1157         int error = 0;
1158         zfsvfs_t *zfsvfs;

1159 
1160         ASSERT(vfsp);
1161         ASSERT(osname);
1162 
1163         error = zfsvfs_create(osname, &zfsvfs);
1164         if (error)
1165                 return (error);
1166         zfsvfs->z_vfs = vfsp;
1167 
1168         /* Initialize the generic filesystem structure. */
1169         vfsp->vfs_bcount = 0;
1170         vfsp->vfs_data = NULL;
1171 
1172         if (zfs_create_unique_device(&mount_dev) == -1) {
1173                 error = SET_ERROR(ENODEV);
1174                 goto out;
1175         }
1176         ASSERT(vfs_devismounted(mount_dev) == 0);
1177 
1178         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1179             NULL))
1180                 goto out;
1181 








1182         vfsp->vfs_dev = mount_dev;
1183         vfsp->vfs_fstype = zfsfstype;
1184         vfsp->vfs_bsize = recordsize;
1185         vfsp->vfs_flag |= VFS_NOTRUNC;
1186         vfsp->vfs_data = zfsvfs;
1187 
1188         /*
1189          * The fsid is 64 bits, composed of an 8-bit fs type, which
1190          * separates our fsid from any other filesystem types, and a
1191          * 56-bit objset unique ID.  The objset unique ID is unique to
1192          * all objsets open on this system, provided by unique_create().
1193          * The 8-bit fs type must be put in the low bits of fsid[1]
1194          * because that's where other Solaris filesystems put it.
1195          */
1196         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1197         ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1198         vfsp->vfs_fsid.val[0] = fsid_guid;
1199         vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1200             zfsfstype & 0xFF;
1201 


1728 
1729         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1730         if (error == 0)
1731                 *vpp = ZTOV(rootzp);
1732 
1733         ZFS_EXIT(zfsvfs);
1734         return (error);
1735 }
1736 
1737 /*
1738  * Teardown the zfsvfs::z_os.
1739  *
1740  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1741  * and 'z_teardown_inactive_lock' held.
1742  */
1743 static int
1744 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1745 {
1746         znode_t *zp;
1747 

1748         rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1749 
1750         if (!unmounting) {
1751                 /*
1752                  * We purge the parent filesystem's vfsp as the parent
1753                  * filesystem and all of its snapshots have their vnode's
1754                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
1755                  * 'z_parent' is self referential for non-snapshots.
1756                  */
1757                 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1758         }
1759 
1760         /*
1761          * Close the zil. NB: Can't close the zil while zfs_inactive
1762          * threads are blocked as zil_close can call zfs_inactive.
1763          */
1764         if (zfsvfs->z_log) {
1765                 zil_close(zfsvfs->z_log);
1766                 zfsvfs->z_log = NULL;
1767         }


1808 
1809         /*
1810          * z_os will be NULL if there was an error in attempting to reopen
1811          * zfsvfs, so just return as the properties had already been
1812          * unregistered and cached data had been evicted before.
1813          */
1814         if (zfsvfs->z_os == NULL)
1815                 return (0);
1816 
1817         /*
1818          * Unregister properties.
1819          */
1820         zfs_unregister_callbacks(zfsvfs);
1821 
1822         /*
1823          * Evict cached data
1824          */
1825         if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
1826             !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1827                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1828         dmu_objset_evict_dbufs(zfsvfs->z_os);
1829 
1830         return (0);
1831 }
1832 
1833 /*ARGSUSED*/
1834 static int
1835 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1836 {
1837         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1838         objset_t *os;
1839         int ret;
1840 
1841         ret = secpolicy_fs_unmount(cr, vfsp);
1842         if (ret) {
1843                 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1844                     ZFS_DELEG_PERM_MOUNT, cr))
1845                         return (ret);
1846         }
1847 
1848         /*
1849          * We purge the parent filesystem's vfsp as the parent filesystem
1850          * and all of its snapshots have their vnode's v_vfsp set to the
1851          * parent's filesystem's vfsp.  Note, 'z_parent' is self
1852          * referential for non-snapshots.
1853          */
1854         (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1855 
1856         /*
1857          * Unmount any snapshots mounted under .zfs before unmounting the
1858          * dataset itself.
1859          */
1860         if (zfsvfs->z_ctldir != NULL &&
1861             (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1862                 return (ret);
1863         }
1864 
1865         if (!(fflag & MS_FORCE)) {


1866                 /*
1867                  * Check the number of active vnodes in the file system.
1868                  * Our count is maintained in the vfs structure, but the
1869                  * number is off by 1 to indicate a hold on the vfs
1870                  * structure itself.
1871                  *
1872                  * The '.zfs' directory maintains a reference of its
1873                  * own, and any active references underneath are
1874                  * reflected in the vnode count.


1875                  */




1876                 if (zfsvfs->z_ctldir == NULL) {
1877                         if (vfsp->vfs_count > 1)
1878                                 return (SET_ERROR(EBUSY));
1879                 } else {
1880                         if (vfsp->vfs_count > 2 ||
1881                             zfsvfs->z_ctldir->v_count > 1)
1882                                 return (SET_ERROR(EBUSY));
1883                 }
1884         }
1885 
1886         vfsp->vfs_flag |= VFS_UNMOUNTED;
1887 
1888         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1889         os = zfsvfs->z_os;
1890 
1891         /*
1892          * z_os will be NULL if there was an error in
1893          * attempting to reopen zfsvfs.
1894          */
1895         if (os != NULL) {
1896                 /*
1897                  * Unset the objset user_ptr.
1898                  */
1899                 mutex_enter(&os->os_user_ptr_lock);
1900                 dmu_objset_set_user(os, NULL);


1997         }
1998 
1999         *vpp = ZTOV(zp);
2000         ZFS_EXIT(zfsvfs);
2001         return (0);
2002 }
2003 
2004 /*
2005  * Block out VOPs and close zfsvfs_t::z_os
2006  *
2007  * Note, if successful, then we return with the 'z_teardown_lock' and
2008  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2009  * dataset and objset intact so that they can be atomically handed off during
2010  * a subsequent rollback or recv operation and the resume thereafter.
      *
      * Returns 0 on success, or the error returned by zfsvfs_teardown().
2011  */
2012 int
2013 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2014 {
2015         int error;
2016 
             /* B_FALSE: this is a suspend, not an unmount. */
2017         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)











2018                 return (error);

2019 
2020         return (0);
2021 }
2022 
2023 /*
2024  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2025  * is an invariant across any of the operations that can be performed while the
2026  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2027  * are the same: the relevant objset and associated dataset are owned by
2028  * zfsvfs, held, and long held on entry.
2029  */
2030 int
2031 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2032 {
2033         int err;
2034         znode_t *zp;
2035 
2036         ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2037         ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2038 


2049         if (err != 0)
2050                 goto bail;
2051 
2052         VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2053 
2054         zfs_set_fuid_feature(zfsvfs);
2055 
2056         /*
2057          * Attempt to re-establish all the active znodes with
2058          * their dbufs.  If a zfs_rezget() fails, then we'll let
2059          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2060          * when they try to use their znode.
2061          */
2062         mutex_enter(&zfsvfs->z_znodes_lock);
2063         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2064             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2065                 (void) zfs_rezget(zp);
2066         }
2067         mutex_exit(&zfsvfs->z_znodes_lock);
2068 










2069 bail:
2070         /* release the VOPs */
2071         rw_exit(&zfsvfs->z_teardown_inactive_lock);
2072         rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2073 
2074         if (err) {
2075                 /*
2076                  * Since we couldn't setup the sa framework, try to force
2077                  * unmount this file system.
2078                  */
2079                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2080                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
2081         }




2082         return (err);
2083 }
2084 
2085 static void
2086 zfs_freevfs(vfs_t *vfsp)
2087 {
2088         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2089 
2090         /*
2091          * If this is a snapshot, we have an extra VFS_HOLD on our parent
2092          * from zfs_mount().  Release it here.  If we came through
2093          * zfs_mountroot() instead, we didn't grab an extra hold, so
2094          * skip the VFS_RELE for rootvfs.
2095          */
2096         if (zfsvfs->z_issnap && (vfsp != rootvfs))
2097                 VFS_RELE(zfsvfs->z_parent->z_vfs);
2098 
2099         zfsvfs_free(zfsvfs);
2100 
2101         atomic_dec_32(&zfs_active_fs_count);


2215 
2216                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2217                     SPA_VERSION_SA);
2218                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2219                     DMU_OT_NONE, 0, tx);
2220 
2221                 error = zap_add(os, MASTER_NODE_OBJ,
2222                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2223                 ASSERT0(error);
2224 
2225                 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2226                 sa_register_update_callback(os, zfs_sa_upgrade);
2227         }
2228 
2229         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2230             "from %llu to %llu", zfsvfs->z_version, newvers);
2231 
2232         dmu_tx_commit(tx);
2233 
2234         zfsvfs->z_version = newvers;

2235 
2236         zfs_set_fuid_feature(zfsvfs);
2237 
2238         return (0);
2239 }
2240 
2241 /*
2242  * Read a property stored within the master node.
      *
      * 'os' may be NULL, in which case no lookup is performed and only the
      * built-in defaults below apply.  On success 0 is returned and *value
      * is set.  If the property is not found (ENOENT) and has no default
      * here, ENOENT is returned; any other zap_lookup() error is returned
      * unchanged.
2243  */
2244 int
2245 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2246 {
2247         const char *pname;
             /* Starts as ENOENT so a NULL 'os' falls through to the defaults. */
2248         int error = ENOENT;
2249 
2250         /*
2251          * Look up the file system's value for the property.  For the
2252          * version property, we look up a slightly different string.
2253          */
2254         if (prop == ZFS_PROP_VERSION)






























2255                 pname = ZPL_VERSION_STR;
2256         else
2257                 pname = zfs_prop_to_name(prop);

2258 
2259         if (os != NULL) {
2260                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2261                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2262         }
2263 
2264         if (error == ENOENT) {
2265                 /* No value set, use the default value */
2266                 switch (prop) {
2267                 case ZFS_PROP_VERSION:
2268                         *value = ZPL_VERSION;
2269                         break;
2270                 case ZFS_PROP_NORMALIZE:
2271                 case ZFS_PROP_UTF8ONLY:
2272                         *value = 0;
2273                         break;
2274                 case ZFS_PROP_CASE:
2275                         *value = ZFS_CASE_SENSITIVE;
2276                         break;
2277                 default:
                             /* No default for this property: report ENOENT. */
2278                         return (error);
2279                 }
2280                 error = 0;
2281         }









2282         return (error);
2283 }
2284 
2285 /*
2286  * Return true if the corresponding vfs's unmounted flag is set.
2287  * Otherwise return false.
2288  * If this function returns true we know VFS unmount has been initiated.
2289  */
2290 boolean_t
2291 zfs_get_vfs_flag_unmounted(objset_t *os)
2292 {
2293         zfsvfs_t *zfvp;
2294         boolean_t unmounted = B_FALSE;
2295 
2296         ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2297 
2298         mutex_enter(&os->os_user_ptr_lock);
2299         zfvp = dmu_objset_get_user(os);
2300         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2301             (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))


  28 /* Portions Copyright 2010 Robert Milkowski */
  29 
  30 #include <sys/types.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/kmem.h>
  35 #include <sys/pathname.h>
  36 #include <sys/vnode.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vfs_opreg.h>
  39 #include <sys/mntent.h>
  40 #include <sys/mount.h>
  41 #include <sys/cmn_err.h>
  42 #include "fs/fs_subr.h"
  43 #include <sys/zfs_znode.h>
  44 #include <sys/zfs_dir.h>
  45 #include <sys/zil.h>
  46 #include <sys/fs/zfs.h>
  47 #include <sys/dmu.h>
  48 #include <sys/dsl_dir.h>
  49 #include <sys/dsl_prop.h>
  50 #include <sys/dsl_dataset.h>
  51 #include <sys/dsl_deleg.h>
  52 #include <sys/spa.h>
  53 #include <sys/zap.h>
  54 #include <sys/sa.h>
  55 #include <sys/sa_impl.h>
  56 #include <sys/varargs.h>
  57 #include <sys/policy.h>
  58 #include <sys/atomic.h>
  59 #include <sys/mkdev.h>
  60 #include <sys/modctl.h>
  61 #include <sys/refstr.h>
  62 #include <sys/zfs_ioctl.h>
  63 #include <sys/zfs_ctldir.h>
  64 #include <sys/zfs_fuid.h>
  65 #include <sys/bootconf.h>
  66 #include <sys/sunddi.h>
  67 #include <sys/dnlc.h>
  68 #include <sys/dmu_objset.h>


 380 
 381         zfsvfs->z_vscan = newval;
 382 }
 383 
 384 static void
 385 acl_mode_changed_cb(void *arg, uint64_t newval)
 386 {
             /*
              * Property-change callback registered for ZFS_PROP_ACLMODE in
              * zfs_register_callbacks().  'arg' is the zfsvfs_t passed at
              * registration; 'newval' is cached in its z_acl_mode field.
              */
 387         zfsvfs_t *zfsvfs = arg;
 388 
 389         zfsvfs->z_acl_mode = newval;
 390 }
 391 
 392 static void
 393 acl_inherit_changed_cb(void *arg, uint64_t newval)
 394 {
             /*
              * Property-change callback registered for ZFS_PROP_ACLINHERIT in
              * zfs_register_callbacks().  'arg' is the zfsvfs_t passed at
              * registration; 'newval' is cached in its z_acl_inherit field.
              */
 395         zfsvfs_t *zfsvfs = arg;
 396 
 397         zfsvfs->z_acl_inherit = newval;
 398 }
 399 
 400 static void
 401 rate_changed_cb(void *arg, uint64_t newval)
 402 {
             /*
              * Property-change callback registered for ZFS_PROP_RATE_LIMIT in
              * zfs_register_callbacks().  'arg' is the zfsvfs_t passed at
              * registration; 'newval' is stored in z_rate.rate_cap.
              */
 403         zfsvfs_t *zfsvfs = arg;
 404 
             /*
              * UINT64_MAX is stored as 0.
              * NOTE(review): presumably both mean "no limit" -- confirm against
              * the code that consumes rate_cap.
              */
 405         if (newval == UINT64_MAX)
 406                 newval = 0;
 407         zfsvfs->z_rate.rate_cap = newval;
 408 }
 409 
 410 static int
 411 zfs_register_callbacks(vfs_t *vfsp)
 412 {
 413         struct dsl_dataset *ds = NULL;
 414         objset_t *os = NULL;
 415         zfsvfs_t *zfsvfs = NULL;
 416         uint64_t nbmand;
 417         boolean_t readonly = B_FALSE;
 418         boolean_t do_readonly = B_FALSE;
 419         boolean_t setuid = B_FALSE;
 420         boolean_t do_setuid = B_FALSE;
 421         boolean_t exec = B_FALSE;
 422         boolean_t do_exec = B_FALSE;
 423         boolean_t devices = B_FALSE;
 424         boolean_t do_devices = B_FALSE;
 425         boolean_t xattr = B_FALSE;
 426         boolean_t do_xattr = B_FALSE;
 427         boolean_t atime = B_FALSE;
 428         boolean_t do_atime = B_FALSE;
 429         int error = 0;


 527             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 528         error = error ? error : dsl_prop_register(ds,
 529             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 530         error = error ? error : dsl_prop_register(ds,
 531             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 532         error = error ? error : dsl_prop_register(ds,
 533             zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 534         error = error ? error : dsl_prop_register(ds,
 535             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 536         error = error ? error : dsl_prop_register(ds,
 537             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 538         error = error ? error : dsl_prop_register(ds,
 539             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 540         error = error ? error : dsl_prop_register(ds,
 541             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 542         error = error ? error : dsl_prop_register(ds,
 543             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 544             zfsvfs);
 545         error = error ? error : dsl_prop_register(ds,
 546             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 547         error = error ? error : dsl_prop_register(ds,
 548             zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
 549 
 550         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 551         if (error)
 552                 goto unregister;
 553 
 554         /*
 555          * Invoke our callbacks to restore temporary mount options.
 556          */
 557         if (do_readonly)
 558                 readonly_changed_cb(zfsvfs, readonly);
 559         if (do_setuid)
 560                 setuid_changed_cb(zfsvfs, setuid);
 561         if (do_exec)
 562                 exec_changed_cb(zfsvfs, exec);
 563         if (do_devices)
 564                 devices_changed_cb(zfsvfs, devices);
 565         if (do_xattr)
 566                 xattr_changed_cb(zfsvfs, xattr);
 567         if (do_atime)
 568                 atime_changed_cb(zfsvfs, atime);
 569 


 990          */
 991 
 992         error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 993         if (error != 0) {
 994                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 995                 return (error);
 996         }
 997 
 998         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 999         if (error != 0) {
1000                 dmu_objset_disown(os, zfsvfs);
1001         }
1002         return (error);
1003 }
1004 
1005 
1006 int
1007 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1008 {
1009         int error;
1010         int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));
1011 
1012         zfsvfs->z_vfs = NULL;
1013         zfsvfs->z_parent = zfsvfs;
1014 
1015         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1016         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1017         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1018             offsetof(znode_t, z_link_node));
1019         rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1020         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1021         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1022         zfsvfs->z_hold_mtx_sz = size;
1023         zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1024         for (int i = 0; i != size; i++)
1025                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1026         mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
1027         cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);
1028 
1029         error = zfsvfs_init(zfsvfs, os);
1030         if (error != 0) {
1031                 *zfvp = NULL;
1032                 kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
1033                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1034                 return (error);
1035         }
1036 
1037         *zfvp = zfsvfs;
1038         return (0);
1039 }
1040 
1041 static int
1042 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1043 {
1044         int error;
1045 
1046         error = zfs_register_callbacks(zfsvfs->z_vfs);
1047         if (error)
1048                 return (error);
1049 
1050         zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1051 
1052         /*
1053          * If we are not mounting (ie: online recv), then we don't
1054          * have to worry about replaying the log as we blocked all
1055          * operations out since we closed the ZIL.
1056          */
1057         if (mounting) {
1058                 boolean_t readonly;
1059 
1060                 /*
1061                  * During replay we remove the read only flag to
1062                  * allow replays to succeed.
1063                  */
1064                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1065                 if (readonly)
1066                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1067                 else {
1068                         zfs_unlinked_drain(zfsvfs);
1069                 }
1070 
1071                 /*
1072                  * Parse and replay the intent log.
1073                  *
1074                  * Because of ziltest, this must be done after
1075                  * zfs_unlinked_drain().  (Further note: ziltest
1076                  * doesn't use readonly mounts, where
1077                  * zfs_unlinked_drain() isn't called.)  This is because
1078                  * ziltest causes spa_sync() to think it's committed,
1079                  * but actually it is not, so the intent log contains
1080                  * many txg's worth of changes.
1081                  *
1082                  * In particular, if object N is in the unlinked set in
1083                  * the last txg to actually sync, then it could be
1084                  * actually freed in a later txg and then reallocated
1085                  * in a yet later txg.  This would write a "create
1086                  * object N" record to the intent log.  Normally, this
1087                  * would be fine because the spa_sync() would have
1088                  * written out the fact that object N is free, before
1089                  * we could write the "create object N" intent log
1090                  * record.
1091                  *
1092                  * But when we are in ziltest mode, we advance the "open
1093                  * txg" without actually spa_sync()-ing the changes to
1094                  * disk.  So we would see that object N is still
1095                  * allocated and in the unlinked set, and there is an
1096                  * intent log record saying to allocate it.
1097                  */
1098                 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1099                         if (zil_replay_disable) {
1100                                 zil_destroy(zfsvfs->z_log, B_FALSE);
1101                         } else {
1102                                 zfsvfs->z_replay = B_TRUE;
1103                                 zil_replay(zfsvfs->z_os, zfsvfs,
1104                                     zfs_replay_vector);
1105                                 zfsvfs->z_replay = B_FALSE;
1106                         }
1107                 }
1108 
1109                 /* restore readonly bit */
1110                 if (readonly)
1111                         zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1112         }
1113 
1114         /*
1115          * Set the objset user_ptr to track its zfsvfs.
1116          */
1117         mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1118         dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1119         mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1120 
1121         return (0);
1122 }
1123 
1124 void
1125 zfsvfs_free(zfsvfs_t *zfsvfs)
1126 {
1127         int i;
1128         extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1129 
1130         /*
1131          * This is a barrier to prevent the filesystem from going away in
1132          * zfs_znode_move() until we can safely ensure that the filesystem is
1133          * not unmounted. We consider the filesystem valid before the barrier
1134          * and invalid after the barrier.
1135          */
1136         rw_enter(&zfsvfs_lock, RW_READER);
1137         rw_exit(&zfsvfs_lock);
1138 
1139         VERIFY0(zfsvfs->z_znodes_freeing_cnt);
1140 
1141         zfs_fuid_destroy(zfsvfs);
1142 
1143         cv_destroy(&zfsvfs->z_drain_cv);
1144         mutex_destroy(&zfsvfs->z_drain_lock);
1145         mutex_destroy(&zfsvfs->z_znodes_lock);
1146         mutex_destroy(&zfsvfs->z_lock);
1147         list_destroy(&zfsvfs->z_all_znodes);
1148         rrm_destroy(&zfsvfs->z_teardown_lock);
1149         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1150         rw_destroy(&zfsvfs->z_fuid_lock);
1151         for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
1152                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1153 
1154         kmem_free(zfsvfs->z_hold_mtx,
1155             sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
1156         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1157 }
1158 
1159 static void
1160 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1161 {
1162         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1163         if (zfsvfs->z_vfs) {
1164                 if (zfsvfs->z_use_fuids) {
1165                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1166                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1167                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1168                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1169                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1170                         vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1171                 } else {
1172                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1173                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1174                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1175                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1176                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1177                         vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1178                 }
1179         }
1180         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1181 }
1182 
1183 static int
1184 zfs_domount(vfs_t *vfsp, char *osname)
1185 {
1186         dev_t mount_dev;
1187         uint64_t recordsize, fsid_guid;
1188         int error = 0;
1189         zfsvfs_t *zfsvfs;
1190         char    worminfo[13] = {0};
1191 
1192         ASSERT(vfsp);
1193         ASSERT(osname);
1194 
1195         error = zfsvfs_create(osname, &zfsvfs);
1196         if (error)
1197                 return (error);
1198         zfsvfs->z_vfs = vfsp;
1199 
1200         /* Initialize the generic filesystem structure. */
1201         vfsp->vfs_bcount = 0;
1202         vfsp->vfs_data = NULL;
1203 
1204         if (zfs_create_unique_device(&mount_dev) == -1) {
1205                 error = SET_ERROR(ENODEV);
1206                 goto out;
1207         }
1208         ASSERT(vfs_devismounted(mount_dev) == 0);
1209 
1210         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1211             NULL))
1212                 goto out;
1213 
1214         if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
1215             worminfo[0] && strcmp(worminfo, "0") != 0 &&
1216             strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
1217                 zfsvfs->z_isworm = B_TRUE;
1218         } else {
1219                 zfsvfs->z_isworm = B_FALSE;
1220         }
1221 
1222         vfsp->vfs_dev = mount_dev;
1223         vfsp->vfs_fstype = zfsfstype;
1224         vfsp->vfs_bsize = recordsize;
1225         vfsp->vfs_flag |= VFS_NOTRUNC;
1226         vfsp->vfs_data = zfsvfs;
1227 
1228         /*
1229          * The fsid is 64 bits, composed of an 8-bit fs type, which
1230          * separates our fsid from any other filesystem types, and a
1231          * 56-bit objset unique ID.  The objset unique ID is unique to
1232          * all objsets open on this system, provided by unique_create().
1233          * The 8-bit fs type must be put in the low bits of fsid[1]
1234          * because that's where other Solaris filesystems put it.
1235          */
1236         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1237         ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1238         vfsp->vfs_fsid.val[0] = fsid_guid;
1239         vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1240             zfsfstype & 0xFF;
1241 


1768 
1769         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1770         if (error == 0)
1771                 *vpp = ZTOV(rootzp);
1772 
1773         ZFS_EXIT(zfsvfs);
1774         return (error);
1775 }
1776 
1777 /*
1778  * Teardown the zfsvfs::z_os.
1779  *
1780  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1781  * and 'z_teardown_inactive_lock' held.
1782  */
1783 static int
1784 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1785 {
1786         znode_t *zp;
1787 
1788         zfs_unlinked_drain_stop_wait(zfsvfs);
1789         rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1790 
1791         if (!unmounting) {
1792                 /*
1793                  * We purge the parent filesystem's vfsp as the parent
1794                  * filesystem and all of its snapshots have their vnode's
1795                  * v_vfsp set to the parent's filesystem's vfsp.  Note,
1796                  * 'z_parent' is self referential for non-snapshots.
1797                  */
1798                 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1799         }
1800 
1801         /*
1802          * Close the zil. NB: Can't close the zil while zfs_inactive
1803          * threads are blocked as zil_close can call zfs_inactive.
1804          */
1805         if (zfsvfs->z_log) {
1806                 zil_close(zfsvfs->z_log);
1807                 zfsvfs->z_log = NULL;
1808         }


1849 
1850         /*
1851          * z_os will be NULL if there was an error in attempting to reopen
1852          * zfsvfs, so just return as the properties had already been
1853          * unregistered and cached data had been evicted before.
1854          */
1855         if (zfsvfs->z_os == NULL)
1856                 return (0);
1857 
1858         /*
1859          * Unregister properties.
1860          */
1861         zfs_unregister_callbacks(zfsvfs);
1862 
1863         /*
1864          * Evict cached data
1865          */
1866         if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
1867             !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1868                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1869         (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1870 
1871         return (0);
1872 }
1873 
1874 /*ARGSUSED*/
1875 static int
1876 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1877 {
1878         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1879         objset_t *os;
1880         int ret;
1881 
1882         ret = secpolicy_fs_unmount(cr, vfsp);
1883         if (ret) {
1884                 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1885                     ZFS_DELEG_PERM_MOUNT, cr))
1886                         return (ret);
1887         }
1888 
1889         /*
1890          * We purge the parent filesystem's vfsp as the parent filesystem
1891          * and all of its snapshots have their vnode's v_vfsp set to the
1892          * parent's filesystem's vfsp.  Note, 'z_parent' is self
1893          * referential for non-snapshots.
1894          */
1895         (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1896 
1897         /*
1898          * Unmount any snapshots mounted under .zfs before unmounting the
1899          * dataset itself.
1900          */
1901         if (zfsvfs->z_ctldir != NULL &&
1902             (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1903                 return (ret);
1904         }
1905 
1906         if (!(fflag & MS_FORCE)) {
1907                 uint_t active_vnodes;
1908 
1909                 /*
1910                  * Check the number of active vnodes in the file system.
1911                  * Our count is maintained in the vfs structure, but the
1912                  * number is off by 1 to indicate a hold on the vfs
1913                  * structure itself.
1914                  *
1915                  * The '.zfs' directory maintains a reference of its
1916                  * own, and any active references underneath are
1917                  * reflected in the vnode count.
1918                  *
1919                  * Active vnodes: vnodes that were held by an user
1920                  */
1921 
1922                 active_vnodes =
1923                     vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;
1924 
1925                 if (zfsvfs->z_ctldir == NULL) {
1926                         if (active_vnodes > 1)
1927                                 return (SET_ERROR(EBUSY));
1928                 } else {
1929                         if (active_vnodes > 2 ||
1930                             zfsvfs->z_ctldir->v_count > 1)
1931                                 return (SET_ERROR(EBUSY));
1932                 }
1933         }
1934 
1935         vfsp->vfs_flag |= VFS_UNMOUNTED;
1936 
1937         VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1938         os = zfsvfs->z_os;
1939 
1940         /*
1941          * z_os will be NULL if there was an error in
1942          * attempting to reopen zfsvfs.
1943          */
1944         if (os != NULL) {
1945                 /*
1946                  * Unset the objset user_ptr.
1947                  */
1948                 mutex_enter(&os->os_user_ptr_lock);
1949                 dmu_objset_set_user(os, NULL);


2046         }
2047 
2048         *vpp = ZTOV(zp);
2049         ZFS_EXIT(zfsvfs);
2050         return (0);
2051 }
2052 
2053 /*
2054  * Block out VOPs and close zfsvfs_t::z_os
2055  *
2056  * Note, if successful, then we return with the 'z_teardown_lock' and
2057  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2058  * dataset and objset intact so that they can be atomically handed off during
2059  * a subsequent rollback or recv operation and the resume thereafter.
2060  */
2061 int
2062 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2063 {
2064         int error;
2065 
2066         mutex_enter(&zfsvfs->z_lock);
2067         if (zfsvfs->z_busy) {
2068                 mutex_exit(&zfsvfs->z_lock);
2069                 return (SET_ERROR(EBUSY));
2070         }
2071         zfsvfs->z_busy = B_TRUE;
2072         mutex_exit(&zfsvfs->z_lock);
2073 
2074         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
2075                 mutex_enter(&zfsvfs->z_lock);
2076                 zfsvfs->z_busy = B_FALSE;
2077                 mutex_exit(&zfsvfs->z_lock);
2078                 return (error);
2079         }
2080 
2081         return (0);
2082 }
2083 
2084 /*
2085  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2086  * is an invariant across any of the operations that can be performed while the
2087  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2088  * are the same: the relevant objset and associated dataset are owned by
2089  * zfsvfs, held, and long held on entry.
2090  */
2091 int
2092 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2093 {
2094         int err;
2095         znode_t *zp;
2096 
2097         ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2098         ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2099 


2110         if (err != 0)
2111                 goto bail;
2112 
2113         VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2114 
2115         zfs_set_fuid_feature(zfsvfs);
2116 
2117         /*
2118          * Attempt to re-establish all the active znodes with
2119          * their dbufs.  If a zfs_rezget() fails, then we'll let
2120          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2121          * when they try to use their znode.
2122          */
2123         mutex_enter(&zfsvfs->z_znodes_lock);
2124         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2125             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2126                 (void) zfs_rezget(zp);
2127         }
2128         mutex_exit(&zfsvfs->z_znodes_lock);
2129 
2130         if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
2131             !zfsvfs->z_unmounted) {
2132                 /*
2133                  * zfs_suspend_fs() could have interrupted freeing
2134                  * of dnodes. We need to restart this freeing so
2135                  * that we don't "leak" the space.
2136                  */
2137                 zfs_unlinked_drain(zfsvfs);
2138         }
2139 
2140 bail:
2141         /* release the VOPs */
2142         rw_exit(&zfsvfs->z_teardown_inactive_lock);
2143         rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2144 
2145         if (err) {
2146                 /*
2147                  * Since we couldn't setup the sa framework, try to force
2148                  * unmount this file system.
2149                  */
2150                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2151                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
2152         }
2153         mutex_enter(&zfsvfs->z_lock);
2154         zfsvfs->z_busy = B_FALSE;
2155         mutex_exit(&zfsvfs->z_lock);
2156 
2157         return (err);
2158 }
2159 
2160 static void
2161 zfs_freevfs(vfs_t *vfsp)
2162 {
2163         zfsvfs_t *zfsvfs = vfsp->vfs_data;
2164 
2165         /*
2166          * If this is a snapshot, we have an extra VFS_HOLD on our parent
2167          * from zfs_mount().  Release it here.  If we came through
2168          * zfs_mountroot() instead, we didn't grab an extra hold, so
2169          * skip the VFS_RELE for rootvfs.
2170          */
2171         if (zfsvfs->z_issnap && (vfsp != rootvfs))
2172                 VFS_RELE(zfsvfs->z_parent->z_vfs);
2173 
2174         zfsvfs_free(zfsvfs);
2175 
2176         atomic_dec_32(&zfs_active_fs_count);


2290 
2291                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2292                     SPA_VERSION_SA);
2293                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2294                     DMU_OT_NONE, 0, tx);
2295 
2296                 error = zap_add(os, MASTER_NODE_OBJ,
2297                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2298                 ASSERT0(error);
2299 
2300                 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2301                 sa_register_update_callback(os, zfs_sa_upgrade);
2302         }
2303 
2304         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2305             "from %llu to %llu", zfsvfs->z_version, newvers);
2306 
2307         dmu_tx_commit(tx);
2308 
2309         zfsvfs->z_version = newvers;
2310         os->os_version = newvers;
2311 
2312         zfs_set_fuid_feature(zfsvfs);
2313 
2314         return (0);
2315 }
2316 
2317 /*
2318  * Read a property stored within the master node.
2319  */
2320 int
2321 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2322 {
2323         uint64_t *cached_copy = NULL;

2324 
2325         /*
2326          * Figure out where in the objset_t the cached copy would live, if it
2327          * is available for the requested property.
2328          */
2329         if (os != NULL) {
2330                 switch (prop) {
2331                 case ZFS_PROP_VERSION:
2332                         cached_copy = &os->os_version;
2333                         break;
2334                 case ZFS_PROP_NORMALIZE:
2335                         cached_copy = &os->os_normalization;
2336                         break;
2337                 case ZFS_PROP_UTF8ONLY:
2338                         cached_copy = &os->os_utf8only;
2339                         break;
2340                 case ZFS_PROP_CASE:
2341                         cached_copy = &os->os_casesensitivity;
2342                         break;
2343                 default:
2344                         break;
2345                 }
2346         }
2347         if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2348                 *value = *cached_copy;
2349                 return (0);
2350         }
2351 
2352         /*
2353          * If the property wasn't cached, look up the file system's value for
2354          * the property. For the version property, we look up a slightly
2355          * different string.
2356          */
2357         const char *pname;
2358         int error = ENOENT;
2359         if (prop == ZFS_PROP_VERSION) {
2360                 pname = ZPL_VERSION_STR;
2361         } else {
2362                 pname = zfs_prop_to_name(prop);
2363         }
2364 
2365         if (os != NULL) {
2366                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2367                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2368         }
2369 
2370         if (error == ENOENT) {
2371                 /* No value set, use the default value */
2372                 switch (prop) {
2373                 case ZFS_PROP_VERSION:
2374                         *value = ZPL_VERSION;
2375                         break;
2376                 case ZFS_PROP_NORMALIZE:
2377                 case ZFS_PROP_UTF8ONLY:
2378                         *value = 0;
2379                         break;
2380                 case ZFS_PROP_CASE:
2381                         *value = ZFS_CASE_SENSITIVE;
2382                         break;
2383                 default:
2384                         return (error);
2385                 }
2386                 error = 0;
2387         }
2388 
2389         /*
2390          * If one of the methods for getting the property value above worked,
2391          * copy it into the objset_t's cache.
2392          */
2393         if (error == 0 && cached_copy != NULL) {
2394                 *cached_copy = *value;
2395         }
2396 
2397         return (error);
2398 }
2399 
2400 /*
2401  * Return true if the corresponding vfs's unmounted flag is set.
2402  * Otherwise return false.
2403  * If this function returns true we know VFS unmount has been initiated.
2404  */
2405 boolean_t
2406 zfs_get_vfs_flag_unmounted(objset_t *os)
2407 {
2408         zfsvfs_t *zfvp;
2409         boolean_t unmounted = B_FALSE;
2410 
2411         ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2412 
2413         mutex_enter(&os->os_user_ptr_lock);
2414         zfvp = dmu_objset_get_user(os);
2415         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2416             (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))