Print this page
NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Conflicts:
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9200 Improve the scalability of attribute locking in zfs_zget
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9436 Rate limiting controls (was QoS) per ZFS dataset, updates from demo
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-8972 Async-delete side-effect that may cause unmount EBUSY
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8852 Quality-of-Service (QoS) controls per NFS share
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5085 implement async delete for large files
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3762 Appliance crashes with a NULL pointer dereference during a zpool export when a zfs_vn_rele_taskq thread attempts to check a bogus rwlock from rw_write_held
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
6160 /usr/lib/fs/zfs/bootinstall should use bootadm
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Adam Števko <adam.stevko@gmail.com>
Reviewed by: Josef Sipek <jeffpc@josefsipek.net>
Approved by: Richard Lowe <richlowe@richlowe.net>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (NULL is not an int)
6171 dsl_prop_unregister() slows down dataset eviction.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3485 Deferred deletes causing loss of service for NFS clients on cluster failover
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-2965 4.0.3-FP2: deferred deletes causing RSF import failure during fail-over of service
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
re #13253 rb4328 ssh: openssl version checking needs updating
re #11441 rb4292 panic in apic_record_rdt_entry on VMware hardware version 9
re #12619, rb4287 Deadlocked zfs txg processing in dsl_sync_task_group_sync()
re #13204 rb4280 zfs receive/rollback deadlock
re #6815 rb1758 need WORM in nza-kernel (4.0)
*** 43,52 ****
--- 43,53 ----
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
+ #include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
*** 394,403 ****
--- 395,414 ----
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_acl_inherit = newval;
}
+ static void
+ rate_changed_cb(void *arg, uint64_t newval)
+ {
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == UINT64_MAX)
+ newval = 0;
+ zfsvfs->z_rate.rate_cap = newval;
+ }
+
static int
zfs_register_callbacks(vfs_t *vfsp)
{
struct dsl_dataset *ds = NULL;
objset_t *os = NULL;
*** 531,540 ****
--- 542,554 ----
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
+
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (error)
goto unregister;
/*
*** 991,1000 ****
--- 1005,1015 ----
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
int error;
+ int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
*** 1002,1017 ****
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
! for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
*zfvp = NULL;
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
*zfvp = zfsvfs;
--- 1017,1037 ----
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
! zfsvfs->z_hold_mtx_sz = size;
! zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
! for (int i = 0; i != size; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
*zfvp = NULL;
+ kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
*zfvp = zfsvfs;
*** 1040,1053 ****
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
! if (readonly != 0)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
! else
zfs_unlinked_drain(zfsvfs);
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
--- 1060,1074 ----
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
! if (readonly)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
! else {
zfs_unlinked_drain(zfsvfs);
+ }
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
*** 1082,1092 ****
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
}
! zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
/*
* Set the objset user_ptr to track its zfsvfs.
*/
--- 1103,1116 ----
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
}
!
! /* restore readonly bit */
! if (readonly)
! zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
}
/*
* Set the objset user_ptr to track its zfsvfs.
*/
*** 1110,1129 ****
* and invalid after the barrier.
*/
rw_enter(&zfsvfs_lock, RW_READER);
rw_exit(&zfsvfs_lock);
zfs_fuid_destroy(zfsvfs);
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrm_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
! for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_destroy(&zfsvfs->z_hold_mtx[i]);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
--- 1134,1160 ----
* and invalid after the barrier.
*/
rw_enter(&zfsvfs_lock, RW_READER);
rw_exit(&zfsvfs_lock);
+ VERIFY0(zfsvfs->z_znodes_freeing_cnt);
+
zfs_fuid_destroy(zfsvfs);
+ cv_destroy(&zfsvfs->z_drain_cv);
+ mutex_destroy(&zfsvfs->z_drain_lock);
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrm_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
! for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+
+ kmem_free(zfsvfs->z_hold_mtx,
+ sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
*** 1154,1163 ****
--- 1185,1195 ----
{
dev_t mount_dev;
uint64_t recordsize, fsid_guid;
int error = 0;
zfsvfs_t *zfsvfs;
+ char worminfo[13] = {0};
ASSERT(vfsp);
ASSERT(osname);
error = zfsvfs_create(osname, &zfsvfs);
*** 1177,1186 ****
--- 1209,1226 ----
if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
NULL))
goto out;
+ if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
+ worminfo[0] && strcmp(worminfo, "0") != 0 &&
+ strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
+ zfsvfs->z_isworm = B_TRUE;
+ } else {
+ zfsvfs->z_isworm = B_FALSE;
+ }
+
vfsp->vfs_dev = mount_dev;
vfsp->vfs_fstype = zfsfstype;
vfsp->vfs_bsize = recordsize;
vfsp->vfs_flag |= VFS_NOTRUNC;
vfsp->vfs_data = zfsvfs;
*** 1743,1752 ****
--- 1783,1793 ----
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
+ zfs_unlinked_drain_stop_wait(zfsvfs);
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) {
/*
* We purge the parent filesystem's vfsp as the parent
*** 1823,1833 ****
* Evict cached data
*/
if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
! dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
/*ARGSUSED*/
--- 1864,1874 ----
* Evict cached data
*/
if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
! (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
/*ARGSUSED*/
*** 1861,1885 ****
(ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
return (ret);
}
if (!(fflag & MS_FORCE)) {
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
*/
if (zfsvfs->z_ctldir == NULL) {
! if (vfsp->vfs_count > 1)
return (SET_ERROR(EBUSY));
} else {
! if (vfsp->vfs_count > 2 ||
zfsvfs->z_ctldir->v_count > 1)
return (SET_ERROR(EBUSY));
}
}
--- 1902,1934 ----
(ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
return (ret);
}
if (!(fflag & MS_FORCE)) {
+ uint_t active_vnodes;
+
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
+ *
+ * Active vnodes: vnodes that were held by a user
*/
+
+ active_vnodes =
+ vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;
+
if (zfsvfs->z_ctldir == NULL) {
! if (active_vnodes > 1)
return (SET_ERROR(EBUSY));
} else {
! if (active_vnodes > 2 ||
zfsvfs->z_ctldir->v_count > 1)
return (SET_ERROR(EBUSY));
}
}
*** 2012,2023 ****
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
! if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
return (0);
}
/*
--- 2061,2084 ----
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
! mutex_enter(&zfsvfs->z_lock);
! if (zfsvfs->z_busy) {
! mutex_exit(&zfsvfs->z_lock);
! return (SET_ERROR(EBUSY));
! }
! zfsvfs->z_busy = B_TRUE;
! mutex_exit(&zfsvfs->z_lock);
!
! if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
! mutex_enter(&zfsvfs->z_lock);
! zfsvfs->z_busy = B_FALSE;
! mutex_exit(&zfsvfs->z_lock);
return (error);
+ }
return (0);
}
/*
*** 2064,2073 ****
--- 2125,2144 ----
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
+ if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
+ !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
*** 2077,2086 ****
--- 2148,2161 ----
* unmount this file system.
*/
if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
}
+ mutex_enter(&zfsvfs->z_lock);
+ zfsvfs->z_busy = B_FALSE;
+ mutex_exit(&zfsvfs->z_lock);
+
return (err);
}
static void
zfs_freevfs(vfs_t *vfsp)
*** 2230,2239 ****
--- 2305,2315 ----
"from %llu to %llu", zfsvfs->z_version, newvers);
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
+ os->os_version = newvers;
zfs_set_fuid_feature(zfsvfs);
return (0);
}
*** 2242,2262 ****
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
! const char *pname;
! int error = ENOENT;
/*
! * Look up the file system's value for the property. For the
! * version property, we look up a slightly different string.
*/
! if (prop == ZFS_PROP_VERSION)
pname = ZPL_VERSION_STR;
! else
pname = zfs_prop_to_name(prop);
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
--- 2318,2368 ----
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
! uint64_t *cached_copy = NULL;
/*
! * Figure out where in the objset_t the cached copy would live, if it
! * is available for the requested property.
*/
! if (os != NULL) {
! switch (prop) {
! case ZFS_PROP_VERSION:
! cached_copy = &os->os_version;
! break;
! case ZFS_PROP_NORMALIZE:
! cached_copy = &os->os_normalization;
! break;
! case ZFS_PROP_UTF8ONLY:
! cached_copy = &os->os_utf8only;
! break;
! case ZFS_PROP_CASE:
! cached_copy = &os->os_casesensitivity;
! break;
! default:
! break;
! }
! }
! if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
! *value = *cached_copy;
! return (0);
! }
!
! /*
! * If the property wasn't cached, look up the file system's value for
! * the property. For the version property, we look up a slightly
! * different string.
! */
! const char *pname;
! int error = ENOENT;
! if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
! } else {
pname = zfs_prop_to_name(prop);
+ }
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
*** 2277,2286 ****
--- 2383,2401 ----
default:
return (error);
}
error = 0;
}
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
return (error);
}
/*
* Return true if the coresponding vfs's unmounted flag is set.