NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Conflicts:
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9200 Improve the scalability of attribute locking in zfs_zget
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9436 Rate limiting controls (was QoS) per ZFS dataset, updates from demo
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-8972 Async-delete side-effect that may cause unmount EBUSY
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8852 Quality-of-Service (QoS) controls per NFS share
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5085 implement async delete for large files
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3762 Appliance crashes with a NULL pointer dereference during a zpool export when a zfs_vn_rele_taskq thread attempts to check a bogus rwlock from rw_write_held
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
6160 /usr/lib/fs/zfs/bootinstall should use bootadm
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Adam Števko <adam.stevko@gmail.com>
Reviewed by: Josef Sipek <jeffpc@josefsipek.net>
Approved by: Richard Lowe <richlowe@richlowe.net>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (NULL is not an int)
6171 dsl_prop_unregister() slows down dataset eviction.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3485 Deferred deletes causing loss of service for NFS clients on cluster failover
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-2965 4.0.3-FP2: deferred deletes causing RSF import failure during fail-over of service
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
re #13253 rb4328 ssh: openssl version checking needs updating
re #11441 rb4292 panic in apic_record_rdt_entry on VMware hardware version 9
re #12619 rb4287 Deadlocked zfs txg processing in dsl_sync_task_group_sync()
re #13204 rb4280 zfs receive/rollback deadlock
re #6815 rb1758 need WORM in nza-kernel (4.0)
@@ -43,10 +43,11 @@
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
@@ -394,10 +395,20 @@
zfsvfs_t *zfsvfs = arg;
zfsvfs->z_acl_inherit = newval;
}
+static void
+rate_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == UINT64_MAX)
+ newval = 0;
+ zfsvfs->z_rate.rate_cap = newval;
+}
+
static int
zfs_register_callbacks(vfs_t *vfsp)
{
struct dsl_dataset *ds = NULL;
objset_t *os = NULL;
@@ -531,10 +542,13 @@
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
zfsvfs);
error = error ? error : dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
+
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (error)
goto unregister;
/*
@@ -991,10 +1005,11 @@
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
int error;
+ int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));
zfsvfs->z_vfs = NULL;
zfsvfs->z_parent = zfsvfs;
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1002,16 +1017,21 @@
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
- for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ zfsvfs->z_hold_mtx_sz = size;
+ zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (int i = 0; i != size; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);
error = zfsvfs_init(zfsvfs, os);
if (error != 0) {
*zfvp = NULL;
+ kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
return (error);
}
*zfvp = zfsvfs;
@@ -1040,14 +1060,15 @@
/*
* During replay we remove the read only flag to
* allow replays to succeed.
*/
readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
- if (readonly != 0)
+ if (readonly)
zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
- else
+ else {
zfs_unlinked_drain(zfsvfs);
+ }
/*
* Parse and replay the intent log.
*
* Because of ziltest, this must be done after
@@ -1082,11 +1103,14 @@
zil_replay(zfsvfs->z_os, zfsvfs,
zfs_replay_vector);
zfsvfs->z_replay = B_FALSE;
}
}
- zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
+
+ /* restore readonly bit */
+ if (readonly)
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
}
/*
* Set the objset user_ptr to track its zfsvfs.
*/
@@ -1110,20 +1134,27 @@
* and invalid after the barrier.
*/
rw_enter(&zfsvfs_lock, RW_READER);
rw_exit(&zfsvfs_lock);
+ VERIFY0(zfsvfs->z_znodes_freeing_cnt);
+
zfs_fuid_destroy(zfsvfs);
+ cv_destroy(&zfsvfs->z_drain_cv);
+ mutex_destroy(&zfsvfs->z_drain_lock);
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
rrm_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+
+ kmem_free(zfsvfs->z_hold_mtx,
+ sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
@@ -1154,10 +1185,11 @@
{
dev_t mount_dev;
uint64_t recordsize, fsid_guid;
int error = 0;
zfsvfs_t *zfsvfs;
+ char worminfo[13] = {0};
ASSERT(vfsp);
ASSERT(osname);
error = zfsvfs_create(osname, &zfsvfs);
@@ -1177,10 +1209,18 @@
if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
NULL))
goto out;
+ if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
+ worminfo[0] && strcmp(worminfo, "0") != 0 &&
+ strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
+ zfsvfs->z_isworm = B_TRUE;
+ } else {
+ zfsvfs->z_isworm = B_FALSE;
+ }
+
vfsp->vfs_dev = mount_dev;
vfsp->vfs_fstype = zfsfstype;
vfsp->vfs_bsize = recordsize;
vfsp->vfs_flag |= VFS_NOTRUNC;
vfsp->vfs_data = zfsvfs;
@@ -1743,10 +1783,11 @@
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
+ zfs_unlinked_drain_stop_wait(zfsvfs);
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) {
/*
* We purge the parent filesystem's vfsp as the parent
@@ -1823,11 +1864,11 @@
* Evict cached data
*/
if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- dmu_objset_evict_dbufs(zfsvfs->z_os);
+ (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
return (0);
}
/*ARGSUSED*/
@@ -1861,25 +1902,33 @@
(ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
return (ret);
}
if (!(fflag & MS_FORCE)) {
+ uint_t active_vnodes;
+
/*
* Check the number of active vnodes in the file system.
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
*
* The '.zfs' directory maintains a reference of its
* own, and any active references underneath are
* reflected in the vnode count.
+ *
+ * Active vnodes: vnodes that are held by a user
*/
+
+ active_vnodes =
+ vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;
+
if (zfsvfs->z_ctldir == NULL) {
- if (vfsp->vfs_count > 1)
+ if (active_vnodes > 1)
return (SET_ERROR(EBUSY));
} else {
- if (vfsp->vfs_count > 2 ||
+ if (active_vnodes > 2 ||
zfsvfs->z_ctldir->v_count > 1)
return (SET_ERROR(EBUSY));
}
}
@@ -2012,12 +2061,24 @@
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
int error;
- if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ mutex_enter(&zfsvfs->z_lock);
+ if (zfsvfs->z_busy) {
+ mutex_exit(&zfsvfs->z_lock);
+ return (SET_ERROR(EBUSY));
+ }
+ zfsvfs->z_busy = B_TRUE;
+ mutex_exit(&zfsvfs->z_lock);
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
+ mutex_enter(&zfsvfs->z_lock);
+ zfsvfs->z_busy = B_FALSE;
+ mutex_exit(&zfsvfs->z_lock);
return (error);
+ }
return (0);
}
/*
@@ -2064,10 +2125,20 @@
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);
+ if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
+ !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
@@ -2077,10 +2148,14 @@
* unmount this file system.
*/
if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
}
+ mutex_enter(&zfsvfs->z_lock);
+ zfsvfs->z_busy = B_FALSE;
+ mutex_exit(&zfsvfs->z_lock);
+
return (err);
}
static void
zfs_freevfs(vfs_t *vfsp)
@@ -2230,10 +2305,11 @@
"from %llu to %llu", zfsvfs->z_version, newvers);
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
+ os->os_version = newvers;
zfs_set_fuid_feature(zfsvfs);
return (0);
}
@@ -2242,21 +2318,51 @@
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
- const char *pname;
- int error = ENOENT;
+ uint64_t *cached_copy = NULL;
/*
- * Look up the file system's value for the property. For the
- * version property, we look up a slightly different string.
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
*/
- if (prop == ZFS_PROP_VERSION)
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
- else
+ } else {
pname = zfs_prop_to_name(prop);
+ }
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
@@ -2277,10 +2383,19 @@
default:
return (error);
}
error = 0;
}
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
return (error);
}
/*
* Return true if the corresponding vfs's unmounted flag is set.
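
The zfs_get_zplprop() hunk above avoids repeated ZAP lookups by keeping a per-objset cached copy of the ZPL properties and filling it in only after a successful lookup. Below is a minimal user-space sketch of that lookup-with-cache pattern, included only as an illustration: objset_cache_t, slow_lookup(), and get_prop_cached() are hypothetical stand-ins (not kernel APIs), and PROP_UNINITIALIZED plays the role of OBJSET_PROP_UNINITIALIZED; the real code consults os_version, os_normalization, etc. and falls back to zap_lookup() of the master node.

/*
 * Sketch of the cached-property lookup pattern, under the assumptions
 * named above.  Compiles standalone with a C99 compiler.
 */
#include <stdint.h>
#include <stdio.h>

#define	PROP_UNINITIALIZED	UINT64_MAX

typedef struct objset_cache {
	uint64_t version;		/* mirrors os_version */
	uint64_t normalization;		/* mirrors os_normalization */
} objset_cache_t;

/* Hypothetical slow path standing in for the ZAP lookup of the master node. */
static int
slow_lookup(int prop, uint64_t *value)
{
	*value = (prop == 0) ? 5 : 0;	/* pretend: ZPL version 5, normalize off */
	return (0);
}

static int
get_prop_cached(objset_cache_t *os, int prop, uint64_t *value)
{
	uint64_t *cached = NULL;

	/* Pick the cache slot for this property, if one exists. */
	if (os != NULL)
		cached = (prop == 0) ? &os->version : &os->normalization;

	/* Fast path: return the cached copy once it has been filled in. */
	if (cached != NULL && *cached != PROP_UNINITIALIZED) {
		*value = *cached;
		return (0);
	}

	/* Slow path: do the expensive lookup, then populate the cache. */
	int error = slow_lookup(prop, value);
	if (error == 0 && cached != NULL)
		*cached = *value;
	return (error);
}

int
main(void)
{
	objset_cache_t os = { PROP_UNINITIALIZED, PROP_UNINITIALIZED };
	uint64_t v;

	(void) get_prop_cached(&os, 0, &v);	/* slow path, fills the cache */
	(void) get_prop_cached(&os, 0, &v);	/* fast path, cache hit */
	(void) printf("version = %llu\n", (unsigned long long)v);
	return (0);
}

As in the patch, the cache is written only when the lookup succeeds, so a failed lookup never poisons it, and an uninitialized slot simply falls through to the slow path again.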