NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
 Conflicts:
  usr/src/uts/common/fs/zfs/dbuf.c
  usr/src/uts/common/fs/zfs/dmu.c
  usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9200 Improve the scalability of attribute locking in zfs_zget
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9436 Rate limiting controls (was QoS) per ZFS dataset, updates from demo
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-8972 Async-delete side-effect that may cause unmount EBUSY
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8852 Quality-of-Service (QoS) controls per NFS share
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5085 implement async delete for large files
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3762 Appliance crashes with a NULL pointer dereference during a zpool export when a zfs_vn_rele_taskq thread attempts to check a bogus rwlock from rw_write_held
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
6160 /usr/lib/fs/zfs/bootinstall should use bootadm
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Adam Števko <adam.stevko@gmail.com>
Reviewed by: Josef Sipek <jeffpc@josefsipek.net>
Approved by: Richard Lowe <richlowe@richlowe.net>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (NULL is not an int)
6171 dsl_prop_unregister() slows down dataset eviction.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3485 Deferred deletes causing loss of service for NFS clients on cluster failover
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-2965 4.0.3-FP2: deferred deletes causing RSF import failure during fail-over of service
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
re #13253 rb4328 ssh: openssl version checking needs updating
re #11441 rb4292 panic in apic_record_rdt_entry on VMware hardware version 9
re #12619, rb4287 Deadlocked zfs txg processing in dsl_sync_task_group_sync()
re #13204 rb4280 zfs receive/rollback deadlock
re #6815 rb1758 need WORM in nza-kernel (4.0)

*** 43,52 ****
--- 43,53 ----
          #include <sys/zfs_znode.h>
          #include <sys/zfs_dir.h>
          #include <sys/zil.h>
          #include <sys/fs/zfs.h>
          #include <sys/dmu.h>
+         #include <sys/dsl_dir.h>
          #include <sys/dsl_prop.h>
          #include <sys/dsl_dataset.h>
          #include <sys/dsl_deleg.h>
          #include <sys/spa.h>
          #include <sys/zap.h>
*** 394,403 ****
--- 395,414 ----
          	zfsvfs_t *zfsvfs = arg;

          	zfsvfs->z_acl_inherit = newval;
          }

+         static void
+         rate_changed_cb(void *arg, uint64_t newval)
+         {
+         	zfsvfs_t *zfsvfs = arg;
+
+         	if (newval == UINT64_MAX)
+         		newval = 0;
+         	zfsvfs->z_rate.rate_cap = newval;
+         }
+
          static int
          zfs_register_callbacks(vfs_t *vfsp)
          {
          	struct dsl_dataset *ds = NULL;
          	objset_t *os = NULL;
*** 531,540 ****
--- 542,554 ----
          	error = error ? error : dsl_prop_register(ds,
          	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
          	    zfsvfs);
          	error = error ? error : dsl_prop_register(ds,
          	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+         	error = error ? error : dsl_prop_register(ds,
+         	    zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
+
          	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
          	if (error)
          		goto unregister;

          	/*
*** 991,1000 ****
--- 1005,1015 ----

          int
          zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
          {
          	int error;
+         	int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));

          	zfsvfs->z_vfs = NULL;
          	zfsvfs->z_parent = zfsvfs;

          	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
*** 1002,1017 ****
          	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
          	    offsetof(znode_t, z_link_node));
          	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
          	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
          	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
!         	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
          		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

          	error = zfsvfs_init(zfsvfs, os);
          	if (error != 0) {
          		*zfvp = NULL;
          		kmem_free(zfsvfs, sizeof (zfsvfs_t));
          		return (error);
          	}

          	*zfvp = zfsvfs;
--- 1017,1037 ----
          	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
          	    offsetof(znode_t, z_link_node));
          	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
          	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
          	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
!         	zfsvfs->z_hold_mtx_sz = size;
!         	zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
!         	for (int i = 0; i != size; i++)
          		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+         	mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
+         	cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);

          	error = zfsvfs_init(zfsvfs, os);
          	if (error != 0) {
          		*zfvp = NULL;
+         		kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
          		kmem_free(zfsvfs, sizeof (zfsvfs_t));
          		return (error);
          	}

          	*zfvp = zfsvfs;
*** 1040,1053 ****
          		/*
          		 * During replay we remove the read only flag to
          		 * allow replays to succeed.
          		 */
          		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
!         		if (readonly != 0)
          			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
!         		else
          			zfs_unlinked_drain(zfsvfs);

          		/*
          		 * Parse and replay the intent log.
          		 *
          		 * Because of ziltest, this must be done after
--- 1060,1074 ----
          		/*
          		 * During replay we remove the read only flag to
          		 * allow replays to succeed.
          		 */
          		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
!         		if (readonly)
          			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
!         		else {
          			zfs_unlinked_drain(zfsvfs);
+         		}

          		/*
          		 * Parse and replay the intent log.
          		 *
          		 * Because of ziltest, this must be done after
*** 1082,1092 ****
          				zil_replay(zfsvfs->z_os, zfsvfs,
          				    zfs_replay_vector);
          				zfsvfs->z_replay = B_FALSE;
          			}
          		}
!         		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
          	}

          	/*
          	 * Set the objset user_ptr to track its zfsvfs.
          	 */
--- 1103,1116 ----
          				zil_replay(zfsvfs->z_os, zfsvfs,
          				    zfs_replay_vector);
          				zfsvfs->z_replay = B_FALSE;
          			}
          		}
!
!         		/* restore readonly bit */
!         		if (readonly)
!         			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
          	}

          	/*
          	 * Set the objset user_ptr to track its zfsvfs.
          	 */
*** 1110,1129 ****
          	 * and invalid after the barrier.
          	 */
          	rw_enter(&zfsvfs_lock, RW_READER);
          	rw_exit(&zfsvfs_lock);

          	zfs_fuid_destroy(zfsvfs);

          	mutex_destroy(&zfsvfs->z_znodes_lock);
          	mutex_destroy(&zfsvfs->z_lock);
          	list_destroy(&zfsvfs->z_all_znodes);
          	rrm_destroy(&zfsvfs->z_teardown_lock);
          	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
          	rw_destroy(&zfsvfs->z_fuid_lock);
!         	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
          		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
          	kmem_free(zfsvfs, sizeof (zfsvfs_t));
          }

          static void
          zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
--- 1134,1160 ----
          	 * and invalid after the barrier.
          	 */
          	rw_enter(&zfsvfs_lock, RW_READER);
          	rw_exit(&zfsvfs_lock);

+         	VERIFY0(zfsvfs->z_znodes_freeing_cnt);
+
          	zfs_fuid_destroy(zfsvfs);

+         	cv_destroy(&zfsvfs->z_drain_cv);
+         	mutex_destroy(&zfsvfs->z_drain_lock);
          	mutex_destroy(&zfsvfs->z_znodes_lock);
          	mutex_destroy(&zfsvfs->z_lock);
          	list_destroy(&zfsvfs->z_all_znodes);
          	rrm_destroy(&zfsvfs->z_teardown_lock);
          	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
          	rw_destroy(&zfsvfs->z_fuid_lock);
!         	for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
          		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+
+         	kmem_free(zfsvfs->z_hold_mtx,
+         	    sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
          	kmem_free(zfsvfs, sizeof (zfsvfs_t));
          }

          static void
          zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
*** 1154,1163 ****
--- 1185,1195 ----
          {
          	dev_t mount_dev;
          	uint64_t recordsize, fsid_guid;
          	int error = 0;
          	zfsvfs_t *zfsvfs;
+         	char worminfo[13] = {0};

          	ASSERT(vfsp);
          	ASSERT(osname);

          	error = zfsvfs_create(osname, &zfsvfs);
*** 1177,1186 ****
--- 1209,1226 ----

          	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
          	    NULL))
          		goto out;

+         	if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
+         	    worminfo[0] && strcmp(worminfo, "0") != 0 &&
+         	    strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
+         		zfsvfs->z_isworm = B_TRUE;
+         	} else {
+         		zfsvfs->z_isworm = B_FALSE;
+         	}
+
          	vfsp->vfs_dev = mount_dev;
          	vfsp->vfs_fstype = zfsfstype;
          	vfsp->vfs_bsize = recordsize;
          	vfsp->vfs_flag |= VFS_NOTRUNC;
          	vfsp->vfs_data = zfsvfs;
*** 1743,1752 ****
--- 1783,1793 ----
          static int
          zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
          {
          	znode_t *zp;

+         	zfs_unlinked_drain_stop_wait(zfsvfs);
          	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

          	if (!unmounting) {
          		/*
          		 * We purge the parent filesystem's vfsp as the parent
*** 1823,1833 ****
          	 * Evict cached data
          	 */
          	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
          	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
          		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
!         	dmu_objset_evict_dbufs(zfsvfs->z_os);

          	return (0);
          }

          /*ARGSUSED*/
--- 1864,1874 ----
          	 * Evict cached data
          	 */
          	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
          	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
          		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
!         	(void) dmu_objset_evict_dbufs(zfsvfs->z_os);

          	return (0);
          }

          /*ARGSUSED*/
*** 1861,1885 ****
          	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
          		return (ret);
          	}

          	if (!(fflag & MS_FORCE)) {
          		/*
          		 * Check the number of active vnodes in the file system.
          		 * Our count is maintained in the vfs structure, but the
          		 * number is off by 1 to indicate a hold on the vfs
          		 * structure itself.
          		 *
          		 * The '.zfs' directory maintains a reference of its
          		 * own, and any active references underneath are
          		 * reflected in the vnode count.
          		 */
          		if (zfsvfs->z_ctldir == NULL) {
!         			if (vfsp->vfs_count > 1)
          				return (SET_ERROR(EBUSY));
          		} else {
!         			if (vfsp->vfs_count > 2 ||
          			    zfsvfs->z_ctldir->v_count > 1)
          				return (SET_ERROR(EBUSY));
          		}
          	}

--- 1902,1934 ----
          	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
          		return (ret);
          	}

          	if (!(fflag & MS_FORCE)) {
+         		uint_t active_vnodes;
+
          		/*
          		 * Check the number of active vnodes in the file system.
          		 * Our count is maintained in the vfs structure, but the
          		 * number is off by 1 to indicate a hold on the vfs
          		 * structure itself.
          		 *
          		 * The '.zfs' directory maintains a reference of its
          		 * own, and any active references underneath are
          		 * reflected in the vnode count.
+         		 *
+         		 * Active vnodes: vnodes that were held by an user
          		 */
+
+         		active_vnodes =
+         		    vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;
+
          		if (zfsvfs->z_ctldir == NULL) {
!         			if (active_vnodes > 1)
          				return (SET_ERROR(EBUSY));
          		} else {
!         			if (active_vnodes > 2 ||
          			    zfsvfs->z_ctldir->v_count > 1)
          				return (SET_ERROR(EBUSY));
          		}
          	}

*** 2012,2023 ****
          int
          zfs_suspend_fs(zfsvfs_t *zfsvfs)
          {
          	int error;

!         	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
          		return (error);

          	return (0);
          }

          /*
--- 2061,2084 ----
          int
          zfs_suspend_fs(zfsvfs_t *zfsvfs)
          {
          	int error;

!         	mutex_enter(&zfsvfs->z_lock);
!         	if (zfsvfs->z_busy) {
!         		mutex_exit(&zfsvfs->z_lock);
!         		return (SET_ERROR(EBUSY));
!         	}
!         	zfsvfs->z_busy = B_TRUE;
!         	mutex_exit(&zfsvfs->z_lock);
!
!         	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
!         		mutex_enter(&zfsvfs->z_lock);
!         		zfsvfs->z_busy = B_FALSE;
!         		mutex_exit(&zfsvfs->z_lock);
          		return (error);
+         	}

          	return (0);
          }

          /*
*** 2064,2073 ****
--- 2125,2144 ----
          	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
          		(void) zfs_rezget(zp);
          	}
          	mutex_exit(&zfsvfs->z_znodes_lock);

+         	if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
+         	    !zfsvfs->z_unmounted) {
+         		/*
+         		 * zfs_suspend_fs() could have interrupted freeing
+         		 * of dnodes. We need to restart this freeing so
+         		 * that we don't "leak" the space.
+         		 */
+         		zfs_unlinked_drain(zfsvfs);
+         	}
+
          bail:
          	/* release the VOPs */
          	rw_exit(&zfsvfs->z_teardown_inactive_lock);
          	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

*** 2077,2086 ****
          		 * unmount this file system.
          		 */
          		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
          			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
          	}
          	return (err);
          }

          static void
          zfs_freevfs(vfs_t *vfsp)
--- 2148,2161 ----
          		 * unmount this file system.
          		 */
          		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
          			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
          	}
+         	mutex_enter(&zfsvfs->z_lock);
+         	zfsvfs->z_busy = B_FALSE;
+         	mutex_exit(&zfsvfs->z_lock);
+
          	return (err);
          }

          static void
          zfs_freevfs(vfs_t *vfsp)
*** 2230,2239 ****
--- 2305,2315 ----
          	    "from %llu to %llu", zfsvfs->z_version, newvers);

          	dmu_tx_commit(tx);

          	zfsvfs->z_version = newvers;
+         	os->os_version = newvers;

          	zfs_set_fuid_feature(zfsvfs);

          	return (0);
          }
*** 2242,2262 ****
           * Read a property stored within the master node.
           */
          int
          zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
          {
!         	const char *pname;
!         	int error = ENOENT;

          	/*
!         	 * Look up the file system's value for the property. For the
!         	 * version property, we look up a slightly different string.
          	 */
!         	if (prop == ZFS_PROP_VERSION)
          		pname = ZPL_VERSION_STR;
!         	else
          		pname = zfs_prop_to_name(prop);

          	if (os != NULL) {
          		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
          		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
          	}
--- 2318,2368 ----
           * Read a property stored within the master node.
           */
          int
          zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
          {
!         	uint64_t *cached_copy = NULL;

          	/*
!         	 * Figure out where in the objset_t the cached copy would live, if it
!         	 * is available for the requested property.
          	 */
!         	if (os != NULL) {
!         		switch (prop) {
!         		case ZFS_PROP_VERSION:
!         			cached_copy = &os->os_version;
!         			break;
!         		case ZFS_PROP_NORMALIZE:
!         			cached_copy = &os->os_normalization;
!         			break;
!         		case ZFS_PROP_UTF8ONLY:
!         			cached_copy = &os->os_utf8only;
!         			break;
!         		case ZFS_PROP_CASE:
!         			cached_copy = &os->os_casesensitivity;
!         			break;
!         		default:
!         			break;
!         		}
!         	}
!         	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
!         		*value = *cached_copy;
!         		return (0);
!         	}
!
!         	/*
!         	 * If the property wasn't cached, look up the file system's value for
!         	 * the property. For the version property, we look up a slightly
!         	 * different string.
!         	 */
!         	const char *pname;
!         	int error = ENOENT;
!         	if (prop == ZFS_PROP_VERSION) {
          		pname = ZPL_VERSION_STR;
!         	} else {
          		pname = zfs_prop_to_name(prop);
+         	}

          	if (os != NULL) {
          		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
          		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
          	}
*** 2277,2286 ****
--- 2383,2401 ----
          		default:
          			return (error);
          		}
          		error = 0;
          	}
+
+         	/*
+         	 * If one of the methods for getting the property value above worked,
+         	 * copy it into the objset_t's cache.
+         	 */
+         	if (error == 0 && cached_copy != NULL) {
+         		*cached_copy = *value;
+         	}
+
          	return (error);
          }

          /*
           * Return true if the coresponding vfs's unmounted flag is set.