NEX-19394 backport 9337 zfs get all is slow due to uncached metadata
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Thomas Caputi <tcaputi@datto.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
 Conflicts:
  usr/src/uts/common/fs/zfs/dbuf.c
  usr/src/uts/common/fs/zfs/dmu.c
  usr/src/uts/common/fs/zfs/sys/dmu_objset.h
NEX-9200 Improve the scalability of attribute locking in zfs_zget
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-9436 Rate limiting controls (was QoS) per ZFS dataset, updates from demo
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-8972 Async-delete side-effect that may cause unmount EBUSY
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-8852 Quality-of-Service (QoS) controls per NFS share
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-5085 implement async delete for large files
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
NEX-3762 Appliance crashes with a NULL pointer dereference during a zpool export when a zfs_vn_rele_taskq thread attempts to check a bogus rwlock from rw_write_held
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
6160 /usr/lib/fs/zfs/bootinstall should use bootadm
Reviewed by: Igor Kozhukhov <ikozhukhov@gmail.com>
Reviewed by: Adam Števko <adam.stevko@gmail.com>
Reviewed by: Josef Sipek <jeffpc@josefsipek.net>
Approved by: Richard Lowe <richlowe@richlowe.net>
4185 add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R (NULL is not an int)
6171 dsl_prop_unregister() slows down dataset eviction.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
NEX-4582 update wrc test cases for allow to use write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3485 Deferred deletes causing loss of service for NFS clients on cluster failover
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-2965 4.0.3-FP2: deferred deletes causing RSF import failure during fail-over of service
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
re #13253 rb4328 ssh: openssl version checking needs updating
re #11441 rb4292 panic in apic_record_rdt_entry on VMware hardware version 9
re #12619, rb4287 Deadlocked zfs txg processing in dsl_sync_task_group_sync()
re #13204 rb4280 zfs receive/rollback deadlock
re #6815 rb1758 need WORM in nza-kernel (4.0)
        
@@ -43,10 +43,11 @@
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
+#include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
@@ -394,10 +395,20 @@
         zfsvfs_t *zfsvfs = arg;
 
         zfsvfs->z_acl_inherit = newval;
 }
 
+static void
+rate_changed_cb(void *arg, uint64_t newval)
+{
+        zfsvfs_t *zfsvfs = arg;
+
+        if (newval == UINT64_MAX)
+                newval = 0;
+        zfsvfs->z_rate.rate_cap = newval;
+}
+
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
         struct dsl_dataset *ds = NULL;
         objset_t *os = NULL;
@@ -531,10 +542,13 @@
         error = error ? error : dsl_prop_register(ds,
             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
             zfsvfs);
         error = error ? error : dsl_prop_register(ds,
             zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+        error = error ? error : dsl_prop_register(ds,
+            zfs_prop_to_name(ZFS_PROP_RATE_LIMIT), rate_changed_cb, zfsvfs);
+
         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
         if (error)
                 goto unregister;
 
         /*
@@ -991,10 +1005,11 @@
 
 int
 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 {
         int error;
+        int size = spa_get_obj_mtx_sz(dmu_objset_spa(os));
 
         zfsvfs->z_vfs = NULL;
         zfsvfs->z_parent = zfsvfs;
 
         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1002,16 +1017,21 @@
         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
             offsetof(znode_t, z_link_node));
         rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
         rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
-        for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+        zfsvfs->z_hold_mtx_sz = size;
+        zfsvfs->z_hold_mtx = kmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+        for (int i = 0; i != size; i++)
                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+        mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL);
+        cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL);
 
         error = zfsvfs_init(zfsvfs, os);
         if (error != 0) {
                 *zfvp = NULL;
+                kmem_free(zfsvfs->z_hold_mtx, sizeof (kmutex_t) * size);
                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
                 return (error);
         }
 
         *zfvp = zfsvfs;
@@ -1040,14 +1060,15 @@
                 /*
                  * During replay we remove the read only flag to
                  * allow replays to succeed.
                  */
                 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
-                if (readonly != 0)
+                if (readonly)
                         zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
-                else
+                else {
                         zfs_unlinked_drain(zfsvfs);
+                }
 
                 /*
                  * Parse and replay the intent log.
                  *
                  * Because of ziltest, this must be done after
@@ -1082,11 +1103,14 @@
                                 zil_replay(zfsvfs->z_os, zfsvfs,
                                     zfs_replay_vector);
                                 zfsvfs->z_replay = B_FALSE;
                         }
                 }
-                zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
+
+                /* restore readonly bit */
+                if (readonly)
+                        zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
         }
 
         /*
          * Set the objset user_ptr to track its zfsvfs.
          */
@@ -1110,20 +1134,27 @@
          * and invalid after the barrier.
          */
         rw_enter(&zfsvfs_lock, RW_READER);
         rw_exit(&zfsvfs_lock);
 
+        VERIFY0(zfsvfs->z_znodes_freeing_cnt);
+
         zfs_fuid_destroy(zfsvfs);
 
+        cv_destroy(&zfsvfs->z_drain_cv);
+        mutex_destroy(&zfsvfs->z_drain_lock);
         mutex_destroy(&zfsvfs->z_znodes_lock);
         mutex_destroy(&zfsvfs->z_lock);
         list_destroy(&zfsvfs->z_all_znodes);
         rrm_destroy(&zfsvfs->z_teardown_lock);
         rw_destroy(&zfsvfs->z_teardown_inactive_lock);
         rw_destroy(&zfsvfs->z_fuid_lock);
-        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+        for (i = 0; i != zfsvfs->z_hold_mtx_sz; i++)
                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+
+        kmem_free(zfsvfs->z_hold_mtx,
+            sizeof (kmutex_t) * zfsvfs->z_hold_mtx_sz);
         kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
@@ -1154,10 +1185,11 @@
 {
         dev_t mount_dev;
         uint64_t recordsize, fsid_guid;
         int error = 0;
         zfsvfs_t *zfsvfs;
+        char    worminfo[13] = {0};
 
         ASSERT(vfsp);
         ASSERT(osname);
 
         error = zfsvfs_create(osname, &zfsvfs);
@@ -1177,10 +1209,18 @@
 
         if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
             NULL))
                 goto out;
 
+        if (dsl_prop_get(osname, "nms:worm", 1, 12, &worminfo, NULL) == 0 &&
+            worminfo[0] && strcmp(worminfo, "0") != 0 &&
+            strcmp(worminfo, "off") != 0 && strcmp(worminfo, "-") != 0) {
+                zfsvfs->z_isworm = B_TRUE;
+        } else {
+                zfsvfs->z_isworm = B_FALSE;
+        }
+
         vfsp->vfs_dev = mount_dev;
         vfsp->vfs_fstype = zfsfstype;
         vfsp->vfs_bsize = recordsize;
         vfsp->vfs_flag |= VFS_NOTRUNC;
         vfsp->vfs_data = zfsvfs;
@@ -1743,10 +1783,11 @@
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
         znode_t *zp;
 
+        zfs_unlinked_drain_stop_wait(zfsvfs);
         rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
         if (!unmounting) {
                 /*
                  * We purge the parent filesystem's vfsp as the parent
@@ -1823,11 +1864,11 @@
          * Evict cached data
          */
         if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
             !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-        dmu_objset_evict_dbufs(zfsvfs->z_os);
+        (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
 
         return (0);
 }
 
 /*ARGSUSED*/
@@ -1861,25 +1902,33 @@
             (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
                 return (ret);
         }
 
         if (!(fflag & MS_FORCE)) {
+                uint_t active_vnodes;
+
                 /*
                  * Check the number of active vnodes in the file system.
                  * Our count is maintained in the vfs structure, but the
                  * number is off by 1 to indicate a hold on the vfs
                  * structure itself.
                  *
                  * The '.zfs' directory maintains a reference of its
                  * own, and any active references underneath are
                  * reflected in the vnode count.
+                 *
+                 * Active vnodes: vnodes that were held by an user
                  */
+
+                active_vnodes =
+                    vfsp->vfs_count - zfsvfs->z_znodes_freeing_cnt;
+
                 if (zfsvfs->z_ctldir == NULL) {
-                        if (vfsp->vfs_count > 1)
+                        if (active_vnodes > 1)
                                 return (SET_ERROR(EBUSY));
                 } else {
-                        if (vfsp->vfs_count > 2 ||
+                        if (active_vnodes > 2 ||
                             zfsvfs->z_ctldir->v_count > 1)
                                 return (SET_ERROR(EBUSY));
                 }
         }
 
@@ -2012,12 +2061,24 @@
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
         int error;
 
-        if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+        mutex_enter(&zfsvfs->z_lock);
+        if (zfsvfs->z_busy) {
+                mutex_exit(&zfsvfs->z_lock);
+                return (SET_ERROR(EBUSY));
+        }
+        zfsvfs->z_busy = B_TRUE;
+        mutex_exit(&zfsvfs->z_lock);
+
+        if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
+                mutex_enter(&zfsvfs->z_lock);
+                zfsvfs->z_busy = B_FALSE;
+                mutex_exit(&zfsvfs->z_lock);
                 return (error);
+        }
 
         return (0);
 }
 
 /*
@@ -2064,10 +2125,20 @@
             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
                 (void) zfs_rezget(zp);
         }
         mutex_exit(&zfsvfs->z_znodes_lock);
 
+        if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
+            !zfsvfs->z_unmounted) {
+                /*
+                 * zfs_suspend_fs() could have interrupted freeing
+                 * of dnodes. We need to restart this freeing so
+                 * that we don't "leak" the space.
+                 */
+                zfs_unlinked_drain(zfsvfs);
+        }
+
 bail:
         /* release the VOPs */
         rw_exit(&zfsvfs->z_teardown_inactive_lock);
         rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
@@ -2077,10 +2148,14 @@
                  * unmount this file system.
                  */
                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
         }
+        mutex_enter(&zfsvfs->z_lock);
+        zfsvfs->z_busy = B_FALSE;
+        mutex_exit(&zfsvfs->z_lock);
+
         return (err);
 }
 
 static void
 zfs_freevfs(vfs_t *vfsp)
@@ -2230,10 +2305,11 @@
             "from %llu to %llu", zfsvfs->z_version, newvers);
 
         dmu_tx_commit(tx);
 
         zfsvfs->z_version = newvers;
+        os->os_version = newvers;
 
         zfs_set_fuid_feature(zfsvfs);
 
         return (0);
 }
@@ -2242,21 +2318,51 @@
  * Read a property stored within the master node.
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
-        const char *pname;
-        int error = ENOENT;
+        uint64_t *cached_copy = NULL;
 
         /*
-         * Look up the file system's value for the property.  For the
-         * version property, we look up a slightly different string.
+         * Figure out where in the objset_t the cached copy would live, if it
+         * is available for the requested property.
          */
-        if (prop == ZFS_PROP_VERSION)
+        if (os != NULL) {
+                switch (prop) {
+                case ZFS_PROP_VERSION:
+                        cached_copy = &os->os_version;
+                        break;
+                case ZFS_PROP_NORMALIZE:
+                        cached_copy = &os->os_normalization;
+                        break;
+                case ZFS_PROP_UTF8ONLY:
+                        cached_copy = &os->os_utf8only;
+                        break;
+                case ZFS_PROP_CASE:
+                        cached_copy = &os->os_casesensitivity;
+                        break;
+                default:
+                        break;
+                }
+        }
+        if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+                *value = *cached_copy;
+                return (0);
+        }
+
+        /*
+         * If the property wasn't cached, look up the file system's value for
+         * the property. For the version property, we look up a slightly
+         * different string.
+         */
+        const char *pname;
+        int error = ENOENT;
+        if (prop == ZFS_PROP_VERSION) {
                 pname = ZPL_VERSION_STR;
-        else
+        } else {
                 pname = zfs_prop_to_name(prop);
+        }
 
         if (os != NULL) {
                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
         }
@@ -2277,10 +2383,19 @@
                 default:
                         return (error);
                 }
                 error = 0;
         }
+
+        /*
+         * If one of the methods for getting the property value above worked,
+         * copy it into the objset_t's cache.
+         */
+        if (error == 0 && cached_copy != NULL) {
+                *cached_copy = *value;
+        }
+
         return (error);
 }
 
 /*
  * Return true if the coresponding vfs's unmounted flag is set.