@@ -18,11 +18,11 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2015 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>

@@ -54,29 +54,19 @@
 #include <sys/fs/tmpnode.h>
 
 static int tmpfsfstype;
 
 /*
- * tmpfs_mountcount is used to prevent module unloads while there is still
- * state from a former mount hanging around. With forced umount support, the
- * filesystem module must not be allowed to go away before the last
- * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
- * there's no need for locking.
- */
-static uint32_t tmpfs_mountcount;
-
-/*
  * tmpfs vfs operations.
  */
 static int tmpfsinit(int, char *);
 static int tmp_mount(struct vfs *, struct vnode *,
         struct mounta *, struct cred *);
 static int tmp_unmount(struct vfs *, int, struct cred *);
 static int tmp_root(struct vfs *, struct vnode **);
 static int tmp_statvfs(struct vfs *, struct statvfs64 *);
 static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
-static void tmp_freevfs(vfs_t *vfsp);
 
 /*
  * Loadable module wrapper
  */
 #include <sys/modctl.h>

@@ -131,18 +121,10 @@
 int
 _fini()
 {
         int error;
 
-        /*
-         * If a forceably unmounted instance is still hanging around, we cannot
-         * allow the module to be unloaded because that would cause panics once
-         * the VFS framework decides it's time to call into VFS_FREEVFS().
-         */
-        if (tmpfs_mountcount)
-                return (EBUSY);
-
         error = mod_remove(&modlinkage);
         if (error)
                 return (error);
         /*
          * Tear down the operations vectors

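The two hunks above back out a simple lifetime guard: an atomic, module-global
mount counter that kept _fini() from unloading tmpfs while a forcibly
unmounted instance was still waiting for its VFS_FREEVFS() callback.  A
minimal sketch of that pattern, using purely illustrative names (this is not
the tmpfs code being changed here):

        /* Sketch only: refuse to unload while any mount teardown is pending. */
        static volatile uint32_t example_mountcount;    /* one per live mount */

        int
        example_fini(void)
        {
                /* A forced unmount may still be awaiting VFS_FREEVFS(). */
                if (example_mountcount != 0)
                        return (EBUSY);
                return (mod_remove(&modlinkage));
        }
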
@@ -157,19 +139,29 @@
 {
         return (mod_info(&modlinkage, modinfop));
 }
 
 /*
+ * The following are patchable variables limiting the amount of system
+ * resources tmpfs can use.
+ *
+ * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
 + * tmpfs can use for its data structures (e.g. tmpnodes, directory entries).
 + * It is not a hard limit; rather, it is set to a percentage of physical
 + * memory, determined when tmpfs is first used in the system.
+ *
  * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
  * the rest of the system.  In other words, if the amount of free swap space
  * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
  * anon allocations will fail.
  *
  * There is also a per mount limit on the amount of swap space
  * (tmount.tm_anonmax) settable via a mount option.
  */
+size_t tmpfs_maxkmem = 0;
 size_t tmpfs_minfree = 0;
+size_t tmp_kmemspace;           /* bytes of kernel heap used by all tmpfs */
 
 static major_t tmpfs_major;
 static minor_t tmpfs_minor;
 static kmutex_t tmpfs_minor_lock;
 

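The restored comment and variables above define a budget: tmpfs_maxkmem caps
the kernel heap tmpfs may consume, and tmp_kmemspace tracks how much of that
budget is currently in use; tmpfsinit() below seeds the cap from
kmem_maxavail() / TMPMAXFRACKMEM, so with, say, 1 GB of available kernel
memory and a divisor of 25 (used here purely for illustration) the budget
would land around 40 MB.  A hedged sketch of how an allocator wrapper such as
tmp_memalloc() presumably enforces the limit (the real routine lives elsewhere
in tmpfs and may differ in detail):

        /* Sketch only -- not the actual tmp_memalloc() implementation. */
        void *
        example_tmp_memalloc(size_t size, int flag)
        {
                if (tmp_kmemspace + size < tmpfs_maxkmem) {
                        void *buf = kmem_zalloc(size,
                            (flag & TMP_MUSTHAVE) ? KM_SLEEP : KM_NOSLEEP);
                        if (buf != NULL)
                                atomic_add_long((ulong_t *)&tmp_kmemspace, size);
                        return (buf);
                }
                /* Over budget: fail unless the caller must have the memory. */
                if ((flag & TMP_MUSTHAVE) == 0)
                        return (NULL);
                atomic_add_long((ulong_t *)&tmp_kmemspace, size);
                return (kmem_zalloc(size, KM_SLEEP));
        }
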
@@ -184,11 +176,10 @@
                 VFSNAME_MOUNT,          { .vfs_mount = tmp_mount },
                 VFSNAME_UNMOUNT,        { .vfs_unmount = tmp_unmount },
                 VFSNAME_ROOT,           { .vfs_root = tmp_root },
                 VFSNAME_STATVFS,        { .vfs_statvfs = tmp_statvfs },
                 VFSNAME_VGET,           { .vfs_vget = tmp_vget },
-                VFSNAME_FREEVFS,        { .vfs_freevfs = tmp_freevfs },
                 NULL,                   NULL
         };
         int error;
         extern  void    tmpfs_hash_init();
 

@@ -219,16 +210,22 @@
                  * Set if not patched
                  */
                 tmpfs_minfree = btopr(TMPMINFREE);
         }
 
+        /*
 +         * The maximum amount of space tmpfs can allocate is a fixed
 +         * fraction (1/TMPMAXFRACKMEM) of available kernel memory
+         */
+        if (tmpfs_maxkmem == 0)
+                tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
+
         if ((tmpfs_major = getudev()) == (major_t)-1) {
                 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
                 tmpfs_major = 0;
         }
         mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
-        tmpfs_mountcount = 0;
         return (0);
 }
 
 static int
 tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)

@@ -235,11 +232,11 @@
 {
         struct tmount *tm = NULL;
         struct tmpnode *tp;
         struct pathname dpn;
         int error;
-        size_t anonmax;
+        pgcnt_t anonmax;
         struct vattr rattr;
         int got_attrs;
         boolean_t mode_arg = B_FALSE;
         mode_t root_mode = 0777;
         char *argstr;

@@ -279,11 +276,11 @@
          */
         if (vfs_optionisset(vfsp, "size", &argstr)) {
                 if ((error = tmp_convnum(argstr, &anonmax)) != 0)
                         goto out;
         } else {
-                anonmax = SIZE_MAX;
+                anonmax = ULONG_MAX;
         }
 
         /*
          * The "mode" mount argument allows the operator to override the
          * permissions of the root of the tmpfs mount.

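For context on the restored pgcnt_t type: tmp_convnum() presumably converts
the "size" string to a page count here, matching tm_anonmax, and a mount
without the option falls back to ULONG_MAX, i.e. effectively unlimited.  As a
purely illustrative invocation (not taken from this change), a cap would
typically be requested at mount time with something like
"mount -F tmpfs -o size=256m swap /tmp".
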
@@ -312,12 +309,11 @@
                 tm->tm_anonmax = anonmax;
                 mutex_exit(&tm->tm_contents);
                 goto out;
         }
 
-        if ((tm = kmem_zalloc(sizeof (struct tmount),
-            KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+        if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
                 pn_free(&dpn);
                 error = ENOMEM;
                 goto out;
         }
 

@@ -345,41 +341,21 @@
         vfsp->vfs_fstype = tmpfsfstype;
         vfsp->vfs_dev = tm->tm_dev;
         vfsp->vfs_bsize = PAGESIZE;
         vfsp->vfs_flag |= VFS_NOTRUNC;
         vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
-        tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+        tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
         (void) strcpy(tm->tm_mntpath, dpn.pn_path);
 
         /*
-         * Preemptively set vfs_zone before any of the tmp_kmem_* functions are
-         * called.  That field is not populated until after a successful
-         * VFS_MOUNT when domount() sets vfsp metadata via vfs_add().  An
-         * accurate value is required for proper swap usage accounting.
-         */
-        ASSERT0(uap->flags & MS_REMOUNT);
-        ASSERT(vfsp->vfs_zone == NULL);
-        vfsp->vfs_zone = curproc->p_zone;
-
-        /*
          * allocate and initialize root tmpnode structure
          */
         bzero(&rattr, sizeof (struct vattr));
         rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
         rattr.va_type = VDIR;
         rattr.va_rdev = 0;
-        tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
-        if (tp == NULL) {
-                kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
-                mutex_destroy(&tm->tm_contents);
-                mutex_destroy(&tm->tm_renamelck);
-                kmem_free(tm, sizeof (struct tmount));
-
-                pn_free(&dpn);
-                error = ENOMEM;
-                goto out;
-        }
+        tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
         tmpnode_init(tm, tp, &rattr, cr);
 
         /*
          * Get the mode, uid, and gid from the underlying mount point.
          */

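Worth noting in the hunk above: the mount-path buffer and the root tmpnode
are allocated with TMP_MUSTHAVE, which in the sketch given earlier corresponds
to a sleeping allocation that does not fail, so the ENOMEM unwinding removed
here is no longer needed; the struct tmount itself, allocated earlier with a
flag of 0, remains the one allocation on this path that can still return NULL
and produce ENOMEM.
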
@@ -414,38 +390,16 @@
         tp->tn_back = tp;
         tp->tn_forw = NULL;
         tp->tn_nlink = 0;
         tm->tm_rootnode = tp;
 
-        if (tdirinit(tp, tp) != 0) {
-                /*
-                 * While we would normally let our VOP_INACTIVE function take
-                 * care of cleaning up here, we're in a bit of a delicate
-                 * situation, so we do so manually. While it's tempting to try
-                 * and rely upon tmpfs_freevfs() and others, it's probably safer
-                 * for the time to do this manually at the cost of duplication.
-                 */
-                vn_invalid(TNTOV(tp));
-                rw_destroy(&tp->tn_rwlock);
-                mutex_destroy(&tp->tn_tlock);
-                vn_free(TNTOV(tp));
-                tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
+        tdirinit(tp, tp);
 
-                kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
-                mutex_destroy(&tm->tm_contents);
-                mutex_destroy(&tm->tm_renamelck);
-                kmem_free(tm, sizeof (struct tmount));
-                pn_free(&dpn);
-                error = ENOMEM;
-                goto out;
-        }
-
         rw_exit(&tp->tn_rwlock);
 
         pn_free(&dpn);
         error = 0;
-        atomic_inc_32(&tmpfs_mountcount);
 
 out:
         if (error == 0)
                 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
 

@@ -457,184 +411,67 @@
 {
         struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
         struct tmpnode *tnp, *cancel;
         struct vnode    *vp;
         int error;
-        uint_t cnt;
-        int i;
 
         if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
                 return (error);
 
+        /*
 +         * Forced unmount is not supported by this file system,
 +         * so ENOTSUP is returned.
+         */
+        if (flag & MS_FORCE)
+                return (ENOTSUP);
+
         mutex_enter(&tm->tm_contents);
 
         /*
-         * In the normal unmount case (non-forced unmount), if there are no
-         * open files, only the root node should have a reference count.
-         *
+         * If there are no open files, only the root node should have
+         * a reference count.
          * With tm_contents held, nothing can be added or removed.
          * There may be some dirty pages.  To prevent fsflush from
          * disrupting the unmount, put a hold on each node while scanning.
          * If we find a previously referenced node, undo the holds we have
          * placed and fail EBUSY.
-         *
-         * However, in the case of a forced umount, things are a bit different.
-         * An additional VFS_HOLD is added for each outstanding VN_HOLD to
-         * ensure that the file system is not cleaned up (tmp_freevfs) until
-         * the last vfs hold is dropped. This happens in tmp_inactive as the
-         * vnodes are released. Also, we can't add an additional VN_HOLD in
-         * this case since that would prevent tmp_inactive from ever being
-         * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
-         * so that the zone is not blocked waiting for the final file system
-         * cleanup.
          */
         tnp = tm->tm_rootnode;
-
-        vp = TNTOV(tnp);
-        mutex_enter(&vp->v_lock);
-        cnt = vp->v_count;
-        if (flag & MS_FORCE) {
-                vfsp->vfs_flag |= VFS_UNMOUNTED;
-                /* Extra hold which we rele below when we drop the zone ref */
-                VFS_HOLD(vfsp);
-
-                for (i = 1; i < cnt; i++)
-                        VFS_HOLD(vfsp);
-
-                /* drop the mutex now because no one can find this mount */
+        if (TNTOV(tnp)->v_count > 1) {
                 mutex_exit(&tm->tm_contents);
-        } else if (cnt > 1) {
-                mutex_exit(&vp->v_lock);
-                mutex_exit(&tm->tm_contents);
                 return (EBUSY);
         }
-        mutex_exit(&vp->v_lock);
 
-        /*
-         * Check for open files. An open file causes everything to unwind
-         * unless this is a forced umount.
-         */
         for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
-                vp = TNTOV(tnp);
-                mutex_enter(&vp->v_lock);
-                cnt = vp->v_count;
-                if (flag & MS_FORCE) {
-                        for (i = 0; i < cnt; i++)
-                                VFS_HOLD(vfsp);
-
-                        /*
-                         * In the case of a forced umount don't add an
-                         * additional VN_HOLD on the already held vnodes, like
-                         * we do in the non-forced unmount case. If the
-                         * cnt > 0, then the vnode already has at least one
-                         * hold and we need tmp_inactive to get called when the
-                         * last pre-existing hold on the node is released so
-                         * that we can VFS_RELE the VFS holds we just added.
-                         */
-                        if (cnt == 0) {
-                                /* directly add VN_HOLD since have the lock */
-                                vp->v_count++;
-                        }
-
-                        mutex_exit(&vp->v_lock);
-
-                        /*
-                         * If the tmpnode has any pages associated with it
-                         * (i.e. if it's a normal file with non-zero size), the
-                         * tmpnode could still be discovered by pageout or
-                         * fsflush via the page vnode pointers. To prevent this
-                         * from interfering with the tmp_freevfs, truncate the
-                         * tmpnode now.
-                         */
-                        if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
-                                rw_enter(&tnp->tn_rwlock, RW_WRITER);
-                                rw_enter(&tnp->tn_contents, RW_WRITER);
-
-                                (void) tmpnode_trunc(tm, tnp, 0);
-
-                                rw_exit(&tnp->tn_contents);
-                                rw_exit(&tnp->tn_rwlock);
-
-                                ASSERT(tnp->tn_size == 0);
-                                ASSERT(tnp->tn_nblocks == 0);
-                        }
-                } else if (cnt > 0) {
-                        /* An open file; unwind the holds we've been adding. */
-                        mutex_exit(&vp->v_lock);
+                if ((vp = TNTOV(tnp))->v_count > 0) {
                         cancel = tm->tm_rootnode->tn_forw;
                         while (cancel != tnp) {
                                 vp = TNTOV(cancel);
                                 ASSERT(vp->v_count > 0);
                                 VN_RELE(vp);
                                 cancel = cancel->tn_forw;
                         }
                         mutex_exit(&tm->tm_contents);
                         return (EBUSY);
-                } else {
-                        /* directly add a VN_HOLD since we have the lock */
-                        vp->v_count++;
-                        mutex_exit(&vp->v_lock);
                 }
+                VN_HOLD(vp);
         }
 
-        if (flag & MS_FORCE) {
                 /*
-                 * Drop the zone ref now since we don't know how long it will
-                 * be until the final vfs_rele is called by tmp_inactive.
+         * We can drop the mutex now because no one can find this mount
                  */
-                if (vfsp->vfs_zone) {
-                        zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
-                            ZONE_REF_VFS);
-                        vfsp->vfs_zone = 0;
-                }
-                /* We can now drop the extra hold we added above. */
-                VFS_RELE(vfsp);
-        } else {
-                /*
-                 * For the non-forced case, we can drop the mutex now because
-                 * no one can find this mount anymore
-                 */
-                vfsp->vfs_flag |= VFS_UNMOUNTED;
                 mutex_exit(&tm->tm_contents);
-        }
 
-        return (0);
-}
-
-/*
- * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
- * the vfs framework after umount and the last VFS_RELE, to trigger the release
- * of any resources still associated with the given vfs_t. We only add
- * additional VFS_HOLDs during the forced umount case, so this is normally
- * called immediately after tmp_umount.
- */
-void
-tmp_freevfs(vfs_t *vfsp)
-{
-        struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
-        struct tmpnode *tnp;
-        struct vnode    *vp;
-
         /*
          * Free all kmemalloc'd and anonalloc'd memory associated with
          * this filesystem.  To do this, we go through the file list twice,
          * once to remove all the directory entries, and then to remove
          * all the files.  We do this because there is useful code in
          * tmpnode_free which assumes that the directory entry has been
          * removed before the file.
          */
-
         /*
-         * Now that we are tearing ourselves down we need to remove the
-         * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
-         * files from the system causing us to have a negative value. Doing this
-         * seems a bit better than trying to set a flag on the tmount that says
-         * we're tearing down.
-         */
-        vfsp->vfs_flag &= ~VFS_UNMOUNTED;
-
-        /*
          * Remove all directory entries
          */
         for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) {
                 rw_enter(&tnp->tn_rwlock, RW_WRITER);
                 if (tnp->tn_type == VDIR)

@@ -696,20 +533,19 @@
         tm->tm_rootnode->tn_xattrdp = NULL;
         VN_RELE(TNTOV(tm->tm_rootnode));
 
         ASSERT(tm->tm_mntpath);
 
-        kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+        tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
 
         ASSERT(tm->tm_anonmem == 0);
 
         mutex_destroy(&tm->tm_contents);
         mutex_destroy(&tm->tm_renamelck);
-        kmem_free(tm, sizeof (struct tmount));
+        tmp_memfree(tm, sizeof (struct tmount));
 
-        /* Allow _fini() to succeed now */
-        atomic_dec_32(&tmpfs_mountcount);
+        return (0);
 }
 
 /*
  * return root tmpnode for given vnode
  */

@@ -767,23 +603,22 @@
 
         /*
          * If tm_anonmax for this mount is less than the available swap space
          * (minus the amount tmpfs can't use), use that instead
          */
-        if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) {
+        if (blocks > tmpfs_minfree)
                 sbp->f_bfree = MIN(blocks - tmpfs_minfree,
-                    btop(tm->tm_anonmax) - btopr(tm->tm_anonmem));
-        } else {
+                    tm->tm_anonmax - tm->tm_anonmem);
+        else
                 sbp->f_bfree = 0;
-        }
 
         sbp->f_bavail = sbp->f_bfree;
 
         /*
          * Total number of blocks is what's available plus what's been used
          */
-        sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem));
+        sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem);
 
         if (eff_zid != GLOBAL_ZONEUNIQID &&
             zp->zone_max_swap_ctl != UINT64_MAX) {
                 /*
                  * If the fs is used by a non-global zone with a swap cap,

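A worked illustration of the restored f_bfree arithmetic, using made-up
numbers: with 10000 pages of free swap visible to tmpfs, tmpfs_minfree of
2000 pages, tm_anonmax of 5000 pages and tm_anonmem of 1000 pages, the mount
reports f_bfree = MIN(10000 - 2000, 5000 - 1000) = 4000 pages, i.e. whichever
of the system-wide reserve and the per-mount size cap is the tighter limit;
once free swap drops to tmpfs_minfree or below, f_bfree is 0.
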
@@ -809,12 +644,18 @@
          * The maximum number of files available is approximately the number
          * of tmpnodes we can allocate from the remaining kernel memory
          * available to tmpfs.  This is fairly inaccurate since it doesn't
          * take into account the names stored in the directory entries.
          */
-        sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+        if (tmpfs_maxkmem > tmp_kmemspace)
+                sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
             (sizeof (struct tmpnode) + sizeof (struct tdirent));
+        else
+                sbp->f_ffree = 0;
+
+        sbp->f_files = tmpfs_maxkmem /
+            (sizeof (struct tmpnode) + sizeof (struct tdirent));
         sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
         (void) cmpldev(&d32, vfsp->vfs_dev);
         sbp->f_fsid = d32;
         (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name);
         (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr));
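
Similarly, the restored file counts are derived from the kernel-memory budget
rather than from availrmem: f_files divides tmpfs_maxkmem by the per-file
overhead (one struct tmpnode plus one struct tdirent), and f_ffree applies the
same divisor to whatever part of the budget tmp_kmemspace has not yet
consumed, clamping at 0 once the budget is exhausted.  With made-up numbers, a
64 MB budget, 16 MB already in use and roughly 400 bytes of overhead per file
would give f_files of about 168000 and f_ffree of about 126000; as the
surrounding comment notes, this ignores the names stored in directory entries
and is only approximate.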