1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2016 Joyent, Inc.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/param.h>
  28 #include <sys/sysmacros.h>
  29 #include <sys/kmem.h>
  30 #include <sys/time.h>
  31 #include <sys/pathname.h>
  32 #include <sys/vfs.h>
  33 #include <sys/vfs_opreg.h>
  34 #include <sys/vnode.h>
  35 #include <sys/stat.h>
  36 #include <sys/uio.h>
  37 #include <sys/stat.h>
  38 #include <sys/errno.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/cred.h>
  41 #include <sys/statvfs.h>
  42 #include <sys/mount.h>
  43 #include <sys/debug.h>
  44 #include <sys/systm.h>
  45 #include <sys/mntent.h>
  46 #include <fs/fs_subr.h>
  47 #include <vm/page.h>
  48 #include <vm/anon.h>
  49 #include <sys/model.h>
  50 #include <sys/policy.h>
  51 
  52 #include <sys/fs/swapnode.h>
  53 #include <sys/fs/tmp.h>
  54 #include <sys/fs/tmpnode.h>
  55 
/* Filesystem type index assigned by the VFS framework when the module loads. */
static int tmpfsfstype;

/*
 * tmpfs_mountcount is used to prevent module unloads while there is still
 * state from a former mount hanging around. With forced umount support, the
 * filesystem module must not be allowed to go away before the last
 * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
 * there's no need for locking.
 */
static uint32_t tmpfs_mountcount;
  66 
/*
 * tmpfs vfs operations.
 *
 * Forward declarations for the entry points installed into the
 * tmp_vfsops_template in tmpfsinit() below.
 */
static int tmpfsinit(int, char *);
static int tmp_mount(struct vfs *, struct vnode *,
        struct mounta *, struct cred *);
static int tmp_unmount(struct vfs *, int, struct cred *);
static int tmp_root(struct vfs *, struct vnode **);
static int tmp_statvfs(struct vfs *, struct statvfs64 *);
static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
static void tmp_freevfs(vfs_t *vfsp);
  78 
/*
 * Loadable module wrapper
 */
#include <sys/modctl.h>

/* Defined below, after the option table it refers to. */
static mntopts_t tmpfs_proto_opttbl;

/*
 * VFS switch entry: registers tmpfsinit() as the initialization routine
 * and advertises remount, per-mount statistics and zone-mount support.
 */
static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "tmpfs",
        tmpfsinit,
        VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
        &tmpfs_proto_opttbl
};
  93 
/*
 * in-kernel mnttab options
 *
 * The xattr/noxattr options are mutually exclusive; each names the other
 * as the option it cancels.
 */
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
  99 
 100 static mntopt_t tmpfs_options[] = {
 101         /* Option name          Cancel Opt      Arg     Flags           Data */
 102         { MNTOPT_XATTR,         xattr_cancel,   NULL,   MO_DEFAULT,     NULL},
 103         { MNTOPT_NOXATTR,       noxattr_cancel, NULL,   NULL,           NULL},
 104         { "size",               NULL,           "0",    MO_HASVALUE,    NULL},
 105         { "mode",               NULL,           NULL,   MO_HASVALUE,    NULL}
 106 };
 107 
 108 
/* Prototype option table handed to the VFS framework via vfw above. */
static mntopts_t tmpfs_proto_opttbl = {
        sizeof (tmpfs_options) / sizeof (mntopt_t),
        tmpfs_options
};
 113 
/*
 * Module linkage information
 */
static struct modlfs modlfs = {
        &mod_fsops, "filesystem for tmpfs", &vfw
};

static struct modlinkage modlinkage = {
        MODREV_1, &modlfs, NULL
};
 124 
 125 int
 126 _init()
 127 {
 128         return (mod_install(&modlinkage));
 129 }
 130 
 131 int
 132 _fini()
 133 {
 134         int error;
 135 
 136         /*
 137          * If a forceably unmounted instance is still hanging around, we cannot
 138          * allow the module to be unloaded because that would cause panics once
 139          * the VFS framework decides it's time to call into VFS_FREEVFS().
 140          */
 141         if (tmpfs_mountcount)
 142                 return (EBUSY);
 143 
 144         error = mod_remove(&modlinkage);
 145         if (error)
 146                 return (error);
 147         /*
 148          * Tear down the operations vectors
 149          */
 150         (void) vfs_freevfsops_by_type(tmpfsfstype);
 151         vn_freevnodeops(tmp_vnodeops);
 152         return (0);
 153 }
 154 
 155 int
 156 _info(struct modinfo *modinfop)
 157 {
 158         return (mod_info(&modlinkage, modinfop));
 159 }
 160 
/*
 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
 * the rest of the system.  In other words, if the amount of free swap space
 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
 * anon allocations will fail.
 *
 * There is also a per mount limit on the amount of swap space
 * (tmount.tm_anonmax) settable via a mount option.
 */
size_t tmpfs_minfree = 0;

/* Device number pieces used to fabricate a unique vfs_dev for each mount. */
static major_t tmpfs_major;
static minor_t tmpfs_minor;
static kmutex_t tmpfs_minor_lock;       /* protects tmpfs_minor */
 175 
/*
 * initialize global tmpfs locks and such
 * called when loading tmpfs module
 *
 * Registers the vfs and vnode operations vectors, establishes the swap
 * floor (tmpfs_minfree) if it was not patched, and reserves a device
 * major number for fabricating per-mount device numbers.
 */
static int
tmpfsinit(int fstype, char *name)
{
        static const fs_operation_def_t tmp_vfsops_template[] = {
                VFSNAME_MOUNT,          { .vfs_mount = tmp_mount },
                VFSNAME_UNMOUNT,        { .vfs_unmount = tmp_unmount },
                VFSNAME_ROOT,           { .vfs_root = tmp_root },
                VFSNAME_STATVFS,        { .vfs_statvfs = tmp_statvfs },
                VFSNAME_VGET,           { .vfs_vget = tmp_vget },
                VFSNAME_FREEVFS,        { .vfs_freevfs = tmp_freevfs },
                NULL,                   NULL
        };
        int error;
        extern  void    tmpfs_hash_init();

        tmpfs_hash_init();
        tmpfsfstype = fstype;
        ASSERT(tmpfsfstype != 0);

        error = vfs_setfsops(fstype, tmp_vfsops_template, NULL);
        if (error != 0) {
                cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template");
                return (error);
        }

        error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops);
        if (error != 0) {
                /* Undo vfs_setfsops() so the fstype slot is not left stale. */
                (void) vfs_freevfsops_by_type(fstype);
                cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template");
                return (error);
        }

        /*
         * tmpfs_minfree doesn't need to be some function of configured
         * swap space since it really is an absolute limit of swap space
         * which still allows other processes to execute.
         */
        if (tmpfs_minfree == 0) {
                /*
                 * Set if not patched
                 */
                tmpfs_minfree = btopr(TMPMINFREE);
        }

        if ((tmpfs_major = getudev()) == (major_t)-1) {
                cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
                tmpfs_major = 0;
        }
        mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
        tmpfs_mountcount = 0;
        return (0);
}
 232 
 233 static int
 234 tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 235 {
 236         struct tmount *tm = NULL;
 237         struct tmpnode *tp;
 238         struct pathname dpn;
 239         int error;
 240         size_t anonmax;
 241         struct vattr rattr;
 242         int got_attrs;
 243         boolean_t mode_arg = B_FALSE;
 244         mode_t root_mode = 0777;
 245         char *argstr;
 246 
 247         if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
 248                 return (error);
 249 
 250         if (mvp->v_type != VDIR)
 251                 return (ENOTDIR);
 252 
 253         mutex_enter(&mvp->v_lock);
 254         if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
 255             (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 256                 mutex_exit(&mvp->v_lock);
 257                 return (EBUSY);
 258         }
 259         mutex_exit(&mvp->v_lock);
 260 
 261         /*
 262          * Having the resource be anything but "swap" doesn't make sense.
 263          */
 264         vfs_setresource(vfsp, "swap", 0);
 265 
 266         /*
 267          * now look for options we understand...
 268          */
 269 
 270         /* tmpfs doesn't support read-only mounts */
 271         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
 272                 error = EINVAL;
 273                 goto out;
 274         }
 275 
 276         /*
 277          * tm_anonmax is set according to the mount arguments
 278          * if any.  Otherwise, it is set to a maximum value.
 279          */
 280         if (vfs_optionisset(vfsp, "size", &argstr)) {
 281                 if ((error = tmp_convnum(argstr, &anonmax)) != 0)
 282                         goto out;
 283         } else {
 284                 anonmax = SIZE_MAX;
 285         }
 286 
 287         /*
 288          * The "mode" mount argument allows the operator to override the
 289          * permissions of the root of the tmpfs mount.
 290          */
 291         if (vfs_optionisset(vfsp, "mode", &argstr)) {
 292                 if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
 293                         goto out;
 294                 }
 295                 mode_arg = B_TRUE;
 296         }
 297 
 298         if (error = pn_get(uap->dir,
 299             (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
 300                 goto out;
 301 
 302         if (uap->flags & MS_REMOUNT) {
 303                 tm = (struct tmount *)VFSTOTM(vfsp);
 304 
 305                 /*
 306                  * If we change the size so its less than what is currently
 307                  * being used, we allow that. The file system will simply be
 308                  * full until enough files have been removed to get below the
 309                  * new max.
 310                  */
 311                 mutex_enter(&tm->tm_contents);
 312                 tm->tm_anonmax = anonmax;
 313                 mutex_exit(&tm->tm_contents);
 314                 goto out;
 315         }
 316 
 317         if ((tm = kmem_zalloc(sizeof (struct tmount),
 318             KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 319                 pn_free(&dpn);
 320                 error = ENOMEM;
 321                 goto out;
 322         }
 323 
 324         /*
 325          * find an available minor device number for this mount
 326          */
 327         mutex_enter(&tmpfs_minor_lock);
 328         do {
 329                 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32;
 330                 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor);
 331         } while (vfs_devismounted(tm->tm_dev));
 332         mutex_exit(&tmpfs_minor_lock);
 333 
 334         /*
 335          * Set but don't bother entering the mutex
 336          * (tmount not on mount list yet)
 337          */
 338         mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL);
 339         mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL);
 340 
 341         tm->tm_vfsp = vfsp;
 342         tm->tm_anonmax = anonmax;
 343 
 344         vfsp->vfs_data = (caddr_t)tm;
 345         vfsp->vfs_fstype = tmpfsfstype;
 346         vfsp->vfs_dev = tm->tm_dev;
 347         vfsp->vfs_bsize = PAGESIZE;
 348         vfsp->vfs_flag |= VFS_NOTRUNC;
 349         vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
 350         tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
 351         (void) strcpy(tm->tm_mntpath, dpn.pn_path);
 352 
 353         /*
 354          * Preemptively set vfs_zone before any of the tmp_kmem_* functions are
 355          * called.  That field is not populated until after a successful
 356          * VFS_MOUNT when domount() sets vfsp metadata via vfs_add().  An
 357          * accurate value is required for proper swap usage accounting.
 358          */
 359         ASSERT0(uap->flags & MS_REMOUNT);
 360         ASSERT(vfsp->vfs_zone == NULL);
 361         vfsp->vfs_zone = curproc->p_zone;
 362 
 363         /*
 364          * allocate and initialize root tmpnode structure
 365          */
 366         bzero(&rattr, sizeof (struct vattr));
 367         rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
 368         rattr.va_type = VDIR;
 369         rattr.va_rdev = 0;
 370         tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
 371         if (tp == NULL) {
 372                 kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
 373                 mutex_destroy(&tm->tm_contents);
 374                 mutex_destroy(&tm->tm_renamelck);
 375                 kmem_free(tm, sizeof (struct tmount));
 376 
 377                 pn_free(&dpn);
 378                 error = ENOMEM;
 379                 goto out;
 380         }
 381         tmpnode_init(tm, tp, &rattr, cr);
 382 
 383         /*
 384          * Get the mode, uid, and gid from the underlying mount point.
 385          */
 386         rattr.va_mask = AT_MODE|AT_UID|AT_GID;  /* Hint to getattr */
 387         got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
 388 
 389         rw_enter(&tp->tn_rwlock, RW_WRITER);
 390         TNTOV(tp)->v_flag |= VROOT;
 391 
 392         /*
 393          * If the getattr succeeded, use its results.  Otherwise allow
 394          * the previously set hardwired defaults to prevail.
 395          */
 396         if (got_attrs == 0) {
 397                 if (!mode_arg) {
 398                         /*
 399                          * Only use the underlying mount point for the
 400                          * mode if the "mode" mount argument was not
 401                          * provided.
 402                          */
 403                         tp->tn_mode = rattr.va_mode;
 404                 }
 405                 tp->tn_uid = rattr.va_uid;
 406                 tp->tn_gid = rattr.va_gid;
 407         }
 408 
 409         /*
 410          * initialize linked list of tmpnodes so that the back pointer of
 411          * the root tmpnode always points to the last one on the list
 412          * and the forward pointer of the last node is null
 413          */
 414         tp->tn_back = tp;
 415         tp->tn_forw = NULL;
 416         tp->tn_nlink = 0;
 417         tm->tm_rootnode = tp;
 418 
 419         if (tdirinit(tp, tp) != 0) {
 420                 /*
 421                  * While we would normally let our VOP_INACTIVE function take
 422                  * care of cleaning up here, we're in a bit of a delicate
 423                  * situation, so we do so manually. While it's tempting to try
 424                  * and rely upon tmpfs_freevfs() and others, it's probably safer
 425                  * for the time to do this manually at the cost of duplication.
 426                  */
 427                 vn_invalid(TNTOV(tp));
 428                 rw_destroy(&tp->tn_rwlock);
 429                 mutex_destroy(&tp->tn_tlock);
 430                 vn_free(TNTOV(tp));
 431                 tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
 432 
 433                 kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
 434                 mutex_destroy(&tm->tm_contents);
 435                 mutex_destroy(&tm->tm_renamelck);
 436                 kmem_free(tm, sizeof (struct tmount));
 437                 pn_free(&dpn);
 438                 error = ENOMEM;
 439                 goto out;
 440         }
 441 
 442         rw_exit(&tp->tn_rwlock);
 443 
 444         pn_free(&dpn);
 445         error = 0;
 446         atomic_inc_32(&tmpfs_mountcount);
 447 
 448 out:
 449         if (error == 0)
 450                 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
 451 
 452         return (error);
 453 }
 454 
/*
 * Unmount a tmpfs instance.  A normal unmount fails with EBUSY if any
 * node other than the root is still referenced.  A forced unmount
 * (MS_FORCE) instead converts each outstanding vnode reference into a
 * VFS_HOLD so final teardown (tmp_freevfs) is deferred until the last
 * vnode is inactivated.
 */
static int
tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
{
        struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
        struct tmpnode *tnp, *cancel;
        struct vnode    *vp;
        int error;
        uint_t cnt;
        int i;

        if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
                return (error);

        mutex_enter(&tm->tm_contents);

        /*
         * In the normal unmount case (non-forced unmount), if there are no
         * open files, only the root node should have a reference count.
         *
         * With tm_contents held, nothing can be added or removed.
         * There may be some dirty pages.  To prevent fsflush from
         * disrupting the unmount, put a hold on each node while scanning.
         * If we find a previously referenced node, undo the holds we have
         * placed and fail EBUSY.
         *
         * However, in the case of a forced umount, things are a bit different.
         * An additional VFS_HOLD is added for each outstanding VN_HOLD to
         * ensure that the file system is not cleaned up (tmp_freevfs) until
         * the last vfs hold is dropped. This happens in tmp_inactive as the
         * vnodes are released. Also, we can't add an additional VN_HOLD in
         * this case since that would prevent tmp_inactive from ever being
         * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
         * so that the zone is not blocked waiting for the final file system
         * cleanup.
         */
        tnp = tm->tm_rootnode;

        /* Handle the root node first; its holds start from 1, not 0. */
        vp = TNTOV(tnp);
        mutex_enter(&vp->v_lock);
        cnt = vp->v_count;
        if (flag & MS_FORCE) {
                vfsp->vfs_flag |= VFS_UNMOUNTED;
                /* Extra hold which we rele below when we drop the zone ref */
                VFS_HOLD(vfsp);

                for (i = 1; i < cnt; i++)
                        VFS_HOLD(vfsp);

                /* drop the mutex now because no one can find this mount */
                mutex_exit(&tm->tm_contents);
        } else if (cnt > 1) {
                mutex_exit(&vp->v_lock);
                mutex_exit(&tm->tm_contents);
                return (EBUSY);
        }
        mutex_exit(&vp->v_lock);

        /*
         * Check for open files. An open file causes everything to unwind
         * unless this is a forced umount.
         */
        for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
                vp = TNTOV(tnp);
                mutex_enter(&vp->v_lock);
                cnt = vp->v_count;
                if (flag & MS_FORCE) {
                        for (i = 0; i < cnt; i++)
                                VFS_HOLD(vfsp);

                        /*
                         * In the case of a forced umount don't add an
                         * additional VN_HOLD on the already held vnodes, like
                         * we do in the non-forced unmount case. If the
                         * cnt > 0, then the vnode already has at least one
                         * hold and we need tmp_inactive to get called when the
                         * last pre-existing hold on the node is released so
                         * that we can VFS_RELE the VFS holds we just added.
                         */
                        if (cnt == 0) {
                                /* directly add VN_HOLD since have the lock */
                                vp->v_count++;
                        }

                        mutex_exit(&vp->v_lock);

                        /*
                         * If the tmpnode has any pages associated with it
                         * (i.e. if it's a normal file with non-zero size), the
                         * tmpnode could still be discovered by pageout or
                         * fsflush via the page vnode pointers. To prevent this
                         * from interfering with the tmp_freevfs, truncate the
                         * tmpnode now.
                         */
                        if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
                                rw_enter(&tnp->tn_rwlock, RW_WRITER);
                                rw_enter(&tnp->tn_contents, RW_WRITER);

                                (void) tmpnode_trunc(tm, tnp, 0);

                                rw_exit(&tnp->tn_contents);
                                rw_exit(&tnp->tn_rwlock);

                                ASSERT(tnp->tn_size == 0);
                                ASSERT(tnp->tn_nblocks == 0);
                        }
                } else if (cnt > 0) {
                        /* An open file; unwind the holds we've been adding. */
                        mutex_exit(&vp->v_lock);
                        cancel = tm->tm_rootnode->tn_forw;
                        while (cancel != tnp) {
                                vp = TNTOV(cancel);
                                ASSERT(vp->v_count > 0);
                                VN_RELE(vp);
                                cancel = cancel->tn_forw;
                        }
                        mutex_exit(&tm->tm_contents);
                        return (EBUSY);
                } else {
                        /* directly add a VN_HOLD since we have the lock */
                        vp->v_count++;
                        mutex_exit(&vp->v_lock);
                }
        }

        if (flag & MS_FORCE) {
                /*
                 * Drop the zone ref now since we don't know how long it will
                 * be until the final vfs_rele is called by tmp_inactive.
                 */
                if (vfsp->vfs_zone) {
                        zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
                            ZONE_REF_VFS);
                        vfsp->vfs_zone = 0;
                }
                /* We can now drop the extra hold we added above. */
                VFS_RELE(vfsp);
        } else {
                /*
                 * For the non-forced case, we can drop the mutex now because
                 * no one can find this mount anymore
                 */
                vfsp->vfs_flag |= VFS_UNMOUNTED;
                mutex_exit(&tm->tm_contents);
        }

        return (0);
}
 602 
/*
 * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
 * the vfs framework after umount and the last VFS_RELE, to trigger the release
 * of any resources still associated with the given vfs_t. We only add
 * additional VFS_HOLDs during the forced umount case, so this is normally
 * called immediately after tmp_umount.
 */
void
tmp_freevfs(vfs_t *vfsp)
{
        struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
        struct tmpnode *tnp;
        struct vnode    *vp;

        /*
         * Free all kmemalloc'd and anonalloc'd memory associated with
         * this filesystem.  To do this, we go through the file list twice,
         * once to remove all the directory entries, and then to remove
         * all the files.  We do this because there is useful code in
         * tmpnode_free which assumes that the directory entry has been
         * removed before the file.
         */

        /*
         * Now that we are tearing ourselves down we need to remove the
         * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
         * files from the system causing us to have a negative value. Doing this
         * seems a bit better than trying to set a flag on the tmount that says
         * we're tearing down.
         */
        vfsp->vfs_flag &= ~VFS_UNMOUNTED;

        /*
         * Remove all directory entries
         */
        for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) {
                rw_enter(&tnp->tn_rwlock, RW_WRITER);
                if (tnp->tn_type == VDIR)
                        tdirtrunc(tnp);
                if (tnp->tn_vnode->v_flag & V_XATTRDIR) {
                        /*
                         * Account for implicit attrdir reference.
                         */
                        ASSERT(tnp->tn_nlink > 0);
                        DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock);
                }
                rw_exit(&tnp->tn_rwlock);
        }

        ASSERT(tm->tm_rootnode);

        /*
         * All links are gone, v_count is keeping nodes in place.
         * VN_RELE should make the node disappear, unless somebody
         * is holding pages against it.  Nap and retry until it disappears.
         *
         * We re-acquire the lock to prevent others who have a HOLD on
         * a tmpnode via its pages or anon slots from blowing it away
         * (in tmp_inactive) while we're trying to get to it here. Once
         * we have a HOLD on it we know it'll stick around.
         *
         */
        mutex_enter(&tm->tm_contents);
        /*
         * Remove all the files (except the rootnode) backwards.
         */
        while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) {
                mutex_exit(&tm->tm_contents);
                /*
                 * Inhibit tmp_inactive from touching attribute directory
                 * as all nodes will be released here.
                 * Note we handled the link count in pass 2 above.
                 */
                rw_enter(&tnp->tn_rwlock, RW_WRITER);
                tnp->tn_xattrdp = NULL;
                rw_exit(&tnp->tn_rwlock);
                vp = TNTOV(tnp);
                VN_RELE(vp);
                mutex_enter(&tm->tm_contents);
                /*
                 * It's still there after the RELE. Someone else like pageout
                 * has a hold on it so wait a bit and then try again - we know
                 * they'll give it up soon.
                 */
                if (tnp == tm->tm_rootnode->tn_back) {
                        /*
                         * The hold taken here keeps tnp alive across the
                         * delay; the next iteration's VN_RELE on the same
                         * node drops it again, so the count stays balanced.
                         */
                        VN_HOLD(vp);
                        mutex_exit(&tm->tm_contents);
                        delay(hz / 4);
                        mutex_enter(&tm->tm_contents);
                }
        }
        mutex_exit(&tm->tm_contents);

        tm->tm_rootnode->tn_xattrdp = NULL;
        VN_RELE(TNTOV(tm->tm_rootnode));

        ASSERT(tm->tm_mntpath);

        kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);

        /* All swap charged to this mount must have been released by now. */
        ASSERT(tm->tm_anonmem == 0);

        mutex_destroy(&tm->tm_contents);
        mutex_destroy(&tm->tm_renamelck);
        kmem_free(tm, sizeof (struct tmount));

        /* Allow _fini() to succeed now */
        atomic_dec_32(&tmpfs_mountcount);
}
 712 
 713 /*
 714  * return root tmpnode for given vnode
 715  */
 716 static int
 717 tmp_root(struct vfs *vfsp, struct vnode **vpp)
 718 {
 719         struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
 720         struct tmpnode *tp = tm->tm_rootnode;
 721         struct vnode *vp;
 722 
 723         ASSERT(tp);
 724 
 725         vp = TNTOV(tp);
 726         VN_HOLD(vp);
 727         *vpp = vp;
 728         return (0);
 729 }
 730 
/*
 * Report filesystem statistics.  Free block counts are derived from
 * available swap, clipped by tmpfs_minfree, this mount's tm_anonmax
 * limit and, for non-global zones, the zone's swap cap.
 */
static int
tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
{
        struct tmount   *tm = (struct tmount *)VFSTOTM(vfsp);
        ulong_t blocks;
        dev32_t d32;
        zoneid_t eff_zid;
        struct zone *zp;

        /*
         * The file system may have been mounted by the global zone on
         * behalf of the non-global zone.  In that case, the tmount zone_id
         * will be the global zone.  We still want to show the swap cap inside
         * the zone in this case, even though the file system was mounted by
         * the global zone.
         */
        if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
                zp = curproc->p_zone;
        else
                zp = tm->tm_vfsp->vfs_zone;

        if (zp == NULL)
                eff_zid = GLOBAL_ZONEUNIQID;
        else
                eff_zid = zp->zone_id;

        sbp->f_bsize = PAGESIZE;
        sbp->f_frsize = PAGESIZE;

        /*
         * Find the amount of available physical and memory swap
         */
        mutex_enter(&anoninfo_lock);
        ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
        blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
        mutex_exit(&anoninfo_lock);

        /*
         * If tm_anonmax for this mount is less than the available swap space
         * (minus the amount tmpfs can't use), use that instead
         */
        if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) {
                sbp->f_bfree = MIN(blocks - tmpfs_minfree,
                    btop(tm->tm_anonmax) - btopr(tm->tm_anonmem));
        } else {
                sbp->f_bfree = 0;
        }

        sbp->f_bavail = sbp->f_bfree;

        /*
         * Total number of blocks is what's available plus what's been used
         */
        sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem));

        if (eff_zid != GLOBAL_ZONEUNIQID &&
            zp->zone_max_swap_ctl != UINT64_MAX) {
                /*
                 * If the fs is used by a non-global zone with a swap cap,
                 * then report the capped size.
                 */
                rctl_qty_t cap, used;
                pgcnt_t pgcap, pgused;

                mutex_enter(&zp->zone_mem_lock);
                cap = zp->zone_max_swap_ctl;
                used = zp->zone_max_swap;
                mutex_exit(&zp->zone_mem_lock);

                pgcap = btop(cap);
                pgused = btop(used);

                sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
                sbp->f_bavail = sbp->f_bfree;
                sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
        }

        /*
         * The maximum number of files available is approximately the number
         * of tmpnodes we can allocate from the remaining kernel memory
         * available to tmpfs.  This is fairly inaccurate since it doesn't
         * take into account the names stored in the directory entries.
         */
        sbp->f_ffree = sbp->f_files = ptob(availrmem) /
            (sizeof (struct tmpnode) + sizeof (struct tdirent));
        sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
        /* Compress the 64-bit device number into the 32-bit fsid field. */
        (void) cmpldev(&d32, vfsp->vfs_dev);
        sbp->f_fsid = d32;
        (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name);
        (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr));
        /*
         * ensure null termination
         */
        sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
        sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
        sbp->f_namemax = MAXNAMELEN - 1;
        return (0);
}
 829 
/*
 * Translate a file identifier (inode number + generation) back into a held
 * vnode by walking this mount's tmpnode list.  Always returns 0; a stale
 * or unknown fid is indicated by leaving *vpp set to NULL.
 */
static int
tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
{
        struct tfid *tfid;
        struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
        struct tmpnode *tp = NULL;

        tfid = (struct tfid *)fidp;
        *vpp = NULL;

        mutex_enter(&tm->tm_contents);
        for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) {
                mutex_enter(&tp->tn_tlock);
                if (tp->tn_nodeid == tfid->tfid_ino) {
                        /*
                         * If the gen numbers don't match we know the
                         * file won't be found since only one tmpnode
                         * can have this number at a time.
                         */
                        if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) {
                                mutex_exit(&tp->tn_tlock);
                                mutex_exit(&tm->tm_contents);
                                return (0);
                        }
                        *vpp = (struct vnode *)TNTOV(tp);

                        VN_HOLD(*vpp);

                        /*
                         * Sticky but non-executable, non-directory nodes get
                         * VISSWAP set -- presumably so the VM system treats
                         * their pages as swap-like; confirm against pageout.
                         */
                        if ((tp->tn_mode & S_ISVTX) &&
                            !(tp->tn_mode & (S_IXUSR | S_IFDIR))) {
                                mutex_enter(&(*vpp)->v_lock);
                                (*vpp)->v_flag |= VISSWAP;
                                mutex_exit(&(*vpp)->v_lock);
                        }
                        mutex_exit(&tp->tn_tlock);
                        mutex_exit(&tm->tm_contents);
                        return (0);
                }
                mutex_exit(&tp->tn_tlock);
        }
        mutex_exit(&tm->tm_contents);
        return (0);
}