1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2016 Joyent, Inc.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/kmem.h>
30 #include <sys/time.h>
31 #include <sys/pathname.h>
32 #include <sys/vfs.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/vnode.h>
35 #include <sys/stat.h>
36 #include <sys/uio.h>
37 #include <sys/stat.h>
38 #include <sys/errno.h>
39 #include <sys/cmn_err.h>
40 #include <sys/cred.h>
41 #include <sys/statvfs.h>
42 #include <sys/mount.h>
43 #include <sys/debug.h>
44 #include <sys/systm.h>
45 #include <sys/mntent.h>
46 #include <fs/fs_subr.h>
47 #include <vm/page.h>
48 #include <vm/anon.h>
49 #include <sys/model.h>
50 #include <sys/policy.h>
51
52 #include <sys/fs/swapnode.h>
53 #include <sys/fs/tmp.h>
54 #include <sys/fs/tmpnode.h>
55
56 static int tmpfsfstype;
57
58 /*
59 * tmpfs_mountcount is used to prevent module unloads while there is still
60 * state from a former mount hanging around. With forced umount support, the
61 * filesystem module must not be allowed to go away before the last
62 * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
63 * there's no need for locking.
64 */
65 static uint32_t tmpfs_mountcount;
66
67 /*
68 * tmpfs vfs operations.
69 */
70 static int tmpfsinit(int, char *);
71 static int tmp_mount(struct vfs *, struct vnode *,
72 struct mounta *, struct cred *);
73 static int tmp_unmount(struct vfs *, int, struct cred *);
74 static int tmp_root(struct vfs *, struct vnode **);
75 static int tmp_statvfs(struct vfs *, struct statvfs64 *);
76 static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
77 static void tmp_freevfs(vfs_t *vfsp);
78
79 /*
80 * Loadable module wrapper
81 */
82 #include <sys/modctl.h>
83
84 static mntopts_t tmpfs_proto_opttbl;
85
86 static vfsdef_t vfw = {
87 VFSDEF_VERSION,
88 "tmpfs",
89 tmpfsinit,
90 VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
91 &tmpfs_proto_opttbl
92 };
93
94 /*
95 * in-kernel mnttab options
96 */
97 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
98 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
99
100 static mntopt_t tmpfs_options[] = {
101 /* Option name Cancel Opt Arg Flags Data */
102 { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL},
103 { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL},
104 { "size", NULL, "0", MO_HASVALUE, NULL},
105 { "mode", NULL, NULL, MO_HASVALUE, NULL}
106 };
107
108
109 static mntopts_t tmpfs_proto_opttbl = {
110 sizeof (tmpfs_options) / sizeof (mntopt_t),
111 tmpfs_options
112 };
113
114 /*
115 * Module linkage information
116 */
117 static struct modlfs modlfs = {
118 &mod_fsops, "filesystem for tmpfs", &vfw
119 };
120
121 static struct modlinkage modlinkage = {
122 MODREV_1, &modlfs, NULL
123 };
124
125 int
126 _init()
127 {
128 return (mod_install(&modlinkage));
129 }
130
131 int
132 _fini()
133 {
134 int error;
135
136 /*
137 * If a forceably unmounted instance is still hanging around, we cannot
138 * allow the module to be unloaded because that would cause panics once
139 * the VFS framework decides it's time to call into VFS_FREEVFS().
140 */
141 if (tmpfs_mountcount)
142 return (EBUSY);
143
144 error = mod_remove(&modlinkage);
145 if (error)
146 return (error);
147 /*
148 * Tear down the operations vectors
149 */
150 (void) vfs_freevfsops_by_type(tmpfsfstype);
151 vn_freevnodeops(tmp_vnodeops);
152 return (0);
153 }
154
155 int
156 _info(struct modinfo *modinfop)
157 {
158 return (mod_info(&modlinkage, modinfop));
159 }
160
161 /*
162 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
163 * the rest of the system. In other words, if the amount of free swap space
164 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
165 * anon allocations will fail.
166 *
167 * There is also a per mount limit on the amount of swap space
168 * (tmount.tm_anonmax) settable via a mount option.
169 */
170 size_t tmpfs_minfree = 0;
171
172 static major_t tmpfs_major;
173 static minor_t tmpfs_minor;
174 static kmutex_t tmpfs_minor_lock;
175
176 /*
177 * initialize global tmpfs locks and such
178 * called when loading tmpfs module
179 */
180 static int
181 tmpfsinit(int fstype, char *name)
182 {
183 static const fs_operation_def_t tmp_vfsops_template[] = {
184 VFSNAME_MOUNT, { .vfs_mount = tmp_mount },
185 VFSNAME_UNMOUNT, { .vfs_unmount = tmp_unmount },
186 VFSNAME_ROOT, { .vfs_root = tmp_root },
187 VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs },
188 VFSNAME_VGET, { .vfs_vget = tmp_vget },
189 VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs },
190 NULL, NULL
191 };
192 int error;
193 extern void tmpfs_hash_init();
194
195 tmpfs_hash_init();
196 tmpfsfstype = fstype;
197 ASSERT(tmpfsfstype != 0);
198
199 error = vfs_setfsops(fstype, tmp_vfsops_template, NULL);
200 if (error != 0) {
201 cmn_err(CE_WARN, "tmpfsinit: bad vfs ops template");
202 return (error);
203 }
204
205 error = vn_make_ops(name, tmp_vnodeops_template, &tmp_vnodeops);
206 if (error != 0) {
207 (void) vfs_freevfsops_by_type(fstype);
208 cmn_err(CE_WARN, "tmpfsinit: bad vnode ops template");
209 return (error);
210 }
211
212 /*
213 * tmpfs_minfree doesn't need to be some function of configured
214 * swap space since it really is an absolute limit of swap space
215 * which still allows other processes to execute.
216 */
217 if (tmpfs_minfree == 0) {
218 /*
219 * Set if not patched
220 */
221 tmpfs_minfree = btopr(TMPMINFREE);
222 }
223
224 if ((tmpfs_major = getudev()) == (major_t)-1) {
225 cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
226 tmpfs_major = 0;
227 }
228 mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
229 tmpfs_mountcount = 0;
230 return (0);
231 }
232
233 static int
234 tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
235 {
236 struct tmount *tm = NULL;
237 struct tmpnode *tp;
238 struct pathname dpn;
239 int error;
240 size_t anonmax;
241 struct vattr rattr;
242 int got_attrs;
243 boolean_t mode_arg = B_FALSE;
244 mode_t root_mode = 0777;
245 char *argstr;
246
247 if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
248 return (error);
249
250 if (mvp->v_type != VDIR)
251 return (ENOTDIR);
252
253 mutex_enter(&mvp->v_lock);
254 if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
255 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
256 mutex_exit(&mvp->v_lock);
257 return (EBUSY);
258 }
259 mutex_exit(&mvp->v_lock);
260
261 /*
262 * Having the resource be anything but "swap" doesn't make sense.
263 */
264 vfs_setresource(vfsp, "swap", 0);
265
266 /*
267 * now look for options we understand...
268 */
269
270 /* tmpfs doesn't support read-only mounts */
271 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
272 error = EINVAL;
273 goto out;
274 }
275
276 /*
277 * tm_anonmax is set according to the mount arguments
278 * if any. Otherwise, it is set to a maximum value.
279 */
280 if (vfs_optionisset(vfsp, "size", &argstr)) {
281 if ((error = tmp_convnum(argstr, &anonmax)) != 0)
282 goto out;
283 } else {
284 anonmax = SIZE_MAX;
285 }
286
287 /*
288 * The "mode" mount argument allows the operator to override the
289 * permissions of the root of the tmpfs mount.
290 */
291 if (vfs_optionisset(vfsp, "mode", &argstr)) {
292 if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
293 goto out;
294 }
295 mode_arg = B_TRUE;
296 }
297
298 if (error = pn_get(uap->dir,
299 (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
300 goto out;
301
302 if (uap->flags & MS_REMOUNT) {
303 tm = (struct tmount *)VFSTOTM(vfsp);
304
305 /*
306 * If we change the size so its less than what is currently
307 * being used, we allow that. The file system will simply be
308 * full until enough files have been removed to get below the
309 * new max.
310 */
311 mutex_enter(&tm->tm_contents);
312 tm->tm_anonmax = anonmax;
313 mutex_exit(&tm->tm_contents);
314 goto out;
315 }
316
317 if ((tm = kmem_zalloc(sizeof (struct tmount),
318 KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
319 pn_free(&dpn);
320 error = ENOMEM;
321 goto out;
322 }
323
324 /*
325 * find an available minor device number for this mount
326 */
327 mutex_enter(&tmpfs_minor_lock);
328 do {
329 tmpfs_minor = (tmpfs_minor + 1) & L_MAXMIN32;
330 tm->tm_dev = makedevice(tmpfs_major, tmpfs_minor);
331 } while (vfs_devismounted(tm->tm_dev));
332 mutex_exit(&tmpfs_minor_lock);
333
334 /*
335 * Set but don't bother entering the mutex
336 * (tmount not on mount list yet)
337 */
338 mutex_init(&tm->tm_contents, NULL, MUTEX_DEFAULT, NULL);
339 mutex_init(&tm->tm_renamelck, NULL, MUTEX_DEFAULT, NULL);
340
341 tm->tm_vfsp = vfsp;
342 tm->tm_anonmax = anonmax;
343
344 vfsp->vfs_data = (caddr_t)tm;
345 vfsp->vfs_fstype = tmpfsfstype;
346 vfsp->vfs_dev = tm->tm_dev;
347 vfsp->vfs_bsize = PAGESIZE;
348 vfsp->vfs_flag |= VFS_NOTRUNC;
349 vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
350 tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
351 (void) strcpy(tm->tm_mntpath, dpn.pn_path);
352
353 /*
354 * Preemptively set vfs_zone before any of the tmp_kmem_* functions are
355 * called. That field is not populated until after a successful
356 * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An
357 * accurate value is required for proper swap usage accounting.
358 */
359 ASSERT0(uap->flags & MS_REMOUNT);
360 ASSERT(vfsp->vfs_zone == NULL);
361 vfsp->vfs_zone = curproc->p_zone;
362
363 /*
364 * allocate and initialize root tmpnode structure
365 */
366 bzero(&rattr, sizeof (struct vattr));
367 rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
368 rattr.va_type = VDIR;
369 rattr.va_rdev = 0;
370 tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
371 if (tp == NULL) {
372 kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
373 mutex_destroy(&tm->tm_contents);
374 mutex_destroy(&tm->tm_renamelck);
375 kmem_free(tm, sizeof (struct tmount));
376
377 pn_free(&dpn);
378 error = ENOMEM;
379 goto out;
380 }
381 tmpnode_init(tm, tp, &rattr, cr);
382
383 /*
384 * Get the mode, uid, and gid from the underlying mount point.
385 */
386 rattr.va_mask = AT_MODE|AT_UID|AT_GID; /* Hint to getattr */
387 got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
388
389 rw_enter(&tp->tn_rwlock, RW_WRITER);
390 TNTOV(tp)->v_flag |= VROOT;
391
392 /*
393 * If the getattr succeeded, use its results. Otherwise allow
394 * the previously set hardwired defaults to prevail.
395 */
396 if (got_attrs == 0) {
397 if (!mode_arg) {
398 /*
399 * Only use the underlying mount point for the
400 * mode if the "mode" mount argument was not
401 * provided.
402 */
403 tp->tn_mode = rattr.va_mode;
404 }
405 tp->tn_uid = rattr.va_uid;
406 tp->tn_gid = rattr.va_gid;
407 }
408
409 /*
410 * initialize linked list of tmpnodes so that the back pointer of
411 * the root tmpnode always points to the last one on the list
412 * and the forward pointer of the last node is null
413 */
414 tp->tn_back = tp;
415 tp->tn_forw = NULL;
416 tp->tn_nlink = 0;
417 tm->tm_rootnode = tp;
418
419 if (tdirinit(tp, tp) != 0) {
420 /*
421 * While we would normally let our VOP_INACTIVE function take
422 * care of cleaning up here, we're in a bit of a delicate
423 * situation, so we do so manually. While it's tempting to try
424 * and rely upon tmpfs_freevfs() and others, it's probably safer
425 * for the time to do this manually at the cost of duplication.
426 */
427 vn_invalid(TNTOV(tp));
428 rw_destroy(&tp->tn_rwlock);
429 mutex_destroy(&tp->tn_tlock);
430 vn_free(TNTOV(tp));
431 tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
432
433 kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
434 mutex_destroy(&tm->tm_contents);
435 mutex_destroy(&tm->tm_renamelck);
436 kmem_free(tm, sizeof (struct tmount));
437 pn_free(&dpn);
438 error = ENOMEM;
439 goto out;
440 }
441
442 rw_exit(&tp->tn_rwlock);
443
444 pn_free(&dpn);
445 error = 0;
446 atomic_inc_32(&tmpfs_mountcount);
447
448 out:
449 if (error == 0)
450 vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
451
452 return (error);
453 }
454
455 static int
456 tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
457 {
458 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
459 struct tmpnode *tnp, *cancel;
460 struct vnode *vp;
461 int error;
462 uint_t cnt;
463 int i;
464
465 if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
466 return (error);
467
468 mutex_enter(&tm->tm_contents);
469
470 /*
471 * In the normal unmount case (non-forced unmount), if there are no
472 * open files, only the root node should have a reference count.
473 *
474 * With tm_contents held, nothing can be added or removed.
475 * There may be some dirty pages. To prevent fsflush from
476 * disrupting the unmount, put a hold on each node while scanning.
477 * If we find a previously referenced node, undo the holds we have
478 * placed and fail EBUSY.
479 *
480 * However, in the case of a forced umount, things are a bit different.
481 * An additional VFS_HOLD is added for each outstanding VN_HOLD to
482 * ensure that the file system is not cleaned up (tmp_freevfs) until
483 * the last vfs hold is dropped. This happens in tmp_inactive as the
484 * vnodes are released. Also, we can't add an additional VN_HOLD in
485 * this case since that would prevent tmp_inactive from ever being
486 * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
487 * so that the zone is not blocked waiting for the final file system
488 * cleanup.
489 */
490 tnp = tm->tm_rootnode;
491
492 vp = TNTOV(tnp);
493 mutex_enter(&vp->v_lock);
494 cnt = vp->v_count;
495 if (flag & MS_FORCE) {
496 vfsp->vfs_flag |= VFS_UNMOUNTED;
497 /* Extra hold which we rele below when we drop the zone ref */
498 VFS_HOLD(vfsp);
499
500 for (i = 1; i < cnt; i++)
501 VFS_HOLD(vfsp);
502
503 /* drop the mutex now because no one can find this mount */
504 mutex_exit(&tm->tm_contents);
505 } else if (cnt > 1) {
506 mutex_exit(&vp->v_lock);
507 mutex_exit(&tm->tm_contents);
508 return (EBUSY);
509 }
510 mutex_exit(&vp->v_lock);
511
512 /*
513 * Check for open files. An open file causes everything to unwind
514 * unless this is a forced umount.
515 */
516 for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
517 vp = TNTOV(tnp);
518 mutex_enter(&vp->v_lock);
519 cnt = vp->v_count;
520 if (flag & MS_FORCE) {
521 for (i = 0; i < cnt; i++)
522 VFS_HOLD(vfsp);
523
524 /*
525 * In the case of a forced umount don't add an
526 * additional VN_HOLD on the already held vnodes, like
527 * we do in the non-forced unmount case. If the
528 * cnt > 0, then the vnode already has at least one
529 * hold and we need tmp_inactive to get called when the
530 * last pre-existing hold on the node is released so
531 * that we can VFS_RELE the VFS holds we just added.
532 */
533 if (cnt == 0) {
534 /* directly add VN_HOLD since have the lock */
535 vp->v_count++;
536 }
537
538 mutex_exit(&vp->v_lock);
539
540 /*
541 * If the tmpnode has any pages associated with it
542 * (i.e. if it's a normal file with non-zero size), the
543 * tmpnode could still be discovered by pageout or
544 * fsflush via the page vnode pointers. To prevent this
545 * from interfering with the tmp_freevfs, truncate the
546 * tmpnode now.
547 */
548 if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
549 rw_enter(&tnp->tn_rwlock, RW_WRITER);
550 rw_enter(&tnp->tn_contents, RW_WRITER);
551
552 (void) tmpnode_trunc(tm, tnp, 0);
553
554 rw_exit(&tnp->tn_contents);
555 rw_exit(&tnp->tn_rwlock);
556
557 ASSERT(tnp->tn_size == 0);
558 ASSERT(tnp->tn_nblocks == 0);
559 }
560 } else if (cnt > 0) {
561 /* An open file; unwind the holds we've been adding. */
562 mutex_exit(&vp->v_lock);
563 cancel = tm->tm_rootnode->tn_forw;
564 while (cancel != tnp) {
565 vp = TNTOV(cancel);
566 ASSERT(vp->v_count > 0);
567 VN_RELE(vp);
568 cancel = cancel->tn_forw;
569 }
570 mutex_exit(&tm->tm_contents);
571 return (EBUSY);
572 } else {
573 /* directly add a VN_HOLD since we have the lock */
574 vp->v_count++;
575 mutex_exit(&vp->v_lock);
576 }
577 }
578
579 if (flag & MS_FORCE) {
580 /*
581 * Drop the zone ref now since we don't know how long it will
582 * be until the final vfs_rele is called by tmp_inactive.
583 */
584 if (vfsp->vfs_zone) {
585 zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
586 ZONE_REF_VFS);
587 vfsp->vfs_zone = 0;
588 }
589 /* We can now drop the extra hold we added above. */
590 VFS_RELE(vfsp);
591 } else {
592 /*
593 * For the non-forced case, we can drop the mutex now because
594 * no one can find this mount anymore
595 */
596 vfsp->vfs_flag |= VFS_UNMOUNTED;
597 mutex_exit(&tm->tm_contents);
598 }
599
600 return (0);
601 }
602
603 /*
604 * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
605 * the vfs framework after umount and the last VFS_RELE, to trigger the release
606 * of any resources still associated with the given vfs_t. We only add
607 * additional VFS_HOLDs during the forced umount case, so this is normally
608 * called immediately after tmp_umount.
609 */
610 void
611 tmp_freevfs(vfs_t *vfsp)
612 {
613 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
614 struct tmpnode *tnp;
615 struct vnode *vp;
616
617 /*
618 * Free all kmemalloc'd and anonalloc'd memory associated with
619 * this filesystem. To do this, we go through the file list twice,
620 * once to remove all the directory entries, and then to remove
621 * all the files. We do this because there is useful code in
622 * tmpnode_free which assumes that the directory entry has been
623 * removed before the file.
624 */
625
626 /*
627 * Now that we are tearing ourselves down we need to remove the
628 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove
629 * files from the system causing us to have a negative value. Doing this
630 * seems a bit better than trying to set a flag on the tmount that says
631 * we're tearing down.
632 */
633 vfsp->vfs_flag &= ~VFS_UNMOUNTED;
634
635 /*
636 * Remove all directory entries
637 */
638 for (tnp = tm->tm_rootnode; tnp; tnp = tnp->tn_forw) {
639 rw_enter(&tnp->tn_rwlock, RW_WRITER);
640 if (tnp->tn_type == VDIR)
641 tdirtrunc(tnp);
642 if (tnp->tn_vnode->v_flag & V_XATTRDIR) {
643 /*
644 * Account for implicit attrdir reference.
645 */
646 ASSERT(tnp->tn_nlink > 0);
647 DECR_COUNT(&tnp->tn_nlink, &tnp->tn_tlock);
648 }
649 rw_exit(&tnp->tn_rwlock);
650 }
651
652 ASSERT(tm->tm_rootnode);
653
654 /*
655 * All links are gone, v_count is keeping nodes in place.
656 * VN_RELE should make the node disappear, unless somebody
657 * is holding pages against it. Nap and retry until it disappears.
658 *
659 * We re-acquire the lock to prevent others who have a HOLD on
660 * a tmpnode via its pages or anon slots from blowing it away
661 * (in tmp_inactive) while we're trying to get to it here. Once
662 * we have a HOLD on it we know it'll stick around.
663 *
664 */
665 mutex_enter(&tm->tm_contents);
666 /*
667 * Remove all the files (except the rootnode) backwards.
668 */
669 while ((tnp = tm->tm_rootnode->tn_back) != tm->tm_rootnode) {
670 mutex_exit(&tm->tm_contents);
671 /*
672 * Inhibit tmp_inactive from touching attribute directory
673 * as all nodes will be released here.
674 * Note we handled the link count in pass 2 above.
675 */
676 rw_enter(&tnp->tn_rwlock, RW_WRITER);
677 tnp->tn_xattrdp = NULL;
678 rw_exit(&tnp->tn_rwlock);
679 vp = TNTOV(tnp);
680 VN_RELE(vp);
681 mutex_enter(&tm->tm_contents);
682 /*
683 * It's still there after the RELE. Someone else like pageout
684 * has a hold on it so wait a bit and then try again - we know
685 * they'll give it up soon.
686 */
687 if (tnp == tm->tm_rootnode->tn_back) {
688 VN_HOLD(vp);
689 mutex_exit(&tm->tm_contents);
690 delay(hz / 4);
691 mutex_enter(&tm->tm_contents);
692 }
693 }
694 mutex_exit(&tm->tm_contents);
695
696 tm->tm_rootnode->tn_xattrdp = NULL;
697 VN_RELE(TNTOV(tm->tm_rootnode));
698
699 ASSERT(tm->tm_mntpath);
700
701 kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
702
703 ASSERT(tm->tm_anonmem == 0);
704
705 mutex_destroy(&tm->tm_contents);
706 mutex_destroy(&tm->tm_renamelck);
707 kmem_free(tm, sizeof (struct tmount));
708
709 /* Allow _fini() to succeed now */
710 atomic_dec_32(&tmpfs_mountcount);
711 }
712
713 /*
714 * return root tmpnode for given vnode
715 */
716 static int
717 tmp_root(struct vfs *vfsp, struct vnode **vpp)
718 {
719 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
720 struct tmpnode *tp = tm->tm_rootnode;
721 struct vnode *vp;
722
723 ASSERT(tp);
724
725 vp = TNTOV(tp);
726 VN_HOLD(vp);
727 *vpp = vp;
728 return (0);
729 }
730
731 static int
732 tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
733 {
734 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
735 ulong_t blocks;
736 dev32_t d32;
737 zoneid_t eff_zid;
738 struct zone *zp;
739
740 /*
741 * The file system may have been mounted by the global zone on
742 * behalf of the non-global zone. In that case, the tmount zone_id
743 * will be the global zone. We still want to show the swap cap inside
744 * the zone in this case, even though the file system was mounted by
745 * the global zone.
746 */
747 if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
748 zp = curproc->p_zone;
749 else
750 zp = tm->tm_vfsp->vfs_zone;
751
752 if (zp == NULL)
753 eff_zid = GLOBAL_ZONEUNIQID;
754 else
755 eff_zid = zp->zone_id;
756
757 sbp->f_bsize = PAGESIZE;
758 sbp->f_frsize = PAGESIZE;
759
760 /*
761 * Find the amount of available physical and memory swap
762 */
763 mutex_enter(&anoninfo_lock);
764 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
765 blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
766 mutex_exit(&anoninfo_lock);
767
768 /*
769 * If tm_anonmax for this mount is less than the available swap space
770 * (minus the amount tmpfs can't use), use that instead
771 */
772 if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) {
773 sbp->f_bfree = MIN(blocks - tmpfs_minfree,
774 btop(tm->tm_anonmax) - btopr(tm->tm_anonmem));
775 } else {
776 sbp->f_bfree = 0;
777 }
778
779 sbp->f_bavail = sbp->f_bfree;
780
781 /*
782 * Total number of blocks is what's available plus what's been used
783 */
784 sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem));
785
786 if (eff_zid != GLOBAL_ZONEUNIQID &&
787 zp->zone_max_swap_ctl != UINT64_MAX) {
788 /*
789 * If the fs is used by a non-global zone with a swap cap,
790 * then report the capped size.
791 */
792 rctl_qty_t cap, used;
793 pgcnt_t pgcap, pgused;
794
795 mutex_enter(&zp->zone_mem_lock);
796 cap = zp->zone_max_swap_ctl;
797 used = zp->zone_max_swap;
798 mutex_exit(&zp->zone_mem_lock);
799
800 pgcap = btop(cap);
801 pgused = btop(used);
802
803 sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
804 sbp->f_bavail = sbp->f_bfree;
805 sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
806 }
807
808 /*
809 * The maximum number of files available is approximately the number
810 * of tmpnodes we can allocate from the remaining kernel memory
811 * available to tmpfs. This is fairly inaccurate since it doesn't
812 * take into account the names stored in the directory entries.
813 */
814 sbp->f_ffree = sbp->f_files = ptob(availrmem) /
815 (sizeof (struct tmpnode) + sizeof (struct tdirent));
816 sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
817 (void) cmpldev(&d32, vfsp->vfs_dev);
818 sbp->f_fsid = d32;
819 (void) strcpy(sbp->f_basetype, vfssw[tmpfsfstype].vsw_name);
820 (void) strncpy(sbp->f_fstr, tm->tm_mntpath, sizeof (sbp->f_fstr));
821 /*
822 * ensure null termination
823 */
824 sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
825 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
826 sbp->f_namemax = MAXNAMELEN - 1;
827 return (0);
828 }
829
830 static int
831 tmp_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
832 {
833 struct tfid *tfid;
834 struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
835 struct tmpnode *tp = NULL;
836
837 tfid = (struct tfid *)fidp;
838 *vpp = NULL;
839
840 mutex_enter(&tm->tm_contents);
841 for (tp = tm->tm_rootnode; tp; tp = tp->tn_forw) {
842 mutex_enter(&tp->tn_tlock);
843 if (tp->tn_nodeid == tfid->tfid_ino) {
844 /*
845 * If the gen numbers don't match we know the
846 * file won't be found since only one tmpnode
847 * can have this number at a time.
848 */
849 if (tp->tn_gen != tfid->tfid_gen || tp->tn_nlink == 0) {
850 mutex_exit(&tp->tn_tlock);
851 mutex_exit(&tm->tm_contents);
852 return (0);
853 }
854 *vpp = (struct vnode *)TNTOV(tp);
855
856 VN_HOLD(*vpp);
857
858 if ((tp->tn_mode & S_ISVTX) &&
859 !(tp->tn_mode & (S_IXUSR | S_IFDIR))) {
860 mutex_enter(&(*vpp)->v_lock);
861 (*vpp)->v_flag |= VISSWAP;
862 mutex_exit(&(*vpp)->v_lock);
863 }
864 mutex_exit(&tp->tn_tlock);
865 mutex_exit(&tm->tm_contents);
866 return (0);
867 }
868 mutex_exit(&tp->tn_tlock);
869 }
870 mutex_exit(&tm->tm_contents);
871 return (0);
872 }