1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2020 Joyent, Inc.
25 * Copyright 2022 Spencer Evans-Cole.
26 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
28 */
29
30 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
31 /* All Rights Reserved */
32
33 /*
34 * University Copyright- Copyright (c) 1982, 1986, 1988
35 * The Regents of the University of California
36 * All Rights Reserved
37 *
38 * University Acknowledgment- Portions of this document are derived from
39 * software developed by the University of California, Berkeley, and its
40 * contributors.
41 */
42
43 #include <sys/types.h>
44 #include <sys/param.h>
45 #include <sys/t_lock.h>
46 #include <sys/errno.h>
47 #include <sys/cred.h>
48 #include <sys/user.h>
49 #include <sys/uio.h>
50 #include <sys/file.h>
51 #include <sys/pathname.h>
52 #include <sys/vfs.h>
53 #include <sys/vfs_opreg.h>
54 #include <sys/vnode.h>
55 #include <sys/filio.h>
56 #include <sys/rwstlock.h>
57 #include <sys/fem.h>
58 #include <sys/stat.h>
59 #include <sys/mode.h>
60 #include <sys/conf.h>
61 #include <sys/sysmacros.h>
62 #include <sys/cmn_err.h>
63 #include <sys/systm.h>
64 #include <sys/kmem.h>
65 #include <sys/debug.h>
66 #include <c2/audit.h>
67 #include <sys/acl.h>
68 #include <sys/nbmlock.h>
69 #include <sys/fcntl.h>
70 #include <fs/fs_subr.h>
71 #include <sys/taskq.h>
72 #include <fs/fs_reparse.h>
73 #include <sys/time.h>
74 #include <sys/sdt.h>
75
76 /* Determine if this vnode is a file that is read-only */
77 #define ISROFILE(vp) \
78 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
79 (vp)->v_type != VFIFO && vn_is_readonly(vp))
80
81 /* Tunable via /etc/system; used only by admin/install */
82 int nfs_global_client_only;
83
84 /*
85 * Array of vopstats_t for per-FS-type vopstats. This array has the same
86 * number of entries as and parallel to the vfssw table. (Arguably, it could
87 * be part of the vfssw table.) Once it's initialized, it's accessed using
88 * the same fstype index that is used to index into the vfssw table.
89 */
90 vopstats_t **vopstats_fstype;
91
92 /* vopstats initialization template used for fast initialization via bcopy() */
93 static vopstats_t *vs_templatep;
94
95 /* Kmem cache handle for vsk_anchor_t allocations */
96 kmem_cache_t *vsk_anchor_cache;
97
98 /* file events cleanup routine */
99 extern void free_fopdata(vnode_t *);
100
101 /*
102 * Root of AVL tree for the kstats associated with vopstats. Lock protects
 * updates to vskstat_tree.
104 */
105 avl_tree_t vskstat_tree;
106 kmutex_t vskstat_tree_lock;
107
108 /* Global variable which enables/disables the vopstats collection */
109 int vopstats_enabled = 1;
110
111 /* Global used for empty/invalid v_path */
112 char *vn_vpath_empty = "";
113
114 /*
115 * forward declarations for internal vnode specific data (vsd)
116 */
117 static void *vsd_realloc(void *, size_t, size_t);
118
119 /*
120 * forward declarations for reparse point functions
121 */
122 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
123
124 /*
125 * VSD -- VNODE SPECIFIC DATA
126 * The v_data pointer is typically used by a file system to store a
127 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
128 * However, there are times when additional project private data needs
129 * to be stored separately from the data (node) pointed to by v_data.
130 * This additional data could be stored by the file system itself or
131 * by a completely different kernel entity. VSD provides a way for
132 * callers to obtain a key and store a pointer to private data associated
133 * with a vnode.
134 *
135 * Callers are responsible for protecting the vsd by holding v_vsd_lock
136 * for calls to vsd_set() and vsd_get().
137 */
138
139 /*
140 * vsd_lock protects:
141 * vsd_nkeys - creation and deletion of vsd keys
142 * vsd_list - insertion and deletion of vsd_node in the vsd_list
143 * vsd_destructor - adding and removing destructors to the list
144 */
145 static kmutex_t vsd_lock;
146 static uint_t vsd_nkeys; /* size of destructor array */
147 /* list of vsd_node's */
148 static list_t *vsd_list = NULL;
149 /* per-key destructor funcs */
150 static void (**vsd_destructor)(void *);
151
152 /*
153 * The following is the common set of actions needed to update the
154 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
155 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
156 * recording of the bytes transferred. Since the code is similar
157 * but small, it is nearly a duplicate. Consequently any changes
158 * to one may need to be reflected in the other.
159 * Rundown of the variables:
160 * vp - Pointer to the vnode
161 * counter - Partial name structure member to update in vopstats for counts
162 * bytecounter - Partial name structure member to update in vopstats for bytes
163 * bytesval - Value to update in vopstats for bytes
164 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
165 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
166 */
167
168 #define VOPSTATS_UPDATE(vp, counter) { \
169 vfs_t *vfsp = (vp)->v_vfsp; \
170 if (vfsp && vfsp->vfs_implp && \
171 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
172 vopstats_t *vsp = &vfsp->vfs_vopstats; \
173 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
174 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
175 size_t, uint64_t *); \
176 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
177 (*stataddr)++; \
178 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
179 vsp->n##counter.value.ui64++; \
180 } \
181 } \
182 }
183
184 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
185 vfs_t *vfsp = (vp)->v_vfsp; \
186 if (vfsp && vfsp->vfs_implp && \
187 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
188 vopstats_t *vsp = &vfsp->vfs_vopstats; \
189 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
190 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
191 size_t, uint64_t *); \
192 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
193 (*stataddr)++; \
194 vsp->bytecounter.value.ui64 += bytesval; \
195 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
196 vsp->n##counter.value.ui64++; \
197 vsp->bytecounter.value.ui64 += bytesval; \
198 } \
199 } \
200 }
201
202 /*
203 * If the filesystem does not support XIDs map credential
204 * If the vfsp is NULL, perhaps we should also map?
205 */
206 #define VOPXID_MAP_CR(vp, cr) { \
207 vfs_t *vfsp = (vp)->v_vfsp; \
208 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
209 cr = crgetmapped(cr); \
210 }
211
/*
 * Convert stat(2) formats to vnode types and vice versa. (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
/*
 * Maps stat(2) file-format values to vnode types; VNON fills the slots
 * with no corresponding vnode type.
 * NOTE(review): presumably indexed by the S_IFMT format bits shifted
 * into a small index (see the IFTOVT macro) -- confirm against sys/mode.h.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

/*
 * Maps a vnode type (enum vtype used as index) back to its S_IF* format
 * bits; 0 fills the slots with no corresponding format.
 */
ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
225
226 /*
227 * The system vnode cache.
228 */
229
230 kmem_cache_t *vn_cache;
231
232
/*
 * Vnode operations vector.
 *
 * Translation table used to build a vnodeops structure.  Each entry
 * gives the operation's registration name, the offset of its slot in
 * struct vnodeops, and two fallback routines (NOTE(review): presumably
 * the defaults used when a file system does not supply the operation --
 * confirm against fs_operation_trans_def_t in sys/vfs_opreg.h).  Most
 * operations default to fs_nosys (ENOSYS); entries marked "no errors
 * allowed" must always succeed.  The table is terminated by a NULL name.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p)(uintptr_t)fs_dispose,
	    (fs_generic_func_p)(uintptr_t)fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	NULL, 0, NULL, NULL		/* terminator */
};
383
384 /* Extensible attribute (xva) routines. */
385
386 /*
387 * Zero out the structure, set the size of the requested/returned bitmaps,
388 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
389 * to the returned attributes array.
390 */
391 void
392 xva_init(xvattr_t *xvap)
393 {
394 bzero(xvap, sizeof (xvattr_t));
395 xvap->xva_mapsize = XVA_MAPSIZE;
396 xvap->xva_magic = XVA_MAGIC;
397 xvap->xva_vattr.va_mask = AT_XVATTR;
398 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
399 }
400
401 /*
402 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
403 * structure. Otherwise, returns NULL.
404 */
405 xoptattr_t *
406 xva_getxoptattr(xvattr_t *xvap)
407 {
408 xoptattr_t *xoap = NULL;
409 if (xvap->xva_vattr.va_mask & AT_XVATTR)
410 xoap = &xvap->xva_xoptattrs;
411 return (xoap);
412 }
413
414 /*
415 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
416 * We use the f_fsid reported by VFS_STATVFS() since we use that for the
417 * kstat name.
418 */
419 static int
420 vska_compar(const void *n1, const void *n2)
421 {
422 int ret;
423 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
424 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
425
426 if (p1 < p2) {
427 ret = -1;
428 } else if (p1 > p2) {
429 ret = 1;
430 } else {
431 ret = 0;
432 }
433
434 return (ret);
435 }
436
437 /*
438 * Used to create a single template which will be bcopy()ed to a newly
439 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
440 */
441 static vopstats_t *
442 create_vopstats_template()
443 {
444 vopstats_t *vsp;
445
446 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
447 bzero(vsp, sizeof (*vsp)); /* Start fresh */
448
449 /* VOP_OPEN */
450 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
451 /* VOP_CLOSE */
452 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
453 /* VOP_READ I/O */
454 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
455 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
456 /* VOP_WRITE I/O */
457 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
458 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
459 /* VOP_IOCTL */
460 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
461 /* VOP_SETFL */
462 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
463 /* VOP_GETATTR */
464 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
465 /* VOP_SETATTR */
466 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
467 /* VOP_ACCESS */
468 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
469 /* VOP_LOOKUP */
470 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
471 /* VOP_CREATE */
472 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
473 /* VOP_REMOVE */
474 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
475 /* VOP_LINK */
476 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
477 /* VOP_RENAME */
478 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
479 /* VOP_MKDIR */
480 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
481 /* VOP_RMDIR */
482 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
483 /* VOP_READDIR I/O */
484 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
485 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
486 KSTAT_DATA_UINT64);
487 /* VOP_SYMLINK */
488 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
489 /* VOP_READLINK */
490 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
491 /* VOP_FSYNC */
492 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
493 /* VOP_INACTIVE */
494 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
495 /* VOP_FID */
496 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
497 /* VOP_RWLOCK */
498 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
499 /* VOP_RWUNLOCK */
500 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
501 /* VOP_SEEK */
502 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
503 /* VOP_CMP */
504 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
505 /* VOP_FRLOCK */
506 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
507 /* VOP_SPACE */
508 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
509 /* VOP_REALVP */
510 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
511 /* VOP_GETPAGE */
512 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
513 /* VOP_PUTPAGE */
514 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
515 /* VOP_MAP */
516 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
517 /* VOP_ADDMAP */
518 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
519 /* VOP_DELMAP */
520 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
521 /* VOP_POLL */
522 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
523 /* VOP_DUMP */
524 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
525 /* VOP_PATHCONF */
526 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
527 /* VOP_PAGEIO */
528 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
529 /* VOP_DUMPCTL */
530 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
531 /* VOP_DISPOSE */
532 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
533 /* VOP_SETSECATTR */
534 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
535 /* VOP_GETSECATTR */
536 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
537 /* VOP_SHRLOCK */
538 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
539 /* VOP_VNEVENT */
540 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
541 /* VOP_REQZCBUF */
542 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
543 /* VOP_RETZCBUF */
544 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
545
546 return (vsp);
547 }
548
549 /*
550 * Creates a kstat structure associated with a vopstats structure.
551 */
552 kstat_t *
553 new_vskstat(char *ksname, vopstats_t *vsp)
554 {
555 kstat_t *ksp;
556
557 if (!vopstats_enabled) {
558 return (NULL);
559 }
560
561 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
562 sizeof (vopstats_t)/sizeof (kstat_named_t),
563 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
564 if (ksp) {
565 ksp->ks_data = vsp;
566 kstat_install(ksp);
567 }
568
569 return (ksp);
570 }
571
572 /*
573 * Called from vfsinit() to initialize the support mechanisms for vopstats
574 */
575 void
576 vopstats_startup()
577 {
578 if (!vopstats_enabled)
579 return;
580
581 /*
582 * Creates the AVL tree which holds per-vfs vopstat anchors. This
583 * is necessary since we need to check if a kstat exists before we
584 * attempt to create it. Also, initialize its lock.
585 */
586 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
587 offsetof(vsk_anchor_t, vsk_node));
588 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
589
590 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
591 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
592 NULL, NULL, 0);
593
594 /*
595 * Set up the array of pointers for the vopstats-by-FS-type.
596 * The entries will be allocated/initialized as each file system
597 * goes through modload/mod_installfs.
598 */
599 vopstats_fstype = (vopstats_t **)kmem_zalloc(
600 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
601
602 /* Set up the global vopstats initialization template */
603 vs_templatep = create_vopstats_template();
604 }
605
606 /*
607 * We need to have the all of the counters zeroed.
608 * The initialization of the vopstats_t includes on the order of
609 * 50 calls to kstat_named_init(). Rather that do that on every call,
610 * we do it once in a template (vs_templatep) then bcopy it over.
611 */
612 void
613 initialize_vopstats(vopstats_t *vsp)
614 {
615 if (vsp == NULL)
616 return;
617
618 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
619 }
620
621 /*
622 * If possible, determine which vopstats by fstype to use and
623 * return a pointer to the caller.
624 */
625 vopstats_t *
626 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
627 {
628 int fstype = 0; /* Index into vfssw[] */
629 vopstats_t *vsp = NULL;
630
631 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
632 !vopstats_enabled)
633 return (NULL);
634 /*
635 * Set up the fstype. We go to so much trouble because all versions
636 * of NFS use the same fstype in their vfs even though they have
637 * distinct entries in the vfssw[] table.
638 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
639 */
640 if (vswp) {
641 fstype = vswp - vfssw; /* Gets us the index */
642 } else {
643 fstype = vfsp->vfs_fstype;
644 }
645
646 /*
647 * Point to the per-fstype vopstats. The only valid values are
648 * non-zero positive values less than the number of vfssw[] table
649 * entries.
650 */
651 if (fstype > 0 && fstype < nfstype) {
652 vsp = vopstats_fstype[fstype];
653 }
654
655 return (vsp);
656 }
657
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 *
 * Returns NULL if vopstats are disabled, the vfs doesn't collect stats,
 * VFS_STATVFS() fails, or an anchor for this fsid already exists.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char kstatstr[KSTAT_STRLEN];	/* kstat name for vopstats */
	statvfs64_t statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t *vskp = NULL;	/* vfs <--> kstat anchor */
	kstat_t *ksp;			/* Ptr to new kstat */
	avl_index_t where;		/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		/*
		 * Insert the anchor only if no anchor for this fsid is
		 * already in the tree; the lookup and insert must happen
		 * atomically under vskstat_tree_lock.
		 */
		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			/*
			 * Drop the lock before the (potentially blocking)
			 * kstat creation; the anchor is already visible.
			 */
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
709
/*
 * We're in the process of tearing down the vfs and need to cleanup
 * the data structures associated with the vopstats.  Must only be called
 * from dounmount(): removes the vfs's anchor from the AVL tree, deletes
 * its kstat, and frees the anchor.
 */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t *vskap;
	avl_index_t where;

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/*
	 * Whack the pointer right away so no one else finds the anchor
	 * through the vfs while we tear it down.
	 */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
745
/*
 * Read or write a vnode.  Called from kernel code.
 *
 * rw	  - UIO_READ or UIO_WRITE
 * vp	  - vnode on which to perform the I/O
 * base	  - address of the caller's buffer (interpreted per "seg")
 * len	  - number of bytes to transfer; must be non-negative
 * offset - file offset at which the transfer starts
 * seg	  - segment (address space) that "base" refers to
 * ioflag - I/O flags passed through to VOP_READ/VOP_WRITE
 * ulimit - meaningful only if rw is UIO_WRITE
 * cr	  - credentials for the I/O (may be remapped; see VOPXID_MAP_CR)
 * residp - if non-NULL, set to the residual (untransferred) byte count;
 *	    if NULL, any residual causes the call to return EIO
 *
 * Returns 0 on success, an errno value otherwise.
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */

	/* Writes to read-only files/filesystems fail up front. */
	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	/* Map the credential if the filesystem does not support XIDs. */
	VOPXID_MAP_CR(vp, cr);

	/* Build a single-iovec uio describing the caller's buffer. */
	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		/* Fail if a conflicting non-blocking mandatory lock exists. */
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* Take the vnode's rwlock for the duration of the transfer. */
	(void) VOP_RWLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
	}
	VOP_RWUNLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);

	/* Report the residual, or treat a short transfer as an error. */
	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
826
827 /*
828 * Release a vnode. Call VOP_INACTIVE on last reference or
829 * decrement reference count.
830 *
831 * To avoid race conditions, the v_count is left at 1 for
832 * the call to VOP_INACTIVE. This prevents another thread
833 * from reclaiming and releasing the vnode *before* the
834 * VOP_INACTIVE routine has a chance to destroy the vnode.
835 * We can't have more than 1 thread calling VOP_INACTIVE
836 * on a vnode.
837 */
838 void
839 vn_rele(vnode_t *vp)
840 {
841 mutex_enter(&vp->v_lock);
842 if (vp->v_count == 1) {
843 mutex_exit(&vp->v_lock);
844 VOP_INACTIVE(vp, CRED(), NULL);
845 return;
846 } else {
847 VERIFY(vp->v_count > 0);
848 }
849 VN_RELE_LOCKED(vp);
850 mutex_exit(&vp->v_lock);
851 }
852
/*
 * Release a vnode referenced by the DNLC.  Multiple DNLC references are
 * treated as a single reference, so v_count is not decremented until the
 * last DNLC hold is released.  This makes it possible to distinguish
 * vnodes that are referenced only by the DNLC.
 */
void
vn_rele_dnlc(vnode_t *vp)
{
	mutex_enter(&vp->v_lock);
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	if (--vp->v_count_dnlc == 0) {
		/* Last DNLC hold gone; now drop the single v_count it held. */
		if (vp->v_count == 1) {
			/*
			 * Last hold overall: leave v_count at 1 for
			 * VOP_INACTIVE (see vn_rele()) and drop v_lock
			 * before calling into the file system.
			 */
			mutex_exit(&vp->v_lock);
			VOP_INACTIVE(vp, CRED(), NULL);
			return;
		}
		VN_RELE_LOCKED(vp);
	}
	mutex_exit(&vp->v_lock);
}
874
875 /*
876 * Like vn_rele() except that it clears v_stream under v_lock.
877 * This is used by sockfs when it dismantles the association between
878 * the sockfs node and the vnode in the underlying file system.
879 * v_lock has to be held to prevent a thread coming through the lookupname
880 * path from accessing a stream head that is going away.
881 */
882 void
883 vn_rele_stream(vnode_t *vp)
884 {
885 mutex_enter(&vp->v_lock);
886 vp->v_stream = NULL;
887 if (vp->v_count == 1) {
888 mutex_exit(&vp->v_lock);
889 VOP_INACTIVE(vp, CRED(), NULL);
890 return;
891 } else {
892 VERIFY(vp->v_count > 0);
893 }
894 VN_RELE_LOCKED(vp);
895 mutex_exit(&vp->v_lock);
896 }
897
/*
 * Taskq callback for vn_rele_async(): performs the deferred
 * VOP_INACTIVE call in taskq context.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
903
/*
 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 * asynchronously using a taskq.  This can avoid deadlocks caused by
 * re-entering the file system as a result of releasing the vnode.  Note,
 * file systems already have to handle the race where the vnode is
 * incremented before the inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Last hold: leave v_count at 1 (see vn_rele()) and defer
		 * VOP_INACTIVE to the taskq.  With TQ_SLEEP the dispatch
		 * cannot fail, hence the VERIFY.
		 */
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != TASKQID_INVALID);
		return;
	} else {
		VERIFY(vp->v_count > 0);
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
929
/*
 * Open/create a vnode.  Convenience wrapper around vn_openat() that
 * supplies no start vnode (startvp == NULL) and no caller file
 * descriptor (fd == -1); see vn_openat() for the full semantics of
 * the remaining arguments.
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
943
944
945 /*
946 * Open/create a vnode.
947 * This may be callable by the kernel, the only known use
948 * of user context being that the current user credentials
949 * are used for permissions. crwhy is defined iff filemode & FCREAT.
950 */
951 int
952 vn_openat(
953 char *pnamep,
954 enum uio_seg seg,
955 int filemode,
956 int createmode,
957 struct vnode **vpp,
958 enum create crwhy,
959 mode_t umask,
960 struct vnode *startvp,
961 int fd)
962 {
963 struct vnode *vp;
964 int mode;
965 int accessflags;
966 int error;
967 int in_crit = 0;
968 int open_done = 0;
969 int shrlock_done = 0;
970 struct vattr vattr;
971 enum symfollow follow;
972 int estale_retry = 0;
973 struct shrlock shr;
974 struct shr_locowner shr_own;
975 boolean_t create;
976
977 mode = 0;
978 accessflags = 0;
979 if (filemode & FREAD)
980 mode |= VREAD;
981 if (filemode & (FWRITE|FTRUNC))
982 mode |= VWRITE;
983 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
984 mode |= VEXEC;
985
986 /* symlink interpretation */
987 if (filemode & FNOFOLLOW)
988 follow = NO_FOLLOW;
989 else
990 follow = FOLLOW;
991
992 if (filemode & FAPPEND)
993 accessflags |= V_APPEND;
994
995 /*
996 * We need to handle the case of FCREAT | FDIRECTORY and the case of
997 * FEXCL. If all three are specified, then we always fail because we
998 * cannot create a directory through this interface and FEXCL says we
999 * need to fail the request if we can't create it. If, however, only
1000 * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1001 * of opening a file that already exists. If it exists, we can do
1002 * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1003 * treated as FDIRECTORY.
1004 */
1005 if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1006 (FCREAT | FDIRECTORY | FEXCL)) {
1007 return (EINVAL);
1008 }
1009
1010 if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1011 create = B_FALSE;
1012 } else if ((filemode & FCREAT) != 0) {
1013 create = B_TRUE;
1014 } else {
1015 create = B_FALSE;
1016 }
1017
1018 top:
1019 if (create) {
1020 enum vcexcl excl;
1021
1022 /*
1023 * Wish to create a file.
1024 */
1025 vattr.va_type = VREG;
1026 vattr.va_mode = createmode;
1027 vattr.va_mask = AT_TYPE|AT_MODE;
1028 if (filemode & FTRUNC) {
1029 vattr.va_size = 0;
1030 vattr.va_mask |= AT_SIZE;
1031 }
1032 if (filemode & FEXCL)
1033 excl = EXCL;
1034 else
1035 excl = NONEXCL;
1036
1037 if (error =
1038 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1039 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1040 return (error);
1041 } else {
1042 /*
1043 * Wish to open a file. Just look it up.
1044 */
1045 if (error = lookupnameat(pnamep, seg, follow,
1046 NULLVPP, &vp, startvp)) {
1047 if ((error == ESTALE) &&
1048 fs_need_estale_retry(estale_retry++))
1049 goto top;
1050 return (error);
1051 }
1052
1053 /*
1054 * Get the attributes to check whether file is large.
1055 * We do this only if the FOFFMAX flag is not set and
1056 * only for regular files.
1057 */
1058
1059 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1060 vattr.va_mask = AT_SIZE;
1061 if ((error = VOP_GETATTR(vp, &vattr, 0,
1062 CRED(), NULL))) {
1063 goto out;
1064 }
1065 if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1066 /*
1067 * Large File API - regular open fails
1068 * if FOFFMAX flag is set in file mode
1069 */
1070 error = EOVERFLOW;
1071 goto out;
1072 }
1073 }
1074 /*
1075 * Can't write directories, active texts, or
1076 * read-only filesystems. Can't truncate files
1077 * on which mandatory locking is in effect.
1078 */
1079 if (filemode & (FWRITE|FTRUNC)) {
1080 /*
1081 * Allow writable directory if VDIROPEN flag is set.
1082 */
1083 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1084 error = EISDIR;
1085 goto out;
1086 }
1087 if (ISROFILE(vp)) {
1088 error = EROFS;
1089 goto out;
1090 }
1091 /*
1092 * Can't truncate files on which
1093 * sysv mandatory locking is in effect.
1094 */
1095 if (filemode & FTRUNC) {
1096 vnode_t *rvp;
1097
1098 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1099 rvp = vp;
1100 if (rvp->v_filocks != NULL) {
1101 vattr.va_mask = AT_MODE;
1102 if ((error = VOP_GETATTR(vp,
1103 &vattr, 0, CRED(), NULL)) == 0 &&
1104 MANDLOCK(vp, vattr.va_mode))
1105 error = EAGAIN;
1106 }
1107 }
1108 if (error)
1109 goto out;
1110 }
1111 /*
1112 * Check permissions.
1113 */
1114 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1115 goto out;
1116
1117 /*
1118 * Require FSEARCH and FDIRECTORY to return a directory. Require
1119 * FEXEC to return a regular file.
1120 */
1121 if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1122 vp->v_type != VDIR) {
1123 error = ENOTDIR;
1124 goto out;
1125 }
1126 if ((filemode & FEXEC) && vp->v_type != VREG) {
1127 error = ENOEXEC; /* XXX: error code? */
1128 goto out;
1129 }
1130 }
1131
1132 /*
1133 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1134 */
1135 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1136 error = ELOOP;
1137 goto out;
1138 }
1139 if (filemode & FNOLINKS) {
1140 vattr.va_mask = AT_NLINK;
1141 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1142 goto out;
1143 }
1144 if (vattr.va_nlink != 1) {
1145 error = EMLINK;
1146 goto out;
1147 }
1148 }
1149
1150 /*
1151 * Opening a socket corresponding to the AF_UNIX pathname
1152 * in the filesystem name space is not supported.
1153 * However, VSOCK nodes in namefs are supported in order
1154 * to make fattach work for sockets.
1155 *
1156 * XXX This uses VOP_REALVP to distinguish between
1157 * an unopened namefs node (where VOP_REALVP returns a
1158 * different VSOCK vnode) and a VSOCK created by vn_create
1159 * in some file system (where VOP_REALVP would never return
1160 * a different vnode).
1161 */
1162 if (vp->v_type == VSOCK) {
1163 struct vnode *nvp;
1164
1165 error = VOP_REALVP(vp, &nvp, NULL);
1166 if (error != 0 || nvp == NULL || nvp == vp ||
1167 nvp->v_type != VSOCK) {
1168 error = EOPNOTSUPP;
1169 goto out;
1170 }
1171 }
1172
1173 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1174 /* get share reservation */
1175 shr.s_access = 0;
1176 if (filemode & FWRITE)
1177 shr.s_access |= F_WRACC;
1178 if (filemode & FREAD)
1179 shr.s_access |= F_RDACC;
1180 shr.s_deny = 0;
1181 shr.s_sysid = 0;
1182 shr.s_pid = ttoproc(curthread)->p_pid;
1183 shr_own.sl_pid = shr.s_pid;
1184 shr_own.sl_id = fd;
1185 shr.s_own_len = sizeof (shr_own);
1186 shr.s_owner = (caddr_t)&shr_own;
1187 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1188 NULL);
1189 if (error)
1190 goto out;
1191 shrlock_done = 1;
1192
1193 /* nbmand conflict check if truncating file */
1194 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1195 nbl_start_crit(vp, RW_READER);
1196 in_crit = 1;
1197
1198 vattr.va_mask = AT_SIZE;
1199 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1200 goto out;
1201 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1202 NULL)) {
1203 error = EACCES;
1204 goto out;
1205 }
1206 }
1207 }
1208
1209 /*
1210 * Do opening protocol.
1211 */
1212 error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1213 if (error)
1214 goto out;
1215 open_done = 1;
1216
1217 /*
1218 * Truncate if required.
1219 */
1220 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1221 vattr.va_size = 0;
1222 vattr.va_mask = AT_SIZE;
1223 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1224 goto out;
1225 }
1226
1227 /*
1228 * Turn on directio, if requested.
1229 */
1230 if (filemode & FDIRECT) {
1231 if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1232 CRED(), NULL, NULL)) != 0) {
1233 /*
1234 * On Linux, O_DIRECT returns EINVAL when the file
1235 * system does not support directio, so we'll do the
1236 * same.
1237 */
1238 error = EINVAL;
1239 goto out;
1240 }
1241 }
1242 out:
1243 ASSERT(vp->v_count > 0);
1244
1245 if (in_crit) {
1246 nbl_end_crit(vp);
1247 in_crit = 0;
1248 }
1249 if (error) {
1250 if (open_done) {
1251 (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1252 NULL);
1253 open_done = 0;
1254 shrlock_done = 0;
1255 }
1256 if (shrlock_done) {
1257 (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1258 NULL);
1259 shrlock_done = 0;
1260 }
1261
1262 /*
1263 * The following clause was added to handle a problem
1264 * with NFS consistency. It is possible that a lookup
1265 * of the file to be opened succeeded, but the file
1266 * itself doesn't actually exist on the server. This
1267 * is chiefly due to the DNLC containing an entry for
1268 * the file which has been removed on the server. In
1269 * this case, we just start over. If there was some
1270 * other cause for the ESTALE error, then the lookup
1271 * of the file will fail and the error will be returned
1272 * above instead of looping around from here.
1273 */
1274 VN_RELE(vp);
1275 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1276 goto top;
1277 } else
1278 *vpp = vp;
1279 return (error);
1280 }
1281
1282 /*
1283 * The following two accessor functions are for the NFSv4 server. Since there
1284 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1285 * vnode open counts correct when a client "upgrades" an open or does an
1286 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1287 * open mode (add or subtract read or write), but also change the share/deny
1288 * modes. However, share reservations are not integrated with OPEN, yet, so
1289 * we need to handle each separately. These functions are cleaner than having
1290 * the NFS server manipulate the counts directly, however, nobody else should
1291 * use these functions.
1292 */
1293 void
1294 vn_open_upgrade(
1295 vnode_t *vp,
1296 int filemode)
1297 {
1298 ASSERT(vp->v_type == VREG);
1299
1300 if (filemode & FREAD)
1301 atomic_inc_32(&vp->v_rdcnt);
1302 if (filemode & FWRITE)
1303 atomic_inc_32(&vp->v_wrcnt);
1304
1305 }
1306
1307 void
1308 vn_open_downgrade(
1309 vnode_t *vp,
1310 int filemode)
1311 {
1312 ASSERT(vp->v_type == VREG);
1313
1314 if (filemode & FREAD) {
1315 ASSERT(vp->v_rdcnt > 0);
1316 atomic_dec_32(&vp->v_rdcnt);
1317 }
1318 if (filemode & FWRITE) {
1319 ASSERT(vp->v_wrcnt > 0);
1320 atomic_dec_32(&vp->v_wrcnt);
1321 }
1322
1323 }
1324
1325 int
1326 vn_create(
1327 char *pnamep,
1328 enum uio_seg seg,
1329 struct vattr *vap,
1330 enum vcexcl excl,
1331 int mode,
1332 struct vnode **vpp,
1333 enum create why,
1334 int flag,
1335 mode_t umask)
1336 {
1337 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1338 umask, NULL));
1339 }
1340
1341 /*
1342 * Create a vnode (makenode).
1343 */
1344 int
1345 vn_createat(
1346 char *pnamep,
1347 enum uio_seg seg,
1348 struct vattr *vap,
1349 enum vcexcl excl,
1350 int mode,
1351 struct vnode **vpp,
1352 enum create why,
1353 int flag,
1354 mode_t umask,
1355 struct vnode *startvp)
1356 {
1357 struct vnode *dvp; /* ptr to parent dir vnode */
1358 struct vnode *vp = NULL;
1359 struct pathname pn;
1360 int error;
1361 int in_crit = 0;
1362 struct vattr vattr;
1363 enum symfollow follow;
1364 int estale_retry = 0;
1365 uint32_t auditing = AU_AUDITING();
1366
1367 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1368
1369 /* symlink interpretation */
1370 if ((flag & FNOFOLLOW) || excl == EXCL)
1371 follow = NO_FOLLOW;
1372 else
1373 follow = FOLLOW;
1374 flag &= ~(FNOFOLLOW|FNOLINKS);
1375
1376 top:
1377 /*
1378 * Lookup directory.
1379 * If new object is a file, call lower level to create it.
1380 * Note that it is up to the lower level to enforce exclusive
1381 * creation, if the file is already there.
1382 * This allows the lower level to do whatever
1383 * locking or protocol that is needed to prevent races.
1384 * If the new object is directory call lower level to make
1385 * the new directory, with "." and "..".
1386 */
1387 if (error = pn_get(pnamep, seg, &pn))
1388 return (error);
1389 if (auditing)
1390 audit_vncreate_start();
1391 dvp = NULL;
1392 *vpp = NULL;
1393 /*
1394 * lookup will find the parent directory for the vnode.
1395 * When it is done the pn holds the name of the entry
1396 * in the directory.
1397 * If this is a non-exclusive create we also find the node itself.
1398 */
1399 error = lookuppnat(&pn, NULL, follow, &dvp,
1400 (excl == EXCL) ? NULLVPP : vpp, startvp);
1401 if (error) {
1402 pn_free(&pn);
1403 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1404 goto top;
1405 if (why == CRMKDIR && error == EINVAL)
1406 error = EEXIST; /* SVID */
1407 return (error);
1408 }
1409
1410 if (why != CRMKNOD)
1411 vap->va_mode &= ~VSVTX;
1412
1413 /*
1414 * If default ACLs are defined for the directory don't apply the
1415 * umask if umask is passed.
1416 */
1417
1418 if (umask) {
1419
1420 vsecattr_t vsec;
1421
1422 vsec.vsa_aclcnt = 0;
1423 vsec.vsa_aclentp = NULL;
1424 vsec.vsa_dfaclcnt = 0;
1425 vsec.vsa_dfaclentp = NULL;
1426 vsec.vsa_mask = VSA_DFACLCNT;
1427 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1428 /*
1429 * If error is ENOSYS then treat it as no error
1430 * Don't want to force all file systems to support
1431 * aclent_t style of ACL's.
1432 */
1433 if (error == ENOSYS)
1434 error = 0;
1435 if (error) {
1436 if (*vpp != NULL)
1437 VN_RELE(*vpp);
1438 goto out;
1439 } else {
1440 /*
1441 * Apply the umask if no default ACLs.
1442 */
1443 if (vsec.vsa_dfaclcnt == 0)
1444 vap->va_mode &= ~umask;
1445
1446 /*
1447 * VOP_GETSECATTR() may have allocated memory for
1448 * ACLs we didn't request, so double-check and
1449 * free it if necessary.
1450 */
1451 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1452 kmem_free((caddr_t)vsec.vsa_aclentp,
1453 vsec.vsa_aclcnt * sizeof (aclent_t));
1454 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1455 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1456 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1457 }
1458 }
1459
1460 /*
1461 * In general we want to generate EROFS if the file system is
1462 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1463 * documents the open system call, and it says that O_CREAT has no
1464 * effect if the file already exists. Bug 1119649 states
1465 * that open(path, O_CREAT, ...) fails when attempting to open an
1466 * existing file on a read only file system. Thus, the first part
1467 * of the following if statement has 3 checks:
1468 * if the file exists &&
1469 * it is being open with write access &&
1470 * the file system is read only
1471 * then generate EROFS
1472 */
1473 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1474 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1475 if (*vpp)
1476 VN_RELE(*vpp);
1477 error = EROFS;
1478 } else if (excl == NONEXCL && *vpp != NULL) {
1479 vnode_t *rvp;
1480
1481 /*
1482 * File already exists. If a mandatory lock has been
1483 * applied, return error.
1484 */
1485 vp = *vpp;
1486 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1487 rvp = vp;
1488 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1489 nbl_start_crit(vp, RW_READER);
1490 in_crit = 1;
1491 }
1492 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1493 vattr.va_mask = AT_MODE|AT_SIZE;
1494 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1495 goto out;
1496 }
1497 if (MANDLOCK(vp, vattr.va_mode)) {
1498 error = EAGAIN;
1499 goto out;
1500 }
1501 /*
1502 * File cannot be truncated if non-blocking mandatory
1503 * locks are currently on the file.
1504 */
1505 if ((vap->va_mask & AT_SIZE) && in_crit) {
1506 u_offset_t offset;
1507 ssize_t length;
1508
1509 offset = vap->va_size > vattr.va_size ?
1510 vattr.va_size : vap->va_size;
1511 length = vap->va_size > vattr.va_size ?
1512 vap->va_size - vattr.va_size :
1513 vattr.va_size - vap->va_size;
1514 if (nbl_conflict(vp, NBL_WRITE, offset,
1515 length, 0, NULL)) {
1516 error = EACCES;
1517 goto out;
1518 }
1519 }
1520 }
1521
1522 /*
1523 * If the file is the root of a VFS, we've crossed a
1524 * mount point and the "containing" directory that we
1525 * acquired above (dvp) is irrelevant because it's in
1526 * a different file system. We apply VOP_CREATE to the
1527 * target itself instead of to the containing directory
1528 * and supply a null path name to indicate (conventionally)
1529 * the node itself as the "component" of interest.
1530 *
1531 * The call to VOP_CREATE() is necessary to ensure
1532 * that the appropriate permission checks are made,
1533 * i.e. EISDIR, EACCES, etc. We already know that vpp
1534 * exists since we are in the else condition where this
1535 * was checked.
1536 */
1537 if (vp->v_flag & VROOT) {
1538 ASSERT(why != CRMKDIR);
1539 error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1540 CRED(), flag, NULL, NULL);
1541 /*
1542 * If the create succeeded, it will have created a
1543 * new reference on a new vnode (*vpp) in the child
1544 * file system, so we want to drop our reference on
1545 * the old (vp) upon exit.
1546 */
1547 goto out;
1548 }
1549
1550 /*
1551 * Large File API - non-large open (FOFFMAX flag not set)
1552 * of regular file fails if the file size exceeds MAXOFF32_T.
1553 */
1554 if (why != CRMKDIR &&
1555 !(flag & FOFFMAX) &&
1556 (vp->v_type == VREG)) {
1557 vattr.va_mask = AT_SIZE;
1558 if ((error = VOP_GETATTR(vp, &vattr, 0,
1559 CRED(), NULL))) {
1560 goto out;
1561 }
1562 if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1563 error = EOVERFLOW;
1564 goto out;
1565 }
1566 }
1567 }
1568
1569 if (error == 0) {
1570 /*
1571 * Call mkdir() if specified, otherwise create().
1572 */
1573 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1574
1575 if (why == CRMKDIR)
1576 /*
1577 * N.B., if vn_createat() ever requests
1578 * case-insensitive behavior then it will need
1579 * to be passed to VOP_MKDIR(). VOP_CREATE()
1580 * will already get it via "flag"
1581 */
1582 error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1583 NULL, 0, NULL);
1584 else if (!must_be_dir)
1585 error = VOP_CREATE(dvp, pn.pn_path, vap,
1586 excl, mode, vpp, CRED(), flag, NULL, NULL);
1587 else
1588 error = ENOTDIR;
1589 }
1590
1591 out:
1592
1593 if (auditing)
1594 audit_vncreate_finish(*vpp, error);
1595 if (in_crit) {
1596 nbl_end_crit(vp);
1597 in_crit = 0;
1598 }
1599 if (vp != NULL) {
1600 VN_RELE(vp);
1601 vp = NULL;
1602 }
1603 pn_free(&pn);
1604 VN_RELE(dvp);
1605 /*
1606 * The following clause was added to handle a problem
1607 * with NFS consistency. It is possible that a lookup
1608 * of the file to be created succeeded, but the file
1609 * itself doesn't actually exist on the server. This
1610 * is chiefly due to the DNLC containing an entry for
1611 * the file which has been removed on the server. In
1612 * this case, we just start over. If there was some
1613 * other cause for the ESTALE error, then the lookup
1614 * of the file will fail and the error will be returned
1615 * above instead of looping around from here.
1616 */
1617 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1618 goto top;
1619 return (error);
1620 }
1621
1622 int
1623 vn_link(char *from, char *to, enum uio_seg seg)
1624 {
1625 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1626 }
1627
/*
 * Make a hard link named 'to' (resolved relative to tstartvp) referring
 * to the object named 'from' (resolved relative to fstartvp).  'follow'
 * controls symlink interpretation on the source; the target's final
 * component is never followed.  Both names must resolve within the same
 * filesystem (compared by fsid) and the target's filesystem must be
 * writable.  Returns 0 or an errno; retries a bounded number of times
 * on ESTALE.
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Resolve the existing source object. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Resolve the directory that will contain the new link. */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	/* Stale NFS handle from the DNLC: start over a bounded # of times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1687
1688 int
1689 vn_rename(char *from, char *to, enum uio_seg seg)
1690 {
1691 return (vn_renameat(NULL, from, NULL, to, seg));
1692 }
1693
/*
 * Rename 'fname' (resolved relative to fdvp) to 'tname' (resolved
 * relative to tdvp).  The source and target directories must be in the
 * same filesystem (compared by fsid, so loopback mounts work) and the
 * target directory must be writable.  Non-blocking mandatory lock
 * conflicts on the source and on an existing target cause EACCES.
 * Returns 0 or an errno; retries a bounded number of times on ESTALE.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* nonzero while in nbmand crit */
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/*
	 * The rename will remove an existing, distinct target; make sure
	 * no nbmand lock conflicts with that removal.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check the source against nbmand rename conflicts. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	/* Leave crit regions before dropping the corresponding holds. */
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	/* Stale NFS handle from the DNLC: start over a bounded # of times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1833
1834 /*
1835 * Remove a file or directory.
1836 */
1837 int
1838 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1839 {
1840 return (vn_removeat(NULL, fnamep, seg, dirflag));
1841 }
1842
/*
 * Remove the entry named 'fnamep', resolved relative to startvp (or the
 * current directory when startvp is NULL).  dirflag == RMDIRECTORY gives
 * rmdir(2) semantics; otherwise unlink(2) semantics.  If the target is
 * the root of an unlinkable (namefs) mount, the filesystem is unmounted
 * first and the covered vnode is removed instead.  Returns 0 or an
 * errno; retries a bounded number of times on ESTALE.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* nonzero while in an nbmand crit region */
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Remember the type before vp may be released below. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/* Hold the current directory across the VOP_RMDIR. */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	/* Stale NFS handle from the DNLC: start over a bounded # of times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2030
2031 /*
2032 * Utility function to compare equality of vnodes.
2033 * Compare the underlying real vnodes, if there are underlying vnodes.
2034 * This is a more thorough comparison than the VN_CMP() macro provides.
2035 */
2036 int
2037 vn_compare(vnode_t *vp1, vnode_t *vp2)
2038 {
2039 vnode_t *realvp;
2040
2041 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2042 vp1 = realvp;
2043 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2044 vp2 = realvp;
2045 return (VN_CMP(vp1, vp2));
2046 }
2047
2048 /*
2049 * The number of locks to hash into. This value must be a power
2050 * of 2 minus 1 and should probably also be prime.
2051 */
2052 #define NUM_BUCKETS 1023
2053
/*
 * One hash bucket of the vn_vfslocks table: a mutex protecting a
 * singly-linked chain of entries.  The pad grows each bucket to
 * 64 bytes -- presumably one cache line, to avoid false sharing
 * between adjacent buckets; note the matching "#pragma align 64"
 * on the bucket array below.
 */
struct vn_vfslocks_bucket {
	kmutex_t vb_lock;		/* protects vb_list */
	vn_vfslocks_entry_t *vb_list;	/* chain of entries in this bucket */
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};
2059
2060 /*
2061 * Total number of buckets will be NUM_BUCKETS + 1 .
2062 */
2063
2064 #pragma align 64(vn_vfslocks_buckets)
2065 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
2066
2067 #define VN_VFSLOCKS_SHIFT 9
2068
2069 #define VN_VFSLOCKS_HASH(vfsvpptr) \
2070 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2071
2072 /*
2073 * vn_vfslocks_getlock() uses an HASH scheme to generate
2074 * rwstlock using vfs/vnode pointer passed to it.
2075 *
2076 * vn_vfslocks_rele() releases a reference in the
2077 * HASH table which allows the entry allocated by
2078 * vn_vfslocks_getlock() to be freed at a later
2079 * stage when the refcount drops to zero.
2080 */
2081
/*
 * Look up (or create) the lock entry hashed from the given vfs/vnode
 * pointer, taking a new hold (ve_refcnt) on it.  The caller must drop
 * the hold with vn_vfslocks_rele() when done with the returned entry.
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: the entry already exists in this bucket's chain. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	/*
	 * Not found.  Allocate and initialize a candidate entry with the
	 * bucket lock dropped (KM_SLEEP may block), then re-take the lock
	 * and re-scan in case another thread inserted one meanwhile.
	 */
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	/* We won the race (or there was none): link our entry in. */
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2125
/*
 * Drop one hold on a lock entry obtained from vn_vfslocks_getlock().
 * When the last hold is dropped, the entry is unlinked from its hash
 * bucket and freed.  Panics on a negative refcount or if a zero-ref
 * entry cannot be found in its bucket (both indicate corruption).
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	pvep = NULL;
	if (vepent->ve_refcnt == 0) {
		/*
		 * Last hold: walk the bucket chain to find the entry,
		 * unlink it, and destroy it outside the bucket lock.
		 */
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				/* pvep trails vep for the list unlink */
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2164
2165 /*
2166 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2167 * lock protecting the v_vfsmountedhere field.
2168 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2169 * except that it blocks to acquire the lock VVFSLOCK.
2170 *
2171 * traverse() and routines re-implementing part of traverse (e.g. autofs)
2172 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2173 * need the non-blocking version of the writers lock i.e. vn_vfswlock
2174 */
2175 int
2176 vn_vfswlock_wait(vnode_t *vp)
2177 {
2178 int retval;
2179 vn_vfslocks_entry_t *vpvfsentry;
2180 ASSERT(vp != NULL);
2181
2182 vpvfsentry = vn_vfslocks_getlock(vp);
2183 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2184
2185 if (retval == EINTR) {
2186 vn_vfslocks_rele(vpvfsentry);
2187 return (EINTR);
2188 }
2189 return (retval);
2190 }
2191
2192 int
2193 vn_vfsrlock_wait(vnode_t *vp)
2194 {
2195 int retval;
2196 vn_vfslocks_entry_t *vpvfsentry;
2197 ASSERT(vp != NULL);
2198
2199 vpvfsentry = vn_vfslocks_getlock(vp);
2200 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2201
2202 if (retval == EINTR) {
2203 vn_vfslocks_rele(vpvfsentry);
2204 return (EINTR);
2205 }
2206
2207 return (retval);
2208 }
2209
2210
2211 /*
2212 * vn_vfswlock is used to implement a lock which is logically a writers lock
2213 * protecting the v_vfsmountedhere field.
2214 */
2215 int
2216 vn_vfswlock(vnode_t *vp)
2217 {
2218 vn_vfslocks_entry_t *vpvfsentry;
2219
2220 /*
2221 * If vp is NULL then somebody is trying to lock the covered vnode
2222 * of /. (vfs_vnodecovered is NULL for /). This situation will
2223 * only happen when unmounting /. Since that operation will fail
2224 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2225 */
2226 if (vp == NULL)
2227 return (EBUSY);
2228
2229 vpvfsentry = vn_vfslocks_getlock(vp);
2230
2231 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2232 return (0);
2233
2234 vn_vfslocks_rele(vpvfsentry);
2235 return (EBUSY);
2236 }
2237
2238 int
2239 vn_vfsrlock(vnode_t *vp)
2240 {
2241 vn_vfslocks_entry_t *vpvfsentry;
2242
2243 /*
2244 * If vp is NULL then somebody is trying to lock the covered vnode
2245 * of /. (vfs_vnodecovered is NULL for /). This situation will
2246 * only happen when unmounting /. Since that operation will fail
2247 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2248 */
2249 if (vp == NULL)
2250 return (EBUSY);
2251
2252 vpvfsentry = vn_vfslocks_getlock(vp);
2253
2254 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2255 return (0);
2256
2257 vn_vfslocks_rele(vpvfsentry);
2258 return (EBUSY);
2259 }
2260
/*
 * Release the vp/vfs lock taken by vn_vfsrlock/vn_vfswlock (or their
 * _wait variants) and drop the associated hash-entry references.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 *
	 * NOTE(review): the first rele happens before rwst_exit(); the
	 * entry presumably stays valid because the lock holder's own
	 * reference is still outstanding until the second rele.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2278
2279 int
2280 vn_vfswlock_held(vnode_t *vp)
2281 {
2282 int held;
2283 vn_vfslocks_entry_t *vpvfsentry;
2284
2285 ASSERT(vp != NULL);
2286
2287 vpvfsentry = vn_vfslocks_getlock(vp);
2288 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2289
2290 vn_vfslocks_rele(vpvfsentry);
2291 return (held);
2292 }
2293
2294
2295 int
2296 vn_make_ops(
2297 const char *name, /* Name of file system */
2298 const fs_operation_def_t *templ, /* Operation specification */
2299 vnodeops_t **actual) /* Return the vnodeops */
2300 {
2301 int unused_ops;
2302 int error;
2303
2304 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2305
2306 (*actual)->vnop_name = name;
2307
2308 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2309 if (error) {
2310 kmem_free(*actual, sizeof (vnodeops_t));
2311 }
2312
2313 #if DEBUG
2314 if (unused_ops != 0)
2315 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2316 "but not used", name, unused_ops);
2317 #endif
2318
2319 return (error);
2320 }
2321
2322 /*
2323 * Free the vnodeops created as a result of vn_make_ops()
2324 */
2325 void
2326 vn_freevnodeops(vnodeops_t *vnops)
2327 {
2328 kmem_free(vnops, sizeof (vnodeops_t));
2329 }
2330
2331 /*
2332 * Vnode cache.
2333 */
2334
/*
 * kmem cache constructor for vn_cache: initialize the synchronization
 * objects and pointer fields that persist for the lifetime of the
 * cached buffer.  Cannot fail; cdrarg and kmflags are unused.
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = vn_vpath_empty;	/* shared sentinel, never freed */
	vp->v_path_stamp = 0;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	return (0);
}
2356
/*
 * kmem cache destructor for vn_cache: tear down the synchronization
 * objects in the reverse of the order the constructor created them.
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}
2370
/*
 * Create the global vnode kmem cache.  The assertion verifies that
 * VNODE_ALIGN_LOG2 agrees with the vnode size rounded up to
 * VNODE_ALIGN.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2381
/*
 * Destroy the global vnode cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2387
2388 /*
2389 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2390 * cached by the file system and vnodes remain associated.
2391 */
2392 void
2393 vn_recycle(vnode_t *vp)
2394 {
2395 ASSERT(vp->v_pages == NULL);
2396 VERIFY(vp->v_path != NULL);
2397
2398 /*
2399 * XXX - This really belongs in vn_reinit(), but we have some issues
2400 * with the counts. Best to have it here for clean initialization.
2401 */
2402 vp->v_rdcnt = 0;
2403 vp->v_wrcnt = 0;
2404 vp->v_mmap_read = 0;
2405 vp->v_mmap_write = 0;
2406
2407 /*
2408 * If FEM was in use, make sure everything gets cleaned up
2409 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2410 * constructor.
2411 */
2412 if (vp->v_femhead) {
2413 /* XXX - There should be a free_femhead() that does all this */
2414 ASSERT(vp->v_femhead->femh_list == NULL);
2415 mutex_destroy(&vp->v_femhead->femh_lock);
2416 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2417 vp->v_femhead = NULL;
2418 }
2419 if (vp->v_path != vn_vpath_empty) {
2420 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2421 vp->v_path = vn_vpath_empty;
2422 }
2423 vp->v_path_stamp = 0;
2424
2425 if (vp->v_fopdata != NULL) {
2426 free_fopdata(vp);
2427 }
2428 vp->v_mpssdata = NULL;
2429 vsd_free(vp);
2430 }
2431
2432 /*
2433 * Used to reset the vnode fields including those that are directly accessible
2434 * as well as those which require an accessor function.
2435 *
2436 * Does not initialize:
2437 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2438 * v_data (since FS-nodes and vnodes point to each other and should
2439 * be updated simultaneously)
2440 * v_op (in case someone needs to make a VOP call on this object)
2441 */
2442 void
2443 vn_reinit(vnode_t *vp)
2444 {
2445 vp->v_count = 1;
2446 vp->v_count_dnlc = 0;
2447 vp->v_vfsp = NULL;
2448 vp->v_stream = NULL;
2449 vp->v_vfsmountedhere = NULL;
2450 vp->v_flag = 0;
2451 vp->v_type = VNON;
2452 vp->v_rdev = NODEV;
2453
2454 vp->v_filocks = NULL;
2455 vp->v_shrlocks = NULL;
2456 vp->v_pages = NULL;
2457
2458 vp->v_locality = NULL;
2459 vp->v_xattrdir = NULL;
2460
2461 /*
2462 * In a few specific instances, vn_reinit() is used to initialize
2463 * locally defined vnode_t instances. Lacking the construction offered
2464 * by vn_alloc(), these vnodes require v_path initialization.
2465 */
2466 if (vp->v_path == NULL) {
2467 vp->v_path = vn_vpath_empty;
2468 }
2469
2470 /* Handles v_femhead, v_path, and the r/w/map counts */
2471 vn_recycle(vp);
2472 }
2473
2474 vnode_t *
2475 vn_alloc(int kmflag)
2476 {
2477 vnode_t *vp;
2478
2479 vp = kmem_cache_alloc(vn_cache, kmflag);
2480
2481 if (vp != NULL) {
2482 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2483 vp->v_fopdata = NULL;
2484 vn_reinit(vp);
2485 }
2486
2487 return (vp);
2488 }
2489
/*
 * Return a vnode to the cache, first releasing its cached path, FEM
 * state and vnode-specific data.  Counterpart of vn_alloc().
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1. In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2525
2526 /*
2527 * vnode status changes, should define better states than 1, 0.
2528 */
2529 void
2530 vn_reclaim(vnode_t *vp)
2531 {
2532 vfs_t *vfsp = vp->v_vfsp;
2533
2534 if (vfsp == NULL ||
2535 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2536 return;
2537 }
2538 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2539 }
2540
2541 void
2542 vn_idle(vnode_t *vp)
2543 {
2544 vfs_t *vfsp = vp->v_vfsp;
2545
2546 if (vfsp == NULL ||
2547 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2548 return;
2549 }
2550 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2551 }
2552 void
2553 vn_exists(vnode_t *vp)
2554 {
2555 vfs_t *vfsp = vp->v_vfsp;
2556
2557 if (vfsp == NULL ||
2558 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2559 return;
2560 }
2561 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2562 }
2563
2564 void
2565 vn_invalid(vnode_t *vp)
2566 {
2567 vfs_t *vfsp = vp->v_vfsp;
2568
2569 if (vfsp == NULL ||
2570 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2571 return;
2572 }
2573 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2574 }
2575
2576 /* Vnode event notification */
2577
2578 int
2579 vnevent_support(vnode_t *vp, caller_context_t *ct)
2580 {
2581 if (vp == NULL)
2582 return (EINVAL);
2583
2584 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2585 }
2586
2587 void
2588 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2589 {
2590 if (vp == NULL || vp->v_femhead == NULL) {
2591 return;
2592 }
2593 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2594 }
2595
2596 void
2597 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2598 caller_context_t *ct)
2599 {
2600 if (vp == NULL || vp->v_femhead == NULL) {
2601 return;
2602 }
2603 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2604 }
2605
2606 void
2607 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2608 {
2609 if (vp == NULL || vp->v_femhead == NULL) {
2610 return;
2611 }
2612 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2613 }
2614
2615 void
2616 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2617 {
2618 if (vp == NULL || vp->v_femhead == NULL) {
2619 return;
2620 }
2621 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2622 }
2623
2624 void
2625 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2626 {
2627 if (vp == NULL || vp->v_femhead == NULL) {
2628 return;
2629 }
2630 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2631 }
2632
2633 void
2634 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2635 caller_context_t *ct)
2636 {
2637 if (vp == NULL || vp->v_femhead == NULL) {
2638 return;
2639 }
2640 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2641 }
2642
2643 void
2644 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2645 caller_context_t *ct)
2646 {
2647 if (vp == NULL || vp->v_femhead == NULL) {
2648 return;
2649 }
2650 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2651 }
2652
2653 void
2654 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2655 caller_context_t *ct)
2656 {
2657 if (vp == NULL || vp->v_femhead == NULL) {
2658 return;
2659 }
2660 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2661 }
2662
2663 void
2664 vnevent_create(vnode_t *vp, caller_context_t *ct)
2665 {
2666 if (vp == NULL || vp->v_femhead == NULL) {
2667 return;
2668 }
2669 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2670 }
2671
2672 void
2673 vnevent_link(vnode_t *vp, caller_context_t *ct)
2674 {
2675 if (vp == NULL || vp->v_femhead == NULL) {
2676 return;
2677 }
2678 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2679 }
2680
2681 void
2682 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2683 {
2684 if (vp == NULL || vp->v_femhead == NULL) {
2685 return;
2686 }
2687 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2688 }
2689
2690 void
2691 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2692 {
2693 if (vp == NULL || vp->v_femhead == NULL) {
2694 return;
2695 }
2696 (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2697 }
2698
2699 /*
2700 * Vnode accessors.
2701 */
2702
/*
 * Return non-zero if vp belongs to a read-only mounted filesystem.
 * (The value returned is the VFS_RDONLY bit, not necessarily 1.)
 */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}
2708
/* Return non-zero if the vnode has active file locks (v_filocks set). */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}
2714
/*
 * Return non-zero if the vnode has file locks and mandatory locking
 * applies for the given mode (see MANDLOCK()).
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}
2720
/* Return non-zero if the vnode has cached pages (v_pages set). */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}
2726
2727 /*
2728 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2729 * zone_enter(2).
2730 */
2731 int
2732 vn_can_change_zones(vnode_t *vp)
2733 {
2734 struct vfssw *vswp;
2735 int allow = 1;
2736 vnode_t *rvp;
2737
2738 if (nfs_global_client_only != 0)
2739 return (1);
2740
2741 /*
2742 * We always want to look at the underlying vnode if there is one.
2743 */
2744 if (VOP_REALVP(vp, &rvp, NULL) != 0)
2745 rvp = vp;
2746 /*
2747 * Some pseudo filesystems (including doorfs) don't actually register
2748 * their vfsops_t, so the following may return NULL; we happily let
2749 * such vnodes switch zones.
2750 */
2751 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2752 if (vswp != NULL) {
2753 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2754 allow = 0;
2755 vfs_unrefvfssw(vswp);
2756 }
2757 return (allow);
2758 }
2759
2760 /*
2761 * Return nonzero if the vnode is a mount point, zero if not.
2762 */
2763 int
2764 vn_ismntpt(vnode_t *vp)
2765 {
2766 return (vp->v_vfsmountedhere != NULL);
2767 }
2768
/* Retrieve the vfs (if any) mounted on this vnode; NULL if none. */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}
2775
2776 /*
2777 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2778 */
2779 int
2780 vn_in_dnlc(vnode_t *vp)
2781 {
2782 return (vp->v_count_dnlc > 0);
2783 }
2784
2785 /*
2786 * vn_has_other_opens() checks whether a particular file is opened by more than
2787 * just the caller and whether the open is for read and/or write.
2788 * This routine is for calling after the caller has already called VOP_OPEN()
2789 * and the caller wishes to know if they are the only one with it open for
2790 * the mode(s) specified.
2791 *
2792 * Vnode counts are only kept on regular files (v_type=VREG).
2793 */
2794 int
2795 vn_has_other_opens(
2796 vnode_t *vp,
2797 v_mode_t mode)
2798 {
2799
2800 ASSERT(vp != NULL);
2801
2802 switch (mode) {
2803 case V_WRITE:
2804 if (vp->v_wrcnt > 1)
2805 return (V_TRUE);
2806 break;
2807 case V_RDORWR:
2808 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2809 return (V_TRUE);
2810 break;
2811 case V_RDANDWR:
2812 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2813 return (V_TRUE);
2814 break;
2815 case V_READ:
2816 if (vp->v_rdcnt > 1)
2817 return (V_TRUE);
2818 break;
2819 }
2820
2821 return (V_FALSE);
2822 }
2823
2824 /*
2825 * vn_is_opened() checks whether a particular file is opened and
2826 * whether the open is for read and/or write.
2827 *
2828 * Vnode counts are only kept on regular files (v_type=VREG).
2829 */
2830 int
2831 vn_is_opened(
2832 vnode_t *vp,
2833 v_mode_t mode)
2834 {
2835
2836 ASSERT(vp != NULL);
2837
2838 switch (mode) {
2839 case V_WRITE:
2840 if (vp->v_wrcnt)
2841 return (V_TRUE);
2842 break;
2843 case V_RDANDWR:
2844 if (vp->v_rdcnt && vp->v_wrcnt)
2845 return (V_TRUE);
2846 break;
2847 case V_RDORWR:
2848 if (vp->v_rdcnt || vp->v_wrcnt)
2849 return (V_TRUE);
2850 break;
2851 case V_READ:
2852 if (vp->v_rdcnt)
2853 return (V_TRUE);
2854 break;
2855 }
2856
2857 return (V_FALSE);
2858 }
2859
2860 /*
2861 * vn_is_mapped() checks whether a particular file is mapped and whether
2862 * the file is mapped read and/or write.
2863 */
2864 int
2865 vn_is_mapped(
2866 vnode_t *vp,
2867 v_mode_t mode)
2868 {
2869
2870 ASSERT(vp != NULL);
2871
2872 #if !defined(_LP64)
2873 switch (mode) {
2874 /*
2875 * The atomic_add_64_nv functions force atomicity in the
2876 * case of 32 bit architectures. Otherwise the 64 bit values
2877 * require two fetches. The value of the fields may be
2878 * (potentially) changed between the first fetch and the
2879 * second
2880 */
2881 case V_WRITE:
2882 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2883 return (V_TRUE);
2884 break;
2885 case V_RDANDWR:
2886 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2887 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2888 return (V_TRUE);
2889 break;
2890 case V_RDORWR:
2891 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2892 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2893 return (V_TRUE);
2894 break;
2895 case V_READ:
2896 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2897 return (V_TRUE);
2898 break;
2899 }
2900 #else
2901 switch (mode) {
2902 case V_WRITE:
2903 if (vp->v_mmap_write)
2904 return (V_TRUE);
2905 break;
2906 case V_RDANDWR:
2907 if (vp->v_mmap_read && vp->v_mmap_write)
2908 return (V_TRUE);
2909 break;
2910 case V_RDORWR:
2911 if (vp->v_mmap_read || vp->v_mmap_write)
2912 return (V_TRUE);
2913 break;
2914 case V_READ:
2915 if (vp->v_mmap_read)
2916 return (V_TRUE);
2917 break;
2918 }
2919 #endif
2920
2921 return (V_FALSE);
2922 }
2923
2924 /*
2925 * Set the operations vector for a vnode.
2926 *
2927 * FEM ensures that the v_femhead pointer is filled in before the
2928 * v_op pointer is changed. This means that if the v_femhead pointer
2929 * is NULL, and the v_op field hasn't changed since before which checked
2930 * the v_femhead pointer; then our update is ok - we are not racing with
2931 * FEM.
2932 */
2933 void
2934 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2935 {
2936 vnodeops_t *op;
2937
2938 ASSERT(vp != NULL);
2939 ASSERT(vnodeops != NULL);
2940
2941 op = vp->v_op;
2942 membar_consumer();
2943 /*
2944 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2945 * the compare-and-swap on vp->v_op. If either fails, then FEM is
2946 * in effect on the vnode and we need to have FEM deal with it.
2947 */
2948 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2949 op) {
2950 fem_setvnops(vp, vnodeops);
2951 }
2952 }
2953
2954 /*
2955 * Retrieve the operations vector for a vnode
2956 * As with vn_setops(above); make sure we aren't racing with FEM.
2957 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2958 * make sense to the callers of this routine.
2959 */
2960 vnodeops_t *
2961 vn_getops(vnode_t *vp)
2962 {
2963 vnodeops_t *op;
2964
2965 ASSERT(vp != NULL);
2966
2967 op = vp->v_op;
2968 membar_consumer();
2969 if (vp->v_femhead == NULL && op == vp->v_op) {
2970 return (op);
2971 } else {
2972 return (fem_getvnops(vp));
2973 }
2974 }
2975
2976 /*
2977 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2978 * Returns zero (0) if not.
2979 */
2980 int
2981 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2982 {
2983 return (vn_getops(vp) == vnodeops);
2984 }
2985
2986 /*
2987 * Returns non-zero (1) if the specified operation matches the
2988 * corresponding operation for that the vnode.
2989 * Returns zero (0) if not.
2990 */
2991
2992 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2993
2994 int
2995 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2996 {
2997 const fs_operation_trans_def_t *otdp;
2998 fs_generic_func_p *loc = NULL;
2999 vnodeops_t *vop = vn_getops(vp);
3000
3001 ASSERT(vopname != NULL);
3002
3003 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3004 if (MATCHNAME(otdp->name, vopname)) {
3005 loc = (fs_generic_func_p *)
3006 ((char *)(vop) + otdp->offset);
3007 break;
3008 }
3009 }
3010
3011 return ((loc != NULL) && (*loc == funcp));
3012 }
3013
3014 /*
3015 * fs_new_caller_id() needs to return a unique ID on a given local system.
3016 * The IDs do not need to survive across reboots. These are primarily
3017 * used so that (FEM) monitors can detect particular callers (such as
3018 * the NFS server) to a given vnode/vfs operation.
3019 */
3020 u_longlong_t
3021 fs_new_caller_id()
3022 {
3023 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3024
3025 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3026 }
3027
3028 /*
3029 * The value stored in v_path is relative to rootdir, located in the global
3030 * zone. Zones or chroot environments which reside deeper inside the VFS
3031 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3032 * what lies below their perceived root. In order to keep v_path usable for
3033 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3034 *
3035 * An upper bound of max_vnode_path is placed upon v_path allocations to
3036 * prevent the system from going too wild at the behest of pathological
3037 * behavior from the operator.
3038 */
3039 size_t max_vnode_path = 4 * MAXPATHLEN;
3040
3041
/*
 * Conditionally clear the cached v_path on a vnode.  When compare_stamp
 * is non-zero and no longer matches v_path_stamp, a newer path has been
 * installed and the clear is skipped.  The old buffer is freed only
 * after v_lock is dropped.
 */
void
vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
{
	char *buf;

	mutex_enter(&vp->v_lock);
	/*
	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
	 * match the present value on the vnode, it indicates that subsequent
	 * changes have occurred. The v_path value is not cleared in this case
	 * since the new value may be valid.
	 */
	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
		mutex_exit(&vp->v_lock);
		return;
	}
	buf = vp->v_path;
	vp->v_path = vn_vpath_empty;	/* shared sentinel, never freed */
	vp->v_path_stamp = 0;
	mutex_exit(&vp->v_lock);
	if (buf != vn_vpath_empty) {
		kmem_free(buf, strlen(buf) + 1);
	}
}
3066
3067 static void
3068 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3069 boolean_t is_rename)
3070 {
3071 char *buf, *oldbuf;
3072 hrtime_t pstamp;
3073 size_t baselen, buflen = 0;
3074
3075 /* Handle the vn_setpath_str case. */
3076 if (pvp == NULL) {
3077 if (len + 1 > max_vnode_path) {
3078 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3079 vnode_t *, vp, char *, name, size_t, len + 1);
3080 return;
3081 }
3082 buf = kmem_alloc(len + 1, KM_SLEEP);
3083 bcopy(name, buf, len);
3084 buf[len] = '\0';
3085
3086 mutex_enter(&vp->v_lock);
3087 oldbuf = vp->v_path;
3088 vp->v_path = buf;
3089 vp->v_path_stamp = gethrtime();
3090 mutex_exit(&vp->v_lock);
3091 if (oldbuf != vn_vpath_empty) {
3092 kmem_free(oldbuf, strlen(oldbuf) + 1);
3093 }
3094 return;
3095 }
3096
3097 /* Take snapshot of parent dir */
3098 mutex_enter(&pvp->v_lock);
3099
3100 if ((pvp->v_flag & VTRAVERSE) != 0) {
3101 /*
3102 * When the parent vnode has VTRAVERSE set in its flags, normal
3103 * assumptions about v_path calculation no longer apply. The
3104 * primary situation where this occurs is via the VFS tricks
3105 * which procfs plays in order to allow /proc/PID/(root|cwd) to
3106 * yield meaningful results.
3107 *
3108 * When this flag is set, v_path on the child must not be
3109 * updated since the calculated value is likely to be
3110 * incorrect, given the current context.
3111 */
3112 mutex_exit(&pvp->v_lock);
3113 return;
3114 }
3115
3116 retrybuf:
3117 if (pvp->v_path == vn_vpath_empty) {
3118 /*
3119 * Without v_path from the parent directory, generating a child
3120 * path from the name is impossible.
3121 */
3122 if (len > 0) {
3123 pstamp = pvp->v_path_stamp;
3124 mutex_exit(&pvp->v_lock);
3125 vn_clearpath(vp, pstamp);
3126 return;
3127 }
3128
3129 /*
3130 * The only feasible case here is where a NUL lookup is being
3131 * performed on rootdir prior to its v_path being populated.
3132 */
3133 ASSERT(pvp->v_path_stamp == 0);
3134 baselen = 0;
3135 pstamp = 0;
3136 } else {
3137 pstamp = pvp->v_path_stamp;
3138 baselen = strlen(pvp->v_path);
3139 /* ignore a trailing slash if present */
3140 if (pvp->v_path[baselen - 1] == '/') {
3141 /* This should only the be case for rootdir */
3142 ASSERT(baselen == 1 && pvp == rootdir);
3143 baselen--;
3144 }
3145 }
3146 mutex_exit(&pvp->v_lock);
3147
3148 if (buflen != 0) {
3149 /* Free the existing (mis-sized) buffer in case of retry */
3150 kmem_free(buf, buflen);
3151 }
3152 /* base, '/', name and trailing NUL */
3153 buflen = baselen + len + 2;
3154 if (buflen > max_vnode_path) {
3155 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3156 vnode_t *, vp, char *, name, size_t, buflen);
3157 return;
3158 }
3159 buf = kmem_alloc(buflen, KM_SLEEP);
3160
3161 mutex_enter(&pvp->v_lock);
3162 if (pvp->v_path_stamp != pstamp) {
3163 size_t vlen;
3164
3165 /*
3166 * Since v_path_stamp changed on the parent, it is likely that
3167 * v_path has been altered as well. If the length does not
3168 * exactly match what was previously measured, the buffer
3169 * allocation must be repeated for proper sizing.
3170 */
3171 if (pvp->v_path == vn_vpath_empty) {
3172 /* Give up if parent lack v_path */
3173 mutex_exit(&pvp->v_lock);
3174 kmem_free(buf, buflen);
3175 return;
3176 }
3177 vlen = strlen(pvp->v_path);
3178 if (pvp->v_path[vlen - 1] == '/') {
3179 vlen--;
3180 }
3181 if (vlen != baselen) {
3182 goto retrybuf;
3183 }
3184 }
3185 bcopy(pvp->v_path, buf, baselen);
3186 mutex_exit(&pvp->v_lock);
3187
3188 buf[baselen] = '/';
3189 baselen++;
3190 bcopy(name, &buf[baselen], len + 1);
3191
3192 mutex_enter(&vp->v_lock);
3193 if (vp->v_path_stamp == 0) {
3194 /* never-visited vnode can inherit stamp from parent */
3195 ASSERT(vp->v_path == vn_vpath_empty);
3196 vp->v_path_stamp = pstamp;
3197 vp->v_path = buf;
3198 mutex_exit(&vp->v_lock);
3199 } else if (vp->v_path_stamp < pstamp || is_rename) {
3200 /*
3201 * Install the updated path and stamp, ensuring that the v_path
3202 * pointer is valid at all times for dtrace.
3203 */
3204 oldbuf = vp->v_path;
3205 vp->v_path = buf;
3206 vp->v_path_stamp = gethrtime();
3207 mutex_exit(&vp->v_lock);
3208 kmem_free(oldbuf, strlen(oldbuf) + 1);
3209 } else {
3210 /*
3211 * If the timestamp matches or is greater, it means another
3212 * thread performed the update first while locks were dropped
3213 * here to make the allocation. We defer to the newer value.
3214 */
3215 mutex_exit(&vp->v_lock);
3216 kmem_free(buf, buflen);
3217 }
3218 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3219 }
3220
3221 void
3222 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3223 {
3224 size_t len;
3225
3226 /*
3227 * If the parent is older or empty, there's nothing further to do.
3228 */
3229 if (pvp->v_path == vn_vpath_empty ||
3230 pvp->v_path_stamp <= vp->v_path_stamp) {
3231 return;
3232 }
3233
3234 /*
3235 * Given the lack of appropriate context, meaningful updates to v_path
3236 * cannot be made for during lookups for the '.' or '..' entries.
3237 */
3238 len = strlen(name);
3239 if (len == 0 || (len == 1 && name[0] == '.') ||
3240 (len == 2 && name[0] == '.' && name[1] == '.')) {
3241 return;
3242 }
3243
3244 vn_setpath_common(pvp, vp, name, len, B_FALSE);
3245 }
3246
3247 /*
3248 * Given a starting vnode and a path, updates the path in the target vnode in
3249 * a safe manner. If the vnode already has path information embedded, then the
3250 * cached path is left untouched.
3251 */
3252 /* ARGSUSED */
3253 void
3254 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3255 size_t len)
3256 {
3257 vn_setpath_common(pvp, vp, name, len, B_FALSE);
3258 }
3259
3260 /*
3261 * Sets the path to the vnode to be the given string, regardless of current
3262 * context. The string must be a complete path from rootdir. This is only used
3263 * by fsop_root() for setting the path based on the mountpoint.
3264 */
3265 void
3266 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3267 {
3268 vn_setpath_common(NULL, vp, str, len, B_FALSE);
3269 }
3270
3271 /*
3272 * Called from within filesystem's vop_rename() to handle renames once the
3273 * target vnode is available.
3274 */
3275 void
3276 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3277 {
3278 vn_setpath_common(pvp, vp, name, len, B_TRUE);
3279 }
3280
3281 /*
3282 * Similar to vn_setpath_str(), this function sets the path of the destination
3283 * vnode to the be the same as the source vnode.
3284 */
3285 void
3286 vn_copypath(struct vnode *src, struct vnode *dst)
3287 {
3288 char *buf;
3289 hrtime_t stamp;
3290 size_t buflen;
3291
3292 mutex_enter(&src->v_lock);
3293 if (src->v_path == vn_vpath_empty) {
3294 mutex_exit(&src->v_lock);
3295 return;
3296 }
3297 buflen = strlen(src->v_path) + 1;
3298 mutex_exit(&src->v_lock);
3299
3300 buf = kmem_alloc(buflen, KM_SLEEP);
3301
3302 mutex_enter(&src->v_lock);
3303 if (src->v_path == vn_vpath_empty ||
3304 strlen(src->v_path) + 1 != buflen) {
3305 mutex_exit(&src->v_lock);
3306 kmem_free(buf, buflen);
3307 return;
3308 }
3309 bcopy(src->v_path, buf, buflen);
3310 stamp = src->v_path_stamp;
3311 mutex_exit(&src->v_lock);
3312
3313 mutex_enter(&dst->v_lock);
3314 if (dst->v_path != vn_vpath_empty) {
3315 mutex_exit(&dst->v_lock);
3316 kmem_free(buf, buflen);
3317 return;
3318 }
3319 dst->v_path = buf;
3320 dst->v_path_stamp = stamp;
3321 mutex_exit(&dst->v_lock);
3322 }
3323
3324
3325 /*
3326 * XXX Private interface for segvn routines that handle vnode
3327 * large page segments.
3328 *
3329 * return 1 if vp's file system VOP_PAGEIO() implementation
3330 * can be safely used instead of VOP_GETPAGE() for handling
3331 * pagefaults against regular non swap files. VOP_PAGEIO()
3332 * interface is considered safe here if its implementation
3333 * is very close to VOP_GETPAGE() implementation.
3334 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3335 * panic if there're file holes but instead returns an error.
3336 * Doesn't assume file won't be changed by user writes, etc.
3337 *
3338 * return 0 otherwise.
3339 *
3340 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3341 */
3342 int
3343 vn_vmpss_usepageio(vnode_t *vp)
3344 {
3345 vfs_t *vfsp = vp->v_vfsp;
3346 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3347 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3348 char **fsok = pageio_ok_fss;
3349
3350 if (fsname == NULL) {
3351 return (0);
3352 }
3353
3354 for (; *fsok; fsok++) {
3355 if (strcmp(*fsok, fsname) == 0) {
3356 return (1);
3357 }
3358 }
3359 return (0);
3360 }
3361
3362 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3363
/*
 * fop_open() - wrapper invoked via the VOP_OPEN() macro.  Bumps the
 * per-vnode read/write open counts (regular files only) before calling
 * the filesystem's vop_open, then unwinds them on failure or transfers
 * them if vop_open returned a different vnode.
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;	/* saved: vop_open may replace *vpp */

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp) {
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3433
3434 int
3435 fop_close(
3436 vnode_t *vp,
3437 int flag,
3438 int count,
3439 offset_t offset,
3440 cred_t *cr,
3441 caller_context_t *ct)
3442 {
3443 int err;
3444
3445 VOPXID_MAP_CR(vp, cr);
3446
3447 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3448 VOPSTATS_UPDATE(vp, close);
3449 /*
3450 * Check passed in count to handle possible dups. Vnode counts are only
3451 * kept on regular files
3452 */
3453 if ((vp->v_type == VREG) && (count == 1)) {
3454 if (flag & FREAD) {
3455 ASSERT(vp->v_rdcnt > 0);
3456 atomic_dec_32(&vp->v_rdcnt);
3457 }
3458 if (flag & FWRITE) {
3459 ASSERT(vp->v_wrcnt > 0);
3460 atomic_dec_32(&vp->v_wrcnt);
3461 }
3462 }
3463 return (err);
3464 }
3465
3466 int
3467 fop_read(
3468 vnode_t *vp,
3469 uio_t *uiop,
3470 int ioflag,
3471 cred_t *cr,
3472 caller_context_t *ct)
3473 {
3474 int err;
3475 ssize_t resid_start = uiop->uio_resid;
3476
3477 VOPXID_MAP_CR(vp, cr);
3478
3479 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3480 VOPSTATS_UPDATE_IO(vp, read,
3481 read_bytes, (resid_start - uiop->uio_resid));
3482 return (err);
3483 }
3484
3485 int
3486 fop_write(
3487 vnode_t *vp,
3488 uio_t *uiop,
3489 int ioflag,
3490 cred_t *cr,
3491 caller_context_t *ct)
3492 {
3493 int err;
3494 ssize_t resid_start = uiop->uio_resid;
3495
3496 VOPXID_MAP_CR(vp, cr);
3497
3498 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3499 VOPSTATS_UPDATE_IO(vp, write,
3500 write_bytes, (resid_start - uiop->uio_resid));
3501 return (err);
3502 }
3503
3504 int
3505 fop_ioctl(
3506 vnode_t *vp,
3507 int cmd,
3508 intptr_t arg,
3509 int flag,
3510 cred_t *cr,
3511 int *rvalp,
3512 caller_context_t *ct)
3513 {
3514 int err;
3515
3516 VOPXID_MAP_CR(vp, cr);
3517
3518 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3519 VOPSTATS_UPDATE(vp, ioctl);
3520 return (err);
3521 }
3522
3523 int
3524 fop_setfl(
3525 vnode_t *vp,
3526 int oflags,
3527 int nflags,
3528 cred_t *cr,
3529 caller_context_t *ct)
3530 {
3531 int err;
3532
3533 VOPXID_MAP_CR(vp, cr);
3534
3535 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3536 VOPSTATS_UPDATE(vp, setfl);
3537 return (err);
3538 }
3539
3540 int
3541 fop_getattr(
3542 vnode_t *vp,
3543 vattr_t *vap,
3544 int flags,
3545 cred_t *cr,
3546 caller_context_t *ct)
3547 {
3548 int err;
3549
3550 VOPXID_MAP_CR(vp, cr);
3551
3552 /*
3553 * If this file system doesn't understand the xvattr extensions
3554 * then turn off the xvattr bit.
3555 */
3556 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3557 vap->va_mask &= ~AT_XVATTR;
3558 }
3559
3560 /*
3561 * We're only allowed to skip the ACL check iff we used a 32 bit
3562 * ACE mask with VOP_ACCESS() to determine permissions.
3563 */
3564 if ((flags & ATTR_NOACLCHECK) &&
3565 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3566 return (EINVAL);
3567 }
3568 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3569 VOPSTATS_UPDATE(vp, getattr);
3570 return (err);
3571 }
3572
3573 int
3574 fop_setattr(
3575 vnode_t *vp,
3576 vattr_t *vap,
3577 int flags,
3578 cred_t *cr,
3579 caller_context_t *ct)
3580 {
3581 int err;
3582
3583 VOPXID_MAP_CR(vp, cr);
3584
3585 /*
3586 * If this file system doesn't understand the xvattr extensions
3587 * then turn off the xvattr bit.
3588 */
3589 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3590 vap->va_mask &= ~AT_XVATTR;
3591 }
3592
3593 /*
3594 * We're only allowed to skip the ACL check iff we used a 32 bit
3595 * ACE mask with VOP_ACCESS() to determine permissions.
3596 */
3597 if ((flags & ATTR_NOACLCHECK) &&
3598 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3599 return (EINVAL);
3600 }
3601 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3602 VOPSTATS_UPDATE(vp, setattr);
3603 return (err);
3604 }
3605
3606 int
3607 fop_access(
3608 vnode_t *vp,
3609 int mode,
3610 int flags,
3611 cred_t *cr,
3612 caller_context_t *ct)
3613 {
3614 int err;
3615
3616 if ((flags & V_ACE_MASK) &&
3617 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3618 return (EINVAL);
3619 }
3620
3621 VOPXID_MAP_CR(vp, cr);
3622
3623 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3624 VOPSTATS_UPDATE(vp, access);
3625 return (err);
3626 }
3627
3628 int
3629 fop_lookup(
3630 vnode_t *dvp,
3631 char *nm,
3632 vnode_t **vpp,
3633 pathname_t *pnp,
3634 int flags,
3635 vnode_t *rdir,
3636 cred_t *cr,
3637 caller_context_t *ct,
3638 int *deflags, /* Returned per-dirent flags */
3639 pathname_t *ppnp) /* Returned case-preserved name in directory */
3640 {
3641 int ret;
3642
3643 /*
3644 * If this file system doesn't support case-insensitive access
3645 * and said access is requested, fail quickly. It is required
3646 * that if the vfs supports case-insensitive lookup, it also
3647 * supports extended dirent flags.
3648 */
3649 if (flags & FIGNORECASE &&
3650 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3651 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3652 return (EINVAL);
3653
3654 VOPXID_MAP_CR(dvp, cr);
3655
3656 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3657 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3658 } else {
3659 ret = (*(dvp)->v_op->vop_lookup)
3660 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3661 }
3662 if (ret == 0 && *vpp) {
3663 VOPSTATS_UPDATE(*vpp, lookup);
3664 vn_updatepath(dvp, *vpp, nm);
3665 }
3666
3667 return (ret);
3668 }
3669
3670 int
3671 fop_create(
3672 vnode_t *dvp,
3673 char *name,
3674 vattr_t *vap,
3675 vcexcl_t excl,
3676 int mode,
3677 vnode_t **vpp,
3678 cred_t *cr,
3679 int flags,
3680 caller_context_t *ct,
3681 vsecattr_t *vsecp) /* ACL to set during create */
3682 {
3683 int ret;
3684
3685 if (vsecp != NULL &&
3686 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3687 return (EINVAL);
3688 }
3689 /*
3690 * If this file system doesn't support case-insensitive access
3691 * and said access is requested, fail quickly.
3692 */
3693 if (flags & FIGNORECASE &&
3694 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3695 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3696 return (EINVAL);
3697
3698 VOPXID_MAP_CR(dvp, cr);
3699
3700 ret = (*(dvp)->v_op->vop_create)
3701 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3702 if (ret == 0 && *vpp) {
3703 VOPSTATS_UPDATE(*vpp, create);
3704 vn_updatepath(dvp, *vpp, name);
3705 }
3706
3707 return (ret);
3708 }
3709
3710 int
3711 fop_remove(
3712 vnode_t *dvp,
3713 char *nm,
3714 cred_t *cr,
3715 caller_context_t *ct,
3716 int flags)
3717 {
3718 int err;
3719
3720 /*
3721 * If this file system doesn't support case-insensitive access
3722 * and said access is requested, fail quickly.
3723 */
3724 if (flags & FIGNORECASE &&
3725 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3726 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3727 return (EINVAL);
3728
3729 VOPXID_MAP_CR(dvp, cr);
3730
3731 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3732 VOPSTATS_UPDATE(dvp, remove);
3733 return (err);
3734 }
3735
3736 int
3737 fop_link(
3738 vnode_t *tdvp,
3739 vnode_t *svp,
3740 char *tnm,
3741 cred_t *cr,
3742 caller_context_t *ct,
3743 int flags)
3744 {
3745 int err;
3746
3747 /*
3748 * If the target file system doesn't support case-insensitive access
3749 * and said access is requested, fail quickly.
3750 */
3751 if (flags & FIGNORECASE &&
3752 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3753 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3754 return (EINVAL);
3755
3756 VOPXID_MAP_CR(tdvp, cr);
3757
3758 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3759 VOPSTATS_UPDATE(tdvp, link);
3760 return (err);
3761 }
3762
3763 int
3764 fop_rename(
3765 vnode_t *sdvp,
3766 char *snm,
3767 vnode_t *tdvp,
3768 char *tnm,
3769 cred_t *cr,
3770 caller_context_t *ct,
3771 int flags)
3772 {
3773 int err;
3774
3775 /*
3776 * If the file system involved does not support
3777 * case-insensitive access and said access is requested, fail
3778 * quickly.
3779 */
3780 if (flags & FIGNORECASE &&
3781 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3782 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3783 return (EINVAL);
3784
3785 VOPXID_MAP_CR(tdvp, cr);
3786
3787 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3788 VOPSTATS_UPDATE(sdvp, rename);
3789 return (err);
3790 }
3791
3792 int
3793 fop_mkdir(
3794 vnode_t *dvp,
3795 char *dirname,
3796 vattr_t *vap,
3797 vnode_t **vpp,
3798 cred_t *cr,
3799 caller_context_t *ct,
3800 int flags,
3801 vsecattr_t *vsecp) /* ACL to set during create */
3802 {
3803 int ret;
3804
3805 if (vsecp != NULL &&
3806 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3807 return (EINVAL);
3808 }
3809 /*
3810 * If this file system doesn't support case-insensitive access
3811 * and said access is requested, fail quickly.
3812 */
3813 if (flags & FIGNORECASE &&
3814 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3815 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3816 return (EINVAL);
3817
3818 VOPXID_MAP_CR(dvp, cr);
3819
3820 ret = (*(dvp)->v_op->vop_mkdir)
3821 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3822 if (ret == 0 && *vpp) {
3823 VOPSTATS_UPDATE(*vpp, mkdir);
3824 vn_updatepath(dvp, *vpp, dirname);
3825 }
3826
3827 return (ret);
3828 }
3829
3830 int
3831 fop_rmdir(
3832 vnode_t *dvp,
3833 char *nm,
3834 vnode_t *cdir,
3835 cred_t *cr,
3836 caller_context_t *ct,
3837 int flags)
3838 {
3839 int err;
3840
3841 /*
3842 * If this file system doesn't support case-insensitive access
3843 * and said access is requested, fail quickly.
3844 */
3845 if (flags & FIGNORECASE &&
3846 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3847 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3848 return (EINVAL);
3849
3850 VOPXID_MAP_CR(dvp, cr);
3851
3852 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3853 VOPSTATS_UPDATE(dvp, rmdir);
3854 return (err);
3855 }
3856
3857 int
3858 fop_readdir(
3859 vnode_t *vp,
3860 uio_t *uiop,
3861 cred_t *cr,
3862 int *eofp,
3863 caller_context_t *ct,
3864 int flags)
3865 {
3866 int err;
3867 ssize_t resid_start = uiop->uio_resid;
3868
3869 /*
3870 * If this file system doesn't support retrieving directory
3871 * entry flags and said access is requested, fail quickly.
3872 */
3873 if (flags & V_RDDIR_ENTFLAGS &&
3874 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3875 return (EINVAL);
3876
3877 VOPXID_MAP_CR(vp, cr);
3878
3879 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3880 VOPSTATS_UPDATE_IO(vp, readdir,
3881 readdir_bytes, (resid_start - uiop->uio_resid));
3882 return (err);
3883 }
3884
3885 int
3886 fop_symlink(
3887 vnode_t *dvp,
3888 char *linkname,
3889 vattr_t *vap,
3890 char *target,
3891 cred_t *cr,
3892 caller_context_t *ct,
3893 int flags)
3894 {
3895 int err;
3896 xvattr_t xvattr;
3897
3898 /*
3899 * If this file system doesn't support case-insensitive access
3900 * and said access is requested, fail quickly.
3901 */
3902 if (flags & FIGNORECASE &&
3903 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3904 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3905 return (EINVAL);
3906
3907 VOPXID_MAP_CR(dvp, cr);
3908
3909 /* check for reparse point */
3910 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3911 (strncmp(target, FS_REPARSE_TAG_STR,
3912 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3913 if (!fs_reparse_mark(target, vap, &xvattr))
3914 vap = (vattr_t *)&xvattr;
3915 }
3916
3917 err = (*(dvp)->v_op->vop_symlink)
3918 (dvp, linkname, vap, target, cr, ct, flags);
3919 VOPSTATS_UPDATE(dvp, symlink);
3920 return (err);
3921 }
3922
3923 int
3924 fop_readlink(
3925 vnode_t *vp,
3926 uio_t *uiop,
3927 cred_t *cr,
3928 caller_context_t *ct)
3929 {
3930 int err;
3931
3932 VOPXID_MAP_CR(vp, cr);
3933
3934 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3935 VOPSTATS_UPDATE(vp, readlink);
3936 return (err);
3937 }
3938
3939 int
3940 fop_fsync(
3941 vnode_t *vp,
3942 int syncflag,
3943 cred_t *cr,
3944 caller_context_t *ct)
3945 {
3946 int err;
3947
3948 VOPXID_MAP_CR(vp, cr);
3949
3950 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3951 VOPSTATS_UPDATE(vp, fsync);
3952 return (err);
3953 }
3954
/*
 * VOP_INACTIVE() entry point.  Invoked when the last hold on the vnode
 * is released; the file system may free the vnode during the call, so
 * vp must not be referenced afterwards.
 */
void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}
3968
3969 int
3970 fop_fid(
3971 vnode_t *vp,
3972 fid_t *fidp,
3973 caller_context_t *ct)
3974 {
3975 int err;
3976
3977 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3978 VOPSTATS_UPDATE(vp, fid);
3979 return (err);
3980 }
3981
3982 int
3983 fop_rwlock(
3984 vnode_t *vp,
3985 int write_lock,
3986 caller_context_t *ct)
3987 {
3988 int ret;
3989
3990 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3991 VOPSTATS_UPDATE(vp, rwlock);
3992 return (ret);
3993 }
3994
3995 void
3996 fop_rwunlock(
3997 vnode_t *vp,
3998 int write_lock,
3999 caller_context_t *ct)
4000 {
4001 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4002 VOPSTATS_UPDATE(vp, rwunlock);
4003 }
4004
4005 int
4006 fop_seek(
4007 vnode_t *vp,
4008 offset_t ooff,
4009 offset_t *noffp,
4010 caller_context_t *ct)
4011 {
4012 int err;
4013
4014 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4015 VOPSTATS_UPDATE(vp, seek);
4016 return (err);
4017 }
4018
4019 int
4020 fop_cmp(
4021 vnode_t *vp1,
4022 vnode_t *vp2,
4023 caller_context_t *ct)
4024 {
4025 int err;
4026
4027 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4028 VOPSTATS_UPDATE(vp1, cmp);
4029 return (err);
4030 }
4031
4032 int
4033 fop_frlock(
4034 vnode_t *vp,
4035 int cmd,
4036 flock64_t *bfp,
4037 int flag,
4038 offset_t offset,
4039 struct flk_callback *flk_cbp,
4040 cred_t *cr,
4041 caller_context_t *ct)
4042 {
4043 int err;
4044
4045 VOPXID_MAP_CR(vp, cr);
4046
4047 err = (*(vp)->v_op->vop_frlock)
4048 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4049 VOPSTATS_UPDATE(vp, frlock);
4050 return (err);
4051 }
4052
4053 int
4054 fop_space(
4055 vnode_t *vp,
4056 int cmd,
4057 flock64_t *bfp,
4058 int flag,
4059 offset_t offset,
4060 cred_t *cr,
4061 caller_context_t *ct)
4062 {
4063 int err;
4064
4065 VOPXID_MAP_CR(vp, cr);
4066
4067 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4068 VOPSTATS_UPDATE(vp, space);
4069 return (err);
4070 }
4071
4072 int
4073 fop_realvp(
4074 vnode_t *vp,
4075 vnode_t **vpp,
4076 caller_context_t *ct)
4077 {
4078 int err;
4079
4080 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4081 VOPSTATS_UPDATE(vp, realvp);
4082 return (err);
4083 }
4084
4085 int
4086 fop_getpage(
4087 vnode_t *vp,
4088 offset_t off,
4089 size_t len,
4090 uint_t *protp,
4091 page_t **plarr,
4092 size_t plsz,
4093 struct seg *seg,
4094 caddr_t addr,
4095 enum seg_rw rw,
4096 cred_t *cr,
4097 caller_context_t *ct)
4098 {
4099 int err;
4100
4101 VOPXID_MAP_CR(vp, cr);
4102
4103 err = (*(vp)->v_op->vop_getpage)
4104 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4105 VOPSTATS_UPDATE(vp, getpage);
4106 return (err);
4107 }
4108
4109 int
4110 fop_putpage(
4111 vnode_t *vp,
4112 offset_t off,
4113 size_t len,
4114 int flags,
4115 cred_t *cr,
4116 caller_context_t *ct)
4117 {
4118 int err;
4119
4120 VOPXID_MAP_CR(vp, cr);
4121
4122 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4123 VOPSTATS_UPDATE(vp, putpage);
4124 return (err);
4125 }
4126
4127 int
4128 fop_map(
4129 vnode_t *vp,
4130 offset_t off,
4131 struct as *as,
4132 caddr_t *addrp,
4133 size_t len,
4134 uchar_t prot,
4135 uchar_t maxprot,
4136 uint_t flags,
4137 cred_t *cr,
4138 caller_context_t *ct)
4139 {
4140 int err;
4141
4142 VOPXID_MAP_CR(vp, cr);
4143
4144 err = (*(vp)->v_op->vop_map)
4145 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4146 VOPSTATS_UPDATE(vp, map);
4147 return (err);
4148 }
4149
/*
 * VOP_ADDMAP() entry point.  On success, for regular files, accounts the
 * newly mapped pages in v_mmap_read/v_mmap_write so other subsystems can
 * tell whether the file is memory-mapped; fop_delmap() undoes this.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* mapping size in pages (rounded up) */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			/* exec mappings are folded into the read count */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4199
/*
 * VOP_DELMAP() entry point.  For regular files, subtracts the unmapped
 * pages from the v_mmap_read/v_mmap_write accounting established by
 * fop_addmap() — the decrement logic must mirror that function exactly.
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* mapping size in pages (rounded up) */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		/* MAP_PRIVATE was accounted as read in fop_addmap() */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			/* exec mappings were folded into the read count */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4254
4255
4256 int
4257 fop_poll(
4258 vnode_t *vp,
4259 short events,
4260 int anyyet,
4261 short *reventsp,
4262 struct pollhead **phpp,
4263 caller_context_t *ct)
4264 {
4265 int err;
4266
4267 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4268 VOPSTATS_UPDATE(vp, poll);
4269 return (err);
4270 }
4271
4272 int
4273 fop_dump(
4274 vnode_t *vp,
4275 caddr_t addr,
4276 offset_t lbdn,
4277 offset_t dblks,
4278 caller_context_t *ct)
4279 {
4280 int err;
4281
4282 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4283 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4284 return (EIO);
4285
4286 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4287 VOPSTATS_UPDATE(vp, dump);
4288 return (err);
4289 }
4290
4291 int
4292 fop_pathconf(
4293 vnode_t *vp,
4294 int cmd,
4295 ulong_t *valp,
4296 cred_t *cr,
4297 caller_context_t *ct)
4298 {
4299 int err;
4300
4301 VOPXID_MAP_CR(vp, cr);
4302
4303 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4304 VOPSTATS_UPDATE(vp, pathconf);
4305 return (err);
4306 }
4307
4308 int
4309 fop_pageio(
4310 vnode_t *vp,
4311 struct page *pp,
4312 u_offset_t io_off,
4313 size_t io_len,
4314 int flags,
4315 cred_t *cr,
4316 caller_context_t *ct)
4317 {
4318 int err;
4319
4320 VOPXID_MAP_CR(vp, cr);
4321
4322 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4323 VOPSTATS_UPDATE(vp, pageio);
4324 return (err);
4325 }
4326
4327 int
4328 fop_dumpctl(
4329 vnode_t *vp,
4330 int action,
4331 offset_t *blkp,
4332 caller_context_t *ct)
4333 {
4334 int err;
4335 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4336 VOPSTATS_UPDATE(vp, dumpctl);
4337 return (err);
4338 }
4339
/*
 * VOP_DISPOSE() entry point.  The underlying call may release the page
 * (and with it the last reference keeping the vnode around), so the
 * statistics are updated before dispatching.
 */
void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}
4356
4357 int
4358 fop_setsecattr(
4359 vnode_t *vp,
4360 vsecattr_t *vsap,
4361 int flag,
4362 cred_t *cr,
4363 caller_context_t *ct)
4364 {
4365 int err;
4366
4367 VOPXID_MAP_CR(vp, cr);
4368
4369 /*
4370 * We're only allowed to skip the ACL check iff we used a 32 bit
4371 * ACE mask with VOP_ACCESS() to determine permissions.
4372 */
4373 if ((flag & ATTR_NOACLCHECK) &&
4374 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4375 return (EINVAL);
4376 }
4377 err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4378 VOPSTATS_UPDATE(vp, setsecattr);
4379 return (err);
4380 }
4381
4382 int
4383 fop_getsecattr(
4384 vnode_t *vp,
4385 vsecattr_t *vsap,
4386 int flag,
4387 cred_t *cr,
4388 caller_context_t *ct)
4389 {
4390 int err;
4391
4392 /*
4393 * We're only allowed to skip the ACL check iff we used a 32 bit
4394 * ACE mask with VOP_ACCESS() to determine permissions.
4395 */
4396 if ((flag & ATTR_NOACLCHECK) &&
4397 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4398 return (EINVAL);
4399 }
4400
4401 VOPXID_MAP_CR(vp, cr);
4402
4403 err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4404 VOPSTATS_UPDATE(vp, getsecattr);
4405 return (err);
4406 }
4407
4408 int
4409 fop_shrlock(
4410 vnode_t *vp,
4411 int cmd,
4412 struct shrlock *shr,
4413 int flag,
4414 cred_t *cr,
4415 caller_context_t *ct)
4416 {
4417 int err;
4418
4419 VOPXID_MAP_CR(vp, cr);
4420
4421 err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4422 VOPSTATS_UPDATE(vp, shrlock);
4423 return (err);
4424 }
4425
4426 int
4427 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4428 caller_context_t *ct)
4429 {
4430 int err;
4431
4432 err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4433 VOPSTATS_UPDATE(vp, vnevent);
4434 return (err);
4435 }
4436
4437 int
4438 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4439 caller_context_t *ct)
4440 {
4441 int err;
4442
4443 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4444 return (ENOTSUP);
4445 err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4446 VOPSTATS_UPDATE(vp, reqzcbuf);
4447 return (err);
4448 }
4449
4450 int
4451 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4452 {
4453 int err;
4454
4455 if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4456 return (ENOTSUP);
4457 err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4458 VOPSTATS_UPDATE(vp, retzcbuf);
4459 return (err);
4460 }
4461
4462 /*
4463 * Default destructor
4464 * Needed because NULL destructor means that the key is unused
4465 */
4466 /* ARGSUSED */
4467 void
4468 vsd_defaultdestructor(void *value)
4469 {}
4470
4471 /*
4472 * Create a key (index into per vnode array)
4473 * Locks out vsd_create, vsd_destroy, and vsd_free
4474 * May allocate memory with lock held
4475 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int i;
	uint_t nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 * (doubling each time; first allocation starts at one slot)
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 * (keys are 1-based; 0 means "unallocated")
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
4528
4529 /*
4530 * Destroy a key
4531 *
4532 * Assumes that the caller is preventing vsd_set and vsd_get
4533 * Locks out vsd_create, vsd_destroy, and vsd_free
4534 * May free memory with lock held
4535 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;	/* invalidate the caller's key immediately */

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;	/* array slot for this 1-based key */
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}
4584
4585 /*
4586 * Quickly return the per vnode value that was stored with the specified key
4587 * Assumes the caller is protecting key from vsd_create and vsd_destroy
4588 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4589 */
4590 void *
4591 vsd_get(vnode_t *vp, uint_t key)
4592 {
4593 struct vsd_node *vsd;
4594
4595 ASSERT(vp != NULL);
4596 ASSERT(mutex_owned(&vp->v_vsd_lock));
4597
4598 vsd = vp->v_vsd;
4599
4600 if (key && vsd != NULL && key <= vsd->vs_nkeys)
4601 return (vsd->vs_value[key - 1]);
4602 return (NULL);
4603 }
4604
4605 /*
4606 * Set a per vnode value indexed with the specified key
4607 * Assumes the caller is holding v_vsd_lock to protect the vsd.
4608 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);	/* key 0 is never allocated */

	/* lazily allocate the per-vnode VSD node on first use */
	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 * (the array is grown to exactly 'key' slots)
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
4658
4659 /*
4660 * Called from vn_free() to run the destructor function for each vsd
4661 * Locks out vsd_create and vsd_destroy
4662 * Assumes that the destructor *DOES NOT* use vsd
4663 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	/*
	 * vs_nkeys == 0 means no value was ever set, so this node was
	 * never linked onto vsd_list and has no vs_value array to free.
	 */
	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}
4705
4706 /*
4707 * realloc
4708 */
4709 static void *
4710 vsd_realloc(void *old, size_t osize, size_t nsize)
4711 {
4712 void *new;
4713
4714 new = kmem_zalloc(nsize, KM_SLEEP);
4715 if (old) {
4716 bcopy(old, new, osize);
4717 kmem_free(old, osize);
4718 }
4719 return (new);
4720 }
4721
4722 /*
4723 * Setup the extensible system attribute for creating a reparse point.
4724 * The symlink data 'target' is validated for proper format of a reparse
4725 * string and a check also made to make sure the symlink data does not
4726 * point to an existing file.
4727 *
4728 * return 0 if ok else -1.
4729 */
4730 static int
4731 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4732 {
4733 xoptattr_t *xoap;
4734
4735 if ((!target) || (!vap) || (!xvattr))
4736 return (-1);
4737
4738 /* validate reparse string */
4739 if (reparse_validate((const char *)target))
4740 return (-1);
4741
4742 xva_init(xvattr);
4743 xvattr->xva_vattr = *vap;
4744 xvattr->xva_vattr.va_mask |= AT_XVATTR;
4745 xoap = xva_getxoptattr(xvattr);
4746 ASSERT(xoap);
4747 XVA_SET_REQ(xvattr, XAT_REPARSE);
4748 xoap->xoa_reparse = 1;
4749
4750 return (0);
4751 }
4752
4753 /*
4754 * Function to check whether a symlink is a reparse point.
4755 * Return B_TRUE if it is a reparse point, else return B_FALSE
4756 */
4757 boolean_t
4758 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4759 {
4760 xvattr_t xvattr;
4761 xoptattr_t *xoap;
4762
4763 if ((vp->v_type != VLNK) ||
4764 !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4765 return (B_FALSE);
4766
4767 xva_init(&xvattr);
4768 xoap = xva_getxoptattr(&xvattr);
4769 ASSERT(xoap);
4770 XVA_SET_REQ(&xvattr, XAT_REPARSE);
4771
4772 if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4773 return (B_FALSE);
4774
4775 if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4776 (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4777 return (B_FALSE);
4778
4779 return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4780 }