1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/t_lock.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
45 #include <sys/user.h>
46 #include <sys/uio.h>
47 #include <sys/file.h>
48 #include <sys/pathname.h>
49 #include <sys/vfs.h>
50 #include <sys/vfs_opreg.h>
51 #include <sys/vnode.h>
52 #include <sys/rwstlock.h>
53 #include <sys/fem.h>
54 #include <sys/stat.h>
55 #include <sys/mode.h>
56 #include <sys/conf.h>
57 #include <sys/sysmacros.h>
58 #include <sys/cmn_err.h>
59 #include <sys/systm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <c2/audit.h>
63 #include <sys/acl.h>
64 #include <sys/nbmlock.h>
65 #include <sys/fcntl.h>
66 #include <fs/fs_subr.h>
67 #include <sys/taskq.h>
68 #include <fs/fs_reparse.h>
69
70 /* Determine if this vnode is a file that is read-only */
71 #define ISROFILE(vp) \
72 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
73 (vp)->v_type != VFIFO && vn_is_readonly(vp))
74
75 /* Tunable via /etc/system; used only by admin/install */
76 int nfs_global_client_only;
77
78 /*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as the vfssw table and parallels it.  (Arguably, it
 * could be part of the vfssw table.)  Once it's initialized, it's accessed
 * using the same fstype index that is used to index into the vfssw table.
83 */
84 vopstats_t **vopstats_fstype;
85
86 /* vopstats initialization template used for fast initialization via bcopy() */
87 static vopstats_t *vs_templatep;
88
89 /* Kmem cache handle for vsk_anchor_t allocations */
90 kmem_cache_t *vsk_anchor_cache;
91
92 /* file events cleanup routine */
93 extern void free_fopdata(vnode_t *);
94
95 /*
96 * Root of AVL tree for the kstats associated with vopstats. Lock protects
 * updates to vskstat_tree.
98 */
99 avl_tree_t vskstat_tree;
100 kmutex_t vskstat_tree_lock;
101
102 /* Global variable which enables/disables the vopstats collection */
103 int vopstats_enabled = 1;
104
105 /*
106 * forward declarations for internal vnode specific data (vsd)
107 */
108 static void *vsd_realloc(void *, size_t, size_t);
109
110 /*
111 * forward declarations for reparse point functions
112 */
113 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
114
115 /*
116 * VSD -- VNODE SPECIFIC DATA
117 * The v_data pointer is typically used by a file system to store a
118 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
119 * However, there are times when additional project private data needs
120 * to be stored separately from the data (node) pointed to by v_data.
121 * This additional data could be stored by the file system itself or
122 * by a completely different kernel entity. VSD provides a way for
123 * callers to obtain a key and store a pointer to private data associated
124 * with a vnode.
125 *
126 * Callers are responsible for protecting the vsd by holding v_vsd_lock
127 * for calls to vsd_set() and vsd_get().
128 */
129
130 /*
131 * vsd_lock protects:
132 * vsd_nkeys - creation and deletion of vsd keys
133 * vsd_list - insertion and deletion of vsd_node in the vsd_list
134 * vsd_destructor - adding and removing destructors to the list
135 */
136 static kmutex_t vsd_lock;
137 static uint_t vsd_nkeys; /* size of destructor array */
138 /* list of vsd_node's */
139 static list_t *vsd_list = NULL;
140 /* per-key destructor funcs */
141 static void (**vsd_destructor)(void *);
142
143 /*
144 * The following is the common set of actions needed to update the
145 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
146 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
147 * recording of the bytes transferred. Since the code is similar
148 * but small, it is nearly a duplicate. Consequently any changes
149 * to one may need to be reflected in the other.
150 * Rundown of the variables:
151 * vp - Pointer to the vnode
152 * counter - Partial name structure member to update in vopstats for counts
153 * bytecounter - Partial name structure member to update in vopstats for bytes
154 * bytesval - Value to update in vopstats for bytes
155 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
156 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
157 */
158
159 #define VOPSTATS_UPDATE(vp, counter) { \
160 vfs_t *vfsp = (vp)->v_vfsp; \
161 if (vfsp && vfsp->vfs_implp && \
162 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
163 vopstats_t *vsp = &vfsp->vfs_vopstats; \
164 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
165 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
166 size_t, uint64_t *); \
167 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
168 (*stataddr)++; \
169 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
170 vsp->n##counter.value.ui64++; \
171 } \
172 } \
173 }
174
175 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
176 vfs_t *vfsp = (vp)->v_vfsp; \
177 if (vfsp && vfsp->vfs_implp && \
178 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
179 vopstats_t *vsp = &vfsp->vfs_vopstats; \
180 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
181 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
182 size_t, uint64_t *); \
183 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
184 (*stataddr)++; \
185 vsp->bytecounter.value.ui64 += bytesval; \
186 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
187 vsp->n##counter.value.ui64++; \
188 vsp->bytecounter.value.ui64 += bytesval; \
189 } \
190 } \
191 }
192
193 /*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
196 */
197 #define VOPXID_MAP_CR(vp, cr) { \
198 vfs_t *vfsp = (vp)->v_vfsp; \
199 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
200 cr = crgetmapped(cr); \
201 }
202
203 #define VOP_LATENCY_10MS 10000000
204 #define VOP_LATENCY_100MS 100000000
205 #define VOP_LATENCY_1S 1000000000
206
207 /*
208 * Convert stat(2) formats to vnode types and vice versa. (Knows about
209 * numerical order of S_IFMT and vnode types.)
210 */
211 enum vtype iftovt_tab[] = {
212 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
213 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
214 };
215
216 ushort_t vttoif_tab[] = {
217 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
218 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
219 };
220
221 /*
222 * The system vnode cache.
223 */
224
225 kmem_cache_t *vn_cache;
226
227
228 /*
229 * Vnode operations vector.
230 */
231
232 static const fs_operation_trans_def_t vn_ops_table[] = {
233 VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
234 fs_nosys, fs_nosys,
235
236 VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
237 fs_nosys, fs_nosys,
238
239 VOPNAME_READ, offsetof(struct vnodeops, vop_read),
240 fs_nosys, fs_nosys,
241
242 VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
243 fs_nosys, fs_nosys,
244
245 VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
246 fs_nosys, fs_nosys,
247
248 VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
249 fs_setfl, fs_nosys,
250
251 VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
252 fs_nosys, fs_nosys,
253
254 VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
255 fs_nosys, fs_nosys,
256
257 VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
258 fs_nosys, fs_nosys,
259
260 VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
261 fs_nosys, fs_nosys,
262
263 VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
264 fs_nosys, fs_nosys,
265
266 VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
267 fs_nosys, fs_nosys,
268
269 VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
270 fs_nosys, fs_nosys,
271
272 VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
273 fs_nosys, fs_nosys,
274
275 VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
276 fs_nosys, fs_nosys,
277
278 VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
279 fs_nosys, fs_nosys,
280
281 VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
282 fs_nosys, fs_nosys,
283
284 VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
285 fs_nosys, fs_nosys,
286
287 VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
288 fs_nosys, fs_nosys,
289
290 VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
291 fs_nosys, fs_nosys,
292
293 VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
294 fs_nosys, fs_nosys,
295
296 VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
297 fs_nosys, fs_nosys,
298
299 VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
300 fs_rwlock, fs_rwlock,
301
302 VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
303 (fs_generic_func_p) fs_rwunlock,
304 (fs_generic_func_p) fs_rwunlock, /* no errors allowed */
305
306 VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
307 fs_nosys, fs_nosys,
308
309 VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
310 fs_cmp, fs_cmp, /* no errors allowed */
311
312 VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
313 fs_frlock, fs_nosys,
314
315 VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
316 fs_nosys, fs_nosys,
317
318 VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
319 fs_nosys, fs_nosys,
320
321 VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
322 fs_nosys, fs_nosys,
323
324 VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
325 fs_nosys, fs_nosys,
326
327 VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
328 (fs_generic_func_p) fs_nosys_map,
329 (fs_generic_func_p) fs_nosys_map,
330
331 VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
332 (fs_generic_func_p) fs_nosys_addmap,
333 (fs_generic_func_p) fs_nosys_addmap,
334
335 VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
336 fs_nosys, fs_nosys,
337
338 VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
339 (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
340
341 VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
342 fs_nosys, fs_nosys,
343
344 VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
345 fs_pathconf, fs_nosys,
346
347 VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
348 fs_nosys, fs_nosys,
349
350 VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
351 fs_nosys, fs_nosys,
352
353 VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
354 (fs_generic_func_p) fs_dispose,
355 (fs_generic_func_p) fs_nodispose,
356
357 VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
358 fs_nosys, fs_nosys,
359
360 VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
361 fs_fab_acl, fs_nosys,
362
363 VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
364 fs_shrlock, fs_nosys,
365
366 VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
367 (fs_generic_func_p) fs_vnevent_nosupport,
368 (fs_generic_func_p) fs_vnevent_nosupport,
369
370 VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
371 fs_nosys, fs_nosys,
372
373 VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
374 fs_nosys, fs_nosys,
375
376 NULL, 0, NULL, NULL
377 };
378
379 /* Extensible attribute (xva) routines. */
380
381 /*
382 * Zero out the structure, set the size of the requested/returned bitmaps,
383 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
384 * to the returned attributes array.
385 */
386 void
387 xva_init(xvattr_t *xvap)
388 {
389 bzero(xvap, sizeof (xvattr_t));
390 xvap->xva_mapsize = XVA_MAPSIZE;
391 xvap->xva_magic = XVA_MAGIC;
392 xvap->xva_vattr.va_mask = AT_XVATTR;
393 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
394 }
395
396 /*
397 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
398 * structure. Otherwise, returns NULL.
399 */
400 xoptattr_t *
401 xva_getxoptattr(xvattr_t *xvap)
402 {
403 xoptattr_t *xoap = NULL;
404 if (xvap->xva_vattr.va_mask & AT_XVATTR)
405 xoap = &xvap->xva_xoptattrs;
406 return (xoap);
407 }
408
409 /*
410 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 * We compare on the f_fsid reported by VFS_STATVFS(), since that is also
 * what the kstat name is derived from.
413 */
414 static int
415 vska_compar(const void *n1, const void *n2)
416 {
417 int ret;
418 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
419 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
420
421 if (p1 < p2) {
422 ret = -1;
423 } else if (p1 > p2) {
424 ret = 1;
425 } else {
426 ret = 0;
427 }
428
429 return (ret);
430 }
431
432 /*
433 * Used to create a single template which will be bcopy()ed to a newly
434 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
435 */
static vopstats_t *
create_vopstats_template(void)
438 {
439 vopstats_t *vsp;
440
441 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
442 bzero(vsp, sizeof (*vsp)); /* Start fresh */
443
444 /* VOP_OPEN */
445 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
446 /* VOP_CLOSE */
447 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
448 /* VOP_READ I/O */
449 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
450 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
451 /* VOP_WRITE I/O */
452 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
453 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
454 /* VOP_IOCTL */
455 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
456 /* VOP_SETFL */
457 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
458 /* VOP_GETATTR */
459 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
460 /* VOP_SETATTR */
461 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
462 /* VOP_ACCESS */
463 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
464 /* VOP_LOOKUP */
465 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
466 /* VOP_CREATE */
467 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
468 /* VOP_REMOVE */
469 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
470 /* VOP_LINK */
471 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
472 /* VOP_RENAME */
473 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
474 /* VOP_MKDIR */
475 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
476 /* VOP_RMDIR */
477 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
478 /* VOP_READDIR I/O */
479 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
480 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
481 KSTAT_DATA_UINT64);
482 /* VOP_SYMLINK */
483 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
484 /* VOP_READLINK */
485 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
486 /* VOP_FSYNC */
487 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
488 /* VOP_INACTIVE */
489 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
490 /* VOP_FID */
491 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
492 /* VOP_RWLOCK */
493 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
494 /* VOP_RWUNLOCK */
495 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
496 /* VOP_SEEK */
497 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
498 /* VOP_CMP */
499 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
500 /* VOP_FRLOCK */
501 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
502 /* VOP_SPACE */
503 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
504 /* VOP_REALVP */
505 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
506 /* VOP_GETPAGE */
507 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
508 /* VOP_PUTPAGE */
509 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
510 /* VOP_MAP */
511 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
512 /* VOP_ADDMAP */
513 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
514 /* VOP_DELMAP */
515 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
516 /* VOP_POLL */
517 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
518 /* VOP_DUMP */
519 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
520 /* VOP_PATHCONF */
521 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
522 /* VOP_PAGEIO */
523 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
524 /* VOP_DUMPCTL */
525 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
526 /* VOP_DISPOSE */
527 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
528 /* VOP_SETSECATTR */
529 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
530 /* VOP_GETSECATTR */
531 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
532 /* VOP_SHRLOCK */
533 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
534 /* VOP_VNEVENT */
535 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
536 /* VOP_REQZCBUF */
537 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
538 /* VOP_RETZCBUF */
539 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
540
541 return (vsp);
542 }
543
544 /*
545 * Creates a kstat structure associated with a vopstats structure.
546 */
547 kstat_t *
548 new_vskstat(char *ksname, vopstats_t *vsp)
549 {
550 kstat_t *ksp;
551
552 if (!vopstats_enabled) {
553 return (NULL);
554 }
555
556 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
557 sizeof (vopstats_t)/sizeof (kstat_named_t),
558 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
559 if (ksp) {
560 ksp->ks_data = vsp;
561 kstat_install(ksp);
562 }
563
564 return (ksp);
565 }
566
567 /*
568 * Called from vfsinit() to initialize the support mechanisms for vopstats
569 */
void
vopstats_startup(void)
572 {
573 if (!vopstats_enabled)
574 return;
575
576 /*
577 * Creates the AVL tree which holds per-vfs vopstat anchors. This
578 * is necessary since we need to check if a kstat exists before we
579 * attempt to create it. Also, initialize its lock.
580 */
581 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
582 offsetof(vsk_anchor_t, vsk_node));
583 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
584
585 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
586 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
587 NULL, NULL, 0);
588
589 /*
590 * Set up the array of pointers for the vopstats-by-FS-type.
591 * The entries will be allocated/initialized as each file system
592 * goes through modload/mod_installfs.
593 */
594 vopstats_fstype = (vopstats_t **)kmem_zalloc(
595 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
596
597 /* Set up the global vopstats initialization template */
598 vs_templatep = create_vopstats_template();
599 }
600
601 /*
 * We need to have all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) and then bcopy() it over.
606 */
607 void
608 initialize_vopstats(vopstats_t *vsp)
609 {
610 if (vsp == NULL)
611 return;
612
613 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
614 }
615
616 /*
617 * If possible, determine which vopstats by fstype to use and
618 * return a pointer to the caller.
619 */
620 vopstats_t *
621 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
622 {
623 int fstype = 0; /* Index into vfssw[] */
624 vopstats_t *vsp = NULL;
625
626 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
627 !vopstats_enabled)
628 return (NULL);
629 /*
630 * Set up the fstype. We go to so much trouble because all versions
631 * of NFS use the same fstype in their vfs even though they have
632 * distinct entries in the vfssw[] table.
633 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
634 */
635 if (vswp) {
636 fstype = vswp - vfssw; /* Gets us the index */
637 } else {
638 fstype = vfsp->vfs_fstype;
639 }
640
641 /*
642 * Point to the per-fstype vopstats. The only valid values are
643 * non-zero positive values less than the number of vfssw[] table
644 * entries.
645 */
646 if (fstype > 0 && fstype < nfstype) {
647 vsp = vopstats_fstype[fstype];
648 }
649
650 return (vsp);
651 }
652
653 /*
654 * Generate a kstat name, create the kstat structure, and allocate a
655 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
656 * to the caller. This must only be called from a mount.
657 */
658 vsk_anchor_t *
659 get_vskstat_anchor(vfs_t *vfsp)
660 {
661 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
662 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
663 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
664 kstat_t *ksp; /* Ptr to new kstat */
665 avl_index_t where; /* Location in the AVL tree */
666
667 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
668 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
669 return (NULL);
670
671 /* Need to get the fsid to build a kstat name */
672 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
673 /* Create a name for our kstats based on fsid */
674 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
675 VOPSTATS_STR, statvfsbuf.f_fsid);
676
677 /* Allocate and initialize the vsk_anchor_t */
678 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
679 bzero(vskp, sizeof (*vskp));
680 vskp->vsk_fsid = statvfsbuf.f_fsid;
681
682 mutex_enter(&vskstat_tree_lock);
683 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
684 avl_insert(&vskstat_tree, vskp, where);
685 mutex_exit(&vskstat_tree_lock);
686
687 /*
688 * Now that we've got the anchor in the AVL
689 * tree, we can create the kstat.
690 */
691 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
692 if (ksp) {
693 vskp->vsk_ksp = ksp;
694 }
695 } else {
696 /* Oops, found one! Release memory and lock. */
697 mutex_exit(&vskstat_tree_lock);
698 kmem_cache_free(vsk_anchor_cache, vskp);
699 vskp = NULL;
700 }
701 }
702 return (vskp);
703 }
704
705 /*
 * We're in the process of tearing down the vfs and need to clean up
707 * the data structures associated with the vopstats. Must only be called
708 * from dounmount().
709 */
710 void
711 teardown_vopstats(vfs_t *vfsp)
712 {
713 vsk_anchor_t *vskap;
714 avl_index_t where;
715
716 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
717 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
718 return;
719
720 /* This is a safe check since VFS_STATS must be set (see above) */
721 if ((vskap = vfsp->vfs_vskap) == NULL)
722 return;
723
724 /* Whack the pointer right away */
725 vfsp->vfs_vskap = NULL;
726
727 /* Lock the tree, remove the node, and delete the kstat */
728 mutex_enter(&vskstat_tree_lock);
729 if (avl_find(&vskstat_tree, vskap, &where)) {
730 avl_remove(&vskstat_tree, vskap);
731 }
732
733 if (vskap->vsk_ksp) {
734 kstat_delete(vskap->vsk_ksp);
735 }
736 mutex_exit(&vskstat_tree_lock);
737
738 kmem_cache_free(vsk_anchor_cache, vskap);
739 }
740
741 /*
742 * Read or write a vnode. Called from kernel code.
743 */
744 int
745 vn_rdwr(
746 enum uio_rw rw,
747 struct vnode *vp,
748 caddr_t base,
749 ssize_t len,
750 offset_t offset,
751 enum uio_seg seg,
752 int ioflag,
753 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
754 cred_t *cr,
755 ssize_t *residp)
756 {
757 struct uio uio;
758 struct iovec iov;
759 int error;
760 int in_crit = 0;
761
762 if (rw == UIO_WRITE && ISROFILE(vp))
763 return (EROFS);
764
765 if (len < 0)
766 return (EIO);
767
768 VOPXID_MAP_CR(vp, cr);
769
770 iov.iov_base = base;
771 iov.iov_len = len;
772 uio.uio_iov = &iov;
773 uio.uio_iovcnt = 1;
774 uio.uio_loffset = offset;
775 uio.uio_segflg = (short)seg;
776 uio.uio_resid = len;
777 uio.uio_llimit = ulimit;
778
779 /*
780 * We have to enter the critical region before calling VOP_RWLOCK
781 * to avoid a deadlock with ufs.
782 */
783 if (nbl_need_check(vp)) {
784 int svmand;
785
786 nbl_start_crit(vp, RW_READER);
787 in_crit = 1;
788 error = nbl_svmand(vp, cr, &svmand);
789 if (error != 0)
790 goto done;
791 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
792 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
793 error = EACCES;
794 goto done;
795 }
796 }
797
798 (void) VOP_RWLOCK(vp,
799 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
800 if (rw == UIO_WRITE) {
801 uio.uio_fmode = FWRITE;
802 uio.uio_extflg = UIO_COPY_DEFAULT;
803 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
804 } else {
805 uio.uio_fmode = FREAD;
806 uio.uio_extflg = UIO_COPY_CACHED;
807 error = VOP_READ(vp, &uio, ioflag, cr, NULL);
808 }
809 VOP_RWUNLOCK(vp,
810 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
811 if (residp)
812 *residp = uio.uio_resid;
813 else if (uio.uio_resid)
814 error = EIO;
815
816 done:
817 if (in_crit)
818 nbl_end_crit(vp);
819 return (error);
820 }
821
822 /*
823 * Release a vnode. Call VOP_INACTIVE on last reference or
824 * decrement reference count.
825 *
826 * To avoid race conditions, the v_count is left at 1 for
827 * the call to VOP_INACTIVE. This prevents another thread
828 * from reclaiming and releasing the vnode *before* the
829 * VOP_INACTIVE routine has a chance to destroy the vnode.
830 * We can't have more than 1 thread calling VOP_INACTIVE
831 * on a vnode.
832 */
833 void
834 vn_rele(vnode_t *vp)
835 {
836 VERIFY(vp->v_count > 0);
837 mutex_enter(&vp->v_lock);
838 if (vp->v_count == 1) {
839 mutex_exit(&vp->v_lock);
840 VOP_INACTIVE(vp, CRED(), NULL);
841 return;
842 }
843 vp->v_count--;
844 mutex_exit(&vp->v_lock);
845 }
846
847 /*
848 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
849 * as a single reference, so v_count is not decremented until the last DNLC hold
850 * is released. This makes it possible to distinguish vnodes that are referenced
851 * only by the DNLC.
852 */
853 void
854 vn_rele_dnlc(vnode_t *vp)
855 {
856 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
857 mutex_enter(&vp->v_lock);
858 if (--vp->v_count_dnlc == 0) {
859 if (vp->v_count == 1) {
860 mutex_exit(&vp->v_lock);
861 VOP_INACTIVE(vp, CRED(), NULL);
862 return;
863 }
864 vp->v_count--;
865 }
866 mutex_exit(&vp->v_lock);
867 }
868
869 /*
870 * Like vn_rele() except that it clears v_stream under v_lock.
 * This is used by sockfs when it dismantles the association between
 * the sockfs node and the vnode in the underlying file system.
873 * v_lock has to be held to prevent a thread coming through the lookupname
874 * path from accessing a stream head that is going away.
875 */
876 void
877 vn_rele_stream(vnode_t *vp)
878 {
879 VERIFY(vp->v_count > 0);
880 mutex_enter(&vp->v_lock);
881 vp->v_stream = NULL;
882 if (vp->v_count == 1) {
883 mutex_exit(&vp->v_lock);
884 VOP_INACTIVE(vp, CRED(), NULL);
885 return;
886 }
887 vp->v_count--;
888 mutex_exit(&vp->v_lock);
889 }
890
891 static void
892 vn_rele_inactive(vnode_t *vp)
893 {
894 VOP_INACTIVE(vp, CRED(), NULL);
895 }
896
897 /*
898 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
899 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode.  Note, file systems
 * already have to handle the race where the vnode's count is incremented
 * before the inactive routine is called and does its locking.
903 *
904 * Warning: Excessive use of this routine can lead to performance problems.
905 * This is because taskqs throttle back allocation if too many are created.
906 */
907 void
908 vn_rele_async(vnode_t *vp, taskq_t *taskq)
909 {
910 VERIFY(vp->v_count > 0);
911 mutex_enter(&vp->v_lock);
912 if (vp->v_count == 1) {
913 mutex_exit(&vp->v_lock);
914 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
915 vp, TQ_SLEEP) != NULL);
916 return;
917 }
918 vp->v_count--;
919 mutex_exit(&vp->v_lock);
920 }
921
922 int
923 vn_open(
924 char *pnamep,
925 enum uio_seg seg,
926 int filemode,
927 int createmode,
928 struct vnode **vpp,
929 enum create crwhy,
930 mode_t umask)
931 {
932 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
933 umask, NULL, -1));
934 }
935
936
937 /*
938 * Open/create a vnode.
 * This may be called from within the kernel; the only known use of user
 * context is that the current user's credentials are used for permission
 * checks.  crwhy is defined iff filemode & FCREAT.
942 */
943 int
944 vn_openat(
945 char *pnamep,
946 enum uio_seg seg,
947 int filemode,
948 int createmode,
949 struct vnode **vpp,
950 enum create crwhy,
951 mode_t umask,
952 struct vnode *startvp,
953 int fd)
954 {
955 struct vnode *vp;
956 int mode;
957 int accessflags;
958 int error;
959 int in_crit = 0;
960 int open_done = 0;
961 int shrlock_done = 0;
962 struct vattr vattr;
963 enum symfollow follow;
964 int estale_retry = 0;
965 struct shrlock shr;
966 struct shr_locowner shr_own;
967
968 mode = 0;
969 accessflags = 0;
970 if (filemode & FREAD)
971 mode |= VREAD;
972 if (filemode & (FWRITE|FTRUNC))
973 mode |= VWRITE;
974 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
975 mode |= VEXEC;
976
977 /* symlink interpretation */
978 if (filemode & FNOFOLLOW)
979 follow = NO_FOLLOW;
980 else
981 follow = FOLLOW;
982
983 if (filemode & FAPPEND)
984 accessflags |= V_APPEND;
985
986 top:
987 if (filemode & FCREAT) {
988 enum vcexcl excl;
989
990 /*
991 * Wish to create a file.
992 */
993 vattr.va_type = VREG;
994 vattr.va_mode = createmode;
995 vattr.va_mask = AT_TYPE|AT_MODE;
996 if (filemode & FTRUNC) {
997 vattr.va_size = 0;
998 vattr.va_mask |= AT_SIZE;
999 }
1000 if (filemode & FEXCL)
1001 excl = EXCL;
1002 else
1003 excl = NONEXCL;
1004
1005 if (error =
1006 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1007 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1008 return (error);
1009 } else {
1010 /*
1011 * Wish to open a file. Just look it up.
1012 */
1013 if (error = lookupnameat(pnamep, seg, follow,
1014 NULLVPP, &vp, startvp)) {
1015 if ((error == ESTALE) &&
1016 fs_need_estale_retry(estale_retry++))
1017 goto top;
1018 return (error);
1019 }
1020
1021 /*
1022 * Get the attributes to check whether file is large.
1023 * We do this only if the FOFFMAX flag is not set and
1024 * only for regular files.
1025 */
1026
1027 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1028 vattr.va_mask = AT_SIZE;
1029 if ((error = VOP_GETATTR(vp, &vattr, 0,
1030 CRED(), NULL))) {
1031 goto out;
1032 }
1033 if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1034 /*
1035 * Large File API - regular open fails
1036 * if FOFFMAX flag is set in file mode
1037 */
1038 error = EOVERFLOW;
1039 goto out;
1040 }
1041 }
1042 /*
1043 * Can't write directories, active texts, or
1044 * read-only filesystems. Can't truncate files
1045 * on which mandatory locking is in effect.
1046 */
1047 if (filemode & (FWRITE|FTRUNC)) {
1048 /*
1049 * Allow writable directory if VDIROPEN flag is set.
1050 */
1051 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1052 error = EISDIR;
1053 goto out;
1054 }
1055 if (ISROFILE(vp)) {
1056 error = EROFS;
1057 goto out;
1058 }
1059 /*
1060 * Can't truncate files on which
1061 * sysv mandatory locking is in effect.
1062 */
1063 if (filemode & FTRUNC) {
1064 vnode_t *rvp;
1065
1066 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1067 rvp = vp;
1068 if (rvp->v_filocks != NULL) {
1069 vattr.va_mask = AT_MODE;
1070 if ((error = VOP_GETATTR(vp,
1071 &vattr, 0, CRED(), NULL)) == 0 &&
1072 MANDLOCK(vp, vattr.va_mode))
1073 error = EAGAIN;
1074 }
1075 }
1076 if (error)
1077 goto out;
1078 }
1079 /*
1080 * Check permissions.
1081 */
1082 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1083 goto out;
1084 /*
1085 * Require FSEARCH to return a directory.
1086 * Require FEXEC to return a regular file.
1087 */
1088 if ((filemode & FSEARCH) && vp->v_type != VDIR) {
1089 error = ENOTDIR;
1090 goto out;
1091 }
1092 if ((filemode & FEXEC) && vp->v_type != VREG) {
1093 error = ENOEXEC; /* XXX: error code? */
1094 goto out;
1095 }
1096 }
1097
1098 /*
1099 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1100 */
1101 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1102 error = ELOOP;
1103 goto out;
1104 }
1105 if (filemode & FNOLINKS) {
1106 vattr.va_mask = AT_NLINK;
1107 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1108 goto out;
1109 }
1110 if (vattr.va_nlink != 1) {
1111 error = EMLINK;
1112 goto out;
1113 }
1114 }
1115
1116 /*
1117 * Opening a socket corresponding to the AF_UNIX pathname
1118 * in the filesystem name space is not supported.
1119 * However, VSOCK nodes in namefs are supported in order
1120 * to make fattach work for sockets.
1121 *
1122 * XXX This uses VOP_REALVP to distinguish between
1123 * an unopened namefs node (where VOP_REALVP returns a
1124 * different VSOCK vnode) and a VSOCK created by vn_create
1125 * in some file system (where VOP_REALVP would never return
1126 * a different vnode).
1127 */
1128 if (vp->v_type == VSOCK) {
1129 struct vnode *nvp;
1130
1131 error = VOP_REALVP(vp, &nvp, NULL);
1132 if (error != 0 || nvp == NULL || nvp == vp ||
1133 nvp->v_type != VSOCK) {
1134 error = EOPNOTSUPP;
1135 goto out;
1136 }
1137 }
1138
1139 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1140 /* get share reservation */
1141 shr.s_access = 0;
1142 if (filemode & FWRITE)
1143 shr.s_access |= F_WRACC;
1144 if (filemode & FREAD)
1145 shr.s_access |= F_RDACC;
1146 shr.s_deny = 0;
1147 shr.s_sysid = 0;
1148 shr.s_pid = ttoproc(curthread)->p_pid;
1149 shr_own.sl_pid = shr.s_pid;
1150 shr_own.sl_id = fd;
1151 shr.s_own_len = sizeof (shr_own);
1152 shr.s_owner = (caddr_t)&shr_own;
1153 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1154 NULL);
1155 if (error)
1156 goto out;
1157 shrlock_done = 1;
1158
1159 /* nbmand conflict check if truncating file */
1160 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1161 nbl_start_crit(vp, RW_READER);
1162 in_crit = 1;
1163
1164 vattr.va_mask = AT_SIZE;
1165 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1166 goto out;
1167 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1168 NULL)) {
1169 error = EACCES;
1170 goto out;
1171 }
1172 }
1173 }
1174
1175 /*
1176 * Do opening protocol.
1177 */
1178 error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1179 if (error)
1180 goto out;
1181 open_done = 1;
1182
1183 /*
1184 * Truncate if required.
1185 */
1186 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1187 vattr.va_size = 0;
1188 vattr.va_mask = AT_SIZE;
1189 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1190 goto out;
1191 }
1192 out:
1193 ASSERT(vp->v_count > 0);
1194
1195 if (in_crit) {
1196 nbl_end_crit(vp);
1197 in_crit = 0;
1198 }
1199 if (error) {
1200 if (open_done) {
1201 (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1202 NULL);
1203 open_done = 0;
1204 shrlock_done = 0;
1205 }
1206 if (shrlock_done) {
1207 (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1208 NULL);
1209 shrlock_done = 0;
1210 }
1211
1212 /*
1213 * The following clause was added to handle a problem
1214 * with NFS consistency. It is possible that a lookup
1215 * of the file to be opened succeeded, but the file
1216 * itself doesn't actually exist on the server. This
1217 * is chiefly due to the DNLC containing an entry for
1218 * the file which has been removed on the server. In
1219 * this case, we just start over. If there was some
1220 * other cause for the ESTALE error, then the lookup
1221 * of the file will fail and the error will be returned
1222 * above instead of looping around from here.
1223 */
1224 VN_RELE(vp);
1225 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1226 goto top;
1227 } else
1228 *vpp = vp;
1229 return (error);
1230 }
1231
1232 /*
1233 * The following two accessor functions are for the NFSv4 server. Since there
1234 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1235 * vnode open counts correct when a client "upgrades" an open or does an
1236 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1237 * open mode (add or subtract read or write), but also change the share/deny
1238 * modes. However, share reservations are not integrated with OPEN, yet, so
1239 * we need to handle each separately. These functions are cleaner than having
1240 * the NFS server manipulate the counts directly, however, nobody else should
1241 * use these functions.
1242 */
1243 void
1244 vn_open_upgrade(
1245 vnode_t *vp,
1246 int filemode)
1247 {
1248 ASSERT(vp->v_type == VREG);
1249
1250 if (filemode & FREAD)
1251 atomic_inc_32(&vp->v_rdcnt);
1252 if (filemode & FWRITE)
1253 atomic_inc_32(&vp->v_wrcnt);
1254
1255 }
1256
1257 void
1258 vn_open_downgrade(
1259 vnode_t *vp,
1260 int filemode)
1261 {
1262 ASSERT(vp->v_type == VREG);
1263
1264 if (filemode & FREAD) {
1265 ASSERT(vp->v_rdcnt > 0);
1266 atomic_dec_32(&vp->v_rdcnt);
1267 }
1268 if (filemode & FWRITE) {
1269 ASSERT(vp->v_wrcnt > 0);
1270 atomic_dec_32(&vp->v_wrcnt);
1271 }
1272
1273 }
1274
1275 int
1276 vn_create(
1277 char *pnamep,
1278 enum uio_seg seg,
1279 struct vattr *vap,
1280 enum vcexcl excl,
1281 int mode,
1282 struct vnode **vpp,
1283 enum create why,
1284 int flag,
1285 mode_t umask)
1286 {
1287 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1288 umask, NULL));
1289 }
1290
1291 /*
1292 * Create a vnode (makenode).
1293 */
1294 int
1295 vn_createat(
1296 char *pnamep,
1297 enum uio_seg seg,
1298 struct vattr *vap,
1299 enum vcexcl excl,
1300 int mode,
1301 struct vnode **vpp,
1302 enum create why,
1303 int flag,
1304 mode_t umask,
1305 struct vnode *startvp)
1306 {
1307 struct vnode *dvp; /* ptr to parent dir vnode */
1308 struct vnode *vp = NULL;
1309 struct pathname pn;
1310 int error;
1311 int in_crit = 0;
1312 struct vattr vattr;
1313 enum symfollow follow;
1314 int estale_retry = 0;
1315 uint32_t auditing = AU_AUDITING();
1316
1317 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1318
1319 /* symlink interpretation */
1320 if ((flag & FNOFOLLOW) || excl == EXCL)
1321 follow = NO_FOLLOW;
1322 else
1323 follow = FOLLOW;
1324 flag &= ~(FNOFOLLOW|FNOLINKS);
1325
1326 top:
1327 /*
1328 * Lookup directory.
1329 * If new object is a file, call lower level to create it.
1330 * Note that it is up to the lower level to enforce exclusive
1331 * creation, if the file is already there.
1332 * This allows the lower level to do whatever
1333 * locking or protocol that is needed to prevent races.
1334 * If the new object is directory call lower level to make
1335 * the new directory, with "." and "..".
1336 */
1337 if (error = pn_get(pnamep, seg, &pn))
1338 return (error);
1339 if (auditing)
1340 audit_vncreate_start();
1341 dvp = NULL;
1342 *vpp = NULL;
1343 /*
1344 * lookup will find the parent directory for the vnode.
1345 * When it is done the pn holds the name of the entry
1346 * in the directory.
1347 * If this is a non-exclusive create we also find the node itself.
1348 */
1349 error = lookuppnat(&pn, NULL, follow, &dvp,
1350 (excl == EXCL) ? NULLVPP : vpp, startvp);
1351 if (error) {
1352 pn_free(&pn);
1353 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1354 goto top;
1355 if (why == CRMKDIR && error == EINVAL)
1356 error = EEXIST; /* SVID */
1357 return (error);
1358 }
1359
1360 if (why != CRMKNOD)
1361 vap->va_mode &= ~VSVTX;
1362
1363 /*
1364 * If default ACLs are defined for the directory don't apply the
1365 * umask if umask is passed.
1366 */
1367
1368 if (umask) {
1369
1370 vsecattr_t vsec;
1371
1372 vsec.vsa_aclcnt = 0;
1373 vsec.vsa_aclentp = NULL;
1374 vsec.vsa_dfaclcnt = 0;
1375 vsec.vsa_dfaclentp = NULL;
1376 vsec.vsa_mask = VSA_DFACLCNT;
1377 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1378 /*
1379 * If error is ENOSYS then treat it as no error
1380 * Don't want to force all file systems to support
1381 * aclent_t style of ACL's.
1382 */
1383 if (error == ENOSYS)
1384 error = 0;
1385 if (error) {
1386 if (*vpp != NULL)
1387 VN_RELE(*vpp);
1388 goto out;
1389 } else {
1390 /*
1391 * Apply the umask if no default ACLs.
1392 */
1393 if (vsec.vsa_dfaclcnt == 0)
1394 vap->va_mode &= ~umask;
1395
1396 /*
1397 * VOP_GETSECATTR() may have allocated memory for
1398 * ACLs we didn't request, so double-check and
1399 * free it if necessary.
1400 */
1401 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1402 kmem_free((caddr_t)vsec.vsa_aclentp,
1403 vsec.vsa_aclcnt * sizeof (aclent_t));
1404 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1405 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1406 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1407 }
1408 }
1409
1410 /*
1411 * In general we want to generate EROFS if the file system is
1412 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1413 * documents the open system call, and it says that O_CREAT has no
1414 * effect if the file already exists. Bug 1119649 states
1415 * that open(path, O_CREAT, ...) fails when attempting to open an
1416 * existing file on a read only file system. Thus, the first part
1417 * of the following if statement has 3 checks:
1418 * if the file exists &&
1419 * it is being open with write access &&
1420 * the file system is read only
1421 * then generate EROFS
1422 */
1423 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1424 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1425 if (*vpp)
1426 VN_RELE(*vpp);
1427 error = EROFS;
1428 } else if (excl == NONEXCL && *vpp != NULL) {
1429 vnode_t *rvp;
1430
1431 /*
1432 * File already exists. If a mandatory lock has been
1433 * applied, return error.
1434 */
1435 vp = *vpp;
1436 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1437 rvp = vp;
1438 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1439 nbl_start_crit(vp, RW_READER);
1440 in_crit = 1;
1441 }
1442 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1443 vattr.va_mask = AT_MODE|AT_SIZE;
1444 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1445 goto out;
1446 }
1447 if (MANDLOCK(vp, vattr.va_mode)) {
1448 error = EAGAIN;
1449 goto out;
1450 }
1451 /*
1452 * File cannot be truncated if non-blocking mandatory
1453 * locks are currently on the file.
1454 */
1455 if ((vap->va_mask & AT_SIZE) && in_crit) {
1456 u_offset_t offset;
1457 ssize_t length;
1458
1459 offset = vap->va_size > vattr.va_size ?
1460 vattr.va_size : vap->va_size;
1461 length = vap->va_size > vattr.va_size ?
1462 vap->va_size - vattr.va_size :
1463 vattr.va_size - vap->va_size;
1464 if (nbl_conflict(vp, NBL_WRITE, offset,
1465 length, 0, NULL)) {
1466 error = EACCES;
1467 goto out;
1468 }
1469 }
1470 }
1471
1472 /*
1473 * If the file is the root of a VFS, we've crossed a
1474 * mount point and the "containing" directory that we
1475 * acquired above (dvp) is irrelevant because it's in
1476 * a different file system. We apply VOP_CREATE to the
1477 * target itself instead of to the containing directory
1478 * and supply a null path name to indicate (conventionally)
1479 * the node itself as the "component" of interest.
1480 *
1481 * The intercession of the file system is necessary to
1482 * ensure that the appropriate permission checks are
1483 * done.
1484 */
1485 if (vp->v_flag & VROOT) {
1486 ASSERT(why != CRMKDIR);
1487 error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1488 CRED(), flag, NULL, NULL);
1489 /*
1490 * If the create succeeded, it will have created
1491 * a new reference to the vnode. Give up the
1492 * original reference. The assertion should not
1493 * get triggered because NBMAND locks only apply to
1494 * VREG files. And if in_crit is non-zero for some
1495 * reason, detect that here, rather than when we
 * dereference a null vp.
1497 */
1498 ASSERT(in_crit == 0);
1499 VN_RELE(vp);
1500 vp = NULL;
1501 goto out;
1502 }
1503
1504 /*
1505 * Large File API - non-large open (FOFFMAX flag not set)
1506 * of regular file fails if the file size exceeds MAXOFF32_T.
1507 */
1508 if (why != CRMKDIR &&
1509 !(flag & FOFFMAX) &&
1510 (vp->v_type == VREG)) {
1511 vattr.va_mask = AT_SIZE;
1512 if ((error = VOP_GETATTR(vp, &vattr, 0,
1513 CRED(), NULL))) {
1514 goto out;
1515 }
1516 if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1517 error = EOVERFLOW;
1518 goto out;
1519 }
1520 }
1521 }
1522
1523 if (error == 0) {
1524 /*
1525 * Call mkdir() if specified, otherwise create().
1526 */
1527 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1528
1529 if (why == CRMKDIR)
1530 /*
1531 * N.B., if vn_createat() ever requests
1532 * case-insensitive behavior then it will need
1533 * to be passed to VOP_MKDIR(). VOP_CREATE()
1534 * will already get it via "flag"
1535 */
1536 error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1537 NULL, 0, NULL);
1538 else if (!must_be_dir)
1539 error = VOP_CREATE(dvp, pn.pn_path, vap,
1540 excl, mode, vpp, CRED(), flag, NULL, NULL);
1541 else
1542 error = ENOTDIR;
1543 }
1544
1545 out:
1546
1547 if (auditing)
1548 audit_vncreate_finish(*vpp, error);
1549 if (in_crit) {
1550 nbl_end_crit(vp);
1551 in_crit = 0;
1552 }
1553 if (vp != NULL) {
1554 VN_RELE(vp);
1555 vp = NULL;
1556 }
1557 pn_free(&pn);
1558 VN_RELE(dvp);
1559 /*
1560 * The following clause was added to handle a problem
1561 * with NFS consistency. It is possible that a lookup
1562 * of the file to be created succeeded, but the file
1563 * itself doesn't actually exist on the server. This
1564 * is chiefly due to the DNLC containing an entry for
1565 * the file which has been removed on the server. In
1566 * this case, we just start over. If there was some
1567 * other cause for the ESTALE error, then the lookup
1568 * of the file will fail and the error will be returned
1569 * above instead of looping around from here.
1570 */
1571 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1572 goto top;
1573 return (error);
1574 }
1575
1576 int
1577 vn_link(char *from, char *to, enum uio_seg seg)
1578 {
1579 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1580 }
1581
1582 int
1583 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1584 vnode_t *tstartvp, char *to, enum uio_seg seg)
1585 {
1586 struct vnode *fvp; /* from vnode ptr */
1587 struct vnode *tdvp; /* to directory vnode ptr */
1588 struct pathname pn;
1589 int error;
1590 struct vattr vattr;
1591 dev_t fsid;
1592 int estale_retry = 0;
1593 uint32_t auditing = AU_AUDITING();
1594
1595 top:
1596 fvp = tdvp = NULL;
1597 if (error = pn_get(to, seg, &pn))
1598 return (error);
1599 if (auditing && fstartvp != NULL)
1600 audit_setfsat_path(1);
1601 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1602 goto out;
1603 if (auditing && tstartvp != NULL)
1604 audit_setfsat_path(3);
1605 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1606 goto out;
1607 /*
 * Make sure both the source vnode and the target directory vnode are
 * in the same vfs and that the vfs is writable.
1610 */
1611 vattr.va_mask = AT_FSID;
1612 if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1613 goto out;
1614 fsid = vattr.va_fsid;
1615 vattr.va_mask = AT_FSID;
1616 if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1617 goto out;
1618 if (fsid != vattr.va_fsid) {
1619 error = EXDEV;
1620 goto out;
1621 }
1622 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1623 error = EROFS;
1624 goto out;
1625 }
1626 /*
1627 * Do the link.
1628 */
1629 (void) pn_fixslash(&pn);
1630 error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1631 out:
1632 pn_free(&pn);
1633 if (fvp)
1634 VN_RELE(fvp);
1635 if (tdvp)
1636 VN_RELE(tdvp);
1637 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1638 goto top;
1639 return (error);
1640 }
1641
1642 int
1643 vn_rename(char *from, char *to, enum uio_seg seg)
1644 {
1645 return (vn_renameat(NULL, from, NULL, to, seg));
1646 }
1647
1648 int
1649 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1650 char *tname, enum uio_seg seg)
1651 {
1652 int error;
1653 struct vattr vattr;
1654 struct pathname fpn; /* from pathname */
1655 struct pathname tpn; /* to pathname */
1656 dev_t fsid;
1657 int in_crit_src, in_crit_targ;
1658 vnode_t *fromvp, *fvp;
1659 vnode_t *tovp, *targvp;
1660 int estale_retry = 0;
1661 uint32_t auditing = AU_AUDITING();
1662
1663 top:
1664 fvp = fromvp = tovp = targvp = NULL;
1665 in_crit_src = in_crit_targ = 0;
1666 /*
1667 * Get to and from pathnames.
1668 */
1669 if (error = pn_get(fname, seg, &fpn))
1670 return (error);
1671 if (error = pn_get(tname, seg, &tpn)) {
1672 pn_free(&fpn);
1673 return (error);
1674 }
1675
1676 /*
 * First we need to resolve the correct directories.
 * The passed-in directories may only be a starting point,
1679 * but we need the real directories the file(s) live in.
1680 * For example the fname may be something like usr/lib/sparc
1681 * and we were passed in the / directory, but we need to
1682 * use the lib directory for the rename.
1683 */
1684
1685 if (auditing && fdvp != NULL)
1686 audit_setfsat_path(1);
1687 /*
1688 * Lookup to and from directories.
1689 */
1690 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1691 goto out;
1692 }
1693
1694 /*
1695 * Make sure there is an entry.
1696 */
1697 if (fvp == NULL) {
1698 error = ENOENT;
1699 goto out;
1700 }
1701
1702 if (auditing && tdvp != NULL)
1703 audit_setfsat_path(3);
1704 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1705 goto out;
1706 }
1707
1708 /*
1709 * Make sure both the from vnode directory and the to directory
1710 * are in the same vfs and the to directory is writable.
 * We check fsids, not vfs pointers, so loopback fs works.
1712 */
1713 if (fromvp != tovp) {
1714 vattr.va_mask = AT_FSID;
1715 if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1716 goto out;
1717 fsid = vattr.va_fsid;
1718 vattr.va_mask = AT_FSID;
1719 if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1720 goto out;
1721 if (fsid != vattr.va_fsid) {
1722 error = EXDEV;
1723 goto out;
1724 }
1725 }
1726
1727 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1728 error = EROFS;
1729 goto out;
1730 }
1731
1732 if (targvp && (fvp != targvp)) {
1733 nbl_start_crit(targvp, RW_READER);
1734 in_crit_targ = 1;
1735 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1736 error = EACCES;
1737 goto out;
1738 }
1739 }
1740
1741 if (nbl_need_check(fvp)) {
1742 nbl_start_crit(fvp, RW_READER);
1743 in_crit_src = 1;
1744 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1745 error = EACCES;
1746 goto out;
1747 }
1748 }
1749
1750 /*
1751 * Do the rename.
1752 */
1753 (void) pn_fixslash(&tpn);
1754 error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1755 NULL, 0);
1756
1757 out:
1758 pn_free(&fpn);
1759 pn_free(&tpn);
1760 if (in_crit_src)
1761 nbl_end_crit(fvp);
1762 if (in_crit_targ)
1763 nbl_end_crit(targvp);
1764 if (fromvp)
1765 VN_RELE(fromvp);
1766 if (tovp)
1767 VN_RELE(tovp);
1768 if (targvp)
1769 VN_RELE(targvp);
1770 if (fvp)
1771 VN_RELE(fvp);
1772 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1773 goto top;
1774 return (error);
1775 }
1776
1777 /*
1778 * Remove a file or directory.
1779 */
1780 int
1781 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1782 {
1783 return (vn_removeat(NULL, fnamep, seg, dirflag));
1784 }
1785
1786 int
1787 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1788 {
1789 struct vnode *vp; /* entry vnode */
1790 struct vnode *dvp; /* ptr to parent dir vnode */
1791 struct vnode *coveredvp;
1792 struct pathname pn; /* name of entry */
1793 enum vtype vtype;
1794 int error;
1795 struct vfs *vfsp;
1796 struct vfs *dvfsp; /* ptr to parent dir vfs */
1797 int in_crit = 0;
1798 int estale_retry = 0;
1799
1800 top:
1801 if (error = pn_get(fnamep, seg, &pn))
1802 return (error);
1803 dvp = vp = NULL;
1804 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1805 pn_free(&pn);
1806 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1807 goto top;
1808 return (error);
1809 }
1810
1811 /*
1812 * Make sure there is an entry.
1813 */
1814 if (vp == NULL) {
1815 error = ENOENT;
1816 goto out;
1817 }
1818
1819 vfsp = vp->v_vfsp;
1820 dvfsp = dvp->v_vfsp;
1821
1822 /*
1823 * If the named file is the root of a mounted filesystem, fail,
1824 * unless it's marked unlinkable. In that case, unmount the
1825 * filesystem and proceed to unlink the covered vnode. (If the
1826 * covered vnode is a directory, use rmdir instead of unlink,
1827 * to avoid file system corruption.)
1828 */
1829 if (vp->v_flag & VROOT) {
1830 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1831 error = EBUSY;
1832 goto out;
1833 }
1834
1835 /*
1836 * Namefs specific code starts here.
1837 */
1838
1839 if (dirflag == RMDIRECTORY) {
1840 /*
 * The user called rmdir(2) on a file that has
 * had namefs mounted on top of it.  Since
 * namefs doesn't allow directories to be
 * mounted on other files, we know vp is not
 * of type VDIR, so fail the operation.
1846 */
1847 error = ENOTDIR;
1848 goto out;
1849 }
1850
1851 /*
 * If VROOT is still set after grabbing vp->v_lock,
 * no one has finished nm_unmount so far and coveredvp
 * is valid.
1855 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1856 * vp->v_lock, any race window is eliminated.
1857 */
1858
1859 mutex_enter(&vp->v_lock);
1860 if ((vp->v_flag & VROOT) == 0) {
1861 /* Someone beat us to the unmount */
1862 mutex_exit(&vp->v_lock);
1863 error = EBUSY;
1864 goto out;
1865 }
1866 vfsp = vp->v_vfsp;
1867 coveredvp = vfsp->vfs_vnodecovered;
1868 ASSERT(coveredvp);
1869 /*
1870 * Note: Implementation of vn_vfswlock shows that ordering of
1871 * v_lock / vn_vfswlock is not an issue here.
1872 */
1873 error = vn_vfswlock(coveredvp);
1874 mutex_exit(&vp->v_lock);
1875
1876 if (error)
1877 goto out;
1878
1879 VN_HOLD(coveredvp);
1880 VN_RELE(vp);
1881 error = dounmount(vfsp, 0, CRED());
1882
1883 /*
1884 * Unmounted the namefs file system; now get
1885 * the object it was mounted over.
1886 */
1887 vp = coveredvp;
1888 /*
1889 * If namefs was mounted over a directory, then
1890 * we want to use rmdir() instead of unlink().
1891 */
1892 if (vp->v_type == VDIR)
1893 dirflag = RMDIRECTORY;
1894
1895 if (error)
1896 goto out;
1897 }
1898
1899 /*
 * Make sure the filesystem is writable.
1901 * We check the parent directory's vfs in case this is an lofs vnode.
1902 */
1903 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1904 error = EROFS;
1905 goto out;
1906 }
1907
1908 vtype = vp->v_type;
1909
1910 /*
1911 * If there is the possibility of an nbmand share reservation, make
1912 * sure it's okay to remove the file. Keep a reference to the
1913 * vnode, so that we can exit the nbl critical region after
1914 * calling VOP_REMOVE.
1915 * If there is no possibility of an nbmand share reservation,
1916 * release the vnode reference now. Filesystems like NFS may
1917 * behave differently if there is an extra reference, so get rid of
1918 * this one. Fortunately, we can't have nbmand mounts on NFS
1919 * filesystems.
1920 */
1921 if (nbl_need_check(vp)) {
1922 nbl_start_crit(vp, RW_READER);
1923 in_crit = 1;
1924 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1925 error = EACCES;
1926 goto out;
1927 }
1928 } else {
1929 VN_RELE(vp);
1930 vp = NULL;
1931 }
1932
1933 if (dirflag == RMDIRECTORY) {
1934 /*
1935 * Caller is using rmdir(2), which can only be applied to
1936 * directories.
1937 */
1938 if (vtype != VDIR) {
1939 error = ENOTDIR;
1940 } else {
1941 vnode_t *cwd;
1942 proc_t *pp = curproc;
1943
1944 mutex_enter(&pp->p_lock);
1945 cwd = PTOU(pp)->u_cdir;
1946 VN_HOLD(cwd);
1947 mutex_exit(&pp->p_lock);
1948 error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
1949 NULL, 0);
1950 VN_RELE(cwd);
1951 }
1952 } else {
1953 /*
1954 * Unlink(2) can be applied to anything.
1955 */
1956 error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
1957 }
1958
1959 out:
1960 pn_free(&pn);
1961 if (in_crit) {
1962 nbl_end_crit(vp);
1963 in_crit = 0;
1964 }
1965 if (vp != NULL)
1966 VN_RELE(vp);
1967 if (dvp != NULL)
1968 VN_RELE(dvp);
1969 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1970 goto top;
1971 return (error);
1972 }
1973
1974 /*
1975 * Utility function to compare equality of vnodes.
1976 * Compare the underlying real vnodes, if there are underlying vnodes.
1977 * This is a more thorough comparison than the VN_CMP() macro provides.
1978 */
1979 int
1980 vn_compare(vnode_t *vp1, vnode_t *vp2)
1981 {
1982 vnode_t *realvp;
1983
1984 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1985 vp1 = realvp;
1986 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1987 vp2 = realvp;
1988 return (VN_CMP(vp1, vp2));
1989 }
1990
1991 /*
 * The number of buckets to hash the locks into. This value must be a power
 * of 2 minus 1 (it is used as a mask) and should probably also be prime.
1994 */
1995 #define NUM_BUCKETS 1023
1996
1997 struct vn_vfslocks_bucket {
1998 kmutex_t vb_lock;
1999 vn_vfslocks_entry_t *vb_list;
2000 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2001 };
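
/*
 * Note: the pad above keeps each bucket at 64 bytes, presumably so that a
 * bucket (and its vb_lock) occupies its own cache line and adjacent
 * buckets do not false-share.
 */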
2002
2003 /*
 * Total number of buckets will be NUM_BUCKETS + 1.
2005 */
2006
2007 #pragma align 64(vn_vfslocks_buckets)
2008 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
2009
2010 #define VN_VFSLOCKS_SHIFT 9
2011
2012 #define VN_VFSLOCKS_HASH(vfsvpptr) \
2013 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
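
/*
 * For illustration (hypothetical address): with VN_VFSLOCKS_SHIFT of 9 and
 * NUM_BUCKETS of 1023 (0x3ff), a vnode at address 0x30001234600 hashes to
 *
 *	(0x30001234600 >> 9) & 0x3ff == 0x1a3	(bucket 419)
 *
 * Discarding the low 9 bits ignores offsets within a vnode/vfs-sized
 * allocation, so nearby objects still spread across buckets.
 */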
2014
2015 /*
 * vn_vfslocks_getlock() uses a hash scheme to find (or create) the
 * rwstlock associated with the vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
2021 * vn_vfslocks_getlock() to be freed at a later
2022 * stage when the refcount drops to zero.
2023 */
2024
2025 vn_vfslocks_entry_t *
2026 vn_vfslocks_getlock(void *vfsvpptr)
2027 {
2028 struct vn_vfslocks_bucket *bp;
2029 vn_vfslocks_entry_t *vep;
2030 vn_vfslocks_entry_t *tvep;
2031
2032 ASSERT(vfsvpptr != NULL);
2033 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2034
2035 mutex_enter(&bp->vb_lock);
2036 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2037 if (vep->ve_vpvfs == vfsvpptr) {
2038 vep->ve_refcnt++;
2039 mutex_exit(&bp->vb_lock);
2040 return (vep);
2041 }
2042 }
2043 mutex_exit(&bp->vb_lock);
2044 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2045 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2046 vep->ve_vpvfs = (char *)vfsvpptr;
2047 vep->ve_refcnt = 1;
2048 mutex_enter(&bp->vb_lock);
2049 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2050 if (tvep->ve_vpvfs == vfsvpptr) {
2051 tvep->ve_refcnt++;
2052 mutex_exit(&bp->vb_lock);
2053
2054 /*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
2057 */
2058 rwst_destroy(&vep->ve_lock);
2059 kmem_free(vep, sizeof (*vep));
2060 return (tvep);
2061 }
2062 }
2063 vep->ve_next = bp->vb_list;
2064 bp->vb_list = vep;
2065 mutex_exit(&bp->vb_lock);
2066 return (vep);
2067 }
2068
2069 void
2070 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2071 {
2072 struct vn_vfslocks_bucket *bp;
2073 vn_vfslocks_entry_t *vep;
2074 vn_vfslocks_entry_t *pvep;
2075
2076 ASSERT(vepent != NULL);
2077 ASSERT(vepent->ve_vpvfs != NULL);
2078
2079 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2080
2081 mutex_enter(&bp->vb_lock);
2082 vepent->ve_refcnt--;
2083
2084 if ((int32_t)vepent->ve_refcnt < 0)
2085 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2086
2087 if (vepent->ve_refcnt == 0) {
2088 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2089 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2090 if (bp->vb_list == vep)
2091 bp->vb_list = vep->ve_next;
2092 else {
2093 /* LINTED */
2094 pvep->ve_next = vep->ve_next;
2095 }
2096 mutex_exit(&bp->vb_lock);
2097 rwst_destroy(&vep->ve_lock);
2098 kmem_free(vep, sizeof (*vep));
2099 return;
2100 }
2101 pvep = vep;
2102 }
2103 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2104 }
2105 mutex_exit(&bp->vb_lock);
2106 }
2107
2108 /*
2109 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2110 * lock protecting the v_vfsmountedhere field.
2111 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2112 * except that it blocks to acquire the lock VVFSLOCK.
2113 *
2114 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock, i.e., vn_vfswlock().
2117 */
2118 int
2119 vn_vfswlock_wait(vnode_t *vp)
2120 {
2121 int retval;
2122 vn_vfslocks_entry_t *vpvfsentry;
2123 ASSERT(vp != NULL);
2124
2125 vpvfsentry = vn_vfslocks_getlock(vp);
2126 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2127
2128 if (retval == EINTR) {
2129 vn_vfslocks_rele(vpvfsentry);
2130 return (EINTR);
2131 }
2132 return (retval);
2133 }
2134
2135 int
2136 vn_vfsrlock_wait(vnode_t *vp)
2137 {
2138 int retval;
2139 vn_vfslocks_entry_t *vpvfsentry;
2140 ASSERT(vp != NULL);
2141
2142 vpvfsentry = vn_vfslocks_getlock(vp);
2143 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2144
2145 if (retval == EINTR) {
2146 vn_vfslocks_rele(vpvfsentry);
2147 return (EINTR);
2148 }
2149
2150 return (retval);
}

/*
2155 * vn_vfswlock is used to implement a lock which is logically a writers lock
2156 * protecting the v_vfsmountedhere field.
2157 */
2158 int
2159 vn_vfswlock(vnode_t *vp)
2160 {
2161 vn_vfslocks_entry_t *vpvfsentry;
2162
2163 /*
2164 * If vp is NULL then somebody is trying to lock the covered vnode
2165 * of /. (vfs_vnodecovered is NULL for /). This situation will
2166 * only happen when unmounting /. Since that operation will fail
2167 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2168 */
2169 if (vp == NULL)
2170 return (EBUSY);
2171
2172 vpvfsentry = vn_vfslocks_getlock(vp);
2173
2174 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2175 return (0);
2176
2177 vn_vfslocks_rele(vpvfsentry);
2178 return (EBUSY);
2179 }
2180
2181 int
2182 vn_vfsrlock(vnode_t *vp)
2183 {
2184 vn_vfslocks_entry_t *vpvfsentry;
2185
2186 /*
2187 * If vp is NULL then somebody is trying to lock the covered vnode
2188 * of /. (vfs_vnodecovered is NULL for /). This situation will
2189 * only happen when unmounting /. Since that operation will fail
2190 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2191 */
2192 if (vp == NULL)
2193 return (EBUSY);
2194
2195 vpvfsentry = vn_vfslocks_getlock(vp);
2196
2197 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2198 return (0);
2199
2200 vn_vfslocks_rele(vpvfsentry);
2201 return (EBUSY);
2202 }
2203
2204 void
2205 vn_vfsunlock(vnode_t *vp)
2206 {
2207 vn_vfslocks_entry_t *vpvfsentry;
2208
2209 /*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release the reference taken by vn_vfslocks_getlock()
	 * 2. To release the reference taken by the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc.
2214 */
2215 vpvfsentry = vn_vfslocks_getlock(vp);
2216 vn_vfslocks_rele(vpvfsentry);
2217
2218 rwst_exit(&vpvfsentry->ve_lock);
2219 vn_vfslocks_rele(vpvfsentry);
2220 }
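
/*
 * A minimal sketch of the expected pairing (hypothetical caller): each
 * successful vn_vfswlock()/vn_vfsrlock() leaves one reference on the hash
 * entry, which vn_vfsunlock() drops along with the reference taken by its
 * own vn_vfslocks_getlock() call:
 *
 *	if (vn_vfswlock(vp) == 0) {
 *		... examine or update vp->v_vfsmountedhere ...
 *		vn_vfsunlock(vp);
 *	}
 */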
2221
2222 int
2223 vn_vfswlock_held(vnode_t *vp)
2224 {
2225 int held;
2226 vn_vfslocks_entry_t *vpvfsentry;
2227
2228 ASSERT(vp != NULL);
2229
2230 vpvfsentry = vn_vfslocks_getlock(vp);
2231 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2232
2233 vn_vfslocks_rele(vpvfsentry);
2234 return (held);
}

int
2239 vn_make_ops(
2240 const char *name, /* Name of file system */
2241 const fs_operation_def_t *templ, /* Operation specification */
2242 vnodeops_t **actual) /* Return the vnodeops */
2243 {
2244 int unused_ops;
2245 int error;
2246
2247 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2248
2249 (*actual)->vnop_name = name;
2250
2251 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2252 if (error) {
2253 kmem_free(*actual, sizeof (vnodeops_t));
2254 }
2255
2256 #if DEBUG
2257 if (unused_ops != 0)
2258 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2259 "but not used", name, unused_ops);
2260 #endif
2261
2262 return (error);
2263 }
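
/*
 * A minimal sketch of typical use (hypothetical "myfs" names; error
 * handling elided). Operations absent from the template are given
 * defaults by fs_build_vector():
 *
 *	static const fs_operation_def_t myfs_vnodeops_template[] = {
 *		VOPNAME_OPEN,	{ .vop_open = myfs_open },
 *		VOPNAME_CLOSE,	{ .vop_close = myfs_close },
 *		NULL,		NULL
 *	};
 *
 *	(void) vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops);
 */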
2264
2265 /*
2266 * Free the vnodeops created as a result of vn_make_ops()
2267 */
2268 void
2269 vn_freevnodeops(vnodeops_t *vnops)
2270 {
2271 kmem_free(vnops, sizeof (vnodeops_t));
2272 }
2273
2274 /*
2275 * Vnode cache.
2276 */
2277
2278 /* ARGSUSED */
2279 static int
2280 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2281 {
2282 struct vnode *vp;
2283
2284 vp = buf;
2285
2286 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2287 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2288 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2289 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2290 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2291 vp->v_path = NULL;
2292 vp->v_mpssdata = NULL;
2293 vp->v_vsd = NULL;
2294 vp->v_fopdata = NULL;
2295
2296 return (0);
2297 }
2298
2299 /* ARGSUSED */
2300 static void
2301 vn_cache_destructor(void *buf, void *cdrarg)
2302 {
2303 struct vnode *vp;
2304
2305 vp = buf;
2306
2307 rw_destroy(&vp->v_nbllock);
2308 cv_destroy(&vp->v_cv);
2309 mutex_destroy(&vp->v_vsd_lock);
2310 mutex_destroy(&vp->v_lock);
2311 }
2312
2313 void
2314 vn_create_cache(void)
2315 {
2316 /* LINTED */
2317 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2318 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2319 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2320 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2321 NULL, 0);
2322 }
2323
2324 void
2325 vn_destroy_cache(void)
2326 {
2327 kmem_cache_destroy(vn_cache);
2328 }
2329
2330 /*
2331 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2332 * cached by the file system and vnodes remain associated.
2333 */
2334 void
2335 vn_recycle(vnode_t *vp)
2336 {
2337 ASSERT(vp->v_pages == NULL);
2338
2339 /*
2340 * XXX - This really belongs in vn_reinit(), but we have some issues
2341 * with the counts. Best to have it here for clean initialization.
2342 */
2343 vp->v_rdcnt = 0;
2344 vp->v_wrcnt = 0;
2345 vp->v_mmap_read = 0;
2346 vp->v_mmap_write = 0;
2347
2348 /*
2349 * If FEM was in use, make sure everything gets cleaned up
2350 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2351 * constructor.
2352 */
2353 if (vp->v_femhead) {
2354 /* XXX - There should be a free_femhead() that does all this */
2355 ASSERT(vp->v_femhead->femh_list == NULL);
2356 mutex_destroy(&vp->v_femhead->femh_lock);
2357 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2358 vp->v_femhead = NULL;
2359 }
2360 if (vp->v_path) {
2361 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2362 vp->v_path = NULL;
2363 }
2364
2365 if (vp->v_fopdata != NULL) {
2366 free_fopdata(vp);
2367 }
2368 vp->v_mpssdata = NULL;
2369 vsd_free(vp);
2370 }
2371
2372 /*
2373 * Used to reset the vnode fields including those that are directly accessible
2374 * as well as those which require an accessor function.
2375 *
2376 * Does not initialize:
2377 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2378 * v_data (since FS-nodes and vnodes point to each other and should
2379 * be updated simultaneously)
2380 * v_op (in case someone needs to make a VOP call on this object)
2381 */
2382 void
2383 vn_reinit(vnode_t *vp)
2384 {
2385 vp->v_count = 1;
2386 vp->v_count_dnlc = 0;
2387 vp->v_vfsp = NULL;
2388 vp->v_stream = NULL;
2389 vp->v_vfsmountedhere = NULL;
2390 vp->v_flag = 0;
2391 vp->v_type = VNON;
2392 vp->v_rdev = NODEV;
2393
2394 vp->v_filocks = NULL;
2395 vp->v_shrlocks = NULL;
2396 vp->v_pages = NULL;
2397
2398 vp->v_locality = NULL;
2399 vp->v_xattrdir = NULL;
2400
2401 /* Handles v_femhead, v_path, and the r/w/map counts */
2402 vn_recycle(vp);
2403 }
2404
2405 vnode_t *
2406 vn_alloc(int kmflag)
2407 {
2408 vnode_t *vp;
2409
2410 vp = kmem_cache_alloc(vn_cache, kmflag);
2411
2412 if (vp != NULL) {
2413 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2414 vp->v_fopdata = NULL;
2415 vn_reinit(vp);
2416 }
2417
2418 return (vp);
2419 }
2420
2421 void
2422 vn_free(vnode_t *vp)
2423 {
2424 ASSERT(vp->v_shrlocks == NULL);
2425 ASSERT(vp->v_filocks == NULL);
2426
2427 /*
2428 * Some file systems call vn_free() with v_count of zero,
2429 * some with v_count of 1. In any case, the value should
2430 * never be anything else.
2431 */
2432 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2433 ASSERT(vp->v_count_dnlc == 0);
2434 if (vp->v_path != NULL) {
2435 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2436 vp->v_path = NULL;
2437 }
2438
2439 /* If FEM was in use, make sure everything gets cleaned up */
2440 if (vp->v_femhead) {
2441 /* XXX - There should be a free_femhead() that does all this */
2442 ASSERT(vp->v_femhead->femh_list == NULL);
2443 mutex_destroy(&vp->v_femhead->femh_lock);
2444 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2445 vp->v_femhead = NULL;
2446 }
2447
2448 if (vp->v_fopdata != NULL) {
2449 free_fopdata(vp);
2450 }
2451 vp->v_mpssdata = NULL;
2452 vsd_free(vp);
2453 kmem_cache_free(vn_cache, vp);
2454 }
2455
2456 /*
 * Vnode status change notifications; these should define better states
 * than 1 and 0.
2458 */
2459 void
2460 vn_reclaim(vnode_t *vp)
2461 {
2462 vfs_t *vfsp = vp->v_vfsp;
2463
2464 if (vfsp == NULL ||
2465 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2466 return;
2467 }
2468 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2469 }
2470
2471 void
2472 vn_idle(vnode_t *vp)
2473 {
2474 vfs_t *vfsp = vp->v_vfsp;
2475
2476 if (vfsp == NULL ||
2477 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2478 return;
2479 }
2480 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
}

void
2483 vn_exists(vnode_t *vp)
2484 {
2485 vfs_t *vfsp = vp->v_vfsp;
2486
2487 if (vfsp == NULL ||
2488 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2489 return;
2490 }
2491 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2492 }
2493
2494 void
2495 vn_invalid(vnode_t *vp)
2496 {
2497 vfs_t *vfsp = vp->v_vfsp;
2498
2499 if (vfsp == NULL ||
2500 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2501 return;
2502 }
2503 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2504 }
2505
2506 /* Vnode event notification */
2507
2508 int
2509 vnevent_support(vnode_t *vp, caller_context_t *ct)
2510 {
2511 if (vp == NULL)
2512 return (EINVAL);
2513
2514 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2515 }
2516
2517 void
2518 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2519 {
2520 if (vp == NULL || vp->v_femhead == NULL) {
2521 return;
2522 }
2523 (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2524 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2525 }
2526
2527 void
2528 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2529 caller_context_t *ct)
2530 {
2531 if (vp == NULL || vp->v_femhead == NULL) {
2532 return;
2533 }
2534 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2535 }
2536
2537 void
2538 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2539 caller_context_t *ct)
2540 {
2541 if (vp == NULL || vp->v_femhead == NULL) {
2542 return;
2543 }
2544 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2545 }
2546
2547 void
2548 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2549 {
2550 if (vp == NULL || vp->v_femhead == NULL) {
2551 return;
2552 }
2553 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2554 }
2555
2556 void
2557 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2558 {
2559 if (vp == NULL || vp->v_femhead == NULL) {
2560 return;
2561 }
2562 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2563 }
2564
2565 void
2566 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2567 caller_context_t *ct)
2568 {
2569 if (vp == NULL || vp->v_femhead == NULL) {
2570 return;
2571 }
2572 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2573 }
2574
2575 void
2576 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2577 caller_context_t *ct)
2578 {
2579 if (vp == NULL || vp->v_femhead == NULL) {
2580 return;
2581 }
2582 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2583 }
2584
2585 void
2586 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2587 caller_context_t *ct)
2588 {
2589 if (vp == NULL || vp->v_femhead == NULL) {
2590 return;
2591 }
2592 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2593 }
2594
2595 void
2596 vnevent_create(vnode_t *vp, caller_context_t *ct)
2597 {
2598 if (vp == NULL || vp->v_femhead == NULL) {
2599 return;
2600 }
2601 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2602 }
2603
2604 void
2605 vnevent_link(vnode_t *vp, caller_context_t *ct)
2606 {
2607 if (vp == NULL || vp->v_femhead == NULL) {
2608 return;
2609 }
2610 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2611 }
2612
2613 void
2614 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2615 {
2616 if (vp == NULL || vp->v_femhead == NULL) {
2617 return;
2618 }
2619 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2620 }
2621
2622 void
2623 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2624 {
2625 if (vp == NULL || vp->v_femhead == NULL) {
2626 return;
2627 }
2628 (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2629 }
2630
2631 void
2632 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2633 {
2634 if (vp == NULL || vp->v_femhead == NULL) {
2635 return;
2636 }
2637 (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2638 }
2639
2640 /*
2641 * Vnode accessors.
2642 */
2643
2644 int
2645 vn_is_readonly(vnode_t *vp)
2646 {
2647 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2648 }
2649
2650 int
2651 vn_has_flocks(vnode_t *vp)
2652 {
2653 return (vp->v_filocks != NULL);
2654 }
2655
2656 int
2657 vn_has_mandatory_locks(vnode_t *vp, int mode)
2658 {
2659 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2660 }
2661
2662 int
2663 vn_has_cached_data(vnode_t *vp)
2664 {
2665 return (vp->v_pages != NULL);
2666 }
2667
2668 /*
2669 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2670 * zone_enter(2).
2671 */
2672 int
2673 vn_can_change_zones(vnode_t *vp)
2674 {
2675 struct vfssw *vswp;
2676 int allow = 1;
2677 vnode_t *rvp;
2678
2679 if (nfs_global_client_only != 0)
2680 return (1);
2681
2682 /*
2683 * We always want to look at the underlying vnode if there is one.
2684 */
2685 if (VOP_REALVP(vp, &rvp, NULL) != 0)
2686 rvp = vp;
2687 /*
2688 * Some pseudo filesystems (including doorfs) don't actually register
2689 * their vfsops_t, so the following may return NULL; we happily let
2690 * such vnodes switch zones.
2691 */
2692 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2693 if (vswp != NULL) {
2694 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2695 allow = 0;
2696 vfs_unrefvfssw(vswp);
2697 }
2698 return (allow);
2699 }
2700
2701 /*
2702 * Return nonzero if the vnode is a mount point, zero if not.
2703 */
2704 int
2705 vn_ismntpt(vnode_t *vp)
2706 {
2707 return (vp->v_vfsmountedhere != NULL);
2708 }
2709
2710 /* Retrieve the vfs (if any) mounted on this vnode */
2711 vfs_t *
2712 vn_mountedvfs(vnode_t *vp)
2713 {
2714 return (vp->v_vfsmountedhere);
2715 }
2716
2717 /*
2718 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2719 */
2720 int
2721 vn_in_dnlc(vnode_t *vp)
2722 {
2723 return (vp->v_count_dnlc > 0);
2724 }
2725
2726 /*
2727 * vn_has_other_opens() checks whether a particular file is opened by more than
2728 * just the caller and whether the open is for read and/or write.
 * This routine is intended to be called after the caller has already called
 * VOP_OPEN() and wishes to know whether it is the only one with the file
 * open for the mode(s) specified.
2732 *
2733 * Vnode counts are only kept on regular files (v_type=VREG).
2734 */
2735 int
2736 vn_has_other_opens(
2737 vnode_t *vp,
2738 v_mode_t mode)
2739 {
2740
2741 ASSERT(vp != NULL);
2742
2743 switch (mode) {
2744 case V_WRITE:
2745 if (vp->v_wrcnt > 1)
2746 return (V_TRUE);
2747 break;
2748 case V_RDORWR:
2749 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2750 return (V_TRUE);
2751 break;
2752 case V_RDANDWR:
2753 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2754 return (V_TRUE);
2755 break;
2756 case V_READ:
2757 if (vp->v_rdcnt > 1)
2758 return (V_TRUE);
2759 break;
2760 }
2761
2762 return (V_FALSE);
2763 }
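
/*
 * Sketch of a typical check (hypothetical caller): after its own
 * VOP_OPEN() for writing, a caller can verify that it holds the only
 * write-mode open before relying on exclusivity:
 *
 *	if (vn_has_other_opens(vp, V_WRITE) == V_FALSE) {
 *		... no other writers currently have vp open ...
 *	}
 */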
2764
2765 /*
2766 * vn_is_opened() checks whether a particular file is opened and
2767 * whether the open is for read and/or write.
2768 *
2769 * Vnode counts are only kept on regular files (v_type=VREG).
2770 */
2771 int
2772 vn_is_opened(
2773 vnode_t *vp,
2774 v_mode_t mode)
2775 {
2776
2777 ASSERT(vp != NULL);
2778
2779 switch (mode) {
2780 case V_WRITE:
2781 if (vp->v_wrcnt)
2782 return (V_TRUE);
2783 break;
2784 case V_RDANDWR:
2785 if (vp->v_rdcnt && vp->v_wrcnt)
2786 return (V_TRUE);
2787 break;
2788 case V_RDORWR:
2789 if (vp->v_rdcnt || vp->v_wrcnt)
2790 return (V_TRUE);
2791 break;
2792 case V_READ:
2793 if (vp->v_rdcnt)
2794 return (V_TRUE);
2795 break;
2796 }
2797
2798 return (V_FALSE);
2799 }
2800
2801 /*
2802 * vn_is_mapped() checks whether a particular file is mapped and whether
2803 * the file is mapped read and/or write.
2804 */
2805 int
2806 vn_is_mapped(
2807 vnode_t *vp,
2808 v_mode_t mode)
2809 {
2810
2811 ASSERT(vp != NULL);
2812
2813 #if !defined(_LP64)
2814 switch (mode) {
2815 /*
2816 * The atomic_add_64_nv functions force atomicity in the
2817 * case of 32 bit architectures. Otherwise the 64 bit values
2818 * require two fetches. The value of the fields may be
2819 * (potentially) changed between the first fetch and the
	 * second.
2821 */
2822 case V_WRITE:
2823 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2824 return (V_TRUE);
2825 break;
2826 case V_RDANDWR:
2827 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2828 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2829 return (V_TRUE);
2830 break;
2831 case V_RDORWR:
2832 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2833 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2834 return (V_TRUE);
2835 break;
2836 case V_READ:
2837 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2838 return (V_TRUE);
2839 break;
2840 }
2841 #else
2842 switch (mode) {
2843 case V_WRITE:
2844 if (vp->v_mmap_write)
2845 return (V_TRUE);
2846 break;
2847 case V_RDANDWR:
2848 if (vp->v_mmap_read && vp->v_mmap_write)
2849 return (V_TRUE);
2850 break;
2851 case V_RDORWR:
2852 if (vp->v_mmap_read || vp->v_mmap_write)
2853 return (V_TRUE);
2854 break;
2855 case V_READ:
2856 if (vp->v_mmap_read)
2857 return (V_TRUE);
2858 break;
2859 }
2860 #endif
2861
2862 return (V_FALSE);
2863 }
2864
2865 /*
2866 * Set the operations vector for a vnode.
2867 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed. This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since before we checked
 * the v_femhead pointer, then our update is safe: we are not racing
 * with FEM.
2873 */
2874 void
2875 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2876 {
2877 vnodeops_t *op;
2878
2879 ASSERT(vp != NULL);
2880 ASSERT(vnodeops != NULL);
2881
2882 op = vp->v_op;
2883 membar_consumer();
2884 /*
2885 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2886 * the compare-and-swap on vp->v_op. If either fails, then FEM is
2887 * in effect on the vnode and we need to have FEM deal with it.
2888 */
2889 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2890 op) {
2891 fem_setvnops(vp, vnodeops);
2892 }
2893 }
2894
2895 /*
2896 * Retrieve the operations vector for a vnode
 * As with vn_setops() above, make sure we aren't racing with FEM.
2898 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2899 * make sense to the callers of this routine.
2900 */
2901 vnodeops_t *
2902 vn_getops(vnode_t *vp)
2903 {
2904 vnodeops_t *op;
2905
2906 ASSERT(vp != NULL);
2907
2908 op = vp->v_op;
2909 membar_consumer();
2910 if (vp->v_femhead == NULL && op == vp->v_op) {
2911 return (op);
2912 } else {
2913 return (fem_getvnops(vp));
2914 }
2915 }
2916
2917 /*
2918 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2919 * Returns zero (0) if not.
2920 */
2921 int
2922 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2923 {
2924 return (vn_getops(vp) == vnodeops);
2925 }
2926
2927 /*
2928 * Returns non-zero (1) if the specified operation matches the
 * corresponding operation for the vnode.
2930 * Returns zero (0) if not.
2931 */
2932
2933 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
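/* The first-character test above is a cheap fast path before the strcmp(). */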
2934
2935 int
2936 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2937 {
2938 const fs_operation_trans_def_t *otdp;
2939 fs_generic_func_p *loc = NULL;
2940 vnodeops_t *vop = vn_getops(vp);
2941
2942 ASSERT(vopname != NULL);
2943
2944 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2945 if (MATCHNAME(otdp->name, vopname)) {
2946 loc = (fs_generic_func_p *)
2947 ((char *)(vop) + otdp->offset);
2948 break;
2949 }
2950 }
2951
2952 return ((loc != NULL) && (*loc == funcp));
2953 }
2954
2955 /*
2956 * fs_new_caller_id() needs to return a unique ID on a given local system.
2957 * The IDs do not need to survive across reboots. These are primarily
2958 * used so that (FEM) monitors can detect particular callers (such as
2959 * the NFS server) to a given vnode/vfs operation.
2960 */
2961 u_longlong_t
2962 fs_new_caller_id()
2963 {
2964 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2965
2966 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2967 }
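
/*
 * Sketch of the intended use (hypothetical subsystem): obtain an ID once,
 * then tag each operation's caller_context_t with it so FEM monitors can
 * recognize the caller:
 *
 *	static u_longlong_t my_caller_id;
 *	...
 *	my_caller_id = fs_new_caller_id();
 *	...
 *	caller_context_t ct = { 0 };
 *	ct.cc_caller_id = my_caller_id;
 *	(void) VOP_READ(vp, &uio, 0, cr, &ct);
 */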
2968
2969 /*
2970 * Given a starting vnode and a path, updates the path in the target vnode in
2971 * a safe manner. If the vnode already has path information embedded, then the
2972 * cached path is left untouched.
2973 */
2974
2975 size_t max_vnode_path = 4 * MAXPATHLEN;
2976
2977 void
2978 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2979 const char *path, size_t plen)
2980 {
2981 char *rpath;
2982 vnode_t *base;
2983 size_t rpathlen, rpathalloc;
2984 int doslash = 1;
2985
2986 if (*path == '/') {
2987 base = rootvp;
2988 path++;
2989 plen--;
2990 } else {
2991 base = startvp;
2992 }
2993
2994 /*
2995 * We cannot grab base->v_lock while we hold vp->v_lock because of
2996 * the potential for deadlock.
2997 */
2998 mutex_enter(&base->v_lock);
2999 if (base->v_path == NULL) {
3000 mutex_exit(&base->v_lock);
3001 return;
3002 }
3003
3004 rpathlen = strlen(base->v_path);
3005 rpathalloc = rpathlen + plen + 1;
3006 /* Avoid adding a slash if there's already one there */
3007 if (base->v_path[rpathlen-1] == '/')
3008 doslash = 0;
3009 else
3010 rpathalloc++;
3011
3012 /*
3013 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3014 * so we must do this dance. If, by chance, something changes the path,
3015 * just give up since there is no real harm.
3016 */
3017 mutex_exit(&base->v_lock);
3018
3019 /* Paths should stay within reason */
3020 if (rpathalloc > max_vnode_path)
3021 return;
3022
3023 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3024
3025 mutex_enter(&base->v_lock);
3026 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3027 mutex_exit(&base->v_lock);
3028 kmem_free(rpath, rpathalloc);
3029 return;
3030 }
3031 bcopy(base->v_path, rpath, rpathlen);
3032 mutex_exit(&base->v_lock);
3033
3034 if (doslash)
3035 rpath[rpathlen++] = '/';
3036 bcopy(path, rpath + rpathlen, plen);
3037 rpath[rpathlen + plen] = '\0';
3038
3039 mutex_enter(&vp->v_lock);
3040 if (vp->v_path != NULL) {
3041 mutex_exit(&vp->v_lock);
3042 kmem_free(rpath, rpathalloc);
3043 } else {
3044 vp->v_path = rpath;
3045 mutex_exit(&vp->v_lock);
3046 }
3047 }
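
/*
 * Example (illustrative values): if base->v_path is "/export/home" and the
 * new component is "file", the resulting vp->v_path is "/export/home/file";
 * the doslash logic avoids producing "/export/home//file" when the base
 * path already ends in '/'.
 */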
3048
3049 /*
3050 * Sets the path to the vnode to be the given string, regardless of current
3051 * context. The string must be a complete path from rootdir. This is only used
3052 * by fsop_root() for setting the path based on the mountpoint.
3053 */
3054 void
3055 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3056 {
3057 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3058
3059 mutex_enter(&vp->v_lock);
3060 if (vp->v_path != NULL) {
3061 mutex_exit(&vp->v_lock);
3062 kmem_free(buf, len + 1);
3063 return;
3064 }
3065
3066 vp->v_path = buf;
3067 bcopy(str, vp->v_path, len);
3068 vp->v_path[len] = '\0';
3069
3070 mutex_exit(&vp->v_lock);
3071 }
3072
3073 /*
3074 * Called from within filesystem's vop_rename() to handle renames once the
3075 * target vnode is available.
3076 */
3077 void
3078 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3079 {
3080 char *tmp;
3081
3082 mutex_enter(&vp->v_lock);
3083 tmp = vp->v_path;
3084 vp->v_path = NULL;
3085 mutex_exit(&vp->v_lock);
3086 vn_setpath(rootdir, dvp, vp, nm, len);
3087 if (tmp != NULL)
3088 kmem_free(tmp, strlen(tmp) + 1);
3089 }
3090
3091 /*
3092 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to be the same as the source vnode.
3094 */
3095 void
3096 vn_copypath(struct vnode *src, struct vnode *dst)
3097 {
3098 char *buf;
3099 int alloc;
3100
3101 mutex_enter(&src->v_lock);
3102 if (src->v_path == NULL) {
3103 mutex_exit(&src->v_lock);
3104 return;
3105 }
3106 alloc = strlen(src->v_path) + 1;
3107
3108 /* avoid kmem_alloc() with lock held */
3109 mutex_exit(&src->v_lock);
3110 buf = kmem_alloc(alloc, KM_SLEEP);
3111 mutex_enter(&src->v_lock);
3112 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3113 mutex_exit(&src->v_lock);
3114 kmem_free(buf, alloc);
3115 return;
3116 }
3117 bcopy(src->v_path, buf, alloc);
3118 mutex_exit(&src->v_lock);
3119
3120 mutex_enter(&dst->v_lock);
3121 if (dst->v_path != NULL) {
3122 mutex_exit(&dst->v_lock);
3123 kmem_free(buf, alloc);
3124 return;
3125 }
3126 dst->v_path = buf;
3127 mutex_exit(&dst->v_lock);
3128 }
3129
3130 /*
3131 * XXX Private interface for segvn routines that handle vnode
3132 * large page segments.
3133 *
3134 * return 1 if vp's file system VOP_PAGEIO() implementation
3135 * can be safely used instead of VOP_GETPAGE() for handling
3136 * pagefaults against regular non swap files. VOP_PAGEIO()
3137 * interface is considered safe here if its implementation
3138 * is very close to VOP_GETPAGE() implementation.
 * e.g. it zeroes out the part of the page beyond EOF, doesn't
 * panic if there are file holes but instead returns an error, and
 * doesn't assume the file won't be changed by user writes, etc.
3142 *
3143 * return 0 otherwise.
3144 *
3145 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3146 */
3147 int
3148 vn_vmpss_usepageio(vnode_t *vp)
3149 {
3150 vfs_t *vfsp = vp->v_vfsp;
3151 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3152 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3153 char **fsok = pageio_ok_fss;
3154
3155 if (fsname == NULL) {
3156 return (0);
3157 }
3158
3159 for (; *fsok; fsok++) {
3160 if (strcmp(*fsok, fsname) == 0) {
3161 return (1);
3162 }
3163 }
3164 return (0);
3165 }
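
/*
 * Simplified sketch of the segvn-side decision this enables (hypothetical
 * caller; arguments abridged): pick the I/O entry point once, then fault
 * with it:
 *
 *	int use_pageio = vn_vmpss_usepageio(vp);
 *	...
 *	err = use_pageio ?
 *	    VOP_PAGEIO(vp, pp, off, len, B_READ, cr, NULL) :
 *	    VOP_GETPAGE(vp, off, len, &prot, pl, plsz, seg, addr,
 *		S_READ, cr, NULL);
 */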
3166
3167 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3168
3169 int
3170 fop_open(
3171 vnode_t **vpp,
3172 int mode,
3173 cred_t *cr,
3174 caller_context_t *ct)
3175 {
3176 int ret;
3177 vnode_t *vp = *vpp;
3178
3179 VN_HOLD(vp);
3180 /*
3181 * Adding to the vnode counts before calling open
3182 * avoids the need for a mutex. It circumvents a race
3183 * condition where a query made on the vnode counts results in a
3184 * false negative. The inquirer goes away believing the file is
3185 * not open when there is an open on the file already under way.
3186 *
3187 * The counts are meant to prevent NFS from granting a delegation
3188 * when it would be dangerous to do so.
3189 *
3190 * The vnode counts are only kept on regular files
3191 */
3192 if ((*vpp)->v_type == VREG) {
3193 if (mode & FREAD)
3194 atomic_inc_32(&(*vpp)->v_rdcnt);
3195 if (mode & FWRITE)
3196 atomic_inc_32(&(*vpp)->v_wrcnt);
3197 }
3198
3199 VOPXID_MAP_CR(vp, cr);
3200
3201 ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3202
3203 if (ret) {
3204 /*
3205 * Use the saved vp just in case the vnode ptr got trashed
3206 * by the error.
3207 */
3208 VOPSTATS_UPDATE(vp, open);
3209 if ((vp->v_type == VREG) && (mode & FREAD))
3210 atomic_dec_32(&vp->v_rdcnt);
3211 if ((vp->v_type == VREG) && (mode & FWRITE))
3212 atomic_dec_32(&vp->v_wrcnt);
3213 } else {
3214 /*
3215 * Some filesystems will return a different vnode,
3216 * but the same path was still used to open it.
3217 * So if we do change the vnode and need to
3218 * copy over the path, do so here, rather than special
3219 * casing each filesystem. Adjust the vnode counts to
3220 * reflect the vnode switch.
3221 */
3222 VOPSTATS_UPDATE(*vpp, open);
3223 if (*vpp != vp && *vpp != NULL) {
3224 vn_copypath(vp, *vpp);
3225 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3226 atomic_inc_32(&(*vpp)->v_rdcnt);
3227 if ((vp->v_type == VREG) && (mode & FREAD))
3228 atomic_dec_32(&vp->v_rdcnt);
3229 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3230 atomic_inc_32(&(*vpp)->v_wrcnt);
3231 if ((vp->v_type == VREG) && (mode & FWRITE))
3232 atomic_dec_32(&vp->v_wrcnt);
3233 }
3234 }
3235 VN_RELE(vp);
3236 return (ret);
3237 }
3238
3239 int
3240 fop_close(
3241 vnode_t *vp,
3242 int flag,
3243 int count,
3244 offset_t offset,
3245 cred_t *cr,
3246 caller_context_t *ct)
3247 {
3248 int err;
3249
3250 VOPXID_MAP_CR(vp, cr);
3251
3252 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3253 VOPSTATS_UPDATE(vp, close);
3254 /*
3255 * Check passed in count to handle possible dups. Vnode counts are only
3256 * kept on regular files
3257 */
3258 if ((vp->v_type == VREG) && (count == 1)) {
3259 if (flag & FREAD) {
3260 ASSERT(vp->v_rdcnt > 0);
3261 atomic_dec_32(&vp->v_rdcnt);
3262 }
3263 if (flag & FWRITE) {
3264 ASSERT(vp->v_wrcnt > 0);
3265 atomic_dec_32(&vp->v_wrcnt);
3266 }
3267 }
3268 return (err);
3269 }
3270
3271 int
3272 fop_read(
3273 vnode_t *vp,
3274 uio_t *uiop,
3275 int ioflag,
3276 cred_t *cr,
3277 caller_context_t *ct)
3278 {
3279 ssize_t resid_start = uiop->uio_resid;
3280 zone_t *zonep = curzone;
3281 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3282
3283 hrtime_t start = 0, lat;
3284 ssize_t len;
3285 int err;
3286
3287 if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3288 vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3289 start = gethrtime();
3290
3291 mutex_enter(&zonep->zone_vfs_lock);
3292 kstat_runq_enter(&zonep->zone_vfs_rwstats);
3293 mutex_exit(&zonep->zone_vfs_lock);
3294 }
3295
3296 VOPXID_MAP_CR(vp, cr);
3297
3298 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3299 len = resid_start - uiop->uio_resid;
3300
3301 VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3302
3303 if (start != 0) {
3304 mutex_enter(&zonep->zone_vfs_lock);
3305 zonep->zone_vfs_rwstats.reads++;
3306 zonep->zone_vfs_rwstats.nread += len;
3307 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3308 mutex_exit(&zonep->zone_vfs_lock);
3309
3310 lat = gethrtime() - start;
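
		/*
		 * The latency buckets below are cumulative: an operation
		 * that takes one second or more is counted in the 10ms,
		 * 100ms, and 1s buckets alike.
		 */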
3311
3312 if (lat >= VOP_LATENCY_10MS) {
3313 if (lat < VOP_LATENCY_100MS)
3314 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3315 else if (lat < VOP_LATENCY_1S) {
3316 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3317 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3318 } else {
3319 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3320 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3321 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3322 }
3323 }
3324 }
3325
3326 return (err);
3327 }
3328
3329 int
3330 fop_write(
3331 vnode_t *vp,
3332 uio_t *uiop,
3333 int ioflag,
3334 cred_t *cr,
3335 caller_context_t *ct)
3336 {
3337 ssize_t resid_start = uiop->uio_resid;
3338 zone_t *zonep = curzone;
3339 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3340
3341 hrtime_t start = 0, lat;
3342 ssize_t len;
3343 int err;
3344
3345 /*
3346 * For the purposes of VFS kstat consumers, the "waitq" calculation is
3347 * repurposed as the active queue for VFS write operations. There's no
3348 * actual wait queue for VFS operations.
3349 */
3350 if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3351 vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3352 start = gethrtime();
3353
3354 mutex_enter(&zonep->zone_vfs_lock);
3355 kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3356 mutex_exit(&zonep->zone_vfs_lock);
3357 }
3358
3359 VOPXID_MAP_CR(vp, cr);
3360
3361 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3362 len = resid_start - uiop->uio_resid;
3363
3364 VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3365
3366 if (start != 0) {
3367 mutex_enter(&zonep->zone_vfs_lock);
3368 zonep->zone_vfs_rwstats.writes++;
3369 zonep->zone_vfs_rwstats.nwritten += len;
3370 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3371 mutex_exit(&zonep->zone_vfs_lock);
3372
3373 lat = gethrtime() - start;
3374
3375 if (lat >= VOP_LATENCY_10MS) {
3376 if (lat < VOP_LATENCY_100MS)
3377 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3378 else if (lat < VOP_LATENCY_1S) {
3379 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3380 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3381 } else {
3382 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3383 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3384 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3385 }
3386 }
3387 }
3388
3389 return (err);
3390 }
3391
3392 int
3393 fop_ioctl(
3394 vnode_t *vp,
3395 int cmd,
3396 intptr_t arg,
3397 int flag,
3398 cred_t *cr,
3399 int *rvalp,
3400 caller_context_t *ct)
3401 {
3402 int err;
3403
3404 VOPXID_MAP_CR(vp, cr);
3405
3406 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3407 VOPSTATS_UPDATE(vp, ioctl);
3408 return (err);
3409 }
3410
3411 int
3412 fop_setfl(
3413 vnode_t *vp,
3414 int oflags,
3415 int nflags,
3416 cred_t *cr,
3417 caller_context_t *ct)
3418 {
3419 int err;
3420
3421 VOPXID_MAP_CR(vp, cr);
3422
3423 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3424 VOPSTATS_UPDATE(vp, setfl);
3425 return (err);
3426 }
3427
3428 int
3429 fop_getattr(
3430 vnode_t *vp,
3431 vattr_t *vap,
3432 int flags,
3433 cred_t *cr,
3434 caller_context_t *ct)
3435 {
3436 int err;
3437
3438 VOPXID_MAP_CR(vp, cr);
3439
3440 /*
3441 * If this file system doesn't understand the xvattr extensions
3442 * then turn off the xvattr bit.
3443 */
3444 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3445 vap->va_mask &= ~AT_XVATTR;
3446 }
3447
3448 /*
3449 * We're only allowed to skip the ACL check iff we used a 32 bit
3450 * ACE mask with VOP_ACCESS() to determine permissions.
3451 */
3452 if ((flags & ATTR_NOACLCHECK) &&
3453 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3454 return (EINVAL);
3455 }
3456 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3457 VOPSTATS_UPDATE(vp, getattr);
3458 return (err);
3459 }
3460
3461 int
3462 fop_setattr(
3463 vnode_t *vp,
3464 vattr_t *vap,
3465 int flags,
3466 cred_t *cr,
3467 caller_context_t *ct)
3468 {
3469 int err;
3470
3471 VOPXID_MAP_CR(vp, cr);
3472
3473 /*
3474 * If this file system doesn't understand the xvattr extensions
3475 * then turn off the xvattr bit.
3476 */
3477 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3478 vap->va_mask &= ~AT_XVATTR;
3479 }
3480
3481 /*
3482 * We're only allowed to skip the ACL check iff we used a 32 bit
3483 * ACE mask with VOP_ACCESS() to determine permissions.
3484 */
3485 if ((flags & ATTR_NOACLCHECK) &&
3486 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3487 return (EINVAL);
3488 }
3489 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3490 VOPSTATS_UPDATE(vp, setattr);
3491 return (err);
3492 }
3493
3494 int
3495 fop_access(
3496 vnode_t *vp,
3497 int mode,
3498 int flags,
3499 cred_t *cr,
3500 caller_context_t *ct)
3501 {
3502 int err;
3503
3504 if ((flags & V_ACE_MASK) &&
3505 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3506 return (EINVAL);
3507 }
3508
3509 VOPXID_MAP_CR(vp, cr);
3510
3511 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3512 VOPSTATS_UPDATE(vp, access);
3513 return (err);
3514 }
3515
3516 int
3517 fop_lookup(
3518 vnode_t *dvp,
3519 char *nm,
3520 vnode_t **vpp,
3521 pathname_t *pnp,
3522 int flags,
3523 vnode_t *rdir,
3524 cred_t *cr,
3525 caller_context_t *ct,
3526 int *deflags, /* Returned per-dirent flags */
3527 pathname_t *ppnp) /* Returned case-preserved name in directory */
3528 {
3529 int ret;
3530
3531 /*
3532 * If this file system doesn't support case-insensitive access
3533 * and said access is requested, fail quickly. It is required
3534 * that if the vfs supports case-insensitive lookup, it also
3535 * supports extended dirent flags.
3536 */
3537 if (flags & FIGNORECASE &&
3538 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3539 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3540 return (EINVAL);
3541
3542 VOPXID_MAP_CR(dvp, cr);
3543
3544 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3545 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3546 } else {
3547 ret = (*(dvp)->v_op->vop_lookup)
3548 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3549 }
3550 if (ret == 0 && *vpp) {
3551 VOPSTATS_UPDATE(*vpp, lookup);
3552 if ((*vpp)->v_path == NULL) {
3553 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3554 }
3555 }
3556
3557 return (ret);
3558 }
3559
3560 int
3561 fop_create(
3562 vnode_t *dvp,
3563 char *name,
3564 vattr_t *vap,
3565 vcexcl_t excl,
3566 int mode,
3567 vnode_t **vpp,
3568 cred_t *cr,
3569 int flags,
3570 caller_context_t *ct,
3571 vsecattr_t *vsecp) /* ACL to set during create */
3572 {
3573 int ret;
3574
3575 if (vsecp != NULL &&
3576 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3577 return (EINVAL);
3578 }
3579 /*
3580 * If this file system doesn't support case-insensitive access
3581 * and said access is requested, fail quickly.
3582 */
3583 if (flags & FIGNORECASE &&
3584 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3585 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3586 return (EINVAL);
3587
3588 VOPXID_MAP_CR(dvp, cr);
3589
3590 ret = (*(dvp)->v_op->vop_create)
3591 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3592 if (ret == 0 && *vpp) {
3593 VOPSTATS_UPDATE(*vpp, create);
3594 if ((*vpp)->v_path == NULL) {
3595 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3596 }
3597 }
3598
3599 return (ret);
3600 }
3601
3602 int
3603 fop_remove(
3604 vnode_t *dvp,
3605 char *nm,
3606 cred_t *cr,
3607 caller_context_t *ct,
3608 int flags)
3609 {
3610 int err;
3611
3612 /*
3613 * If this file system doesn't support case-insensitive access
3614 * and said access is requested, fail quickly.
3615 */
3616 if (flags & FIGNORECASE &&
3617 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3618 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3619 return (EINVAL);
3620
3621 VOPXID_MAP_CR(dvp, cr);
3622
3623 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3624 VOPSTATS_UPDATE(dvp, remove);
3625 return (err);
3626 }
3627
3628 int
3629 fop_link(
3630 vnode_t *tdvp,
3631 vnode_t *svp,
3632 char *tnm,
3633 cred_t *cr,
3634 caller_context_t *ct,
3635 int flags)
3636 {
3637 int err;
3638
3639 /*
3640 * If the target file system doesn't support case-insensitive access
3641 * and said access is requested, fail quickly.
3642 */
3643 if (flags & FIGNORECASE &&
3644 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3645 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3646 return (EINVAL);
3647
3648 VOPXID_MAP_CR(tdvp, cr);
3649
3650 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3651 VOPSTATS_UPDATE(tdvp, link);
3652 return (err);
3653 }
3654
3655 int
3656 fop_rename(
3657 vnode_t *sdvp,
3658 char *snm,
3659 vnode_t *tdvp,
3660 char *tnm,
3661 cred_t *cr,
3662 caller_context_t *ct,
3663 int flags)
3664 {
3665 int err;
3666
3667 /*
3668 * If the file system involved does not support
3669 * case-insensitive access and said access is requested, fail
3670 * quickly.
3671 */
3672 if (flags & FIGNORECASE &&
3673 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3674 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3675 return (EINVAL);
3676
3677 VOPXID_MAP_CR(tdvp, cr);
3678
3679 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3680 VOPSTATS_UPDATE(sdvp, rename);
3681 return (err);
3682 }
3683
3684 int
3685 fop_mkdir(
3686 vnode_t *dvp,
3687 char *dirname,
3688 vattr_t *vap,
3689 vnode_t **vpp,
3690 cred_t *cr,
3691 caller_context_t *ct,
3692 int flags,
3693 vsecattr_t *vsecp) /* ACL to set during create */
3694 {
3695 int ret;
3696
3697 if (vsecp != NULL &&
3698 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3699 return (EINVAL);
3700 }
3701 /*
3702 * If this file system doesn't support case-insensitive access
3703 * and said access is requested, fail quickly.
3704 */
3705 if (flags & FIGNORECASE &&
3706 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3707 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3708 return (EINVAL);
3709
3710 VOPXID_MAP_CR(dvp, cr);
3711
3712 ret = (*(dvp)->v_op->vop_mkdir)
3713 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3714 if (ret == 0 && *vpp) {
3715 VOPSTATS_UPDATE(*vpp, mkdir);
3716 if ((*vpp)->v_path == NULL) {
3717 vn_setpath(rootdir, dvp, *vpp, dirname,
3718 strlen(dirname));
3719 }
3720 }
3721
3722 return (ret);
3723 }
3724
3725 int
3726 fop_rmdir(
3727 vnode_t *dvp,
3728 char *nm,
3729 vnode_t *cdir,
3730 cred_t *cr,
3731 caller_context_t *ct,
3732 int flags)
3733 {
3734 int err;
3735
3736 /*
3737 * If this file system doesn't support case-insensitive access
3738 * and said access is requested, fail quickly.
3739 */
3740 if (flags & FIGNORECASE &&
3741 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3742 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3743 return (EINVAL);
3744
3745 VOPXID_MAP_CR(dvp, cr);
3746
3747 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3748 VOPSTATS_UPDATE(dvp, rmdir);
3749 return (err);
3750 }
3751
3752 int
3753 fop_readdir(
3754 vnode_t *vp,
3755 uio_t *uiop,
3756 cred_t *cr,
3757 int *eofp,
3758 caller_context_t *ct,
3759 int flags)
3760 {
3761 int err;
3762 ssize_t resid_start = uiop->uio_resid;
3763
3764 /*
3765 * If this file system doesn't support retrieving directory
3766 * entry flags and said access is requested, fail quickly.
3767 */
3768 if (flags & V_RDDIR_ENTFLAGS &&
3769 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3770 return (EINVAL);
3771
3772 VOPXID_MAP_CR(vp, cr);
3773
3774 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3775 VOPSTATS_UPDATE_IO(vp, readdir,
3776 readdir_bytes, (resid_start - uiop->uio_resid));
3777 return (err);
3778 }
3779
3780 int
3781 fop_symlink(
3782 vnode_t *dvp,
3783 char *linkname,
3784 vattr_t *vap,
3785 char *target,
3786 cred_t *cr,
3787 caller_context_t *ct,
3788 int flags)
3789 {
3790 int err;
3791 xvattr_t xvattr;
3792
3793 /*
3794 * If this file system doesn't support case-insensitive access
3795 * and said access is requested, fail quickly.
3796 */
3797 if (flags & FIGNORECASE &&
3798 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3799 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3800 return (EINVAL);
3801
3802 VOPXID_MAP_CR(dvp, cr);
3803
3804 /* check for reparse point */
3805 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3806 (strncmp(target, FS_REPARSE_TAG_STR,
3807 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3808 if (!fs_reparse_mark(target, vap, &xvattr))
3809 vap = (vattr_t *)&xvattr;
3810 }
3811
3812 err = (*(dvp)->v_op->vop_symlink)
3813 (dvp, linkname, vap, target, cr, ct, flags);
3814 VOPSTATS_UPDATE(dvp, symlink);
3815 return (err);
3816 }
3817
3818 int
3819 fop_readlink(
3820 vnode_t *vp,
3821 uio_t *uiop,
3822 cred_t *cr,
3823 caller_context_t *ct)
3824 {
3825 int err;
3826
3827 VOPXID_MAP_CR(vp, cr);
3828
3829 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3830 VOPSTATS_UPDATE(vp, readlink);
3831 return (err);
3832 }
3833
3834 int
3835 fop_fsync(
3836 vnode_t *vp,
3837 int syncflag,
3838 cred_t *cr,
3839 caller_context_t *ct)
3840 {
3841 int err;
3842
3843 VOPXID_MAP_CR(vp, cr);
3844
3845 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3846 VOPSTATS_UPDATE(vp, fsync);
3847 return (err);
3848 }
3849
3850 void
3851 fop_inactive(
3852 vnode_t *vp,
3853 cred_t *cr,
3854 caller_context_t *ct)
3855 {
3856 /* Need to update stats before vop call since we may lose the vnode */
3857 VOPSTATS_UPDATE(vp, inactive);
3858
3859 VOPXID_MAP_CR(vp, cr);
3860
3861 (*(vp)->v_op->vop_inactive)(vp, cr, ct);
3862 }
3863
3864 int
3865 fop_fid(
3866 vnode_t *vp,
3867 fid_t *fidp,
3868 caller_context_t *ct)
3869 {
3870 int err;
3871
3872 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3873 VOPSTATS_UPDATE(vp, fid);
3874 return (err);
3875 }
3876
3877 int
3878 fop_rwlock(
3879 vnode_t *vp,
3880 int write_lock,
3881 caller_context_t *ct)
3882 {
3883 int ret;
3884
3885 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3886 VOPSTATS_UPDATE(vp, rwlock);
3887 return (ret);
3888 }
3889
3890 void
3891 fop_rwunlock(
3892 vnode_t *vp,
3893 int write_lock,
3894 caller_context_t *ct)
3895 {
3896 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3897 VOPSTATS_UPDATE(vp, rwunlock);
3898 }
3899
3900 int
3901 fop_seek(
3902 vnode_t *vp,
3903 offset_t ooff,
3904 offset_t *noffp,
3905 caller_context_t *ct)
3906 {
3907 int err;
3908
3909 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3910 VOPSTATS_UPDATE(vp, seek);
3911 return (err);
3912 }
3913
3914 int
3915 fop_cmp(
3916 vnode_t *vp1,
3917 vnode_t *vp2,
3918 caller_context_t *ct)
3919 {
3920 int err;
3921
3922 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3923 VOPSTATS_UPDATE(vp1, cmp);
3924 return (err);
3925 }
3926
3927 int
3928 fop_frlock(
3929 vnode_t *vp,
3930 int cmd,
3931 flock64_t *bfp,
3932 int flag,
3933 offset_t offset,
3934 struct flk_callback *flk_cbp,
3935 cred_t *cr,
3936 caller_context_t *ct)
3937 {
3938 int err;
3939
3940 VOPXID_MAP_CR(vp, cr);
3941
3942 err = (*(vp)->v_op->vop_frlock)
3943 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3944 VOPSTATS_UPDATE(vp, frlock);
3945 return (err);
3946 }
3947
3948 int
3949 fop_space(
3950 vnode_t *vp,
3951 int cmd,
3952 flock64_t *bfp,
3953 int flag,
3954 offset_t offset,
3955 cred_t *cr,
3956 caller_context_t *ct)
3957 {
3958 int err;
3959
3960 VOPXID_MAP_CR(vp, cr);
3961
3962 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3963 VOPSTATS_UPDATE(vp, space);
3964 return (err);
3965 }
3966
3967 int
3968 fop_realvp(
3969 vnode_t *vp,
3970 vnode_t **vpp,
3971 caller_context_t *ct)
3972 {
3973 int err;
3974
3975 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3976 VOPSTATS_UPDATE(vp, realvp);
3977 return (err);
3978 }
3979
3980 int
3981 fop_getpage(
3982 vnode_t *vp,
3983 offset_t off,
3984 size_t len,
3985 uint_t *protp,
3986 page_t **plarr,
3987 size_t plsz,
3988 struct seg *seg,
3989 caddr_t addr,
3990 enum seg_rw rw,
3991 cred_t *cr,
3992 caller_context_t *ct)
3993 {
3994 int err;
3995
3996 VOPXID_MAP_CR(vp, cr);
3997
3998 err = (*(vp)->v_op->vop_getpage)
3999 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4000 VOPSTATS_UPDATE(vp, getpage);
4001 return (err);
4002 }
4003
4004 int
4005 fop_putpage(
4006 vnode_t *vp,
4007 offset_t off,
4008 size_t len,
4009 int flags,
4010 cred_t *cr,
4011 caller_context_t *ct)
4012 {
4013 int err;
4014
4015 VOPXID_MAP_CR(vp, cr);
4016
4017 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4018 VOPSTATS_UPDATE(vp, putpage);
4019 return (err);
4020 }
4021
4022 int
4023 fop_map(
4024 vnode_t *vp,
4025 offset_t off,
4026 struct as *as,
4027 caddr_t *addrp,
4028 size_t len,
4029 uchar_t prot,
4030 uchar_t maxprot,
4031 uint_t flags,
4032 cred_t *cr,
4033 caller_context_t *ct)
4034 {
4035 int err;
4036
4037 VOPXID_MAP_CR(vp, cr);
4038
4039 err = (*(vp)->v_op->vop_map)
4040 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4041 VOPSTATS_UPDATE(vp, map);
4042 return (err);
4043 }
4044
4045 int
4046 fop_addmap(
4047 vnode_t *vp,
4048 offset_t off,
4049 struct as *as,
4050 caddr_t addr,
4051 size_t len,
4052 uchar_t prot,
4053 uchar_t maxprot,
4054 uint_t flags,
4055 cred_t *cr,
4056 caller_context_t *ct)
4057 {
4058 int error;
4059 u_longlong_t delta;
4060
4061 VOPXID_MAP_CR(vp, cr);
4062
4063 error = (*(vp)->v_op->vop_addmap)
4064 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4065
4066 if ((!error) && (vp->v_type == VREG)) {
4067 delta = (u_longlong_t)btopr(len);
4068 /*
4069 * If file is declared MAP_PRIVATE, it can't be written back
4070 * even if open for write. Handle as read.
4071 */
4072 if (flags & MAP_PRIVATE) {
4073 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4074 (int64_t)delta);
4075 } else {
4076 /*
4077 * atomic_add_64 forces the fetch of a 64 bit value to
4078 * be atomic on 32 bit machines
4079 */
4080 if (maxprot & PROT_WRITE)
4081 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4082 (int64_t)delta);
4083 if (maxprot & PROT_READ)
4084 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4085 (int64_t)delta);
4086 if (maxprot & PROT_EXEC)
4087 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4088 (int64_t)delta);
4089 }
4090 }
4091 VOPSTATS_UPDATE(vp, addmap);
4092 return (error);
4093 }
4094
4095 int
4096 fop_delmap(
4097 vnode_t *vp,
4098 offset_t off,
4099 struct as *as,
4100 caddr_t addr,
4101 size_t len,
4102 uint_t prot,
4103 uint_t maxprot,
4104 uint_t flags,
4105 cred_t *cr,
4106 caller_context_t *ct)
4107 {
4108 int error;
4109 u_longlong_t delta;
4110
4111 VOPXID_MAP_CR(vp, cr);
4112
4113 error = (*(vp)->v_op->vop_delmap)
4114 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4115
4116 /*
	 * NFS calls into delmap twice: the first time it simply
	 * establishes a callback mechanism and returns EAGAIN,
	 * while the real work is done upon the second invocation.
4120 * We have to detect this here and only decrement the counts upon
4121 * the second delmap request.
4122 */
4123 if ((error != EAGAIN) && (vp->v_type == VREG)) {
4124
4125 delta = (u_longlong_t)btopr(len);
4126
4127 if (flags & MAP_PRIVATE) {
4128 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4129 (int64_t)(-delta));
4130 } else {
4131 /*
4132 * atomic_add_64 forces the fetch of a 64 bit value
4133 * to be atomic on 32 bit machines
4134 */
4135 if (maxprot & PROT_WRITE)
4136 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4137 (int64_t)(-delta));
4138 if (maxprot & PROT_READ)
4139 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4140 (int64_t)(-delta));
4141 if (maxprot & PROT_EXEC)
4142 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4143 (int64_t)(-delta));
4144 }
4145 }
4146 VOPSTATS_UPDATE(vp, delmap);
4147 return (error);
}

int
4152 fop_poll(
4153 vnode_t *vp,
4154 short events,
4155 int anyyet,
4156 short *reventsp,
4157 struct pollhead **phpp,
4158 caller_context_t *ct)
4159 {
4160 int err;
4161
4162 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4163 VOPSTATS_UPDATE(vp, poll);
4164 return (err);
4165 }
4166
4167 int
4168 fop_dump(
4169 vnode_t *vp,
4170 caddr_t addr,
4171 offset_t lbdn,
4172 offset_t dblks,
4173 caller_context_t *ct)
4174 {
4175 int err;
4176
4177 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4178 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4179 return (EIO);
4180
4181 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4182 VOPSTATS_UPDATE(vp, dump);
4183 return (err);
4184 }
4185
4186 int
4187 fop_pathconf(
4188 vnode_t *vp,
4189 int cmd,
4190 ulong_t *valp,
4191 cred_t *cr,
4192 caller_context_t *ct)
4193 {
4194 int err;
4195
4196 VOPXID_MAP_CR(vp, cr);
4197
4198 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4199 VOPSTATS_UPDATE(vp, pathconf);
4200 return (err);
4201 }
4202
4203 int
4204 fop_pageio(
4205 vnode_t *vp,
4206 struct page *pp,
4207 u_offset_t io_off,
4208 size_t io_len,
4209 int flags,
4210 cred_t *cr,
4211 caller_context_t *ct)
4212 {
4213 int err;
4214
4215 VOPXID_MAP_CR(vp, cr);
4216
4217 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4218 VOPSTATS_UPDATE(vp, pageio);
4219 return (err);
4220 }
4221
4222 int
4223 fop_dumpctl(
4224 vnode_t *vp,
4225 int action,
4226 offset_t *blkp,
4227 caller_context_t *ct)
4228 {
4229 int err;
4230 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4231 VOPSTATS_UPDATE(vp, dumpctl);
4232 return (err);
4233 }
4234
4235 void
4236 fop_dispose(
4237 vnode_t *vp,
4238 page_t *pp,
4239 int flag,
4240 int dn,
4241 cred_t *cr,
4242 caller_context_t *ct)
4243 {
4244 /* Must do stats first since it's possible to lose the vnode */
4245 VOPSTATS_UPDATE(vp, dispose);
4246
4247 VOPXID_MAP_CR(vp, cr);
4248
4249 (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4250 }
4251
4252 int
4253 fop_setsecattr(
4254 vnode_t *vp,
4255 vsecattr_t *vsap,
4256 int flag,
4257 cred_t *cr,
4258 caller_context_t *ct)
4259 {
4260 int err;
4261
4262 VOPXID_MAP_CR(vp, cr);
4263
	/*
	 * We may skip the ACL check only if a 32-bit ACE mask was used
	 * with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr)(vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	/*
	 * We may skip the ACL check only if a 32-bit ACE mask was used
	 * with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr)(vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}
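
/*
 * Sketch of the caller-side contract enforced by the two checks above:
 * ATTR_NOACLCHECK is legal only when access has already been verified
 * with a 32-bit ACE mask (VOP_ACCESS() with V_ACE_MASK) on a file
 * system that advertises VFSFT_ACEMASKONACCESS.  The surrounding code
 * is hypothetical:
 *
 *	if (vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) &&
 *	    VOP_ACCESS(vp, mode, V_ACE_MASK, cr, ct) == 0)
 *		error = VOP_SETSECATTR(vp, vsap, ATTR_NOACLCHECK, cr, ct);
 *	else
 *		error = VOP_SETSECATTR(vp, vsap, 0, cr, ct);
 */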

int
fop_shrlock(
	vnode_t *vp,
	int cmd,
	struct shrlock *shr,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int err;

	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}

int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}
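
/*
 * Sketch of the zero-copy buffer life cycle these two wrappers gate,
 * assuming a file system that advertises VFSFT_ZEROCOPY_SUPPORTED
 * (setup of the xuio and error handling elided):
 *
 *	xuio.xu_type = UIOTYPE_ZEROCOPY;
 *	if (VOP_REQZCBUF(vp, UIO_WRITE, &xuio, cr, NULL) == 0) {
 *		... fill the loaned buffers described by the xuio ...
 *		(void) VOP_RETZCBUF(vp, &xuio, cr, NULL);
 *	}
 */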

/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}

/*
 * Create a key (index into per vnode array)
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int i;
	uint_t nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
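
/*
 * Typical key life cycle, as a sketch (mymod_vsd_key and the destructor
 * are hypothetical).  vsd_create() is idempotent for an already-allocated
 * key, so it is safe to call from a module's init path:
 *
 *	static uint_t mymod_vsd_key;
 *
 *	vsd_create(&mymod_vsd_key, mymod_vsd_destructor);
 *	...
 *	vsd_destroy(&mymod_vsd_key);		(at module teardown)
 */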

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}

/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys is 0, so the fast path
	 * below is skipped and we fall through to allocate space for the
	 * vs_value array.
	 * If the caller is replacing one value with another, it is up to
	 * the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
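
/*
 * Sketch of the locking contract for the accessors above: callers hold
 * v_vsd_lock across vsd_get()/vsd_set().  mymod_vsd_key and the payload
 * type are hypothetical:
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	data = vsd_get(vp, mymod_vsd_key);
 *	if (data == NULL) {
 *		data = kmem_zalloc(sizeof (*data), KM_SLEEP);
 *		(void) vsd_set(vp, mymod_vsd_key, data);
 *	}
 *	mutex_exit(&vp->v_vsd_lock);
 */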

/*
 * Called from vn_free() to run the destructor function for each vsd
 * Locks out vsd_create and vsd_destroy
 * Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * Grow-only realloc: allocate a zeroed buffer of the new size, copy in
 * the old contents, and free the old buffer.
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}

/*
 * Set up the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for the proper format of a
 * reparse string, and a check is also made to ensure the symlink data
 * does not point to an existing file.
 *
 * Returns 0 if ok, else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= AT_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}
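
/*
 * Sketch of how a symlink-create path might use fs_reparse_mark() to
 * request the reparse attribute; the surrounding variables (dvp, name,
 * va, target, cr) are hypothetical:
 *
 *	xvattr_t xvattr;
 *
 *	if (fs_reparse_mark(target, &va, &xvattr) != 0)
 *		return (EINVAL);
 *	error = VOP_SYMLINK(dvp, name, &xvattr.xva_vattr, target, cr,
 *	    NULL, 0);
 */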

/*
 * Check whether a symlink is a reparse point.
 * Returns B_TRUE if it is a reparse point, else B_FALSE.
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}
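
/*
 * Sketch of a typical use of vn_is_reparse(): a lookup path that wants
 * to treat reparse-point symlinks specially before following them (the
 * surrounding logic is hypothetical):
 *
 *	if (vp->v_type == VLNK && vn_is_reparse(vp, cr, NULL)) {
 *		... divert the link to reparse-point handling ...
 *	}
 */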