1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2020 Joyent, Inc.
  25  * Copyright 2022 Spencer Evans-Cole.
  26  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  28  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  29  */
  30 
  31 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  32 /*        All Rights Reserved   */
  33 
  34 /*
  35  * University Copyright- Copyright (c) 1982, 1986, 1988
  36  * The Regents of the University of California
  37  * All Rights Reserved
  38  *
  39  * University Acknowledgment- Portions of this document are derived from
  40  * software developed by the University of California, Berkeley, and its
  41  * contributors.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/param.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/errno.h>
  48 #include <sys/cred.h>
  49 #include <sys/user.h>
  50 #include <sys/uio.h>
  51 #include <sys/file.h>
  52 #include <sys/pathname.h>
  53 #include <sys/vfs.h>
  54 #include <sys/vfs_opreg.h>
  55 #include <sys/vnode.h>
  56 #include <sys/filio.h>
  57 #include <sys/rwstlock.h>
  58 #include <sys/fem.h>
  59 #include <sys/stat.h>
  60 #include <sys/mode.h>
  61 #include <sys/conf.h>
  62 #include <sys/sysmacros.h>
  63 #include <sys/cmn_err.h>
  64 #include <sys/systm.h>
  65 #include <sys/kmem.h>
  66 #include <sys/debug.h>
  67 #include <c2/audit.h>
  68 #include <sys/acl.h>
  69 #include <sys/nbmlock.h>
  70 #include <sys/fcntl.h>
  71 #include <fs/fs_subr.h>
  72 #include <sys/taskq.h>
  73 #include <fs/fs_reparse.h>
  74 #include <sys/time.h>
  75 #include <sys/sdt.h>
  76 
/*
 * Determine if this vnode is a file that is read-only: i.e. not a
 * character/block device or FIFO, and residing on a filesystem that
 * vn_is_readonly() reports as read-only.
 */
#define ISROFILE(vp)    \
        ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
            (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t      vskstat_tree;
kmutex_t        vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";
 114 
 115 /*
 116  * forward declarations for internal vnode specific data (vsd)
 117  */
 118 static void *vsd_realloc(void *, size_t, size_t);
 119 
 120 /*
 121  * forward declarations for reparse point functions
 122  */
 123 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 124 
 125 /*
 126  * VSD -- VNODE SPECIFIC DATA
 127  * The v_data pointer is typically used by a file system to store a
 128  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 129  * However, there are times when additional project private data needs
 130  * to be stored separately from the data (node) pointed to by v_data.
 131  * This additional data could be stored by the file system itself or
 132  * by a completely different kernel entity.  VSD provides a way for
 133  * callers to obtain a key and store a pointer to private data associated
 134  * with a vnode.
 135  *
 136  * Callers are responsible for protecting the vsd by holding v_vsd_lock
 137  * for calls to vsd_set() and vsd_get().
 138  */
 139 
 140 /*
 141  * vsd_lock protects:
 142  *   vsd_nkeys - creation and deletion of vsd keys
 143  *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 144  *   vsd_destructor - adding and removing destructors to the list
 145  */
 146 static kmutex_t         vsd_lock;
 147 static uint_t           vsd_nkeys;       /* size of destructor array */
 148 /* list of vsd_node's */
 149 static list_t *vsd_list = NULL;
 150 /* per-key destructor funcs */
 151 static void             (**vsd_destructor)(void *);
 152 
 153 /*
 154  * The following is the common set of actions needed to update the
 155  * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 156  * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 157  * recording of the bytes transferred.  Since the code is similar
 158  * but small, it is nearly a duplicate.  Consequently any changes
 159  * to one may need to be reflected in the other.
 160  * Rundown of the variables:
 161  * vp - Pointer to the vnode
 162  * counter - Partial name structure member to update in vopstats for counts
 163  * bytecounter - Partial name structure member to update in vopstats for bytes
 164  * bytesval - Value to update in vopstats for bytes
 165  * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 166  * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 167  */
 168 
/* Bump the named per-vfs (and, if present, per-fstype) operation counter. */
#define VOPSTATS_UPDATE(vp, counter) {                                  \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp && vfsp->vfs_implp &&                                       \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
                (*stataddr)++;                                          \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
                        vsp->n##counter.value.ui64++;                        \
                }                                                       \
        }                                                               \
}

/*
 * As VOPSTATS_UPDATE, but also accumulates the number of bytes
 * transferred (bytesval) into the named byte counter.
 */
#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp && vfsp->vfs_implp &&                                       \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
                (*stataddr)++;                                          \
                vsp->bytecounter.value.ui64 += bytesval;             \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
                        vsp->n##counter.value.ui64++;                        \
                        vsp->bytecounter.value.ui64 += bytesval;     \
                }                                                       \
        }                                                               \
}

/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define VOPXID_MAP_CR(vp, cr)   {                                       \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
                cr = crgetmapped(cr);                                   \
        }

/* VOP latency bucket thresholds, in nanoseconds */
#define VOP_LATENCY_10MS        10000000
#define VOP_LATENCY_100MS       100000000
#define VOP_LATENCY_1S          1000000000
 216 
 217 /*
 218  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 219  * numerical order of S_IFMT and vnode types.)
 220  */
 221 enum vtype iftovt_tab[] = {
 222         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 223         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 224 };
 225 
 226 ushort_t vttoif_tab[] = {
 227         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 228         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 229 };
 230 
 231 /*
 232  * The system vnode cache.
 233  */
 234 
 235 kmem_cache_t *vn_cache;
 236 
 237 
 238 /*
 239  * Vnode operations vector.
 240  */
 241 
 242 static const fs_operation_trans_def_t vn_ops_table[] = {
 243         VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
 244             fs_nosys, fs_nosys,
 245 
 246         VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
 247             fs_nosys, fs_nosys,
 248 
 249         VOPNAME_READ, offsetof(struct vnodeops, vop_read),
 250             fs_nosys, fs_nosys,
 251 
 252         VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
 253             fs_nosys, fs_nosys,
 254 
 255         VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
 256             fs_nosys, fs_nosys,
 257 
 258         VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
 259             fs_setfl, fs_nosys,
 260 
 261         VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
 262             fs_nosys, fs_nosys,
 263 
 264         VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
 265             fs_nosys, fs_nosys,
 266 
 267         VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
 268             fs_nosys, fs_nosys,
 269 
 270         VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
 271             fs_nosys, fs_nosys,
 272 
 273         VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
 274             fs_nosys, fs_nosys,
 275 
 276         VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
 277             fs_nosys, fs_nosys,
 278 
 279         VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
 280             fs_nosys, fs_nosys,
 281 
 282         VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
 283             fs_nosys, fs_nosys,
 284 
 285         VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
 286             fs_nosys, fs_nosys,
 287 
 288         VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
 289             fs_nosys, fs_nosys,
 290 
 291         VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
 292             fs_nosys, fs_nosys,
 293 
 294         VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
 295             fs_nosys, fs_nosys,
 296 
 297         VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
 298             fs_nosys, fs_nosys,
 299 
 300         VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
 301             fs_nosys, fs_nosys,
 302 
 303         VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
 304             fs_nosys, fs_nosys,
 305 
 306         VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
 307             fs_nosys, fs_nosys,
 308 
 309         VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
 310             fs_rwlock, fs_rwlock,
 311 
 312         VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
 313             (fs_generic_func_p)(uintptr_t)fs_rwunlock,
 314             (fs_generic_func_p)(uintptr_t)fs_rwunlock,  /* no errors allowed */
 315 
 316         VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
 317             fs_nosys, fs_nosys,
 318 
 319         VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
 320             fs_cmp, fs_cmp,             /* no errors allowed */
 321 
 322         VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
 323             fs_frlock, fs_nosys,
 324 
 325         VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
 326             fs_nosys, fs_nosys,
 327 
 328         VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
 329             fs_nosys, fs_nosys,
 330 
 331         VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
 332             fs_nosys, fs_nosys,
 333 
 334         VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
 335             fs_nosys, fs_nosys,
 336 
 337         VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
 338             (fs_generic_func_p) fs_nosys_map,
 339             (fs_generic_func_p) fs_nosys_map,
 340 
 341         VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
 342             (fs_generic_func_p) fs_nosys_addmap,
 343             (fs_generic_func_p) fs_nosys_addmap,
 344 
 345         VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
 346             fs_nosys, fs_nosys,
 347 
 348         VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
 349             (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
 350 
 351         VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
 352             fs_nosys, fs_nosys,
 353 
 354         VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
 355             fs_pathconf, fs_nosys,
 356 
 357         VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
 358             fs_nosys, fs_nosys,
 359 
 360         VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
 361             fs_nosys, fs_nosys,
 362 
 363         VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
 364             (fs_generic_func_p)(uintptr_t)fs_dispose,
 365             (fs_generic_func_p)(uintptr_t)fs_nodispose,
 366 
 367         VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
 368             fs_nosys, fs_nosys,
 369 
 370         VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
 371             fs_fab_acl, fs_nosys,
 372 
 373         VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
 374             fs_shrlock, fs_nosys,
 375 
 376         VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
 377             (fs_generic_func_p) fs_vnevent_nosupport,
 378             (fs_generic_func_p) fs_vnevent_nosupport,
 379 
 380         VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
 381             fs_nosys, fs_nosys,
 382 
 383         VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
 384             fs_nosys, fs_nosys,
 385 
 386         NULL, 0, NULL, NULL
 387 };
 388 
 389 /* Extensible attribute (xva) routines. */
 390 
 391 /*
 392  * Zero out the structure, set the size of the requested/returned bitmaps,
 393  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 394  * to the returned attributes array.
 395  */
 396 void
 397 xva_init(xvattr_t *xvap)
 398 {
 399         bzero(xvap, sizeof (xvattr_t));
 400         xvap->xva_mapsize = XVA_MAPSIZE;
 401         xvap->xva_magic = XVA_MAGIC;
 402         xvap->xva_vattr.va_mask = AT_XVATTR;
 403         xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 404 }
 405 
 406 /*
 407  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 408  * structure.  Otherwise, returns NULL.
 409  */
 410 xoptattr_t *
 411 xva_getxoptattr(xvattr_t *xvap)
 412 {
 413         xoptattr_t *xoap = NULL;
 414         if (xvap->xva_vattr.va_mask & AT_XVATTR)
 415                 xoap = &xvap->xva_xoptattrs;
 416         return (xoap);
 417 }
 418 
 419 /*
 420  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 421  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 422  * kstat name.
 423  */
 424 static int
 425 vska_compar(const void *n1, const void *n2)
 426 {
 427         int ret;
 428         ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 429         ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 430 
 431         if (p1 < p2) {
 432                 ret = -1;
 433         } else if (p1 > p2) {
 434                 ret = 1;
 435         } else {
 436                 ret = 0;
 437         }
 438 
 439         return (ret);
 440 }
 441 
 442 /*
 443  * Used to create a single template which will be bcopy()ed to a newly
 444  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 445  */
 446 static vopstats_t *
 447 create_vopstats_template()
 448 {
 449         vopstats_t              *vsp;
 450 
 451         vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 452         bzero(vsp, sizeof (*vsp));      /* Start fresh */
 453 
 454         /* VOP_OPEN */
 455         kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 456         /* VOP_CLOSE */
 457         kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 458         /* VOP_READ I/O */
 459         kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 460         kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 461         /* VOP_WRITE I/O */
 462         kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 463         kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 464         /* VOP_IOCTL */
 465         kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 466         /* VOP_SETFL */
 467         kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 468         /* VOP_GETATTR */
 469         kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 470         /* VOP_SETATTR */
 471         kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 472         /* VOP_ACCESS */
 473         kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 474         /* VOP_LOOKUP */
 475         kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 476         /* VOP_CREATE */
 477         kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 478         /* VOP_REMOVE */
 479         kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 480         /* VOP_LINK */
 481         kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 482         /* VOP_RENAME */
 483         kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 484         /* VOP_MKDIR */
 485         kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 486         /* VOP_RMDIR */
 487         kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 488         /* VOP_READDIR I/O */
 489         kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 490         kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 491             KSTAT_DATA_UINT64);
 492         /* VOP_SYMLINK */
 493         kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 494         /* VOP_READLINK */
 495         kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 496         /* VOP_FSYNC */
 497         kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 498         /* VOP_INACTIVE */
 499         kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 500         /* VOP_FID */
 501         kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 502         /* VOP_RWLOCK */
 503         kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 504         /* VOP_RWUNLOCK */
 505         kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 506         /* VOP_SEEK */
 507         kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 508         /* VOP_CMP */
 509         kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 510         /* VOP_FRLOCK */
 511         kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 512         /* VOP_SPACE */
 513         kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 514         /* VOP_REALVP */
 515         kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 516         /* VOP_GETPAGE */
 517         kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 518         /* VOP_PUTPAGE */
 519         kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 520         /* VOP_MAP */
 521         kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 522         /* VOP_ADDMAP */
 523         kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 524         /* VOP_DELMAP */
 525         kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 526         /* VOP_POLL */
 527         kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 528         /* VOP_DUMP */
 529         kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 530         /* VOP_PATHCONF */
 531         kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 532         /* VOP_PAGEIO */
 533         kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 534         /* VOP_DUMPCTL */
 535         kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 536         /* VOP_DISPOSE */
 537         kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 538         /* VOP_SETSECATTR */
 539         kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 540         /* VOP_GETSECATTR */
 541         kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 542         /* VOP_SHRLOCK */
 543         kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 544         /* VOP_VNEVENT */
 545         kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 546         /* VOP_REQZCBUF */
 547         kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 548         /* VOP_RETZCBUF */
 549         kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 550 
 551         return (vsp);
 552 }
 553 
 554 /*
 555  * Creates a kstat structure associated with a vopstats structure.
 556  */
 557 kstat_t *
 558 new_vskstat(char *ksname, vopstats_t *vsp)
 559 {
 560         kstat_t         *ksp;
 561 
 562         if (!vopstats_enabled) {
 563                 return (NULL);
 564         }
 565 
 566         ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 567             sizeof (vopstats_t)/sizeof (kstat_named_t),
 568             KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 569         if (ksp) {
 570                 ksp->ks_data = vsp;
 571                 kstat_install(ksp);
 572         }
 573 
 574         return (ksp);
 575 }
 576 
 577 /*
 578  * Called from vfsinit() to initialize the support mechanisms for vopstats
 579  */
 580 void
 581 vopstats_startup()
 582 {
 583         if (!vopstats_enabled)
 584                 return;
 585 
 586         /*
 587          * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 588          * is necessary since we need to check if a kstat exists before we
 589          * attempt to create it.  Also, initialize its lock.
 590          */
 591         avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 592             offsetof(vsk_anchor_t, vsk_node));
 593         mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 594 
 595         vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 596             sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 597             NULL, NULL, 0);
 598 
 599         /*
 600          * Set up the array of pointers for the vopstats-by-FS-type.
 601          * The entries will be allocated/initialized as each file system
 602          * goes through modload/mod_installfs.
 603          */
 604         vopstats_fstype = (vopstats_t **)kmem_zalloc(
 605             (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 606 
 607         /* Set up the global vopstats initialization template */
 608         vs_templatep = create_vopstats_template();
 609 }
 610 
 611 /*
 612  * We need to have the all of the counters zeroed.
 613  * The initialization of the vopstats_t includes on the order of
 614  * 50 calls to kstat_named_init().  Rather that do that on every call,
 615  * we do it once in a template (vs_templatep) then bcopy it over.
 616  */
 617 void
 618 initialize_vopstats(vopstats_t *vsp)
 619 {
 620         if (vsp == NULL)
 621                 return;
 622 
 623         bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 624 }
 625 
 626 /*
 627  * If possible, determine which vopstats by fstype to use and
 628  * return a pointer to the caller.
 629  */
 630 vopstats_t *
 631 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 632 {
 633         int             fstype = 0;     /* Index into vfssw[] */
 634         vopstats_t      *vsp = NULL;
 635 
 636         if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 637             !vopstats_enabled)
 638                 return (NULL);
 639         /*
 640          * Set up the fstype.  We go to so much trouble because all versions
 641          * of NFS use the same fstype in their vfs even though they have
 642          * distinct entries in the vfssw[] table.
 643          * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 644          */
 645         if (vswp) {
 646                 fstype = vswp - vfssw;  /* Gets us the index */
 647         } else {
 648                 fstype = vfsp->vfs_fstype;
 649         }
 650 
 651         /*
 652          * Point to the per-fstype vopstats. The only valid values are
 653          * non-zero positive values less than the number of vfssw[] table
 654          * entries.
 655          */
 656         if (fstype > 0 && fstype < nfstype) {
 657                 vsp = vopstats_fstype[fstype];
 658         }
 659 
 660         return (vsp);
 661 }
 662 
 663 /*
 664  * Generate a kstat name, create the kstat structure, and allocate a
 665  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 666  * to the caller.  This must only be called from a mount.
 667  */
 668 vsk_anchor_t *
 669 get_vskstat_anchor(vfs_t *vfsp)
 670 {
 671         char            kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
 672         statvfs64_t     statvfsbuf;             /* Needed to find f_fsid */
 673         vsk_anchor_t    *vskp = NULL;           /* vfs <--> kstat anchor */
 674         kstat_t         *ksp;                   /* Ptr to new kstat */
 675         avl_index_t     where;                  /* Location in the AVL tree */
 676 
 677         if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 678             (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 679                 return (NULL);
 680 
 681         /* Need to get the fsid to build a kstat name */
 682         if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
 683                 /* Create a name for our kstats based on fsid */
 684                 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
 685                     VOPSTATS_STR, statvfsbuf.f_fsid);
 686 
 687                 /* Allocate and initialize the vsk_anchor_t */
 688                 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 689                 bzero(vskp, sizeof (*vskp));
 690                 vskp->vsk_fsid = statvfsbuf.f_fsid;
 691 
 692                 mutex_enter(&vskstat_tree_lock);
 693                 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
 694                         avl_insert(&vskstat_tree, vskp, where);
 695                         mutex_exit(&vskstat_tree_lock);
 696 
 697                         /*
 698                          * Now that we've got the anchor in the AVL
 699                          * tree, we can create the kstat.
 700                          */
 701                         ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
 702                         if (ksp) {
 703                                 vskp->vsk_ksp = ksp;
 704                         }
 705                 } else {
 706                         /* Oops, found one! Release memory and lock. */
 707                         mutex_exit(&vskstat_tree_lock);
 708                         kmem_cache_free(vsk_anchor_cache, vskp);
 709                         vskp = NULL;
 710                 }
 711         }
 712         return (vskp);
 713 }
 714 
 715 /*
 716  * We're in the process of tearing down the vfs and need to cleanup
 717  * the data structures associated with the vopstats. Must only be called
 718  * from dounmount().
 719  */
 720 void
 721 teardown_vopstats(vfs_t *vfsp)
 722 {
 723         vsk_anchor_t    *vskap;
 724         avl_index_t     where;
 725 
 726         if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 727             (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 728                 return;
 729 
 730         /* This is a safe check since VFS_STATS must be set (see above) */
 731         if ((vskap = vfsp->vfs_vskap) == NULL)
 732                 return;
 733 
 734         /* Whack the pointer right away */
 735         vfsp->vfs_vskap = NULL;
 736 
 737         /* Lock the tree, remove the node, and delete the kstat */
 738         mutex_enter(&vskstat_tree_lock);
 739         if (avl_find(&vskstat_tree, vskap, &where)) {
 740                 avl_remove(&vskstat_tree, vskap);
 741         }
 742 
 743         if (vskap->vsk_ksp) {
 744                 kstat_delete(vskap->vsk_ksp);
 745         }
 746         mutex_exit(&vskstat_tree_lock);
 747 
 748         kmem_cache_free(vsk_anchor_cache, vskap);
 749 }
 750 
 751 /*
 752  * Read or write a vnode.  Called from kernel code.
 753  */
 754 int
 755 vn_rdwr(
 756         enum uio_rw rw,
 757         struct vnode *vp,
 758         caddr_t base,
 759         ssize_t len,
 760         offset_t offset,
 761         enum uio_seg seg,
 762         int ioflag,
 763         rlim64_t ulimit,        /* meaningful only if rw is UIO_WRITE */
 764         cred_t *cr,
 765         ssize_t *residp)
 766 {
 767         struct uio uio;
 768         struct iovec iov;
 769         int error;
 770         int in_crit = 0;
 771 
 772         if (rw == UIO_WRITE && ISROFILE(vp))
 773                 return (EROFS);
 774 
 775         if (len < 0)
 776                 return (EIO);
 777 
 778         VOPXID_MAP_CR(vp, cr);
 779 
 780         iov.iov_base = base;
 781         iov.iov_len = len;
 782         uio.uio_iov = &iov;
 783         uio.uio_iovcnt = 1;
 784         uio.uio_loffset = offset;
 785         uio.uio_segflg = (short)seg;
 786         uio.uio_resid = len;
 787         uio.uio_llimit = ulimit;
 788 
 789         /*
 790          * We have to enter the critical region before calling VOP_RWLOCK
 791          * to avoid a deadlock with ufs.
 792          */
 793         if (nbl_need_check(vp)) {
 794                 int svmand;
 795 
 796                 nbl_start_crit(vp, RW_READER);
 797                 in_crit = 1;
 798                 error = nbl_svmand(vp, cr, &svmand);
 799                 if (error != 0)
 800                         goto done;
 801                 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
 802                     uio.uio_offset, uio.uio_resid, svmand, NULL)) {
 803                         error = EACCES;
 804                         goto done;
 805                 }
 806         }
 807 
 808         (void) VOP_RWLOCK(vp,
 809             rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 810         if (rw == UIO_WRITE) {
 811                 uio.uio_fmode = FWRITE;
 812                 uio.uio_extflg = UIO_COPY_DEFAULT;
 813                 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 814         } else {
 815                 uio.uio_fmode = FREAD;
 816                 uio.uio_extflg = UIO_COPY_CACHED;
 817                 error = VOP_READ(vp, &uio, ioflag, cr, NULL);
 818         }
 819         VOP_RWUNLOCK(vp,
 820             rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 821         if (residp)
 822                 *residp = uio.uio_resid;
 823         else if (uio.uio_resid)
 824                 error = EIO;
 825 
 826 done:
 827         if (in_crit)
 828                 nbl_end_crit(vp);
 829         return (error);
 830 }
 831 
 832 /*
 833  * Release a vnode.  Call VOP_INACTIVE on last reference or
 834  * decrement reference count.
 835  *
 836  * To avoid race conditions, the v_count is left at 1 for
 837  * the call to VOP_INACTIVE. This prevents another thread
 838  * from reclaiming and releasing the vnode *before* the
 839  * VOP_INACTIVE routine has a chance to destroy the vnode.
 840  * We can't have more than 1 thread calling VOP_INACTIVE
 841  * on a vnode.
 842  */
 843 void
 844 vn_rele(vnode_t *vp)
 845 {
 846         mutex_enter(&vp->v_lock);
 847         if (vp->v_count == 1) {
 848                 mutex_exit(&vp->v_lock);
 849                 VOP_INACTIVE(vp, CRED(), NULL);
 850                 return;
 851         } else {
 852                 VERIFY(vp->v_count > 0);
 853         }
 854         VN_RELE_LOCKED(vp);
 855         mutex_exit(&vp->v_lock);
 856 }
 857 
/*
 * Release a phantom hold on a vnode (see vn_count()).  A phantom hold is
 * counted in v_phantom_count in addition to v_count; like vn_rele(), the
 * last hold leaves v_count at 1 across the VOP_INACTIVE call.
 */
void
vn_phantom_rele(vnode_t *vp)
{
	mutex_enter(&vp->v_lock);
	/* Phantom holds can never outnumber total holds. */
	VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
	vp->v_phantom_count--;
	DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
	if (vp->v_count == 1) {
		/* Last hold overall, so no phantom holds may remain. */
		ASSERT0(vp->v_phantom_count);
		mutex_exit(&vp->v_lock);
		VOP_INACTIVE(vp, CRED(), NULL);
		return;
	} else {
		VERIFY(vp->v_count > 0);
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
 876 
 877 /*
 878  * Return the number of non-phantom holds. Things such as portfs will use
 879  * phantom holds to prevent it from blocking filesystems from mounting over
 880  * watched directories.
 881  */
 882 uint_t
 883 vn_count(vnode_t *vp)
 884 {
 885         ASSERT(MUTEX_HELD(&vp->v_lock));
 886         return (vp->v_count - vp->v_phantom_count);
 887 }
 888 
 889 /*
 890  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 891  * as a single reference, so v_count is not decremented until the last DNLC hold
 892  * is released. This makes it possible to distinguish vnodes that are referenced
 893  * only by the DNLC.
 894  */
void
vn_rele_dnlc(vnode_t *vp)
{
	mutex_enter(&vp->v_lock);
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	if (--vp->v_count_dnlc == 0) {
		/* Last DNLC hold: drop the single v_count reference it held. */
		if (vp->v_count == 1) {
			/* Final hold: leave v_count at 1 for VOP_INACTIVE. */
			mutex_exit(&vp->v_lock);
			VOP_INACTIVE(vp, CRED(), NULL);
			return;
		}
		VN_RELE_LOCKED(vp);
	}
	mutex_exit(&vp->v_lock);
}
 910 
 911 /*
 912  * Like vn_rele() except that it clears v_stream under v_lock.
 913  * This is used by sockfs when it dismantles the association between
 914  * the sockfs node and the vnode in the underlying file system.
 915  * v_lock has to be held to prevent a thread coming through the lookupname
 916  * path from accessing a stream head that is going away.
 917  */
void
vn_rele_stream(vnode_t *vp)
{
	mutex_enter(&vp->v_lock);
	/* Sever the stream association while v_lock is held (see above). */
	vp->v_stream = NULL;
	if (vp->v_count == 1) {
		/* Last hold: leave v_count at 1 for VOP_INACTIVE. */
		mutex_exit(&vp->v_lock);
		VOP_INACTIVE(vp, CRED(), NULL);
		return;
	} else {
		VERIFY(vp->v_count > 0);
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
 933 
/*
 * Taskq callback used by vn_rele_async(): perform the deferred
 * VOP_INACTIVE call from taskq context.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 939 
 940 /*
 941  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 942  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 943  * the file system as a result of releasing the vnode. Note, file systems
 944  * already have to handle the race where the vnode is incremented before the
 945  * inactive routine is called and does its locking.
 946  *
 947  * Warning: Excessive use of this routine can lead to performance problems.
 948  * This is because taskqs throttle back allocation if too many are created.
 949  */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Last hold: hand VOP_INACTIVE off to the taskq, leaving
		 * v_count at 1 as in vn_rele().  The VERIFY enforces that
		 * a TQ_SLEEP dispatch returns a valid taskq id.
		 */
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != TASKQID_INVALID);
		return;
	} else {
		VERIFY(vp->v_count > 0);
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
 965 
/*
 * Open/create a vnode by path name.  Convenience wrapper around
 * vn_openat() with a NULL start vnode and no caller file descriptor (-1).
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
 979 
 980 
 981 /*
 982  * Open/create a vnode.
 983  * This may be callable by the kernel, the only known use
 984  * of user context being that the current user credentials
 985  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 986  */
 987 int
 988 vn_openat(
 989         char *pnamep,
 990         enum uio_seg seg,
 991         int filemode,
 992         int createmode,
 993         struct vnode **vpp,
 994         enum create crwhy,
 995         mode_t umask,
 996         struct vnode *startvp,
 997         int fd)
 998 {
 999         struct vnode *vp;
1000         int mode;
1001         int accessflags;
1002         int error;
1003         int in_crit = 0;
1004         int open_done = 0;
1005         int shrlock_done = 0;
1006         struct vattr vattr;
1007         enum symfollow follow;
1008         int estale_retry = 0;
1009         struct shrlock shr;
1010         struct shr_locowner shr_own;
1011         boolean_t create;
1012 
1013         mode = 0;
1014         accessflags = 0;
1015         if (filemode & FREAD)
1016                 mode |= VREAD;
1017         if (filemode & (FWRITE|FTRUNC))
1018                 mode |= VWRITE;
1019         if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
1020                 mode |= VEXEC;
1021 
1022         /* symlink interpretation */
1023         if (filemode & FNOFOLLOW)
1024                 follow = NO_FOLLOW;
1025         else
1026                 follow = FOLLOW;
1027 
1028         if (filemode & FAPPEND)
1029                 accessflags |= V_APPEND;
1030 
1031         /*
1032          * We need to handle the case of FCREAT | FDIRECTORY and the case of
1033          * FEXCL. If all three are specified, then we always fail because we
1034          * cannot create a directory through this interface and FEXCL says we
1035          * need to fail the request if we can't create it. If, however, only
1036          * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1037          * of opening a file that already exists. If it exists, we can do
1038          * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1039          * treated as FDIRECTORY.
1040          */
1041         if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1042             (FCREAT | FDIRECTORY | FEXCL)) {
1043                 return (EINVAL);
1044         }
1045 
1046         if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1047                 create = B_FALSE;
1048         } else if ((filemode & FCREAT) != 0) {
1049                 create = B_TRUE;
1050         } else {
1051                 create = B_FALSE;
1052         }
1053 
1054 top:
1055         if (create) {
1056                 enum vcexcl excl;
1057 
1058                 /*
1059                  * Wish to create a file.
1060                  */
1061                 vattr.va_type = VREG;
1062                 vattr.va_mode = createmode;
1063                 vattr.va_mask = AT_TYPE|AT_MODE;
1064                 if (filemode & FTRUNC) {
1065                         vattr.va_size = 0;
1066                         vattr.va_mask |= AT_SIZE;
1067                 }
1068                 if (filemode & FEXCL)
1069                         excl = EXCL;
1070                 else
1071                         excl = NONEXCL;
1072 
1073                 if (error =
1074                     vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1075                     (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1076                         return (error);
1077         } else {
1078                 /*
1079                  * Wish to open a file.  Just look it up.
1080                  */
1081                 if (error = lookupnameat(pnamep, seg, follow,
1082                     NULLVPP, &vp, startvp)) {
1083                         if ((error == ESTALE) &&
1084                             fs_need_estale_retry(estale_retry++))
1085                                 goto top;
1086                         return (error);
1087                 }
1088 
1089                 /*
1090                  * Get the attributes to check whether file is large.
1091                  * We do this only if the FOFFMAX flag is not set and
1092                  * only for regular files.
1093                  */
1094 
1095                 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1096                         vattr.va_mask = AT_SIZE;
1097                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1098                             CRED(), NULL))) {
1099                                 goto out;
1100                         }
1101                         if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1102                                 /*
1103                                  * Large File API - regular open fails
1104                                  * if FOFFMAX flag is set in file mode
1105                                  */
1106                                 error = EOVERFLOW;
1107                                 goto out;
1108                         }
1109                 }
1110                 /*
1111                  * Can't write directories, active texts, or
1112                  * read-only filesystems.  Can't truncate files
1113                  * on which mandatory locking is in effect.
1114                  */
1115                 if (filemode & (FWRITE|FTRUNC)) {
1116                         /*
1117                          * Allow writable directory if VDIROPEN flag is set.
1118                          */
1119                         if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1120                                 error = EISDIR;
1121                                 goto out;
1122                         }
1123                         if (ISROFILE(vp)) {
1124                                 error = EROFS;
1125                                 goto out;
1126                         }
1127                         /*
1128                          * Can't truncate files on which
1129                          * sysv mandatory locking is in effect.
1130                          */
1131                         if (filemode & FTRUNC) {
1132                                 vnode_t *rvp;
1133 
1134                                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1135                                         rvp = vp;
1136                                 if (rvp->v_filocks != NULL) {
1137                                         vattr.va_mask = AT_MODE;
1138                                         if ((error = VOP_GETATTR(vp,
1139                                             &vattr, 0, CRED(), NULL)) == 0 &&
1140                                             MANDLOCK(vp, vattr.va_mode))
1141                                                 error = EAGAIN;
1142                                 }
1143                         }
1144                         if (error)
1145                                 goto out;
1146                 }
1147                 /*
1148                  * Check permissions.
1149                  */
1150                 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1151                         goto out;
1152 
1153                 /*
1154                  * Require FSEARCH and FDIRECTORY to return a directory. Require
1155                  * FEXEC to return a regular file.
1156                  */
1157                 if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1158                     vp->v_type != VDIR) {
1159                         error = ENOTDIR;
1160                         goto out;
1161                 }
1162                 if ((filemode & FEXEC) && vp->v_type != VREG) {
1163                         error = ENOEXEC;        /* XXX: error code? */
1164                         goto out;
1165                 }
1166         }
1167 
1168         /*
1169          * Do remaining checks for FNOFOLLOW and FNOLINKS.
1170          */
1171         if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1172                 /*
1173                  * The __FLXPATH flag is a private interface for use by the lx
1174                  * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
1175                  * when a symbolic link is encountered, returns a file
1176                  * descriptor which references it.
1177                  * See uts/common/brand/lx/syscall/lx_open.c
1178                  *
1179                  * When this flag is set, VOP_OPEN() is not called (for a
1180                  * symlink, most filesystems will return ENOSYS anyway)
1181                  * and the link's vnode is returned to be linked to the
1182                  * file descriptor.
1183                  */
1184                 if ((filemode & __FLXPATH) == 0)
1185                         error = ELOOP;
1186                 goto out;
1187         }
1188         if (filemode & FNOLINKS) {
1189                 vattr.va_mask = AT_NLINK;
1190                 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1191                         goto out;
1192                 }
1193                 if (vattr.va_nlink != 1) {
1194                         error = EMLINK;
1195                         goto out;
1196                 }
1197         }
1198 
1199         /*
1200          * Opening a socket corresponding to the AF_UNIX pathname
1201          * in the filesystem name space is not supported.
1202          * However, VSOCK nodes in namefs are supported in order
1203          * to make fattach work for sockets.
1204          *
1205          * XXX This uses VOP_REALVP to distinguish between
1206          * an unopened namefs node (where VOP_REALVP returns a
1207          * different VSOCK vnode) and a VSOCK created by vn_create
1208          * in some file system (where VOP_REALVP would never return
1209          * a different vnode).
1210          */
1211         if (vp->v_type == VSOCK) {
1212                 struct vnode *nvp;
1213 
1214                 error = VOP_REALVP(vp, &nvp, NULL);
1215                 if (error != 0 || nvp == NULL || nvp == vp ||
1216                     nvp->v_type != VSOCK) {
1217                         error = EOPNOTSUPP;
1218                         goto out;
1219                 }
1220         }
1221 
1222         if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1223                 /* get share reservation */
1224                 shr.s_access = 0;
1225                 if (filemode & FWRITE)
1226                         shr.s_access |= F_WRACC;
1227                 if (filemode & FREAD)
1228                         shr.s_access |= F_RDACC;
1229                 shr.s_deny = 0;
1230                 shr.s_sysid = 0;
1231                 shr.s_pid = ttoproc(curthread)->p_pid;
1232                 shr_own.sl_pid = shr.s_pid;
1233                 shr_own.sl_id = fd;
1234                 shr.s_own_len = sizeof (shr_own);
1235                 shr.s_owner = (caddr_t)&shr_own;
1236                 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1237                     NULL);
1238                 if (error)
1239                         goto out;
1240                 shrlock_done = 1;
1241 
1242                 /* nbmand conflict check if truncating file */
1243                 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1244                         nbl_start_crit(vp, RW_READER);
1245                         in_crit = 1;
1246 
1247                         vattr.va_mask = AT_SIZE;
1248                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1249                                 goto out;
1250                         if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1251                             NULL)) {
1252                                 error = EACCES;
1253                                 goto out;
1254                         }
1255                 }
1256         }
1257 
1258         /*
1259          * Do opening protocol.
1260          */
1261         error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1262         if (error)
1263                 goto out;
1264         open_done = 1;
1265 
1266         /*
1267          * Truncate if required.
1268          */
1269         if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1270                 vattr.va_size = 0;
1271                 vattr.va_mask = AT_SIZE;
1272                 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1273                         goto out;
1274         }
1275 
1276         /*
1277          * Turn on directio, if requested.
1278          */
1279         if (filemode & FDIRECT) {
1280                 if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1281                     CRED(), NULL, NULL)) != 0) {
1282                         /*
1283                          * On Linux, O_DIRECT returns EINVAL when the file
1284                          * system does not support directio, so we'll do the
1285                          * same.
1286                          */
1287                         error = EINVAL;
1288                         goto out;
1289                 }
1290         }
1291 out:
1292         ASSERT(vp->v_count > 0);
1293 
1294         if (in_crit) {
1295                 nbl_end_crit(vp);
1296                 in_crit = 0;
1297         }
1298         if (error) {
1299                 if (open_done) {
1300                         (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1301                             NULL);
1302                         open_done = 0;
1303                         shrlock_done = 0;
1304                 }
1305                 if (shrlock_done) {
1306                         (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1307                             NULL);
1308                         shrlock_done = 0;
1309                 }
1310 
1311                 /*
1312                  * The following clause was added to handle a problem
1313                  * with NFS consistency.  It is possible that a lookup
1314                  * of the file to be opened succeeded, but the file
1315                  * itself doesn't actually exist on the server.  This
1316                  * is chiefly due to the DNLC containing an entry for
1317                  * the file which has been removed on the server.  In
1318                  * this case, we just start over.  If there was some
1319                  * other cause for the ESTALE error, then the lookup
1320                  * of the file will fail and the error will be returned
1321                  * above instead of looping around from here.
1322                  */
1323                 VN_RELE(vp);
1324                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1325                         goto top;
1326         } else
1327                 *vpp = vp;
1328         return (error);
1329 }
1330 
1331 /*
1332  * The following two accessor functions are for the NFSv4 server.  Since there
1333  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1334  * vnode open counts correct when a client "upgrades" an open or does an
1335  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1336  * open mode (add or subtract read or write), but also change the share/deny
1337  * modes.  However, share reservations are not integrated with OPEN, yet, so
1338  * we need to handle each separately.  These functions are cleaner than having
1339  * the NFS server manipulate the counts directly, however, nobody else should
1340  * use these functions.
1341  */
1342 void
1343 vn_open_upgrade(
1344         vnode_t *vp,
1345         int filemode)
1346 {
1347         ASSERT(vp->v_type == VREG);
1348 
1349         if (filemode & FREAD)
1350                 atomic_inc_32(&vp->v_rdcnt);
1351         if (filemode & FWRITE)
1352                 atomic_inc_32(&vp->v_wrcnt);
1353 
1354 }
1355 
1356 void
1357 vn_open_downgrade(
1358         vnode_t *vp,
1359         int filemode)
1360 {
1361         ASSERT(vp->v_type == VREG);
1362 
1363         if (filemode & FREAD) {
1364                 ASSERT(vp->v_rdcnt > 0);
1365                 atomic_dec_32(&vp->v_rdcnt);
1366         }
1367         if (filemode & FWRITE) {
1368                 ASSERT(vp->v_wrcnt > 0);
1369                 atomic_dec_32(&vp->v_wrcnt);
1370         }
1371 
1372 }
1373 
/*
 * Create a vnode by path name.  Convenience wrapper around vn_createat()
 * with a NULL start vnode.
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
1389 
1390 /*
1391  * Create a vnode (makenode).
1392  */
1393 int
1394 vn_createat(
1395         char *pnamep,
1396         enum uio_seg seg,
1397         struct vattr *vap,
1398         enum vcexcl excl,
1399         int mode,
1400         struct vnode **vpp,
1401         enum create why,
1402         int flag,
1403         mode_t umask,
1404         struct vnode *startvp)
1405 {
1406         struct vnode *dvp;      /* ptr to parent dir vnode */
1407         struct vnode *vp = NULL;
1408         struct pathname pn;
1409         int error;
1410         int in_crit = 0;
1411         struct vattr vattr;
1412         enum symfollow follow;
1413         int estale_retry = 0;
1414         uint32_t auditing = AU_AUDITING();
1415 
1416         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1417 
1418         /* symlink interpretation */
1419         if ((flag & FNOFOLLOW) || excl == EXCL)
1420                 follow = NO_FOLLOW;
1421         else
1422                 follow = FOLLOW;
1423         flag &= ~(FNOFOLLOW|FNOLINKS);
1424 
1425 top:
1426         /*
1427          * Lookup directory.
1428          * If new object is a file, call lower level to create it.
1429          * Note that it is up to the lower level to enforce exclusive
1430          * creation, if the file is already there.
1431          * This allows the lower level to do whatever
1432          * locking or protocol that is needed to prevent races.
1433          * If the new object is directory call lower level to make
1434          * the new directory, with "." and "..".
1435          */
1436         if (error = pn_get(pnamep, seg, &pn))
1437                 return (error);
1438         if (auditing)
1439                 audit_vncreate_start();
1440         dvp = NULL;
1441         *vpp = NULL;
1442         /*
1443          * lookup will find the parent directory for the vnode.
1444          * When it is done the pn holds the name of the entry
1445          * in the directory.
1446          * If this is a non-exclusive create we also find the node itself.
1447          */
1448         error = lookuppnat(&pn, NULL, follow, &dvp,
1449             (excl == EXCL) ? NULLVPP : vpp, startvp);
1450         if (error) {
1451                 pn_free(&pn);
1452                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1453                         goto top;
1454                 if (why == CRMKDIR && error == EINVAL)
1455                         error = EEXIST;         /* SVID */
1456                 return (error);
1457         }
1458 
1459         if (why != CRMKNOD)
1460                 vap->va_mode &= ~VSVTX;
1461 
1462         /*
1463          * If default ACLs are defined for the directory don't apply the
1464          * umask if umask is passed.
1465          */
1466 
1467         if (umask) {
1468 
1469                 vsecattr_t vsec;
1470 
1471                 vsec.vsa_aclcnt = 0;
1472                 vsec.vsa_aclentp = NULL;
1473                 vsec.vsa_dfaclcnt = 0;
1474                 vsec.vsa_dfaclentp = NULL;
1475                 vsec.vsa_mask = VSA_DFACLCNT;
1476                 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1477                 /*
1478                  * If error is ENOSYS then treat it as no error
1479                  * Don't want to force all file systems to support
1480                  * aclent_t style of ACL's.
1481                  */
1482                 if (error == ENOSYS)
1483                         error = 0;
1484                 if (error) {
1485                         if (*vpp != NULL)
1486                                 VN_RELE(*vpp);
1487                         goto out;
1488                 } else {
1489                         /*
1490                          * Apply the umask if no default ACLs.
1491                          */
1492                         if (vsec.vsa_dfaclcnt == 0)
1493                                 vap->va_mode &= ~umask;
1494 
1495                         /*
1496                          * VOP_GETSECATTR() may have allocated memory for
1497                          * ACLs we didn't request, so double-check and
1498                          * free it if necessary.
1499                          */
1500                         if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1501                                 kmem_free((caddr_t)vsec.vsa_aclentp,
1502                                     vsec.vsa_aclcnt * sizeof (aclent_t));
1503                         if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1504                                 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1505                                     vsec.vsa_dfaclcnt * sizeof (aclent_t));
1506                 }
1507         }
1508 
1509         /*
1510          * In general we want to generate EROFS if the file system is
1511          * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1512          * documents the open system call, and it says that O_CREAT has no
1513          * effect if the file already exists.  Bug 1119649 states
1514          * that open(path, O_CREAT, ...) fails when attempting to open an
1515          * existing file on a read only file system.  Thus, the first part
1516          * of the following if statement has 3 checks:
1517          *      if the file exists &&
1518          *              it is being open with write access &&
1519          *              the file system is read only
1520          *      then generate EROFS
1521          */
1522         if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1523             (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1524                 if (*vpp)
1525                         VN_RELE(*vpp);
1526                 error = EROFS;
1527         } else if (excl == NONEXCL && *vpp != NULL) {
1528                 vnode_t *rvp;
1529 
1530                 /*
1531                  * File already exists.  If a mandatory lock has been
1532                  * applied, return error.
1533                  */
1534                 vp = *vpp;
1535                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1536                         rvp = vp;
1537                 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1538                         nbl_start_crit(vp, RW_READER);
1539                         in_crit = 1;
1540                 }
1541                 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1542                         vattr.va_mask = AT_MODE|AT_SIZE;
1543                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1544                                 goto out;
1545                         }
1546                         if (MANDLOCK(vp, vattr.va_mode)) {
1547                                 error = EAGAIN;
1548                                 goto out;
1549                         }
1550                         /*
1551                          * File cannot be truncated if non-blocking mandatory
1552                          * locks are currently on the file.
1553                          */
1554                         if ((vap->va_mask & AT_SIZE) && in_crit) {
1555                                 u_offset_t offset;
1556                                 ssize_t length;
1557 
1558                                 offset = vap->va_size > vattr.va_size ?
1559                                     vattr.va_size : vap->va_size;
1560                                 length = vap->va_size > vattr.va_size ?
1561                                     vap->va_size - vattr.va_size :
1562                                     vattr.va_size - vap->va_size;
1563                                 if (nbl_conflict(vp, NBL_WRITE, offset,
1564                                     length, 0, NULL)) {
1565                                         error = EACCES;
1566                                         goto out;
1567                                 }
1568                         }
1569                 }
1570 
1571                 /*
1572                  * If the file is the root of a VFS, we've crossed a
1573                  * mount point and the "containing" directory that we
1574                  * acquired above (dvp) is irrelevant because it's in
1575                  * a different file system.  We apply VOP_CREATE to the
1576                  * target itself instead of to the containing directory
1577                  * and supply a null path name to indicate (conventionally)
1578                  * the node itself as the "component" of interest.
1579                  *
1580                  * The call to VOP_CREATE() is necessary to ensure
1581                  * that the appropriate permission checks are made,
1582                  * i.e. EISDIR, EACCES, etc.  We already know that vpp
1583                  * exists since we are in the else condition where this
1584                  * was checked.
1585                  */
1586                 if (vp->v_flag & VROOT) {
1587                         ASSERT(why != CRMKDIR);
1588                         error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1589                             CRED(), flag, NULL, NULL);
1590                         /*
1591                          * If the create succeeded, it will have created a
1592                          * new reference on a new vnode (*vpp) in the child
1593                          * file system, so we want to drop our reference on
1594                          * the old (vp) upon exit.
1595                          */
1596                         goto out;
1597                 }
1598 
1599                 /*
1600                  * Large File API - non-large open (FOFFMAX flag not set)
1601                  * of regular file fails if the file size exceeds MAXOFF32_T.
1602                  */
1603                 if (why != CRMKDIR &&
1604                     !(flag & FOFFMAX) &&
1605                     (vp->v_type == VREG)) {
1606                         vattr.va_mask = AT_SIZE;
1607                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1608                             CRED(), NULL))) {
1609                                 goto out;
1610                         }
1611                         if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1612                                 error = EOVERFLOW;
1613                                 goto out;
1614                         }
1615                 }
1616         }
1617 
1618         if (error == 0) {
1619                 /*
1620                  * Call mkdir() if specified, otherwise create().
1621                  */
1622                 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1623 
1624                 if (why == CRMKDIR)
1625                         /*
1626                          * N.B., if vn_createat() ever requests
1627                          * case-insensitive behavior then it will need
1628                          * to be passed to VOP_MKDIR().  VOP_CREATE()
1629                          * will already get it via "flag"
1630                          */
1631                         error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1632                             NULL, 0, NULL);
1633                 else if (!must_be_dir)
1634                         error = VOP_CREATE(dvp, pn.pn_path, vap,
1635                             excl, mode, vpp, CRED(), flag, NULL, NULL);
1636                 else
1637                         error = ENOTDIR;
1638         }
1639 
1640 out:
1641 
1642         if (auditing)
1643                 audit_vncreate_finish(*vpp, error);
1644         if (in_crit) {
1645                 nbl_end_crit(vp);
1646                 in_crit = 0;
1647         }
1648         if (vp != NULL) {
1649                 VN_RELE(vp);
1650                 vp = NULL;
1651         }
1652         pn_free(&pn);
1653         VN_RELE(dvp);
1654         /*
1655          * The following clause was added to handle a problem
1656          * with NFS consistency.  It is possible that a lookup
1657          * of the file to be created succeeded, but the file
1658          * itself doesn't actually exist on the server.  This
1659          * is chiefly due to the DNLC containing an entry for
1660          * the file which has been removed on the server.  In
1661          * this case, we just start over.  If there was some
1662          * other cause for the ESTALE error, then the lookup
1663          * of the file will fail and the error will be returned
1664          * above instead of looping around from here.
1665          */
1666         if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1667                 goto top;
1668         return (error);
1669 }
1670 
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	/*
	 * Convenience wrapper around vn_linkat(): no starting directory
	 * vnodes (paths resolve from root/cwd) and do not follow a
	 * trailing symlink in the source path.
	 */
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
1676 
/*
 * Create a hard link named by "to" referring to the existing file named
 * by "from".  fstartvp/tstartvp, when non-NULL, are the starting
 * directories for relative lookups of "from" and "to" respectively
 * (*at() semantics).  "follow" controls whether a trailing symlink in
 * "from" is followed.  Returns 0 on success or an errno value.
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;		/* bounds the ESTALE retry loop */
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Resolve the existing source file. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Resolve the directory that will contain the new link. */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.  fsids are compared
	 * (rather than vfs pointers) so this works through loopback
	 * (lofs) mounts as well.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;		/* cross-device link */
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);	/* strip any trailing '/' */
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	/* Stale DNLC entries (e.g. on NFS) yield ESTALE; retry a few times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1736 
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	/*
	 * Convenience wrapper around vn_renameat(): no starting directory
	 * vnodes, so both paths resolve from root/cwd.
	 */
	return (vn_renameat(NULL, from, NULL, to, seg));
}
1742 
/*
 * Rename the file named by fname to tname.  fdvp/tdvp, when non-NULL,
 * are the starting directories for relative lookups of the two paths
 * (*at() semantics).  Returns 0 on success or an errno value.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* inside nbmand critical regions? */
	vnode_t *fromvp, *fvp;		/* source dir / source entry */
	vnode_t *tovp, *targvp;		/* target dir / existing target entry */
	int estale_retry = 0;		/* bounds the ESTALE retry loop */
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/*
	 * If an entry already exists at the destination (and is not the
	 * source itself), make sure no non-blocking mandatory locks
	 * forbid removing it.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check the source for nbmand rename conflicts. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);	/* strip any trailing '/' */
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	/* Exit critical regions before dropping the vnode references. */
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	/* Stale DNLC entries (e.g. on NFS) yield ESTALE; retry a few times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1882 
1883 /*
1884  * Remove a file or directory.
1885  */
1886 int
1887 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1888 {
1889         return (vn_removeat(NULL, fnamep, seg, dirflag));
1890 }
1891 
/*
 * Remove (unlink or rmdir, depending on dirflag) the entry named by
 * fnamep, resolved relative to startvp when the path is relative.
 * Handles the special case of a file with an unlinkable (namefs)
 * filesystem mounted on top of it by unmounting first and operating
 * on the covered vnode.  Returns 0 on success or an errno value.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* inside nbmand critical region? */
	int estale_retry = 0;	/* bounds the ESTALE retry loop */

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		/* Take our own hold on coveredvp before dropping vp. */
		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Remember the type before vp may be released below. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * Hold the current working directory so VOP_RMDIR
			 * can refuse to remove it.
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	/* Stale DNLC entries (e.g. on NFS) yield ESTALE; retry a few times. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2079 
2080 /*
2081  * Utility function to compare equality of vnodes.
2082  * Compare the underlying real vnodes, if there are underlying vnodes.
2083  * This is a more thorough comparison than the VN_CMP() macro provides.
2084  */
2085 int
2086 vn_compare(vnode_t *vp1, vnode_t *vp2)
2087 {
2088         vnode_t *realvp;
2089 
2090         if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2091                 vp1 = realvp;
2092         if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2093                 vp2 = realvp;
2094         return (VN_CMP(vp1, vp2));
2095 }
2096 
2097 /*
2098  * The number of locks to hash into.  This value must be a power
2099  * of 2 minus 1 and should probably also be prime.
2100  */
2101 #define NUM_BUCKETS     1023
2102 
2103 struct  vn_vfslocks_bucket {
2104         kmutex_t vb_lock;
2105         vn_vfslocks_entry_t *vb_list;
2106         char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2107 };
2108 
2109 /*
2110  * Total number of buckets will be NUM_BUCKETS + 1 .
2111  */
2112 
2113 #pragma align   64(vn_vfslocks_buckets)
2114 static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2115 
2116 #define VN_VFSLOCKS_SHIFT       9
2117 
2118 #define VN_VFSLOCKS_HASH(vfsvpptr)      \
2119         ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2120 
2121 /*
2122  * vn_vfslocks_getlock() uses an HASH scheme to generate
2123  * rwstlock using vfs/vnode pointer passed to it.
2124  *
2125  * vn_vfslocks_rele() releases a reference in the
2126  * HASH table which allows the entry allocated by
2127  * vn_vfslocks_getlock() to be freed at a later
2128  * stage when the refcount drops to zero.
2129  */
2130 
2131 vn_vfslocks_entry_t *
2132 vn_vfslocks_getlock(void *vfsvpptr)
2133 {
2134         struct vn_vfslocks_bucket *bp;
2135         vn_vfslocks_entry_t *vep;
2136         vn_vfslocks_entry_t *tvep;
2137 
2138         ASSERT(vfsvpptr != NULL);
2139         bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2140 
2141         mutex_enter(&bp->vb_lock);
2142         for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2143                 if (vep->ve_vpvfs == vfsvpptr) {
2144                         vep->ve_refcnt++;
2145                         mutex_exit(&bp->vb_lock);
2146                         return (vep);
2147                 }
2148         }
2149         mutex_exit(&bp->vb_lock);
2150         vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2151         rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2152         vep->ve_vpvfs = (char *)vfsvpptr;
2153         vep->ve_refcnt = 1;
2154         mutex_enter(&bp->vb_lock);
2155         for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2156                 if (tvep->ve_vpvfs == vfsvpptr) {
2157                         tvep->ve_refcnt++;
2158                         mutex_exit(&bp->vb_lock);
2159 
2160                         /*
2161                          * There is already an entry in the hash
2162                          * destroy what we just allocated.
2163                          */
2164                         rwst_destroy(&vep->ve_lock);
2165                         kmem_free(vep, sizeof (*vep));
2166                         return (tvep);
2167                 }
2168         }
2169         vep->ve_next = bp->vb_list;
2170         bp->vb_list = vep;
2171         mutex_exit(&bp->vb_lock);
2172         return (vep);
2173 }
2174 
/*
 * Drop one reference on a hash-table entry obtained from
 * vn_vfslocks_getlock().  When the reference count reaches zero the
 * entry is unlinked from its bucket chain and freed.  Panics on a
 * negative refcount or if a zero-refcount entry is not found in its
 * bucket (both indicate refcounting bugs elsewhere).
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	pvep = NULL;	/* trailing pointer for the list unlink below */
	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				/* Unlink from head or from mid-chain. */
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2213 
2214 /*
2215  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2216  * lock protecting the v_vfsmountedhere field.
2217  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2218  * except that it blocks to acquire the lock VVFSLOCK.
2219  *
2220  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2221  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2222  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2223  */
2224 int
2225 vn_vfswlock_wait(vnode_t *vp)
2226 {
2227         int retval;
2228         vn_vfslocks_entry_t *vpvfsentry;
2229         ASSERT(vp != NULL);
2230 
2231         vpvfsentry = vn_vfslocks_getlock(vp);
2232         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2233 
2234         if (retval == EINTR) {
2235                 vn_vfslocks_rele(vpvfsentry);
2236                 return (EINTR);
2237         }
2238         return (retval);
2239 }
2240 
2241 int
2242 vn_vfsrlock_wait(vnode_t *vp)
2243 {
2244         int retval;
2245         vn_vfslocks_entry_t *vpvfsentry;
2246         ASSERT(vp != NULL);
2247 
2248         vpvfsentry = vn_vfslocks_getlock(vp);
2249         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2250 
2251         if (retval == EINTR) {
2252                 vn_vfslocks_rele(vpvfsentry);
2253                 return (EINTR);
2254         }
2255 
2256         return (retval);
2257 }
2258 
2259 
2260 /*
2261  * vn_vfswlock is used to implement a lock which is logically a writers lock
2262  * protecting the v_vfsmountedhere field.
2263  */
2264 int
2265 vn_vfswlock(vnode_t *vp)
2266 {
2267         vn_vfslocks_entry_t *vpvfsentry;
2268 
2269         /*
2270          * If vp is NULL then somebody is trying to lock the covered vnode
2271          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2272          * only happen when unmounting /.  Since that operation will fail
2273          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2274          */
2275         if (vp == NULL)
2276                 return (EBUSY);
2277 
2278         vpvfsentry = vn_vfslocks_getlock(vp);
2279 
2280         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2281                 return (0);
2282 
2283         vn_vfslocks_rele(vpvfsentry);
2284         return (EBUSY);
2285 }
2286 
2287 int
2288 vn_vfsrlock(vnode_t *vp)
2289 {
2290         vn_vfslocks_entry_t *vpvfsentry;
2291 
2292         /*
2293          * If vp is NULL then somebody is trying to lock the covered vnode
2294          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2295          * only happen when unmounting /.  Since that operation will fail
2296          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2297          */
2298         if (vp == NULL)
2299                 return (EBUSY);
2300 
2301         vpvfsentry = vn_vfslocks_getlock(vp);
2302 
2303         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2304                 return (0);
2305 
2306         vn_vfslocks_rele(vpvfsentry);
2307         return (EBUSY);
2308 }
2309 
/*
 * Release the v_vfsmountedhere lock acquired via vn_vfsrlock/vn_vfswlock
 * (or their _wait variants).
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release refernce after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2327 
2328 int
2329 vn_vfswlock_held(vnode_t *vp)
2330 {
2331         int held;
2332         vn_vfslocks_entry_t *vpvfsentry;
2333 
2334         ASSERT(vp != NULL);
2335 
2336         vpvfsentry = vn_vfslocks_getlock(vp);
2337         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2338 
2339         vn_vfslocks_rele(vpvfsentry);
2340         return (held);
2341 }
2342 
2343 
2344 int
2345 vn_make_ops(
2346         const char *name,                       /* Name of file system */
2347         const fs_operation_def_t *templ,        /* Operation specification */
2348         vnodeops_t **actual)                    /* Return the vnodeops */
2349 {
2350         int unused_ops;
2351         int error;
2352 
2353         *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2354 
2355         (*actual)->vnop_name = name;
2356 
2357         error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2358         if (error) {
2359                 kmem_free(*actual, sizeof (vnodeops_t));
2360         }
2361 
2362 #if DEBUG
2363         if (unused_ops != 0)
2364                 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2365                     "but not used", name, unused_ops);
2366 #endif
2367 
2368         return (error);
2369 }
2370 
2371 /*
2372  * Free the vnodeops created as a result of vn_make_ops()
2373  */
2374 void
2375 vn_freevnodeops(vnodeops_t *vnops)
2376 {
2377         kmem_free(vnops, sizeof (vnodeops_t));
2378 }
2379 
2380 /*
2381  * Vnode cache.
2382  */
2383 
2384 /* ARGSUSED */
2385 static int
2386 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2387 {
2388         struct vnode *vp;
2389 
2390         vp = buf;
2391 
2392         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2393         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2394         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2395         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2396         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2397         vp->v_path = vn_vpath_empty;
2398         vp->v_path_stamp = 0;
2399         vp->v_mpssdata = NULL;
2400         vp->v_vsd = NULL;
2401         vp->v_fopdata = NULL;
2402 
2403         return (0);
2404 }
2405 
2406 /* ARGSUSED */
2407 static void
2408 vn_cache_destructor(void *buf, void *cdrarg)
2409 {
2410         struct vnode *vp;
2411 
2412         vp = buf;
2413 
2414         rw_destroy(&vp->v_nbllock);
2415         cv_destroy(&vp->v_cv);
2416         mutex_destroy(&vp->v_vsd_lock);
2417         mutex_destroy(&vp->v_lock);
2418 }
2419 
/*
 * Create the global vnode kmem cache.  The alignment assertion keeps
 * VNODE_ALIGN consistent with VNODE_ALIGN_LOG2 and the rounded vnode
 * size, so cached vnodes land on the expected alignment boundary.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2430 
/*
 * Destroy the global vnode kmem cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2436 
2437 /*
2438  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2439  * cached by the file system and vnodes remain associated.
2440  */
2441 void
2442 vn_recycle(vnode_t *vp)
2443 {
2444         ASSERT(vp->v_pages == NULL);
2445         VERIFY(vp->v_path != NULL);
2446 
2447         /*
2448          * XXX - This really belongs in vn_reinit(), but we have some issues
2449          * with the counts.  Best to have it here for clean initialization.
2450          */
2451         vp->v_rdcnt = 0;
2452         vp->v_wrcnt = 0;
2453         vp->v_mmap_read = 0;
2454         vp->v_mmap_write = 0;
2455 
2456         /*
2457          * If FEM was in use, make sure everything gets cleaned up
2458          * NOTE: vp->v_femhead is initialized to NULL in the vnode
2459          * constructor.
2460          */
2461         if (vp->v_femhead) {
2462                 /* XXX - There should be a free_femhead() that does all this */
2463                 ASSERT(vp->v_femhead->femh_list == NULL);
2464                 mutex_destroy(&vp->v_femhead->femh_lock);
2465                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2466                 vp->v_femhead = NULL;
2467         }
2468         if (vp->v_path != vn_vpath_empty) {
2469                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2470                 vp->v_path = vn_vpath_empty;
2471         }
2472         vp->v_path_stamp = 0;
2473 
2474         if (vp->v_fopdata != NULL) {
2475                 free_fopdata(vp);
2476         }
2477         vp->v_mpssdata = NULL;
2478         vsd_free(vp);
2479 }
2480 
2481 /*
2482  * Used to reset the vnode fields including those that are directly accessible
2483  * as well as those which require an accessor function.
2484  *
2485  * Does not initialize:
2486  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2487  *      v_data (since FS-nodes and vnodes point to each other and should
2488  *              be updated simultaneously)
2489  *      v_op (in case someone needs to make a VOP call on this object)
2490  */
2491 void
2492 vn_reinit(vnode_t *vp)
2493 {
2494         vp->v_count = 1;
2495         vp->v_count_dnlc = 0;
2496         vp->v_phantom_count = 0;
2497         vp->v_vfsp = NULL;
2498         vp->v_stream = NULL;
2499         vp->v_vfsmountedhere = NULL;
2500         vp->v_flag = 0;
2501         vp->v_type = VNON;
2502         vp->v_rdev = NODEV;
2503 
2504         vp->v_filocks = NULL;
2505         vp->v_shrlocks = NULL;
2506         vp->v_pages = NULL;
2507 
2508         vp->v_locality = NULL;
2509         vp->v_xattrdir = NULL;
2510 
2511         /*
2512          * In a few specific instances, vn_reinit() is used to initialize
2513          * locally defined vnode_t instances.  Lacking the construction offered
2514          * by vn_alloc(), these vnodes require v_path initialization.
2515          */
2516         if (vp->v_path == NULL) {
2517                 vp->v_path = vn_vpath_empty;
2518         }
2519 
2520         /* Handles v_femhead, v_path, and the r/w/map counts */
2521         vn_recycle(vp);
2522 }
2523 
2524 vnode_t *
2525 vn_alloc(int kmflag)
2526 {
2527         vnode_t *vp;
2528 
2529         vp = kmem_cache_alloc(vn_cache, kmflag);
2530 
2531         if (vp != NULL) {
2532                 vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2533                 vp->v_fopdata = NULL;
2534                 vn_reinit(vp);
2535         }
2536 
2537         return (vp);
2538 }
2539 
/*
 * Release a vnode back to the cache.  The caller must have dropped all
 * locks/shares on it; the asserts enforce that no file locks, share
 * locks, DNLC references, or phantom holds remain.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	ASSERT0(vp->v_phantom_count);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2576 
2577 /*
2578  * vnode status changes, should define better states than 1, 0.
2579  */
2580 void
2581 vn_reclaim(vnode_t *vp)
2582 {
2583         vfs_t   *vfsp = vp->v_vfsp;
2584 
2585         if (vfsp == NULL ||
2586             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2587                 return;
2588         }
2589         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2590 }
2591 
2592 void
2593 vn_idle(vnode_t *vp)
2594 {
2595         vfs_t   *vfsp = vp->v_vfsp;
2596 
2597         if (vfsp == NULL ||
2598             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2599                 return;
2600         }
2601         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2602 }
2603 void
2604 vn_exists(vnode_t *vp)
2605 {
2606         vfs_t   *vfsp = vp->v_vfsp;
2607 
2608         if (vfsp == NULL ||
2609             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2610                 return;
2611         }
2612         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2613 }
2614 
2615 void
2616 vn_invalid(vnode_t *vp)
2617 {
2618         vfs_t   *vfsp = vp->v_vfsp;
2619 
2620         if (vfsp == NULL ||
2621             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2622                 return;
2623         }
2624         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2625 }
2626 
2627 /* Vnode event notification */
2628 
2629 int
2630 vnevent_support(vnode_t *vp, caller_context_t *ct)
2631 {
2632         if (vp == NULL)
2633                 return (EINVAL);
2634 
2635         return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2636 }
2637 
2638 void
2639 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2640 {
2641         if (vp == NULL || vp->v_femhead == NULL) {
2642                 return;
2643         }
2644         (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2645         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2646 }
2647 
2648 void
2649 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2650     caller_context_t *ct)
2651 {
2652         if (vp == NULL || vp->v_femhead == NULL) {
2653                 return;
2654         }
2655         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2656 }
2657 
2658 void
2659 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2660     caller_context_t *ct)
2661 {
2662         if (vp == NULL || vp->v_femhead == NULL) {
2663                 return;
2664         }
2665         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2666 }
2667 
2668 void
2669 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2670 {
2671         if (vp == NULL || vp->v_femhead == NULL) {
2672                 return;
2673         }
2674         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2675 }
2676 
2677 void
2678 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2679 {
2680         if (vp == NULL || vp->v_femhead == NULL) {
2681                 return;
2682         }
2683         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2684 }
2685 
2686 void
2687 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2688     caller_context_t *ct)
2689 {
2690         if (vp == NULL || vp->v_femhead == NULL) {
2691                 return;
2692         }
2693         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2694 }
2695 
2696 void
2697 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2698     caller_context_t *ct)
2699 {
2700         if (vp == NULL || vp->v_femhead == NULL) {
2701                 return;
2702         }
2703         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2704 }
2705 
2706 void
2707 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2708     caller_context_t *ct)
2709 {
2710         if (vp == NULL || vp->v_femhead == NULL) {
2711                 return;
2712         }
2713         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2714 }
2715 
2716 void
2717 vnevent_create(vnode_t *vp, caller_context_t *ct)
2718 {
2719         if (vp == NULL || vp->v_femhead == NULL) {
2720                 return;
2721         }
2722         (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2723 }
2724 
2725 void
2726 vnevent_link(vnode_t *vp, caller_context_t *ct)
2727 {
2728         if (vp == NULL || vp->v_femhead == NULL) {
2729                 return;
2730         }
2731         (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2732 }
2733 
2734 void
2735 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2736 {
2737         if (vp == NULL || vp->v_femhead == NULL) {
2738                 return;
2739         }
2740         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2741 }
2742 
2743 void
2744 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2745 {
2746         if (vp == NULL || vp->v_femhead == NULL) {
2747                 return;
2748         }
2749         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2750 }
2751 
2752 void
2753 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2754 {
2755         if (vp == NULL || vp->v_femhead == NULL) {
2756                 return;
2757         }
2758         (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2759 }
2760 
2761 /*
2762  * Vnode accessors.
2763  */
2764 
/*
 * Return non-zero if the vnode's filesystem is mounted read-only
 * (VFS_RDONLY set in vfs_flag), zero otherwise.
 */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}
2770 
/*
 * Return non-zero if any file locks are registered on the vnode.
 */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}
2776 
/*
 * Return non-zero if the vnode has file locks and its mode makes
 * mandatory locking applicable (per the MANDLOCK() test).
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}
2782 
/*
 * Return non-zero if the vnode has pages cached in the page cache.
 */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}
2788 
2789 /*
2790  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2791  * zone_enter(2).
2792  */
2793 int
2794 vn_can_change_zones(vnode_t *vp)
2795 {
2796         struct vfssw *vswp;
2797         int allow = 1;
2798         vnode_t *rvp;
2799 
2800         if (nfs_global_client_only != 0)
2801                 return (1);
2802 
2803         /*
2804          * We always want to look at the underlying vnode if there is one.
2805          */
2806         if (VOP_REALVP(vp, &rvp, NULL) != 0)
2807                 rvp = vp;
2808         /*
2809          * Some pseudo filesystems (including doorfs) don't actually register
2810          * their vfsops_t, so the following may return NULL; we happily let
2811          * such vnodes switch zones.
2812          */
2813         vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2814         if (vswp != NULL) {
2815                 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2816                         allow = 0;
2817                 vfs_unrefvfssw(vswp);
2818         }
2819         return (allow);
2820 }
2821 
2822 /*
2823  * Return nonzero if the vnode is a mount point, zero if not.
2824  */
2825 int
2826 vn_ismntpt(vnode_t *vp)
2827 {
2828         return (vp->v_vfsmountedhere != NULL);
2829 }
2830 
/*
 * Retrieve the vfs (if any) mounted on this vnode; NULL when the vnode
 * is not a mount point.
 */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}
2837 
2838 /*
2839  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2840  */
2841 int
2842 vn_in_dnlc(vnode_t *vp)
2843 {
2844         return (vp->v_count_dnlc > 0);
2845 }
2846 
2847 /*
2848  * vn_has_other_opens() checks whether a particular file is opened by more than
2849  * just the caller and whether the open is for read and/or write.
2850  * This routine is for calling after the caller has already called VOP_OPEN()
2851  * and the caller wishes to know if they are the only one with it open for
2852  * the mode(s) specified.
2853  *
2854  * Vnode counts are only kept on regular files (v_type=VREG).
2855  */
2856 int
2857 vn_has_other_opens(
2858         vnode_t *vp,
2859         v_mode_t mode)
2860 {
2861 
2862         ASSERT(vp != NULL);
2863 
2864         switch (mode) {
2865         case V_WRITE:
2866                 if (vp->v_wrcnt > 1)
2867                         return (V_TRUE);
2868                 break;
2869         case V_RDORWR:
2870                 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2871                         return (V_TRUE);
2872                 break;
2873         case V_RDANDWR:
2874                 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2875                         return (V_TRUE);
2876                 break;
2877         case V_READ:
2878                 if (vp->v_rdcnt > 1)
2879                         return (V_TRUE);
2880                 break;
2881         }
2882 
2883         return (V_FALSE);
2884 }
2885 
2886 /*
2887  * vn_is_opened() checks whether a particular file is opened and
2888  * whether the open is for read and/or write.
2889  *
2890  * Vnode counts are only kept on regular files (v_type=VREG).
2891  */
2892 int
2893 vn_is_opened(
2894         vnode_t *vp,
2895         v_mode_t mode)
2896 {
2897 
2898         ASSERT(vp != NULL);
2899 
2900         switch (mode) {
2901         case V_WRITE:
2902                 if (vp->v_wrcnt)
2903                         return (V_TRUE);
2904                 break;
2905         case V_RDANDWR:
2906                 if (vp->v_rdcnt && vp->v_wrcnt)
2907                         return (V_TRUE);
2908                 break;
2909         case V_RDORWR:
2910                 if (vp->v_rdcnt || vp->v_wrcnt)
2911                         return (V_TRUE);
2912                 break;
2913         case V_READ:
2914                 if (vp->v_rdcnt)
2915                         return (V_TRUE);
2916                 break;
2917         }
2918 
2919         return (V_FALSE);
2920 }
2921 
2922 /*
2923  * vn_is_mapped() checks whether a particular file is mapped and whether
2924  * the file is mapped read and/or write.
2925  */
2926 int
2927 vn_is_mapped(
2928         vnode_t *vp,
2929         v_mode_t mode)
2930 {
2931 
2932         ASSERT(vp != NULL);
2933 
2934 #if !defined(_LP64)
2935         switch (mode) {
2936         /*
2937          * The atomic_add_64_nv functions force atomicity in the
2938          * case of 32 bit architectures. Otherwise the 64 bit values
2939          * require two fetches. The value of the fields may be
2940          * (potentially) changed between the first fetch and the
2941          * second
2942          */
2943         case V_WRITE:
2944                 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2945                         return (V_TRUE);
2946                 break;
2947         case V_RDANDWR:
2948                 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2949                     (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2950                         return (V_TRUE);
2951                 break;
2952         case V_RDORWR:
2953                 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2954                     (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2955                         return (V_TRUE);
2956                 break;
2957         case V_READ:
2958                 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2959                         return (V_TRUE);
2960                 break;
2961         }
2962 #else
2963         switch (mode) {
2964         case V_WRITE:
2965                 if (vp->v_mmap_write)
2966                         return (V_TRUE);
2967                 break;
2968         case V_RDANDWR:
2969                 if (vp->v_mmap_read && vp->v_mmap_write)
2970                         return (V_TRUE);
2971                 break;
2972         case V_RDORWR:
2973                 if (vp->v_mmap_read || vp->v_mmap_write)
2974                         return (V_TRUE);
2975                 break;
2976         case V_READ:
2977                 if (vp->v_mmap_read)
2978                         return (V_TRUE);
2979                 break;
2980         }
2981 #endif
2982 
2983         return (V_FALSE);
2984 }
2985 
2986 /*
2987  * Set the operations vector for a vnode.
2988  *
2989  * FEM ensures that the v_femhead pointer is filled in before the
2990  * v_op pointer is changed.  This means that if the v_femhead pointer
2991  * is NULL, and the v_op field hasn't changed since before which checked
2992  * the v_femhead pointer; then our update is ok - we are not racing with
2993  * FEM.
2994  */
2995 void
2996 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2997 {
2998         vnodeops_t      *op;
2999 
3000         ASSERT(vp != NULL);
3001         ASSERT(vnodeops != NULL);
3002 
3003         op = vp->v_op;
3004         membar_consumer();
3005         /*
3006          * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
3007          * the compare-and-swap on vp->v_op.  If either fails, then FEM is
3008          * in effect on the vnode and we need to have FEM deal with it.
3009          */
3010         if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
3011             op) {
3012                 fem_setvnops(vp, vnodeops);
3013         }
3014 }
3015 
3016 /*
3017  * Retrieve the operations vector for a vnode
3018  * As with vn_setops(above); make sure we aren't racing with FEM.
3019  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
3020  * make sense to the callers of this routine.
3021  */
3022 vnodeops_t *
3023 vn_getops(vnode_t *vp)
3024 {
3025         vnodeops_t      *op;
3026 
3027         ASSERT(vp != NULL);
3028 
3029         op = vp->v_op;
3030         membar_consumer();
3031         if (vp->v_femhead == NULL && op == vp->v_op) {
3032                 return (op);
3033         } else {
3034                 return (fem_getvnops(vp));
3035         }
3036 }
3037 
3038 /*
3039  * Returns non-zero (1) if the vnodeops matches that of the vnode.
3040  * Returns zero (0) if not.
3041  */
3042 int
3043 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
3044 {
3045         return (vn_getops(vp) == vnodeops);
3046 }
3047 
3048 /*
3049  * Returns non-zero (1) if the specified operation matches the
3050  * corresponding operation for that the vnode.
3051  * Returns zero (0) if not.
3052  */
3053 
3054 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
3055 
3056 int
3057 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
3058 {
3059         const fs_operation_trans_def_t *otdp;
3060         fs_generic_func_p *loc = NULL;
3061         vnodeops_t      *vop = vn_getops(vp);
3062 
3063         ASSERT(vopname != NULL);
3064 
3065         for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3066                 if (MATCHNAME(otdp->name, vopname)) {
3067                         loc = (fs_generic_func_p *)
3068                             ((char *)(vop) + otdp->offset);
3069                         break;
3070                 }
3071         }
3072 
3073         return ((loc != NULL) && (*loc == funcp));
3074 }
3075 
3076 /*
3077  * fs_new_caller_id() needs to return a unique ID on a given local system.
3078  * The IDs do not need to survive across reboots.  These are primarily
3079  * used so that (FEM) monitors can detect particular callers (such as
3080  * the NFS server) to a given vnode/vfs operation.
3081  */
3082 u_longlong_t
3083 fs_new_caller_id()
3084 {
3085         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3086 
3087         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3088 }
3089 
3090 /*
3091  * The value stored in v_path is relative to rootdir, located in the global
3092  * zone.  Zones or chroot environments which reside deeper inside the VFS
3093  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3094  * what lies below their perceived root.  In order to keep v_path usable for
3095  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3096  *
3097  * An upper bound of max_vnode_path is placed upon v_path allocations to
3098  * prevent the system from going too wild at the behest of pathological
3099  * behavior from the operator.
3100  */
3101 size_t max_vnode_path = 4 * MAXPATHLEN;
3102 
3103 
/*
 * Drop the cached v_path on 'vp', resetting it to vn_vpath_empty and
 * zeroing v_path_stamp.  If 'compare_stamp' is non-zero and no longer
 * matches v_path_stamp, the path has been updated since the caller
 * sampled the stamp and is left in place.  The old buffer is freed
 * outside the lock.
 */
void
vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
{
	char *buf;

	mutex_enter(&vp->v_lock);
	/*
	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
	 * match the present value on the vnode, it indicates that subsequent
	 * changes have occurred.  The v_path value is not cleared in this case
	 * since the new value may be valid.
	 */
	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
		mutex_exit(&vp->v_lock);
		return;
	}
	buf = vp->v_path;
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	mutex_exit(&vp->v_lock);
	/* Free outside the lock; vn_vpath_empty is a shared constant. */
	if (buf != vn_vpath_empty) {
		kmem_free(buf, strlen(buf) + 1);
	}
}
3128 
3129 static void
3130 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3131     boolean_t is_rename)
3132 {
3133         char *buf, *oldbuf;
3134         hrtime_t pstamp;
3135         size_t baselen, buflen = 0;
3136 
3137         /* Handle the vn_setpath_str case. */
3138         if (pvp == NULL) {
3139                 if (len + 1 > max_vnode_path) {
3140                         DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3141                             vnode_t *, vp, char *, name, size_t, len + 1);
3142                         return;
3143                 }
3144                 buf = kmem_alloc(len + 1, KM_SLEEP);
3145                 bcopy(name, buf, len);
3146                 buf[len] = '\0';
3147 
3148                 mutex_enter(&vp->v_lock);
3149                 oldbuf = vp->v_path;
3150                 vp->v_path = buf;
3151                 vp->v_path_stamp = gethrtime();
3152                 mutex_exit(&vp->v_lock);
3153                 if (oldbuf != vn_vpath_empty) {
3154                         kmem_free(oldbuf, strlen(oldbuf) + 1);
3155                 }
3156                 return;
3157         }
3158 
3159         /* Take snapshot of parent dir */
3160         mutex_enter(&pvp->v_lock);
3161 
3162         if ((pvp->v_flag & VTRAVERSE) != 0) {
3163                 /*
3164                  * When the parent vnode has VTRAVERSE set in its flags, normal
3165                  * assumptions about v_path calculation no longer apply.  The
3166                  * primary situation where this occurs is via the VFS tricks
3167                  * which procfs plays in order to allow /proc/PID/(root|cwd) to
3168                  * yield meaningful results.
3169                  *
3170                  * When this flag is set, v_path on the child must not be
3171                  * updated since the calculated value is likely to be
3172                  * incorrect, given the current context.
3173                  */
3174                 mutex_exit(&pvp->v_lock);
3175                 return;
3176         }
3177 
3178 retrybuf:
3179         if (pvp->v_path == vn_vpath_empty) {
3180                 /*
3181                  * Without v_path from the parent directory, generating a child
3182                  * path from the name is impossible.
3183                  */
3184                 if (len > 0) {
3185                         pstamp = pvp->v_path_stamp;
3186                         mutex_exit(&pvp->v_lock);
3187                         vn_clearpath(vp, pstamp);
3188                         return;
3189                 }
3190 
3191                 /*
3192                  * The only feasible case here is where a NUL lookup is being
3193                  * performed on rootdir prior to its v_path being populated.
3194                  */
3195                 ASSERT(pvp->v_path_stamp == 0);
3196                 baselen = 0;
3197                 pstamp = 0;
3198         } else {
3199                 pstamp = pvp->v_path_stamp;
3200                 baselen = strlen(pvp->v_path);
3201                 /* ignore a trailing slash if present */
3202                 if (pvp->v_path[baselen - 1] == '/') {
3203                         /* This should only the be case for rootdir */
3204                         ASSERT(baselen == 1 && pvp == rootdir);
3205                         baselen--;
3206                 }
3207         }
3208         mutex_exit(&pvp->v_lock);
3209 
3210         if (buflen != 0) {
3211                 /* Free the existing (mis-sized) buffer in case of retry */
3212                 kmem_free(buf, buflen);
3213         }
3214         /* base, '/', name and trailing NUL */
3215         buflen = baselen + len + 2;
3216         if (buflen > max_vnode_path) {
3217                 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3218                     vnode_t *, vp, char *, name, size_t, buflen);
3219                 return;
3220         }
3221         buf = kmem_alloc(buflen, KM_SLEEP);
3222 
3223         mutex_enter(&pvp->v_lock);
3224         if (pvp->v_path_stamp != pstamp) {
3225                 size_t vlen;
3226 
3227                 /*
3228                  * Since v_path_stamp changed on the parent, it is likely that
3229                  * v_path has been altered as well.  If the length does not
3230                  * exactly match what was previously measured, the buffer
3231                  * allocation must be repeated for proper sizing.
3232                  */
3233                 if (pvp->v_path == vn_vpath_empty) {
3234                         /* Give up if parent lack v_path */
3235                         mutex_exit(&pvp->v_lock);
3236                         kmem_free(buf, buflen);
3237                         return;
3238                 }
3239                 vlen = strlen(pvp->v_path);
3240                 if (pvp->v_path[vlen - 1] == '/') {
3241                         vlen--;
3242                 }
3243                 if (vlen != baselen) {
3244                         goto retrybuf;
3245                 }
3246         }
3247         bcopy(pvp->v_path, buf, baselen);
3248         mutex_exit(&pvp->v_lock);
3249 
3250         buf[baselen] = '/';
3251         baselen++;
3252         bcopy(name, &buf[baselen], len + 1);
3253 
3254         mutex_enter(&vp->v_lock);
3255         if (vp->v_path_stamp == 0) {
3256                 /* never-visited vnode can inherit stamp from parent */
3257                 ASSERT(vp->v_path == vn_vpath_empty);
3258                 vp->v_path_stamp = pstamp;
3259                 vp->v_path = buf;
3260                 mutex_exit(&vp->v_lock);
3261         } else if (vp->v_path_stamp < pstamp || is_rename) {
3262                 /*
3263                  * Install the updated path and stamp, ensuring that the v_path
3264                  * pointer is valid at all times for dtrace.
3265                  */
3266                 oldbuf = vp->v_path;
3267                 vp->v_path = buf;
3268                 vp->v_path_stamp = gethrtime();
3269                 mutex_exit(&vp->v_lock);
3270                 kmem_free(oldbuf, strlen(oldbuf) + 1);
3271         } else {
3272                 /*
3273                  * If the timestamp matches or is greater, it means another
3274                  * thread performed the update first while locks were dropped
3275                  * here to make the allocation.  We defer to the newer value.
3276                  */
3277                 mutex_exit(&vp->v_lock);
3278                 kmem_free(buf, buflen);
3279         }
3280         ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3281 }
3282 
/*
 * Opportunistically refresh vp's cached v_path during a lookup of 'name'
 * in directory 'pvp'.  The update only proceeds when the parent carries a
 * non-empty path newer than the child's, and the component is a real name
 * rather than '.' or '..'.
 */
void
vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
{
	size_t len;

	/*
	 * If the parent is older or empty, there's nothing further to do.
	 */
	if (pvp->v_path == vn_vpath_empty ||
	    pvp->v_path_stamp <= vp->v_path_stamp) {
		return;
	}

	/*
	 * Given the lack of appropriate context, meaningful updates to v_path
	 * cannot be made during lookups for the '.' or '..' entries.
	 */
	len = strlen(name);
	if (len == 0 || (len == 1 && name[0] == '.') ||
	    (len == 2 && name[0] == '.' && name[1] == '.')) {
		return;
	}

	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}
3308 
3309 /*
3310  * Given a starting vnode and a path, updates the path in the target vnode in
3311  * a safe manner.  If the vnode already has path information embedded, then the
3312  * cached path is left untouched.
3313  */
3314 /* ARGSUSED */
3315 void
3316 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3317     size_t len)
3318 {
3319         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3320 }
3321 
3322 /*
3323  * Sets the path to the vnode to be the given string, regardless of current
3324  * context.  The string must be a complete path from rootdir.  This is only used
3325  * by fsop_root() for setting the path based on the mountpoint.
3326  */
3327 void
3328 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3329 {
3330         vn_setpath_common(NULL, vp, str, len, B_FALSE);
3331 }
3332 
3333 /*
3334  * Called from within filesystem's vop_rename() to handle renames once the
3335  * target vnode is available.
3336  */
3337 void
3338 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3339 {
3340         vn_setpath_common(pvp, vp, name, len, B_TRUE);
3341 }
3342 
3343 /*
3344  * Similar to vn_setpath_str(), this function sets the path of the destination
3345  * vnode to the be the same as the source vnode.
3346  */
3347 void
3348 vn_copypath(struct vnode *src, struct vnode *dst)
3349 {
3350         char *buf;
3351         hrtime_t stamp;
3352         size_t buflen;
3353 
3354         mutex_enter(&src->v_lock);
3355         if (src->v_path == vn_vpath_empty) {
3356                 mutex_exit(&src->v_lock);
3357                 return;
3358         }
3359         buflen = strlen(src->v_path) + 1;
3360         mutex_exit(&src->v_lock);
3361 
3362         buf = kmem_alloc(buflen, KM_SLEEP);
3363 
3364         mutex_enter(&src->v_lock);
3365         if (src->v_path == vn_vpath_empty ||
3366             strlen(src->v_path) + 1 != buflen) {
3367                 mutex_exit(&src->v_lock);
3368                 kmem_free(buf, buflen);
3369                 return;
3370         }
3371         bcopy(src->v_path, buf, buflen);
3372         stamp = src->v_path_stamp;
3373         mutex_exit(&src->v_lock);
3374 
3375         mutex_enter(&dst->v_lock);
3376         if (dst->v_path != vn_vpath_empty) {
3377                 mutex_exit(&dst->v_lock);
3378                 kmem_free(buf, buflen);
3379                 return;
3380         }
3381         dst->v_path = buf;
3382         dst->v_path_stamp = stamp;
3383         mutex_exit(&dst->v_lock);
3384 }
3385 
3386 
3387 /*
3388  * XXX Private interface for segvn routines that handle vnode
3389  * large page segments.
3390  *
3391  * return 1 if vp's file system VOP_PAGEIO() implementation
3392  * can be safely used instead of VOP_GETPAGE() for handling
3393  * pagefaults against regular non swap files. VOP_PAGEIO()
3394  * interface is considered safe here if its implementation
3395  * is very close to VOP_GETPAGE() implementation.
3396  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3397  * panic if there're file holes but instead returns an error.
3398  * Doesn't assume file won't be changed by user writes, etc.
3399  *
3400  * return 0 otherwise.
3401  *
3402  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3403  */
3404 int
3405 vn_vmpss_usepageio(vnode_t *vp)
3406 {
3407         vfs_t   *vfsp = vp->v_vfsp;
3408         char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3409         char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3410         char **fsok = pageio_ok_fss;
3411 
3412         if (fsname == NULL) {
3413                 return (0);
3414         }
3415 
3416         for (; *fsok; fsok++) {
3417                 if (strcmp(*fsok, fsname) == 0) {
3418                         return (1);
3419                 }
3420         }
3421         return (0);
3422 }
3423 
3424 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3425 
/*
 * VOP_OPEN() wrapper.  Holds the vnode across the filesystem's open
 * entry point, maintains the per-vnode open counts (v_rdcnt/v_wrcnt)
 * used by NFS delegation, and copies the path forward when the
 * filesystem substitutes a different vnode at open time.
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;	/* saved: vop_open may replace *vpp */

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		/* On failure, roll back the optimistic count bumps above. */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp) {
			vn_copypath(vp, *vpp);
			/* Move the counts from the old vnode to the new one. */
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3495 
3496 int
3497 fop_close(
3498         vnode_t *vp,
3499         int flag,
3500         int count,
3501         offset_t offset,
3502         cred_t *cr,
3503         caller_context_t *ct)
3504 {
3505         int err;
3506 
3507         VOPXID_MAP_CR(vp, cr);
3508 
3509         err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3510         VOPSTATS_UPDATE(vp, close);
3511         /*
3512          * Check passed in count to handle possible dups. Vnode counts are only
3513          * kept on regular files
3514          */
3515         if ((vp->v_type == VREG) && (count == 1))  {
3516                 if (flag & FREAD) {
3517                         ASSERT(vp->v_rdcnt > 0);
3518                         atomic_dec_32(&vp->v_rdcnt);
3519                 }
3520                 if (flag & FWRITE) {
3521                         ASSERT(vp->v_wrcnt > 0);
3522                         atomic_dec_32(&vp->v_wrcnt);
3523                 }
3524         }
3525         return (err);
3526 }
3527 
/*
 * VOP_READ() wrapper.  In addition to dispatching to the filesystem,
 * maintains per-zone VFS read statistics and latency-bucket counters
 * for reads against filesystems flagged VFS_STATS.
 */
int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int err;

	/* Only account reads on file/dir/block vnodes of stats-enabled VFSes. */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_runq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	/* Bytes transferred = residual consumed by the filesystem. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);

	/* start != 0 means we entered the runq above; unwind and account. */
	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.reads++;
		zonep->zone_vfs_rwstats.nread += len;
		kstat_runq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * Latency buckets are cumulative: an op slower than 1s
		 * bumps the 10ms, 100ms and 1s counters.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3585 
/*
 * VOP_WRITE() wrapper.  Dispatches to the filesystem and maintains
 * per-zone VFS write statistics and latency-bucket counters for
 * writes against filesystems flagged VFS_STATS.
 */
int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int	err;

	/*
	 * For the purposes of VFS kstat consumers, the "waitq" calculation is
	 * repurposed as the active queue for VFS write operations.  There's no
	 * actual wait queue for VFS operations.
	 */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_waitq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
	/* Bytes transferred = residual consumed by the filesystem. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);

	/* start != 0 means we entered the waitq above; unwind and account. */
	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.writes++;
		zonep->zone_vfs_rwstats.nwritten += len;
		kstat_waitq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * Latency buckets are cumulative: an op slower than 1s
		 * bumps the 10ms, 100ms and 1s counters.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3648 
3649 int
3650 fop_ioctl(
3651         vnode_t *vp,
3652         int cmd,
3653         intptr_t arg,
3654         int flag,
3655         cred_t *cr,
3656         int *rvalp,
3657         caller_context_t *ct)
3658 {
3659         int     err;
3660 
3661         VOPXID_MAP_CR(vp, cr);
3662 
3663         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3664         VOPSTATS_UPDATE(vp, ioctl);
3665         return (err);
3666 }
3667 
3668 int
3669 fop_setfl(
3670         vnode_t *vp,
3671         int oflags,
3672         int nflags,
3673         cred_t *cr,
3674         caller_context_t *ct)
3675 {
3676         int     err;
3677 
3678         VOPXID_MAP_CR(vp, cr);
3679 
3680         err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3681         VOPSTATS_UPDATE(vp, setfl);
3682         return (err);
3683 }
3684 
3685 int
3686 fop_getattr(
3687         vnode_t *vp,
3688         vattr_t *vap,
3689         int flags,
3690         cred_t *cr,
3691         caller_context_t *ct)
3692 {
3693         int     err;
3694 
3695         VOPXID_MAP_CR(vp, cr);
3696 
3697         /*
3698          * If this file system doesn't understand the xvattr extensions
3699          * then turn off the xvattr bit.
3700          */
3701         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3702                 vap->va_mask &= ~AT_XVATTR;
3703         }
3704 
3705         /*
3706          * We're only allowed to skip the ACL check iff we used a 32 bit
3707          * ACE mask with VOP_ACCESS() to determine permissions.
3708          */
3709         if ((flags & ATTR_NOACLCHECK) &&
3710             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3711                 return (EINVAL);
3712         }
3713         err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3714         VOPSTATS_UPDATE(vp, getattr);
3715         return (err);
3716 }
3717 
3718 int
3719 fop_setattr(
3720         vnode_t *vp,
3721         vattr_t *vap,
3722         int flags,
3723         cred_t *cr,
3724         caller_context_t *ct)
3725 {
3726         int     err;
3727 
3728         VOPXID_MAP_CR(vp, cr);
3729 
3730         /*
3731          * If this file system doesn't understand the xvattr extensions
3732          * then turn off the xvattr bit.
3733          */
3734         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3735                 vap->va_mask &= ~AT_XVATTR;
3736         }
3737 
3738         /*
3739          * We're only allowed to skip the ACL check iff we used a 32 bit
3740          * ACE mask with VOP_ACCESS() to determine permissions.
3741          */
3742         if ((flags & ATTR_NOACLCHECK) &&
3743             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3744                 return (EINVAL);
3745         }
3746         err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3747         VOPSTATS_UPDATE(vp, setattr);
3748         return (err);
3749 }
3750 
3751 int
3752 fop_access(
3753         vnode_t *vp,
3754         int mode,
3755         int flags,
3756         cred_t *cr,
3757         caller_context_t *ct)
3758 {
3759         int     err;
3760 
3761         if ((flags & V_ACE_MASK) &&
3762             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3763                 return (EINVAL);
3764         }
3765 
3766         VOPXID_MAP_CR(vp, cr);
3767 
3768         err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3769         VOPSTATS_UPDATE(vp, access);
3770         return (err);
3771 }
3772 
3773 int
3774 fop_lookup(
3775         vnode_t *dvp,
3776         char *nm,
3777         vnode_t **vpp,
3778         pathname_t *pnp,
3779         int flags,
3780         vnode_t *rdir,
3781         cred_t *cr,
3782         caller_context_t *ct,
3783         int *deflags,           /* Returned per-dirent flags */
3784         pathname_t *ppnp)       /* Returned case-preserved name in directory */
3785 {
3786         int ret;
3787 
3788         /*
3789          * If this file system doesn't support case-insensitive access
3790          * and said access is requested, fail quickly.  It is required
3791          * that if the vfs supports case-insensitive lookup, it also
3792          * supports extended dirent flags.
3793          */
3794         if (flags & FIGNORECASE &&
3795             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3796             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3797                 return (EINVAL);
3798 
3799         VOPXID_MAP_CR(dvp, cr);
3800 
3801         if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3802                 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3803         } else {
3804                 ret = (*(dvp)->v_op->vop_lookup)
3805                     (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3806         }
3807         if (ret == 0 && *vpp) {
3808                 VOPSTATS_UPDATE(*vpp, lookup);
3809                 vn_updatepath(dvp, *vpp, nm);
3810         }
3811 
3812         return (ret);
3813 }
3814 
3815 int
3816 fop_create(
3817         vnode_t *dvp,
3818         char *name,
3819         vattr_t *vap,
3820         vcexcl_t excl,
3821         int mode,
3822         vnode_t **vpp,
3823         cred_t *cr,
3824         int flags,
3825         caller_context_t *ct,
3826         vsecattr_t *vsecp)      /* ACL to set during create */
3827 {
3828         int ret;
3829 
3830         if (vsecp != NULL &&
3831             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3832                 return (EINVAL);
3833         }
3834         /*
3835          * If this file system doesn't support case-insensitive access
3836          * and said access is requested, fail quickly.
3837          */
3838         if (flags & FIGNORECASE &&
3839             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3840             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3841                 return (EINVAL);
3842 
3843         VOPXID_MAP_CR(dvp, cr);
3844 
3845         ret = (*(dvp)->v_op->vop_create)
3846             (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3847         if (ret == 0 && *vpp) {
3848                 VOPSTATS_UPDATE(*vpp, create);
3849                 vn_updatepath(dvp, *vpp, name);
3850         }
3851 
3852         return (ret);
3853 }
3854 
3855 int
3856 fop_remove(
3857         vnode_t *dvp,
3858         char *nm,
3859         cred_t *cr,
3860         caller_context_t *ct,
3861         int flags)
3862 {
3863         int     err;
3864 
3865         /*
3866          * If this file system doesn't support case-insensitive access
3867          * and said access is requested, fail quickly.
3868          */
3869         if (flags & FIGNORECASE &&
3870             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3871             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3872                 return (EINVAL);
3873 
3874         VOPXID_MAP_CR(dvp, cr);
3875 
3876         err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3877         VOPSTATS_UPDATE(dvp, remove);
3878         return (err);
3879 }
3880 
3881 int
3882 fop_link(
3883         vnode_t *tdvp,
3884         vnode_t *svp,
3885         char *tnm,
3886         cred_t *cr,
3887         caller_context_t *ct,
3888         int flags)
3889 {
3890         int     err;
3891 
3892         /*
3893          * If the target file system doesn't support case-insensitive access
3894          * and said access is requested, fail quickly.
3895          */
3896         if (flags & FIGNORECASE &&
3897             (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3898             vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3899                 return (EINVAL);
3900 
3901         VOPXID_MAP_CR(tdvp, cr);
3902 
3903         err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3904         VOPSTATS_UPDATE(tdvp, link);
3905         return (err);
3906 }
3907 
3908 int
3909 fop_rename(
3910         vnode_t *sdvp,
3911         char *snm,
3912         vnode_t *tdvp,
3913         char *tnm,
3914         cred_t *cr,
3915         caller_context_t *ct,
3916         int flags)
3917 {
3918         int     err;
3919 
3920         /*
3921          * If the file system involved does not support
3922          * case-insensitive access and said access is requested, fail
3923          * quickly.
3924          */
3925         if (flags & FIGNORECASE &&
3926             ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3927             vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3928                 return (EINVAL);
3929 
3930         VOPXID_MAP_CR(tdvp, cr);
3931 
3932         err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3933         VOPSTATS_UPDATE(sdvp, rename);
3934         return (err);
3935 }
3936 
3937 int
3938 fop_mkdir(
3939         vnode_t *dvp,
3940         char *dirname,
3941         vattr_t *vap,
3942         vnode_t **vpp,
3943         cred_t *cr,
3944         caller_context_t *ct,
3945         int flags,
3946         vsecattr_t *vsecp)      /* ACL to set during create */
3947 {
3948         int ret;
3949 
3950         if (vsecp != NULL &&
3951             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3952                 return (EINVAL);
3953         }
3954         /*
3955          * If this file system doesn't support case-insensitive access
3956          * and said access is requested, fail quickly.
3957          */
3958         if (flags & FIGNORECASE &&
3959             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3960             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3961                 return (EINVAL);
3962 
3963         VOPXID_MAP_CR(dvp, cr);
3964 
3965         ret = (*(dvp)->v_op->vop_mkdir)
3966             (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3967         if (ret == 0 && *vpp) {
3968                 VOPSTATS_UPDATE(*vpp, mkdir);
3969                 vn_updatepath(dvp, *vpp, dirname);
3970         }
3971 
3972         return (ret);
3973 }
3974 
3975 int
3976 fop_rmdir(
3977         vnode_t *dvp,
3978         char *nm,
3979         vnode_t *cdir,
3980         cred_t *cr,
3981         caller_context_t *ct,
3982         int flags)
3983 {
3984         int     err;
3985 
3986         /*
3987          * If this file system doesn't support case-insensitive access
3988          * and said access is requested, fail quickly.
3989          */
3990         if (flags & FIGNORECASE &&
3991             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3992             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3993                 return (EINVAL);
3994 
3995         VOPXID_MAP_CR(dvp, cr);
3996 
3997         err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3998         VOPSTATS_UPDATE(dvp, rmdir);
3999         return (err);
4000 }
4001 
4002 int
4003 fop_readdir(
4004         vnode_t *vp,
4005         uio_t *uiop,
4006         cred_t *cr,
4007         int *eofp,
4008         caller_context_t *ct,
4009         int flags)
4010 {
4011         int     err;
4012         ssize_t resid_start = uiop->uio_resid;
4013 
4014         /*
4015          * If this file system doesn't support retrieving directory
4016          * entry flags and said access is requested, fail quickly.
4017          */
4018         if (flags & V_RDDIR_ENTFLAGS &&
4019             vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
4020                 return (EINVAL);
4021 
4022         VOPXID_MAP_CR(vp, cr);
4023 
4024         err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
4025         VOPSTATS_UPDATE_IO(vp, readdir,
4026             readdir_bytes, (resid_start - uiop->uio_resid));
4027         return (err);
4028 }
4029 
4030 int
4031 fop_symlink(
4032         vnode_t *dvp,
4033         char *linkname,
4034         vattr_t *vap,
4035         char *target,
4036         cred_t *cr,
4037         caller_context_t *ct,
4038         int flags)
4039 {
4040         int     err;
4041         xvattr_t xvattr;
4042 
4043         /*
4044          * If this file system doesn't support case-insensitive access
4045          * and said access is requested, fail quickly.
4046          */
4047         if (flags & FIGNORECASE &&
4048             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
4049             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4050                 return (EINVAL);
4051 
4052         VOPXID_MAP_CR(dvp, cr);
4053 
4054         /* check for reparse point */
4055         if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
4056             (strncmp(target, FS_REPARSE_TAG_STR,
4057             strlen(FS_REPARSE_TAG_STR)) == 0)) {
4058                 if (!fs_reparse_mark(target, vap, &xvattr))
4059                         vap = (vattr_t *)&xvattr;
4060         }
4061 
4062         err = (*(dvp)->v_op->vop_symlink)
4063             (dvp, linkname, vap, target, cr, ct, flags);
4064         VOPSTATS_UPDATE(dvp, symlink);
4065         return (err);
4066 }
4067 
4068 int
4069 fop_readlink(
4070         vnode_t *vp,
4071         uio_t *uiop,
4072         cred_t *cr,
4073         caller_context_t *ct)
4074 {
4075         int     err;
4076 
4077         VOPXID_MAP_CR(vp, cr);
4078 
4079         err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
4080         VOPSTATS_UPDATE(vp, readlink);
4081         return (err);
4082 }
4083 
4084 int
4085 fop_fsync(
4086         vnode_t *vp,
4087         int syncflag,
4088         cred_t *cr,
4089         caller_context_t *ct)
4090 {
4091         int     err;
4092 
4093         VOPXID_MAP_CR(vp, cr);
4094 
4095         err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
4096         VOPSTATS_UPDATE(vp, fsync);
4097         return (err);
4098 }
4099 
4100 void
4101 fop_inactive(
4102         vnode_t *vp,
4103         cred_t *cr,
4104         caller_context_t *ct)
4105 {
4106         /* Need to update stats before vop call since we may lose the vnode */
4107         VOPSTATS_UPDATE(vp, inactive);
4108 
4109         VOPXID_MAP_CR(vp, cr);
4110 
4111         (*(vp)->v_op->vop_inactive)(vp, cr, ct);
4112 }
4113 
4114 int
4115 fop_fid(
4116         vnode_t *vp,
4117         fid_t *fidp,
4118         caller_context_t *ct)
4119 {
4120         int     err;
4121 
4122         err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4123         VOPSTATS_UPDATE(vp, fid);
4124         return (err);
4125 }
4126 
4127 int
4128 fop_rwlock(
4129         vnode_t *vp,
4130         int write_lock,
4131         caller_context_t *ct)
4132 {
4133         int     ret;
4134 
4135         ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4136         VOPSTATS_UPDATE(vp, rwlock);
4137         return (ret);
4138 }
4139 
4140 void
4141 fop_rwunlock(
4142         vnode_t *vp,
4143         int write_lock,
4144         caller_context_t *ct)
4145 {
4146         (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4147         VOPSTATS_UPDATE(vp, rwunlock);
4148 }
4149 
4150 int
4151 fop_seek(
4152         vnode_t *vp,
4153         offset_t ooff,
4154         offset_t *noffp,
4155         caller_context_t *ct)
4156 {
4157         int     err;
4158 
4159         err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4160         VOPSTATS_UPDATE(vp, seek);
4161         return (err);
4162 }
4163 
4164 int
4165 fop_cmp(
4166         vnode_t *vp1,
4167         vnode_t *vp2,
4168         caller_context_t *ct)
4169 {
4170         int     err;
4171 
4172         err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4173         VOPSTATS_UPDATE(vp1, cmp);
4174         return (err);
4175 }
4176 
4177 int
4178 fop_frlock(
4179         vnode_t *vp,
4180         int cmd,
4181         flock64_t *bfp,
4182         int flag,
4183         offset_t offset,
4184         struct flk_callback *flk_cbp,
4185         cred_t *cr,
4186         caller_context_t *ct)
4187 {
4188         int     err;
4189 
4190         VOPXID_MAP_CR(vp, cr);
4191 
4192         err = (*(vp)->v_op->vop_frlock)
4193             (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4194         VOPSTATS_UPDATE(vp, frlock);
4195         return (err);
4196 }
4197 
4198 int
4199 fop_space(
4200         vnode_t *vp,
4201         int cmd,
4202         flock64_t *bfp,
4203         int flag,
4204         offset_t offset,
4205         cred_t *cr,
4206         caller_context_t *ct)
4207 {
4208         int     err;
4209 
4210         VOPXID_MAP_CR(vp, cr);
4211 
4212         err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4213         VOPSTATS_UPDATE(vp, space);
4214         return (err);
4215 }
4216 
4217 int
4218 fop_realvp(
4219         vnode_t *vp,
4220         vnode_t **vpp,
4221         caller_context_t *ct)
4222 {
4223         int     err;
4224 
4225         err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4226         VOPSTATS_UPDATE(vp, realvp);
4227         return (err);
4228 }
4229 
4230 int
4231 fop_getpage(
4232         vnode_t *vp,
4233         offset_t off,
4234         size_t len,
4235         uint_t *protp,
4236         page_t **plarr,
4237         size_t plsz,
4238         struct seg *seg,
4239         caddr_t addr,
4240         enum seg_rw rw,
4241         cred_t *cr,
4242         caller_context_t *ct)
4243 {
4244         int     err;
4245 
4246         VOPXID_MAP_CR(vp, cr);
4247 
4248         err = (*(vp)->v_op->vop_getpage)
4249             (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4250         VOPSTATS_UPDATE(vp, getpage);
4251         return (err);
4252 }
4253 
4254 int
4255 fop_putpage(
4256         vnode_t *vp,
4257         offset_t off,
4258         size_t len,
4259         int flags,
4260         cred_t *cr,
4261         caller_context_t *ct)
4262 {
4263         int     err;
4264 
4265         VOPXID_MAP_CR(vp, cr);
4266 
4267         err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4268         VOPSTATS_UPDATE(vp, putpage);
4269         return (err);
4270 }
4271 
4272 int
4273 fop_map(
4274         vnode_t *vp,
4275         offset_t off,
4276         struct as *as,
4277         caddr_t *addrp,
4278         size_t len,
4279         uchar_t prot,
4280         uchar_t maxprot,
4281         uint_t flags,
4282         cred_t *cr,
4283         caller_context_t *ct)
4284 {
4285         int     err;
4286 
4287         VOPXID_MAP_CR(vp, cr);
4288 
4289         err = (*(vp)->v_op->vop_map)
4290             (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4291         VOPSTATS_UPDATE(vp, map);
4292         return (err);
4293 }
4294 
/*
 * VOP_ADDMAP() wrapper.  On success for regular files, tracks the
 * number of mapped pages in v_mmap_read/v_mmap_write so NFS can tell
 * whether a file has writable mappings before granting a delegation.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages covered by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			/* Executable mappings are accounted as reads. */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4344 
/*
 * VOP_DELMAP() wrapper.  The inverse of fop_addmap(): on success for
 * regular files, subtracts the unmapped pages from the
 * v_mmap_read/v_mmap_write accounting.  Must mirror fop_addmap()'s
 * accounting exactly (MAP_PRIVATE as read; PROT_EXEC as read).
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages covered by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			/* Executable mappings were accounted as reads. */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4399 
4400 
4401 int
4402 fop_poll(
4403         vnode_t *vp,
4404         short events,
4405         int anyyet,
4406         short *reventsp,
4407         struct pollhead **phpp,
4408         caller_context_t *ct)
4409 {
4410         int     err;
4411 
4412         err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4413         VOPSTATS_UPDATE(vp, poll);
4414         return (err);
4415 }
4416 
4417 int
4418 fop_dump(
4419         vnode_t *vp,
4420         caddr_t addr,
4421         offset_t lbdn,
4422         offset_t dblks,
4423         caller_context_t *ct)
4424 {
4425         int     err;
4426 
4427         /* ensure lbdn and dblks can be passed safely to bdev_dump */
4428         if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4429                 return (EIO);
4430 
4431         err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4432         VOPSTATS_UPDATE(vp, dump);
4433         return (err);
4434 }
4435 
4436 int
4437 fop_pathconf(
4438         vnode_t *vp,
4439         int cmd,
4440         ulong_t *valp,
4441         cred_t *cr,
4442         caller_context_t *ct)
4443 {
4444         int     err;
4445 
4446         VOPXID_MAP_CR(vp, cr);
4447 
4448         err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4449         VOPSTATS_UPDATE(vp, pathconf);
4450         return (err);
4451 }
4452 
4453 int
4454 fop_pageio(
4455         vnode_t *vp,
4456         struct page *pp,
4457         u_offset_t io_off,
4458         size_t io_len,
4459         int flags,
4460         cred_t *cr,
4461         caller_context_t *ct)
4462 {
4463         int     err;
4464 
4465         VOPXID_MAP_CR(vp, cr);
4466 
4467         err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4468         VOPSTATS_UPDATE(vp, pageio);
4469         return (err);
4470 }
4471 
4472 int
4473 fop_dumpctl(
4474         vnode_t *vp,
4475         int action,
4476         offset_t *blkp,
4477         caller_context_t *ct)
4478 {
4479         int     err;
4480         err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4481         VOPSTATS_UPDATE(vp, dumpctl);
4482         return (err);
4483 }
4484 
4485 void
4486 fop_dispose(
4487         vnode_t *vp,
4488         page_t *pp,
4489         int flag,
4490         int dn,
4491         cred_t *cr,
4492         caller_context_t *ct)
4493 {
4494         /* Must do stats first since it's possible to lose the vnode */
4495         VOPSTATS_UPDATE(vp, dispose);
4496 
4497         VOPXID_MAP_CR(vp, cr);
4498 
4499         (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4500 }
4501 
4502 int
4503 fop_setsecattr(
4504         vnode_t *vp,
4505         vsecattr_t *vsap,
4506         int flag,
4507         cred_t *cr,
4508         caller_context_t *ct)
4509 {
4510         int     err;
4511 
4512         VOPXID_MAP_CR(vp, cr);
4513 
4514         /*
4515          * We're only allowed to skip the ACL check iff we used a 32 bit
4516          * ACE mask with VOP_ACCESS() to determine permissions.
4517          */
4518         if ((flag & ATTR_NOACLCHECK) &&
4519             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4520                 return (EINVAL);
4521         }
4522         err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4523         VOPSTATS_UPDATE(vp, setsecattr);
4524         return (err);
4525 }
4526 
4527 int
4528 fop_getsecattr(
4529         vnode_t *vp,
4530         vsecattr_t *vsap,
4531         int flag,
4532         cred_t *cr,
4533         caller_context_t *ct)
4534 {
4535         int     err;
4536 
4537         /*
4538          * We're only allowed to skip the ACL check iff we used a 32 bit
4539          * ACE mask with VOP_ACCESS() to determine permissions.
4540          */
4541         if ((flag & ATTR_NOACLCHECK) &&
4542             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4543                 return (EINVAL);
4544         }
4545 
4546         VOPXID_MAP_CR(vp, cr);
4547 
4548         err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4549         VOPSTATS_UPDATE(vp, getsecattr);
4550         return (err);
4551 }
4552 
4553 int
4554 fop_shrlock(
4555         vnode_t *vp,
4556         int cmd,
4557         struct shrlock *shr,
4558         int flag,
4559         cred_t *cr,
4560         caller_context_t *ct)
4561 {
4562         int     err;
4563 
4564         VOPXID_MAP_CR(vp, cr);
4565 
4566         err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4567         VOPSTATS_UPDATE(vp, shrlock);
4568         return (err);
4569 }
4570 
4571 int
4572 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4573     caller_context_t *ct)
4574 {
4575         int     err;
4576 
4577         err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4578         VOPSTATS_UPDATE(vp, vnevent);
4579         return (err);
4580 }
4581 
4582 int
4583 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4584     caller_context_t *ct)
4585 {
4586         int err;
4587 
4588         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4589                 return (ENOTSUP);
4590         err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4591         VOPSTATS_UPDATE(vp, reqzcbuf);
4592         return (err);
4593 }
4594 
4595 int
4596 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4597 {
4598         int err;
4599 
4600         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4601                 return (ENOTSUP);
4602         err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4603         VOPSTATS_UPDATE(vp, retzcbuf);
4604         return (err);
4605 }
4606 
4607 /*
4608  * Default destructor
4609  *      Needed because NULL destructor means that the key is unused
4610  */
4611 /* ARGSUSED */
4612 void
4613 vsd_defaultdestructor(void *value)
4614 {}
4615 
4616 /*
4617  * Create a key (index into per vnode array)
4618  *      Locks out vsd_create, vsd_destroy, and vsd_free
4619  *      May allocate memory with lock held
4620  */
4621 void
4622 vsd_create(uint_t *keyp, void (*destructor)(void *))
4623 {
4624         int     i;
4625         uint_t  nkeys;
4626 
4627         /*
4628          * if key is allocated, do nothing
4629          */
4630         mutex_enter(&vsd_lock);
4631         if (*keyp) {
4632                 mutex_exit(&vsd_lock);
4633                 return;
4634         }
4635         /*
4636          * find an unused key
4637          */
4638         if (destructor == NULL)
4639                 destructor = vsd_defaultdestructor;
4640 
4641         for (i = 0; i < vsd_nkeys; ++i)
4642                 if (vsd_destructor[i] == NULL)
4643                         break;
4644 
4645         /*
4646          * if no unused keys, increase the size of the destructor array
4647          */
4648         if (i == vsd_nkeys) {
4649                 if ((nkeys = (vsd_nkeys << 1)) == 0)
4650                         nkeys = 1;
4651                 vsd_destructor =
4652                     (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4653                     (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4654                     (size_t)(nkeys * sizeof (void (*)(void *))));
4655                 vsd_nkeys = nkeys;
4656         }
4657 
4658         /*
4659          * allocate the next available unused key
4660          */
4661         vsd_destructor[i] = destructor;
4662         *keyp = i + 1;
4663 
4664         /* create vsd_list, if it doesn't exist */
4665         if (vsd_list == NULL) {
4666                 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4667                 list_create(vsd_list, sizeof (struct vsd_node),
4668                     offsetof(struct vsd_node, vs_nodes));
4669         }
4670 
4671         mutex_exit(&vsd_lock);
4672 }
4673 
4674 /*
4675  * Destroy a key
4676  *
4677  * Assumes that the caller is preventing vsd_set and vsd_get
4678  * Locks out vsd_create, vsd_destroy, and vsd_free
4679  * May free memory with lock held
4680  */
4681 void
4682 vsd_destroy(uint_t *keyp)
4683 {
4684         uint_t key;
4685         struct vsd_node *vsd;
4686 
4687         /*
4688          * protect the key namespace and our destructor lists
4689          */
4690         mutex_enter(&vsd_lock);
4691         key = *keyp;
4692         *keyp = 0;
4693 
4694         ASSERT(key <= vsd_nkeys);
4695 
4696         /*
4697          * if the key is valid
4698          */
4699         if (key != 0) {
4700                 uint_t k = key - 1;
4701                 /*
4702                  * for every vnode with VSD, call key's destructor
4703                  */
4704                 for (vsd = list_head(vsd_list); vsd != NULL;
4705                     vsd = list_next(vsd_list, vsd)) {
4706                         /*
4707                          * no VSD for key in this vnode
4708                          */
4709                         if (key > vsd->vs_nkeys)
4710                                 continue;
4711                         /*
4712                          * call destructor for key
4713                          */
4714                         if (vsd->vs_value[k] && vsd_destructor[k])
4715                                 (*vsd_destructor[k])(vsd->vs_value[k]);
4716                         /*
4717                          * reset value for key
4718                          */
4719                         vsd->vs_value[k] = NULL;
4720                 }
4721                 /*
4722                  * actually free the key (NULL destructor == unused)
4723                  */
4724                 vsd_destructor[k] = NULL;
4725         }
4726 
4727         mutex_exit(&vsd_lock);
4728 }
4729 
4730 /*
4731  * Quickly return the per vnode value that was stored with the specified key
4732  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4733  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4734  */
4735 void *
4736 vsd_get(vnode_t *vp, uint_t key)
4737 {
4738         struct vsd_node *vsd;
4739 
4740         ASSERT(vp != NULL);
4741         ASSERT(mutex_owned(&vp->v_vsd_lock));
4742 
4743         vsd = vp->v_vsd;
4744 
4745         if (key && vsd != NULL && key <= vsd->vs_nkeys)
4746                 return (vsd->vs_value[key - 1]);
4747         return (NULL);
4748 }
4749 
4750 /*
4751  * Set a per vnode value indexed with the specified key
4752  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4753  */
4754 int
4755 vsd_set(vnode_t *vp, uint_t key, void *value)
4756 {
4757         struct vsd_node *vsd;
4758 
4759         ASSERT(vp != NULL);
4760         ASSERT(mutex_owned(&vp->v_vsd_lock));
4761 
4762         if (key == 0)
4763                 return (EINVAL);
4764 
4765         vsd = vp->v_vsd;
4766         if (vsd == NULL)
4767                 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4768 
4769         /*
4770          * If the vsd was just allocated, vs_nkeys will be 0, so the following
4771          * code won't happen and we will continue down and allocate space for
4772          * the vs_value array.
4773          * If the caller is replacing one value with another, then it is up
4774          * to the caller to free/rele/destroy the previous value (if needed).
4775          */
4776         if (key <= vsd->vs_nkeys) {
4777                 vsd->vs_value[key - 1] = value;
4778                 return (0);
4779         }
4780 
4781         ASSERT(key <= vsd_nkeys);
4782 
4783         if (vsd->vs_nkeys == 0) {
4784                 mutex_enter(&vsd_lock);     /* lock out vsd_destroy() */
4785                 /*
4786                  * Link onto list of all VSD nodes.
4787                  */
4788                 list_insert_head(vsd_list, vsd);
4789                 mutex_exit(&vsd_lock);
4790         }
4791 
4792         /*
4793          * Allocate vnode local storage and set the value for key
4794          */
4795         vsd->vs_value = vsd_realloc(vsd->vs_value,
4796             vsd->vs_nkeys * sizeof (void *),
4797             key * sizeof (void *));
4798         vsd->vs_nkeys = key;
4799         vsd->vs_value[key - 1] = value;
4800 
4801         return (0);
4802 }
4803 
4804 /*
4805  * Called from vn_free() to run the destructor function for each vsd
4806  *      Locks out vsd_create and vsd_destroy
4807  *      Assumes that the destructor *DOES NOT* use vsd
4808  */
4809 void
4810 vsd_free(vnode_t *vp)
4811 {
4812         int i;
4813         struct vsd_node *vsd = vp->v_vsd;
4814 
4815         if (vsd == NULL)
4816                 return;
4817 
4818         if (vsd->vs_nkeys == 0) {
4819                 kmem_free(vsd, sizeof (*vsd));
4820                 vp->v_vsd = NULL;
4821                 return;
4822         }
4823 
4824         /*
4825          * lock out vsd_create and vsd_destroy, call
4826          * the destructor, and mark the value as destroyed.
4827          */
4828         mutex_enter(&vsd_lock);
4829 
4830         for (i = 0; i < vsd->vs_nkeys; i++) {
4831                 if (vsd->vs_value[i] && vsd_destructor[i])
4832                         (*vsd_destructor[i])(vsd->vs_value[i]);
4833                 vsd->vs_value[i] = NULL;
4834         }
4835 
4836         /*
4837          * remove from linked list of VSD nodes
4838          */
4839         list_remove(vsd_list, vsd);
4840 
4841         mutex_exit(&vsd_lock);
4842 
4843         /*
4844          * free up the VSD
4845          */
4846         kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4847         kmem_free(vsd, sizeof (struct vsd_node));
4848         vp->v_vsd = NULL;
4849 }
4850 
4851 /*
4852  * realloc
4853  */
4854 static void *
4855 vsd_realloc(void *old, size_t osize, size_t nsize)
4856 {
4857         void *new;
4858 
4859         new = kmem_zalloc(nsize, KM_SLEEP);
4860         if (old) {
4861                 bcopy(old, new, osize);
4862                 kmem_free(old, osize);
4863         }
4864         return (new);
4865 }
4866 
4867 /*
4868  * Setup the extensible system attribute for creating a reparse point.
4869  * The symlink data 'target' is validated for proper format of a reparse
4870  * string and a check also made to make sure the symlink data does not
4871  * point to an existing file.
4872  *
4873  * return 0 if ok else -1.
4874  */
4875 static int
4876 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4877 {
4878         xoptattr_t *xoap;
4879 
4880         if ((!target) || (!vap) || (!xvattr))
4881                 return (-1);
4882 
4883         /* validate reparse string */
4884         if (reparse_validate((const char *)target))
4885                 return (-1);
4886 
4887         xva_init(xvattr);
4888         xvattr->xva_vattr = *vap;
4889         xvattr->xva_vattr.va_mask |= AT_XVATTR;
4890         xoap = xva_getxoptattr(xvattr);
4891         ASSERT(xoap);
4892         XVA_SET_REQ(xvattr, XAT_REPARSE);
4893         xoap->xoa_reparse = 1;
4894 
4895         return (0);
4896 }
4897 
4898 /*
4899  * Function to check whether a symlink is a reparse point.
4900  * Return B_TRUE if it is a reparse point, else return B_FALSE
4901  */
4902 boolean_t
4903 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4904 {
4905         xvattr_t xvattr;
4906         xoptattr_t *xoap;
4907 
4908         if ((vp->v_type != VLNK) ||
4909             !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4910                 return (B_FALSE);
4911 
4912         xva_init(&xvattr);
4913         xoap = xva_getxoptattr(&xvattr);
4914         ASSERT(xoap);
4915         XVA_SET_REQ(&xvattr, XAT_REPARSE);
4916 
4917         if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4918                 return (B_FALSE);
4919 
4920         if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4921             (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4922                 return (B_FALSE);
4923 
4924         return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4925 }