1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2020 Joyent, Inc.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  27  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  28  */
  29 
  30 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  31 /*        All Rights Reserved   */
  32 
  33 /*
  34  * University Copyright- Copyright (c) 1982, 1986, 1988
  35  * The Regents of the University of California
  36  * All Rights Reserved
  37  *
  38  * University Acknowledgment- Portions of this document are derived from
  39  * software developed by the University of California, Berkeley, and its
  40  * contributors.
  41  */
  42 
  43 #include <sys/types.h>
  44 #include <sys/param.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/errno.h>
  47 #include <sys/cred.h>
  48 #include <sys/user.h>
  49 #include <sys/uio.h>
  50 #include <sys/file.h>
  51 #include <sys/pathname.h>
  52 #include <sys/vfs.h>
  53 #include <sys/vfs_opreg.h>
  54 #include <sys/vnode.h>
  55 #include <sys/filio.h>
  56 #include <sys/rwstlock.h>
  57 #include <sys/fem.h>
  58 #include <sys/stat.h>
  59 #include <sys/mode.h>
  60 #include <sys/conf.h>
  61 #include <sys/sysmacros.h>
  62 #include <sys/cmn_err.h>
  63 #include <sys/systm.h>
  64 #include <sys/kmem.h>
  65 #include <sys/debug.h>
  66 #include <c2/audit.h>
  67 #include <sys/acl.h>
  68 #include <sys/nbmlock.h>
  69 #include <sys/fcntl.h>
  70 #include <fs/fs_subr.h>
  71 #include <sys/taskq.h>
  72 #include <fs/fs_reparse.h>
  73 #include <sys/time.h>
  74 #include <sys/sdt.h>
  75 
/*
 * Determine if this vnode is a file that is read-only.  Device special
 * files (VCHR/VBLK) and FIFOs are excluded because writes to them do not
 * modify the underlying filesystem, so they remain writable even when the
 * filesystem itself is mounted read-only.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
  80 
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as the vfssw table and parallels it.  (Arguably, it
 * could be part of the vfssw table.)  Once it's initialized, it's accessed
 * using the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  The lock
 * protects updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/* Global used for empty/invalid v_path */
char *vn_vpath_empty = "";

/*
 * Forward declarations for internal vnode specific data (VSD).
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * Forward declarations for reparse point functions.
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	/* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void		(**vsd_destructor)(void *);
 151 
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Both macros update the per-vfs vopstats first, then (if present) the
 * per-fstype vopstats, and fire the corresponding DTrace fsinfo probe
 * via the __dtrace_probe___fsinfo_* extern.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}

/*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}

/*
 * Latency thresholds, in nanoseconds (10ms, 100ms, 1s).
 */
#define	VOP_LATENCY_10MS	10000000
#define	VOP_LATENCY_100MS	100000000
#define	VOP_LATENCY_1S		1000000000
 215 
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
/* S_IFMT-format code (shifted down to a small index) -> vtype_t */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

/* vtype_t -> S_IFMT-format bits; 0 for types with no stat(2) equivalent */
ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};

/*
 * The system vnode cache.
 */

kmem_cache_t *vn_cache;
 235 
 236 
/*
 * Vnode operations vector.
 *
 * Each entry supplies an operation name, the offset of the matching slot
 * in struct vnodeops, and two fallback routines used when a filesystem
 * does not register the operation.  The table is NULL-terminated.
 * NOTE(review): exact meaning of the two fallback slots is defined by
 * fs_operation_trans_def_t in vfs_opreg.h — confirm there.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p)(uintptr_t)fs_dispose,
	    (fs_generic_func_p)(uintptr_t)fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	NULL, 0, NULL, NULL
};
 387 
 388 /* Extensible attribute (xva) routines. */
 389 
 390 /*
 391  * Zero out the structure, set the size of the requested/returned bitmaps,
 392  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 393  * to the returned attributes array.
 394  */
 395 void
 396 xva_init(xvattr_t *xvap)
 397 {
 398         bzero(xvap, sizeof (xvattr_t));
 399         xvap->xva_mapsize = XVA_MAPSIZE;
 400         xvap->xva_magic = XVA_MAGIC;
 401         xvap->xva_vattr.va_mask = AT_XVATTR;
 402         xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 403 }
 404 
 405 /*
 406  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 407  * structure.  Otherwise, returns NULL.
 408  */
 409 xoptattr_t *
 410 xva_getxoptattr(xvattr_t *xvap)
 411 {
 412         xoptattr_t *xoap = NULL;
 413         if (xvap->xva_vattr.va_mask & AT_XVATTR)
 414                 xoap = &xvap->xva_xoptattrs;
 415         return (xoap);
 416 }
 417 
 418 /*
 419  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 420  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 421  * kstat name.
 422  */
 423 static int
 424 vska_compar(const void *n1, const void *n2)
 425 {
 426         int ret;
 427         ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 428         ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 429 
 430         if (p1 < p2) {
 431                 ret = -1;
 432         } else if (p1 > p2) {
 433                 ret = 1;
 434         } else {
 435                 ret = 0;
 436         }
 437 
 438         return (ret);
 439 }
 440 
 441 /*
 442  * Used to create a single template which will be bcopy()ed to a newly
 443  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 444  */
 445 static vopstats_t *
 446 create_vopstats_template()
 447 {
 448         vopstats_t              *vsp;
 449 
 450         vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 451         bzero(vsp, sizeof (*vsp));      /* Start fresh */
 452 
 453         /* VOP_OPEN */
 454         kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 455         /* VOP_CLOSE */
 456         kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 457         /* VOP_READ I/O */
 458         kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 459         kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 460         /* VOP_WRITE I/O */
 461         kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 462         kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 463         /* VOP_IOCTL */
 464         kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 465         /* VOP_SETFL */
 466         kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 467         /* VOP_GETATTR */
 468         kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 469         /* VOP_SETATTR */
 470         kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 471         /* VOP_ACCESS */
 472         kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 473         /* VOP_LOOKUP */
 474         kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 475         /* VOP_CREATE */
 476         kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 477         /* VOP_REMOVE */
 478         kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 479         /* VOP_LINK */
 480         kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 481         /* VOP_RENAME */
 482         kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 483         /* VOP_MKDIR */
 484         kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 485         /* VOP_RMDIR */
 486         kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 487         /* VOP_READDIR I/O */
 488         kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 489         kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 490             KSTAT_DATA_UINT64);
 491         /* VOP_SYMLINK */
 492         kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 493         /* VOP_READLINK */
 494         kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 495         /* VOP_FSYNC */
 496         kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 497         /* VOP_INACTIVE */
 498         kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 499         /* VOP_FID */
 500         kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 501         /* VOP_RWLOCK */
 502         kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 503         /* VOP_RWUNLOCK */
 504         kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 505         /* VOP_SEEK */
 506         kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 507         /* VOP_CMP */
 508         kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 509         /* VOP_FRLOCK */
 510         kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 511         /* VOP_SPACE */
 512         kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 513         /* VOP_REALVP */
 514         kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 515         /* VOP_GETPAGE */
 516         kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 517         /* VOP_PUTPAGE */
 518         kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 519         /* VOP_MAP */
 520         kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 521         /* VOP_ADDMAP */
 522         kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 523         /* VOP_DELMAP */
 524         kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 525         /* VOP_POLL */
 526         kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 527         /* VOP_DUMP */
 528         kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 529         /* VOP_PATHCONF */
 530         kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 531         /* VOP_PAGEIO */
 532         kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 533         /* VOP_DUMPCTL */
 534         kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 535         /* VOP_DISPOSE */
 536         kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 537         /* VOP_SETSECATTR */
 538         kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 539         /* VOP_GETSECATTR */
 540         kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 541         /* VOP_SHRLOCK */
 542         kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 543         /* VOP_VNEVENT */
 544         kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 545         /* VOP_REQZCBUF */
 546         kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 547         /* VOP_RETZCBUF */
 548         kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 549 
 550         return (vsp);
 551 }
 552 
 553 /*
 554  * Creates a kstat structure associated with a vopstats structure.
 555  */
 556 kstat_t *
 557 new_vskstat(char *ksname, vopstats_t *vsp)
 558 {
 559         kstat_t         *ksp;
 560 
 561         if (!vopstats_enabled) {
 562                 return (NULL);
 563         }
 564 
 565         ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 566             sizeof (vopstats_t)/sizeof (kstat_named_t),
 567             KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 568         if (ksp) {
 569                 ksp->ks_data = vsp;
 570                 kstat_install(ksp);
 571         }
 572 
 573         return (ksp);
 574 }
 575 
 576 /*
 577  * Called from vfsinit() to initialize the support mechanisms for vopstats
 578  */
 579 void
 580 vopstats_startup()
 581 {
 582         if (!vopstats_enabled)
 583                 return;
 584 
 585         /*
 586          * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 587          * is necessary since we need to check if a kstat exists before we
 588          * attempt to create it.  Also, initialize its lock.
 589          */
 590         avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 591             offsetof(vsk_anchor_t, vsk_node));
 592         mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 593 
 594         vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 595             sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 596             NULL, NULL, 0);
 597 
 598         /*
 599          * Set up the array of pointers for the vopstats-by-FS-type.
 600          * The entries will be allocated/initialized as each file system
 601          * goes through modload/mod_installfs.
 602          */
 603         vopstats_fstype = (vopstats_t **)kmem_zalloc(
 604             (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 605 
 606         /* Set up the global vopstats initialization template */
 607         vs_templatep = create_vopstats_template();
 608 }
 609 
/*
 * We need all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) then bcopy it over.
 */
 616 void
 617 initialize_vopstats(vopstats_t *vsp)
 618 {
 619         if (vsp == NULL)
 620                 return;
 621 
 622         bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 623 }
 624 
 625 /*
 626  * If possible, determine which vopstats by fstype to use and
 627  * return a pointer to the caller.
 628  */
 629 vopstats_t *
 630 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 631 {
 632         int             fstype = 0;     /* Index into vfssw[] */
 633         vopstats_t      *vsp = NULL;
 634 
 635         if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 636             !vopstats_enabled)
 637                 return (NULL);
 638         /*
 639          * Set up the fstype.  We go to so much trouble because all versions
 640          * of NFS use the same fstype in their vfs even though they have
 641          * distinct entries in the vfssw[] table.
 642          * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 643          */
 644         if (vswp) {
 645                 fstype = vswp - vfssw;  /* Gets us the index */
 646         } else {
 647                 fstype = vfsp->vfs_fstype;
 648         }
 649 
 650         /*
 651          * Point to the per-fstype vopstats. The only valid values are
 652          * non-zero positive values less than the number of vfssw[] table
 653          * entries.
 654          */
 655         if (fstype > 0 && fstype < nfstype) {
 656                 vsp = vopstats_fstype[fstype];
 657         }
 658 
 659         return (vsp);
 660 }
 661 
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 *
 * Returns NULL when stats are disabled for this vfs, when VFS_STATVFS()
 * fails, or when an anchor for this fsid already exists in the tree.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
	kstat_t		*ksp;			/* Ptr to new kstat */
	avl_index_t	where;			/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		/*
		 * Insert the anchor under the tree lock only if no anchor
		 * with this fsid is already present; the kstat itself is
		 * created after the lock is dropped.
		 */
		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
 713 
/*
 * We're in the process of tearing down the vfs and need to cleanup
 * the data structures associated with the vopstats.  Must only be called
 * from dounmount().
 *
 * Removes the anchor from the AVL tree, deletes its kstat, and frees the
 * anchor back to the cache.  A no-op when stats were never set up for
 * this vfs.
 */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	/* kstat deletion happens under the tree lock as well */
	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
 749 
/*
 * Read or write a vnode.  Called from kernel code.
 *
 * rw	   - UIO_READ or UIO_WRITE
 * vp	   - vnode to transfer to/from
 * base	   - buffer address (interpreted per seg)
 * len	   - number of bytes to transfer; negative lengths fail with EIO
 * offset  - starting file offset
 * seg	   - segment flag for base (e.g. UIO_SYSSPACE)
 * ioflag  - I/O flags passed through to VOP_READ/VOP_WRITE
 * ulimit  - file size limit; meaningful only if rw is UIO_WRITE
 * cr	   - credentials (possibly remapped via VOPXID_MAP_CR)
 * residp  - if non-NULL, receives the count of untransferred bytes;
 *	     if NULL, a partial transfer is reported as EIO
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;	/* nonzero once we hold the nbmand crit region */

	/* Writes to a file on a read-only filesystem fail immediately */
	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	/* Map the credential if the filesystem does not support XIDs */
	VOPXID_MAP_CR(vp, cr);

	/* Build a single-iovec uio describing the transfer */
	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		/* Fail with EACCES if a conflicting NBMAND lock exists */
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* Take the vnode rwlock for the duration of the transfer */
	(void) VOP_RWLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
	}
	VOP_RWUNLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
 830 
 831 /*
 832  * Release a vnode.  Call VOP_INACTIVE on last reference or
 833  * decrement reference count.
 834  *
 835  * To avoid race conditions, the v_count is left at 1 for
 836  * the call to VOP_INACTIVE. This prevents another thread
 837  * from reclaiming and releasing the vnode *before* the
 838  * VOP_INACTIVE routine has a chance to destroy the vnode.
 839  * We can't have more than 1 thread calling VOP_INACTIVE
 840  * on a vnode.
 841  */
 842 void
 843 vn_rele(vnode_t *vp)
 844 {
 845         VERIFY(vp->v_count > 0);
 846         mutex_enter(&vp->v_lock);
 847         if (vp->v_count == 1) {
 848                 mutex_exit(&vp->v_lock);
 849                 VOP_INACTIVE(vp, CRED(), NULL);
 850                 return;
 851         }
 852         VN_RELE_LOCKED(vp);
 853         mutex_exit(&vp->v_lock);
 854 }
 855 
 856 void
 857 vn_phantom_rele(vnode_t *vp)
 858 {
 859         VERIFY(vp->v_count > 0);
 860 
 861         mutex_enter(&vp->v_lock);
 862         VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
 863         vp->v_phantom_count--;
 864         DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
 865         if (vp->v_count == 1) {
 866                 ASSERT0(vp->v_phantom_count);
 867                 mutex_exit(&vp->v_lock);
 868                 VOP_INACTIVE(vp, CRED(), NULL);
 869                 return;
 870         }
 871         VN_RELE_LOCKED(vp);
 872         mutex_exit(&vp->v_lock);
 873 }
 874 
 875 /*
 876  * Return the number of non-phantom holds. Things such as portfs will use
 877  * phantom holds to prevent it from blocking filesystems from mounting over
 878  * watched directories.
 879  */
 880 uint_t
 881 vn_count(vnode_t *vp)
 882 {
 883         ASSERT(MUTEX_HELD(&vp->v_lock));
 884         return (vp->v_count - vp->v_phantom_count);
 885 }
 886 
 887 /*
 888  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 889  * as a single reference, so v_count is not decremented until the last DNLC hold
 890  * is released. This makes it possible to distinguish vnodes that are referenced
 891  * only by the DNLC.
 892  */
 893 void
 894 vn_rele_dnlc(vnode_t *vp)
 895 {
 896         VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 897         mutex_enter(&vp->v_lock);
 898         if (--vp->v_count_dnlc == 0) {
 899                 if (vp->v_count == 1) {
 900                         mutex_exit(&vp->v_lock);
 901                         VOP_INACTIVE(vp, CRED(), NULL);
 902                         return;
 903                 }
 904                 VN_RELE_LOCKED(vp);
 905         }
 906         mutex_exit(&vp->v_lock);
 907 }
 908 
 909 /*
 910  * Like vn_rele() except that it clears v_stream under v_lock.
 911  * This is used by sockfs when it dismantles the association between
 912  * the sockfs node and the vnode in the underlying file system.
 913  * v_lock has to be held to prevent a thread coming through the lookupname
 914  * path from accessing a stream head that is going away.
 915  */
 916 void
 917 vn_rele_stream(vnode_t *vp)
 918 {
 919         VERIFY(vp->v_count > 0);
 920         mutex_enter(&vp->v_lock);
 921         vp->v_stream = NULL;
 922         if (vp->v_count == 1) {
 923                 mutex_exit(&vp->v_lock);
 924                 VOP_INACTIVE(vp, CRED(), NULL);
 925                 return;
 926         }
 927         VN_RELE_LOCKED(vp);
 928         mutex_exit(&vp->v_lock);
 929 }
 930 
/*
 * Taskq callback used by vn_rele_async(): runs the final VOP_INACTIVE
 * in taskq context rather than in the releasing thread.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 936 
 937 /*
 938  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 939  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 940  * the file system as a result of releasing the vnode. Note, file systems
 941  * already have to handle the race where the vnode is incremented before the
 942  * inactive routine is called and does its locking.
 943  *
 944  * Warning: Excessive use of this routine can lead to performance problems.
 945  * This is because taskqs throttle back allocation if too many are created.
 946  */
 947 void
 948 vn_rele_async(vnode_t *vp, taskq_t *taskq)
 949 {
 950         VERIFY(vp->v_count > 0);
 951         mutex_enter(&vp->v_lock);
 952         if (vp->v_count == 1) {
 953                 mutex_exit(&vp->v_lock);
 954                 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 955                     vp, TQ_SLEEP) != TASKQID_INVALID);
 956                 return;
 957         }
 958         VN_RELE_LOCKED(vp);
 959         mutex_exit(&vp->v_lock);
 960 }
 961 
 962 int
 963 vn_open(
 964         char *pnamep,
 965         enum uio_seg seg,
 966         int filemode,
 967         int createmode,
 968         struct vnode **vpp,
 969         enum create crwhy,
 970         mode_t umask)
 971 {
 972         return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 973             umask, NULL, -1));
 974 }
 975 

/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 *
 * On success, *vpp holds the opened vnode with a reference the caller
 * must eventually release.  'startvp', if non-NULL, is the directory
 * that relative lookups start from; 'fd' is used only as the share-lock
 * owner id for NBMAND file systems.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;	/* 1 while in the nbl critical region */
	int open_done = 0;	/* 1 once VOP_OPEN has succeeded */
	int shrlock_done = 0;	/* 1 once a share reservation is held */
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;
	boolean_t create;

	/* Translate FREAD/FWRITE-style flags into a VOP_ACCESS mode. */
	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

	/*
	 * We need to handle the case of FCREAT | FDIRECTORY and the case of
	 * FEXCL. If all three are specified, then we always fail because we
	 * cannot create a directory through this interface and FEXCL says we
	 * need to fail the request if we can't create it. If, however, only
	 * FCREAT | FDIRECTORY are specified, then we can treat this as the case
	 * of opening a file that already exists. If it exists, we can do
	 * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
	 * treated as FDIRECTORY.
	 */
	if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
	    (FCREAT | FDIRECTORY | FEXCL)) {
		return (EINVAL);
	}

	if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
		create = B_FALSE;
	} else if ((filemode & FCREAT) != 0) {
		create = B_TRUE;
	} else {
		create = B_FALSE;
	}

top:
	if (create) {
		enum vcexcl excl;

		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				/* Check the underlying vnode for namefs etc. */
				if (VOP_REALVP(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = VOP_GETATTR(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
			goto out;

		/*
		 * Require FSEARCH and FDIRECTORY to return a directory. Require
		 * FEXEC to return a regular file.
		 */
		if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
		    vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		/*
		 * The __FLXPATH flag is a private interface for use by the lx
		 * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
		 * when a symbolic link is encountered, returns a file
		 * descriptor which references it.
		 * See uts/common/brand/lx/syscall/lx_open.c
		 *
		 * When this flag is set, VOP_OPEN() is not called (for a
		 * symlink, most filesystems will return ENOSYS anyway)
		 * and the link's vnode is returned to be linked to the
		 * file descriptor.
		 */
		if ((filemode & __FLXPATH) == 0)
			error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses VOP_REALVP to distinguish between
	 * an unopened namefs node (where VOP_REALVP returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where VOP_REALVP would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = VOP_REALVP(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}

	/*
	 * Turn on directio, if requested.
	 */
	if (filemode & FDIRECT) {
		if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
		    CRED(), NULL, NULL)) != 0) {
			/*
			 * On Linux, O_DIRECT returns EINVAL when the file
			 * system does not support directio, so we'll do the
			 * same.
			 */
			error = EINVAL;
			goto out;
		}
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		/*
		 * Unwind in reverse order of acquisition.  A successful
		 * VOP_CLOSE also releases the share reservation, so
		 * shrlock_done is cleared along with open_done.
		 */
		if (open_done) {
			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
1326 
1327 /*
1328  * The following two accessor functions are for the NFSv4 server.  Since there
1329  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1330  * vnode open counts correct when a client "upgrades" an open or does an
1331  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1332  * open mode (add or subtract read or write), but also change the share/deny
1333  * modes.  However, share reservations are not integrated with OPEN, yet, so
1334  * we need to handle each separately.  These functions are cleaner than having
1335  * the NFS server manipulate the counts directly, however, nobody else should
1336  * use these functions.
1337  */
1338 void
1339 vn_open_upgrade(
1340         vnode_t *vp,
1341         int filemode)
1342 {
1343         ASSERT(vp->v_type == VREG);
1344 
1345         if (filemode & FREAD)
1346                 atomic_inc_32(&vp->v_rdcnt);
1347         if (filemode & FWRITE)
1348                 atomic_inc_32(&vp->v_wrcnt);
1349 
1350 }
1351 
1352 void
1353 vn_open_downgrade(
1354         vnode_t *vp,
1355         int filemode)
1356 {
1357         ASSERT(vp->v_type == VREG);
1358 
1359         if (filemode & FREAD) {
1360                 ASSERT(vp->v_rdcnt > 0);
1361                 atomic_dec_32(&vp->v_rdcnt);
1362         }
1363         if (filemode & FWRITE) {
1364                 ASSERT(vp->v_wrcnt > 0);
1365                 atomic_dec_32(&vp->v_wrcnt);
1366         }
1367 
1368 }
1369 
1370 int
1371 vn_create(
1372         char *pnamep,
1373         enum uio_seg seg,
1374         struct vattr *vap,
1375         enum vcexcl excl,
1376         int mode,
1377         struct vnode **vpp,
1378         enum create why,
1379         int flag,
1380         mode_t umask)
1381 {
1382         return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1383             umask, NULL));
1384 }
1385 
1386 /*
1387  * Create a vnode (makenode).
1388  */
1389 int
1390 vn_createat(
1391         char *pnamep,
1392         enum uio_seg seg,
1393         struct vattr *vap,
1394         enum vcexcl excl,
1395         int mode,
1396         struct vnode **vpp,
1397         enum create why,
1398         int flag,
1399         mode_t umask,
1400         struct vnode *startvp)
1401 {
1402         struct vnode *dvp;      /* ptr to parent dir vnode */
1403         struct vnode *vp = NULL;
1404         struct pathname pn;
1405         int error;
1406         int in_crit = 0;
1407         struct vattr vattr;
1408         enum symfollow follow;
1409         int estale_retry = 0;
1410         uint32_t auditing = AU_AUDITING();
1411 
1412         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1413 
1414         /* symlink interpretation */
1415         if ((flag & FNOFOLLOW) || excl == EXCL)
1416                 follow = NO_FOLLOW;
1417         else
1418                 follow = FOLLOW;
1419         flag &= ~(FNOFOLLOW|FNOLINKS);
1420 
1421 top:
1422         /*
1423          * Lookup directory.
1424          * If new object is a file, call lower level to create it.
1425          * Note that it is up to the lower level to enforce exclusive
1426          * creation, if the file is already there.
1427          * This allows the lower level to do whatever
1428          * locking or protocol that is needed to prevent races.
1429          * If the new object is directory call lower level to make
1430          * the new directory, with "." and "..".
1431          */
1432         if (error = pn_get(pnamep, seg, &pn))
1433                 return (error);
1434         if (auditing)
1435                 audit_vncreate_start();
1436         dvp = NULL;
1437         *vpp = NULL;
1438         /*
1439          * lookup will find the parent directory for the vnode.
1440          * When it is done the pn holds the name of the entry
1441          * in the directory.
1442          * If this is a non-exclusive create we also find the node itself.
1443          */
1444         error = lookuppnat(&pn, NULL, follow, &dvp,
1445             (excl == EXCL) ? NULLVPP : vpp, startvp);
1446         if (error) {
1447                 pn_free(&pn);
1448                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1449                         goto top;
1450                 if (why == CRMKDIR && error == EINVAL)
1451                         error = EEXIST;         /* SVID */
1452                 return (error);
1453         }
1454 
1455         if (why != CRMKNOD)
1456                 vap->va_mode &= ~VSVTX;
1457 
1458         /*
1459          * If default ACLs are defined for the directory don't apply the
1460          * umask if umask is passed.
1461          */
1462 
1463         if (umask) {
1464 
1465                 vsecattr_t vsec;
1466 
1467                 vsec.vsa_aclcnt = 0;
1468                 vsec.vsa_aclentp = NULL;
1469                 vsec.vsa_dfaclcnt = 0;
1470                 vsec.vsa_dfaclentp = NULL;
1471                 vsec.vsa_mask = VSA_DFACLCNT;
1472                 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1473                 /*
1474                  * If error is ENOSYS then treat it as no error
1475                  * Don't want to force all file systems to support
1476                  * aclent_t style of ACL's.
1477                  */
1478                 if (error == ENOSYS)
1479                         error = 0;
1480                 if (error) {
1481                         if (*vpp != NULL)
1482                                 VN_RELE(*vpp);
1483                         goto out;
1484                 } else {
1485                         /*
1486                          * Apply the umask if no default ACLs.
1487                          */
1488                         if (vsec.vsa_dfaclcnt == 0)
1489                                 vap->va_mode &= ~umask;
1490 
1491                         /*
1492                          * VOP_GETSECATTR() may have allocated memory for
1493                          * ACLs we didn't request, so double-check and
1494                          * free it if necessary.
1495                          */
1496                         if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1497                                 kmem_free((caddr_t)vsec.vsa_aclentp,
1498                                     vsec.vsa_aclcnt * sizeof (aclent_t));
1499                         if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1500                                 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1501                                     vsec.vsa_dfaclcnt * sizeof (aclent_t));
1502                 }
1503         }
1504 
1505         /*
1506          * In general we want to generate EROFS if the file system is
1507          * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1508          * documents the open system call, and it says that O_CREAT has no
1509          * effect if the file already exists.  Bug 1119649 states
1510          * that open(path, O_CREAT, ...) fails when attempting to open an
1511          * existing file on a read only file system.  Thus, the first part
1512          * of the following if statement has 3 checks:
1513          *      if the file exists &&
1514          *              it is being open with write access &&
1515          *              the file system is read only
1516          *      then generate EROFS
1517          */
1518         if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1519             (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1520                 if (*vpp)
1521                         VN_RELE(*vpp);
1522                 error = EROFS;
1523         } else if (excl == NONEXCL && *vpp != NULL) {
1524                 vnode_t *rvp;
1525 
1526                 /*
1527                  * File already exists.  If a mandatory lock has been
1528                  * applied, return error.
1529                  */
1530                 vp = *vpp;
1531                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1532                         rvp = vp;
1533                 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1534                         nbl_start_crit(vp, RW_READER);
1535                         in_crit = 1;
1536                 }
1537                 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1538                         vattr.va_mask = AT_MODE|AT_SIZE;
1539                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1540                                 goto out;
1541                         }
1542                         if (MANDLOCK(vp, vattr.va_mode)) {
1543                                 error = EAGAIN;
1544                                 goto out;
1545                         }
1546                         /*
1547                          * File cannot be truncated if non-blocking mandatory
1548                          * locks are currently on the file.
1549                          */
1550                         if ((vap->va_mask & AT_SIZE) && in_crit) {
1551                                 u_offset_t offset;
1552                                 ssize_t length;
1553 
1554                                 offset = vap->va_size > vattr.va_size ?
1555                                     vattr.va_size : vap->va_size;
1556                                 length = vap->va_size > vattr.va_size ?
1557                                     vap->va_size - vattr.va_size :
1558                                     vattr.va_size - vap->va_size;
1559                                 if (nbl_conflict(vp, NBL_WRITE, offset,
1560                                     length, 0, NULL)) {
1561                                         error = EACCES;
1562                                         goto out;
1563                                 }
1564                         }
1565                 }
1566 
1567                 /*
1568                  * If the file is the root of a VFS, we've crossed a
1569                  * mount point and the "containing" directory that we
1570                  * acquired above (dvp) is irrelevant because it's in
1571                  * a different file system.  We apply VOP_CREATE to the
1572                  * target itself instead of to the containing directory
1573                  * and supply a null path name to indicate (conventionally)
1574                  * the node itself as the "component" of interest.
1575                  *
1576                  * The call to VOP_CREATE() is necessary to ensure
1577                  * that the appropriate permission checks are made,
1578                  * i.e. EISDIR, EACCES, etc.  We already know that vpp
1579                  * exists since we are in the else condition where this
1580                  * was checked.
1581                  */
1582                 if (vp->v_flag & VROOT) {
1583                         ASSERT(why != CRMKDIR);
1584                         error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1585                             CRED(), flag, NULL, NULL);
1586                         /*
1587                          * If the create succeeded, it will have created a
1588                          * new reference on a new vnode (*vpp) in the child
1589                          * file system, so we want to drop our reference on
1590                          * the old (vp) upon exit.
1591                          */
1592                         goto out;
1593                 }
1594 
1595                 /*
1596                  * Large File API - non-large open (FOFFMAX flag not set)
1597                  * of regular file fails if the file size exceeds MAXOFF32_T.
1598                  */
1599                 if (why != CRMKDIR &&
1600                     !(flag & FOFFMAX) &&
1601                     (vp->v_type == VREG)) {
1602                         vattr.va_mask = AT_SIZE;
1603                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1604                             CRED(), NULL))) {
1605                                 goto out;
1606                         }
1607                         if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1608                                 error = EOVERFLOW;
1609                                 goto out;
1610                         }
1611                 }
1612         }
1613 
1614         if (error == 0) {
1615                 /*
1616                  * Call mkdir() if specified, otherwise create().
1617                  */
1618                 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1619 
1620                 if (why == CRMKDIR)
1621                         /*
1622                          * N.B., if vn_createat() ever requests
1623                          * case-insensitive behavior then it will need
1624                          * to be passed to VOP_MKDIR().  VOP_CREATE()
1625                          * will already get it via "flag"
1626                          */
1627                         error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1628                             NULL, 0, NULL);
1629                 else if (!must_be_dir)
1630                         error = VOP_CREATE(dvp, pn.pn_path, vap,
1631                             excl, mode, vpp, CRED(), flag, NULL, NULL);
1632                 else
1633                         error = ENOTDIR;
1634         }
1635 
1636 out:
1637 
1638         if (auditing)
1639                 audit_vncreate_finish(*vpp, error);
1640         if (in_crit) {
1641                 nbl_end_crit(vp);
1642                 in_crit = 0;
1643         }
1644         if (vp != NULL) {
1645                 VN_RELE(vp);
1646                 vp = NULL;
1647         }
1648         pn_free(&pn);
1649         VN_RELE(dvp);
1650         /*
1651          * The following clause was added to handle a problem
1652          * with NFS consistency.  It is possible that a lookup
1653          * of the file to be created succeeded, but the file
1654          * itself doesn't actually exist on the server.  This
1655          * is chiefly due to the DNLC containing an entry for
1656          * the file which has been removed on the server.  In
1657          * this case, we just start over.  If there was some
1658          * other cause for the ESTALE error, then the lookup
1659          * of the file will fail and the error will be returned
1660          * above instead of looping around from here.
1661          */
1662         if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1663                 goto top;
1664         return (error);
1665 }
1666 
/*
 * Create a hard link.  Wrapper around vn_linkat() that starts both
 * lookups at the current directory and does not follow a trailing
 * symbolic link in "from".
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
1672 
/*
 * Create a hard link "to" naming the existing file "from".
 * fstartvp and tstartvp are the directories at which relative lookups
 * of "from" and "to" begin (NULL means the current directory);
 * "follow" controls whether a final symbolic link in "from" is
 * traversed.  Both names live in segment "seg" (kernel or user).
 * The operation is retried from the top on ESTALE (e.g. stale NFS
 * file handles), bounded by fs_need_estale_retry().
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;		/* pathname of the new link ("to") */
	int error;
	struct vattr vattr;
	dev_t fsid;			/* fsid of the source file's vfs */
	int estale_retry = 0;		/* ESTALE retries performed so far */
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Look up the existing source file. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Look up the directory that will hold the new link. */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		/* Hard links may not span file systems. */
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);	/* strip any trailing slashes */
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1732 
/*
 * Rename "from" to "to".  Wrapper around vn_renameat() with both
 * lookups starting at the current directory.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
1738 
/*
 * Rename the file named fname (looked up relative to fdvp, or the
 * current directory if fdvp is NULL) to tname (relative to tdvp).
 * Both names are in segment "seg".  Fails with EXDEV when source and
 * target directories live on different file systems (compared by
 * fsid so loopback mounts work).  Non-blocking mandatory (nbmand)
 * lock conflicts on either the source or an existing target fail the
 * rename with EACCES.  Retried from the top on ESTALE.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* nbl critical regions entered? */
	vnode_t *fromvp, *fvp;		/* source dir / source file */
	vnode_t *tovp, *targvp;		/* target dir / existing target */
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/* The rename will remove an existing target; check for conflicts. */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Check the source for nbmand share-reservation conflicts. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);	/* strip any trailing slashes */
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	/* Release names, critical regions, and vnode holds, in that order. */
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1878 
1879 /*
1880  * Remove a file or directory.
1881  */
1882 int
1883 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1884 {
1885         return (vn_removeat(NULL, fnamep, seg, dirflag));
1886 }
1887 
/*
 * Remove the file or directory named fnamep (looked up relative to
 * startvp, or the current directory if startvp is NULL).  dirflag
 * selects rmdir(2) semantics (RMDIRECTORY) or unlink(2) semantics.
 * Handles the special case of removing a file that has a namefs
 * (VFS_UNLINKABLE) file system mounted on top of it by unmounting
 * first and then removing the covered object.  Retried on ESTALE.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;	/* vnode under a namefs mount */
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* inside an nbl critical region? */
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * Hold the current working directory across the
			 * VOP so the FS can refuse to remove it.
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2075 
2076 /*
2077  * Utility function to compare equality of vnodes.
2078  * Compare the underlying real vnodes, if there are underlying vnodes.
2079  * This is a more thorough comparison than the VN_CMP() macro provides.
2080  */
2081 int
2082 vn_compare(vnode_t *vp1, vnode_t *vp2)
2083 {
2084         vnode_t *realvp;
2085 
2086         if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2087                 vp1 = realvp;
2088         if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2089                 vp2 = realvp;
2090         return (VN_CMP(vp1, vp2));
2091 }
2092 
2093 /*
2094  * The number of locks to hash into.  This value must be a power
2095  * of 2 minus 1 and should probably also be prime.
2096  */
2097 #define NUM_BUCKETS     1023
2098 
/*
 * One hash bucket of vfs/vnode lock entries.  The pad rounds each
 * bucket up to 64 bytes so that adjacent buckets fall in separate
 * cache lines, avoiding false sharing of vb_lock between CPUs.
 */
struct	vn_vfslocks_bucket {
	kmutex_t vb_lock;		/* protects vb_list */
	vn_vfslocks_entry_t *vb_list;	/* chain of entries in this bucket */
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};
2104 
2105 /*
2106  * Total number of buckets will be NUM_BUCKETS + 1 .
2107  */
2108 
2109 #pragma align   64(vn_vfslocks_buckets)
2110 static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2111 
2112 #define VN_VFSLOCKS_SHIFT       9
2113 
2114 #define VN_VFSLOCKS_HASH(vfsvpptr)      \
2115         ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2116 
2117 /*
2118  * vn_vfslocks_getlock() uses an HASH scheme to generate
2119  * rwstlock using vfs/vnode pointer passed to it.
2120  *
2121  * vn_vfslocks_rele() releases a reference in the
2122  * HASH table which allows the entry allocated by
2123  * vn_vfslocks_getlock() to be freed at a later
2124  * stage when the refcount drops to zero.
2125  */
2126 
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;	/* entry found, or newly allocated */
	vn_vfslocks_entry_t *tvep;	/* scan pointer for the re-check */

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: an entry for this key already exists; take a ref. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	/*
	 * Not found.  Allocate outside vb_lock since KM_SLEEP may block;
	 * because the lock is dropped, the list must be re-checked below
	 * in case another thread inserts the same key concurrently.
	 */
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	/* We won the race; insert the new entry at the head of the list. */
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2170 
/*
 * Drop a reference on a hash entry obtained from vn_vfslocks_getlock().
 * When the last reference is dropped, the entry is unlinked from its
 * bucket and freed.  Panics on a negative refcount or on an entry
 * that is not in the hash (both indicate caller bugs).
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;	/* list scan pointer */
	vn_vfslocks_entry_t *pvep;	/* trails vep for list unlink */

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	pvep = NULL;
	if (vepent->ve_refcnt == 0) {
		/* Last reference: unlink the entry and destroy it. */
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2209 
2210 /*
2211  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2212  * lock protecting the v_vfsmountedhere field.
2213  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2214  * except that it blocks to acquire the lock VVFSLOCK.
2215  *
2216  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2217  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2218  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2219  */
2220 int
2221 vn_vfswlock_wait(vnode_t *vp)
2222 {
2223         int retval;
2224         vn_vfslocks_entry_t *vpvfsentry;
2225         ASSERT(vp != NULL);
2226 
2227         vpvfsentry = vn_vfslocks_getlock(vp);
2228         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2229 
2230         if (retval == EINTR) {
2231                 vn_vfslocks_rele(vpvfsentry);
2232                 return (EINTR);
2233         }
2234         return (retval);
2235 }
2236 
2237 int
2238 vn_vfsrlock_wait(vnode_t *vp)
2239 {
2240         int retval;
2241         vn_vfslocks_entry_t *vpvfsentry;
2242         ASSERT(vp != NULL);
2243 
2244         vpvfsentry = vn_vfslocks_getlock(vp);
2245         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2246 
2247         if (retval == EINTR) {
2248                 vn_vfslocks_rele(vpvfsentry);
2249                 return (EINTR);
2250         }
2251 
2252         return (retval);
2253 }
2254 
2255 
2256 /*
2257  * vn_vfswlock is used to implement a lock which is logically a writers lock
2258  * protecting the v_vfsmountedhere field.
2259  */
2260 int
2261 vn_vfswlock(vnode_t *vp)
2262 {
2263         vn_vfslocks_entry_t *vpvfsentry;
2264 
2265         /*
2266          * If vp is NULL then somebody is trying to lock the covered vnode
2267          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2268          * only happen when unmounting /.  Since that operation will fail
2269          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2270          */
2271         if (vp == NULL)
2272                 return (EBUSY);
2273 
2274         vpvfsentry = vn_vfslocks_getlock(vp);
2275 
2276         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2277                 return (0);
2278 
2279         vn_vfslocks_rele(vpvfsentry);
2280         return (EBUSY);
2281 }
2282 
2283 int
2284 vn_vfsrlock(vnode_t *vp)
2285 {
2286         vn_vfslocks_entry_t *vpvfsentry;
2287 
2288         /*
2289          * If vp is NULL then somebody is trying to lock the covered vnode
2290          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2291          * only happen when unmounting /.  Since that operation will fail
2292          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2293          */
2294         if (vp == NULL)
2295                 return (EBUSY);
2296 
2297         vpvfsentry = vn_vfslocks_getlock(vp);
2298 
2299         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2300                 return (0);
2301 
2302         vn_vfslocks_rele(vpvfsentry);
2303         return (EBUSY);
2304 }
2305 
/*
 * Release a vfs lock previously acquired through vn_vfsrlock(),
 * vn_vfswlock(), or their _wait variants.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release refernce after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2323 
2324 int
2325 vn_vfswlock_held(vnode_t *vp)
2326 {
2327         int held;
2328         vn_vfslocks_entry_t *vpvfsentry;
2329 
2330         ASSERT(vp != NULL);
2331 
2332         vpvfsentry = vn_vfslocks_getlock(vp);
2333         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2334 
2335         vn_vfslocks_rele(vpvfsentry);
2336         return (held);
2337 }
2338 
2339 
2340 int
2341 vn_make_ops(
2342         const char *name,                       /* Name of file system */
2343         const fs_operation_def_t *templ,        /* Operation specification */
2344         vnodeops_t **actual)                    /* Return the vnodeops */
2345 {
2346         int unused_ops;
2347         int error;
2348 
2349         *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2350 
2351         (*actual)->vnop_name = name;
2352 
2353         error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2354         if (error) {
2355                 kmem_free(*actual, sizeof (vnodeops_t));
2356         }
2357 
2358 #if DEBUG
2359         if (unused_ops != 0)
2360                 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2361                     "but not used", name, unused_ops);
2362 #endif
2363 
2364         return (error);
2365 }
2366 
2367 /*
2368  * Free the vnodeops created as a result of vn_make_ops()
2369  */
void
vn_freevnodeops(vnodeops_t *vnops)
{
	/* Size must match the kmem_alloc() done in vn_make_ops(). */
	kmem_free(vnops, sizeof (vnodeops_t));
}
2375 
2376 /*
2377  * Vnode cache.
2378  */
2379 
2380 /* ARGSUSED */
2381 static int
2382 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2383 {
2384         struct vnode *vp;
2385 
2386         vp = buf;
2387 
2388         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2389         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2390         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2391         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2392         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2393         vp->v_path = vn_vpath_empty;
2394         vp->v_path_stamp = 0;
2395         vp->v_mpssdata = NULL;
2396         vp->v_vsd = NULL;
2397         vp->v_fopdata = NULL;
2398 
2399         return (0);
2400 }
2401 
2402 /* ARGSUSED */
2403 static void
2404 vn_cache_destructor(void *buf, void *cdrarg)
2405 {
2406         struct vnode *vp;
2407 
2408         vp = buf;
2409 
2410         rw_destroy(&vp->v_nbllock);
2411         cv_destroy(&vp->v_cv);
2412         mutex_destroy(&vp->v_vsd_lock);
2413         mutex_destroy(&vp->v_lock);
2414 }
2415 
/*
 * Create the kmem cache from which all vnodes are allocated.  The
 * ASSERT verifies that a struct vnode rounded up to VNODE_ALIGN
 * matches the alignment the cache is created with.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2426 
/*
 * Tear down the vnode cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2432 
2433 /*
2434  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2435  * cached by the file system and vnodes remain associated.
2436  */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(vp->v_pages == NULL);
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	/* Free any cached path and reset to the shared empty string. */
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	/* Release file-ops monitor data, if any was attached. */
	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
2476 
2477 /*
2478  * Used to reset the vnode fields including those that are directly accessible
2479  * as well as those which require an accessor function.
2480  *
2481  * Does not initialize:
2482  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2483  *      v_data (since FS-nodes and vnodes point to each other and should
2484  *              be updated simultaneously)
2485  *      v_op (in case someone needs to make a VOP call on this object)
2486  */
2487 void
2488 vn_reinit(vnode_t *vp)
2489 {
2490         vp->v_count = 1;
2491         vp->v_count_dnlc = 0;
2492         vp->v_phantom_count = 0;
2493         vp->v_vfsp = NULL;
2494         vp->v_stream = NULL;
2495         vp->v_vfsmountedhere = NULL;
2496         vp->v_flag = 0;
2497         vp->v_type = VNON;
2498         vp->v_rdev = NODEV;
2499 
2500         vp->v_filocks = NULL;
2501         vp->v_shrlocks = NULL;
2502         vp->v_pages = NULL;
2503 
2504         vp->v_locality = NULL;
2505         vp->v_xattrdir = NULL;
2506 
2507         /*
2508          * In a few specific instances, vn_reinit() is used to initialize
2509          * locally defined vnode_t instances.  Lacking the construction offered
2510          * by vn_alloc(), these vnodes require v_path initialization.
2511          */
2512         if (vp->v_path == NULL) {
2513                 vp->v_path = vn_vpath_empty;
2514         }
2515 
2516         /* Handles v_femhead, v_path, and the r/w/map counts */
2517         vn_recycle(vp);
2518 }
2519 
2520 vnode_t *
2521 vn_alloc(int kmflag)
2522 {
2523         vnode_t *vp;
2524 
2525         vp = kmem_cache_alloc(vn_cache, kmflag);
2526 
2527         if (vp != NULL) {
2528                 vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2529                 vp->v_fopdata = NULL;
2530                 vn_reinit(vp);
2531         }
2532 
2533         return (vp);
2534 }
2535 
/*
 * Return a vnode to the vn_cache.  Releases any remaining per-vnode
 * state (cached path, FEM head, fop data, VSD) first.  The asserts
 * document the caller's obligations: no share/file locks, no DNLC or
 * phantom references, and at most the caller's own hold.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	ASSERT0(vp->v_phantom_count);
	VERIFY(vp->v_path != NULL);
	/* Free a dynamically allocated path; the shared sentinel is kept. */
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2572 
2573 /*
2574  * vnode status changes, should define better states than 1, 0.
2575  */
2576 void
2577 vn_reclaim(vnode_t *vp)
2578 {
2579         vfs_t   *vfsp = vp->v_vfsp;
2580 
2581         if (vfsp == NULL ||
2582             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2583                 return;
2584         }
2585         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2586 }
2587 
2588 void
2589 vn_idle(vnode_t *vp)
2590 {
2591         vfs_t   *vfsp = vp->v_vfsp;
2592 
2593         if (vfsp == NULL ||
2594             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2595                 return;
2596         }
2597         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2598 }
2599 void
2600 vn_exists(vnode_t *vp)
2601 {
2602         vfs_t   *vfsp = vp->v_vfsp;
2603 
2604         if (vfsp == NULL ||
2605             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2606                 return;
2607         }
2608         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2609 }
2610 
2611 void
2612 vn_invalid(vnode_t *vp)
2613 {
2614         vfs_t   *vfsp = vp->v_vfsp;
2615 
2616         if (vfsp == NULL ||
2617             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2618                 return;
2619         }
2620         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2621 }
2622 
2623 /* Vnode event notification */
2624 
2625 int
2626 vnevent_support(vnode_t *vp, caller_context_t *ct)
2627 {
2628         if (vp == NULL)
2629                 return (EINVAL);
2630 
2631         return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2632 }
2633 
2634 void
2635 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2636 {
2637         if (vp == NULL || vp->v_femhead == NULL) {
2638                 return;
2639         }
2640         (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2641         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2642 }
2643 
2644 void
2645 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2646     caller_context_t *ct)
2647 {
2648         if (vp == NULL || vp->v_femhead == NULL) {
2649                 return;
2650         }
2651         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2652 }
2653 
2654 void
2655 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2656     caller_context_t *ct)
2657 {
2658         if (vp == NULL || vp->v_femhead == NULL) {
2659                 return;
2660         }
2661         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2662 }
2663 
2664 void
2665 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2666 {
2667         if (vp == NULL || vp->v_femhead == NULL) {
2668                 return;
2669         }
2670         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2671 }
2672 
2673 void
2674 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2675 {
2676         if (vp == NULL || vp->v_femhead == NULL) {
2677                 return;
2678         }
2679         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2680 }
2681 
2682 void
2683 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2684     caller_context_t *ct)
2685 {
2686         if (vp == NULL || vp->v_femhead == NULL) {
2687                 return;
2688         }
2689         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2690 }
2691 
2692 void
2693 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2694     caller_context_t *ct)
2695 {
2696         if (vp == NULL || vp->v_femhead == NULL) {
2697                 return;
2698         }
2699         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2700 }
2701 
2702 void
2703 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2704     caller_context_t *ct)
2705 {
2706         if (vp == NULL || vp->v_femhead == NULL) {
2707                 return;
2708         }
2709         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2710 }
2711 
2712 void
2713 vnevent_create(vnode_t *vp, caller_context_t *ct)
2714 {
2715         if (vp == NULL || vp->v_femhead == NULL) {
2716                 return;
2717         }
2718         (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2719 }
2720 
2721 void
2722 vnevent_link(vnode_t *vp, caller_context_t *ct)
2723 {
2724         if (vp == NULL || vp->v_femhead == NULL) {
2725                 return;
2726         }
2727         (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2728 }
2729 
2730 void
2731 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2732 {
2733         if (vp == NULL || vp->v_femhead == NULL) {
2734                 return;
2735         }
2736         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2737 }
2738 
2739 void
2740 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2741 {
2742         if (vp == NULL || vp->v_femhead == NULL) {
2743                 return;
2744         }
2745         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2746 }
2747 
2748 void
2749 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2750 {
2751         if (vp == NULL || vp->v_femhead == NULL) {
2752                 return;
2753         }
2754         (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2755 }
2756 
2757 /*
2758  * Vnode accessors.
2759  */
2760 
2761 int
2762 vn_is_readonly(vnode_t *vp)
2763 {
2764         return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2765 }
2766 
/* Returns non-zero if the vnode has active file locks, zero otherwise. */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}
2772 
/*
 * Returns non-zero if the vnode has file locks and mandatory locking
 * applies for the given access mode, zero otherwise.
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}
2778 
/* Returns non-zero if the vnode has pages cached in memory, zero if not. */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}
2784 
2785 /*
2786  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2787  * zone_enter(2).
2788  */
2789 int
2790 vn_can_change_zones(vnode_t *vp)
2791 {
2792         struct vfssw *vswp;
2793         int allow = 1;
2794         vnode_t *rvp;
2795 
2796         if (nfs_global_client_only != 0)
2797                 return (1);
2798 
2799         /*
2800          * We always want to look at the underlying vnode if there is one.
2801          */
2802         if (VOP_REALVP(vp, &rvp, NULL) != 0)
2803                 rvp = vp;
2804         /*
2805          * Some pseudo filesystems (including doorfs) don't actually register
2806          * their vfsops_t, so the following may return NULL; we happily let
2807          * such vnodes switch zones.
2808          */
2809         vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2810         if (vswp != NULL) {
2811                 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2812                         allow = 0;
2813                 vfs_unrefvfssw(vswp);
2814         }
2815         return (allow);
2816 }
2817 
2818 /*
2819  * Return nonzero if the vnode is a mount point, zero if not.
2820  */
2821 int
2822 vn_ismntpt(vnode_t *vp)
2823 {
2824         return (vp->v_vfsmountedhere != NULL);
2825 }
2826 
/* Retrieve the vfs (if any) mounted on this vnode; NULL if none. */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}
2833 
2834 /*
2835  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2836  */
2837 int
2838 vn_in_dnlc(vnode_t *vp)
2839 {
2840         return (vp->v_count_dnlc > 0);
2841 }
2842 
2843 /*
2844  * vn_has_other_opens() checks whether a particular file is opened by more than
2845  * just the caller and whether the open is for read and/or write.
2846  * This routine is for calling after the caller has already called VOP_OPEN()
2847  * and the caller wishes to know if they are the only one with it open for
2848  * the mode(s) specified.
2849  *
2850  * Vnode counts are only kept on regular files (v_type=VREG).
2851  */
2852 int
2853 vn_has_other_opens(
2854         vnode_t *vp,
2855         v_mode_t mode)
2856 {
2857 
2858         ASSERT(vp != NULL);
2859 
2860         switch (mode) {
2861         case V_WRITE:
2862                 if (vp->v_wrcnt > 1)
2863                         return (V_TRUE);
2864                 break;
2865         case V_RDORWR:
2866                 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2867                         return (V_TRUE);
2868                 break;
2869         case V_RDANDWR:
2870                 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2871                         return (V_TRUE);
2872                 break;
2873         case V_READ:
2874                 if (vp->v_rdcnt > 1)
2875                         return (V_TRUE);
2876                 break;
2877         }
2878 
2879         return (V_FALSE);
2880 }
2881 
2882 /*
2883  * vn_is_opened() checks whether a particular file is opened and
2884  * whether the open is for read and/or write.
2885  *
2886  * Vnode counts are only kept on regular files (v_type=VREG).
2887  */
2888 int
2889 vn_is_opened(
2890         vnode_t *vp,
2891         v_mode_t mode)
2892 {
2893 
2894         ASSERT(vp != NULL);
2895 
2896         switch (mode) {
2897         case V_WRITE:
2898                 if (vp->v_wrcnt)
2899                         return (V_TRUE);
2900                 break;
2901         case V_RDANDWR:
2902                 if (vp->v_rdcnt && vp->v_wrcnt)
2903                         return (V_TRUE);
2904                 break;
2905         case V_RDORWR:
2906                 if (vp->v_rdcnt || vp->v_wrcnt)
2907                         return (V_TRUE);
2908                 break;
2909         case V_READ:
2910                 if (vp->v_rdcnt)
2911                         return (V_TRUE);
2912                 break;
2913         }
2914 
2915         return (V_FALSE);
2916 }
2917 
2918 /*
2919  * vn_is_mapped() checks whether a particular file is mapped and whether
2920  * the file is mapped read and/or write.
2921  */
2922 int
2923 vn_is_mapped(
2924         vnode_t *vp,
2925         v_mode_t mode)
2926 {
2927 
2928         ASSERT(vp != NULL);
2929 
2930 #if !defined(_LP64)
2931         switch (mode) {
2932         /*
2933          * The atomic_add_64_nv functions force atomicity in the
2934          * case of 32 bit architectures. Otherwise the 64 bit values
2935          * require two fetches. The value of the fields may be
2936          * (potentially) changed between the first fetch and the
2937          * second
2938          */
2939         case V_WRITE:
2940                 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2941                         return (V_TRUE);
2942                 break;
2943         case V_RDANDWR:
2944                 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2945                     (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2946                         return (V_TRUE);
2947                 break;
2948         case V_RDORWR:
2949                 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2950                     (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2951                         return (V_TRUE);
2952                 break;
2953         case V_READ:
2954                 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2955                         return (V_TRUE);
2956                 break;
2957         }
2958 #else
2959         switch (mode) {
2960         case V_WRITE:
2961                 if (vp->v_mmap_write)
2962                         return (V_TRUE);
2963                 break;
2964         case V_RDANDWR:
2965                 if (vp->v_mmap_read && vp->v_mmap_write)
2966                         return (V_TRUE);
2967                 break;
2968         case V_RDORWR:
2969                 if (vp->v_mmap_read || vp->v_mmap_write)
2970                         return (V_TRUE);
2971                 break;
2972         case V_READ:
2973                 if (vp->v_mmap_read)
2974                         return (V_TRUE);
2975                 break;
2976         }
2977 #endif
2978 
2979         return (V_FALSE);
2980 }
2981 
2982 /*
2983  * Set the operations vector for a vnode.
2984  *
2985  * FEM ensures that the v_femhead pointer is filled in before the
2986  * v_op pointer is changed.  This means that if the v_femhead pointer
2987  * is NULL, and the v_op field hasn't changed since before which checked
2988  * the v_femhead pointer; then our update is ok - we are not racing with
2989  * FEM.
2990  */
2991 void
2992 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2993 {
2994         vnodeops_t      *op;
2995 
2996         ASSERT(vp != NULL);
2997         ASSERT(vnodeops != NULL);
2998 
2999         op = vp->v_op;
3000         membar_consumer();
3001         /*
3002          * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
3003          * the compare-and-swap on vp->v_op.  If either fails, then FEM is
3004          * in effect on the vnode and we need to have FEM deal with it.
3005          */
3006         if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
3007             op) {
3008                 fem_setvnops(vp, vnodeops);
3009         }
3010 }
3011 
3012 /*
3013  * Retrieve the operations vector for a vnode
3014  * As with vn_setops(above); make sure we aren't racing with FEM.
3015  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
3016  * make sense to the callers of this routine.
3017  */
3018 vnodeops_t *
3019 vn_getops(vnode_t *vp)
3020 {
3021         vnodeops_t      *op;
3022 
3023         ASSERT(vp != NULL);
3024 
3025         op = vp->v_op;
3026         membar_consumer();
3027         if (vp->v_femhead == NULL && op == vp->v_op) {
3028                 return (op);
3029         } else {
3030                 return (fem_getvnops(vp));
3031         }
3032 }
3033 
3034 /*
3035  * Returns non-zero (1) if the vnodeops matches that of the vnode.
3036  * Returns zero (0) if not.
3037  */
3038 int
3039 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
3040 {
3041         return (vn_getops(vp) == vnodeops);
3042 }
3043 
3044 /*
3045  * Returns non-zero (1) if the specified operation matches the
3046  * corresponding operation for that the vnode.
3047  * Returns zero (0) if not.
3048  */
3049 
3050 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
3051 
3052 int
3053 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
3054 {
3055         const fs_operation_trans_def_t *otdp;
3056         fs_generic_func_p *loc = NULL;
3057         vnodeops_t      *vop = vn_getops(vp);
3058 
3059         ASSERT(vopname != NULL);
3060 
3061         for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3062                 if (MATCHNAME(otdp->name, vopname)) {
3063                         loc = (fs_generic_func_p *)
3064                             ((char *)(vop) + otdp->offset);
3065                         break;
3066                 }
3067         }
3068 
3069         return ((loc != NULL) && (*loc == funcp));
3070 }
3071 
3072 /*
3073  * fs_new_caller_id() needs to return a unique ID on a given local system.
3074  * The IDs do not need to survive across reboots.  These are primarily
3075  * used so that (FEM) monitors can detect particular callers (such as
3076  * the NFS server) to a given vnode/vfs operation.
3077  */
3078 u_longlong_t
3079 fs_new_caller_id()
3080 {
3081         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3082 
3083         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3084 }
3085 
3086 /*
3087  * The value stored in v_path is relative to rootdir, located in the global
3088  * zone.  Zones or chroot environments which reside deeper inside the VFS
3089  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3090  * what lies below their perceived root.  In order to keep v_path usable for
3091  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3092  *
3093  * An upper bound of max_vnode_path is placed upon v_path allocations to
3094  * prevent the system from going too wild at the behest of pathological
3095  * behavior from the operator.
3096  */
3097 size_t max_vnode_path = 4 * MAXPATHLEN;
3098 
3099 
3100 void
3101 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3102 {
3103         char *buf;
3104 
3105         mutex_enter(&vp->v_lock);
3106         /*
3107          * If the snapshot of v_path_stamp passed in via compare_stamp does not
3108          * match the present value on the vnode, it indicates that subsequent
3109          * changes have occurred.  The v_path value is not cleared in this case
3110          * since the new value may be valid.
3111          */
3112         if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3113                 mutex_exit(&vp->v_lock);
3114                 return;
3115         }
3116         buf = vp->v_path;
3117         vp->v_path = vn_vpath_empty;
3118         vp->v_path_stamp = 0;
3119         mutex_exit(&vp->v_lock);
3120         if (buf != vn_vpath_empty) {
3121                 kmem_free(buf, strlen(buf) + 1);
3122         }
3123 }
3124 
3125 static void
3126 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3127     boolean_t is_rename)
3128 {
3129         char *buf, *oldbuf;
3130         hrtime_t pstamp;
3131         size_t baselen, buflen = 0;
3132 
3133         /* Handle the vn_setpath_str case. */
3134         if (pvp == NULL) {
3135                 if (len + 1 > max_vnode_path) {
3136                         DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3137                             vnode_t *, vp, char *, name, size_t, len + 1);
3138                         return;
3139                 }
3140                 buf = kmem_alloc(len + 1, KM_SLEEP);
3141                 bcopy(name, buf, len);
3142                 buf[len] = '\0';
3143 
3144                 mutex_enter(&vp->v_lock);
3145                 oldbuf = vp->v_path;
3146                 vp->v_path = buf;
3147                 vp->v_path_stamp = gethrtime();
3148                 mutex_exit(&vp->v_lock);
3149                 if (oldbuf != vn_vpath_empty) {
3150                         kmem_free(oldbuf, strlen(oldbuf) + 1);
3151                 }
3152                 return;
3153         }
3154 
3155         /* Take snapshot of parent dir */
3156         mutex_enter(&pvp->v_lock);
3157 
3158         if ((pvp->v_flag & VTRAVERSE) != 0) {
3159                 /*
3160                  * When the parent vnode has VTRAVERSE set in its flags, normal
3161                  * assumptions about v_path calculation no longer apply.  The
3162                  * primary situation where this occurs is via the VFS tricks
3163                  * which procfs plays in order to allow /proc/PID/(root|cwd) to
3164                  * yield meaningful results.
3165                  *
3166                  * When this flag is set, v_path on the child must not be
3167                  * updated since the calculated value is likely to be
3168                  * incorrect, given the current context.
3169                  */
3170                 mutex_exit(&pvp->v_lock);
3171                 return;
3172         }
3173 
3174 retrybuf:
3175         if (pvp->v_path == vn_vpath_empty) {
3176                 /*
3177                  * Without v_path from the parent directory, generating a child
3178                  * path from the name is impossible.
3179                  */
3180                 if (len > 0) {
3181                         pstamp = pvp->v_path_stamp;
3182                         mutex_exit(&pvp->v_lock);
3183                         vn_clearpath(vp, pstamp);
3184                         return;
3185                 }
3186 
3187                 /*
3188                  * The only feasible case here is where a NUL lookup is being
3189                  * performed on rootdir prior to its v_path being populated.
3190                  */
3191                 ASSERT(pvp->v_path_stamp == 0);
3192                 baselen = 0;
3193                 pstamp = 0;
3194         } else {
3195                 pstamp = pvp->v_path_stamp;
3196                 baselen = strlen(pvp->v_path);
3197                 /* ignore a trailing slash if present */
3198                 if (pvp->v_path[baselen - 1] == '/') {
3199                         /* This should only the be case for rootdir */
3200                         ASSERT(baselen == 1 && pvp == rootdir);
3201                         baselen--;
3202                 }
3203         }
3204         mutex_exit(&pvp->v_lock);
3205 
3206         if (buflen != 0) {
3207                 /* Free the existing (mis-sized) buffer in case of retry */
3208                 kmem_free(buf, buflen);
3209         }
3210         /* base, '/', name and trailing NUL */
3211         buflen = baselen + len + 2;
3212         if (buflen > max_vnode_path) {
3213                 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3214                     vnode_t *, vp, char *, name, size_t, buflen);
3215                 return;
3216         }
3217         buf = kmem_alloc(buflen, KM_SLEEP);
3218 
3219         mutex_enter(&pvp->v_lock);
3220         if (pvp->v_path_stamp != pstamp) {
3221                 size_t vlen;
3222 
3223                 /*
3224                  * Since v_path_stamp changed on the parent, it is likely that
3225                  * v_path has been altered as well.  If the length does not
3226                  * exactly match what was previously measured, the buffer
3227                  * allocation must be repeated for proper sizing.
3228                  */
3229                 if (pvp->v_path == vn_vpath_empty) {
3230                         /* Give up if parent lack v_path */
3231                         mutex_exit(&pvp->v_lock);
3232                         kmem_free(buf, buflen);
3233                         return;
3234                 }
3235                 vlen = strlen(pvp->v_path);
3236                 if (pvp->v_path[vlen - 1] == '/') {
3237                         vlen--;
3238                 }
3239                 if (vlen != baselen) {
3240                         goto retrybuf;
3241                 }
3242         }
3243         bcopy(pvp->v_path, buf, baselen);
3244         mutex_exit(&pvp->v_lock);
3245 
3246         buf[baselen] = '/';
3247         baselen++;
3248         bcopy(name, &buf[baselen], len + 1);
3249 
3250         mutex_enter(&vp->v_lock);
3251         if (vp->v_path_stamp == 0) {
3252                 /* never-visited vnode can inherit stamp from parent */
3253                 ASSERT(vp->v_path == vn_vpath_empty);
3254                 vp->v_path_stamp = pstamp;
3255                 vp->v_path = buf;
3256                 mutex_exit(&vp->v_lock);
3257         } else if (vp->v_path_stamp < pstamp || is_rename) {
3258                 /*
3259                  * Install the updated path and stamp, ensuring that the v_path
3260                  * pointer is valid at all times for dtrace.
3261                  */
3262                 oldbuf = vp->v_path;
3263                 vp->v_path = buf;
3264                 vp->v_path_stamp = gethrtime();
3265                 mutex_exit(&vp->v_lock);
3266                 kmem_free(oldbuf, strlen(oldbuf) + 1);
3267         } else {
3268                 /*
3269                  * If the timestamp matches or is greater, it means another
3270                  * thread performed the update first while locks were dropped
3271                  * here to make the allocation.  We defer to the newer value.
3272                  */
3273                 mutex_exit(&vp->v_lock);
3274                 kmem_free(buf, buflen);
3275         }
3276         ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3277 }
3278 
3279 void
3280 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3281 {
3282         size_t len;
3283 
3284         /*
3285          * If the parent is older or empty, there's nothing further to do.
3286          */
3287         if (pvp->v_path == vn_vpath_empty ||
3288             pvp->v_path_stamp <= vp->v_path_stamp) {
3289                 return;
3290         }
3291 
3292         /*
3293          * Given the lack of appropriate context, meaningful updates to v_path
3294          * cannot be made for during lookups for the '.' or '..' entries.
3295          */
3296         len = strlen(name);
3297         if (len == 0 || (len == 1 && name[0] == '.') ||
3298             (len == 2 && name[0] == '.' && name[1] == '.')) {
3299                 return;
3300         }
3301 
3302         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3303 }
3304 
3305 /*
3306  * Given a starting vnode and a path, updates the path in the target vnode in
3307  * a safe manner.  If the vnode already has path information embedded, then the
3308  * cached path is left untouched.
3309  */
3310 /* ARGSUSED */
3311 void
3312 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3313     size_t len)
3314 {
3315         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3316 }
3317 
3318 /*
3319  * Sets the path to the vnode to be the given string, regardless of current
3320  * context.  The string must be a complete path from rootdir.  This is only used
3321  * by fsop_root() for setting the path based on the mountpoint.
3322  */
3323 void
3324 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3325 {
3326         vn_setpath_common(NULL, vp, str, len, B_FALSE);
3327 }
3328 
3329 /*
3330  * Called from within filesystem's vop_rename() to handle renames once the
3331  * target vnode is available.
3332  */
3333 void
3334 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3335 {
3336         vn_setpath_common(pvp, vp, name, len, B_TRUE);
3337 }
3338 
3339 /*
3340  * Similar to vn_setpath_str(), this function sets the path of the destination
3341  * vnode to the be the same as the source vnode.
3342  */
3343 void
3344 vn_copypath(struct vnode *src, struct vnode *dst)
3345 {
3346         char *buf;
3347         hrtime_t stamp;
3348         size_t buflen;
3349 
3350         mutex_enter(&src->v_lock);
3351         if (src->v_path == vn_vpath_empty) {
3352                 mutex_exit(&src->v_lock);
3353                 return;
3354         }
3355         buflen = strlen(src->v_path) + 1;
3356         mutex_exit(&src->v_lock);
3357 
3358         buf = kmem_alloc(buflen, KM_SLEEP);
3359 
3360         mutex_enter(&src->v_lock);
3361         if (src->v_path == vn_vpath_empty ||
3362             strlen(src->v_path) + 1 != buflen) {
3363                 mutex_exit(&src->v_lock);
3364                 kmem_free(buf, buflen);
3365                 return;
3366         }
3367         bcopy(src->v_path, buf, buflen);
3368         stamp = src->v_path_stamp;
3369         mutex_exit(&src->v_lock);
3370 
3371         mutex_enter(&dst->v_lock);
3372         if (dst->v_path != vn_vpath_empty) {
3373                 mutex_exit(&dst->v_lock);
3374                 kmem_free(buf, buflen);
3375                 return;
3376         }
3377         dst->v_path = buf;
3378         dst->v_path_stamp = stamp;
3379         mutex_exit(&dst->v_lock);
3380 }
3381 
3382 
3383 /*
3384  * XXX Private interface for segvn routines that handle vnode
3385  * large page segments.
3386  *
3387  * return 1 if vp's file system VOP_PAGEIO() implementation
3388  * can be safely used instead of VOP_GETPAGE() for handling
3389  * pagefaults against regular non swap files. VOP_PAGEIO()
3390  * interface is considered safe here if its implementation
3391  * is very close to VOP_GETPAGE() implementation.
3392  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3393  * panic if there're file holes but instead returns an error.
3394  * Doesn't assume file won't be changed by user writes, etc.
3395  *
3396  * return 0 otherwise.
3397  *
3398  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3399  */
3400 int
3401 vn_vmpss_usepageio(vnode_t *vp)
3402 {
3403         vfs_t   *vfsp = vp->v_vfsp;
3404         char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3405         char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3406         char **fsok = pageio_ok_fss;
3407 
3408         if (fsname == NULL) {
3409                 return (0);
3410         }
3411 
3412         for (; *fsok; fsok++) {
3413                 if (strcmp(*fsok, fsname) == 0) {
3414                         return (1);
3415                 }
3416         }
3417         return (0);
3418 }
3419 
3420 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3421 
/*
 * fop_open() dispatches VOP_OPEN() and maintains the per-vnode
 * v_rdcnt/v_wrcnt open counts (kept on regular files only) that NFS
 * consults before granting delegations.  The filesystem's open may
 * replace *vpp with a different vnode, in which case the counts and the
 * cached v_path are migrated to the new vnode.
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;	/* saved in case vop_open() changes *vpp */

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		/* Undo the optimistic count bumps taken above. */
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp) {
			/* Move the counts from the old vnode to the new one. */
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3491 
3492 int
3493 fop_close(
3494         vnode_t *vp,
3495         int flag,
3496         int count,
3497         offset_t offset,
3498         cred_t *cr,
3499         caller_context_t *ct)
3500 {
3501         int err;
3502 
3503         VOPXID_MAP_CR(vp, cr);
3504 
3505         err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3506         VOPSTATS_UPDATE(vp, close);
3507         /*
3508          * Check passed in count to handle possible dups. Vnode counts are only
3509          * kept on regular files
3510          */
3511         if ((vp->v_type == VREG) && (count == 1))  {
3512                 if (flag & FREAD) {
3513                         ASSERT(vp->v_rdcnt > 0);
3514                         atomic_dec_32(&vp->v_rdcnt);
3515                 }
3516                 if (flag & FWRITE) {
3517                         ASSERT(vp->v_wrcnt > 0);
3518                         atomic_dec_32(&vp->v_wrcnt);
3519                 }
3520         }
3521         return (err);
3522 }
3523 
3524 int
3525 fop_read(
3526         vnode_t *vp,
3527         uio_t *uiop,
3528         int ioflag,
3529         cred_t *cr,
3530         caller_context_t *ct)
3531 {
3532         ssize_t resid_start = uiop->uio_resid;
3533         zone_t  *zonep = curzone;
3534         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3535 
3536         hrtime_t start = 0, lat;
3537         ssize_t len;
3538         int err;
3539 
3540         if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3541             vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3542                 start = gethrtime();
3543 
3544                 mutex_enter(&zonep->zone_vfs_lock);
3545                 kstat_runq_enter(&zonep->zone_vfs_rwstats);
3546                 mutex_exit(&zonep->zone_vfs_lock);
3547         }
3548 
3549         VOPXID_MAP_CR(vp, cr);
3550 
3551         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3552         len = resid_start - uiop->uio_resid;
3553 
3554         VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3555 
3556         if (start != 0) {
3557                 mutex_enter(&zonep->zone_vfs_lock);
3558                 zonep->zone_vfs_rwstats.reads++;
3559                 zonep->zone_vfs_rwstats.nread += len;
3560                 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3561                 mutex_exit(&zonep->zone_vfs_lock);
3562 
3563                 lat = gethrtime() - start;
3564 
3565                 if (lat >= VOP_LATENCY_10MS) {
3566                         if (lat < VOP_LATENCY_100MS)
3567                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3568                         else if (lat < VOP_LATENCY_1S) {
3569                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3570                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3571                         } else {
3572                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3573                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3574                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3575                         }
3576                 }
3577         }
3578 
3579         return (err);
3580 }
3581 
3582 int
3583 fop_write(
3584         vnode_t *vp,
3585         uio_t *uiop,
3586         int ioflag,
3587         cred_t *cr,
3588         caller_context_t *ct)
3589 {
3590         ssize_t resid_start = uiop->uio_resid;
3591         zone_t  *zonep = curzone;
3592         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3593 
3594         hrtime_t start = 0, lat;
3595         ssize_t len;
3596         int     err;
3597 
3598         /*
3599          * For the purposes of VFS kstat consumers, the "waitq" calculation is
3600          * repurposed as the active queue for VFS write operations.  There's no
3601          * actual wait queue for VFS operations.
3602          */
3603         if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3604             vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3605                 start = gethrtime();
3606 
3607                 mutex_enter(&zonep->zone_vfs_lock);
3608                 kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3609                 mutex_exit(&zonep->zone_vfs_lock);
3610         }
3611 
3612         VOPXID_MAP_CR(vp, cr);
3613 
3614         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3615         len = resid_start - uiop->uio_resid;
3616 
3617         VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3618 
3619         if (start != 0) {
3620                 mutex_enter(&zonep->zone_vfs_lock);
3621                 zonep->zone_vfs_rwstats.writes++;
3622                 zonep->zone_vfs_rwstats.nwritten += len;
3623                 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3624                 mutex_exit(&zonep->zone_vfs_lock);
3625 
3626                 lat = gethrtime() - start;
3627 
3628                 if (lat >= VOP_LATENCY_10MS) {
3629                         if (lat < VOP_LATENCY_100MS)
3630                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3631                         else if (lat < VOP_LATENCY_1S) {
3632                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3633                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3634                         } else {
3635                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3636                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3637                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3638                         }
3639                 }
3640         }
3641 
3642         return (err);
3643 }
3644 
3645 int
3646 fop_ioctl(
3647         vnode_t *vp,
3648         int cmd,
3649         intptr_t arg,
3650         int flag,
3651         cred_t *cr,
3652         int *rvalp,
3653         caller_context_t *ct)
3654 {
3655         int     err;
3656 
3657         VOPXID_MAP_CR(vp, cr);
3658 
3659         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3660         VOPSTATS_UPDATE(vp, ioctl);
3661         return (err);
3662 }
3663 
3664 int
3665 fop_setfl(
3666         vnode_t *vp,
3667         int oflags,
3668         int nflags,
3669         cred_t *cr,
3670         caller_context_t *ct)
3671 {
3672         int     err;
3673 
3674         VOPXID_MAP_CR(vp, cr);
3675 
3676         err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3677         VOPSTATS_UPDATE(vp, setfl);
3678         return (err);
3679 }
3680 
3681 int
3682 fop_getattr(
3683         vnode_t *vp,
3684         vattr_t *vap,
3685         int flags,
3686         cred_t *cr,
3687         caller_context_t *ct)
3688 {
3689         int     err;
3690 
3691         VOPXID_MAP_CR(vp, cr);
3692 
3693         /*
3694          * If this file system doesn't understand the xvattr extensions
3695          * then turn off the xvattr bit.
3696          */
3697         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3698                 vap->va_mask &= ~AT_XVATTR;
3699         }
3700 
3701         /*
3702          * We're only allowed to skip the ACL check iff we used a 32 bit
3703          * ACE mask with VOP_ACCESS() to determine permissions.
3704          */
3705         if ((flags & ATTR_NOACLCHECK) &&
3706             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3707                 return (EINVAL);
3708         }
3709         err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3710         VOPSTATS_UPDATE(vp, getattr);
3711         return (err);
3712 }
3713 
3714 int
3715 fop_setattr(
3716         vnode_t *vp,
3717         vattr_t *vap,
3718         int flags,
3719         cred_t *cr,
3720         caller_context_t *ct)
3721 {
3722         int     err;
3723 
3724         VOPXID_MAP_CR(vp, cr);
3725 
3726         /*
3727          * If this file system doesn't understand the xvattr extensions
3728          * then turn off the xvattr bit.
3729          */
3730         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3731                 vap->va_mask &= ~AT_XVATTR;
3732         }
3733 
3734         /*
3735          * We're only allowed to skip the ACL check iff we used a 32 bit
3736          * ACE mask with VOP_ACCESS() to determine permissions.
3737          */
3738         if ((flags & ATTR_NOACLCHECK) &&
3739             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3740                 return (EINVAL);
3741         }
3742         err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3743         VOPSTATS_UPDATE(vp, setattr);
3744         return (err);
3745 }
3746 
3747 int
3748 fop_access(
3749         vnode_t *vp,
3750         int mode,
3751         int flags,
3752         cred_t *cr,
3753         caller_context_t *ct)
3754 {
3755         int     err;
3756 
3757         if ((flags & V_ACE_MASK) &&
3758             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3759                 return (EINVAL);
3760         }
3761 
3762         VOPXID_MAP_CR(vp, cr);
3763 
3764         err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3765         VOPSTATS_UPDATE(vp, access);
3766         return (err);
3767 }
3768 
3769 int
3770 fop_lookup(
3771         vnode_t *dvp,
3772         char *nm,
3773         vnode_t **vpp,
3774         pathname_t *pnp,
3775         int flags,
3776         vnode_t *rdir,
3777         cred_t *cr,
3778         caller_context_t *ct,
3779         int *deflags,           /* Returned per-dirent flags */
3780         pathname_t *ppnp)       /* Returned case-preserved name in directory */
3781 {
3782         int ret;
3783 
3784         /*
3785          * If this file system doesn't support case-insensitive access
3786          * and said access is requested, fail quickly.  It is required
3787          * that if the vfs supports case-insensitive lookup, it also
3788          * supports extended dirent flags.
3789          */
3790         if (flags & FIGNORECASE &&
3791             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3792             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3793                 return (EINVAL);
3794 
3795         VOPXID_MAP_CR(dvp, cr);
3796 
3797         if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3798                 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3799         } else {
3800                 ret = (*(dvp)->v_op->vop_lookup)
3801                     (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3802         }
3803         if (ret == 0 && *vpp) {
3804                 VOPSTATS_UPDATE(*vpp, lookup);
3805                 vn_updatepath(dvp, *vpp, nm);
3806         }
3807 
3808         return (ret);
3809 }
3810 
3811 int
3812 fop_create(
3813         vnode_t *dvp,
3814         char *name,
3815         vattr_t *vap,
3816         vcexcl_t excl,
3817         int mode,
3818         vnode_t **vpp,
3819         cred_t *cr,
3820         int flags,
3821         caller_context_t *ct,
3822         vsecattr_t *vsecp)      /* ACL to set during create */
3823 {
3824         int ret;
3825 
3826         if (vsecp != NULL &&
3827             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3828                 return (EINVAL);
3829         }
3830         /*
3831          * If this file system doesn't support case-insensitive access
3832          * and said access is requested, fail quickly.
3833          */
3834         if (flags & FIGNORECASE &&
3835             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3836             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3837                 return (EINVAL);
3838 
3839         VOPXID_MAP_CR(dvp, cr);
3840 
3841         ret = (*(dvp)->v_op->vop_create)
3842             (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3843         if (ret == 0 && *vpp) {
3844                 VOPSTATS_UPDATE(*vpp, create);
3845                 vn_updatepath(dvp, *vpp, name);
3846         }
3847 
3848         return (ret);
3849 }
3850 
3851 int
3852 fop_remove(
3853         vnode_t *dvp,
3854         char *nm,
3855         cred_t *cr,
3856         caller_context_t *ct,
3857         int flags)
3858 {
3859         int     err;
3860 
3861         /*
3862          * If this file system doesn't support case-insensitive access
3863          * and said access is requested, fail quickly.
3864          */
3865         if (flags & FIGNORECASE &&
3866             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3867             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3868                 return (EINVAL);
3869 
3870         VOPXID_MAP_CR(dvp, cr);
3871 
3872         err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3873         VOPSTATS_UPDATE(dvp, remove);
3874         return (err);
3875 }
3876 
3877 int
3878 fop_link(
3879         vnode_t *tdvp,
3880         vnode_t *svp,
3881         char *tnm,
3882         cred_t *cr,
3883         caller_context_t *ct,
3884         int flags)
3885 {
3886         int     err;
3887 
3888         /*
3889          * If the target file system doesn't support case-insensitive access
3890          * and said access is requested, fail quickly.
3891          */
3892         if (flags & FIGNORECASE &&
3893             (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3894             vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3895                 return (EINVAL);
3896 
3897         VOPXID_MAP_CR(tdvp, cr);
3898 
3899         err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3900         VOPSTATS_UPDATE(tdvp, link);
3901         return (err);
3902 }
3903 
3904 int
3905 fop_rename(
3906         vnode_t *sdvp,
3907         char *snm,
3908         vnode_t *tdvp,
3909         char *tnm,
3910         cred_t *cr,
3911         caller_context_t *ct,
3912         int flags)
3913 {
3914         int     err;
3915 
3916         /*
3917          * If the file system involved does not support
3918          * case-insensitive access and said access is requested, fail
3919          * quickly.
3920          */
3921         if (flags & FIGNORECASE &&
3922             ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3923             vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3924                 return (EINVAL);
3925 
3926         VOPXID_MAP_CR(tdvp, cr);
3927 
3928         err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3929         VOPSTATS_UPDATE(sdvp, rename);
3930         return (err);
3931 }
3932 
3933 int
3934 fop_mkdir(
3935         vnode_t *dvp,
3936         char *dirname,
3937         vattr_t *vap,
3938         vnode_t **vpp,
3939         cred_t *cr,
3940         caller_context_t *ct,
3941         int flags,
3942         vsecattr_t *vsecp)      /* ACL to set during create */
3943 {
3944         int ret;
3945 
3946         if (vsecp != NULL &&
3947             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3948                 return (EINVAL);
3949         }
3950         /*
3951          * If this file system doesn't support case-insensitive access
3952          * and said access is requested, fail quickly.
3953          */
3954         if (flags & FIGNORECASE &&
3955             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3956             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3957                 return (EINVAL);
3958 
3959         VOPXID_MAP_CR(dvp, cr);
3960 
3961         ret = (*(dvp)->v_op->vop_mkdir)
3962             (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3963         if (ret == 0 && *vpp) {
3964                 VOPSTATS_UPDATE(*vpp, mkdir);
3965                 vn_updatepath(dvp, *vpp, dirname);
3966         }
3967 
3968         return (ret);
3969 }
3970 
3971 int
3972 fop_rmdir(
3973         vnode_t *dvp,
3974         char *nm,
3975         vnode_t *cdir,
3976         cred_t *cr,
3977         caller_context_t *ct,
3978         int flags)
3979 {
3980         int     err;
3981 
3982         /*
3983          * If this file system doesn't support case-insensitive access
3984          * and said access is requested, fail quickly.
3985          */
3986         if (flags & FIGNORECASE &&
3987             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3988             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3989                 return (EINVAL);
3990 
3991         VOPXID_MAP_CR(dvp, cr);
3992 
3993         err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3994         VOPSTATS_UPDATE(dvp, rmdir);
3995         return (err);
3996 }
3997 
3998 int
3999 fop_readdir(
4000         vnode_t *vp,
4001         uio_t *uiop,
4002         cred_t *cr,
4003         int *eofp,
4004         caller_context_t *ct,
4005         int flags)
4006 {
4007         int     err;
4008         ssize_t resid_start = uiop->uio_resid;
4009 
4010         /*
4011          * If this file system doesn't support retrieving directory
4012          * entry flags and said access is requested, fail quickly.
4013          */
4014         if (flags & V_RDDIR_ENTFLAGS &&
4015             vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
4016                 return (EINVAL);
4017 
4018         VOPXID_MAP_CR(vp, cr);
4019 
4020         err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
4021         VOPSTATS_UPDATE_IO(vp, readdir,
4022             readdir_bytes, (resid_start - uiop->uio_resid));
4023         return (err);
4024 }
4025 
4026 int
4027 fop_symlink(
4028         vnode_t *dvp,
4029         char *linkname,
4030         vattr_t *vap,
4031         char *target,
4032         cred_t *cr,
4033         caller_context_t *ct,
4034         int flags)
4035 {
4036         int     err;
4037         xvattr_t xvattr;
4038 
4039         /*
4040          * If this file system doesn't support case-insensitive access
4041          * and said access is requested, fail quickly.
4042          */
4043         if (flags & FIGNORECASE &&
4044             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
4045             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4046                 return (EINVAL);
4047 
4048         VOPXID_MAP_CR(dvp, cr);
4049 
4050         /* check for reparse point */
4051         if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
4052             (strncmp(target, FS_REPARSE_TAG_STR,
4053             strlen(FS_REPARSE_TAG_STR)) == 0)) {
4054                 if (!fs_reparse_mark(target, vap, &xvattr))
4055                         vap = (vattr_t *)&xvattr;
4056         }
4057 
4058         err = (*(dvp)->v_op->vop_symlink)
4059             (dvp, linkname, vap, target, cr, ct, flags);
4060         VOPSTATS_UPDATE(dvp, symlink);
4061         return (err);
4062 }
4063 
4064 int
4065 fop_readlink(
4066         vnode_t *vp,
4067         uio_t *uiop,
4068         cred_t *cr,
4069         caller_context_t *ct)
4070 {
4071         int     err;
4072 
4073         VOPXID_MAP_CR(vp, cr);
4074 
4075         err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
4076         VOPSTATS_UPDATE(vp, readlink);
4077         return (err);
4078 }
4079 
4080 int
4081 fop_fsync(
4082         vnode_t *vp,
4083         int syncflag,
4084         cred_t *cr,
4085         caller_context_t *ct)
4086 {
4087         int     err;
4088 
4089         VOPXID_MAP_CR(vp, cr);
4090 
4091         err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
4092         VOPSTATS_UPDATE(vp, fsync);
4093         return (err);
4094 }
4095 
4096 void
4097 fop_inactive(
4098         vnode_t *vp,
4099         cred_t *cr,
4100         caller_context_t *ct)
4101 {
4102         /* Need to update stats before vop call since we may lose the vnode */
4103         VOPSTATS_UPDATE(vp, inactive);
4104 
4105         VOPXID_MAP_CR(vp, cr);
4106 
4107         (*(vp)->v_op->vop_inactive)(vp, cr, ct);
4108 }
4109 
4110 int
4111 fop_fid(
4112         vnode_t *vp,
4113         fid_t *fidp,
4114         caller_context_t *ct)
4115 {
4116         int     err;
4117 
4118         err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4119         VOPSTATS_UPDATE(vp, fid);
4120         return (err);
4121 }
4122 
4123 int
4124 fop_rwlock(
4125         vnode_t *vp,
4126         int write_lock,
4127         caller_context_t *ct)
4128 {
4129         int     ret;
4130 
4131         ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4132         VOPSTATS_UPDATE(vp, rwlock);
4133         return (ret);
4134 }
4135 
4136 void
4137 fop_rwunlock(
4138         vnode_t *vp,
4139         int write_lock,
4140         caller_context_t *ct)
4141 {
4142         (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4143         VOPSTATS_UPDATE(vp, rwunlock);
4144 }
4145 
4146 int
4147 fop_seek(
4148         vnode_t *vp,
4149         offset_t ooff,
4150         offset_t *noffp,
4151         caller_context_t *ct)
4152 {
4153         int     err;
4154 
4155         err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4156         VOPSTATS_UPDATE(vp, seek);
4157         return (err);
4158 }
4159 
4160 int
4161 fop_cmp(
4162         vnode_t *vp1,
4163         vnode_t *vp2,
4164         caller_context_t *ct)
4165 {
4166         int     err;
4167 
4168         err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4169         VOPSTATS_UPDATE(vp1, cmp);
4170         return (err);
4171 }
4172 
4173 int
4174 fop_frlock(
4175         vnode_t *vp,
4176         int cmd,
4177         flock64_t *bfp,
4178         int flag,
4179         offset_t offset,
4180         struct flk_callback *flk_cbp,
4181         cred_t *cr,
4182         caller_context_t *ct)
4183 {
4184         int     err;
4185 
4186         VOPXID_MAP_CR(vp, cr);
4187 
4188         err = (*(vp)->v_op->vop_frlock)
4189             (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4190         VOPSTATS_UPDATE(vp, frlock);
4191         return (err);
4192 }
4193 
4194 int
4195 fop_space(
4196         vnode_t *vp,
4197         int cmd,
4198         flock64_t *bfp,
4199         int flag,
4200         offset_t offset,
4201         cred_t *cr,
4202         caller_context_t *ct)
4203 {
4204         int     err;
4205 
4206         VOPXID_MAP_CR(vp, cr);
4207 
4208         err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4209         VOPSTATS_UPDATE(vp, space);
4210         return (err);
4211 }
4212 
4213 int
4214 fop_realvp(
4215         vnode_t *vp,
4216         vnode_t **vpp,
4217         caller_context_t *ct)
4218 {
4219         int     err;
4220 
4221         err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4222         VOPSTATS_UPDATE(vp, realvp);
4223         return (err);
4224 }
4225 
4226 int
4227 fop_getpage(
4228         vnode_t *vp,
4229         offset_t off,
4230         size_t len,
4231         uint_t *protp,
4232         page_t **plarr,
4233         size_t plsz,
4234         struct seg *seg,
4235         caddr_t addr,
4236         enum seg_rw rw,
4237         cred_t *cr,
4238         caller_context_t *ct)
4239 {
4240         int     err;
4241 
4242         VOPXID_MAP_CR(vp, cr);
4243 
4244         err = (*(vp)->v_op->vop_getpage)
4245             (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4246         VOPSTATS_UPDATE(vp, getpage);
4247         return (err);
4248 }
4249 
4250 int
4251 fop_putpage(
4252         vnode_t *vp,
4253         offset_t off,
4254         size_t len,
4255         int flags,
4256         cred_t *cr,
4257         caller_context_t *ct)
4258 {
4259         int     err;
4260 
4261         VOPXID_MAP_CR(vp, cr);
4262 
4263         err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4264         VOPSTATS_UPDATE(vp, putpage);
4265         return (err);
4266 }
4267 
4268 int
4269 fop_map(
4270         vnode_t *vp,
4271         offset_t off,
4272         struct as *as,
4273         caddr_t *addrp,
4274         size_t len,
4275         uchar_t prot,
4276         uchar_t maxprot,
4277         uint_t flags,
4278         cred_t *cr,
4279         caller_context_t *ct)
4280 {
4281         int     err;
4282 
4283         VOPXID_MAP_CR(vp, cr);
4284 
4285         err = (*(vp)->v_op->vop_map)
4286             (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4287         VOPSTATS_UPDATE(vp, map);
4288         return (err);
4289 }
4290 
/*
 * fop_addmap() dispatches VOP_ADDMAP() and, on success for regular
 * files, adds the mapped page count to the per-vnode v_mmap_read /
 * v_mmap_write counters (the mmap analogue of v_rdcnt/v_wrcnt).
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* mapping size in pages */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			/*
			 * PROT_READ and PROT_EXEC both count against
			 * v_mmap_read; a mapping with both bits set adds
			 * delta twice.  fop_delmap() mirrors this, so the
			 * counters stay balanced across add/del.
			 */
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4340 
/*
 * fop_delmap() dispatches VOP_DELMAP() and, for regular files, subtracts
 * the unmapped page count from the per-vnode v_mmap_read / v_mmap_write
 * counters, mirroring the additions made in fop_addmap().
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* mapping size in pages */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		/* MAP_PRIVATE mappings were accounted as reads in addmap. */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			/*
			 * PROT_READ and PROT_EXEC both decrement
			 * v_mmap_read, matching fop_addmap()'s double
			 * increment when both bits are set.
			 */
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4395 
4396 
4397 int
4398 fop_poll(
4399         vnode_t *vp,
4400         short events,
4401         int anyyet,
4402         short *reventsp,
4403         struct pollhead **phpp,
4404         caller_context_t *ct)
4405 {
4406         int     err;
4407 
4408         err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4409         VOPSTATS_UPDATE(vp, poll);
4410         return (err);
4411 }
4412 
4413 int
4414 fop_dump(
4415         vnode_t *vp,
4416         caddr_t addr,
4417         offset_t lbdn,
4418         offset_t dblks,
4419         caller_context_t *ct)
4420 {
4421         int     err;
4422 
4423         /* ensure lbdn and dblks can be passed safely to bdev_dump */
4424         if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4425                 return (EIO);
4426 
4427         err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4428         VOPSTATS_UPDATE(vp, dump);
4429         return (err);
4430 }
4431 
4432 int
4433 fop_pathconf(
4434         vnode_t *vp,
4435         int cmd,
4436         ulong_t *valp,
4437         cred_t *cr,
4438         caller_context_t *ct)
4439 {
4440         int     err;
4441 
4442         VOPXID_MAP_CR(vp, cr);
4443 
4444         err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4445         VOPSTATS_UPDATE(vp, pathconf);
4446         return (err);
4447 }
4448 
4449 int
4450 fop_pageio(
4451         vnode_t *vp,
4452         struct page *pp,
4453         u_offset_t io_off,
4454         size_t io_len,
4455         int flags,
4456         cred_t *cr,
4457         caller_context_t *ct)
4458 {
4459         int     err;
4460 
4461         VOPXID_MAP_CR(vp, cr);
4462 
4463         err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4464         VOPSTATS_UPDATE(vp, pageio);
4465         return (err);
4466 }
4467 
4468 int
4469 fop_dumpctl(
4470         vnode_t *vp,
4471         int action,
4472         offset_t *blkp,
4473         caller_context_t *ct)
4474 {
4475         int     err;
4476         err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4477         VOPSTATS_UPDATE(vp, dumpctl);
4478         return (err);
4479 }
4480 
4481 void
4482 fop_dispose(
4483         vnode_t *vp,
4484         page_t *pp,
4485         int flag,
4486         int dn,
4487         cred_t *cr,
4488         caller_context_t *ct)
4489 {
4490         /* Must do stats first since it's possible to lose the vnode */
4491         VOPSTATS_UPDATE(vp, dispose);
4492 
4493         VOPXID_MAP_CR(vp, cr);
4494 
4495         (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4496 }
4497 
4498 int
4499 fop_setsecattr(
4500         vnode_t *vp,
4501         vsecattr_t *vsap,
4502         int flag,
4503         cred_t *cr,
4504         caller_context_t *ct)
4505 {
4506         int     err;
4507 
4508         VOPXID_MAP_CR(vp, cr);
4509 
4510         /*
4511          * We're only allowed to skip the ACL check iff we used a 32 bit
4512          * ACE mask with VOP_ACCESS() to determine permissions.
4513          */
4514         if ((flag & ATTR_NOACLCHECK) &&
4515             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4516                 return (EINVAL);
4517         }
4518         err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4519         VOPSTATS_UPDATE(vp, setsecattr);
4520         return (err);
4521 }
4522 
4523 int
4524 fop_getsecattr(
4525         vnode_t *vp,
4526         vsecattr_t *vsap,
4527         int flag,
4528         cred_t *cr,
4529         caller_context_t *ct)
4530 {
4531         int     err;
4532 
4533         /*
4534          * We're only allowed to skip the ACL check iff we used a 32 bit
4535          * ACE mask with VOP_ACCESS() to determine permissions.
4536          */
4537         if ((flag & ATTR_NOACLCHECK) &&
4538             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4539                 return (EINVAL);
4540         }
4541 
4542         VOPXID_MAP_CR(vp, cr);
4543 
4544         err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4545         VOPSTATS_UPDATE(vp, getsecattr);
4546         return (err);
4547 }
4548 
4549 int
4550 fop_shrlock(
4551         vnode_t *vp,
4552         int cmd,
4553         struct shrlock *shr,
4554         int flag,
4555         cred_t *cr,
4556         caller_context_t *ct)
4557 {
4558         int     err;
4559 
4560         VOPXID_MAP_CR(vp, cr);
4561 
4562         err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4563         VOPSTATS_UPDATE(vp, shrlock);
4564         return (err);
4565 }
4566 
4567 int
4568 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4569     caller_context_t *ct)
4570 {
4571         int     err;
4572 
4573         err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4574         VOPSTATS_UPDATE(vp, vnevent);
4575         return (err);
4576 }
4577 
4578 int
4579 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4580     caller_context_t *ct)
4581 {
4582         int err;
4583 
4584         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4585                 return (ENOTSUP);
4586         err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4587         VOPSTATS_UPDATE(vp, reqzcbuf);
4588         return (err);
4589 }
4590 
4591 int
4592 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4593 {
4594         int err;
4595 
4596         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4597                 return (ENOTSUP);
4598         err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4599         VOPSTATS_UPDATE(vp, retzcbuf);
4600         return (err);
4601 }
4602 
4603 /*
4604  * Default destructor
4605  *      Needed because NULL destructor means that the key is unused
4606  */
4607 /* ARGSUSED */
4608 void
4609 vsd_defaultdestructor(void *value)
4610 {}
4611 
4612 /*
4613  * Create a key (index into per vnode array)
4614  *      Locks out vsd_create, vsd_destroy, and vsd_free
4615  *      May allocate memory with lock held
4616  */
4617 void
4618 vsd_create(uint_t *keyp, void (*destructor)(void *))
4619 {
4620         int     i;
4621         uint_t  nkeys;
4622 
4623         /*
4624          * if key is allocated, do nothing
4625          */
4626         mutex_enter(&vsd_lock);
4627         if (*keyp) {
4628                 mutex_exit(&vsd_lock);
4629                 return;
4630         }
4631         /*
4632          * find an unused key
4633          */
4634         if (destructor == NULL)
4635                 destructor = vsd_defaultdestructor;
4636 
4637         for (i = 0; i < vsd_nkeys; ++i)
4638                 if (vsd_destructor[i] == NULL)
4639                         break;
4640 
4641         /*
4642          * if no unused keys, increase the size of the destructor array
4643          */
4644         if (i == vsd_nkeys) {
4645                 if ((nkeys = (vsd_nkeys << 1)) == 0)
4646                         nkeys = 1;
4647                 vsd_destructor =
4648                     (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4649                     (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4650                     (size_t)(nkeys * sizeof (void (*)(void *))));
4651                 vsd_nkeys = nkeys;
4652         }
4653 
4654         /*
4655          * allocate the next available unused key
4656          */
4657         vsd_destructor[i] = destructor;
4658         *keyp = i + 1;
4659 
4660         /* create vsd_list, if it doesn't exist */
4661         if (vsd_list == NULL) {
4662                 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4663                 list_create(vsd_list, sizeof (struct vsd_node),
4664                     offsetof(struct vsd_node, vs_nodes));
4665         }
4666 
4667         mutex_exit(&vsd_lock);
4668 }
4669 
4670 /*
4671  * Destroy a key
4672  *
4673  * Assumes that the caller is preventing vsd_set and vsd_get
4674  * Locks out vsd_create, vsd_destroy, and vsd_free
4675  * May free memory with lock held
4676  */
4677 void
4678 vsd_destroy(uint_t *keyp)
4679 {
4680         uint_t key;
4681         struct vsd_node *vsd;
4682 
4683         /*
4684          * protect the key namespace and our destructor lists
4685          */
4686         mutex_enter(&vsd_lock);
4687         key = *keyp;
4688         *keyp = 0;
4689 
4690         ASSERT(key <= vsd_nkeys);
4691 
4692         /*
4693          * if the key is valid
4694          */
4695         if (key != 0) {
4696                 uint_t k = key - 1;
4697                 /*
4698                  * for every vnode with VSD, call key's destructor
4699                  */
4700                 for (vsd = list_head(vsd_list); vsd != NULL;
4701                     vsd = list_next(vsd_list, vsd)) {
4702                         /*
4703                          * no VSD for key in this vnode
4704                          */
4705                         if (key > vsd->vs_nkeys)
4706                                 continue;
4707                         /*
4708                          * call destructor for key
4709                          */
4710                         if (vsd->vs_value[k] && vsd_destructor[k])
4711                                 (*vsd_destructor[k])(vsd->vs_value[k]);
4712                         /*
4713                          * reset value for key
4714                          */
4715                         vsd->vs_value[k] = NULL;
4716                 }
4717                 /*
4718                  * actually free the key (NULL destructor == unused)
4719                  */
4720                 vsd_destructor[k] = NULL;
4721         }
4722 
4723         mutex_exit(&vsd_lock);
4724 }
4725 
4726 /*
4727  * Quickly return the per vnode value that was stored with the specified key
4728  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4729  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4730  */
4731 void *
4732 vsd_get(vnode_t *vp, uint_t key)
4733 {
4734         struct vsd_node *vsd;
4735 
4736         ASSERT(vp != NULL);
4737         ASSERT(mutex_owned(&vp->v_vsd_lock));
4738 
4739         vsd = vp->v_vsd;
4740 
4741         if (key && vsd != NULL && key <= vsd->vs_nkeys)
4742                 return (vsd->vs_value[key - 1]);
4743         return (NULL);
4744 }
4745 
4746 /*
4747  * Set a per vnode value indexed with the specified key
4748  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4749  */
4750 int
4751 vsd_set(vnode_t *vp, uint_t key, void *value)
4752 {
4753         struct vsd_node *vsd;
4754 
4755         ASSERT(vp != NULL);
4756         ASSERT(mutex_owned(&vp->v_vsd_lock));
4757 
4758         if (key == 0)
4759                 return (EINVAL);
4760 
4761         vsd = vp->v_vsd;
4762         if (vsd == NULL)
4763                 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4764 
4765         /*
4766          * If the vsd was just allocated, vs_nkeys will be 0, so the following
4767          * code won't happen and we will continue down and allocate space for
4768          * the vs_value array.
4769          * If the caller is replacing one value with another, then it is up
4770          * to the caller to free/rele/destroy the previous value (if needed).
4771          */
4772         if (key <= vsd->vs_nkeys) {
4773                 vsd->vs_value[key - 1] = value;
4774                 return (0);
4775         }
4776 
4777         ASSERT(key <= vsd_nkeys);
4778 
4779         if (vsd->vs_nkeys == 0) {
4780                 mutex_enter(&vsd_lock);     /* lock out vsd_destroy() */
4781                 /*
4782                  * Link onto list of all VSD nodes.
4783                  */
4784                 list_insert_head(vsd_list, vsd);
4785                 mutex_exit(&vsd_lock);
4786         }
4787 
4788         /*
4789          * Allocate vnode local storage and set the value for key
4790          */
4791         vsd->vs_value = vsd_realloc(vsd->vs_value,
4792             vsd->vs_nkeys * sizeof (void *),
4793             key * sizeof (void *));
4794         vsd->vs_nkeys = key;
4795         vsd->vs_value[key - 1] = value;
4796 
4797         return (0);
4798 }
4799 
4800 /*
4801  * Called from vn_free() to run the destructor function for each vsd
4802  *      Locks out vsd_create and vsd_destroy
4803  *      Assumes that the destructor *DOES NOT* use vsd
4804  */
4805 void
4806 vsd_free(vnode_t *vp)
4807 {
4808         int i;
4809         struct vsd_node *vsd = vp->v_vsd;
4810 
4811         if (vsd == NULL)
4812                 return;
4813 
4814         if (vsd->vs_nkeys == 0) {
4815                 kmem_free(vsd, sizeof (*vsd));
4816                 vp->v_vsd = NULL;
4817                 return;
4818         }
4819 
4820         /*
4821          * lock out vsd_create and vsd_destroy, call
4822          * the destructor, and mark the value as destroyed.
4823          */
4824         mutex_enter(&vsd_lock);
4825 
4826         for (i = 0; i < vsd->vs_nkeys; i++) {
4827                 if (vsd->vs_value[i] && vsd_destructor[i])
4828                         (*vsd_destructor[i])(vsd->vs_value[i]);
4829                 vsd->vs_value[i] = NULL;
4830         }
4831 
4832         /*
4833          * remove from linked list of VSD nodes
4834          */
4835         list_remove(vsd_list, vsd);
4836 
4837         mutex_exit(&vsd_lock);
4838 
4839         /*
4840          * free up the VSD
4841          */
4842         kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4843         kmem_free(vsd, sizeof (struct vsd_node));
4844         vp->v_vsd = NULL;
4845 }
4846 
4847 /*
4848  * realloc
4849  */
4850 static void *
4851 vsd_realloc(void *old, size_t osize, size_t nsize)
4852 {
4853         void *new;
4854 
4855         new = kmem_zalloc(nsize, KM_SLEEP);
4856         if (old) {
4857                 bcopy(old, new, osize);
4858                 kmem_free(old, osize);
4859         }
4860         return (new);
4861 }
4862 
4863 /*
4864  * Setup the extensible system attribute for creating a reparse point.
4865  * The symlink data 'target' is validated for proper format of a reparse
4866  * string and a check also made to make sure the symlink data does not
4867  * point to an existing file.
4868  *
4869  * return 0 if ok else -1.
4870  */
4871 static int
4872 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4873 {
4874         xoptattr_t *xoap;
4875 
4876         if ((!target) || (!vap) || (!xvattr))
4877                 return (-1);
4878 
4879         /* validate reparse string */
4880         if (reparse_validate((const char *)target))
4881                 return (-1);
4882 
4883         xva_init(xvattr);
4884         xvattr->xva_vattr = *vap;
4885         xvattr->xva_vattr.va_mask |= AT_XVATTR;
4886         xoap = xva_getxoptattr(xvattr);
4887         ASSERT(xoap);
4888         XVA_SET_REQ(xvattr, XAT_REPARSE);
4889         xoap->xoa_reparse = 1;
4890 
4891         return (0);
4892 }
4893 
4894 /*
4895  * Function to check whether a symlink is a reparse point.
4896  * Return B_TRUE if it is a reparse point, else return B_FALSE
4897  */
4898 boolean_t
4899 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4900 {
4901         xvattr_t xvattr;
4902         xoptattr_t *xoap;
4903 
4904         if ((vp->v_type != VLNK) ||
4905             !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4906                 return (B_FALSE);
4907 
4908         xva_init(&xvattr);
4909         xoap = xva_getxoptattr(&xvattr);
4910         ASSERT(xoap);
4911         XVA_SET_REQ(&xvattr, XAT_REPARSE);
4912 
4913         if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4914                 return (B_FALSE);
4915 
4916         if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4917             (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4918                 return (B_FALSE);
4919 
4920         return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4921 }