1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2020 Joyent, Inc.
  25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  27  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  28  */
  29 
  30 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  31 /*        All Rights Reserved   */
  32 
  33 /*
  34  * University Copyright- Copyright (c) 1982, 1986, 1988
  35  * The Regents of the University of California
  36  * All Rights Reserved
  37  *
  38  * University Acknowledgment- Portions of this document are derived from
  39  * software developed by the University of California, Berkeley, and its
  40  * contributors.
  41  */
  42 
  43 #include <sys/types.h>
  44 #include <sys/param.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/errno.h>
  47 #include <sys/cred.h>
  48 #include <sys/user.h>
  49 #include <sys/uio.h>
  50 #include <sys/file.h>
  51 #include <sys/pathname.h>
  52 #include <sys/vfs.h>
  53 #include <sys/vfs_opreg.h>
  54 #include <sys/vnode.h>
  55 #include <sys/filio.h>
  56 #include <sys/rwstlock.h>
  57 #include <sys/fem.h>
  58 #include <sys/stat.h>
  59 #include <sys/mode.h>
  60 #include <sys/conf.h>
  61 #include <sys/sysmacros.h>
  62 #include <sys/cmn_err.h>
  63 #include <sys/systm.h>
  64 #include <sys/kmem.h>
  65 #include <sys/debug.h>
  66 #include <c2/audit.h>
  67 #include <sys/acl.h>
  68 #include <sys/nbmlock.h>
  69 #include <sys/fcntl.h>
  70 #include <fs/fs_subr.h>
  71 #include <sys/taskq.h>
  72 #include <fs/fs_reparse.h>
  73 #include <sys/time.h>
  74 #include <sys/sdt.h>
  75 
  76 /* Determine if this vnode is a file that is read-only */
  77 #define ISROFILE(vp)    \
  78         ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  79             (vp)->v_type != VFIFO && vn_is_readonly(vp))
  80 
  81 /* Tunable via /etc/system; used only by admin/install */
  82 int nfs_global_client_only;
  83 
  84 /*
  85  * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  86  * number of entries as and parallel to the vfssw table.  (Arguably, it could
  87  * be part of the vfssw table.)  Once it's initialized, it's accessed using
  88  * the same fstype index that is used to index into the vfssw table.
  89  */
  90 vopstats_t **vopstats_fstype;
  91 
  92 /* vopstats initialization template used for fast initialization via bcopy() */
  93 static vopstats_t *vs_templatep;
  94 
  95 /* Kmem cache handle for vsk_anchor_t allocations */
  96 kmem_cache_t *vsk_anchor_cache;
  97 
  98 /* file events cleanup routine */
  99 extern void free_fopdata(vnode_t *);
 100 
 101 /*
 102  * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 103  * updates to vsktat_tree.
 104  */
 105 avl_tree_t      vskstat_tree;
 106 kmutex_t        vskstat_tree_lock;
 107 
 108 /* Global variable which enables/disables the vopstats collection */
 109 int vopstats_enabled = 1;
 110 
 111 /* Global used for empty/invalid v_path */
 112 char *vn_vpath_empty = "";
 113 
 114 /*
 115  * forward declarations for internal vnode specific data (vsd)
 116  */
 117 static void *vsd_realloc(void *, size_t, size_t);
 118 
 119 /*
 120  * forward declarations for reparse point functions
 121  */
 122 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 123 
 124 /*
 125  * VSD -- VNODE SPECIFIC DATA
 126  * The v_data pointer is typically used by a file system to store a
 127  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 128  * However, there are times when additional project private data needs
 129  * to be stored separately from the data (node) pointed to by v_data.
 130  * This additional data could be stored by the file system itself or
 131  * by a completely different kernel entity.  VSD provides a way for
 132  * callers to obtain a key and store a pointer to private data associated
 133  * with a vnode.
 134  *
 135  * Callers are responsible for protecting the vsd by holding v_vsd_lock
 136  * for calls to vsd_set() and vsd_get().
 137  */
 138 
 139 /*
 140  * vsd_lock protects:
 141  *   vsd_nkeys - creation and deletion of vsd keys
 142  *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 143  *   vsd_destructor - adding and removing destructors to the list
 144  */
 145 static kmutex_t         vsd_lock;
 146 static uint_t           vsd_nkeys;       /* size of destructor array */
 147 /* list of vsd_node's */
 148 static list_t *vsd_list = NULL;
 149 /* per-key destructor funcs */
 150 static void             (**vsd_destructor)(void *);
 151 
 152 /*
 153  * The following is the common set of actions needed to update the
 154  * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 155  * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 156  * recording of the bytes transferred.  Since the code is similar
 157  * but small, it is nearly a duplicate.  Consequently any changes
 158  * to one may need to be reflected in the other.
 159  * Rundown of the variables:
 160  * vp - Pointer to the vnode
 161  * counter - Partial name structure member to update in vopstats for counts
 162  * bytecounter - Partial name structure member to update in vopstats for bytes
 163  * bytesval - Value to update in vopstats for bytes
 164  * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 165  * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 166  */
 167 
 168 #define VOPSTATS_UPDATE(vp, counter) {                                  \
 169         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 170         if (vfsp && vfsp->vfs_implp &&                                       \
 171             (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
 172                 vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
 173                 uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
 174                 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 175                     size_t, uint64_t *);                                \
 176                 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
 177                 (*stataddr)++;                                          \
 178                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 179                         vsp->n##counter.value.ui64++;                        \
 180                 }                                                       \
 181         }                                                               \
 182 }
 183 
 184 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
 185         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 186         if (vfsp && vfsp->vfs_implp &&                                       \
 187             (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
 188                 vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
 189                 uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
 190                 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 191                     size_t, uint64_t *);                                \
 192                 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 193                 (*stataddr)++;                                          \
 194                 vsp->bytecounter.value.ui64 += bytesval;             \
 195                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 196                         vsp->n##counter.value.ui64++;                        \
 197                         vsp->bytecounter.value.ui64 += bytesval;     \
 198                 }                                                       \
 199         }                                                               \
 200 }
 201 
 202 /*
 203  * If the filesystem does not support XIDs map credential
 204  * If the vfsp is NULL, perhaps we should also map?
 205  */
 206 #define VOPXID_MAP_CR(vp, cr)   {                                       \
 207         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 208         if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
 209                 cr = crgetmapped(cr);                                   \
 210         }
 211 
 212 #define VOP_LATENCY_10MS        10000000
 213 #define VOP_LATENCY_100MS       100000000
 214 #define VOP_LATENCY_1S          1000000000
 215 #define VOP_LATENCY_10S         10000000000
 216 
 217 /*
 218  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 219  * numerical order of S_IFMT and vnode types.)
 220  */
enum vtype iftovt_tab[] = {
	/*
	 * Indexed by the S_IFMT file-type bits of a stat(2) mode (see the
	 * IFTOVT() macro); slots with no corresponding vnode type map to
	 * VNON.
	 */
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
 225 
ushort_t vttoif_tab[] = {
	/*
	 * Indexed by enum vtype (VNON, VREG, VDIR, ...); yields the
	 * corresponding S_IFMT bits, 0 where no stat(2) format exists
	 * (see the VTTOIF() macro).
	 */
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
 230 
 231 /*
 232  * The system vnode cache.
 233  */
 234 
 235 kmem_cache_t *vn_cache;
 236 
 237 
 238 /*
 239  * Vnode operations vector.
 240  */
 241 
static const fs_operation_trans_def_t vn_ops_table[] = {
	/*
	 * Each entry: the operation's name, the offset of its slot in
	 * struct vnodeops, then the two fallback routines used when a
	 * file system does not supply its own (see
	 * fs_operation_trans_def_t for the exact field semantics).
	 */
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,
	    (fs_generic_func_p)(uintptr_t)fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p)(uintptr_t)fs_dispose,
	    (fs_generic_func_p)(uintptr_t)fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
	    fs_nosys, fs_nosys,

	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
	    fs_nosys, fs_nosys,

	/* Sentinel entry terminating the table */
	NULL, 0, NULL, NULL
};
 388 
 389 /* Extensible attribute (xva) routines. */
 390 
 391 /*
 392  * Zero out the structure, set the size of the requested/returned bitmaps,
 393  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 394  * to the returned attributes array.
 395  */
 396 void
 397 xva_init(xvattr_t *xvap)
 398 {
 399         bzero(xvap, sizeof (xvattr_t));
 400         xvap->xva_mapsize = XVA_MAPSIZE;
 401         xvap->xva_magic = XVA_MAGIC;
 402         xvap->xva_vattr.va_mask = AT_XVATTR;
 403         xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 404 }
 405 
 406 /*
 407  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 408  * structure.  Otherwise, returns NULL.
 409  */
 410 xoptattr_t *
 411 xva_getxoptattr(xvattr_t *xvap)
 412 {
 413         xoptattr_t *xoap = NULL;
 414         if (xvap->xva_vattr.va_mask & AT_XVATTR)
 415                 xoap = &xvap->xva_xoptattrs;
 416         return (xoap);
 417 }
 418 
 419 /*
 420  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 421  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 422  * kstat name.
 423  */
 424 static int
 425 vska_compar(const void *n1, const void *n2)
 426 {
 427         int ret;
 428         ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 429         ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 430 
 431         if (p1 < p2) {
 432                 ret = -1;
 433         } else if (p1 > p2) {
 434                 ret = 1;
 435         } else {
 436                 ret = 0;
 437         }
 438 
 439         return (ret);
 440 }
 441 
 442 /*
 443  * Used to create a single template which will be bcopy()ed to a newly
 444  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 445  */
 446 static vopstats_t *
 447 create_vopstats_template()
 448 {
 449         vopstats_t              *vsp;
 450 
 451         vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 452         bzero(vsp, sizeof (*vsp));      /* Start fresh */
 453 
 454         /* VOP_OPEN */
 455         kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 456         /* VOP_CLOSE */
 457         kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 458         /* VOP_READ I/O */
 459         kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 460         kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 461         /* VOP_WRITE I/O */
 462         kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 463         kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 464         /* VOP_IOCTL */
 465         kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 466         /* VOP_SETFL */
 467         kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 468         /* VOP_GETATTR */
 469         kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 470         /* VOP_SETATTR */
 471         kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 472         /* VOP_ACCESS */
 473         kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 474         /* VOP_LOOKUP */
 475         kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 476         /* VOP_CREATE */
 477         kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 478         /* VOP_REMOVE */
 479         kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 480         /* VOP_LINK */
 481         kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 482         /* VOP_RENAME */
 483         kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 484         /* VOP_MKDIR */
 485         kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 486         /* VOP_RMDIR */
 487         kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 488         /* VOP_READDIR I/O */
 489         kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 490         kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 491             KSTAT_DATA_UINT64);
 492         /* VOP_SYMLINK */
 493         kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 494         /* VOP_READLINK */
 495         kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 496         /* VOP_FSYNC */
 497         kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 498         /* VOP_INACTIVE */
 499         kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 500         /* VOP_FID */
 501         kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 502         /* VOP_RWLOCK */
 503         kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 504         /* VOP_RWUNLOCK */
 505         kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 506         /* VOP_SEEK */
 507         kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 508         /* VOP_CMP */
 509         kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 510         /* VOP_FRLOCK */
 511         kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 512         /* VOP_SPACE */
 513         kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 514         /* VOP_REALVP */
 515         kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 516         /* VOP_GETPAGE */
 517         kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 518         /* VOP_PUTPAGE */
 519         kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 520         /* VOP_MAP */
 521         kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 522         /* VOP_ADDMAP */
 523         kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 524         /* VOP_DELMAP */
 525         kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 526         /* VOP_POLL */
 527         kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 528         /* VOP_DUMP */
 529         kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 530         /* VOP_PATHCONF */
 531         kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 532         /* VOP_PAGEIO */
 533         kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 534         /* VOP_DUMPCTL */
 535         kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 536         /* VOP_DISPOSE */
 537         kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 538         /* VOP_SETSECATTR */
 539         kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 540         /* VOP_GETSECATTR */
 541         kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 542         /* VOP_SHRLOCK */
 543         kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 544         /* VOP_VNEVENT */
 545         kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 546         /* VOP_REQZCBUF */
 547         kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 548         /* VOP_RETZCBUF */
 549         kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 550 
 551         return (vsp);
 552 }
 553 
 554 /*
 555  * Creates a kstat structure associated with a vopstats structure.
 556  */
 557 kstat_t *
 558 new_vskstat(char *ksname, vopstats_t *vsp)
 559 {
 560         kstat_t         *ksp;
 561 
 562         if (!vopstats_enabled) {
 563                 return (NULL);
 564         }
 565 
 566         ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 567             sizeof (vopstats_t)/sizeof (kstat_named_t),
 568             KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 569         if (ksp) {
 570                 ksp->ks_data = vsp;
 571                 kstat_install(ksp);
 572         }
 573 
 574         return (ksp);
 575 }
 576 
 577 /*
 578  * Called from vfsinit() to initialize the support mechanisms for vopstats
 579  */
 580 void
 581 vopstats_startup()
 582 {
 583         if (!vopstats_enabled)
 584                 return;
 585 
 586         /*
 587          * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 588          * is necessary since we need to check if a kstat exists before we
 589          * attempt to create it.  Also, initialize its lock.
 590          */
 591         avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 592             offsetof(vsk_anchor_t, vsk_node));
 593         mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 594 
 595         vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 596             sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 597             NULL, NULL, 0);
 598 
 599         /*
 600          * Set up the array of pointers for the vopstats-by-FS-type.
 601          * The entries will be allocated/initialized as each file system
 602          * goes through modload/mod_installfs.
 603          */
 604         vopstats_fstype = (vopstats_t **)kmem_zalloc(
 605             (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 606 
 607         /* Set up the global vopstats initialization template */
 608         vs_templatep = create_vopstats_template();
 609 }
 610 
 611 /*
 612  * We need to have the all of the counters zeroed.
 613  * The initialization of the vopstats_t includes on the order of
 614  * 50 calls to kstat_named_init().  Rather that do that on every call,
 615  * we do it once in a template (vs_templatep) then bcopy it over.
 616  */
 617 void
 618 initialize_vopstats(vopstats_t *vsp)
 619 {
 620         if (vsp == NULL)
 621                 return;
 622 
 623         bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 624 }
 625 
 626 /*
 627  * If possible, determine which vopstats by fstype to use and
 628  * return a pointer to the caller.
 629  */
 630 vopstats_t *
 631 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 632 {
 633         int             fstype = 0;     /* Index into vfssw[] */
 634         vopstats_t      *vsp = NULL;
 635 
 636         if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 637             !vopstats_enabled)
 638                 return (NULL);
 639         /*
 640          * Set up the fstype.  We go to so much trouble because all versions
 641          * of NFS use the same fstype in their vfs even though they have
 642          * distinct entries in the vfssw[] table.
 643          * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 644          */
 645         if (vswp) {
 646                 fstype = vswp - vfssw;  /* Gets us the index */
 647         } else {
 648                 fstype = vfsp->vfs_fstype;
 649         }
 650 
 651         /*
 652          * Point to the per-fstype vopstats. The only valid values are
 653          * non-zero positive values less than the number of vfssw[] table
 654          * entries.
 655          */
 656         if (fstype > 0 && fstype < nfstype) {
 657                 vsp = vopstats_fstype[fstype];
 658         }
 659 
 660         return (vsp);
 661 }
 662 
 663 /*
 664  * Generate a kstat name, create the kstat structure, and allocate a
 665  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 666  * to the caller.  This must only be called from a mount.
 667  */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
	kstat_t		*ksp;			/* Ptr to new kstat */
	avl_index_t	where;			/* Location in the AVL tree */

	/* Only vfs instances that keep stats get an anchor. */
	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		/*
		 * Insert the anchor under the tree lock if this fsid is
		 * not already present, then drop the lock before the
		 * (potentially slower) kstat creation.
		 */
		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	/* NULL on disabled stats, statvfs failure, or duplicate fsid. */
	return (vskp);
}
 714 
 715 /*
 716  * We're in the process of tearing down the vfs and need to cleanup
 717  * the data structures associated with the vopstats. Must only be called
 718  * from dounmount().
 719  */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	/* Mirror the guards used when the anchor was created. */
	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	/* Delete the kstat (if one was created) before dropping the lock. */
	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	/* Anchor is out of the tree; safe to return it to the cache. */
	kmem_cache_free(vsk_anchor_cache, vskap);
}
 750 
 751 /*
 752  * Read or write a vnode.  Called from kernel code.
 753  */
int
vn_rdwr(
	enum uio_rw rw,		/* UIO_READ or UIO_WRITE */
	struct vnode *vp,	/* vnode to transfer to/from */
	caddr_t base,		/* caller's buffer */
	ssize_t len,		/* number of bytes to transfer */
	offset_t offset,	/* starting file offset */
	enum uio_seg seg,	/* UIO_SYSSPACE or UIO_USERSPACE base */
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)	/* if non-NULL, receives untransferred count */
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;

	/* Writes to read-only regular files (not devices/FIFOs) fail. */
	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	/* A negative length is a caller bug; report it as an I/O error. */
	if (len < 0)
		return (EIO);

	/* Map the credential if the file system lacks XID support. */
	VOPXID_MAP_CR(vp, cr);

	/* Build a single-segment uio describing the transfer. */
	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		/* A conflicting NBMAND lock or mapping denies the I/O. */
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* Take the vnode's rwlock in the direction matching the I/O. */
	(void) VOP_RWLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
	}
	VOP_RWUNLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	/*
	 * Report the residual count to the caller if requested;
	 * otherwise a short transfer is treated as an error.
	 */
	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
 831 
 832 /*
 833  * Release a vnode.  Call VOP_INACTIVE on last reference or
 834  * decrement reference count.
 835  *
 836  * To avoid race conditions, the v_count is left at 1 for
 837  * the call to VOP_INACTIVE. This prevents another thread
 838  * from reclaiming and releasing the vnode *before* the
 839  * VOP_INACTIVE routine has a chance to destroy the vnode.
 840  * We can't have more than 1 thread calling VOP_INACTIVE
 841  * on a vnode.
 842  */
 843 void
 844 vn_rele(vnode_t *vp)
 845 {
 846         VERIFY(vp->v_count > 0);
 847         mutex_enter(&vp->v_lock);
 848         if (vp->v_count == 1) {
 849                 mutex_exit(&vp->v_lock);
 850                 VOP_INACTIVE(vp, CRED(), NULL);
 851                 return;
 852         }
 853         VN_RELE_LOCKED(vp);
 854         mutex_exit(&vp->v_lock);
 855 }
 856 
/*
 * Release a phantom hold on a vnode.  Phantom holds are counted in both
 * v_count and v_phantom_count (see vn_count()); this drops one of each.
 * As in vn_rele(), the final overall hold triggers VOP_INACTIVE with
 * v_count still at 1.
 */
void
vn_phantom_rele(vnode_t *vp)
{
	VERIFY(vp->v_count > 0);

	mutex_enter(&vp->v_lock);
	/* Phantom holds can never outnumber total holds. */
	VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
	vp->v_phantom_count--;
	DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
	if (vp->v_count == 1) {
		/* The last remaining hold must not itself be a phantom hold. */
		ASSERT0(vp->v_phantom_count);
		mutex_exit(&vp->v_lock);
		VOP_INACTIVE(vp, CRED(), NULL);
		return;
	}
	VN_RELE_LOCKED(vp);
	mutex_exit(&vp->v_lock);
}
 875 
 876 /*
 877  * Return the number of non-phantom holds. Things such as portfs will use
 878  * phantom holds to prevent it from blocking filesystems from mounting over
 879  * watched directories.
 880  */
 881 uint_t
 882 vn_count(vnode_t *vp)
 883 {
 884         ASSERT(MUTEX_HELD(&vp->v_lock));
 885         return (vp->v_count - vp->v_phantom_count);
 886 }
 887 
 888 /*
 889  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 890  * as a single reference, so v_count is not decremented until the last DNLC hold
 891  * is released. This makes it possible to distinguish vnodes that are referenced
 892  * only by the DNLC.
 893  */
 894 void
 895 vn_rele_dnlc(vnode_t *vp)
 896 {
 897         VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 898         mutex_enter(&vp->v_lock);
 899         if (--vp->v_count_dnlc == 0) {
 900                 if (vp->v_count == 1) {
 901                         mutex_exit(&vp->v_lock);
 902                         VOP_INACTIVE(vp, CRED(), NULL);
 903                         return;
 904                 }
 905                 VN_RELE_LOCKED(vp);
 906         }
 907         mutex_exit(&vp->v_lock);
 908 }
 909 
 910 /*
 911  * Like vn_rele() except that it clears v_stream under v_lock.
 912  * This is used by sockfs when it dismantles the association between
 913  * the sockfs node and the vnode in the underlying file system.
 914  * v_lock has to be held to prevent a thread coming through the lookupname
 915  * path from accessing a stream head that is going away.
 916  */
 917 void
 918 vn_rele_stream(vnode_t *vp)
 919 {
 920         VERIFY(vp->v_count > 0);
 921         mutex_enter(&vp->v_lock);
 922         vp->v_stream = NULL;
 923         if (vp->v_count == 1) {
 924                 mutex_exit(&vp->v_lock);
 925                 VOP_INACTIVE(vp, CRED(), NULL);
 926                 return;
 927         }
 928         VN_RELE_LOCKED(vp);
 929         mutex_exit(&vp->v_lock);
 930 }
 931 
/*
 * Taskq callback used by vn_rele_async() to run the final-hold
 * VOP_INACTIVE processing outside the releasing thread's context.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 937 
 938 /*
 939  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 940  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 941  * the file system as a result of releasing the vnode. Note, file systems
 942  * already have to handle the race where the vnode is incremented before the
 943  * inactive routine is called and does its locking.
 944  *
 945  * Warning: Excessive use of this routine can lead to performance problems.
 946  * This is because taskqs throttle back allocation if too many are created.
 947  */
 948 void
 949 vn_rele_async(vnode_t *vp, taskq_t *taskq)
 950 {
 951         VERIFY(vp->v_count > 0);
 952         mutex_enter(&vp->v_lock);
 953         if (vp->v_count == 1) {
 954                 mutex_exit(&vp->v_lock);
 955                 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 956                     vp, TQ_SLEEP) != TASKQID_INVALID);
 957                 return;
 958         }
 959         VN_RELE_LOCKED(vp);
 960         mutex_exit(&vp->v_lock);
 961 }
 962 
 963 int
 964 vn_open(
 965         char *pnamep,
 966         enum uio_seg seg,
 967         int filemode,
 968         int createmode,
 969         struct vnode **vpp,
 970         enum create crwhy,
 971         mode_t umask)
 972 {
 973         return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 974             umask, NULL, -1));
 975 }
 976 
 977 
 978 /*
 979  * Open/create a vnode.
 980  * This may be callable by the kernel, the only known use
 981  * of user context being that the current user credentials
 982  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 983  */
 984 int
 985 vn_openat(
 986         char *pnamep,
 987         enum uio_seg seg,
 988         int filemode,
 989         int createmode,
 990         struct vnode **vpp,
 991         enum create crwhy,
 992         mode_t umask,
 993         struct vnode *startvp,
 994         int fd)
 995 {
 996         struct vnode *vp;
 997         int mode;
 998         int accessflags;
 999         int error;
1000         int in_crit = 0;
1001         int open_done = 0;
1002         int shrlock_done = 0;
1003         struct vattr vattr;
1004         enum symfollow follow;
1005         int estale_retry = 0;
1006         struct shrlock shr;
1007         struct shr_locowner shr_own;
1008         boolean_t create;
1009 
1010         mode = 0;
1011         accessflags = 0;
1012         if (filemode & FREAD)
1013                 mode |= VREAD;
1014         if (filemode & (FWRITE|FTRUNC))
1015                 mode |= VWRITE;
1016         if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
1017                 mode |= VEXEC;
1018 
1019         /* symlink interpretation */
1020         if (filemode & FNOFOLLOW)
1021                 follow = NO_FOLLOW;
1022         else
1023                 follow = FOLLOW;
1024 
1025         if (filemode & FAPPEND)
1026                 accessflags |= V_APPEND;
1027 
1028         /*
1029          * We need to handle the case of FCREAT | FDIRECTORY and the case of
1030          * FEXCL. If all three are specified, then we always fail because we
1031          * cannot create a directory through this interface and FEXCL says we
1032          * need to fail the request if we can't create it. If, however, only
1033          * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1034          * of opening a file that already exists. If it exists, we can do
1035          * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1036          * treated as FDIRECTORY.
1037          */
1038         if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1039             (FCREAT | FDIRECTORY | FEXCL)) {
1040                 return (EINVAL);
1041         }
1042 
1043         if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1044                 create = B_FALSE;
1045         } else if ((filemode & FCREAT) != 0) {
1046                 create = B_TRUE;
1047         } else {
1048                 create = B_FALSE;
1049         }
1050 
1051 top:
1052         if (create) {
1053                 enum vcexcl excl;
1054 
1055                 /*
1056                  * Wish to create a file.
1057                  */
1058                 vattr.va_type = VREG;
1059                 vattr.va_mode = createmode;
1060                 vattr.va_mask = AT_TYPE|AT_MODE;
1061                 if (filemode & FTRUNC) {
1062                         vattr.va_size = 0;
1063                         vattr.va_mask |= AT_SIZE;
1064                 }
1065                 if (filemode & FEXCL)
1066                         excl = EXCL;
1067                 else
1068                         excl = NONEXCL;
1069 
1070                 if (error =
1071                     vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1072                     (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1073                         return (error);
1074         } else {
1075                 /*
1076                  * Wish to open a file.  Just look it up.
1077                  */
1078                 if (error = lookupnameat(pnamep, seg, follow,
1079                     NULLVPP, &vp, startvp)) {
1080                         if ((error == ESTALE) &&
1081                             fs_need_estale_retry(estale_retry++))
1082                                 goto top;
1083                         return (error);
1084                 }
1085 
1086                 /*
1087                  * Get the attributes to check whether file is large.
1088                  * We do this only if the FOFFMAX flag is not set and
1089                  * only for regular files.
1090                  */
1091 
1092                 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1093                         vattr.va_mask = AT_SIZE;
1094                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1095                             CRED(), NULL))) {
1096                                 goto out;
1097                         }
1098                         if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1099                                 /*
1100                                  * Large File API - regular open fails
1101                                  * if FOFFMAX flag is set in file mode
1102                                  */
1103                                 error = EOVERFLOW;
1104                                 goto out;
1105                         }
1106                 }
1107                 /*
1108                  * Can't write directories, active texts, or
1109                  * read-only filesystems.  Can't truncate files
1110                  * on which mandatory locking is in effect.
1111                  */
1112                 if (filemode & (FWRITE|FTRUNC)) {
1113                         /*
1114                          * Allow writable directory if VDIROPEN flag is set.
1115                          */
1116                         if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1117                                 error = EISDIR;
1118                                 goto out;
1119                         }
1120                         if (ISROFILE(vp)) {
1121                                 error = EROFS;
1122                                 goto out;
1123                         }
1124                         /*
1125                          * Can't truncate files on which
1126                          * sysv mandatory locking is in effect.
1127                          */
1128                         if (filemode & FTRUNC) {
1129                                 vnode_t *rvp;
1130 
1131                                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1132                                         rvp = vp;
1133                                 if (rvp->v_filocks != NULL) {
1134                                         vattr.va_mask = AT_MODE;
1135                                         if ((error = VOP_GETATTR(vp,
1136                                             &vattr, 0, CRED(), NULL)) == 0 &&
1137                                             MANDLOCK(vp, vattr.va_mode))
1138                                                 error = EAGAIN;
1139                                 }
1140                         }
1141                         if (error)
1142                                 goto out;
1143                 }
1144                 /*
1145                  * Check permissions.
1146                  */
1147                 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1148                         goto out;
1149 
1150                 /*
1151                  * Require FSEARCH and FDIRECTORY to return a directory. Require
1152                  * FEXEC to return a regular file.
1153                  */
1154                 if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1155                     vp->v_type != VDIR) {
1156                         error = ENOTDIR;
1157                         goto out;
1158                 }
1159                 if ((filemode & FEXEC) && vp->v_type != VREG) {
1160                         error = ENOEXEC;        /* XXX: error code? */
1161                         goto out;
1162                 }
1163         }
1164 
1165         /*
1166          * Do remaining checks for FNOFOLLOW and FNOLINKS.
1167          */
1168         if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1169                 /*
1170                  * The __FLXPATH flag is a private interface for use by the lx
1171                  * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
1172                  * when a symbolic link is encountered, returns a file
1173                  * descriptor which references it.
1174                  * See uts/common/brand/lx/syscall/lx_open.c
1175                  *
1176                  * When this flag is set, VOP_OPEN() is not called (for a
1177                  * symlink, most filesystems will return ENOSYS anyway)
1178                  * and the link's vnode is returned to be linked to the
1179                  * file descriptor.
1180                  */
1181                 if ((filemode & __FLXPATH) == 0)
1182                         error = ELOOP;
1183                 goto out;
1184         }
1185         if (filemode & FNOLINKS) {
1186                 vattr.va_mask = AT_NLINK;
1187                 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1188                         goto out;
1189                 }
1190                 if (vattr.va_nlink != 1) {
1191                         error = EMLINK;
1192                         goto out;
1193                 }
1194         }
1195 
1196         /*
1197          * Opening a socket corresponding to the AF_UNIX pathname
1198          * in the filesystem name space is not supported.
1199          * However, VSOCK nodes in namefs are supported in order
1200          * to make fattach work for sockets.
1201          *
1202          * XXX This uses VOP_REALVP to distinguish between
1203          * an unopened namefs node (where VOP_REALVP returns a
1204          * different VSOCK vnode) and a VSOCK created by vn_create
1205          * in some file system (where VOP_REALVP would never return
1206          * a different vnode).
1207          */
1208         if (vp->v_type == VSOCK) {
1209                 struct vnode *nvp;
1210 
1211                 error = VOP_REALVP(vp, &nvp, NULL);
1212                 if (error != 0 || nvp == NULL || nvp == vp ||
1213                     nvp->v_type != VSOCK) {
1214                         error = EOPNOTSUPP;
1215                         goto out;
1216                 }
1217         }
1218 
1219         if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1220                 /* get share reservation */
1221                 shr.s_access = 0;
1222                 if (filemode & FWRITE)
1223                         shr.s_access |= F_WRACC;
1224                 if (filemode & FREAD)
1225                         shr.s_access |= F_RDACC;
1226                 shr.s_deny = 0;
1227                 shr.s_sysid = 0;
1228                 shr.s_pid = ttoproc(curthread)->p_pid;
1229                 shr_own.sl_pid = shr.s_pid;
1230                 shr_own.sl_id = fd;
1231                 shr.s_own_len = sizeof (shr_own);
1232                 shr.s_owner = (caddr_t)&shr_own;
1233                 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1234                     NULL);
1235                 if (error)
1236                         goto out;
1237                 shrlock_done = 1;
1238 
1239                 /* nbmand conflict check if truncating file */
1240                 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1241                         nbl_start_crit(vp, RW_READER);
1242                         in_crit = 1;
1243 
1244                         vattr.va_mask = AT_SIZE;
1245                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1246                                 goto out;
1247                         if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1248                             NULL)) {
1249                                 error = EACCES;
1250                                 goto out;
1251                         }
1252                 }
1253         }
1254 
1255         /*
1256          * Do opening protocol.
1257          */
1258         error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1259         if (error)
1260                 goto out;
1261         open_done = 1;
1262 
1263         /*
1264          * Truncate if required.
1265          */
1266         if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1267                 vattr.va_size = 0;
1268                 vattr.va_mask = AT_SIZE;
1269                 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1270                         goto out;
1271         }
1272 
1273         /*
1274          * Turn on directio, if requested.
1275          */
1276         if (filemode & FDIRECT) {
1277                 if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1278                     CRED(), NULL, NULL)) != 0) {
1279                         /*
1280                          * On Linux, O_DIRECT returns EINVAL when the file
1281                          * system does not support directio, so we'll do the
1282                          * same.
1283                          */
1284                         error = EINVAL;
1285                         goto out;
1286                 }
1287         }
1288 out:
1289         ASSERT(vp->v_count > 0);
1290 
1291         if (in_crit) {
1292                 nbl_end_crit(vp);
1293                 in_crit = 0;
1294         }
1295         if (error) {
1296                 if (open_done) {
1297                         (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1298                             NULL);
1299                         open_done = 0;
1300                         shrlock_done = 0;
1301                 }
1302                 if (shrlock_done) {
1303                         (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1304                             NULL);
1305                         shrlock_done = 0;
1306                 }
1307 
1308                 /*
1309                  * The following clause was added to handle a problem
1310                  * with NFS consistency.  It is possible that a lookup
1311                  * of the file to be opened succeeded, but the file
1312                  * itself doesn't actually exist on the server.  This
1313                  * is chiefly due to the DNLC containing an entry for
1314                  * the file which has been removed on the server.  In
1315                  * this case, we just start over.  If there was some
1316                  * other cause for the ESTALE error, then the lookup
1317                  * of the file will fail and the error will be returned
1318                  * above instead of looping around from here.
1319                  */
1320                 VN_RELE(vp);
1321                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1322                         goto top;
1323         } else
1324                 *vpp = vp;
1325         return (error);
1326 }
1327 
1328 /*
1329  * The following two accessor functions are for the NFSv4 server.  Since there
1330  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1331  * vnode open counts correct when a client "upgrades" an open or does an
1332  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1333  * open mode (add or subtract read or write), but also change the share/deny
1334  * modes.  However, share reservations are not integrated with OPEN, yet, so
1335  * we need to handle each separately.  These functions are cleaner than having
1336  * the NFS server manipulate the counts directly, however, nobody else should
1337  * use these functions.
1338  */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	/* Open counts are only tracked for regular files. */
	ASSERT(vp->v_type == VREG);

	/* Account one more reader and/or writer open on the vnode. */
	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);

}
1352 
void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	/* Open counts are only tracked for regular files. */
	ASSERT(vp->v_type == VREG);

	/* Drop one reader and/or writer open; counts must not underflow. */
	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}

}
1370 
1371 int
1372 vn_create(
1373         char *pnamep,
1374         enum uio_seg seg,
1375         struct vattr *vap,
1376         enum vcexcl excl,
1377         int mode,
1378         struct vnode **vpp,
1379         enum create why,
1380         int flag,
1381         mode_t umask)
1382 {
1383         return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1384             umask, NULL));
1385 }
1386 
1387 /*
1388  * Create a vnode (makenode).
1389  */
1390 int
1391 vn_createat(
1392         char *pnamep,
1393         enum uio_seg seg,
1394         struct vattr *vap,
1395         enum vcexcl excl,
1396         int mode,
1397         struct vnode **vpp,
1398         enum create why,
1399         int flag,
1400         mode_t umask,
1401         struct vnode *startvp)
1402 {
1403         struct vnode *dvp;      /* ptr to parent dir vnode */
1404         struct vnode *vp = NULL;
1405         struct pathname pn;
1406         int error;
1407         int in_crit = 0;
1408         struct vattr vattr;
1409         enum symfollow follow;
1410         int estale_retry = 0;
1411         uint32_t auditing = AU_AUDITING();
1412 
1413         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1414 
1415         /* symlink interpretation */
1416         if ((flag & FNOFOLLOW) || excl == EXCL)
1417                 follow = NO_FOLLOW;
1418         else
1419                 follow = FOLLOW;
1420         flag &= ~(FNOFOLLOW|FNOLINKS);
1421 
1422 top:
1423         /*
1424          * Lookup directory.
1425          * If new object is a file, call lower level to create it.
1426          * Note that it is up to the lower level to enforce exclusive
1427          * creation, if the file is already there.
1428          * This allows the lower level to do whatever
1429          * locking or protocol that is needed to prevent races.
1430          * If the new object is directory call lower level to make
1431          * the new directory, with "." and "..".
1432          */
1433         if (error = pn_get(pnamep, seg, &pn))
1434                 return (error);
1435         if (auditing)
1436                 audit_vncreate_start();
1437         dvp = NULL;
1438         *vpp = NULL;
1439         /*
1440          * lookup will find the parent directory for the vnode.
1441          * When it is done the pn holds the name of the entry
1442          * in the directory.
1443          * If this is a non-exclusive create we also find the node itself.
1444          */
1445         error = lookuppnat(&pn, NULL, follow, &dvp,
1446             (excl == EXCL) ? NULLVPP : vpp, startvp);
1447         if (error) {
1448                 pn_free(&pn);
1449                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1450                         goto top;
1451                 if (why == CRMKDIR && error == EINVAL)
1452                         error = EEXIST;         /* SVID */
1453                 return (error);
1454         }
1455 
1456         if (why != CRMKNOD)
1457                 vap->va_mode &= ~VSVTX;
1458 
1459         /*
1460          * If default ACLs are defined for the directory don't apply the
1461          * umask if umask is passed.
1462          */
1463 
1464         if (umask) {
1465 
1466                 vsecattr_t vsec;
1467 
1468                 vsec.vsa_aclcnt = 0;
1469                 vsec.vsa_aclentp = NULL;
1470                 vsec.vsa_dfaclcnt = 0;
1471                 vsec.vsa_dfaclentp = NULL;
1472                 vsec.vsa_mask = VSA_DFACLCNT;
1473                 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1474                 /*
1475                  * If error is ENOSYS then treat it as no error
1476                  * Don't want to force all file systems to support
1477                  * aclent_t style of ACL's.
1478                  */
1479                 if (error == ENOSYS)
1480                         error = 0;
1481                 if (error) {
1482                         if (*vpp != NULL)
1483                                 VN_RELE(*vpp);
1484                         goto out;
1485                 } else {
1486                         /*
1487                          * Apply the umask if no default ACLs.
1488                          */
1489                         if (vsec.vsa_dfaclcnt == 0)
1490                                 vap->va_mode &= ~umask;
1491 
1492                         /*
1493                          * VOP_GETSECATTR() may have allocated memory for
1494                          * ACLs we didn't request, so double-check and
1495                          * free it if necessary.
1496                          */
1497                         if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1498                                 kmem_free((caddr_t)vsec.vsa_aclentp,
1499                                     vsec.vsa_aclcnt * sizeof (aclent_t));
1500                         if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1501                                 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1502                                     vsec.vsa_dfaclcnt * sizeof (aclent_t));
1503                 }
1504         }
1505 
1506         /*
1507          * In general we want to generate EROFS if the file system is
1508          * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1509          * documents the open system call, and it says that O_CREAT has no
1510          * effect if the file already exists.  Bug 1119649 states
1511          * that open(path, O_CREAT, ...) fails when attempting to open an
1512          * existing file on a read only file system.  Thus, the first part
1513          * of the following if statement has 3 checks:
1514          *      if the file exists &&
1515          *              it is being open with write access &&
1516          *              the file system is read only
1517          *      then generate EROFS
1518          */
1519         if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1520             (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1521                 if (*vpp)
1522                         VN_RELE(*vpp);
1523                 error = EROFS;
1524         } else if (excl == NONEXCL && *vpp != NULL) {
1525                 vnode_t *rvp;
1526 
1527                 /*
1528                  * File already exists.  If a mandatory lock has been
1529                  * applied, return error.
1530                  */
1531                 vp = *vpp;
1532                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1533                         rvp = vp;
1534                 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1535                         nbl_start_crit(vp, RW_READER);
1536                         in_crit = 1;
1537                 }
1538                 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1539                         vattr.va_mask = AT_MODE|AT_SIZE;
1540                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1541                                 goto out;
1542                         }
1543                         if (MANDLOCK(vp, vattr.va_mode)) {
1544                                 error = EAGAIN;
1545                                 goto out;
1546                         }
1547                         /*
1548                          * File cannot be truncated if non-blocking mandatory
1549                          * locks are currently on the file.
1550                          */
1551                         if ((vap->va_mask & AT_SIZE) && in_crit) {
1552                                 u_offset_t offset;
1553                                 ssize_t length;
1554 
1555                                 offset = vap->va_size > vattr.va_size ?
1556                                     vattr.va_size : vap->va_size;
1557                                 length = vap->va_size > vattr.va_size ?
1558                                     vap->va_size - vattr.va_size :
1559                                     vattr.va_size - vap->va_size;
1560                                 if (nbl_conflict(vp, NBL_WRITE, offset,
1561                                     length, 0, NULL)) {
1562                                         error = EACCES;
1563                                         goto out;
1564                                 }
1565                         }
1566                 }
1567 
1568                 /*
1569                  * If the file is the root of a VFS, we've crossed a
1570                  * mount point and the "containing" directory that we
1571                  * acquired above (dvp) is irrelevant because it's in
1572                  * a different file system.  We apply VOP_CREATE to the
1573                  * target itself instead of to the containing directory
1574                  * and supply a null path name to indicate (conventionally)
1575                  * the node itself as the "component" of interest.
1576                  *
1577                  * The call to VOP_CREATE() is necessary to ensure
1578                  * that the appropriate permission checks are made,
1579                  * i.e. EISDIR, EACCES, etc.  We already know that vpp
1580                  * exists since we are in the else condition where this
1581                  * was checked.
1582                  */
1583                 if (vp->v_flag & VROOT) {
1584                         ASSERT(why != CRMKDIR);
1585                         error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1586                             CRED(), flag, NULL, NULL);
1587                         /*
1588                          * If the create succeeded, it will have created a
1589                          * new reference on a new vnode (*vpp) in the child
1590                          * file system, so we want to drop our reference on
1591                          * the old (vp) upon exit.
1592                          */
1593                         goto out;
1594                 }
1595 
1596                 /*
1597                  * Large File API - non-large open (FOFFMAX flag not set)
1598                  * of regular file fails if the file size exceeds MAXOFF32_T.
1599                  */
1600                 if (why != CRMKDIR &&
1601                     !(flag & FOFFMAX) &&
1602                     (vp->v_type == VREG)) {
1603                         vattr.va_mask = AT_SIZE;
1604                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1605                             CRED(), NULL))) {
1606                                 goto out;
1607                         }
1608                         if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1609                                 error = EOVERFLOW;
1610                                 goto out;
1611                         }
1612                 }
1613         }
1614 
1615         if (error == 0) {
1616                 /*
1617                  * Call mkdir() if specified, otherwise create().
1618                  */
1619                 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1620 
1621                 if (why == CRMKDIR)
1622                         /*
1623                          * N.B., if vn_createat() ever requests
1624                          * case-insensitive behavior then it will need
1625                          * to be passed to VOP_MKDIR().  VOP_CREATE()
1626                          * will already get it via "flag"
1627                          */
1628                         error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1629                             NULL, 0, NULL);
1630                 else if (!must_be_dir)
1631                         error = VOP_CREATE(dvp, pn.pn_path, vap,
1632                             excl, mode, vpp, CRED(), flag, NULL, NULL);
1633                 else
1634                         error = ENOTDIR;
1635         }
1636 
1637 out:
1638 
1639         if (auditing)
1640                 audit_vncreate_finish(*vpp, error);
1641         if (in_crit) {
1642                 nbl_end_crit(vp);
1643                 in_crit = 0;
1644         }
1645         if (vp != NULL) {
1646                 VN_RELE(vp);
1647                 vp = NULL;
1648         }
1649         pn_free(&pn);
1650         VN_RELE(dvp);
1651         /*
1652          * The following clause was added to handle a problem
1653          * with NFS consistency.  It is possible that a lookup
1654          * of the file to be created succeeded, but the file
1655          * itself doesn't actually exist on the server.  This
1656          * is chiefly due to the DNLC containing an entry for
1657          * the file which has been removed on the server.  In
1658          * this case, we just start over.  If there was some
1659          * other cause for the ESTALE error, then the lookup
1660          * of the file will fail and the error will be returned
1661          * above instead of looping around from here.
1662          */
1663         if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1664                 goto top;
1665         return (error);
1666 }
1667 
/*
 * Create a hard link named "to" referring to the existing file "from"
 * (see link(2)).  Both paths are resolved from the current directory
 * (NULL start vnodes) and symlinks in the final component of "from"
 * are not followed.  Returns 0 or an errno value.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
1673 
/*
 * Create a hard link, resolving "from" relative to fstartvp and "to"
 * relative to tstartvp (either start vnode may be NULL, meaning the
 * current directory).  "follow" controls symlink traversal when looking
 * up the source.  Fails with EXDEV if source and target directory are
 * on different filesystems (compared by fsid so lofs works) and EROFS
 * if the target filesystem is read-only.  On ESTALE (e.g. a stale DNLC
 * entry for an NFS file) the whole operation is retried from scratch,
 * bounded by fs_need_estale_retry().  Returns 0 or an errno value.
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Look up the existing source file. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Look up the directory that will contain the new link. */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	/* Retry the whole lookup/link sequence on a stale file handle. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1733 
/*
 * Rename "from" to "to" (see rename(2)).  Thin wrapper around
 * vn_renameat() with NULL start vnodes, so both paths are resolved
 * relative to the current directory.  Returns 0 or an errno value.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
1739 
/*
 * Rename the file named fname (relative to fdvp, or the current
 * directory if NULL) to tname (relative to tdvp).  Resolves both
 * parent directories, verifies they are on the same filesystem
 * (by fsid, so loopback mounts work) and that the target side is
 * writable, refuses to move a mount point, and honors non-blocking
 * mandatory (nbmand) share reservations on both the source file and
 * any existing target before calling VOP_RENAME().  Retries from
 * scratch on ESTALE, bounded by fs_need_estale_retry().
 * Returns 0 or an errno value.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/*
	 * If an entry already exists at the destination (and is not the
	 * same file as the source), check nbmand reservations before we
	 * would remove it.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check nbmand reservations against renaming the source. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	/* Retry the whole operation on a stale file handle. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1879 
1880 /*
1881  * Remove a file or directory.
1882  */
/*
 * Wrapper around vn_removeat() with a NULL start vnode: fnamep is
 * resolved relative to the current directory.  dirflag selects
 * rmdir (RMDIRECTORY) vs. unlink semantics.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}
1888 
/*
 * Remove the file or directory named fnamep, resolved relative to
 * startvp (or the current directory if NULL).  dirflag distinguishes
 * rmdir(2) (RMDIRECTORY) from unlink(2) semantics.  Handles the
 * special case of a namefs mount on top of the file by unmounting it
 * first, checks nbmand share reservations, and retries from scratch
 * on ESTALE (bounded by fs_need_estale_retry()).  Returns 0 or an
 * errno value.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Remember the type before we potentially drop our reference below. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * Hold the current working directory so VOP_RMDIR
			 * can refuse to remove it.
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	/* Retry the whole operation on a stale file handle. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2076 
2077 /*
2078  * Utility function to compare equality of vnodes.
2079  * Compare the underlying real vnodes, if there are underlying vnodes.
2080  * This is a more thorough comparison than the VN_CMP() macro provides.
2081  */
2082 int
2083 vn_compare(vnode_t *vp1, vnode_t *vp2)
2084 {
2085         vnode_t *realvp;
2086 
2087         if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2088                 vp1 = realvp;
2089         if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2090                 vp2 = realvp;
2091         return (VN_CMP(vp1, vp2));
2092 }
2093 
2094 /*
2095  * The number of locks to hash into.  This value must be a power
2096  * of 2 minus 1 and should probably also be prime.
2097  */
2098 #define NUM_BUCKETS     1023
2099 
struct	vn_vfslocks_bucket {
	kmutex_t vb_lock;		/* protects vb_list */
	vn_vfslocks_entry_t *vb_list;	/* chain of entries hashing here */
	/* pad each bucket out to 64 bytes to avoid false sharing */
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};
2105 
2106 /*
2107  * Total number of buckets will be NUM_BUCKETS + 1 .
2108  */
2109 
2110 #pragma align   64(vn_vfslocks_buckets)
2111 static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2112 
2113 #define VN_VFSLOCKS_SHIFT       9
2114 
2115 #define VN_VFSLOCKS_HASH(vfsvpptr)      \
2116         ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2117 
2118 /*
2119  * vn_vfslocks_getlock() uses an HASH scheme to generate
2120  * rwstlock using vfs/vnode pointer passed to it.
2121  *
2122  * vn_vfslocks_rele() releases a reference in the
2123  * HASH table which allows the entry allocated by
2124  * vn_vfslocks_getlock() to be freed at a later
2125  * stage when the refcount drops to zero.
2126  */
2127 
/*
 * Look up (or create) the rwstlock entry hashed from the given
 * vfs/vnode pointer and return it with its reference count bumped.
 * The caller must balance this with vn_vfslocks_rele().
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: entry already exists in the bucket. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);

	/*
	 * Allocate a new entry outside the bucket lock (KM_SLEEP may
	 * block), then re-check for a racing insertion before linking.
	 */
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2171 
/*
 * Drop one reference on a hashed lock entry previously obtained via
 * vn_vfslocks_getlock().  When the count reaches zero the entry is
 * unlinked from its bucket and freed.  Panics on a negative refcount
 * or if a zero-refcount entry is not found in its bucket.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	/* Last reference: unlink the entry from the bucket and free it. */
	pvep = NULL;
	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2210 
2211 /*
2212  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2213  * lock protecting the v_vfsmountedhere field.
2214  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2215  * except that it blocks to acquire the lock VVFSLOCK.
2216  *
2217  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2218  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2219  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2220  */
2221 int
2222 vn_vfswlock_wait(vnode_t *vp)
2223 {
2224         int retval;
2225         vn_vfslocks_entry_t *vpvfsentry;
2226         ASSERT(vp != NULL);
2227 
2228         vpvfsentry = vn_vfslocks_getlock(vp);
2229         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2230 
2231         if (retval == EINTR) {
2232                 vn_vfslocks_rele(vpvfsentry);
2233                 return (EINTR);
2234         }
2235         return (retval);
2236 }
2237 
2238 int
2239 vn_vfsrlock_wait(vnode_t *vp)
2240 {
2241         int retval;
2242         vn_vfslocks_entry_t *vpvfsentry;
2243         ASSERT(vp != NULL);
2244 
2245         vpvfsentry = vn_vfslocks_getlock(vp);
2246         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2247 
2248         if (retval == EINTR) {
2249                 vn_vfslocks_rele(vpvfsentry);
2250                 return (EINTR);
2251         }
2252 
2253         return (retval);
2254 }
2255 
2256 
2257 /*
2258  * vn_vfswlock is used to implement a lock which is logically a writers lock
2259  * protecting the v_vfsmountedhere field.
2260  */
2261 int
2262 vn_vfswlock(vnode_t *vp)
2263 {
2264         vn_vfslocks_entry_t *vpvfsentry;
2265 
2266         /*
2267          * If vp is NULL then somebody is trying to lock the covered vnode
2268          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2269          * only happen when unmounting /.  Since that operation will fail
2270          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2271          */
2272         if (vp == NULL)
2273                 return (EBUSY);
2274 
2275         vpvfsentry = vn_vfslocks_getlock(vp);
2276 
2277         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2278                 return (0);
2279 
2280         vn_vfslocks_rele(vpvfsentry);
2281         return (EBUSY);
2282 }
2283 
2284 int
2285 vn_vfsrlock(vnode_t *vp)
2286 {
2287         vn_vfslocks_entry_t *vpvfsentry;
2288 
2289         /*
2290          * If vp is NULL then somebody is trying to lock the covered vnode
2291          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2292          * only happen when unmounting /.  Since that operation will fail
2293          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2294          */
2295         if (vp == NULL)
2296                 return (EBUSY);
2297 
2298         vpvfsentry = vn_vfslocks_getlock(vp);
2299 
2300         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2301                 return (0);
2302 
2303         vn_vfslocks_rele(vpvfsentry);
2304         return (EBUSY);
2305 }
2306 
/*
 * Release a lock previously acquired by one of the vn_vfs[rw]lock
 * routines and drop the hash-entry references that pinned it.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release refernce after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2324 
/*
 * Return non-zero if the writers lock for vp's v_vfsmountedhere field
 * is currently held.  The temporary getlock reference is dropped
 * before returning.
 */
int
vn_vfswlock_held(vnode_t *vp)
{
	int held;
	vn_vfslocks_entry_t *vpvfsentry;

	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);

	vn_vfslocks_rele(vpvfsentry);
	return (held);
}
2339 
2340 
2341 int
2342 vn_make_ops(
2343         const char *name,                       /* Name of file system */
2344         const fs_operation_def_t *templ,        /* Operation specification */
2345         vnodeops_t **actual)                    /* Return the vnodeops */
2346 {
2347         int unused_ops;
2348         int error;
2349 
2350         *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2351 
2352         (*actual)->vnop_name = name;
2353 
2354         error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2355         if (error) {
2356                 kmem_free(*actual, sizeof (vnodeops_t));
2357         }
2358 
2359 #if DEBUG
2360         if (unused_ops != 0)
2361                 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2362                     "but not used", name, unused_ops);
2363 #endif
2364 
2365         return (error);
2366 }
2367 
2368 /*
2369  * Free the vnodeops created as a result of vn_make_ops()
2370  */
/*
 * Free the vnodeops created as a result of vn_make_ops()
 */
void
vn_freevnodeops(vnodeops_t *vnops)
{
	kmem_free(vnops, sizeof (vnodeops_t));
}
2376 
2377 /*
2378  * Vnode cache.
2379  */
2380 
2381 /* ARGSUSED */
2382 static int
2383 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2384 {
2385         struct vnode *vp;
2386 
2387         vp = buf;
2388 
2389         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2390         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2391         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2392         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2393         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2394         vp->v_path = vn_vpath_empty;
2395         vp->v_path_stamp = 0;
2396         vp->v_mpssdata = NULL;
2397         vp->v_vsd = NULL;
2398         vp->v_fopdata = NULL;
2399 
2400         return (0);
2401 }
2402 
2403 /* ARGSUSED */
2404 static void
2405 vn_cache_destructor(void *buf, void *cdrarg)
2406 {
2407         struct vnode *vp;
2408 
2409         vp = buf;
2410 
2411         rw_destroy(&vp->v_nbllock);
2412         cv_destroy(&vp->v_cv);
2413         mutex_destroy(&vp->v_vsd_lock);
2414         mutex_destroy(&vp->v_lock);
2415 }
2416 
/*
 * Create the global kmem cache from which all vnodes are allocated.
 * Called once at boot before any vn_alloc().
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2427 
/*
 * Destroy the vnode kmem cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2433 
2434 /*
2435  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2436  * cached by the file system and vnodes remain associated.
2437  */
void
vn_recycle(vnode_t *vp)
{
	/* Caller must have flushed all pages and set a valid v_path. */
	ASSERT(vp->v_pages == NULL);
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	/* Release any cached path and reset to the shared empty string. */
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	/* Discard any vnode-specific data associated with this vnode. */
	vsd_free(vp);
}
2477 
2478 /*
2479  * Used to reset the vnode fields including those that are directly accessible
2480  * as well as those which require an accessor function.
2481  *
2482  * Does not initialize:
2483  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2484  *      v_data (since FS-nodes and vnodes point to each other and should
2485  *              be updated simultaneously)
2486  *      v_op (in case someone needs to make a VOP call on this object)
2487  */
void
vn_reinit(vnode_t *vp)
{
	/* A freshly initialized vnode starts with a single hold. */
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_phantom_count = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	vp->v_pages = NULL;

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/*
	 * In a few specific instances, vn_reinit() is used to initialize
	 * locally defined vnode_t instances.  Lacking the construction offered
	 * by vn_alloc(), these vnodes require v_path initialization.
	 */
	if (vp->v_path == NULL) {
		vp->v_path = vn_vpath_empty;
	}

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
2520 
2521 vnode_t *
2522 vn_alloc(int kmflag)
2523 {
2524         vnode_t *vp;
2525 
2526         vp = kmem_cache_alloc(vn_cache, kmflag);
2527 
2528         if (vp != NULL) {
2529                 vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2530                 vp->v_fopdata = NULL;
2531                 vn_reinit(vp);
2532         }
2533 
2534         return (vp);
2535 }
2536 
/*
 * Release a vnode back to the cache, freeing any attached FEM state,
 * cached path, and vnode-specific data along the way.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	ASSERT0(vp->v_phantom_count);
	VERIFY(vp->v_path != NULL);
	/* The shared vn_vpath_empty sentinel must never be freed. */
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2573 
2574 /*
2575  * vnode status changes, should define better states than 1, 0.
2576  */
2577 void
2578 vn_reclaim(vnode_t *vp)
2579 {
2580         vfs_t   *vfsp = vp->v_vfsp;
2581 
2582         if (vfsp == NULL ||
2583             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2584                 return;
2585         }
2586         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2587 }
2588 
2589 void
2590 vn_idle(vnode_t *vp)
2591 {
2592         vfs_t   *vfsp = vp->v_vfsp;
2593 
2594         if (vfsp == NULL ||
2595             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2596                 return;
2597         }
2598         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2599 }
2600 void
2601 vn_exists(vnode_t *vp)
2602 {
2603         vfs_t   *vfsp = vp->v_vfsp;
2604 
2605         if (vfsp == NULL ||
2606             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2607                 return;
2608         }
2609         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2610 }
2611 
2612 void
2613 vn_invalid(vnode_t *vp)
2614 {
2615         vfs_t   *vfsp = vp->v_vfsp;
2616 
2617         if (vfsp == NULL ||
2618             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2619                 return;
2620         }
2621         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2622 }
2623 
2624 /* Vnode event notification */
2625 
/*
 * Ask the filesystem whether it supports vnode event notification.
 * Returns EINVAL for a NULL vnode, otherwise the result of the
 * VE_SUPPORT query against VOP_VNEVENT().
 */
int
vnevent_support(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL)
		return (EINVAL);

	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
}
2634 
2635 void
2636 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2637 {
2638         if (vp == NULL || vp->v_femhead == NULL) {
2639                 return;
2640         }
2641         (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2642         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2643 }
2644 
2645 void
2646 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2647     caller_context_t *ct)
2648 {
2649         if (vp == NULL || vp->v_femhead == NULL) {
2650                 return;
2651         }
2652         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2653 }
2654 
2655 void
2656 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2657     caller_context_t *ct)
2658 {
2659         if (vp == NULL || vp->v_femhead == NULL) {
2660                 return;
2661         }
2662         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2663 }
2664 
2665 void
2666 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2667 {
2668         if (vp == NULL || vp->v_femhead == NULL) {
2669                 return;
2670         }
2671         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2672 }
2673 
2674 void
2675 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2676 {
2677         if (vp == NULL || vp->v_femhead == NULL) {
2678                 return;
2679         }
2680         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2681 }
2682 
2683 void
2684 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2685     caller_context_t *ct)
2686 {
2687         if (vp == NULL || vp->v_femhead == NULL) {
2688                 return;
2689         }
2690         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2691 }
2692 
2693 void
2694 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2695     caller_context_t *ct)
2696 {
2697         if (vp == NULL || vp->v_femhead == NULL) {
2698                 return;
2699         }
2700         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2701 }
2702 
2703 void
2704 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2705     caller_context_t *ct)
2706 {
2707         if (vp == NULL || vp->v_femhead == NULL) {
2708                 return;
2709         }
2710         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2711 }
2712 
2713 void
2714 vnevent_create(vnode_t *vp, caller_context_t *ct)
2715 {
2716         if (vp == NULL || vp->v_femhead == NULL) {
2717                 return;
2718         }
2719         (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2720 }
2721 
2722 void
2723 vnevent_link(vnode_t *vp, caller_context_t *ct)
2724 {
2725         if (vp == NULL || vp->v_femhead == NULL) {
2726                 return;
2727         }
2728         (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2729 }
2730 
2731 void
2732 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2733 {
2734         if (vp == NULL || vp->v_femhead == NULL) {
2735                 return;
2736         }
2737         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2738 }
2739 
2740 void
2741 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2742 {
2743         if (vp == NULL || vp->v_femhead == NULL) {
2744                 return;
2745         }
2746         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2747 }
2748 
2749 void
2750 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2751 {
2752         if (vp == NULL || vp->v_femhead == NULL) {
2753                 return;
2754         }
2755         (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2756 }
2757 
2758 /*
2759  * Vnode accessors.
2760  */
2761 
/*
 * Return non-zero if the filesystem backing this vnode is mounted
 * read-only.  Note the raw VFS_RDONLY flag bit is returned, not 0/1.
 */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}
2767 
/* Return non-zero if the vnode has active file locks (v_filocks set). */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}
2773 
/*
 * Return non-zero if the vnode has file locks and the given mode bits
 * indicate mandatory locking per MANDLOCK().
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}
2779 
/* Return non-zero if the vnode has pages attached (v_pages set). */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}
2785 
2786 /*
2787  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2788  * zone_enter(2).
2789  */
2790 int
2791 vn_can_change_zones(vnode_t *vp)
2792 {
2793         struct vfssw *vswp;
2794         int allow = 1;
2795         vnode_t *rvp;
2796 
2797         if (nfs_global_client_only != 0)
2798                 return (1);
2799 
2800         /*
2801          * We always want to look at the underlying vnode if there is one.
2802          */
2803         if (VOP_REALVP(vp, &rvp, NULL) != 0)
2804                 rvp = vp;
2805         /*
2806          * Some pseudo filesystems (including doorfs) don't actually register
2807          * their vfsops_t, so the following may return NULL; we happily let
2808          * such vnodes switch zones.
2809          */
2810         vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2811         if (vswp != NULL) {
2812                 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2813                         allow = 0;
2814                 vfs_unrefvfssw(vswp);
2815         }
2816         return (allow);
2817 }
2818 
2819 /*
2820  * Return nonzero if the vnode is a mount point, zero if not.
2821  */
int
vn_ismntpt(vnode_t *vp)
{
	/* A non-NULL v_vfsmountedhere means a filesystem covers this vnode. */
	return (vp->v_vfsmountedhere != NULL);
}
2827 
2828 /* Retrieve the vfs (if any) mounted on this vnode */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	/* NULL when nothing is mounted on this vnode. */
	return (vp->v_vfsmountedhere);
}
2834 
2835 /*
2836  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2837  */
int
vn_in_dnlc(vnode_t *vp)
{
	/* v_count_dnlc tracks references held by the name lookup cache. */
	return (vp->v_count_dnlc > 0);
}
2843 
2844 /*
2845  * vn_has_other_opens() checks whether a particular file is opened by more than
2846  * just the caller and whether the open is for read and/or write.
2847  * This routine is for calling after the caller has already called VOP_OPEN()
2848  * and the caller wishes to know if they are the only one with it open for
2849  * the mode(s) specified.
2850  *
2851  * Vnode counts are only kept on regular files (v_type=VREG).
2852  */
2853 int
2854 vn_has_other_opens(
2855         vnode_t *vp,
2856         v_mode_t mode)
2857 {
2858 
2859         ASSERT(vp != NULL);
2860 
2861         switch (mode) {
2862         case V_WRITE:
2863                 if (vp->v_wrcnt > 1)
2864                         return (V_TRUE);
2865                 break;
2866         case V_RDORWR:
2867                 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2868                         return (V_TRUE);
2869                 break;
2870         case V_RDANDWR:
2871                 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2872                         return (V_TRUE);
2873                 break;
2874         case V_READ:
2875                 if (vp->v_rdcnt > 1)
2876                         return (V_TRUE);
2877                 break;
2878         }
2879 
2880         return (V_FALSE);
2881 }
2882 
2883 /*
2884  * vn_is_opened() checks whether a particular file is opened and
2885  * whether the open is for read and/or write.
2886  *
2887  * Vnode counts are only kept on regular files (v_type=VREG).
2888  */
2889 int
2890 vn_is_opened(
2891         vnode_t *vp,
2892         v_mode_t mode)
2893 {
2894 
2895         ASSERT(vp != NULL);
2896 
2897         switch (mode) {
2898         case V_WRITE:
2899                 if (vp->v_wrcnt)
2900                         return (V_TRUE);
2901                 break;
2902         case V_RDANDWR:
2903                 if (vp->v_rdcnt && vp->v_wrcnt)
2904                         return (V_TRUE);
2905                 break;
2906         case V_RDORWR:
2907                 if (vp->v_rdcnt || vp->v_wrcnt)
2908                         return (V_TRUE);
2909                 break;
2910         case V_READ:
2911                 if (vp->v_rdcnt)
2912                         return (V_TRUE);
2913                 break;
2914         }
2915 
2916         return (V_FALSE);
2917 }
2918 
2919 /*
2920  * vn_is_mapped() checks whether a particular file is mapped and whether
2921  * the file is mapped read and/or write.
2922  */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

/* An ILP32 kernel cannot read a 64-bit counter with one plain load. */
#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	/* LP64: a 64-bit load is a single fetch, so plain reads suffice. */
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}
2982 
2983 /*
2984  * Set the operations vector for a vnode.
2985  *
2986  * FEM ensures that the v_femhead pointer is filled in before the
2987  * v_op pointer is changed.  This means that if the v_femhead pointer
2988  * is NULL, and the v_op field hasn't changed since before which checked
2989  * the v_femhead pointer; then our update is ok - we are not racing with
2990  * FEM.
2991  */
2992 void
2993 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2994 {
2995         vnodeops_t      *op;
2996 
2997         ASSERT(vp != NULL);
2998         ASSERT(vnodeops != NULL);
2999 
3000         op = vp->v_op;
3001         membar_consumer();
3002         /*
3003          * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
3004          * the compare-and-swap on vp->v_op.  If either fails, then FEM is
3005          * in effect on the vnode and we need to have FEM deal with it.
3006          */
3007         if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
3008             op) {
3009                 fem_setvnops(vp, vnodeops);
3010         }
3011 }
3012 
3013 /*
3014  * Retrieve the operations vector for a vnode
3015  * As with vn_setops(above); make sure we aren't racing with FEM.
3016  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
3017  * make sense to the callers of this routine.
3018  */
3019 vnodeops_t *
3020 vn_getops(vnode_t *vp)
3021 {
3022         vnodeops_t      *op;
3023 
3024         ASSERT(vp != NULL);
3025 
3026         op = vp->v_op;
3027         membar_consumer();
3028         if (vp->v_femhead == NULL && op == vp->v_op) {
3029                 return (op);
3030         } else {
3031                 return (fem_getvnops(vp));
3032         }
3033 }
3034 
3035 /*
3036  * Returns non-zero (1) if the vnodeops matches that of the vnode.
3037  * Returns zero (0) if not.
3038  */
int
vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
{
	/* vn_getops() is FEM-aware, so this compares the real ops vector. */
	return (vn_getops(vp) == vnodeops);
}
3044 
3045 /*
3046  * Returns non-zero (1) if the specified operation matches the
3047  * corresponding operation for that the vnode.
3048  * Returns zero (0) if not.
3049  */
3050 
3051 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
3052 
3053 int
3054 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
3055 {
3056         const fs_operation_trans_def_t *otdp;
3057         fs_generic_func_p *loc = NULL;
3058         vnodeops_t      *vop = vn_getops(vp);
3059 
3060         ASSERT(vopname != NULL);
3061 
3062         for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3063                 if (MATCHNAME(otdp->name, vopname)) {
3064                         loc = (fs_generic_func_p *)
3065                             ((char *)(vop) + otdp->offset);
3066                         break;
3067                 }
3068         }
3069 
3070         return ((loc != NULL) && (*loc == funcp));
3071 }
3072 
3073 /*
3074  * fs_new_caller_id() needs to return a unique ID on a given local system.
3075  * The IDs do not need to survive across reboots.  These are primarily
3076  * used so that (FEM) monitors can detect particular callers (such as
3077  * the NFS server) to a given vnode/vfs operation.
3078  */
3079 u_longlong_t
3080 fs_new_caller_id()
3081 {
3082         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3083 
3084         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3085 }
3086 
3087 /*
3088  * The value stored in v_path is relative to rootdir, located in the global
3089  * zone.  Zones or chroot environments which reside deeper inside the VFS
3090  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3091  * what lies below their perceived root.  In order to keep v_path usable for
3092  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3093  *
3094  * An upper bound of max_vnode_path is placed upon v_path allocations to
3095  * prevent the system from going too wild at the behest of pathological
3096  * behavior from the operator.
3097  */
3098 size_t max_vnode_path = 4 * MAXPATHLEN;
3099 
3100 
/*
 * Clear the cached v_path on a vnode.  A non-zero compare_stamp restricts
 * the clear to the case where v_path_stamp still equals that snapshot,
 * so a concurrently installed (possibly valid) path is not wiped out.
 */
void
vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
{
	char *buf;

	mutex_enter(&vp->v_lock);
	/*
	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
	 * match the present value on the vnode, it indicates that subsequent
	 * changes have occurred.  The v_path value is not cleared in this case
	 * since the new value may be valid.
	 */
	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
		mutex_exit(&vp->v_lock);
		return;
	}
	buf = vp->v_path;
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	mutex_exit(&vp->v_lock);
	/* Free outside the lock; the shared empty sentinel is never freed. */
	if (buf != vn_vpath_empty) {
		kmem_free(buf, strlen(buf) + 1);
	}
}
3125 
3126 static void
3127 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3128     boolean_t is_rename)
3129 {
3130         char *buf, *oldbuf;
3131         hrtime_t pstamp;
3132         size_t baselen, buflen = 0;
3133 
3134         /* Handle the vn_setpath_str case. */
3135         if (pvp == NULL) {
3136                 if (len + 1 > max_vnode_path) {
3137                         DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3138                             vnode_t *, vp, char *, name, size_t, len + 1);
3139                         return;
3140                 }
3141                 buf = kmem_alloc(len + 1, KM_SLEEP);
3142                 bcopy(name, buf, len);
3143                 buf[len] = '\0';
3144 
3145                 mutex_enter(&vp->v_lock);
3146                 oldbuf = vp->v_path;
3147                 vp->v_path = buf;
3148                 vp->v_path_stamp = gethrtime();
3149                 mutex_exit(&vp->v_lock);
3150                 if (oldbuf != vn_vpath_empty) {
3151                         kmem_free(oldbuf, strlen(oldbuf) + 1);
3152                 }
3153                 return;
3154         }
3155 
3156         /* Take snapshot of parent dir */
3157         mutex_enter(&pvp->v_lock);
3158 
3159         if ((pvp->v_flag & VTRAVERSE) != 0) {
3160                 /*
3161                  * When the parent vnode has VTRAVERSE set in its flags, normal
3162                  * assumptions about v_path calculation no longer apply.  The
3163                  * primary situation where this occurs is via the VFS tricks
3164                  * which procfs plays in order to allow /proc/PID/(root|cwd) to
3165                  * yield meaningful results.
3166                  *
3167                  * When this flag is set, v_path on the child must not be
3168                  * updated since the calculated value is likely to be
3169                  * incorrect, given the current context.
3170                  */
3171                 mutex_exit(&pvp->v_lock);
3172                 return;
3173         }
3174 
3175 retrybuf:
3176         if (pvp->v_path == vn_vpath_empty) {
3177                 /*
3178                  * Without v_path from the parent directory, generating a child
3179                  * path from the name is impossible.
3180                  */
3181                 if (len > 0) {
3182                         pstamp = pvp->v_path_stamp;
3183                         mutex_exit(&pvp->v_lock);
3184                         vn_clearpath(vp, pstamp);
3185                         return;
3186                 }
3187 
3188                 /*
3189                  * The only feasible case here is where a NUL lookup is being
3190                  * performed on rootdir prior to its v_path being populated.
3191                  */
3192                 ASSERT(pvp->v_path_stamp == 0);
3193                 baselen = 0;
3194                 pstamp = 0;
3195         } else {
3196                 pstamp = pvp->v_path_stamp;
3197                 baselen = strlen(pvp->v_path);
3198                 /* ignore a trailing slash if present */
3199                 if (pvp->v_path[baselen - 1] == '/') {
3200                         /* This should only the be case for rootdir */
3201                         ASSERT(baselen == 1 && pvp == rootdir);
3202                         baselen--;
3203                 }
3204         }
3205         mutex_exit(&pvp->v_lock);
3206 
3207         if (buflen != 0) {
3208                 /* Free the existing (mis-sized) buffer in case of retry */
3209                 kmem_free(buf, buflen);
3210         }
3211         /* base, '/', name and trailing NUL */
3212         buflen = baselen + len + 2;
3213         if (buflen > max_vnode_path) {
3214                 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3215                     vnode_t *, vp, char *, name, size_t, buflen);
3216                 return;
3217         }
3218         buf = kmem_alloc(buflen, KM_SLEEP);
3219 
3220         mutex_enter(&pvp->v_lock);
3221         if (pvp->v_path_stamp != pstamp) {
3222                 size_t vlen;
3223 
3224                 /*
3225                  * Since v_path_stamp changed on the parent, it is likely that
3226                  * v_path has been altered as well.  If the length does not
3227                  * exactly match what was previously measured, the buffer
3228                  * allocation must be repeated for proper sizing.
3229                  */
3230                 if (pvp->v_path == vn_vpath_empty) {
3231                         /* Give up if parent lack v_path */
3232                         mutex_exit(&pvp->v_lock);
3233                         kmem_free(buf, buflen);
3234                         return;
3235                 }
3236                 vlen = strlen(pvp->v_path);
3237                 if (pvp->v_path[vlen - 1] == '/') {
3238                         vlen--;
3239                 }
3240                 if (vlen != baselen) {
3241                         goto retrybuf;
3242                 }
3243         }
3244         bcopy(pvp->v_path, buf, baselen);
3245         mutex_exit(&pvp->v_lock);
3246 
3247         buf[baselen] = '/';
3248         baselen++;
3249         bcopy(name, &buf[baselen], len + 1);
3250 
3251         mutex_enter(&vp->v_lock);
3252         if (vp->v_path_stamp == 0) {
3253                 /* never-visited vnode can inherit stamp from parent */
3254                 ASSERT(vp->v_path == vn_vpath_empty);
3255                 vp->v_path_stamp = pstamp;
3256                 vp->v_path = buf;
3257                 mutex_exit(&vp->v_lock);
3258         } else if (vp->v_path_stamp < pstamp || is_rename) {
3259                 /*
3260                  * Install the updated path and stamp, ensuring that the v_path
3261                  * pointer is valid at all times for dtrace.
3262                  */
3263                 oldbuf = vp->v_path;
3264                 vp->v_path = buf;
3265                 vp->v_path_stamp = gethrtime();
3266                 mutex_exit(&vp->v_lock);
3267                 kmem_free(oldbuf, strlen(oldbuf) + 1);
3268         } else {
3269                 /*
3270                  * If the timestamp matches or is greater, it means another
3271                  * thread performed the update first while locks were dropped
3272                  * here to make the allocation.  We defer to the newer value.
3273                  */
3274                 mutex_exit(&vp->v_lock);
3275                 kmem_free(buf, buflen);
3276         }
3277         ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3278 }
3279 
3280 void
3281 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3282 {
3283         size_t len;
3284 
3285         /*
3286          * If the parent is older or empty, there's nothing further to do.
3287          */
3288         if (pvp->v_path == vn_vpath_empty ||
3289             pvp->v_path_stamp <= vp->v_path_stamp) {
3290                 return;
3291         }
3292 
3293         /*
3294          * Given the lack of appropriate context, meaningful updates to v_path
3295          * cannot be made for during lookups for the '.' or '..' entries.
3296          */
3297         len = strlen(name);
3298         if (len == 0 || (len == 1 && name[0] == '.') ||
3299             (len == 2 && name[0] == '.' && name[1] == '.')) {
3300                 return;
3301         }
3302 
3303         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3304 }
3305 
3306 /*
3307  * Given a starting vnode and a path, updates the path in the target vnode in
3308  * a safe manner.  If the vnode already has path information embedded, then the
3309  * cached path is left untouched.
3310  */
/* ARGSUSED */
void
vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
    size_t len)
{
	/* rootvp is unused; the path is derived from pvp's cached v_path. */
	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}
3318 
3319 /*
3320  * Sets the path to the vnode to be the given string, regardless of current
3321  * context.  The string must be a complete path from rootdir.  This is only used
3322  * by fsop_root() for setting the path based on the mountpoint.
3323  */
void
vn_setpath_str(vnode_t *vp, const char *str, size_t len)
{
	/* A NULL parent tells vn_setpath_common() to store str verbatim. */
	vn_setpath_common(NULL, vp, str, len, B_FALSE);
}
3329 
3330 /*
3331  * Called from within filesystem's vop_rename() to handle renames once the
3332  * target vnode is available.
3333  */
void
vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
{
	/* is_rename = B_TRUE forces replacement of any existing v_path. */
	vn_setpath_common(pvp, vp, name, len, B_TRUE);
}
3339 
3340 /*
3341  * Similar to vn_setpath_str(), this function sets the path of the destination
3342  * vnode to the be the same as the source vnode.
3343  */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	hrtime_t stamp;
	size_t buflen;

	/* Measure the source path under its lock... */
	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty) {
		mutex_exit(&src->v_lock);
		return;
	}
	buflen = strlen(src->v_path) + 1;
	mutex_exit(&src->v_lock);

	/* ...allocate with the lock dropped (KM_SLEEP may block)... */
	buf = kmem_alloc(buflen, KM_SLEEP);

	/* ...then recheck that the path did not change size meanwhile. */
	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty ||
	    strlen(src->v_path) + 1 != buflen) {
		mutex_exit(&src->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	bcopy(src->v_path, buf, buflen);
	stamp = src->v_path_stamp;
	mutex_exit(&src->v_lock);

	/* Only install on a destination that has no cached path yet. */
	mutex_enter(&dst->v_lock);
	if (dst->v_path != vn_vpath_empty) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	dst->v_path = buf;
	dst->v_path_stamp = stamp;
	mutex_exit(&dst->v_lock);
}
3382 
3383 
3384 /*
3385  * XXX Private interface for segvn routines that handle vnode
3386  * large page segments.
3387  *
3388  * return 1 if vp's file system VOP_PAGEIO() implementation
3389  * can be safely used instead of VOP_GETPAGE() for handling
3390  * pagefaults against regular non swap files. VOP_PAGEIO()
3391  * interface is considered safe here if its implementation
3392  * is very close to VOP_GETPAGE() implementation.
3393  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3394  * panic if there're file holes but instead returns an error.
3395  * Doesn't assume file won't be changed by user writes, etc.
3396  *
3397  * return 0 otherwise.
3398  *
3399  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3400  */
3401 int
3402 vn_vmpss_usepageio(vnode_t *vp)
3403 {
3404         vfs_t   *vfsp = vp->v_vfsp;
3405         char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3406         char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3407         char **fsok = pageio_ok_fss;
3408 
3409         if (fsname == NULL) {
3410                 return (0);
3411         }
3412 
3413         for (; *fsok; fsok++) {
3414                 if (strcmp(*fsok, fsname) == 0) {
3415                         return (1);
3416                 }
3417         }
3418         return (0);
3419 }
3420 
3421 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3422 
/*
 * fop_open() - wrapper for VOP_OPEN().  Maintains the v_rdcnt/v_wrcnt
 * open counts on regular files (consulted by NFS when deciding whether
 * granting a delegation is safe) and handles filesystems that return a
 * different vnode than the one passed in.
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;	/* saved copy: *vpp may change below */

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp) {
			vn_copypath(vp, *vpp);
			/*
			 * Increment counts on the new vnode, and decrement
			 * on the original vp the counts bumped before the
			 * call; the decrements deliberately test vp's type,
			 * mirroring the pre-call increments.
			 */
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3492 
3493 int
3494 fop_close(
3495         vnode_t *vp,
3496         int flag,
3497         int count,
3498         offset_t offset,
3499         cred_t *cr,
3500         caller_context_t *ct)
3501 {
3502         int err;
3503 
3504         VOPXID_MAP_CR(vp, cr);
3505 
3506         err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3507         VOPSTATS_UPDATE(vp, close);
3508         /*
3509          * Check passed in count to handle possible dups. Vnode counts are only
3510          * kept on regular files
3511          */
3512         if ((vp->v_type == VREG) && (count == 1))  {
3513                 if (flag & FREAD) {
3514                         ASSERT(vp->v_rdcnt > 0);
3515                         atomic_dec_32(&vp->v_rdcnt);
3516                 }
3517                 if (flag & FWRITE) {
3518                         ASSERT(vp->v_wrcnt > 0);
3519                         atomic_dec_32(&vp->v_wrcnt);
3520                 }
3521         }
3522         return (err);
3523 }
3524 
/*
 * fop_read() - wrapper for VOP_READ().  Besides dispatching to the
 * filesystem and updating vopstats, this maintains per-zone VFS I/O
 * kstats (op/byte counts, "run queue" occupancy, and coarse latency
 * bucket counters) when VFS_STATS is enabled on the underlying vfs.
 */
int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int err;

	/*
	 * Zone-level stats are gathered only for regular files,
	 * directories, and block devices on filesystems with VFS_STATS
	 * set.  A nonzero 'start' doubles as the "stats enabled" flag
	 * for the post-call accounting below.
	 */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_runq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	/* Bytes actually transferred = starting resid minus remaining. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);

	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.reads++;
		zonep->zone_vfs_rwstats.nread += len;
		kstat_runq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * The latency buckets are cumulative: an operation slower
		 * than e.g. 1s is also counted in the 10ms and 100ms
		 * buckets, so each branch increments all lower buckets.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else if (lat < VOP_LATENCY_10S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
				atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3587 
/*
 * fop_write() - wrapper for VOP_WRITE().  Mirrors fop_read(): dispatches
 * to the filesystem, updates vopstats, and maintains the per-zone VFS
 * I/O kstats (using the waitq slot as the write "active" queue) and the
 * cumulative latency bucket counters when VFS_STATS is enabled.
 */
int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int	err;

	/*
	 * For the purposes of VFS kstat consumers, the "waitq" calculation is
	 * repurposed as the active queue for VFS write operations.  There's no
	 * actual wait queue for VFS operations.
	 */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_waitq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
	/* Bytes actually transferred = starting resid minus remaining. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);

	/* 'start' is nonzero only when stats were enabled above. */
	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.writes++;
		zonep->zone_vfs_rwstats.nwritten += len;
		kstat_waitq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * Latency buckets are cumulative: a slow operation is
		 * counted in every bucket at or below its latency.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else if (lat < VOP_LATENCY_10S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
				atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3655 
3656 int
3657 fop_ioctl(
3658         vnode_t *vp,
3659         int cmd,
3660         intptr_t arg,
3661         int flag,
3662         cred_t *cr,
3663         int *rvalp,
3664         caller_context_t *ct)
3665 {
3666         int     err;
3667 
3668         VOPXID_MAP_CR(vp, cr);
3669 
3670         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3671         VOPSTATS_UPDATE(vp, ioctl);
3672         return (err);
3673 }
3674 
3675 int
3676 fop_setfl(
3677         vnode_t *vp,
3678         int oflags,
3679         int nflags,
3680         cred_t *cr,
3681         caller_context_t *ct)
3682 {
3683         int     err;
3684 
3685         VOPXID_MAP_CR(vp, cr);
3686 
3687         err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3688         VOPSTATS_UPDATE(vp, setfl);
3689         return (err);
3690 }
3691 
3692 int
3693 fop_getattr(
3694         vnode_t *vp,
3695         vattr_t *vap,
3696         int flags,
3697         cred_t *cr,
3698         caller_context_t *ct)
3699 {
3700         int     err;
3701 
3702         VOPXID_MAP_CR(vp, cr);
3703 
3704         /*
3705          * If this file system doesn't understand the xvattr extensions
3706          * then turn off the xvattr bit.
3707          */
3708         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3709                 vap->va_mask &= ~AT_XVATTR;
3710         }
3711 
3712         /*
3713          * We're only allowed to skip the ACL check iff we used a 32 bit
3714          * ACE mask with VOP_ACCESS() to determine permissions.
3715          */
3716         if ((flags & ATTR_NOACLCHECK) &&
3717             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3718                 return (EINVAL);
3719         }
3720         err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3721         VOPSTATS_UPDATE(vp, getattr);
3722         return (err);
3723 }
3724 
3725 int
3726 fop_setattr(
3727         vnode_t *vp,
3728         vattr_t *vap,
3729         int flags,
3730         cred_t *cr,
3731         caller_context_t *ct)
3732 {
3733         int     err;
3734 
3735         VOPXID_MAP_CR(vp, cr);
3736 
3737         /*
3738          * If this file system doesn't understand the xvattr extensions
3739          * then turn off the xvattr bit.
3740          */
3741         if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3742                 vap->va_mask &= ~AT_XVATTR;
3743         }
3744 
3745         /*
3746          * We're only allowed to skip the ACL check iff we used a 32 bit
3747          * ACE mask with VOP_ACCESS() to determine permissions.
3748          */
3749         if ((flags & ATTR_NOACLCHECK) &&
3750             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3751                 return (EINVAL);
3752         }
3753         err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3754         VOPSTATS_UPDATE(vp, setattr);
3755         return (err);
3756 }
3757 
3758 int
3759 fop_access(
3760         vnode_t *vp,
3761         int mode,
3762         int flags,
3763         cred_t *cr,
3764         caller_context_t *ct)
3765 {
3766         int     err;
3767 
3768         if ((flags & V_ACE_MASK) &&
3769             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3770                 return (EINVAL);
3771         }
3772 
3773         VOPXID_MAP_CR(vp, cr);
3774 
3775         err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3776         VOPSTATS_UPDATE(vp, access);
3777         return (err);
3778 }
3779 
/*
 * fop_lookup() - wrapper for VOP_LOOKUP().  Extended-attribute lookups
 * without an already-resolved sysattr dir are routed to
 * xattr_dir_lookup(); all other lookups go to the filesystem.  On
 * success, the cached v_path of the result is refreshed via
 * vn_updatepath().
 */
int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = (*(dvp)->v_op->vop_lookup)
		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
	}
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		vn_updatepath(dvp, *vpp, nm);
	}

	return (ret);
}
3821 
3822 int
3823 fop_create(
3824         vnode_t *dvp,
3825         char *name,
3826         vattr_t *vap,
3827         vcexcl_t excl,
3828         int mode,
3829         vnode_t **vpp,
3830         cred_t *cr,
3831         int flags,
3832         caller_context_t *ct,
3833         vsecattr_t *vsecp)      /* ACL to set during create */
3834 {
3835         int ret;
3836 
3837         if (vsecp != NULL &&
3838             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3839                 return (EINVAL);
3840         }
3841         /*
3842          * If this file system doesn't support case-insensitive access
3843          * and said access is requested, fail quickly.
3844          */
3845         if (flags & FIGNORECASE &&
3846             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3847             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3848                 return (EINVAL);
3849 
3850         VOPXID_MAP_CR(dvp, cr);
3851 
3852         ret = (*(dvp)->v_op->vop_create)
3853             (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3854         if (ret == 0 && *vpp) {
3855                 VOPSTATS_UPDATE(*vpp, create);
3856                 vn_updatepath(dvp, *vpp, name);
3857         }
3858 
3859         return (ret);
3860 }
3861 
3862 int
3863 fop_remove(
3864         vnode_t *dvp,
3865         char *nm,
3866         cred_t *cr,
3867         caller_context_t *ct,
3868         int flags)
3869 {
3870         int     err;
3871 
3872         /*
3873          * If this file system doesn't support case-insensitive access
3874          * and said access is requested, fail quickly.
3875          */
3876         if (flags & FIGNORECASE &&
3877             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3878             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3879                 return (EINVAL);
3880 
3881         VOPXID_MAP_CR(dvp, cr);
3882 
3883         err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3884         VOPSTATS_UPDATE(dvp, remove);
3885         return (err);
3886 }
3887 
3888 int
3889 fop_link(
3890         vnode_t *tdvp,
3891         vnode_t *svp,
3892         char *tnm,
3893         cred_t *cr,
3894         caller_context_t *ct,
3895         int flags)
3896 {
3897         int     err;
3898 
3899         /*
3900          * If the target file system doesn't support case-insensitive access
3901          * and said access is requested, fail quickly.
3902          */
3903         if (flags & FIGNORECASE &&
3904             (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3905             vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3906                 return (EINVAL);
3907 
3908         VOPXID_MAP_CR(tdvp, cr);
3909 
3910         err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3911         VOPSTATS_UPDATE(tdvp, link);
3912         return (err);
3913 }
3914 
3915 int
3916 fop_rename(
3917         vnode_t *sdvp,
3918         char *snm,
3919         vnode_t *tdvp,
3920         char *tnm,
3921         cred_t *cr,
3922         caller_context_t *ct,
3923         int flags)
3924 {
3925         int     err;
3926 
3927         /*
3928          * If the file system involved does not support
3929          * case-insensitive access and said access is requested, fail
3930          * quickly.
3931          */
3932         if (flags & FIGNORECASE &&
3933             ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3934             vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3935                 return (EINVAL);
3936 
3937         VOPXID_MAP_CR(tdvp, cr);
3938 
3939         err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3940         VOPSTATS_UPDATE(sdvp, rename);
3941         return (err);
3942 }
3943 
3944 int
3945 fop_mkdir(
3946         vnode_t *dvp,
3947         char *dirname,
3948         vattr_t *vap,
3949         vnode_t **vpp,
3950         cred_t *cr,
3951         caller_context_t *ct,
3952         int flags,
3953         vsecattr_t *vsecp)      /* ACL to set during create */
3954 {
3955         int ret;
3956 
3957         if (vsecp != NULL &&
3958             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3959                 return (EINVAL);
3960         }
3961         /*
3962          * If this file system doesn't support case-insensitive access
3963          * and said access is requested, fail quickly.
3964          */
3965         if (flags & FIGNORECASE &&
3966             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3967             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3968                 return (EINVAL);
3969 
3970         VOPXID_MAP_CR(dvp, cr);
3971 
3972         ret = (*(dvp)->v_op->vop_mkdir)
3973             (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3974         if (ret == 0 && *vpp) {
3975                 VOPSTATS_UPDATE(*vpp, mkdir);
3976                 vn_updatepath(dvp, *vpp, dirname);
3977         }
3978 
3979         return (ret);
3980 }
3981 
3982 int
3983 fop_rmdir(
3984         vnode_t *dvp,
3985         char *nm,
3986         vnode_t *cdir,
3987         cred_t *cr,
3988         caller_context_t *ct,
3989         int flags)
3990 {
3991         int     err;
3992 
3993         /*
3994          * If this file system doesn't support case-insensitive access
3995          * and said access is requested, fail quickly.
3996          */
3997         if (flags & FIGNORECASE &&
3998             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3999             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4000                 return (EINVAL);
4001 
4002         VOPXID_MAP_CR(dvp, cr);
4003 
4004         err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
4005         VOPSTATS_UPDATE(dvp, rmdir);
4006         return (err);
4007 }
4008 
4009 int
4010 fop_readdir(
4011         vnode_t *vp,
4012         uio_t *uiop,
4013         cred_t *cr,
4014         int *eofp,
4015         caller_context_t *ct,
4016         int flags)
4017 {
4018         int     err;
4019         ssize_t resid_start = uiop->uio_resid;
4020 
4021         /*
4022          * If this file system doesn't support retrieving directory
4023          * entry flags and said access is requested, fail quickly.
4024          */
4025         if (flags & V_RDDIR_ENTFLAGS &&
4026             vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
4027                 return (EINVAL);
4028 
4029         VOPXID_MAP_CR(vp, cr);
4030 
4031         err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
4032         VOPSTATS_UPDATE_IO(vp, readdir,
4033             readdir_bytes, (resid_start - uiop->uio_resid));
4034         return (err);
4035 }
4036 
/*
 * fop_symlink() - wrapper for VOP_SYMLINK().  Rejects case-insensitive
 * requests the filesystem cannot honor.  On filesystems with reparse
 * support, a link target beginning with the reparse tag is marked via
 * fs_reparse_mark(), which on success (returns 0) switches the attribute
 * pointer to the xvattr variant before dispatching.
 */
int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		/* vap is redirected at the stack-local xvattr on success */
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}
4074 
4075 int
4076 fop_readlink(
4077         vnode_t *vp,
4078         uio_t *uiop,
4079         cred_t *cr,
4080         caller_context_t *ct)
4081 {
4082         int     err;
4083 
4084         VOPXID_MAP_CR(vp, cr);
4085 
4086         err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
4087         VOPSTATS_UPDATE(vp, readlink);
4088         return (err);
4089 }
4090 
4091 int
4092 fop_fsync(
4093         vnode_t *vp,
4094         int syncflag,
4095         cred_t *cr,
4096         caller_context_t *ct)
4097 {
4098         int     err;
4099 
4100         VOPXID_MAP_CR(vp, cr);
4101 
4102         err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
4103         VOPSTATS_UPDATE(vp, fsync);
4104         return (err);
4105 }
4106 
4107 void
4108 fop_inactive(
4109         vnode_t *vp,
4110         cred_t *cr,
4111         caller_context_t *ct)
4112 {
4113         /* Need to update stats before vop call since we may lose the vnode */
4114         VOPSTATS_UPDATE(vp, inactive);
4115 
4116         VOPXID_MAP_CR(vp, cr);
4117 
4118         (*(vp)->v_op->vop_inactive)(vp, cr, ct);
4119 }
4120 
4121 int
4122 fop_fid(
4123         vnode_t *vp,
4124         fid_t *fidp,
4125         caller_context_t *ct)
4126 {
4127         int     err;
4128 
4129         err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4130         VOPSTATS_UPDATE(vp, fid);
4131         return (err);
4132 }
4133 
4134 int
4135 fop_rwlock(
4136         vnode_t *vp,
4137         int write_lock,
4138         caller_context_t *ct)
4139 {
4140         int     ret;
4141 
4142         ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4143         VOPSTATS_UPDATE(vp, rwlock);
4144         return (ret);
4145 }
4146 
4147 void
4148 fop_rwunlock(
4149         vnode_t *vp,
4150         int write_lock,
4151         caller_context_t *ct)
4152 {
4153         (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4154         VOPSTATS_UPDATE(vp, rwunlock);
4155 }
4156 
4157 int
4158 fop_seek(
4159         vnode_t *vp,
4160         offset_t ooff,
4161         offset_t *noffp,
4162         caller_context_t *ct)
4163 {
4164         int     err;
4165 
4166         err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4167         VOPSTATS_UPDATE(vp, seek);
4168         return (err);
4169 }
4170 
4171 int
4172 fop_cmp(
4173         vnode_t *vp1,
4174         vnode_t *vp2,
4175         caller_context_t *ct)
4176 {
4177         int     err;
4178 
4179         err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4180         VOPSTATS_UPDATE(vp1, cmp);
4181         return (err);
4182 }
4183 
4184 int
4185 fop_frlock(
4186         vnode_t *vp,
4187         int cmd,
4188         flock64_t *bfp,
4189         int flag,
4190         offset_t offset,
4191         struct flk_callback *flk_cbp,
4192         cred_t *cr,
4193         caller_context_t *ct)
4194 {
4195         int     err;
4196 
4197         VOPXID_MAP_CR(vp, cr);
4198 
4199         err = (*(vp)->v_op->vop_frlock)
4200             (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4201         VOPSTATS_UPDATE(vp, frlock);
4202         return (err);
4203 }
4204 
4205 int
4206 fop_space(
4207         vnode_t *vp,
4208         int cmd,
4209         flock64_t *bfp,
4210         int flag,
4211         offset_t offset,
4212         cred_t *cr,
4213         caller_context_t *ct)
4214 {
4215         int     err;
4216 
4217         VOPXID_MAP_CR(vp, cr);
4218 
4219         err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4220         VOPSTATS_UPDATE(vp, space);
4221         return (err);
4222 }
4223 
4224 int
4225 fop_realvp(
4226         vnode_t *vp,
4227         vnode_t **vpp,
4228         caller_context_t *ct)
4229 {
4230         int     err;
4231 
4232         err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4233         VOPSTATS_UPDATE(vp, realvp);
4234         return (err);
4235 }
4236 
4237 int
4238 fop_getpage(
4239         vnode_t *vp,
4240         offset_t off,
4241         size_t len,
4242         uint_t *protp,
4243         page_t **plarr,
4244         size_t plsz,
4245         struct seg *seg,
4246         caddr_t addr,
4247         enum seg_rw rw,
4248         cred_t *cr,
4249         caller_context_t *ct)
4250 {
4251         int     err;
4252 
4253         VOPXID_MAP_CR(vp, cr);
4254 
4255         err = (*(vp)->v_op->vop_getpage)
4256             (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4257         VOPSTATS_UPDATE(vp, getpage);
4258         return (err);
4259 }
4260 
4261 int
4262 fop_putpage(
4263         vnode_t *vp,
4264         offset_t off,
4265         size_t len,
4266         int flags,
4267         cred_t *cr,
4268         caller_context_t *ct)
4269 {
4270         int     err;
4271 
4272         VOPXID_MAP_CR(vp, cr);
4273 
4274         err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4275         VOPSTATS_UPDATE(vp, putpage);
4276         return (err);
4277 }
4278 
4279 int
4280 fop_map(
4281         vnode_t *vp,
4282         offset_t off,
4283         struct as *as,
4284         caddr_t *addrp,
4285         size_t len,
4286         uchar_t prot,
4287         uchar_t maxprot,
4288         uint_t flags,
4289         cred_t *cr,
4290         caller_context_t *ct)
4291 {
4292         int     err;
4293 
4294         VOPXID_MAP_CR(vp, cr);
4295 
4296         err = (*(vp)->v_op->vop_map)
4297             (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4298         VOPSTATS_UPDATE(vp, map);
4299         return (err);
4300 }
4301 
/*
 * fop_addmap() - wrapper for VOP_ADDMAP().  On success, track the number
 * of pages mapped for reading and writing of regular files in
 * v_mmap_read / v_mmap_write; fop_delmap() performs the mirror-image
 * decrements on unmap.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages covered by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			/* Executable mappings count toward the read total. */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4351 
/*
 * fop_delmap() - wrapper for VOP_DELMAP().  Reverses the v_mmap_read /
 * v_mmap_write page accounting done by fop_addmap(), using the same
 * maxprot/MAP_PRIVATE classification so increments and decrements match.
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages covered by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		/* MAP_PRIVATE mappings were accounted as reads in addmap. */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			/* Executable mappings were counted as reads too. */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4406 
4407 
4408 int
4409 fop_poll(
4410         vnode_t *vp,
4411         short events,
4412         int anyyet,
4413         short *reventsp,
4414         struct pollhead **phpp,
4415         caller_context_t *ct)
4416 {
4417         int     err;
4418 
4419         err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4420         VOPSTATS_UPDATE(vp, poll);
4421         return (err);
4422 }
4423 
4424 int
4425 fop_dump(
4426         vnode_t *vp,
4427         caddr_t addr,
4428         offset_t lbdn,
4429         offset_t dblks,
4430         caller_context_t *ct)
4431 {
4432         int     err;
4433 
4434         /* ensure lbdn and dblks can be passed safely to bdev_dump */
4435         if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4436                 return (EIO);
4437 
4438         err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4439         VOPSTATS_UPDATE(vp, dump);
4440         return (err);
4441 }
4442 
4443 int
4444 fop_pathconf(
4445         vnode_t *vp,
4446         int cmd,
4447         ulong_t *valp,
4448         cred_t *cr,
4449         caller_context_t *ct)
4450 {
4451         int     err;
4452 
4453         VOPXID_MAP_CR(vp, cr);
4454 
4455         err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4456         VOPSTATS_UPDATE(vp, pathconf);
4457         return (err);
4458 }
4459 
4460 int
4461 fop_pageio(
4462         vnode_t *vp,
4463         struct page *pp,
4464         u_offset_t io_off,
4465         size_t io_len,
4466         int flags,
4467         cred_t *cr,
4468         caller_context_t *ct)
4469 {
4470         int     err;
4471 
4472         VOPXID_MAP_CR(vp, cr);
4473 
4474         err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4475         VOPSTATS_UPDATE(vp, pageio);
4476         return (err);
4477 }
4478 
4479 int
4480 fop_dumpctl(
4481         vnode_t *vp,
4482         int action,
4483         offset_t *blkp,
4484         caller_context_t *ct)
4485 {
4486         int     err;
4487         err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4488         VOPSTATS_UPDATE(vp, dumpctl);
4489         return (err);
4490 }
4491 
4492 void
4493 fop_dispose(
4494         vnode_t *vp,
4495         page_t *pp,
4496         int flag,
4497         int dn,
4498         cred_t *cr,
4499         caller_context_t *ct)
4500 {
4501         /* Must do stats first since it's possible to lose the vnode */
4502         VOPSTATS_UPDATE(vp, dispose);
4503 
4504         VOPXID_MAP_CR(vp, cr);
4505 
4506         (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4507 }
4508 
4509 int
4510 fop_setsecattr(
4511         vnode_t *vp,
4512         vsecattr_t *vsap,
4513         int flag,
4514         cred_t *cr,
4515         caller_context_t *ct)
4516 {
4517         int     err;
4518 
4519         VOPXID_MAP_CR(vp, cr);
4520 
4521         /*
4522          * We're only allowed to skip the ACL check iff we used a 32 bit
4523          * ACE mask with VOP_ACCESS() to determine permissions.
4524          */
4525         if ((flag & ATTR_NOACLCHECK) &&
4526             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4527                 return (EINVAL);
4528         }
4529         err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4530         VOPSTATS_UPDATE(vp, setsecattr);
4531         return (err);
4532 }
4533 
4534 int
4535 fop_getsecattr(
4536         vnode_t *vp,
4537         vsecattr_t *vsap,
4538         int flag,
4539         cred_t *cr,
4540         caller_context_t *ct)
4541 {
4542         int     err;
4543 
4544         /*
4545          * We're only allowed to skip the ACL check iff we used a 32 bit
4546          * ACE mask with VOP_ACCESS() to determine permissions.
4547          */
4548         if ((flag & ATTR_NOACLCHECK) &&
4549             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4550                 return (EINVAL);
4551         }
4552 
4553         VOPXID_MAP_CR(vp, cr);
4554 
4555         err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4556         VOPSTATS_UPDATE(vp, getsecattr);
4557         return (err);
4558 }
4559 
4560 int
4561 fop_shrlock(
4562         vnode_t *vp,
4563         int cmd,
4564         struct shrlock *shr,
4565         int flag,
4566         cred_t *cr,
4567         caller_context_t *ct)
4568 {
4569         int     err;
4570 
4571         VOPXID_MAP_CR(vp, cr);
4572 
4573         err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4574         VOPSTATS_UPDATE(vp, shrlock);
4575         return (err);
4576 }
4577 
4578 int
4579 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4580     caller_context_t *ct)
4581 {
4582         int     err;
4583 
4584         err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4585         VOPSTATS_UPDATE(vp, vnevent);
4586         return (err);
4587 }
4588 
4589 int
4590 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4591     caller_context_t *ct)
4592 {
4593         int err;
4594 
4595         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4596                 return (ENOTSUP);
4597         err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4598         VOPSTATS_UPDATE(vp, reqzcbuf);
4599         return (err);
4600 }
4601 
4602 int
4603 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4604 {
4605         int err;
4606 
4607         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4608                 return (ENOTSUP);
4609         err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4610         VOPSTATS_UPDATE(vp, retzcbuf);
4611         return (err);
4612 }
4613 
4614 /*
4615  * Default destructor
4616  *      Needed because NULL destructor means that the key is unused
4617  */
4618 /* ARGSUSED */
4619 void
4620 vsd_defaultdestructor(void *value)
4621 {}
4622 
4623 /*
4624  * Create a key (index into per vnode array)
4625  *      Locks out vsd_create, vsd_destroy, and vsd_free
4626  *      May allocate memory with lock held
4627  */
4628 void
4629 vsd_create(uint_t *keyp, void (*destructor)(void *))
4630 {
4631         int     i;
4632         uint_t  nkeys;
4633 
4634         /*
4635          * if key is allocated, do nothing
4636          */
4637         mutex_enter(&vsd_lock);
4638         if (*keyp) {
4639                 mutex_exit(&vsd_lock);
4640                 return;
4641         }
4642         /*
4643          * find an unused key
4644          */
4645         if (destructor == NULL)
4646                 destructor = vsd_defaultdestructor;
4647 
4648         for (i = 0; i < vsd_nkeys; ++i)
4649                 if (vsd_destructor[i] == NULL)
4650                         break;
4651 
4652         /*
4653          * if no unused keys, increase the size of the destructor array
4654          */
4655         if (i == vsd_nkeys) {
4656                 if ((nkeys = (vsd_nkeys << 1)) == 0)
4657                         nkeys = 1;
4658                 vsd_destructor =
4659                     (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4660                     (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4661                     (size_t)(nkeys * sizeof (void (*)(void *))));
4662                 vsd_nkeys = nkeys;
4663         }
4664 
4665         /*
4666          * allocate the next available unused key
4667          */
4668         vsd_destructor[i] = destructor;
4669         *keyp = i + 1;
4670 
4671         /* create vsd_list, if it doesn't exist */
4672         if (vsd_list == NULL) {
4673                 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4674                 list_create(vsd_list, sizeof (struct vsd_node),
4675                     offsetof(struct vsd_node, vs_nodes));
4676         }
4677 
4678         mutex_exit(&vsd_lock);
4679 }
4680 
4681 /*
4682  * Destroy a key
4683  *
4684  * Assumes that the caller is preventing vsd_set and vsd_get
4685  * Locks out vsd_create, vsd_destroy, and vsd_free
4686  * May free memory with lock held
4687  */
4688 void
4689 vsd_destroy(uint_t *keyp)
4690 {
4691         uint_t key;
4692         struct vsd_node *vsd;
4693 
4694         /*
4695          * protect the key namespace and our destructor lists
4696          */
4697         mutex_enter(&vsd_lock);
4698         key = *keyp;
4699         *keyp = 0;
4700 
4701         ASSERT(key <= vsd_nkeys);
4702 
4703         /*
4704          * if the key is valid
4705          */
4706         if (key != 0) {
4707                 uint_t k = key - 1;
4708                 /*
4709                  * for every vnode with VSD, call key's destructor
4710                  */
4711                 for (vsd = list_head(vsd_list); vsd != NULL;
4712                     vsd = list_next(vsd_list, vsd)) {
4713                         /*
4714                          * no VSD for key in this vnode
4715                          */
4716                         if (key > vsd->vs_nkeys)
4717                                 continue;
4718                         /*
4719                          * call destructor for key
4720                          */
4721                         if (vsd->vs_value[k] && vsd_destructor[k])
4722                                 (*vsd_destructor[k])(vsd->vs_value[k]);
4723                         /*
4724                          * reset value for key
4725                          */
4726                         vsd->vs_value[k] = NULL;
4727                 }
4728                 /*
4729                  * actually free the key (NULL destructor == unused)
4730                  */
4731                 vsd_destructor[k] = NULL;
4732         }
4733 
4734         mutex_exit(&vsd_lock);
4735 }
4736 
4737 /*
4738  * Quickly return the per vnode value that was stored with the specified key
4739  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4740  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4741  */
4742 void *
4743 vsd_get(vnode_t *vp, uint_t key)
4744 {
4745         struct vsd_node *vsd;
4746 
4747         ASSERT(vp != NULL);
4748         ASSERT(mutex_owned(&vp->v_vsd_lock));
4749 
4750         vsd = vp->v_vsd;
4751 
4752         if (key && vsd != NULL && key <= vsd->vs_nkeys)
4753                 return (vsd->vs_value[key - 1]);
4754         return (NULL);
4755 }
4756 
4757 /*
4758  * Set a per vnode value indexed with the specified key
4759  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4760  */
4761 int
4762 vsd_set(vnode_t *vp, uint_t key, void *value)
4763 {
4764         struct vsd_node *vsd;
4765 
4766         ASSERT(vp != NULL);
4767         ASSERT(mutex_owned(&vp->v_vsd_lock));
4768 
4769         if (key == 0)
4770                 return (EINVAL);
4771 
4772         vsd = vp->v_vsd;
4773         if (vsd == NULL)
4774                 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4775 
4776         /*
4777          * If the vsd was just allocated, vs_nkeys will be 0, so the following
4778          * code won't happen and we will continue down and allocate space for
4779          * the vs_value array.
4780          * If the caller is replacing one value with another, then it is up
4781          * to the caller to free/rele/destroy the previous value (if needed).
4782          */
4783         if (key <= vsd->vs_nkeys) {
4784                 vsd->vs_value[key - 1] = value;
4785                 return (0);
4786         }
4787 
4788         ASSERT(key <= vsd_nkeys);
4789 
4790         if (vsd->vs_nkeys == 0) {
4791                 mutex_enter(&vsd_lock);     /* lock out vsd_destroy() */
4792                 /*
4793                  * Link onto list of all VSD nodes.
4794                  */
4795                 list_insert_head(vsd_list, vsd);
4796                 mutex_exit(&vsd_lock);
4797         }
4798 
4799         /*
4800          * Allocate vnode local storage and set the value for key
4801          */
4802         vsd->vs_value = vsd_realloc(vsd->vs_value,
4803             vsd->vs_nkeys * sizeof (void *),
4804             key * sizeof (void *));
4805         vsd->vs_nkeys = key;
4806         vsd->vs_value[key - 1] = value;
4807 
4808         return (0);
4809 }
4810 
4811 /*
4812  * Called from vn_free() to run the destructor function for each vsd
4813  *      Locks out vsd_create and vsd_destroy
4814  *      Assumes that the destructor *DOES NOT* use vsd
4815  */
4816 void
4817 vsd_free(vnode_t *vp)
4818 {
4819         int i;
4820         struct vsd_node *vsd = vp->v_vsd;
4821 
4822         if (vsd == NULL)
4823                 return;
4824 
4825         if (vsd->vs_nkeys == 0) {
4826                 kmem_free(vsd, sizeof (*vsd));
4827                 vp->v_vsd = NULL;
4828                 return;
4829         }
4830 
4831         /*
4832          * lock out vsd_create and vsd_destroy, call
4833          * the destructor, and mark the value as destroyed.
4834          */
4835         mutex_enter(&vsd_lock);
4836 
4837         for (i = 0; i < vsd->vs_nkeys; i++) {
4838                 if (vsd->vs_value[i] && vsd_destructor[i])
4839                         (*vsd_destructor[i])(vsd->vs_value[i]);
4840                 vsd->vs_value[i] = NULL;
4841         }
4842 
4843         /*
4844          * remove from linked list of VSD nodes
4845          */
4846         list_remove(vsd_list, vsd);
4847 
4848         mutex_exit(&vsd_lock);
4849 
4850         /*
4851          * free up the VSD
4852          */
4853         kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4854         kmem_free(vsd, sizeof (struct vsd_node));
4855         vp->v_vsd = NULL;
4856 }
4857 
4858 /*
4859  * realloc
4860  */
4861 static void *
4862 vsd_realloc(void *old, size_t osize, size_t nsize)
4863 {
4864         void *new;
4865 
4866         new = kmem_zalloc(nsize, KM_SLEEP);
4867         if (old) {
4868                 bcopy(old, new, osize);
4869                 kmem_free(old, osize);
4870         }
4871         return (new);
4872 }
4873 
4874 /*
4875  * Setup the extensible system attribute for creating a reparse point.
4876  * The symlink data 'target' is validated for proper format of a reparse
4877  * string and a check also made to make sure the symlink data does not
4878  * point to an existing file.
4879  *
4880  * return 0 if ok else -1.
4881  */
4882 static int
4883 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4884 {
4885         xoptattr_t *xoap;
4886 
4887         if ((!target) || (!vap) || (!xvattr))
4888                 return (-1);
4889 
4890         /* validate reparse string */
4891         if (reparse_validate((const char *)target))
4892                 return (-1);
4893 
4894         xva_init(xvattr);
4895         xvattr->xva_vattr = *vap;
4896         xvattr->xva_vattr.va_mask |= AT_XVATTR;
4897         xoap = xva_getxoptattr(xvattr);
4898         ASSERT(xoap);
4899         XVA_SET_REQ(xvattr, XAT_REPARSE);
4900         xoap->xoa_reparse = 1;
4901 
4902         return (0);
4903 }
4904 
4905 /*
4906  * Function to check whether a symlink is a reparse point.
4907  * Return B_TRUE if it is a reparse point, else return B_FALSE
4908  */
4909 boolean_t
4910 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4911 {
4912         xvattr_t xvattr;
4913         xoptattr_t *xoap;
4914 
4915         if ((vp->v_type != VLNK) ||
4916             !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4917                 return (B_FALSE);
4918 
4919         xva_init(&xvattr);
4920         xoap = xva_getxoptattr(&xvattr);
4921         ASSERT(xoap);
4922         XVA_SET_REQ(&xvattr, XAT_REPARSE);
4923 
4924         if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4925                 return (B_FALSE);
4926 
4927         if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4928             (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4929                 return (B_FALSE);
4930 
4931         return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4932 }