1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/t_lock.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
45 #include <sys/user.h>
46 #include <sys/uio.h>
47 #include <sys/file.h>
48 #include <sys/pathname.h>
49 #include <sys/vfs.h>
50 #include <sys/vfs_opreg.h>
51 #include <sys/vnode.h>
52 #include <sys/rwstlock.h>
53 #include <sys/fem.h>
54 #include <sys/stat.h>
55 #include <sys/mode.h>
56 #include <sys/conf.h>
57 #include <sys/sysmacros.h>
58 #include <sys/cmn_err.h>
59 #include <sys/systm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <c2/audit.h>
63 #include <sys/acl.h>
64 #include <sys/nbmlock.h>
65 #include <sys/fcntl.h>
66 #include <fs/fs_subr.h>
67 #include <sys/taskq.h>
68 #include <fs/fs_reparse.h>
69
70 /* Determine if this vnode is a file that is read-only */
71 #define ISROFILE(vp) \
72 ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
73 (vp)->v_type != VFIFO && vn_is_readonly(vp))
74
75 /* Tunable via /etc/system; used only by admin/install */
76 int nfs_global_client_only;
77
78 /*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as the vfssw table and parallels it.  (Arguably, it
 * could be part of the vfssw table.)  Once it's initialized, it's accessed
 * using the same fstype index that is used to index into the vfssw table.
83 */
84 vopstats_t **vopstats_fstype;
85
86 /* vopstats initialization template used for fast initialization via bcopy() */
87 static vopstats_t *vs_templatep;
88
89 /* Kmem cache handle for vsk_anchor_t allocations */
90 kmem_cache_t *vsk_anchor_cache;
91
92 /* file events cleanup routine */
93 extern void free_fopdata(vnode_t *);
94
95 /*
96 * Root of AVL tree for the kstats associated with vopstats. Lock protects
 * updates to vskstat_tree.
98 */
99 avl_tree_t vskstat_tree;
100 kmutex_t vskstat_tree_lock;
101
102 /* Global variable which enables/disables the vopstats collection */
103 int vopstats_enabled = 1;
104
105 /*
106 * forward declarations for internal vnode specific data (vsd)
107 */
108 static void *vsd_realloc(void *, size_t, size_t);
109
110 /*
111 * forward declarations for reparse point functions
112 */
113 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
114
115 /*
116 * VSD -- VNODE SPECIFIC DATA
117 * The v_data pointer is typically used by a file system to store a
118 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
119 * However, there are times when additional project private data needs
120 * to be stored separately from the data (node) pointed to by v_data.
121 * This additional data could be stored by the file system itself or
122 * by a completely different kernel entity. VSD provides a way for
123 * callers to obtain a key and store a pointer to private data associated
124 * with a vnode.
125 *
126 * Callers are responsible for protecting the vsd by holding v_vsd_lock
127 * for calls to vsd_set() and vsd_get().
128 */
129
130 /*
131 * vsd_lock protects:
132 * vsd_nkeys - creation and deletion of vsd keys
133 * vsd_list - insertion and deletion of vsd_node in the vsd_list
134 * vsd_destructor - adding and removing destructors to the list
135 */
136 static kmutex_t vsd_lock;
137 static uint_t vsd_nkeys; /* size of destructor array */
138 /* list of vsd_node's */
139 static list_t *vsd_list = NULL;
140 /* per-key destructor funcs */
141 static void (**vsd_destructor)(void *);
142
143 /*
144 * The following is the common set of actions needed to update the
145 * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and
146 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
147 * recording of the bytes transferred. Since the code is similar
148 * but small, it is nearly a duplicate. Consequently any changes
149 * to one may need to be reflected in the other.
150 * Rundown of the variables:
151 * vp - Pointer to the vnode
152 * counter - Partial name structure member to update in vopstats for counts
153 * bytecounter - Partial name structure member to update in vopstats for bytes
154 * bytesval - Value to update in vopstats for bytes
155 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
156 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
157 */
158
159 #define VOPSTATS_UPDATE(vp, counter) { \
160 vfs_t *vfsp = (vp)->v_vfsp; \
161 if (vfsp && vfsp->vfs_implp && \
162 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
163 vopstats_t *vsp = &vfsp->vfs_vopstats; \
164 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
165 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
166 size_t, uint64_t *); \
167 __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \
168 (*stataddr)++; \
169 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
170 vsp->n##counter.value.ui64++; \
171 } \
172 } \
173 }
174
175 #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \
176 vfs_t *vfsp = (vp)->v_vfsp; \
177 if (vfsp && vfsp->vfs_implp && \
178 (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \
179 vopstats_t *vsp = &vfsp->vfs_vopstats; \
180 uint64_t *stataddr = &(vsp->n##counter.value.ui64); \
181 extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
182 size_t, uint64_t *); \
183 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
184 (*stataddr)++; \
185 vsp->bytecounter.value.ui64 += bytesval; \
186 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
187 vsp->n##counter.value.ui64++; \
188 vsp->bytecounter.value.ui64 += bytesval; \
189 } \
190 } \
191 }
192
193 /*
 * If the filesystem does not support XIDs, map the credential.
 * If the vfsp is NULL, perhaps we should also map?
196 */
197 #define VOPXID_MAP_CR(vp, cr) { \
198 vfs_t *vfsp = (vp)->v_vfsp; \
199 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \
200 cr = crgetmapped(cr); \
201 }
202
203 #define VOP_LATENCY_10MS 10000000
204 #define VOP_LATENCY_100MS 100000000
205 #define VOP_LATENCY_1S 1000000000
206
207 /*
208 * Convert stat(2) formats to vnode types and vice versa. (Knows about
209 * numerical order of S_IFMT and vnode types.)
210 */
211 enum vtype iftovt_tab[] = {
212 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
213 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
214 };
215
216 ushort_t vttoif_tab[] = {
217 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
218 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
219 };
220
221 /*
222 * The system vnode cache.
223 */
224
225 kmem_cache_t *vn_cache;
226
227
228 /*
229 * Vnode operations vector.
230 */
231
232 static const fs_operation_trans_def_t vn_ops_table[] = {
233 VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
234 fs_nosys, fs_nosys,
235
236 VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
237 fs_nosys, fs_nosys,
238
239 VOPNAME_READ, offsetof(struct vnodeops, vop_read),
240 fs_nosys, fs_nosys,
241
242 VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
243 fs_nosys, fs_nosys,
244
245 VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
246 fs_nosys, fs_nosys,
247
248 VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
249 fs_setfl, fs_nosys,
250
251 VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
252 fs_nosys, fs_nosys,
253
254 VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
255 fs_nosys, fs_nosys,
256
257 VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
258 fs_nosys, fs_nosys,
259
260 VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
261 fs_nosys, fs_nosys,
262
263 VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
264 fs_nosys, fs_nosys,
265
266 VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
267 fs_nosys, fs_nosys,
268
269 VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
270 fs_nosys, fs_nosys,
271
272 VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
273 fs_nosys, fs_nosys,
274
275 VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
276 fs_nosys, fs_nosys,
277
278 VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
279 fs_nosys, fs_nosys,
280
281 VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
282 fs_nosys, fs_nosys,
283
284 VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
285 fs_nosys, fs_nosys,
286
287 VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
288 fs_nosys, fs_nosys,
289
290 VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
291 fs_nosys, fs_nosys,
292
293 VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
294 fs_nosys, fs_nosys,
295
296 VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
297 fs_nosys, fs_nosys,
298
299 VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
300 fs_rwlock, fs_rwlock,
301
302 VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
303 (fs_generic_func_p) fs_rwunlock,
304 (fs_generic_func_p) fs_rwunlock, /* no errors allowed */
305
306 VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
307 fs_nosys, fs_nosys,
308
309 VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
310 fs_cmp, fs_cmp, /* no errors allowed */
311
312 VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
313 fs_frlock, fs_nosys,
314
315 VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
316 fs_nosys, fs_nosys,
317
318 VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
319 fs_nosys, fs_nosys,
320
321 VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
322 fs_nosys, fs_nosys,
323
324 VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
325 fs_nosys, fs_nosys,
326
327 VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
328 (fs_generic_func_p) fs_nosys_map,
329 (fs_generic_func_p) fs_nosys_map,
330
331 VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
332 (fs_generic_func_p) fs_nosys_addmap,
333 (fs_generic_func_p) fs_nosys_addmap,
334
335 VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
336 fs_nosys, fs_nosys,
337
338 VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
339 (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
340
341 VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
342 fs_nosys, fs_nosys,
343
344 VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
345 fs_pathconf, fs_nosys,
346
347 VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
348 fs_nosys, fs_nosys,
349
350 VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
351 fs_nosys, fs_nosys,
352
353 VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
354 (fs_generic_func_p) fs_dispose,
355 (fs_generic_func_p) fs_nodispose,
356
357 VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
358 fs_nosys, fs_nosys,
359
360 VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
361 fs_fab_acl, fs_nosys,
362
363 VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
364 fs_shrlock, fs_nosys,
365
366 VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
367 (fs_generic_func_p) fs_vnevent_nosupport,
368 (fs_generic_func_p) fs_vnevent_nosupport,
369
370 VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
371 fs_nosys, fs_nosys,
372
373 VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
374 fs_nosys, fs_nosys,
375
376 NULL, 0, NULL, NULL
377 };
378
379 /* Extensible attribute (xva) routines. */
380
381 /*
382 * Zero out the structure, set the size of the requested/returned bitmaps,
383 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
384 * to the returned attributes array.
385 */
386 void
387 xva_init(xvattr_t *xvap)
388 {
389 bzero(xvap, sizeof (xvattr_t));
390 xvap->xva_mapsize = XVA_MAPSIZE;
391 xvap->xva_magic = XVA_MAGIC;
392 xvap->xva_vattr.va_mask = AT_XVATTR;
393 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
394 }
395
396 /*
397 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
398 * structure. Otherwise, returns NULL.
399 */
400 xoptattr_t *
401 xva_getxoptattr(xvattr_t *xvap)
402 {
403 xoptattr_t *xoap = NULL;
404 if (xvap->xva_vattr.va_mask & AT_XVATTR)
405 xoap = &xvap->xva_xoptattrs;
406 return (xoap);
407 }
408
409 /*
410 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 * We compare on the f_fsid reported by VFS_STATVFS(), since that is also
 * what the kstat name is derived from.
413 */
414 static int
415 vska_compar(const void *n1, const void *n2)
416 {
417 int ret;
418 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
419 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
420
421 if (p1 < p2) {
422 ret = -1;
423 } else if (p1 > p2) {
424 ret = 1;
425 } else {
426 ret = 0;
427 }
428
429 return (ret);
430 }
431
432 /*
433 * Used to create a single template which will be bcopy()ed to a newly
434 * allocated vsanchor_combo_t structure in new_vsanchor(), below.
435 */
static vopstats_t *
create_vopstats_template(void)
438 {
439 vopstats_t *vsp;
440
441 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
442 bzero(vsp, sizeof (*vsp)); /* Start fresh */
443
444 /* VOP_OPEN */
445 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
446 /* VOP_CLOSE */
447 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
448 /* VOP_READ I/O */
449 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
450 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
451 /* VOP_WRITE I/O */
452 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
453 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
454 /* VOP_IOCTL */
455 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
456 /* VOP_SETFL */
457 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
458 /* VOP_GETATTR */
459 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
460 /* VOP_SETATTR */
461 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
462 /* VOP_ACCESS */
463 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
464 /* VOP_LOOKUP */
465 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
466 /* VOP_CREATE */
467 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
468 /* VOP_REMOVE */
469 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
470 /* VOP_LINK */
471 kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
472 /* VOP_RENAME */
473 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
474 /* VOP_MKDIR */
475 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
476 /* VOP_RMDIR */
477 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
478 /* VOP_READDIR I/O */
479 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
480 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
481 KSTAT_DATA_UINT64);
482 /* VOP_SYMLINK */
483 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
484 /* VOP_READLINK */
485 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
486 /* VOP_FSYNC */
487 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
488 /* VOP_INACTIVE */
489 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
490 /* VOP_FID */
491 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
492 /* VOP_RWLOCK */
493 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
494 /* VOP_RWUNLOCK */
495 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
496 /* VOP_SEEK */
497 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
498 /* VOP_CMP */
499 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
500 /* VOP_FRLOCK */
501 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
502 /* VOP_SPACE */
503 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
504 /* VOP_REALVP */
505 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
506 /* VOP_GETPAGE */
507 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
508 /* VOP_PUTPAGE */
509 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
510 /* VOP_MAP */
511 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
512 /* VOP_ADDMAP */
513 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
514 /* VOP_DELMAP */
515 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
516 /* VOP_POLL */
517 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
518 /* VOP_DUMP */
519 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
520 /* VOP_PATHCONF */
521 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
522 /* VOP_PAGEIO */
523 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
524 /* VOP_DUMPCTL */
525 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
526 /* VOP_DISPOSE */
527 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
528 /* VOP_SETSECATTR */
529 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
530 /* VOP_GETSECATTR */
531 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
532 /* VOP_SHRLOCK */
533 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
534 /* VOP_VNEVENT */
535 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
536 /* VOP_REQZCBUF */
537 kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
538 /* VOP_RETZCBUF */
539 kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
540
541 return (vsp);
542 }
543
544 /*
545 * Creates a kstat structure associated with a vopstats structure.
546 */
547 kstat_t *
548 new_vskstat(char *ksname, vopstats_t *vsp)
549 {
550 kstat_t *ksp;
551
552 if (!vopstats_enabled) {
553 return (NULL);
554 }
555
556 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
557 sizeof (vopstats_t)/sizeof (kstat_named_t),
558 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
559 if (ksp) {
560 ksp->ks_data = vsp;
561 kstat_install(ksp);
562 }
563
564 return (ksp);
565 }
566
567 /*
568 * Called from vfsinit() to initialize the support mechanisms for vopstats
569 */
void
vopstats_startup(void)
572 {
573 if (!vopstats_enabled)
574 return;
575
576 /*
577 * Creates the AVL tree which holds per-vfs vopstat anchors. This
578 * is necessary since we need to check if a kstat exists before we
579 * attempt to create it. Also, initialize its lock.
580 */
581 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
582 offsetof(vsk_anchor_t, vsk_node));
583 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
584
585 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
586 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
587 NULL, NULL, 0);
588
589 /*
590 * Set up the array of pointers for the vopstats-by-FS-type.
591 * The entries will be allocated/initialized as each file system
592 * goes through modload/mod_installfs.
593 */
594 vopstats_fstype = (vopstats_t **)kmem_zalloc(
595 (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
596
597 /* Set up the global vopstats initialization template */
598 vs_templatep = create_vopstats_template();
599 }
600
601 /*
 * We need to have all of the counters zeroed.
 * The initialization of the vopstats_t includes on the order of
 * 50 calls to kstat_named_init().  Rather than do that on every call,
 * we do it once in a template (vs_templatep) and then bcopy() it over.
606 */
607 void
608 initialize_vopstats(vopstats_t *vsp)
609 {
610 if (vsp == NULL)
611 return;
612
613 bcopy(vs_templatep, vsp, sizeof (vopstats_t));
614 }
615
616 /*
617 * If possible, determine which vopstats by fstype to use and
618 * return a pointer to the caller.
619 */
620 vopstats_t *
621 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
622 {
623 int fstype = 0; /* Index into vfssw[] */
624 vopstats_t *vsp = NULL;
625
626 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
627 !vopstats_enabled)
628 return (NULL);
629 /*
630 * Set up the fstype. We go to so much trouble because all versions
631 * of NFS use the same fstype in their vfs even though they have
632 * distinct entries in the vfssw[] table.
633 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
634 */
635 if (vswp) {
636 fstype = vswp - vfssw; /* Gets us the index */
637 } else {
638 fstype = vfsp->vfs_fstype;
639 }
640
641 /*
642 * Point to the per-fstype vopstats. The only valid values are
643 * non-zero positive values less than the number of vfssw[] table
644 * entries.
645 */
646 if (fstype > 0 && fstype < nfstype) {
647 vsp = vopstats_fstype[fstype];
648 }
649
650 return (vsp);
651 }
652
653 /*
654 * Generate a kstat name, create the kstat structure, and allocate a
655 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t
656 * to the caller. This must only be called from a mount.
657 */
658 vsk_anchor_t *
659 get_vskstat_anchor(vfs_t *vfsp)
660 {
661 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
662 statvfs64_t statvfsbuf; /* Needed to find f_fsid */
663 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */
664 kstat_t *ksp; /* Ptr to new kstat */
665 avl_index_t where; /* Location in the AVL tree */
666
667 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
668 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
669 return (NULL);
670
671 /* Need to get the fsid to build a kstat name */
672 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
673 /* Create a name for our kstats based on fsid */
674 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
675 VOPSTATS_STR, statvfsbuf.f_fsid);
676
677 /* Allocate and initialize the vsk_anchor_t */
678 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
679 bzero(vskp, sizeof (*vskp));
680 vskp->vsk_fsid = statvfsbuf.f_fsid;
681
682 mutex_enter(&vskstat_tree_lock);
683 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
684 avl_insert(&vskstat_tree, vskp, where);
685 mutex_exit(&vskstat_tree_lock);
686
687 /*
688 * Now that we've got the anchor in the AVL
689 * tree, we can create the kstat.
690 */
691 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
692 if (ksp) {
693 vskp->vsk_ksp = ksp;
694 }
695 } else {
696 /* Oops, found one! Release memory and lock. */
697 mutex_exit(&vskstat_tree_lock);
698 kmem_cache_free(vsk_anchor_cache, vskp);
699 vskp = NULL;
700 }
701 }
702 return (vskp);
703 }
704
705 /*
 * We're in the process of tearing down the vfs and need to clean up
707 * the data structures associated with the vopstats. Must only be called
708 * from dounmount().
709 */
710 void
711 teardown_vopstats(vfs_t *vfsp)
712 {
713 vsk_anchor_t *vskap;
714 avl_index_t where;
715
716 if (vfsp == NULL || vfsp->vfs_implp == NULL ||
717 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
718 return;
719
720 /* This is a safe check since VFS_STATS must be set (see above) */
721 if ((vskap = vfsp->vfs_vskap) == NULL)
722 return;
723
724 /* Whack the pointer right away */
725 vfsp->vfs_vskap = NULL;
726
727 /* Lock the tree, remove the node, and delete the kstat */
728 mutex_enter(&vskstat_tree_lock);
729 if (avl_find(&vskstat_tree, vskap, &where)) {
730 avl_remove(&vskstat_tree, vskap);
731 }
732
733 if (vskap->vsk_ksp) {
734 kstat_delete(vskap->vsk_ksp);
735 }
736 mutex_exit(&vskstat_tree_lock);
737
738 kmem_cache_free(vsk_anchor_cache, vskap);
739 }
740
741 /*
742 * Read or write a vnode. Called from kernel code.
743 */
744 int
745 vn_rdwr(
746 enum uio_rw rw,
747 struct vnode *vp,
748 caddr_t base,
749 ssize_t len,
750 offset_t offset,
751 enum uio_seg seg,
752 int ioflag,
753 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */
754 cred_t *cr,
755 ssize_t *residp)
756 {
757 struct uio uio;
758 struct iovec iov;
759 int error;
760 int in_crit = 0;
761
762 if (rw == UIO_WRITE && ISROFILE(vp))
763 return (EROFS);
764
765 if (len < 0)
766 return (EIO);
767
768 VOPXID_MAP_CR(vp, cr);
769
770 iov.iov_base = base;
771 iov.iov_len = len;
772 uio.uio_iov = &iov;
773 uio.uio_iovcnt = 1;
774 uio.uio_loffset = offset;
775 uio.uio_segflg = (short)seg;
776 uio.uio_resid = len;
777 uio.uio_llimit = ulimit;
778
779 /*
780 * We have to enter the critical region before calling VOP_RWLOCK
781 * to avoid a deadlock with ufs.
782 */
783 if (nbl_need_check(vp)) {
784 int svmand;
785
786 nbl_start_crit(vp, RW_READER);
787 in_crit = 1;
788 error = nbl_svmand(vp, cr, &svmand);
789 if (error != 0)
790 goto done;
791 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
792 uio.uio_offset, uio.uio_resid, svmand, NULL)) {
793 error = EACCES;
794 goto done;
795 }
796 }
797
798 (void) VOP_RWLOCK(vp,
799 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
800 if (rw == UIO_WRITE) {
801 uio.uio_fmode = FWRITE;
802 uio.uio_extflg = UIO_COPY_DEFAULT;
803 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
804 } else {
805 uio.uio_fmode = FREAD;
806 uio.uio_extflg = UIO_COPY_CACHED;
807 error = VOP_READ(vp, &uio, ioflag, cr, NULL);
808 }
809 VOP_RWUNLOCK(vp,
810 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
811 if (residp)
812 *residp = uio.uio_resid;
813 else if (uio.uio_resid)
814 error = EIO;
815
816 done:
817 if (in_crit)
818 nbl_end_crit(vp);
819 return (error);
820 }
821
822 /*
823 * Release a vnode. Call VOP_INACTIVE on last reference or
824 * decrement reference count.
825 *
826 * To avoid race conditions, the v_count is left at 1 for
827 * the call to VOP_INACTIVE. This prevents another thread
828 * from reclaiming and releasing the vnode *before* the
829 * VOP_INACTIVE routine has a chance to destroy the vnode.
830 * We can't have more than 1 thread calling VOP_INACTIVE
831 * on a vnode.
832 */
833 void
834 vn_rele(vnode_t *vp)
835 {
836 VERIFY(vp->v_count > 0);
837 mutex_enter(&vp->v_lock);
838 if (vp->v_count == 1) {
839 mutex_exit(&vp->v_lock);
840 VOP_INACTIVE(vp, CRED(), NULL);
841 return;
842 }
843 vp->v_count--;
844 mutex_exit(&vp->v_lock);
845 }
846
847 /*
848 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
849 * as a single reference, so v_count is not decremented until the last DNLC hold
850 * is released. This makes it possible to distinguish vnodes that are referenced
851 * only by the DNLC.
852 */
853 void
854 vn_rele_dnlc(vnode_t *vp)
855 {
856 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
857 mutex_enter(&vp->v_lock);
858 if (--vp->v_count_dnlc == 0) {
859 if (vp->v_count == 1) {
860 mutex_exit(&vp->v_lock);
861 VOP_INACTIVE(vp, CRED(), NULL);
862 return;
863 }
864 vp->v_count--;
865 }
866 mutex_exit(&vp->v_lock);
867 }
868
869 /*
870 * Like vn_rele() except that it clears v_stream under v_lock.
 * This is used by sockfs when it dismantles the association between
 * the sockfs node and the vnode in the underlying file system.
873 * v_lock has to be held to prevent a thread coming through the lookupname
874 * path from accessing a stream head that is going away.
875 */
876 void
877 vn_rele_stream(vnode_t *vp)
878 {
879 VERIFY(vp->v_count > 0);
880 mutex_enter(&vp->v_lock);
881 vp->v_stream = NULL;
882 if (vp->v_count == 1) {
883 mutex_exit(&vp->v_lock);
884 VOP_INACTIVE(vp, CRED(), NULL);
885 return;
886 }
887 vp->v_count--;
888 mutex_exit(&vp->v_lock);
889 }
890
891 static void
892 vn_rele_inactive(vnode_t *vp)
893 {
894 VOP_INACTIVE(vp, CRED(), NULL);
895 }
896
897 /*
898 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
899 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode.  Note, file systems
 * already have to handle the race where the vnode's count is incremented
 * before the inactive routine is called and does its locking.
903 *
904 * Warning: Excessive use of this routine can lead to performance problems.
905 * This is because taskqs throttle back allocation if too many are created.
906 */
907 void
908 vn_rele_async(vnode_t *vp, taskq_t *taskq)
909 {
910 VERIFY(vp->v_count > 0);
911 mutex_enter(&vp->v_lock);
912 if (vp->v_count == 1) {
913 mutex_exit(&vp->v_lock);
914 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
915 vp, TQ_SLEEP) != NULL);
916 return;
917 }
918 vp->v_count--;
919 mutex_exit(&vp->v_lock);
920 }
921
922 int
923 vn_open(
924 char *pnamep,
925 enum uio_seg seg,
926 int filemode,
927 int createmode,
928 struct vnode **vpp,
929 enum create crwhy,
930 mode_t umask)
931 {
932 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
933 umask, NULL, -1));
934 }
935
936
937 /*
938 * Open/create a vnode.
 * This may be called from within the kernel; the only known use of user
 * context is that the current user's credentials are used for permission
 * checks.  crwhy is defined iff filemode & FCREAT.
942 */
943 int
944 vn_openat(
945 char *pnamep,
946 enum uio_seg seg,
947 int filemode,
948 int createmode,
949 struct vnode **vpp,
950 enum create crwhy,
951 mode_t umask,
952 struct vnode *startvp,
953 int fd)
954 {
955 struct vnode *vp;
956 int mode;
957 int accessflags;
958 int error;
959 int in_crit = 0;
960 int open_done = 0;
961 int shrlock_done = 0;
962 struct vattr vattr;
963 enum symfollow follow;
964 int estale_retry = 0;
965 struct shrlock shr;
966 struct shr_locowner shr_own;
967
968 mode = 0;
969 accessflags = 0;
970 if (filemode & FREAD)
971 mode |= VREAD;
972 if (filemode & (FWRITE|FTRUNC))
973 mode |= VWRITE;
974 if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
975 mode |= VEXEC;
976
977 /* symlink interpretation */
978 if (filemode & FNOFOLLOW)
979 follow = NO_FOLLOW;
980 else
981 follow = FOLLOW;
982
983 if (filemode & FAPPEND)
984 accessflags |= V_APPEND;
985
986 top:
987 if (filemode & FCREAT) {
988 enum vcexcl excl;
989
990 /*
991 * Wish to create a file.
992 */
993 vattr.va_type = VREG;
994 vattr.va_mode = createmode;
995 vattr.va_mask = AT_TYPE|AT_MODE;
996 if (filemode & FTRUNC) {
997 vattr.va_size = 0;
998 vattr.va_mask |= AT_SIZE;
999 }
1000 if (filemode & FEXCL)
1001 excl = EXCL;
1002 else
1003 excl = NONEXCL;
1004
1005 if (error =
1006 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1007 (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1008 return (error);
1009 } else {
1010 /*
1011 * Wish to open a file. Just look it up.
1012 */
1013 if (error = lookupnameat(pnamep, seg, follow,
1014 NULLVPP, &vp, startvp)) {
1015 if ((error == ESTALE) &&
1016 fs_need_estale_retry(estale_retry++))
1017 goto top;
1018 return (error);
1019 }
1020
1021 /*
1022 * Get the attributes to check whether file is large.
1023 * We do this only if the FOFFMAX flag is not set and
1024 * only for regular files.
1025 */
1026
1027 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1028 vattr.va_mask = AT_SIZE;
1029 if ((error = VOP_GETATTR(vp, &vattr, 0,
1030 CRED(), NULL))) {
1031 goto out;
1032 }
1033 if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1034 /*
1035 * Large File API - regular open fails
1036 * if FOFFMAX flag is set in file mode
1037 */
1038 error = EOVERFLOW;
1039 goto out;
1040 }
1041 }
1042 /*
1043 * Can't write directories, active texts, or
1044 * read-only filesystems. Can't truncate files
1045 * on which mandatory locking is in effect.
1046 */
1047 if (filemode & (FWRITE|FTRUNC)) {
1048 /*
1049 * Allow writable directory if VDIROPEN flag is set.
1050 */
1051 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1052 error = EISDIR;
1053 goto out;
1054 }
1055 if (ISROFILE(vp)) {
1056 error = EROFS;
1057 goto out;
1058 }
1059 /*
1060 * Can't truncate files on which
1061 * sysv mandatory locking is in effect.
1062 */
1063 if (filemode & FTRUNC) {
1064 vnode_t *rvp;
1065
1066 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1067 rvp = vp;
1068 if (rvp->v_filocks != NULL) {
1069 vattr.va_mask = AT_MODE;
1070 if ((error = VOP_GETATTR(vp,
1071 &vattr, 0, CRED(), NULL)) == 0 &&
1072 MANDLOCK(vp, vattr.va_mode))
1073 error = EAGAIN;
1074 }
1075 }
1076 if (error)
1077 goto out;
1078 }
1079 /*
1080 * Check permissions.
1081 */
1082 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1083 goto out;
1084 /*
1085 * Require FSEARCH to return a directory.
1086 * Require FEXEC to return a regular file.
1087 */
1088 if ((filemode & FSEARCH) && vp->v_type != VDIR) {
1089 error = ENOTDIR;
1090 goto out;
1091 }
1092 if ((filemode & FEXEC) && vp->v_type != VREG) {
1093 error = ENOEXEC; /* XXX: error code? */
1094 goto out;
1095 }
1096 }
1097
1098 /*
1099 * Do remaining checks for FNOFOLLOW and FNOLINKS.
1100 */
1101 if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1102 error = ELOOP;
1103 goto out;
1104 }
1105 if (filemode & FNOLINKS) {
1106 vattr.va_mask = AT_NLINK;
1107 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1108 goto out;
1109 }
1110 if (vattr.va_nlink != 1) {
1111 error = EMLINK;
1112 goto out;
1113 }
1114 }
1115
1116 /*
1117 * Opening a socket corresponding to the AF_UNIX pathname
1118 * in the filesystem name space is not supported.
1119 * However, VSOCK nodes in namefs are supported in order
1120 * to make fattach work for sockets.
1121 *
1122 * XXX This uses VOP_REALVP to distinguish between
1123 * an unopened namefs node (where VOP_REALVP returns a
1124 * different VSOCK vnode) and a VSOCK created by vn_create
1125 * in some file system (where VOP_REALVP would never return
1126 * a different vnode).
1127 */
1128 if (vp->v_type == VSOCK) {
1129 struct vnode *nvp;
1130
1131 error = VOP_REALVP(vp, &nvp, NULL);
1132 if (error != 0 || nvp == NULL || nvp == vp ||
1133 nvp->v_type != VSOCK) {
1134 error = EOPNOTSUPP;
1135 goto out;
1136 }
1137 }
1138
1139 if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1140 /* get share reservation */
1141 shr.s_access = 0;
1142 if (filemode & FWRITE)
1143 shr.s_access |= F_WRACC;
1144 if (filemode & FREAD)
1145 shr.s_access |= F_RDACC;
1146 shr.s_deny = 0;
1147 shr.s_sysid = 0;
1148 shr.s_pid = ttoproc(curthread)->p_pid;
1149 shr_own.sl_pid = shr.s_pid;
1150 shr_own.sl_id = fd;
1151 shr.s_own_len = sizeof (shr_own);
1152 shr.s_owner = (caddr_t)&shr_own;
1153 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1154 NULL);
1155 if (error)
1156 goto out;
1157 shrlock_done = 1;
1158
1159 /* nbmand conflict check if truncating file */
1160 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1161 nbl_start_crit(vp, RW_READER);
1162 in_crit = 1;
1163
1164 vattr.va_mask = AT_SIZE;
1165 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1166 goto out;
1167 if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1168 NULL)) {
1169 error = EACCES;
1170 goto out;
1171 }
1172 }
1173 }
1174
1175 /*
1176 * Do opening protocol.
1177 */
1178 error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1179 if (error)
1180 goto out;
1181 open_done = 1;
1182
1183 /*
1184 * Truncate if required.
1185 */
1186 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1187 vattr.va_size = 0;
1188 vattr.va_mask = AT_SIZE;
1189 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1190 goto out;
1191 }
1192 out:
1193 ASSERT(vp->v_count > 0);
1194
1195 if (in_crit) {
1196 nbl_end_crit(vp);
1197 in_crit = 0;
1198 }
1199 if (error) {
1200 if (open_done) {
1201 (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1202 NULL);
1203 open_done = 0;
1204 shrlock_done = 0;
1205 }
1206 if (shrlock_done) {
1207 (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1208 NULL);
1209 shrlock_done = 0;
1210 }
1211
1212 /*
1213 * The following clause was added to handle a problem
1214 * with NFS consistency. It is possible that a lookup
1215 * of the file to be opened succeeded, but the file
1216 * itself doesn't actually exist on the server. This
1217 * is chiefly due to the DNLC containing an entry for
1218 * the file which has been removed on the server. In
1219 * this case, we just start over. If there was some
1220 * other cause for the ESTALE error, then the lookup
1221 * of the file will fail and the error will be returned
1222 * above instead of looping around from here.
1223 */
1224 VN_RELE(vp);
1225 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1226 goto top;
1227 } else
1228 *vpp = vp;
1229 return (error);
1230 }
1231
1232 /*
1233 * The following two accessor functions are for the NFSv4 server. Since there
1234 * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1235 * vnode open counts correct when a client "upgrades" an open or does an
1236 * open_downgrade. In NFS, an upgrade or downgrade can not only change the
1237 * open mode (add or subtract read or write), but also change the share/deny
1238 * modes. However, share reservations are not integrated with OPEN, yet, so
1239 * we need to handle each separately. These functions are cleaner than having
1240 * the NFS server manipulate the counts directly, however, nobody else should
1241 * use these functions.
1242 */
1243 void
1244 vn_open_upgrade(
1245 vnode_t *vp,
1246 int filemode)
1247 {
1248 ASSERT(vp->v_type == VREG);
1249
1250 if (filemode & FREAD)
1251 atomic_inc_32(&vp->v_rdcnt);
1252 if (filemode & FWRITE)
1253 atomic_inc_32(&vp->v_wrcnt);
1254
1255 }
1256
1257 void
1258 vn_open_downgrade(
1259 vnode_t *vp,
1260 int filemode)
1261 {
1262 ASSERT(vp->v_type == VREG);
1263
1264 if (filemode & FREAD) {
1265 ASSERT(vp->v_rdcnt > 0);
1266 atomic_dec_32(&vp->v_rdcnt);
1267 }
1268 if (filemode & FWRITE) {
1269 ASSERT(vp->v_wrcnt > 0);
1270 atomic_dec_32(&vp->v_wrcnt);
1271 }
1272
1273 }
1274
1275 int
1276 vn_create(
1277 char *pnamep,
1278 enum uio_seg seg,
1279 struct vattr *vap,
1280 enum vcexcl excl,
1281 int mode,
1282 struct vnode **vpp,
1283 enum create why,
1284 int flag,
1285 mode_t umask)
1286 {
1287 return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1288 umask, NULL));
1289 }
1290
1291 /*
1292 * Create a vnode (makenode).
1293 */
1294 int
1295 vn_createat(
1296 char *pnamep,
1297 enum uio_seg seg,
1298 struct vattr *vap,
1299 enum vcexcl excl,
1300 int mode,
1301 struct vnode **vpp,
1302 enum create why,
1303 int flag,
1304 mode_t umask,
1305 struct vnode *startvp)
1306 {
1307 struct vnode *dvp; /* ptr to parent dir vnode */
1308 struct vnode *vp = NULL;
1309 struct pathname pn;
1310 int error;
1311 int in_crit = 0;
1312 struct vattr vattr;
1313 enum symfollow follow;
1314 int estale_retry = 0;
1315 uint32_t auditing = AU_AUDITING();
1316
1317 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1318
1319 /* symlink interpretation */
1320 if ((flag & FNOFOLLOW) || excl == EXCL)
1321 follow = NO_FOLLOW;
1322 else
1323 follow = FOLLOW;
1324 flag &= ~(FNOFOLLOW|FNOLINKS);
1325
1326 top:
1327 /*
1328 * Lookup directory.
1329 * If new object is a file, call lower level to create it.
1330 * Note that it is up to the lower level to enforce exclusive
1331 * creation, if the file is already there.
1332 * This allows the lower level to do whatever
1333 * locking or protocol that is needed to prevent races.
1334 * If the new object is directory call lower level to make
1335 * the new directory, with "." and "..".
1336 */
1337 if (error = pn_get(pnamep, seg, &pn))
1338 return (error);
1339 if (auditing)
1340 audit_vncreate_start();
1341 dvp = NULL;
1342 *vpp = NULL;
1343 /*
1344 * lookup will find the parent directory for the vnode.
1345 * When it is done the pn holds the name of the entry
1346 * in the directory.
1347 * If this is a non-exclusive create we also find the node itself.
1348 */
1349 error = lookuppnat(&pn, NULL, follow, &dvp,
1350 (excl == EXCL) ? NULLVPP : vpp, startvp);
1351 if (error) {
1352 pn_free(&pn);
1353 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1354 goto top;
1355 if (why == CRMKDIR && error == EINVAL)
1356 error = EEXIST; /* SVID */
1357 return (error);
1358 }
1359
1360 if (why != CRMKNOD)
1361 vap->va_mode &= ~VSVTX;
1362
1363 /*
1364 * If default ACLs are defined for the directory don't apply the
1365 * umask if umask is passed.
1366 */
1367
1368 if (umask) {
1369
1370 vsecattr_t vsec;
1371
1372 vsec.vsa_aclcnt = 0;
1373 vsec.vsa_aclentp = NULL;
1374 vsec.vsa_dfaclcnt = 0;
1375 vsec.vsa_dfaclentp = NULL;
1376 vsec.vsa_mask = VSA_DFACLCNT;
1377 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1378 /*
1379 * If error is ENOSYS then treat it as no error
1380 * Don't want to force all file systems to support
1381 * aclent_t style of ACL's.
1382 */
1383 if (error == ENOSYS)
1384 error = 0;
1385 if (error) {
1386 if (*vpp != NULL)
1387 VN_RELE(*vpp);
1388 goto out;
1389 } else {
1390 /*
1391 * Apply the umask if no default ACLs.
1392 */
1393 if (vsec.vsa_dfaclcnt == 0)
1394 vap->va_mode &= ~umask;
1395
1396 /*
1397 * VOP_GETSECATTR() may have allocated memory for
1398 * ACLs we didn't request, so double-check and
1399 * free it if necessary.
1400 */
1401 if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1402 kmem_free((caddr_t)vsec.vsa_aclentp,
1403 vsec.vsa_aclcnt * sizeof (aclent_t));
1404 if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1405 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1406 vsec.vsa_dfaclcnt * sizeof (aclent_t));
1407 }
1408 }
1409
1410 /*
1411 * In general we want to generate EROFS if the file system is
1412 * readonly. However, POSIX (IEEE Std. 1003.1) section 5.3.1
1413 * documents the open system call, and it says that O_CREAT has no
1414 * effect if the file already exists. Bug 1119649 states
1415 * that open(path, O_CREAT, ...) fails when attempting to open an
1416 * existing file on a read only file system. Thus, the first part
1417 * of the following if statement has 3 checks:
1418 * if the file exists &&
1419 * it is being open with write access &&
1420 * the file system is read only
1421 * then generate EROFS
1422 */
1423 if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1424 (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1425 if (*vpp)
1426 VN_RELE(*vpp);
1427 error = EROFS;
1428 } else if (excl == NONEXCL && *vpp != NULL) {
1429 vnode_t *rvp;
1430
1431 /*
1432 * File already exists. If a mandatory lock has been
1433 * applied, return error.
1434 */
1435 vp = *vpp;
1436 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1437 rvp = vp;
1438 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1439 nbl_start_crit(vp, RW_READER);
1440 in_crit = 1;
1441 }
1442 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1443 vattr.va_mask = AT_MODE|AT_SIZE;
1444 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1445 goto out;
1446 }
1447 if (MANDLOCK(vp, vattr.va_mode)) {
1448 error = EAGAIN;
1449 goto out;
1450 }
1451 /*
1452 * File cannot be truncated if non-blocking mandatory
1453 * locks are currently on the file.
1454 */
1455 if ((vap->va_mask & AT_SIZE) && in_crit) {
1456 u_offset_t offset;
1457 ssize_t length;
1458
1459 offset = vap->va_size > vattr.va_size ?
1460 vattr.va_size : vap->va_size;
1461 length = vap->va_size > vattr.va_size ?
1462 vap->va_size - vattr.va_size :
1463 vattr.va_size - vap->va_size;
1464 if (nbl_conflict(vp, NBL_WRITE, offset,
1465 length, 0, NULL)) {
1466 error = EACCES;
1467 goto out;
1468 }
1469 }
1470 }
1471
1472 /*
1473 * If the file is the root of a VFS, we've crossed a
1474 * mount point and the "containing" directory that we
1475 * acquired above (dvp) is irrelevant because it's in
1476 * a different file system. We apply VOP_CREATE to the
1477 * target itself instead of to the containing directory
1478 * and supply a null path name to indicate (conventionally)
1479 * the node itself as the "component" of interest.
1480 *
1481 * The intercession of the file system is necessary to
1482 * ensure that the appropriate permission checks are
1483 * done.
1484 */
1485 if (vp->v_flag & VROOT) {
1486 ASSERT(why != CRMKDIR);
1487 error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1488 CRED(), flag, NULL, NULL);
1489 /*
1490 * If the create succeeded, it will have created
1491 * a new reference to the vnode. Give up the
1492 * original reference. The assertion should not
1493 * get triggered because NBMAND locks only apply to
1494 * VREG files. And if in_crit is non-zero for some
1495 * reason, detect that here, rather than when we
 * dereference a null vp.
1497 */
1498 ASSERT(in_crit == 0);
1499 VN_RELE(vp);
1500 vp = NULL;
1501 goto out;
1502 }
1503
1504 /*
1505 * Large File API - non-large open (FOFFMAX flag not set)
1506 * of regular file fails if the file size exceeds MAXOFF32_T.
1507 */
1508 if (why != CRMKDIR &&
1509 !(flag & FOFFMAX) &&
1510 (vp->v_type == VREG)) {
1511 vattr.va_mask = AT_SIZE;
1512 if ((error = VOP_GETATTR(vp, &vattr, 0,
1513 CRED(), NULL))) {
1514 goto out;
1515 }
1516 if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1517 error = EOVERFLOW;
1518 goto out;
1519 }
1520 }
1521 }
1522
1523 if (error == 0) {
1524 /*
1525 * Call mkdir() if specified, otherwise create().
1526 */
1527 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1528
1529 if (why == CRMKDIR)
1530 /*
1531 * N.B., if vn_createat() ever requests
1532 * case-insensitive behavior then it will need
1533 * to be passed to VOP_MKDIR(). VOP_CREATE()
1534 * will already get it via "flag"
1535 */
1536 error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1537 NULL, 0, NULL);
1538 else if (!must_be_dir)
1539 error = VOP_CREATE(dvp, pn.pn_path, vap,
1540 excl, mode, vpp, CRED(), flag, NULL, NULL);
1541 else
1542 error = ENOTDIR;
1543 }
1544
1545 out:
1546
1547 if (auditing)
1548 audit_vncreate_finish(*vpp, error);
1549 if (in_crit) {
1550 nbl_end_crit(vp);
1551 in_crit = 0;
1552 }
1553 if (vp != NULL) {
1554 VN_RELE(vp);
1555 vp = NULL;
1556 }
1557 pn_free(&pn);
1558 VN_RELE(dvp);
1559 /*
1560 * The following clause was added to handle a problem
1561 * with NFS consistency. It is possible that a lookup
1562 * of the file to be created succeeded, but the file
1563 * itself doesn't actually exist on the server. This
1564 * is chiefly due to the DNLC containing an entry for
1565 * the file which has been removed on the server. In
1566 * this case, we just start over. If there was some
1567 * other cause for the ESTALE error, then the lookup
1568 * of the file will fail and the error will be returned
1569 * above instead of looping around from here.
1570 */
1571 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1572 goto top;
1573 return (error);
1574 }
1575
1576 int
1577 vn_link(char *from, char *to, enum uio_seg seg)
1578 {
1579 return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1580 }
1581
1582 int
1583 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1584 vnode_t *tstartvp, char *to, enum uio_seg seg)
1585 {
1586 struct vnode *fvp; /* from vnode ptr */
1587 struct vnode *tdvp; /* to directory vnode ptr */
1588 struct pathname pn;
1589 int error;
1590 struct vattr vattr;
1591 dev_t fsid;
1592 int estale_retry = 0;
1593 uint32_t auditing = AU_AUDITING();
1594
1595 top:
1596 fvp = tdvp = NULL;
1597 if (error = pn_get(to, seg, &pn))
1598 return (error);
1599 if (auditing && fstartvp != NULL)
1600 audit_setfsat_path(1);
1601 if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1602 goto out;
1603 if (auditing && tstartvp != NULL)
1604 audit_setfsat_path(3);
1605 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1606 goto out;
1607 /*
 * Make sure both the source vnode and the target directory vnode are
 * in the same vfs and that the vfs is writable.
1610 */
1611 vattr.va_mask = AT_FSID;
1612 if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1613 goto out;
1614 fsid = vattr.va_fsid;
1615 vattr.va_mask = AT_FSID;
1616 if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1617 goto out;
1618 if (fsid != vattr.va_fsid) {
1619 error = EXDEV;
1620 goto out;
1621 }
1622 if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1623 error = EROFS;
1624 goto out;
1625 }
1626 /*
1627 * Do the link.
1628 */
1629 (void) pn_fixslash(&pn);
1630 error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1631 out:
1632 pn_free(&pn);
1633 if (fvp)
1634 VN_RELE(fvp);
1635 if (tdvp)
1636 VN_RELE(tdvp);
1637 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1638 goto top;
1639 return (error);
1640 }
1641
1642 int
1643 vn_rename(char *from, char *to, enum uio_seg seg)
1644 {
1645 return (vn_renameat(NULL, from, NULL, to, seg));
1646 }
1647
1648 int
1649 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1650 char *tname, enum uio_seg seg)
1651 {
1652 int error;
1653 struct vattr vattr;
1654 struct pathname fpn; /* from pathname */
1655 struct pathname tpn; /* to pathname */
1656 dev_t fsid;
1657 int in_crit_src, in_crit_targ;
1658 vnode_t *fromvp, *fvp;
1659 vnode_t *tovp, *targvp;
1660 int estale_retry = 0;
1661 uint32_t auditing = AU_AUDITING();
1662
1663 top:
1664 fvp = fromvp = tovp = targvp = NULL;
1665 in_crit_src = in_crit_targ = 0;
1666 /*
1667 * Get to and from pathnames.
1668 */
1669 if (error = pn_get(fname, seg, &fpn))
1670 return (error);
1671 if (error = pn_get(tname, seg, &tpn)) {
1672 pn_free(&fpn);
1673 return (error);
1674 }
1675
1676 /*
 * First we need to resolve the correct directories.
 * The passed-in directories may only be a starting point,
1679 * but we need the real directories the file(s) live in.
1680 * For example the fname may be something like usr/lib/sparc
1681 * and we were passed in the / directory, but we need to
1682 * use the lib directory for the rename.
1683 */
1684
1685 if (auditing && fdvp != NULL)
1686 audit_setfsat_path(1);
1687 /*
1688 * Lookup to and from directories.
1689 */
1690 if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1691 goto out;
1692 }
1693
1694 /*
1695 * Make sure there is an entry.
1696 */
1697 if (fvp == NULL) {
1698 error = ENOENT;
1699 goto out;
1700 }
1701
1702 if (auditing && tdvp != NULL)
1703 audit_setfsat_path(3);
1704 if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1705 goto out;
1706 }
1707
1708 /*
1709 * Make sure both the from vnode directory and the to directory
1710 * are in the same vfs and the to directory is writable.
 * We check fsids, not vfs pointers, so loopback fs works.
1712 */
1713 if (fromvp != tovp) {
1714 vattr.va_mask = AT_FSID;
1715 if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1716 goto out;
1717 fsid = vattr.va_fsid;
1718 vattr.va_mask = AT_FSID;
1719 if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1720 goto out;
1721 if (fsid != vattr.va_fsid) {
1722 error = EXDEV;
1723 goto out;
1724 }
1725 }
1726
1727 if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1728 error = EROFS;
1729 goto out;
1730 }
1731
1732 if (targvp && (fvp != targvp)) {
1733 nbl_start_crit(targvp, RW_READER);
1734 in_crit_targ = 1;
1735 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1736 error = EACCES;
1737 goto out;
1738 }
1739 }
1740
1741 if (nbl_need_check(fvp)) {
1742 nbl_start_crit(fvp, RW_READER);
1743 in_crit_src = 1;
1744 if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1745 error = EACCES;
1746 goto out;
1747 }
1748 }
1749
1750 /*
1751 * Do the rename.
1752 */
1753 (void) pn_fixslash(&tpn);
1754 error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1755 NULL, 0);
1756
1757 out:
1758 pn_free(&fpn);
1759 pn_free(&tpn);
1760 if (in_crit_src)
1761 nbl_end_crit(fvp);
1762 if (in_crit_targ)
1763 nbl_end_crit(targvp);
1764 if (fromvp)
1765 VN_RELE(fromvp);
1766 if (tovp)
1767 VN_RELE(tovp);
1768 if (targvp)
1769 VN_RELE(targvp);
1770 if (fvp)
1771 VN_RELE(fvp);
1772 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1773 goto top;
1774 return (error);
1775 }
1776
1777 /*
1778 * Remove a file or directory.
1779 */
1780 int
1781 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1782 {
1783 return (vn_removeat(NULL, fnamep, seg, dirflag));
1784 }
1785
1786 int
1787 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1788 {
1789 struct vnode *vp; /* entry vnode */
1790 struct vnode *dvp; /* ptr to parent dir vnode */
1791 struct vnode *coveredvp;
1792 struct pathname pn; /* name of entry */
1793 enum vtype vtype;
1794 int error;
1795 struct vfs *vfsp;
1796 struct vfs *dvfsp; /* ptr to parent dir vfs */
1797 int in_crit = 0;
1798 int estale_retry = 0;
1799
1800 top:
1801 if (error = pn_get(fnamep, seg, &pn))
1802 return (error);
1803 dvp = vp = NULL;
1804 if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1805 pn_free(&pn);
1806 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1807 goto top;
1808 return (error);
1809 }
1810
1811 /*
1812 * Make sure there is an entry.
1813 */
1814 if (vp == NULL) {
1815 error = ENOENT;
1816 goto out;
1817 }
1818
1819 vfsp = vp->v_vfsp;
1820 dvfsp = dvp->v_vfsp;
1821
1822 /*
1823 * If the named file is the root of a mounted filesystem, fail,
1824 * unless it's marked unlinkable. In that case, unmount the
1825 * filesystem and proceed to unlink the covered vnode. (If the
1826 * covered vnode is a directory, use rmdir instead of unlink,
1827 * to avoid file system corruption.)
1828 */
1829 if (vp->v_flag & VROOT) {
1830 if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1831 error = EBUSY;
1832 goto out;
1833 }
1834
1835 /*
1836 * Namefs specific code starts here.
1837 */
1838
1839 if (dirflag == RMDIRECTORY) {
1840 /*
 * The user called rmdir(2) on a file that has
 * had namefs mounted on top of it.  Since
 * namefs doesn't allow directories to be
 * mounted on other files, we know vp is not
 * of type VDIR, so fail the operation.
1846 */
1847 error = ENOTDIR;
1848 goto out;
1849 }
1850
1851 /*
 * If VROOT is still set after grabbing vp->v_lock,
 * no one has finished nm_unmount so far and coveredvp
 * is valid.
1855 * If we manage to grab vn_vfswlock(coveredvp) before releasing
1856 * vp->v_lock, any race window is eliminated.
1857 */
1858
1859 mutex_enter(&vp->v_lock);
1860 if ((vp->v_flag & VROOT) == 0) {
1861 /* Someone beat us to the unmount */
1862 mutex_exit(&vp->v_lock);
1863 error = EBUSY;
1864 goto out;
1865 }
1866 vfsp = vp->v_vfsp;
1867 coveredvp = vfsp->vfs_vnodecovered;
1868 ASSERT(coveredvp);
1869 /*
1870 * Note: Implementation of vn_vfswlock shows that ordering of
1871 * v_lock / vn_vfswlock is not an issue here.
1872 */
1873 error = vn_vfswlock(coveredvp);
1874 mutex_exit(&vp->v_lock);
1875
1876 if (error)
1877 goto out;
1878
1879 VN_HOLD(coveredvp);
1880 VN_RELE(vp);
1881 error = dounmount(vfsp, 0, CRED());
1882
1883 /*
1884 * Unmounted the namefs file system; now get
1885 * the object it was mounted over.
1886 */
1887 vp = coveredvp;
1888 /*
1889 * If namefs was mounted over a directory, then
1890 * we want to use rmdir() instead of unlink().
1891 */
1892 if (vp->v_type == VDIR)
1893 dirflag = RMDIRECTORY;
1894
1895 if (error)
1896 goto out;
1897 }
1898
1899 /*
 * Make sure the filesystem is writable.
1901 * We check the parent directory's vfs in case this is an lofs vnode.
1902 */
1903 if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
1904 error = EROFS;
1905 goto out;
1906 }
1907
1908 vtype = vp->v_type;
1909
1910 /*
1911 * If there is the possibility of an nbmand share reservation, make
1912 * sure it's okay to remove the file. Keep a reference to the
1913 * vnode, so that we can exit the nbl critical region after
1914 * calling VOP_REMOVE.
1915 * If there is no possibility of an nbmand share reservation,
1916 * release the vnode reference now. Filesystems like NFS may
1917 * behave differently if there is an extra reference, so get rid of
1918 * this one. Fortunately, we can't have nbmand mounts on NFS
1919 * filesystems.
1920 */
1921 if (nbl_need_check(vp)) {
1922 nbl_start_crit(vp, RW_READER);
1923 in_crit = 1;
1924 if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
1925 error = EACCES;
1926 goto out;
1927 }
1928 } else {
1929 VN_RELE(vp);
1930 vp = NULL;
1931 }
1932
1933 if (dirflag == RMDIRECTORY) {
1934 /*
1935 * Caller is using rmdir(2), which can only be applied to
1936 * directories.
1937 */
1938 if (vtype != VDIR) {
1939 error = ENOTDIR;
1940 } else {
1941 vnode_t *cwd;
1942 proc_t *pp = curproc;
1943
1944 mutex_enter(&pp->p_lock);
1945 cwd = PTOU(pp)->u_cdir;
1946 VN_HOLD(cwd);
1947 mutex_exit(&pp->p_lock);
1948 error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
1949 NULL, 0);
1950 VN_RELE(cwd);
1951 }
1952 } else {
1953 /*
1954 * Unlink(2) can be applied to anything.
1955 */
1956 error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
1957 }
1958
1959 out:
1960 pn_free(&pn);
1961 if (in_crit) {
1962 nbl_end_crit(vp);
1963 in_crit = 0;
1964 }
1965 if (vp != NULL)
1966 VN_RELE(vp);
1967 if (dvp != NULL)
1968 VN_RELE(dvp);
1969 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1970 goto top;
1971 return (error);
1972 }
1973
1974 /*
1975 * Utility function to compare equality of vnodes.
1976 * Compare the underlying real vnodes, if there are underlying vnodes.
1977 * This is a more thorough comparison than the VN_CMP() macro provides.
1978 */
1979 int
1980 vn_compare(vnode_t *vp1, vnode_t *vp2)
1981 {
1982 vnode_t *realvp;
1983
1984 if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1985 vp1 = realvp;
1986 if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1987 vp2 = realvp;
1988 return (VN_CMP(vp1, vp2));
1989 }
1990
1991 /*
 * The number of buckets to hash the locks into. This value must be a power
 * of 2 minus 1 (it is used as a mask) and should probably also be prime.
1994 */
1995 #define NUM_BUCKETS 1023
1996
1997 struct vn_vfslocks_bucket {
1998 kmutex_t vb_lock;
1999 vn_vfslocks_entry_t *vb_list;
2000 char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2001 };
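
/*
 * Note: the pad above keeps each bucket at 64 bytes, presumably so that a
 * bucket (and its vb_lock) occupies its own cache line and adjacent
 * buckets do not false-share.
 */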
2002
2003 /*
 * Total number of buckets will be NUM_BUCKETS + 1.
2005 */
2006
2007 #pragma align 64(vn_vfslocks_buckets)
2008 static struct vn_vfslocks_bucket vn_vfslocks_buckets[NUM_BUCKETS + 1];
2009
2010 #define VN_VFSLOCKS_SHIFT 9
2011
2012 #define VN_VFSLOCKS_HASH(vfsvpptr) \
2013 ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
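
/*
 * For illustration (hypothetical address): with VN_VFSLOCKS_SHIFT of 9 and
 * NUM_BUCKETS of 1023 (0x3ff), a vnode at address 0x30001234600 hashes to
 *
 *	(0x30001234600 >> 9) & 0x3ff == 0x1a3	(bucket 419)
 *
 * Discarding the low 9 bits ignores offsets within a vnode/vfs-sized
 * allocation, so nearby objects still spread across buckets.
 */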
2014
2015 /*
 * vn_vfslocks_getlock() uses a hash scheme to find (or create) the
 * rwstlock associated with the vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
2021 * vn_vfslocks_getlock() to be freed at a later
2022 * stage when the refcount drops to zero.
2023 */
2024
2025 vn_vfslocks_entry_t *
2026 vn_vfslocks_getlock(void *vfsvpptr)
2027 {
2028 struct vn_vfslocks_bucket *bp;
2029 vn_vfslocks_entry_t *vep;
2030 vn_vfslocks_entry_t *tvep;
2031
2032 ASSERT(vfsvpptr != NULL);
2033 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2034
2035 mutex_enter(&bp->vb_lock);
2036 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2037 if (vep->ve_vpvfs == vfsvpptr) {
2038 vep->ve_refcnt++;
2039 mutex_exit(&bp->vb_lock);
2040 return (vep);
2041 }
2042 }
2043 mutex_exit(&bp->vb_lock);
2044 vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2045 rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2046 vep->ve_vpvfs = (char *)vfsvpptr;
2047 vep->ve_refcnt = 1;
2048 mutex_enter(&bp->vb_lock);
2049 for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2050 if (tvep->ve_vpvfs == vfsvpptr) {
2051 tvep->ve_refcnt++;
2052 mutex_exit(&bp->vb_lock);
2053
2054 /*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
2057 */
2058 rwst_destroy(&vep->ve_lock);
2059 kmem_free(vep, sizeof (*vep));
2060 return (tvep);
2061 }
2062 }
2063 vep->ve_next = bp->vb_list;
2064 bp->vb_list = vep;
2065 mutex_exit(&bp->vb_lock);
2066 return (vep);
2067 }
2068
2069 void
2070 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2071 {
2072 struct vn_vfslocks_bucket *bp;
2073 vn_vfslocks_entry_t *vep;
2074 vn_vfslocks_entry_t *pvep;
2075
2076 ASSERT(vepent != NULL);
2077 ASSERT(vepent->ve_vpvfs != NULL);
2078
2079 bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2080
2081 mutex_enter(&bp->vb_lock);
2082 vepent->ve_refcnt--;
2083
2084 if ((int32_t)vepent->ve_refcnt < 0)
2085 cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2086
2087 if (vepent->ve_refcnt == 0) {
2088 for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2089 if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2090 if (bp->vb_list == vep)
2091 bp->vb_list = vep->ve_next;
2092 else {
2093 /* LINTED */
2094 pvep->ve_next = vep->ve_next;
2095 }
2096 mutex_exit(&bp->vb_lock);
2097 rwst_destroy(&vep->ve_lock);
2098 kmem_free(vep, sizeof (*vep));
2099 return;
2100 }
2101 pvep = vep;
2102 }
2103 cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2104 }
2105 mutex_exit(&bp->vb_lock);
2106 }
2107
2108 /*
2109 * vn_vfswlock_wait is used to implement a lock which is logically a writers
2110 * lock protecting the v_vfsmountedhere field.
2111 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2112 * except that it blocks to acquire the lock VVFSLOCK.
2113 *
2114 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock, i.e., vn_vfswlock().
2117 */
2118 int
2119 vn_vfswlock_wait(vnode_t *vp)
2120 {
2121 int retval;
2122 vn_vfslocks_entry_t *vpvfsentry;
2123 ASSERT(vp != NULL);
2124
2125 vpvfsentry = vn_vfslocks_getlock(vp);
2126 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2127
2128 if (retval == EINTR) {
2129 vn_vfslocks_rele(vpvfsentry);
2130 return (EINTR);
2131 }
2132 return (retval);
2133 }
2134
2135 int
2136 vn_vfsrlock_wait(vnode_t *vp)
2137 {
2138 int retval;
2139 vn_vfslocks_entry_t *vpvfsentry;
2140 ASSERT(vp != NULL);
2141
2142 vpvfsentry = vn_vfslocks_getlock(vp);
2143 retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2144
2145 if (retval == EINTR) {
2146 vn_vfslocks_rele(vpvfsentry);
2147 return (EINTR);
2148 }
2149
2150 return (retval);
}

/*
2155 * vn_vfswlock is used to implement a lock which is logically a writers lock
2156 * protecting the v_vfsmountedhere field.
2157 */
2158 int
2159 vn_vfswlock(vnode_t *vp)
2160 {
2161 vn_vfslocks_entry_t *vpvfsentry;
2162
2163 /*
2164 * If vp is NULL then somebody is trying to lock the covered vnode
2165 * of /. (vfs_vnodecovered is NULL for /). This situation will
2166 * only happen when unmounting /. Since that operation will fail
2167 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2168 */
2169 if (vp == NULL)
2170 return (EBUSY);
2171
2172 vpvfsentry = vn_vfslocks_getlock(vp);
2173
2174 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2175 return (0);
2176
2177 vn_vfslocks_rele(vpvfsentry);
2178 return (EBUSY);
2179 }
2180
2181 int
2182 vn_vfsrlock(vnode_t *vp)
2183 {
2184 vn_vfslocks_entry_t *vpvfsentry;
2185
2186 /*
2187 * If vp is NULL then somebody is trying to lock the covered vnode
2188 * of /. (vfs_vnodecovered is NULL for /). This situation will
2189 * only happen when unmounting /. Since that operation will fail
2190 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2191 */
2192 if (vp == NULL)
2193 return (EBUSY);
2194
2195 vpvfsentry = vn_vfslocks_getlock(vp);
2196
2197 if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2198 return (0);
2199
2200 vn_vfslocks_rele(vpvfsentry);
2201 return (EBUSY);
2202 }
2203
2204 void
2205 vn_vfsunlock(vnode_t *vp)
2206 {
2207 vn_vfslocks_entry_t *vpvfsentry;
2208
2209 /*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release the reference taken by vn_vfslocks_getlock()
	 * 2. To release the reference taken by the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc.
2214 */
2215 vpvfsentry = vn_vfslocks_getlock(vp);
2216 vn_vfslocks_rele(vpvfsentry);
2217
2218 rwst_exit(&vpvfsentry->ve_lock);
2219 vn_vfslocks_rele(vpvfsentry);
2220 }
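
/*
 * A minimal sketch of the expected pairing (hypothetical caller): each
 * successful vn_vfswlock()/vn_vfsrlock() leaves one reference on the hash
 * entry, which vn_vfsunlock() drops along with the reference taken by its
 * own vn_vfslocks_getlock() call:
 *
 *	if (vn_vfswlock(vp) == 0) {
 *		... examine or update vp->v_vfsmountedhere ...
 *		vn_vfsunlock(vp);
 *	}
 */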
2221
2222 int
2223 vn_vfswlock_held(vnode_t *vp)
2224 {
2225 int held;
2226 vn_vfslocks_entry_t *vpvfsentry;
2227
2228 ASSERT(vp != NULL);
2229
2230 vpvfsentry = vn_vfslocks_getlock(vp);
2231 held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2232
2233 vn_vfslocks_rele(vpvfsentry);
2234 return (held);
}

int
2239 vn_make_ops(
2240 const char *name, /* Name of file system */
2241 const fs_operation_def_t *templ, /* Operation specification */
2242 vnodeops_t **actual) /* Return the vnodeops */
2243 {
2244 int unused_ops;
2245 int error;
2246
2247 *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2248
2249 (*actual)->vnop_name = name;
2250
2251 error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2252 if (error) {
2253 kmem_free(*actual, sizeof (vnodeops_t));
2254 }
2255
2256 #if DEBUG
2257 if (unused_ops != 0)
2258 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2259 "but not used", name, unused_ops);
2260 #endif
2261
2262 return (error);
2263 }
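
/*
 * A minimal sketch of typical use (hypothetical "myfs" names; error
 * handling elided). Operations absent from the template are given
 * defaults by fs_build_vector():
 *
 *	static const fs_operation_def_t myfs_vnodeops_template[] = {
 *		VOPNAME_OPEN,	{ .vop_open = myfs_open },
 *		VOPNAME_CLOSE,	{ .vop_close = myfs_close },
 *		NULL,		NULL
 *	};
 *
 *	(void) vn_make_ops("myfs", myfs_vnodeops_template, &myfs_vnodeops);
 */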
2264
2265 /*
2266 * Free the vnodeops created as a result of vn_make_ops()
2267 */
2268 void
2269 vn_freevnodeops(vnodeops_t *vnops)
2270 {
2271 kmem_free(vnops, sizeof (vnodeops_t));
2272 }
2273
2274 /*
2275 * Vnode cache.
2276 */
2277
2278 /* ARGSUSED */
2279 static int
2280 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2281 {
2282 struct vnode *vp;
2283
2284 vp = buf;
2285
2286 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2287 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2288 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2289 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2290 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2291 vp->v_path = NULL;
2292 vp->v_mpssdata = NULL;
2293 vp->v_vsd = NULL;
2294 vp->v_fopdata = NULL;
2295
2296 return (0);
2297 }
2298
2299 /* ARGSUSED */
2300 static void
2301 vn_cache_destructor(void *buf, void *cdrarg)
2302 {
2303 struct vnode *vp;
2304
2305 vp = buf;
2306
2307 rw_destroy(&vp->v_nbllock);
2308 cv_destroy(&vp->v_cv);
2309 mutex_destroy(&vp->v_vsd_lock);
2310 mutex_destroy(&vp->v_lock);
2311 }
2312
2313 void
2314 vn_create_cache(void)
2315 {
2316 /* LINTED */
2317 ASSERT((1 << VNODE_ALIGN_LOG2) ==
2318 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2319 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2320 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2321 NULL, 0);
2322 }
2323
2324 void
2325 vn_destroy_cache(void)
2326 {
2327 kmem_cache_destroy(vn_cache);
2328 }
2329
2330 /*
2331 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2332 * cached by the file system and vnodes remain associated.
2333 */
2334 void
2335 vn_recycle(vnode_t *vp)
2336 {
2337 ASSERT(vp->v_pages == NULL);
2338
2339 /*
2340 * XXX - This really belongs in vn_reinit(), but we have some issues
2341 * with the counts. Best to have it here for clean initialization.
2342 */
2343 vp->v_rdcnt = 0;
2344 vp->v_wrcnt = 0;
2345 vp->v_mmap_read = 0;
2346 vp->v_mmap_write = 0;
2347
2348 /*
2349 * If FEM was in use, make sure everything gets cleaned up
2350 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2351 * constructor.
2352 */
2353 if (vp->v_femhead) {
2354 /* XXX - There should be a free_femhead() that does all this */
2355 ASSERT(vp->v_femhead->femh_list == NULL);
2356 mutex_destroy(&vp->v_femhead->femh_lock);
2357 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2358 vp->v_femhead = NULL;
2359 }
2360 if (vp->v_path) {
2361 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2362 vp->v_path = NULL;
2363 }
2364
2365 if (vp->v_fopdata != NULL) {
2366 free_fopdata(vp);
2367 }
2368 vp->v_mpssdata = NULL;
2369 vsd_free(vp);
2370 }
2371
2372 /*
2373 * Used to reset the vnode fields including those that are directly accessible
2374 * as well as those which require an accessor function.
2375 *
2376 * Does not initialize:
2377 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2378 * v_data (since FS-nodes and vnodes point to each other and should
2379 * be updated simultaneously)
2380 * v_op (in case someone needs to make a VOP call on this object)
2381 */
2382 void
2383 vn_reinit(vnode_t *vp)
2384 {
2385 vp->v_count = 1;
2386 vp->v_count_dnlc = 0;
2387 vp->v_vfsp = NULL;
2388 vp->v_stream = NULL;
2389 vp->v_vfsmountedhere = NULL;
2390 vp->v_flag = 0;
2391 vp->v_type = VNON;
2392 vp->v_rdev = NODEV;
2393
2394 vp->v_filocks = NULL;
2395 vp->v_shrlocks = NULL;
2396 vp->v_pages = NULL;
2397
2398 vp->v_locality = NULL;
2399 vp->v_xattrdir = NULL;
2400
2401 /* Handles v_femhead, v_path, and the r/w/map counts */
2402 vn_recycle(vp);
2403 }
2404
2405 vnode_t *
2406 vn_alloc(int kmflag)
2407 {
2408 vnode_t *vp;
2409
2410 vp = kmem_cache_alloc(vn_cache, kmflag);
2411
2412 if (vp != NULL) {
2413 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2414 vp->v_fopdata = NULL;
2415 vn_reinit(vp);
2416 }
2417
2418 return (vp);
2419 }
2420
2421 void
2422 vn_free(vnode_t *vp)
2423 {
2424 ASSERT(vp->v_shrlocks == NULL);
2425 ASSERT(vp->v_filocks == NULL);
2426
2427 /*
2428 * Some file systems call vn_free() with v_count of zero,
2429 * some with v_count of 1. In any case, the value should
2430 * never be anything else.
2431 */
2432 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2433 ASSERT(vp->v_count_dnlc == 0);
2434 if (vp->v_path != NULL) {
2435 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2436 vp->v_path = NULL;
2437 }
2438
2439 /* If FEM was in use, make sure everything gets cleaned up */
2440 if (vp->v_femhead) {
2441 /* XXX - There should be a free_femhead() that does all this */
2442 ASSERT(vp->v_femhead->femh_list == NULL);
2443 mutex_destroy(&vp->v_femhead->femh_lock);
2444 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2445 vp->v_femhead = NULL;
2446 }
2447
2448 if (vp->v_fopdata != NULL) {
2449 free_fopdata(vp);
2450 }
2451 vp->v_mpssdata = NULL;
2452 vsd_free(vp);
2453 kmem_cache_free(vn_cache, vp);
2454 }
2455
2456 /*
 * Vnode status change notifications; these should define better states
 * than 1 and 0.
2458 */
2459 void
2460 vn_reclaim(vnode_t *vp)
2461 {
2462 vfs_t *vfsp = vp->v_vfsp;
2463
2464 if (vfsp == NULL ||
2465 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2466 return;
2467 }
2468 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2469 }
2470
2471 void
2472 vn_idle(vnode_t *vp)
2473 {
2474 vfs_t *vfsp = vp->v_vfsp;
2475
2476 if (vfsp == NULL ||
2477 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2478 return;
2479 }
2480 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
}

void
2483 vn_exists(vnode_t *vp)
2484 {
2485 vfs_t *vfsp = vp->v_vfsp;
2486
2487 if (vfsp == NULL ||
2488 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2489 return;
2490 }
2491 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2492 }
2493
2494 void
2495 vn_invalid(vnode_t *vp)
2496 {
2497 vfs_t *vfsp = vp->v_vfsp;
2498
2499 if (vfsp == NULL ||
2500 vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2501 return;
2502 }
2503 (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2504 }
2505
2506 /* Vnode event notification */
2507
2508 int
2509 vnevent_support(vnode_t *vp, caller_context_t *ct)
2510 {
2511 if (vp == NULL)
2512 return (EINVAL);
2513
2514 return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2515 }
2516
2517 void
2518 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2519 {
2520 if (vp == NULL || vp->v_femhead == NULL) {
2521 return;
2522 }
2523 (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2524 (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2525 }
2526
2527 void
2528 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2529 caller_context_t *ct)
2530 {
2531 if (vp == NULL || vp->v_femhead == NULL) {
2532 return;
2533 }
2534 (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2535 }
2536
2537 void
2538 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2539 caller_context_t *ct)
2540 {
2541 if (vp == NULL || vp->v_femhead == NULL) {
2542 return;
2543 }
2544 (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2545 }
2546
2547 void
2548 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2549 {
2550 if (vp == NULL || vp->v_femhead == NULL) {
2551 return;
2552 }
2553 (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2554 }
2555
2556 void
2557 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2558 {
2559 if (vp == NULL || vp->v_femhead == NULL) {
2560 return;
2561 }
2562 (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2563 }
2564
2565 void
2566 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2567 caller_context_t *ct)
2568 {
2569 if (vp == NULL || vp->v_femhead == NULL) {
2570 return;
2571 }
2572 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2573 }
2574
2575 void
2576 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2577 caller_context_t *ct)
2578 {
2579 if (vp == NULL || vp->v_femhead == NULL) {
2580 return;
2581 }
2582 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2583 }
2584
2585 void
2586 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2587 caller_context_t *ct)
2588 {
2589 if (vp == NULL || vp->v_femhead == NULL) {
2590 return;
2591 }
2592 (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2593 }
2594
2595 void
2596 vnevent_create(vnode_t *vp, caller_context_t *ct)
2597 {
2598 if (vp == NULL || vp->v_femhead == NULL) {
2599 return;
2600 }
2601 (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2602 }
2603
2604 void
2605 vnevent_link(vnode_t *vp, caller_context_t *ct)
2606 {
2607 if (vp == NULL || vp->v_femhead == NULL) {
2608 return;
2609 }
2610 (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2611 }
2612
2613 void
2614 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2615 {
2616 if (vp == NULL || vp->v_femhead == NULL) {
2617 return;
2618 }
2619 (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2620 }
2621
2622 void
2623 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2624 {
2625 if (vp == NULL || vp->v_femhead == NULL) {
2626 return;
2627 }
2628 (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2629 }
2630
2631 void
2632 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2633 {
2634 if (vp == NULL || vp->v_femhead == NULL) {
2635 return;
2636 }
2637 (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2638 }
2639
2640 /*
2641 * Vnode accessors.
2642 */
2643
2644 int
2645 vn_is_readonly(vnode_t *vp)
2646 {
2647 return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2648 }
2649
2650 int
2651 vn_has_flocks(vnode_t *vp)
2652 {
2653 return (vp->v_filocks != NULL);
2654 }
2655
2656 int
2657 vn_has_mandatory_locks(vnode_t *vp, int mode)
2658 {
2659 return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2660 }
2661
2662 int
2663 vn_has_cached_data(vnode_t *vp)
2664 {
2665 return (vp->v_pages != NULL);
2666 }
2667
2668 /*
2669 * Return 0 if the vnode in question shouldn't be permitted into a zone via
2670 * zone_enter(2).
2671 */
2672 int
2673 vn_can_change_zones(vnode_t *vp)
2674 {
2675 struct vfssw *vswp;
2676 int allow = 1;
2677 vnode_t *rvp;
2678
2679 if (nfs_global_client_only != 0)
2680 return (1);
2681
2682 /*
2683 * We always want to look at the underlying vnode if there is one.
2684 */
2685 if (VOP_REALVP(vp, &rvp, NULL) != 0)
2686 rvp = vp;
2687 /*
2688 * Some pseudo filesystems (including doorfs) don't actually register
2689 * their vfsops_t, so the following may return NULL; we happily let
2690 * such vnodes switch zones.
2691 */
2692 vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2693 if (vswp != NULL) {
2694 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2695 allow = 0;
2696 vfs_unrefvfssw(vswp);
2697 }
2698 return (allow);
2699 }
2700
2701 /*
2702 * Return nonzero if the vnode is a mount point, zero if not.
2703 */
2704 int
2705 vn_ismntpt(vnode_t *vp)
2706 {
2707 return (vp->v_vfsmountedhere != NULL);
2708 }
2709
2710 /* Retrieve the vfs (if any) mounted on this vnode */
2711 vfs_t *
2712 vn_mountedvfs(vnode_t *vp)
2713 {
2714 return (vp->v_vfsmountedhere);
2715 }
2716
2717 /*
2718 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2719 */
2720 int
2721 vn_in_dnlc(vnode_t *vp)
2722 {
2723 return (vp->v_count_dnlc > 0);
2724 }
2725
2726 /*
2727 * vn_has_other_opens() checks whether a particular file is opened by more than
2728 * just the caller and whether the open is for read and/or write.
 * This routine is intended to be called after the caller has already called
 * VOP_OPEN() and wishes to know whether it is the only one with the file
 * open for the mode(s) specified.
2732 *
2733 * Vnode counts are only kept on regular files (v_type=VREG).
2734 */
2735 int
2736 vn_has_other_opens(
2737 vnode_t *vp,
2738 v_mode_t mode)
2739 {
2740
2741 ASSERT(vp != NULL);
2742
2743 switch (mode) {
2744 case V_WRITE:
2745 if (vp->v_wrcnt > 1)
2746 return (V_TRUE);
2747 break;
2748 case V_RDORWR:
2749 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2750 return (V_TRUE);
2751 break;
2752 case V_RDANDWR:
2753 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2754 return (V_TRUE);
2755 break;
2756 case V_READ:
2757 if (vp->v_rdcnt > 1)
2758 return (V_TRUE);
2759 break;
2760 }
2761
2762 return (V_FALSE);
2763 }
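
/*
 * Sketch of a typical check (hypothetical caller): after its own
 * VOP_OPEN() for writing, a caller can verify that it holds the only
 * write-mode open before relying on exclusivity:
 *
 *	if (vn_has_other_opens(vp, V_WRITE) == V_FALSE) {
 *		... no other writers currently have vp open ...
 *	}
 */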
2764
2765 /*
2766 * vn_is_opened() checks whether a particular file is opened and
2767 * whether the open is for read and/or write.
2768 *
2769 * Vnode counts are only kept on regular files (v_type=VREG).
2770 */
2771 int
2772 vn_is_opened(
2773 vnode_t *vp,
2774 v_mode_t mode)
2775 {
2776
2777 ASSERT(vp != NULL);
2778
2779 switch (mode) {
2780 case V_WRITE:
2781 if (vp->v_wrcnt)
2782 return (V_TRUE);
2783 break;
2784 case V_RDANDWR:
2785 if (vp->v_rdcnt && vp->v_wrcnt)
2786 return (V_TRUE);
2787 break;
2788 case V_RDORWR:
2789 if (vp->v_rdcnt || vp->v_wrcnt)
2790 return (V_TRUE);
2791 break;
2792 case V_READ:
2793 if (vp->v_rdcnt)
2794 return (V_TRUE);
2795 break;
2796 }
2797
2798 return (V_FALSE);
2799 }
2800
2801 /*
2802 * vn_is_mapped() checks whether a particular file is mapped and whether
2803 * the file is mapped read and/or write.
2804 */
2805 int
2806 vn_is_mapped(
2807 vnode_t *vp,
2808 v_mode_t mode)
2809 {
2810
2811 ASSERT(vp != NULL);
2812
2813 #if !defined(_LP64)
2814 switch (mode) {
2815 /*
2816 * The atomic_add_64_nv functions force atomicity in the
2817 * case of 32 bit architectures. Otherwise the 64 bit values
2818 * require two fetches. The value of the fields may be
2819 * (potentially) changed between the first fetch and the
	 * second.
2821 */
2822 case V_WRITE:
2823 if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2824 return (V_TRUE);
2825 break;
2826 case V_RDANDWR:
2827 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2828 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2829 return (V_TRUE);
2830 break;
2831 case V_RDORWR:
2832 if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2833 (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2834 return (V_TRUE);
2835 break;
2836 case V_READ:
2837 if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2838 return (V_TRUE);
2839 break;
2840 }
2841 #else
2842 switch (mode) {
2843 case V_WRITE:
2844 if (vp->v_mmap_write)
2845 return (V_TRUE);
2846 break;
2847 case V_RDANDWR:
2848 if (vp->v_mmap_read && vp->v_mmap_write)
2849 return (V_TRUE);
2850 break;
2851 case V_RDORWR:
2852 if (vp->v_mmap_read || vp->v_mmap_write)
2853 return (V_TRUE);
2854 break;
2855 case V_READ:
2856 if (vp->v_mmap_read)
2857 return (V_TRUE);
2858 break;
2859 }
2860 #endif
2861
2862 return (V_FALSE);
2863 }
2864
2865 /*
2866 * Set the operations vector for a vnode.
2867 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed. This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since before we checked
 * the v_femhead pointer, then our update is safe: we are not racing
 * with FEM.
2873 */
2874 void
2875 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2876 {
2877 vnodeops_t *op;
2878
2879 ASSERT(vp != NULL);
2880 ASSERT(vnodeops != NULL);
2881
2882 op = vp->v_op;
2883 membar_consumer();
2884 /*
2885 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2886 * the compare-and-swap on vp->v_op. If either fails, then FEM is
2887 * in effect on the vnode and we need to have FEM deal with it.
2888 */
2889 if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2890 op) {
2891 fem_setvnops(vp, vnodeops);
2892 }
2893 }
2894
2895 /*
2896 * Retrieve the operations vector for a vnode
 * As with vn_setops() above, make sure we aren't racing with FEM.
2898 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2899 * make sense to the callers of this routine.
2900 */
2901 vnodeops_t *
2902 vn_getops(vnode_t *vp)
2903 {
2904 vnodeops_t *op;
2905
2906 ASSERT(vp != NULL);
2907
2908 op = vp->v_op;
2909 membar_consumer();
2910 if (vp->v_femhead == NULL && op == vp->v_op) {
2911 return (op);
2912 } else {
2913 return (fem_getvnops(vp));
2914 }
2915 }
2916
2917 /*
2918 * Returns non-zero (1) if the vnodeops matches that of the vnode.
2919 * Returns zero (0) if not.
2920 */
2921 int
2922 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2923 {
2924 return (vn_getops(vp) == vnodeops);
2925 }
2926
2927 /*
2928 * Returns non-zero (1) if the specified operation matches the
 * corresponding operation for the vnode.
2930 * Returns zero (0) if not.
2931 */
2932
2933 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
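/* The first-character test above is a cheap fast path before the strcmp(). */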
2934
2935 int
2936 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2937 {
2938 const fs_operation_trans_def_t *otdp;
2939 fs_generic_func_p *loc = NULL;
2940 vnodeops_t *vop = vn_getops(vp);
2941
2942 ASSERT(vopname != NULL);
2943
2944 for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2945 if (MATCHNAME(otdp->name, vopname)) {
2946 loc = (fs_generic_func_p *)
2947 ((char *)(vop) + otdp->offset);
2948 break;
2949 }
2950 }
2951
2952 return ((loc != NULL) && (*loc == funcp));
2953 }
2954
2955 /*
2956 * fs_new_caller_id() needs to return a unique ID on a given local system.
2957 * The IDs do not need to survive across reboots. These are primarily
2958 * used so that (FEM) monitors can detect particular callers (such as
2959 * the NFS server) to a given vnode/vfs operation.
2960 */
2961 u_longlong_t
2962 fs_new_caller_id()
2963 {
2964 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2965
2966 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2967 }
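
/*
 * Sketch of the intended use (hypothetical subsystem): obtain an ID once,
 * then tag each operation's caller_context_t with it so FEM monitors can
 * recognize the caller:
 *
 *	static u_longlong_t my_caller_id;
 *	...
 *	my_caller_id = fs_new_caller_id();
 *	...
 *	caller_context_t ct = { 0 };
 *	ct.cc_caller_id = my_caller_id;
 *	(void) VOP_READ(vp, &uio, 0, cr, &ct);
 */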
2968
2969 /*
2970 * Given a starting vnode and a path, updates the path in the target vnode in
2971 * a safe manner. If the vnode already has path information embedded, then the
2972 * cached path is left untouched.
2973 */
2974
2975 size_t max_vnode_path = 4 * MAXPATHLEN;
2976
2977 void
2978 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2979 const char *path, size_t plen)
2980 {
2981 char *rpath;
2982 vnode_t *base;
2983 size_t rpathlen, rpathalloc;
2984 int doslash = 1;
2985
2986 if (*path == '/') {
2987 base = rootvp;
2988 path++;
2989 plen--;
2990 } else {
2991 base = startvp;
2992 }
2993
2994 /*
2995 * We cannot grab base->v_lock while we hold vp->v_lock because of
2996 * the potential for deadlock.
2997 */
2998 mutex_enter(&base->v_lock);
2999 if (base->v_path == NULL) {
3000 mutex_exit(&base->v_lock);
3001 return;
3002 }
3003
3004 rpathlen = strlen(base->v_path);
3005 rpathalloc = rpathlen + plen + 1;
3006 /* Avoid adding a slash if there's already one there */
3007 if (base->v_path[rpathlen-1] == '/')
3008 doslash = 0;
3009 else
3010 rpathalloc++;
3011
3012 /*
3013 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3014 * so we must do this dance. If, by chance, something changes the path,
3015 * just give up since there is no real harm.
3016 */
3017 mutex_exit(&base->v_lock);
3018
3019 /* Paths should stay within reason */
3020 if (rpathalloc > max_vnode_path)
3021 return;
3022
3023 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3024
3025 mutex_enter(&base->v_lock);
3026 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3027 mutex_exit(&base->v_lock);
3028 kmem_free(rpath, rpathalloc);
3029 return;
3030 }
3031 bcopy(base->v_path, rpath, rpathlen);
3032 mutex_exit(&base->v_lock);
3033
3034 if (doslash)
3035 rpath[rpathlen++] = '/';
3036 bcopy(path, rpath + rpathlen, plen);
3037 rpath[rpathlen + plen] = '\0';
3038
3039 mutex_enter(&vp->v_lock);
3040 if (vp->v_path != NULL) {
3041 mutex_exit(&vp->v_lock);
3042 kmem_free(rpath, rpathalloc);
3043 } else {
3044 vp->v_path = rpath;
3045 mutex_exit(&vp->v_lock);
3046 }
3047 }
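
/*
 * Example (illustrative values): if base->v_path is "/export/home" and the
 * new component is "file", the resulting vp->v_path is "/export/home/file";
 * the doslash logic avoids producing "/export/home//file" when the base
 * path already ends in '/'.
 */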
3048
3049 /*
3050 * Sets the path to the vnode to be the given string, regardless of current
3051 * context. The string must be a complete path from rootdir. This is only used
3052 * by fsop_root() for setting the path based on the mountpoint.
3053 */
3054 void
3055 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3056 {
3057 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3058
3059 mutex_enter(&vp->v_lock);
3060 if (vp->v_path != NULL) {
3061 mutex_exit(&vp->v_lock);
3062 kmem_free(buf, len + 1);
3063 return;
3064 }
3065
3066 vp->v_path = buf;
3067 bcopy(str, vp->v_path, len);
3068 vp->v_path[len] = '\0';
3069
3070 mutex_exit(&vp->v_lock);
3071 }
3072
3073 /*
3074 * Called from within filesystem's vop_rename() to handle renames once the
3075 * target vnode is available.
3076 */
3077 void
3078 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3079 {
3080 char *tmp;
3081
3082 mutex_enter(&vp->v_lock);
3083 tmp = vp->v_path;
3084 vp->v_path = NULL;
3085 mutex_exit(&vp->v_lock);
3086 vn_setpath(rootdir, dvp, vp, nm, len);
3087 if (tmp != NULL)
3088 kmem_free(tmp, strlen(tmp) + 1);
3089 }
3090
3091 /*
3092 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to be the same as the source vnode.
3094 */
3095 void
3096 vn_copypath(struct vnode *src, struct vnode *dst)
3097 {
3098 char *buf;
3099 int alloc;
3100
3101 mutex_enter(&src->v_lock);
3102 if (src->v_path == NULL) {
3103 mutex_exit(&src->v_lock);
3104 return;
3105 }
3106 alloc = strlen(src->v_path) + 1;
3107
3108 /* avoid kmem_alloc() with lock held */
3109 mutex_exit(&src->v_lock);
3110 buf = kmem_alloc(alloc, KM_SLEEP);
3111 mutex_enter(&src->v_lock);
3112 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3113 mutex_exit(&src->v_lock);
3114 kmem_free(buf, alloc);
3115 return;
3116 }
3117 bcopy(src->v_path, buf, alloc);
3118 mutex_exit(&src->v_lock);
3119
3120 mutex_enter(&dst->v_lock);
3121 if (dst->v_path != NULL) {
3122 mutex_exit(&dst->v_lock);
3123 kmem_free(buf, alloc);
3124 return;
3125 }
3126 dst->v_path = buf;
3127 mutex_exit(&dst->v_lock);
3128 }
3129
3130 /*
3131 * XXX Private interface for segvn routines that handle vnode
3132 * large page segments.
3133 *
3134 * return 1 if vp's file system VOP_PAGEIO() implementation
3135 * can be safely used instead of VOP_GETPAGE() for handling
3136 * pagefaults against regular non swap files. VOP_PAGEIO()
3137 * interface is considered safe here if its implementation
3138 * is very close to VOP_GETPAGE() implementation.
 * e.g. it zeroes out the part of the page beyond EOF, doesn't
 * panic if there are file holes but instead returns an error, and
 * doesn't assume the file won't be changed by user writes, etc.
3142 *
3143 * return 0 otherwise.
3144 *
3145 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3146 */
3147 int
3148 vn_vmpss_usepageio(vnode_t *vp)
3149 {
3150 vfs_t *vfsp = vp->v_vfsp;
3151 char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3152 char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3153 char **fsok = pageio_ok_fss;
3154
3155 if (fsname == NULL) {
3156 return (0);
3157 }
3158
3159 for (; *fsok; fsok++) {
3160 if (strcmp(*fsok, fsname) == 0) {
3161 return (1);
3162 }
3163 }
3164 return (0);
3165 }
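
/*
 * Simplified sketch of the segvn-side decision this enables (hypothetical
 * caller; arguments abridged): pick the I/O entry point once, then fault
 * with it:
 *
 *	int use_pageio = vn_vmpss_usepageio(vp);
 *	...
 *	err = use_pageio ?
 *	    VOP_PAGEIO(vp, pp, off, len, B_READ, cr, NULL) :
 *	    VOP_GETPAGE(vp, off, len, &prot, pl, plsz, seg, addr,
 *		S_READ, cr, NULL);
 */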
3166
3167 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3168
3169 int
3170 fop_open(
3171 vnode_t **vpp,
3172 int mode,
3173 cred_t *cr,
3174 caller_context_t *ct)
3175 {
3176 int ret;
3177 vnode_t *vp = *vpp;
3178
3179 VN_HOLD(vp);
3180 /*
3181 * Adding to the vnode counts before calling open
3182 * avoids the need for a mutex. It circumvents a race
3183 * condition where a query made on the vnode counts results in a
3184 * false negative. The inquirer goes away believing the file is
3185 * not open when there is an open on the file already under way.
3186 *
3187 * The counts are meant to prevent NFS from granting a delegation
3188 * when it would be dangerous to do so.
3189 *
3190 * The vnode counts are only kept on regular files
3191 */
3192 if ((*vpp)->v_type == VREG) {
3193 if (mode & FREAD)
3194 atomic_inc_32(&(*vpp)->v_rdcnt);
3195 if (mode & FWRITE)
3196 atomic_inc_32(&(*vpp)->v_wrcnt);
3197 }
3198
3199 VOPXID_MAP_CR(vp, cr);
3200
3201 ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3202
3203 if (ret) {
3204 /*
3205 * Use the saved vp just in case the vnode ptr got trashed
3206 * by the error.
3207 */
3208 VOPSTATS_UPDATE(vp, open);
3209 if ((vp->v_type == VREG) && (mode & FREAD))
3210 atomic_dec_32(&vp->v_rdcnt);
3211 if ((vp->v_type == VREG) && (mode & FWRITE))
3212 atomic_dec_32(&vp->v_wrcnt);
3213 } else {
3214 /*
3215 * Some filesystems will return a different vnode,
3216 * but the same path was still used to open it.
3217 * So if we do change the vnode and need to
3218 * copy over the path, do so here, rather than special
3219 * casing each filesystem. Adjust the vnode counts to
3220 * reflect the vnode switch.
3221 */
3222 VOPSTATS_UPDATE(*vpp, open);
3223 if (*vpp != vp && *vpp != NULL) {
3224 vn_copypath(vp, *vpp);
3225 if (((*vpp)->v_type == VREG) && (mode & FREAD))
3226 atomic_inc_32(&(*vpp)->v_rdcnt);
3227 if ((vp->v_type == VREG) && (mode & FREAD))
3228 atomic_dec_32(&vp->v_rdcnt);
3229 if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3230 atomic_inc_32(&(*vpp)->v_wrcnt);
3231 if ((vp->v_type == VREG) && (mode & FWRITE))
3232 atomic_dec_32(&vp->v_wrcnt);
3233 }
3234 }
3235 VN_RELE(vp);
3236 return (ret);
3237 }
3238
3239 int
3240 fop_close(
3241 vnode_t *vp,
3242 int flag,
3243 int count,
3244 offset_t offset,
3245 cred_t *cr,
3246 caller_context_t *ct)
3247 {
3248 int err;
3249
3250 VOPXID_MAP_CR(vp, cr);
3251
3252 err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3253 VOPSTATS_UPDATE(vp, close);
3254 /*
3255 * Check passed in count to handle possible dups. Vnode counts are only
3256 * kept on regular files
3257 */
3258 if ((vp->v_type == VREG) && (count == 1)) {
3259 if (flag & FREAD) {
3260 ASSERT(vp->v_rdcnt > 0);
3261 atomic_dec_32(&vp->v_rdcnt);
3262 }
3263 if (flag & FWRITE) {
3264 ASSERT(vp->v_wrcnt > 0);
3265 atomic_dec_32(&vp->v_wrcnt);
3266 }
3267 }
3268 return (err);
3269 }
3270
3271 int
3272 fop_read(
3273 vnode_t *vp,
3274 uio_t *uiop,
3275 int ioflag,
3276 cred_t *cr,
3277 caller_context_t *ct)
3278 {
3279 ssize_t resid_start = uiop->uio_resid;
3280 zone_t *zonep = curzone;
3281 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3282
3283 hrtime_t start = 0, lat;
3284 ssize_t len;
3285 int err;
3286
3287 if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3288 vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3289 start = gethrtime();
3290
3291 mutex_enter(&zonep->zone_vfs_lock);
3292 kstat_runq_enter(&zonep->zone_vfs_rwstats);
3293 mutex_exit(&zonep->zone_vfs_lock);
3294 }
3295
3296 VOPXID_MAP_CR(vp, cr);
3297
3298 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3299 len = resid_start - uiop->uio_resid;
3300
3301 VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3302
3303 if (start != 0) {
3304 mutex_enter(&zonep->zone_vfs_lock);
3305 zonep->zone_vfs_rwstats.reads++;
3306 zonep->zone_vfs_rwstats.nread += len;
3307 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3308 mutex_exit(&zonep->zone_vfs_lock);
3309
3310 lat = gethrtime() - start;
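
		/*
		 * The latency buckets below are cumulative: an operation
		 * that takes one second or more is counted in the 10ms,
		 * 100ms, and 1s buckets alike.
		 */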
3311
3312 if (lat >= VOP_LATENCY_10MS) {
3313 if (lat < VOP_LATENCY_100MS)
3314 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3315 else if (lat < VOP_LATENCY_1S) {
3316 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3317 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3318 } else {
3319 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3320 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3321 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3322 }
3323 }
3324 }
3325
3326 return (err);
3327 }
3328
3329 int
3330 fop_write(
3331 vnode_t *vp,
3332 uio_t *uiop,
3333 int ioflag,
3334 cred_t *cr,
3335 caller_context_t *ct)
3336 {
3337 ssize_t resid_start = uiop->uio_resid;
3338 zone_t *zonep = curzone;
3339 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3340
3341 hrtime_t start = 0, lat;
3342 ssize_t len;
3343 int err;
3344
3345 /*
3346 * For the purposes of VFS kstat consumers, the "waitq" calculation is
3347 * repurposed as the active queue for VFS write operations. There's no
3348 * actual wait queue for VFS operations.
3349 */
3350 if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3351 vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3352 start = gethrtime();
3353
3354 mutex_enter(&zonep->zone_vfs_lock);
3355 kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3356 mutex_exit(&zonep->zone_vfs_lock);
3357 }
3358
3359 VOPXID_MAP_CR(vp, cr);
3360
3361 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3362 len = resid_start - uiop->uio_resid;
3363
3364 VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3365
3366 if (start != 0) {
3367 mutex_enter(&zonep->zone_vfs_lock);
3368 zonep->zone_vfs_rwstats.writes++;
3369 zonep->zone_vfs_rwstats.nwritten += len;
3370 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3371 mutex_exit(&zonep->zone_vfs_lock);
3372
3373 lat = gethrtime() - start;
3374
3375 if (lat >= VOP_LATENCY_10MS) {
3376 if (lat < VOP_LATENCY_100MS)
3377 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3378 else if (lat < VOP_LATENCY_1S) {
3379 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3380 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3381 } else {
3382 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3383 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3384 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3385 }
3386 }
3387 }
3388
3389 return (err);
3390 }
3391
3392 int
3393 fop_ioctl(
3394 vnode_t *vp,
3395 int cmd,
3396 intptr_t arg,
3397 int flag,
3398 cred_t *cr,
3399 int *rvalp,
3400 caller_context_t *ct)
3401 {
3402 int err;
3403
3404 VOPXID_MAP_CR(vp, cr);
3405
3406 err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3407 VOPSTATS_UPDATE(vp, ioctl);
3408 return (err);
3409 }
3410
3411 int
3412 fop_setfl(
3413 vnode_t *vp,
3414 int oflags,
3415 int nflags,
3416 cred_t *cr,
3417 caller_context_t *ct)
3418 {
3419 int err;
3420
3421 VOPXID_MAP_CR(vp, cr);
3422
3423 err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3424 VOPSTATS_UPDATE(vp, setfl);
3425 return (err);
3426 }
3427
3428 int
3429 fop_getattr(
3430 vnode_t *vp,
3431 vattr_t *vap,
3432 int flags,
3433 cred_t *cr,
3434 caller_context_t *ct)
3435 {
3436 int err;
3437
3438 VOPXID_MAP_CR(vp, cr);
3439
3440 /*
3441 * If this file system doesn't understand the xvattr extensions
3442 * then turn off the xvattr bit.
3443 */
3444 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3445 vap->va_mask &= ~AT_XVATTR;
3446 }
3447
3448 /*
3449 * We're only allowed to skip the ACL check iff we used a 32 bit
3450 * ACE mask with VOP_ACCESS() to determine permissions.
3451 */
3452 if ((flags & ATTR_NOACLCHECK) &&
3453 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3454 return (EINVAL);
3455 }
3456 err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3457 VOPSTATS_UPDATE(vp, getattr);
3458 return (err);
3459 }
3460
3461 int
3462 fop_setattr(
3463 vnode_t *vp,
3464 vattr_t *vap,
3465 int flags,
3466 cred_t *cr,
3467 caller_context_t *ct)
3468 {
3469 int err;
3470
3471 VOPXID_MAP_CR(vp, cr);
3472
3473 /*
3474 * If this file system doesn't understand the xvattr extensions
3475 * then turn off the xvattr bit.
3476 */
3477 if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3478 vap->va_mask &= ~AT_XVATTR;
3479 }
3480
3481 /*
3482 * We're only allowed to skip the ACL check iff we used a 32 bit
3483 * ACE mask with VOP_ACCESS() to determine permissions.
3484 */
3485 if ((flags & ATTR_NOACLCHECK) &&
3486 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3487 return (EINVAL);
3488 }
3489 err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3490 VOPSTATS_UPDATE(vp, setattr);
3491 return (err);
3492 }
3493
3494 int
3495 fop_access(
3496 vnode_t *vp,
3497 int mode,
3498 int flags,
3499 cred_t *cr,
3500 caller_context_t *ct)
3501 {
3502 int err;
3503
3504 if ((flags & V_ACE_MASK) &&
3505 vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3506 return (EINVAL);
3507 }
3508
3509 VOPXID_MAP_CR(vp, cr);
3510
3511 err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3512 VOPSTATS_UPDATE(vp, access);
3513 return (err);
3514 }
3515
3516 int
3517 fop_lookup(
3518 vnode_t *dvp,
3519 char *nm,
3520 vnode_t **vpp,
3521 pathname_t *pnp,
3522 int flags,
3523 vnode_t *rdir,
3524 cred_t *cr,
3525 caller_context_t *ct,
3526 int *deflags, /* Returned per-dirent flags */
3527 pathname_t *ppnp) /* Returned case-preserved name in directory */
3528 {
3529 int ret;
3530
3531 /*
3532 * If this file system doesn't support case-insensitive access
3533 * and said access is requested, fail quickly. It is required
3534 * that if the vfs supports case-insensitive lookup, it also
3535 * supports extended dirent flags.
3536 */
3537 if (flags & FIGNORECASE &&
3538 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3539 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3540 return (EINVAL);
3541
3542 VOPXID_MAP_CR(dvp, cr);
3543
3544 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3545 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3546 } else {
3547 ret = (*(dvp)->v_op->vop_lookup)
3548 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3549 }
3550 if (ret == 0 && *vpp) {
3551 VOPSTATS_UPDATE(*vpp, lookup);
3552 if ((*vpp)->v_path == NULL) {
3553 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3554 }
3555 }
3556
3557 return (ret);
3558 }
3559
3560 int
3561 fop_create(
3562 vnode_t *dvp,
3563 char *name,
3564 vattr_t *vap,
3565 vcexcl_t excl,
3566 int mode,
3567 vnode_t **vpp,
3568 cred_t *cr,
3569 int flags,
3570 caller_context_t *ct,
3571 vsecattr_t *vsecp) /* ACL to set during create */
3572 {
3573 int ret;
3574
3575 if (vsecp != NULL &&
3576 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3577 return (EINVAL);
3578 }
3579 /*
3580 * If this file system doesn't support case-insensitive access
3581 * and said access is requested, fail quickly.
3582 */
3583 if (flags & FIGNORECASE &&
3584 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3585 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3586 return (EINVAL);
3587
3588 VOPXID_MAP_CR(dvp, cr);
3589
3590 ret = (*(dvp)->v_op->vop_create)
3591 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3592 if (ret == 0 && *vpp) {
3593 VOPSTATS_UPDATE(*vpp, create);
3594 if ((*vpp)->v_path == NULL) {
3595 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3596 }
3597 }
3598
3599 return (ret);
3600 }
3601
3602 int
3603 fop_remove(
3604 vnode_t *dvp,
3605 char *nm,
3606 cred_t *cr,
3607 caller_context_t *ct,
3608 int flags)
3609 {
3610 int err;
3611
3612 /*
3613 * If this file system doesn't support case-insensitive access
3614 * and said access is requested, fail quickly.
3615 */
3616 if (flags & FIGNORECASE &&
3617 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3618 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3619 return (EINVAL);
3620
3621 VOPXID_MAP_CR(dvp, cr);
3622
3623 err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3624 VOPSTATS_UPDATE(dvp, remove);
3625 return (err);
3626 }
3627
3628 int
3629 fop_link(
3630 vnode_t *tdvp,
3631 vnode_t *svp,
3632 char *tnm,
3633 cred_t *cr,
3634 caller_context_t *ct,
3635 int flags)
3636 {
3637 int err;
3638
3639 /*
3640 * If the target file system doesn't support case-insensitive access
3641 * and said access is requested, fail quickly.
3642 */
3643 if (flags & FIGNORECASE &&
3644 (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3645 vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3646 return (EINVAL);
3647
3648 VOPXID_MAP_CR(tdvp, cr);
3649
3650 err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3651 VOPSTATS_UPDATE(tdvp, link);
3652 return (err);
3653 }
3654
3655 int
3656 fop_rename(
3657 vnode_t *sdvp,
3658 char *snm,
3659 vnode_t *tdvp,
3660 char *tnm,
3661 cred_t *cr,
3662 caller_context_t *ct,
3663 int flags)
3664 {
3665 int err;
3666
3667 /*
3668 * If the file system involved does not support
3669 * case-insensitive access and said access is requested, fail
3670 * quickly.
3671 */
3672 if (flags & FIGNORECASE &&
3673 ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3674 vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3675 return (EINVAL);
3676
3677 VOPXID_MAP_CR(tdvp, cr);
3678
3679 err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3680 VOPSTATS_UPDATE(sdvp, rename);
3681 return (err);
3682 }
3683
3684 int
3685 fop_mkdir(
3686 vnode_t *dvp,
3687 char *dirname,
3688 vattr_t *vap,
3689 vnode_t **vpp,
3690 cred_t *cr,
3691 caller_context_t *ct,
3692 int flags,
3693 vsecattr_t *vsecp) /* ACL to set during create */
3694 {
3695 int ret;
3696
3697 if (vsecp != NULL &&
3698 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3699 return (EINVAL);
3700 }
3701 /*
3702 * If this file system doesn't support case-insensitive access
3703 * and said access is requested, fail quickly.
3704 */
3705 if (flags & FIGNORECASE &&
3706 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3707 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3708 return (EINVAL);
3709
3710 VOPXID_MAP_CR(dvp, cr);
3711
3712 ret = (*(dvp)->v_op->vop_mkdir)
3713 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3714 if (ret == 0 && *vpp) {
3715 VOPSTATS_UPDATE(*vpp, mkdir);
3716 if ((*vpp)->v_path == NULL) {
3717 vn_setpath(rootdir, dvp, *vpp, dirname,
3718 strlen(dirname));
3719 }
3720 }
3721
3722 return (ret);
3723 }
3724
3725 int
3726 fop_rmdir(
3727 vnode_t *dvp,
3728 char *nm,
3729 vnode_t *cdir,
3730 cred_t *cr,
3731 caller_context_t *ct,
3732 int flags)
3733 {
3734 int err;
3735
3736 /*
3737 * If this file system doesn't support case-insensitive access
3738 * and said access is requested, fail quickly.
3739 */
3740 if (flags & FIGNORECASE &&
3741 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3742 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3743 return (EINVAL);
3744
3745 VOPXID_MAP_CR(dvp, cr);
3746
3747 err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3748 VOPSTATS_UPDATE(dvp, rmdir);
3749 return (err);
3750 }
3751
3752 int
3753 fop_readdir(
3754 vnode_t *vp,
3755 uio_t *uiop,
3756 cred_t *cr,
3757 int *eofp,
3758 caller_context_t *ct,
3759 int flags)
3760 {
3761 int err;
3762 ssize_t resid_start = uiop->uio_resid;
3763
3764 /*
3765 * If this file system doesn't support retrieving directory
3766 * entry flags and said access is requested, fail quickly.
3767 */
3768 if (flags & V_RDDIR_ENTFLAGS &&
3769 vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3770 return (EINVAL);
3771
3772 VOPXID_MAP_CR(vp, cr);
3773
3774 err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3775 VOPSTATS_UPDATE_IO(vp, readdir,
3776 readdir_bytes, (resid_start - uiop->uio_resid));
3777 return (err);
3778 }
3779
3780 int
3781 fop_symlink(
3782 vnode_t *dvp,
3783 char *linkname,
3784 vattr_t *vap,
3785 char *target,
3786 cred_t *cr,
3787 caller_context_t *ct,
3788 int flags)
3789 {
3790 int err;
3791 xvattr_t xvattr;
3792
3793 /*
3794 * If this file system doesn't support case-insensitive access
3795 * and said access is requested, fail quickly.
3796 */
3797 if (flags & FIGNORECASE &&
3798 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3799 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3800 return (EINVAL);
3801
3802 VOPXID_MAP_CR(dvp, cr);
3803
3804 /* check for reparse point */
3805 if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3806 (strncmp(target, FS_REPARSE_TAG_STR,
3807 strlen(FS_REPARSE_TAG_STR)) == 0)) {
3808 if (!fs_reparse_mark(target, vap, &xvattr))
3809 vap = (vattr_t *)&xvattr;
3810 }
3811
3812 err = (*(dvp)->v_op->vop_symlink)
3813 (dvp, linkname, vap, target, cr, ct, flags);
3814 VOPSTATS_UPDATE(dvp, symlink);
3815 return (err);
3816 }
3817
3818 int
3819 fop_readlink(
3820 vnode_t *vp,
3821 uio_t *uiop,
3822 cred_t *cr,
3823 caller_context_t *ct)
3824 {
3825 int err;
3826
3827 VOPXID_MAP_CR(vp, cr);
3828
3829 err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3830 VOPSTATS_UPDATE(vp, readlink);
3831 return (err);
3832 }
3833
3834 int
3835 fop_fsync(
3836 vnode_t *vp,
3837 int syncflag,
3838 cred_t *cr,
3839 caller_context_t *ct)
3840 {
3841 int err;
3842
3843 VOPXID_MAP_CR(vp, cr);
3844
3845 err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3846 VOPSTATS_UPDATE(vp, fsync);
3847 return (err);
3848 }
3849
3850 void
3851 fop_inactive(
3852 vnode_t *vp,
3853 cred_t *cr,
3854 caller_context_t *ct)
3855 {
3856 /* Need to update stats before vop call since we may lose the vnode */
3857 VOPSTATS_UPDATE(vp, inactive);
3858
3859 VOPXID_MAP_CR(vp, cr);
3860
3861 (*(vp)->v_op->vop_inactive)(vp, cr, ct);
3862 }
3863
3864 int
3865 fop_fid(
3866 vnode_t *vp,
3867 fid_t *fidp,
3868 caller_context_t *ct)
3869 {
3870 int err;
3871
3872 err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3873 VOPSTATS_UPDATE(vp, fid);
3874 return (err);
3875 }
3876
3877 int
3878 fop_rwlock(
3879 vnode_t *vp,
3880 int write_lock,
3881 caller_context_t *ct)
3882 {
3883 int ret;
3884
3885 ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3886 VOPSTATS_UPDATE(vp, rwlock);
3887 return (ret);
3888 }
3889
3890 void
3891 fop_rwunlock(
3892 vnode_t *vp,
3893 int write_lock,
3894 caller_context_t *ct)
3895 {
3896 (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3897 VOPSTATS_UPDATE(vp, rwunlock);
3898 }
3899
3900 int
3901 fop_seek(
3902 vnode_t *vp,
3903 offset_t ooff,
3904 offset_t *noffp,
3905 caller_context_t *ct)
3906 {
3907 int err;
3908
3909 err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3910 VOPSTATS_UPDATE(vp, seek);
3911 return (err);
3912 }
3913
3914 int
3915 fop_cmp(
3916 vnode_t *vp1,
3917 vnode_t *vp2,
3918 caller_context_t *ct)
3919 {
3920 int err;
3921
3922 err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3923 VOPSTATS_UPDATE(vp1, cmp);
3924 return (err);
3925 }
3926
3927 int
3928 fop_frlock(
3929 vnode_t *vp,
3930 int cmd,
3931 flock64_t *bfp,
3932 int flag,
3933 offset_t offset,
3934 struct flk_callback *flk_cbp,
3935 cred_t *cr,
3936 caller_context_t *ct)
3937 {
3938 int err;
3939
3940 VOPXID_MAP_CR(vp, cr);
3941
3942 err = (*(vp)->v_op->vop_frlock)
3943 (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3944 VOPSTATS_UPDATE(vp, frlock);
3945 return (err);
3946 }
3947
3948 int
3949 fop_space(
3950 vnode_t *vp,
3951 int cmd,
3952 flock64_t *bfp,
3953 int flag,
3954 offset_t offset,
3955 cred_t *cr,
3956 caller_context_t *ct)
3957 {
3958 int err;
3959
3960 VOPXID_MAP_CR(vp, cr);
3961
3962 err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3963 VOPSTATS_UPDATE(vp, space);
3964 return (err);
3965 }
3966
3967 int
3968 fop_realvp(
3969 vnode_t *vp,
3970 vnode_t **vpp,
3971 caller_context_t *ct)
3972 {
3973 int err;
3974
3975 err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3976 VOPSTATS_UPDATE(vp, realvp);
3977 return (err);
3978 }
3979
3980 int
3981 fop_getpage(
3982 vnode_t *vp,
3983 offset_t off,
3984 size_t len,
3985 uint_t *protp,
3986 page_t **plarr,
3987 size_t plsz,
3988 struct seg *seg,
3989 caddr_t addr,
3990 enum seg_rw rw,
3991 cred_t *cr,
3992 caller_context_t *ct)
3993 {
3994 int err;
3995
3996 VOPXID_MAP_CR(vp, cr);
3997
3998 err = (*(vp)->v_op->vop_getpage)
3999 (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4000 VOPSTATS_UPDATE(vp, getpage);
4001 return (err);
4002 }
4003
4004 int
4005 fop_putpage(
4006 vnode_t *vp,
4007 offset_t off,
4008 size_t len,
4009 int flags,
4010 cred_t *cr,
4011 caller_context_t *ct)
4012 {
4013 int err;
4014
4015 VOPXID_MAP_CR(vp, cr);
4016
4017 err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4018 VOPSTATS_UPDATE(vp, putpage);
4019 return (err);
4020 }
4021
4022 int
4023 fop_map(
4024 vnode_t *vp,
4025 offset_t off,
4026 struct as *as,
4027 caddr_t *addrp,
4028 size_t len,
4029 uchar_t prot,
4030 uchar_t maxprot,
4031 uint_t flags,
4032 cred_t *cr,
4033 caller_context_t *ct)
4034 {
4035 int err;
4036
4037 VOPXID_MAP_CR(vp, cr);
4038
4039 err = (*(vp)->v_op->vop_map)
4040 (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4041 VOPSTATS_UPDATE(vp, map);
4042 return (err);
4043 }
4044
4045 int
4046 fop_addmap(
4047 vnode_t *vp,
4048 offset_t off,
4049 struct as *as,
4050 caddr_t addr,
4051 size_t len,
4052 uchar_t prot,
4053 uchar_t maxprot,
4054 uint_t flags,
4055 cred_t *cr,
4056 caller_context_t *ct)
4057 {
4058 int error;
4059 u_longlong_t delta;
4060
4061 VOPXID_MAP_CR(vp, cr);
4062
4063 error = (*(vp)->v_op->vop_addmap)
4064 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4065
4066 if ((!error) && (vp->v_type == VREG)) {
4067 delta = (u_longlong_t)btopr(len);
4068 /*
4069 * If file is declared MAP_PRIVATE, it can't be written back
4070 * even if open for write. Handle as read.
4071 */
4072 if (flags & MAP_PRIVATE) {
4073 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4074 (int64_t)delta);
4075 } else {
4076 /*
4077 * atomic_add_64 forces the fetch of a 64 bit value to
4078 * be atomic on 32 bit machines
4079 */
4080 if (maxprot & PROT_WRITE)
4081 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4082 (int64_t)delta);
4083 if (maxprot & PROT_READ)
4084 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4085 (int64_t)delta);
4086 if (maxprot & PROT_EXEC)
4087 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4088 (int64_t)delta);
4089 }
4090 }
4091 VOPSTATS_UPDATE(vp, addmap);
4092 return (error);
4093 }
4094
4095 int
4096 fop_delmap(
4097 vnode_t *vp,
4098 offset_t off,
4099 struct as *as,
4100 caddr_t addr,
4101 size_t len,
4102 uint_t prot,
4103 uint_t maxprot,
4104 uint_t flags,
4105 cred_t *cr,
4106 caller_context_t *ct)
4107 {
4108 int error;
4109 u_longlong_t delta;
4110
4111 VOPXID_MAP_CR(vp, cr);
4112
4113 error = (*(vp)->v_op->vop_delmap)
4114 (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4115
4116 /*
	 * NFS calls into delmap twice: the first time it simply
	 * establishes a callback mechanism and returns EAGAIN,
	 * while the real work is done upon the second invocation.
4120 * We have to detect this here and only decrement the counts upon
4121 * the second delmap request.
4122 */
4123 if ((error != EAGAIN) && (vp->v_type == VREG)) {
4124
4125 delta = (u_longlong_t)btopr(len);
4126
4127 if (flags & MAP_PRIVATE) {
4128 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4129 (int64_t)(-delta));
4130 } else {
4131 /*
4132 * atomic_add_64 forces the fetch of a 64 bit value
4133 * to be atomic on 32 bit machines
4134 */
4135 if (maxprot & PROT_WRITE)
4136 atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4137 (int64_t)(-delta));
4138 if (maxprot & PROT_READ)
4139 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4140 (int64_t)(-delta));
4141 if (maxprot & PROT_EXEC)
4142 atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4143 (int64_t)(-delta));
4144 }
4145 }
4146 VOPSTATS_UPDATE(vp, delmap);
4147 return (error);
}

int
4152 fop_poll(
4153 vnode_t *vp,
4154 short events,
4155 int anyyet,
4156 short *reventsp,
4157 struct pollhead **phpp,
4158 caller_context_t *ct)
4159 {
4160 int err;
4161
4162 err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4163 VOPSTATS_UPDATE(vp, poll);
4164 return (err);
4165 }
4166
4167 int
4168 fop_dump(
4169 vnode_t *vp,
4170 caddr_t addr,
4171 offset_t lbdn,
4172 offset_t dblks,
4173 caller_context_t *ct)
4174 {
4175 int err;
4176
4177 /* ensure lbdn and dblks can be passed safely to bdev_dump */
4178 if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4179 return (EIO);
4180
4181 err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4182 VOPSTATS_UPDATE(vp, dump);
4183 return (err);
4184 }
4185
4186 int
4187 fop_pathconf(
4188 vnode_t *vp,
4189 int cmd,
4190 ulong_t *valp,
4191 cred_t *cr,
4192 caller_context_t *ct)
4193 {
4194 int err;
4195
4196 VOPXID_MAP_CR(vp, cr);
4197
4198 err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4199 VOPSTATS_UPDATE(vp, pathconf);
4200 return (err);
4201 }
4202
4203 int
4204 fop_pageio(
4205 vnode_t *vp,
4206 struct page *pp,
4207 u_offset_t io_off,
4208 size_t io_len,
4209 int flags,
4210 cred_t *cr,
4211 caller_context_t *ct)
4212 {
4213 int err;
4214
4215 VOPXID_MAP_CR(vp, cr);
4216
4217 err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4218 VOPSTATS_UPDATE(vp, pageio);
4219 return (err);
4220 }
4221
4222 int
4223 fop_dumpctl(
4224 vnode_t *vp,
4225 int action,
4226 offset_t *blkp,
4227 caller_context_t *ct)
4228 {
4229 int err;
4230 err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4231 VOPSTATS_UPDATE(vp, dumpctl);
4232 return (err);
4233 }
4234
4235 void
4236 fop_dispose(
4237 vnode_t *vp,
4238 page_t *pp,
4239 int flag,
4240 int dn,
4241 cred_t *cr,
4242 caller_context_t *ct)
4243 {
4244 /* Must do stats first since it's possible to lose the vnode */
4245 VOPSTATS_UPDATE(vp, dispose);
4246
4247 VOPXID_MAP_CR(vp, cr);
4248
4249 (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4250 }
4251
4252 int
4253 fop_setsecattr(
4254 vnode_t *vp,
4255 vsecattr_t *vsap,
4256 int flag,
4257 cred_t *cr,
4258 caller_context_t *ct)
4259 {
4260 int err;
4261
4262 VOPXID_MAP_CR(vp, cr);
4263
	/*
	 * We may skip the ACL check only if a 32-bit ACE mask was used
	 * with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr)(vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	/*
	 * We may skip the ACL check only if a 32-bit ACE mask was used
	 * with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr)(vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}
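
/*
 * Sketch of the caller-side contract enforced by the two checks above:
 * ATTR_NOACLCHECK is legal only when access has already been verified
 * with a 32-bit ACE mask (VOP_ACCESS() with V_ACE_MASK) on a file
 * system that advertises VFSFT_ACEMASKONACCESS.  The surrounding code
 * is hypothetical:
 *
 *	if (vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) &&
 *	    VOP_ACCESS(vp, mode, V_ACE_MASK, cr, ct) == 0)
 *		error = VOP_SETSECATTR(vp, vsap, ATTR_NOACLCHECK, cr, ct);
 *	else
 *		error = VOP_SETSECATTR(vp, vsap, 0, cr, ct);
 */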

int
fop_shrlock(
	vnode_t *vp,
	int cmd,
	struct shrlock *shr,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int err;

	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}

int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}
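
/*
 * Sketch of the zero-copy buffer life cycle these two wrappers gate,
 * assuming a file system that advertises VFSFT_ZEROCOPY_SUPPORTED
 * (setup of the xuio and error handling elided):
 *
 *	xuio.xu_type = UIOTYPE_ZEROCOPY;
 *	if (VOP_REQZCBUF(vp, UIO_WRITE, &xuio, cr, NULL) == 0) {
 *		... fill the loaned buffers described by the xuio ...
 *		(void) VOP_RETZCBUF(vp, &xuio, cr, NULL);
 *	}
 */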

/*
 * Default destructor
 * Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}

/*
 * Create a key (index into per vnode array)
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int i;
	uint_t nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
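
/*
 * Typical key life cycle, as a sketch (mymod_vsd_key and the destructor
 * are hypothetical).  vsd_create() is idempotent for an already-allocated
 * key, so it is safe to call from a module's init path:
 *
 *	static uint_t mymod_vsd_key;
 *
 *	vsd_create(&mymod_vsd_key, mymod_vsd_destructor);
 *	...
 *	vsd_destroy(&mymod_vsd_key);		(at module teardown)
 */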

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}

/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys is 0, so the fast path
	 * below is skipped and we fall through to allocate space for the
	 * vs_value array.
	 * If the caller is replacing one value with another, it is up to
	 * the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
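
/*
 * Sketch of the locking contract for the accessors above: callers hold
 * v_vsd_lock across vsd_get()/vsd_set().  mymod_vsd_key and the payload
 * type are hypothetical:
 *
 *	mutex_enter(&vp->v_vsd_lock);
 *	data = vsd_get(vp, mymod_vsd_key);
 *	if (data == NULL) {
 *		data = kmem_zalloc(sizeof (*data), KM_SLEEP);
 *		(void) vsd_set(vp, mymod_vsd_key, data);
 *	}
 *	mutex_exit(&vp->v_vsd_lock);
 */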

/*
 * Called from vn_free() to run the destructor function for each vsd
 * Locks out vsd_create and vsd_destroy
 * Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * Grow-only realloc: allocate a zeroed buffer of the new size, copy in
 * the old contents, and free the old buffer.
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}

/*
 * Set up the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for the proper format of a
 * reparse string, and a check is also made to ensure the symlink data
 * does not point to an existing file.
 *
 * Returns 0 if ok, else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= AT_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}
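
/*
 * Sketch of how a symlink-create path might use fs_reparse_mark() to
 * request the reparse attribute; the surrounding variables (dvp, name,
 * va, target, cr) are hypothetical:
 *
 *	xvattr_t xvattr;
 *
 *	if (fs_reparse_mark(target, &va, &xvattr) != 0)
 *		return (EINVAL);
 *	error = VOP_SYMLINK(dvp, name, &xvattr.xva_vattr, target, cr,
 *	    NULL, 0);
 */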

/*
 * Check whether a symlink is a reparse point.
 * Returns B_TRUE if it is a reparse point, else B_FALSE.
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}
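
/*
 * Sketch of a typical use of vn_is_reparse(): a lookup path that wants
 * to treat reparse-point symlinks specially before following them (the
 * surrounding logic is hypothetical):
 *
 *	if (vp->v_type == VLNK && vn_is_reparse(vp, cr, NULL)) {
 *		... divert the link to reparse-point handling ...
 *	}
 */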