1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2020 Joyent, Inc.
  25  * Copyright 2022 Spencer Evans-Cole.
  26  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  28  */
  29 
  30 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  31 /*        All Rights Reserved   */
  32 
  33 /*
  34  * University Copyright- Copyright (c) 1982, 1986, 1988
  35  * The Regents of the University of California
  36  * All Rights Reserved
  37  *
  38  * University Acknowledgment- Portions of this document are derived from
  39  * software developed by the University of California, Berkeley, and its
  40  * contributors.
  41  */
  42 
  43 #include <sys/types.h>
  44 #include <sys/param.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/errno.h>
  47 #include <sys/cred.h>
  48 #include <sys/user.h>
  49 #include <sys/uio.h>
  50 #include <sys/file.h>
  51 #include <sys/pathname.h>
  52 #include <sys/vfs.h>
  53 #include <sys/vfs_opreg.h>
  54 #include <sys/vnode.h>
  55 #include <sys/filio.h>
  56 #include <sys/rwstlock.h>
  57 #include <sys/fem.h>
  58 #include <sys/stat.h>
  59 #include <sys/mode.h>
  60 #include <sys/conf.h>
  61 #include <sys/sysmacros.h>
  62 #include <sys/cmn_err.h>
  63 #include <sys/systm.h>
  64 #include <sys/kmem.h>
  65 #include <sys/debug.h>
  66 #include <c2/audit.h>
  67 #include <sys/acl.h>
  68 #include <sys/nbmlock.h>
  69 #include <sys/fcntl.h>
  70 #include <fs/fs_subr.h>
  71 #include <sys/taskq.h>
  72 #include <fs/fs_reparse.h>
  73 #include <sys/time.h>
  74 #include <sys/sdt.h>
  75 
  76 /* Determine if this vnode is a file that is read-only */
  77 #define ISROFILE(vp)    \
  78         ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  79             (vp)->v_type != VFIFO && vn_is_readonly(vp))
  80 
  81 /* Tunable via /etc/system; used only by admin/install */
  82 int nfs_global_client_only;
  83 
  84 /*
  85  * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  86  * number of entries as and parallel to the vfssw table.  (Arguably, it could
  87  * be part of the vfssw table.)  Once it's initialized, it's accessed using
  88  * the same fstype index that is used to index into the vfssw table.
  89  */
  90 vopstats_t **vopstats_fstype;
  91 
  92 /* vopstats initialization template used for fast initialization via bcopy() */
  93 static vopstats_t *vs_templatep;
  94 
  95 /* Kmem cache handle for vsk_anchor_t allocations */
  96 kmem_cache_t *vsk_anchor_cache;
  97 
  98 /* file events cleanup routine */
  99 extern void free_fopdata(vnode_t *);
 100 
 101 /*
 102  * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 103  * updates to vsktat_tree.
 104  */
 105 avl_tree_t      vskstat_tree;
 106 kmutex_t        vskstat_tree_lock;
 107 
 108 /* Global variable which enables/disables the vopstats collection */
 109 int vopstats_enabled = 1;
 110 
 111 /* Global used for empty/invalid v_path */
 112 char *vn_vpath_empty = "";
 113 
 114 /*
 115  * forward declarations for internal vnode specific data (vsd)
 116  */
 117 static void *vsd_realloc(void *, size_t, size_t);
 118 
 119 /*
 120  * forward declarations for reparse point functions
 121  */
 122 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 123 
 124 /*
 125  * VSD -- VNODE SPECIFIC DATA
 126  * The v_data pointer is typically used by a file system to store a
 127  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 128  * However, there are times when additional project private data needs
 129  * to be stored separately from the data (node) pointed to by v_data.
 130  * This additional data could be stored by the file system itself or
 131  * by a completely different kernel entity.  VSD provides a way for
 132  * callers to obtain a key and store a pointer to private data associated
 133  * with a vnode.
 134  *
 135  * Callers are responsible for protecting the vsd by holding v_vsd_lock
 136  * for calls to vsd_set() and vsd_get().
 137  */
 138 
 139 /*
 140  * vsd_lock protects:
 141  *   vsd_nkeys - creation and deletion of vsd keys
 142  *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 143  *   vsd_destructor - adding and removing destructors to the list
 144  */
 145 static kmutex_t         vsd_lock;
 146 static uint_t           vsd_nkeys;       /* size of destructor array */
 147 /* list of vsd_node's */
 148 static list_t *vsd_list = NULL;
 149 /* per-key destructor funcs */
 150 static void             (**vsd_destructor)(void *);
 151 
 152 /*
 153  * The following is the common set of actions needed to update the
 154  * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 155  * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 156  * recording of the bytes transferred.  Since the code is similar
 157  * but small, it is nearly a duplicate.  Consequently any changes
 158  * to one may need to be reflected in the other.
 159  * Rundown of the variables:
 160  * vp - Pointer to the vnode
 161  * counter - Partial name structure member to update in vopstats for counts
 162  * bytecounter - Partial name structure member to update in vopstats for bytes
 163  * bytesval - Value to update in vopstats for bytes
 164  * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 165  * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 166  */
 167 
/* Bump the per-VFS vopstat counter for this op (and the per-fstype copy, if any) */
#define VOPSTATS_UPDATE(vp, counter) {                                  \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp && vfsp->vfs_implp &&                                       \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
                (*stataddr)++;                                          \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
                        vsp->n##counter.value.ui64++;                        \
                }                                                       \
        }                                                               \
}

/*
 * As VOPSTATS_UPDATE, but additionally accumulates the number of bytes
 * transferred (bytesval) into the named bytecounter kstat, both in the
 * per-VFS vopstats and the per-fstype copy when present.
 */
#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp && vfsp->vfs_implp &&                                       \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {   \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                   \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);      \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
                (*stataddr)++;                                          \
                vsp->bytecounter.value.ui64 += bytesval;             \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
                        vsp->n##counter.value.ui64++;                        \
                        vsp->bytecounter.value.ui64 += bytesval;     \
                }                                                       \
        }                                                               \
}

/*
 * If the filesystem does not support XIDs, map the credential via
 * crgetmapped() before handing it to the VOP.
 * If the vfsp is NULL, perhaps we should also map?
 */
#define VOPXID_MAP_CR(vp, cr)   {                                       \
        vfs_t *vfsp = (vp)->v_vfsp;                                  \
        if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
                cr = crgetmapped(cr);                                   \
        }
 211 
 212 /*
 213  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 214  * numerical order of S_IFMT and vnode types.)
 215  */
 216 enum vtype iftovt_tab[] = {
 217         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 218         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 219 };
 220 
 221 ushort_t vttoif_tab[] = {
 222         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 223         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 224 };
 225 
 226 /*
 227  * The system vnode cache.
 228  */
 229 
 230 kmem_cache_t *vn_cache;
 231 
 232 
 233 /*
 234  * Vnode operations vector.
 235  */
 236 
 237 static const fs_operation_trans_def_t vn_ops_table[] = {
 238         VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
 239             fs_nosys, fs_nosys,
 240 
 241         VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
 242             fs_nosys, fs_nosys,
 243 
 244         VOPNAME_READ, offsetof(struct vnodeops, vop_read),
 245             fs_nosys, fs_nosys,
 246 
 247         VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
 248             fs_nosys, fs_nosys,
 249 
 250         VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
 251             fs_nosys, fs_nosys,
 252 
 253         VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
 254             fs_setfl, fs_nosys,
 255 
 256         VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
 257             fs_nosys, fs_nosys,
 258 
 259         VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
 260             fs_nosys, fs_nosys,
 261 
 262         VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
 263             fs_nosys, fs_nosys,
 264 
 265         VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
 266             fs_nosys, fs_nosys,
 267 
 268         VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
 269             fs_nosys, fs_nosys,
 270 
 271         VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
 272             fs_nosys, fs_nosys,
 273 
 274         VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
 275             fs_nosys, fs_nosys,
 276 
 277         VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
 278             fs_nosys, fs_nosys,
 279 
 280         VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
 281             fs_nosys, fs_nosys,
 282 
 283         VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
 284             fs_nosys, fs_nosys,
 285 
 286         VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
 287             fs_nosys, fs_nosys,
 288 
 289         VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
 290             fs_nosys, fs_nosys,
 291 
 292         VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
 293             fs_nosys, fs_nosys,
 294 
 295         VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
 296             fs_nosys, fs_nosys,
 297 
 298         VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
 299             fs_nosys, fs_nosys,
 300 
 301         VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
 302             fs_nosys, fs_nosys,
 303 
 304         VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
 305             fs_rwlock, fs_rwlock,
 306 
 307         VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
 308             (fs_generic_func_p)(uintptr_t)fs_rwunlock,
 309             (fs_generic_func_p)(uintptr_t)fs_rwunlock,  /* no errors allowed */
 310 
 311         VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
 312             fs_nosys, fs_nosys,
 313 
 314         VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
 315             fs_cmp, fs_cmp,             /* no errors allowed */
 316 
 317         VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
 318             fs_frlock, fs_nosys,
 319 
 320         VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
 321             fs_nosys, fs_nosys,
 322 
 323         VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
 324             fs_nosys, fs_nosys,
 325 
 326         VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
 327             fs_nosys, fs_nosys,
 328 
 329         VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
 330             fs_nosys, fs_nosys,
 331 
 332         VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
 333             (fs_generic_func_p) fs_nosys_map,
 334             (fs_generic_func_p) fs_nosys_map,
 335 
 336         VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
 337             (fs_generic_func_p) fs_nosys_addmap,
 338             (fs_generic_func_p) fs_nosys_addmap,
 339 
 340         VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
 341             fs_nosys, fs_nosys,
 342 
 343         VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
 344             (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
 345 
 346         VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
 347             fs_nosys, fs_nosys,
 348 
 349         VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
 350             fs_pathconf, fs_nosys,
 351 
 352         VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
 353             fs_nosys, fs_nosys,
 354 
 355         VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
 356             fs_nosys, fs_nosys,
 357 
 358         VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
 359             (fs_generic_func_p)(uintptr_t)fs_dispose,
 360             (fs_generic_func_p)(uintptr_t)fs_nodispose,
 361 
 362         VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
 363             fs_nosys, fs_nosys,
 364 
 365         VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
 366             fs_fab_acl, fs_nosys,
 367 
 368         VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
 369             fs_shrlock, fs_nosys,
 370 
 371         VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
 372             (fs_generic_func_p) fs_vnevent_nosupport,
 373             (fs_generic_func_p) fs_vnevent_nosupport,
 374 
 375         VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
 376             fs_nosys, fs_nosys,
 377 
 378         VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
 379             fs_nosys, fs_nosys,
 380 
 381         NULL, 0, NULL, NULL
 382 };
 383 
 384 /* Extensible attribute (xva) routines. */
 385 
 386 /*
 387  * Zero out the structure, set the size of the requested/returned bitmaps,
 388  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 389  * to the returned attributes array.
 390  */
 391 void
 392 xva_init(xvattr_t *xvap)
 393 {
 394         bzero(xvap, sizeof (xvattr_t));
 395         xvap->xva_mapsize = XVA_MAPSIZE;
 396         xvap->xva_magic = XVA_MAGIC;
 397         xvap->xva_vattr.va_mask = AT_XVATTR;
 398         xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 399 }
 400 
 401 /*
 402  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 403  * structure.  Otherwise, returns NULL.
 404  */
 405 xoptattr_t *
 406 xva_getxoptattr(xvattr_t *xvap)
 407 {
 408         xoptattr_t *xoap = NULL;
 409         if (xvap->xva_vattr.va_mask & AT_XVATTR)
 410                 xoap = &xvap->xva_xoptattrs;
 411         return (xoap);
 412 }
 413 
 414 /*
 415  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 416  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 417  * kstat name.
 418  */
 419 static int
 420 vska_compar(const void *n1, const void *n2)
 421 {
 422         int ret;
 423         ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 424         ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 425 
 426         if (p1 < p2) {
 427                 ret = -1;
 428         } else if (p1 > p2) {
 429                 ret = 1;
 430         } else {
 431                 ret = 0;
 432         }
 433 
 434         return (ret);
 435 }
 436 
 437 /*
 438  * Used to create a single template which will be bcopy()ed to a newly
 439  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 440  */
 441 static vopstats_t *
 442 create_vopstats_template()
 443 {
 444         vopstats_t              *vsp;
 445 
 446         vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 447         bzero(vsp, sizeof (*vsp));      /* Start fresh */
 448 
 449         /* VOP_OPEN */
 450         kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 451         /* VOP_CLOSE */
 452         kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 453         /* VOP_READ I/O */
 454         kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 455         kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 456         /* VOP_WRITE I/O */
 457         kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 458         kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 459         /* VOP_IOCTL */
 460         kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 461         /* VOP_SETFL */
 462         kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 463         /* VOP_GETATTR */
 464         kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 465         /* VOP_SETATTR */
 466         kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 467         /* VOP_ACCESS */
 468         kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 469         /* VOP_LOOKUP */
 470         kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 471         /* VOP_CREATE */
 472         kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 473         /* VOP_REMOVE */
 474         kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 475         /* VOP_LINK */
 476         kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 477         /* VOP_RENAME */
 478         kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 479         /* VOP_MKDIR */
 480         kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 481         /* VOP_RMDIR */
 482         kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 483         /* VOP_READDIR I/O */
 484         kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 485         kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 486             KSTAT_DATA_UINT64);
 487         /* VOP_SYMLINK */
 488         kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 489         /* VOP_READLINK */
 490         kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 491         /* VOP_FSYNC */
 492         kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 493         /* VOP_INACTIVE */
 494         kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 495         /* VOP_FID */
 496         kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 497         /* VOP_RWLOCK */
 498         kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 499         /* VOP_RWUNLOCK */
 500         kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 501         /* VOP_SEEK */
 502         kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 503         /* VOP_CMP */
 504         kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 505         /* VOP_FRLOCK */
 506         kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 507         /* VOP_SPACE */
 508         kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 509         /* VOP_REALVP */
 510         kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 511         /* VOP_GETPAGE */
 512         kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 513         /* VOP_PUTPAGE */
 514         kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 515         /* VOP_MAP */
 516         kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 517         /* VOP_ADDMAP */
 518         kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 519         /* VOP_DELMAP */
 520         kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 521         /* VOP_POLL */
 522         kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 523         /* VOP_DUMP */
 524         kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 525         /* VOP_PATHCONF */
 526         kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 527         /* VOP_PAGEIO */
 528         kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 529         /* VOP_DUMPCTL */
 530         kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 531         /* VOP_DISPOSE */
 532         kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 533         /* VOP_SETSECATTR */
 534         kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 535         /* VOP_GETSECATTR */
 536         kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 537         /* VOP_SHRLOCK */
 538         kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 539         /* VOP_VNEVENT */
 540         kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 541         /* VOP_REQZCBUF */
 542         kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 543         /* VOP_RETZCBUF */
 544         kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 545 
 546         return (vsp);
 547 }
 548 
 549 /*
 550  * Creates a kstat structure associated with a vopstats structure.
 551  */
 552 kstat_t *
 553 new_vskstat(char *ksname, vopstats_t *vsp)
 554 {
 555         kstat_t         *ksp;
 556 
 557         if (!vopstats_enabled) {
 558                 return (NULL);
 559         }
 560 
 561         ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 562             sizeof (vopstats_t)/sizeof (kstat_named_t),
 563             KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 564         if (ksp) {
 565                 ksp->ks_data = vsp;
 566                 kstat_install(ksp);
 567         }
 568 
 569         return (ksp);
 570 }
 571 
 572 /*
 573  * Called from vfsinit() to initialize the support mechanisms for vopstats
 574  */
 575 void
 576 vopstats_startup()
 577 {
 578         if (!vopstats_enabled)
 579                 return;
 580 
 581         /*
 582          * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 583          * is necessary since we need to check if a kstat exists before we
 584          * attempt to create it.  Also, initialize its lock.
 585          */
 586         avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 587             offsetof(vsk_anchor_t, vsk_node));
 588         mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 589 
 590         vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 591             sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 592             NULL, NULL, 0);
 593 
 594         /*
 595          * Set up the array of pointers for the vopstats-by-FS-type.
 596          * The entries will be allocated/initialized as each file system
 597          * goes through modload/mod_installfs.
 598          */
 599         vopstats_fstype = (vopstats_t **)kmem_zalloc(
 600             (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 601 
 602         /* Set up the global vopstats initialization template */
 603         vs_templatep = create_vopstats_template();
 604 }
 605 
 606 /*
 607  * We need to have the all of the counters zeroed.
 608  * The initialization of the vopstats_t includes on the order of
 609  * 50 calls to kstat_named_init().  Rather that do that on every call,
 610  * we do it once in a template (vs_templatep) then bcopy it over.
 611  */
 612 void
 613 initialize_vopstats(vopstats_t *vsp)
 614 {
 615         if (vsp == NULL)
 616                 return;
 617 
 618         bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 619 }
 620 
 621 /*
 622  * If possible, determine which vopstats by fstype to use and
 623  * return a pointer to the caller.
 624  */
 625 vopstats_t *
 626 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 627 {
 628         int             fstype = 0;     /* Index into vfssw[] */
 629         vopstats_t      *vsp = NULL;
 630 
 631         if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 632             !vopstats_enabled)
 633                 return (NULL);
 634         /*
 635          * Set up the fstype.  We go to so much trouble because all versions
 636          * of NFS use the same fstype in their vfs even though they have
 637          * distinct entries in the vfssw[] table.
 638          * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 639          */
 640         if (vswp) {
 641                 fstype = vswp - vfssw;  /* Gets us the index */
 642         } else {
 643                 fstype = vfsp->vfs_fstype;
 644         }
 645 
 646         /*
 647          * Point to the per-fstype vopstats. The only valid values are
 648          * non-zero positive values less than the number of vfssw[] table
 649          * entries.
 650          */
 651         if (fstype > 0 && fstype < nfstype) {
 652                 vsp = vopstats_fstype[fstype];
 653         }
 654 
 655         return (vsp);
 656 }
 657 
 658 /*
 659  * Generate a kstat name, create the kstat structure, and allocate a
 660  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 661  * to the caller.  This must only be called from a mount.
 662  */
 663 vsk_anchor_t *
 664 get_vskstat_anchor(vfs_t *vfsp)
 665 {
 666         char            kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
 667         statvfs64_t     statvfsbuf;             /* Needed to find f_fsid */
 668         vsk_anchor_t    *vskp = NULL;           /* vfs <--> kstat anchor */
 669         kstat_t         *ksp;                   /* Ptr to new kstat */
 670         avl_index_t     where;                  /* Location in the AVL tree */
 671 
 672         if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 673             (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 674                 return (NULL);
 675 
 676         /* Need to get the fsid to build a kstat name */
 677         if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
 678                 /* Create a name for our kstats based on fsid */
 679                 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
 680                     VOPSTATS_STR, statvfsbuf.f_fsid);
 681 
 682                 /* Allocate and initialize the vsk_anchor_t */
 683                 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 684                 bzero(vskp, sizeof (*vskp));
 685                 vskp->vsk_fsid = statvfsbuf.f_fsid;
 686 
 687                 mutex_enter(&vskstat_tree_lock);
 688                 if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
 689                         avl_insert(&vskstat_tree, vskp, where);
 690                         mutex_exit(&vskstat_tree_lock);
 691 
 692                         /*
 693                          * Now that we've got the anchor in the AVL
 694                          * tree, we can create the kstat.
 695                          */
 696                         ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
 697                         if (ksp) {
 698                                 vskp->vsk_ksp = ksp;
 699                         }
 700                 } else {
 701                         /* Oops, found one! Release memory and lock. */
 702                         mutex_exit(&vskstat_tree_lock);
 703                         kmem_cache_free(vsk_anchor_cache, vskp);
 704                         vskp = NULL;
 705                 }
 706         }
 707         return (vskp);
 708 }
 709 
 710 /*
 711  * We're in the process of tearing down the vfs and need to cleanup
 712  * the data structures associated with the vopstats. Must only be called
 713  * from dounmount().
 714  */
 715 void
 716 teardown_vopstats(vfs_t *vfsp)
 717 {
 718         vsk_anchor_t    *vskap;
 719         avl_index_t     where;
 720 
 721         if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 722             (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 723                 return;
 724 
 725         /* This is a safe check since VFS_STATS must be set (see above) */
 726         if ((vskap = vfsp->vfs_vskap) == NULL)
 727                 return;
 728 
 729         /* Whack the pointer right away */
 730         vfsp->vfs_vskap = NULL;
 731 
 732         /* Lock the tree, remove the node, and delete the kstat */
 733         mutex_enter(&vskstat_tree_lock);
 734         if (avl_find(&vskstat_tree, vskap, &where)) {
 735                 avl_remove(&vskstat_tree, vskap);
 736         }
 737 
 738         if (vskap->vsk_ksp) {
 739                 kstat_delete(vskap->vsk_ksp);
 740         }
 741         mutex_exit(&vskstat_tree_lock);
 742 
 743         kmem_cache_free(vsk_anchor_cache, vskap);
 744 }
 745 
 746 /*
 747  * Read or write a vnode.  Called from kernel code.
 748  */
 749 int
 750 vn_rdwr(
 751         enum uio_rw rw,
 752         struct vnode *vp,
 753         caddr_t base,
 754         ssize_t len,
 755         offset_t offset,
 756         enum uio_seg seg,
 757         int ioflag,
 758         rlim64_t ulimit,        /* meaningful only if rw is UIO_WRITE */
 759         cred_t *cr,
 760         ssize_t *residp)
 761 {
 762         struct uio uio;
 763         struct iovec iov;
 764         int error;
 765         int in_crit = 0;
 766 
 767         if (rw == UIO_WRITE && ISROFILE(vp))
 768                 return (EROFS);
 769 
 770         if (len < 0)
 771                 return (EIO);
 772 
 773         VOPXID_MAP_CR(vp, cr);
 774 
 775         iov.iov_base = base;
 776         iov.iov_len = len;
 777         uio.uio_iov = &iov;
 778         uio.uio_iovcnt = 1;
 779         uio.uio_loffset = offset;
 780         uio.uio_segflg = (short)seg;
 781         uio.uio_resid = len;
 782         uio.uio_llimit = ulimit;
 783 
 784         /*
 785          * We have to enter the critical region before calling VOP_RWLOCK
 786          * to avoid a deadlock with ufs.
 787          */
 788         if (nbl_need_check(vp)) {
 789                 int svmand;
 790 
 791                 nbl_start_crit(vp, RW_READER);
 792                 in_crit = 1;
 793                 error = nbl_svmand(vp, cr, &svmand);
 794                 if (error != 0)
 795                         goto done;
 796                 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
 797                     uio.uio_offset, uio.uio_resid, svmand, NULL)) {
 798                         error = EACCES;
 799                         goto done;
 800                 }
 801         }
 802 
 803         (void) VOP_RWLOCK(vp,
 804             rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 805         if (rw == UIO_WRITE) {
 806                 uio.uio_fmode = FWRITE;
 807                 uio.uio_extflg = UIO_COPY_DEFAULT;
 808                 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 809         } else {
 810                 uio.uio_fmode = FREAD;
 811                 uio.uio_extflg = UIO_COPY_CACHED;
 812                 error = VOP_READ(vp, &uio, ioflag, cr, NULL);
 813         }
 814         VOP_RWUNLOCK(vp,
 815             rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 816         if (residp)
 817                 *residp = uio.uio_resid;
 818         else if (uio.uio_resid)
 819                 error = EIO;
 820 
 821 done:
 822         if (in_crit)
 823                 nbl_end_crit(vp);
 824         return (error);
 825 }
 826 
 827 /*
 828  * Release a vnode.  Call VOP_INACTIVE on last reference or
 829  * decrement reference count.
 830  *
 831  * To avoid race conditions, the v_count is left at 1 for
 832  * the call to VOP_INACTIVE. This prevents another thread
 833  * from reclaiming and releasing the vnode *before* the
 834  * VOP_INACTIVE routine has a chance to destroy the vnode.
 835  * We can't have more than 1 thread calling VOP_INACTIVE
 836  * on a vnode.
 837  */
 838 void
 839 vn_rele(vnode_t *vp)
 840 {
 841         mutex_enter(&vp->v_lock);
 842         if (vp->v_count == 1) {
 843                 mutex_exit(&vp->v_lock);
 844                 VOP_INACTIVE(vp, CRED(), NULL);
 845                 return;
 846         } else {
 847                 VERIFY(vp->v_count > 0);
 848         }
 849         VN_RELE_LOCKED(vp);
 850         mutex_exit(&vp->v_lock);
 851 }
 852 
 853 /*
 854  * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 855  * as a single reference, so v_count is not decremented until the last DNLC hold
 856  * is released. This makes it possible to distinguish vnodes that are referenced
 857  * only by the DNLC.
 858  */
 859 void
 860 vn_rele_dnlc(vnode_t *vp)
 861 {
 862         mutex_enter(&vp->v_lock);
 863         VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 864         if (--vp->v_count_dnlc == 0) {
 865                 if (vp->v_count == 1) {
 866                         mutex_exit(&vp->v_lock);
 867                         VOP_INACTIVE(vp, CRED(), NULL);
 868                         return;
 869                 }
 870                 VN_RELE_LOCKED(vp);
 871         }
 872         mutex_exit(&vp->v_lock);
 873 }
 874 
 875 /*
 876  * Like vn_rele() except that it clears v_stream under v_lock.
 877  * This is used by sockfs when it dismantles the association between
 878  * the sockfs node and the vnode in the underlying file system.
 879  * v_lock has to be held to prevent a thread coming through the lookupname
 880  * path from accessing a stream head that is going away.
 881  */
 882 void
 883 vn_rele_stream(vnode_t *vp)
 884 {
 885         mutex_enter(&vp->v_lock);
 886         vp->v_stream = NULL;
 887         if (vp->v_count == 1) {
 888                 mutex_exit(&vp->v_lock);
 889                 VOP_INACTIVE(vp, CRED(), NULL);
 890                 return;
 891         } else {
 892                 VERIFY(vp->v_count > 0);
 893         }
 894         VN_RELE_LOCKED(vp);
 895         mutex_exit(&vp->v_lock);
 896 }
 897 
/*
 * Taskq callback for vn_rele_async(): performs the deferred
 * VOP_INACTIVE call for the final reference to vp.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 903 
 904 /*
 905  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 906  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 907  * the file system as a result of releasing the vnode. Note, file systems
 908  * already have to handle the race where the vnode is incremented before the
 909  * inactive routine is called and does its locking.
 910  *
 911  * Warning: Excessive use of this routine can lead to performance problems.
 912  * This is because taskqs throttle back allocation if too many are created.
 913  */
 914 void
 915 vn_rele_async(vnode_t *vp, taskq_t *taskq)
 916 {
 917         mutex_enter(&vp->v_lock);
 918         if (vp->v_count == 1) {
 919                 mutex_exit(&vp->v_lock);
 920                 VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 921                     vp, TQ_SLEEP) != TASKQID_INVALID);
 922                 return;
 923         } else {
 924                 VERIFY(vp->v_count > 0);
 925         }
 926         VN_RELE_LOCKED(vp);
 927         mutex_exit(&vp->v_lock);
 928 }
 929 
/*
 * Open/create a vnode: convenience wrapper around vn_openat() with no
 * start vnode and no caller file descriptor (-1).
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
 943 
 944 
 945 /*
 946  * Open/create a vnode.
 947  * This may be callable by the kernel, the only known use
 948  * of user context being that the current user credentials
 949  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 950  */
 951 int
 952 vn_openat(
 953         char *pnamep,
 954         enum uio_seg seg,
 955         int filemode,
 956         int createmode,
 957         struct vnode **vpp,
 958         enum create crwhy,
 959         mode_t umask,
 960         struct vnode *startvp,
 961         int fd)
 962 {
 963         struct vnode *vp;
 964         int mode;
 965         int accessflags;
 966         int error;
 967         int in_crit = 0;
 968         int open_done = 0;
 969         int shrlock_done = 0;
 970         struct vattr vattr;
 971         enum symfollow follow;
 972         int estale_retry = 0;
 973         struct shrlock shr;
 974         struct shr_locowner shr_own;
 975         boolean_t create;
 976 
 977         mode = 0;
 978         accessflags = 0;
 979         if (filemode & FREAD)
 980                 mode |= VREAD;
 981         if (filemode & (FWRITE|FTRUNC))
 982                 mode |= VWRITE;
 983         if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
 984                 mode |= VEXEC;
 985 
 986         /* symlink interpretation */
 987         if (filemode & FNOFOLLOW)
 988                 follow = NO_FOLLOW;
 989         else
 990                 follow = FOLLOW;
 991 
 992         if (filemode & FAPPEND)
 993                 accessflags |= V_APPEND;
 994 
 995         /*
 996          * We need to handle the case of FCREAT | FDIRECTORY and the case of
 997          * FEXCL. If all three are specified, then we always fail because we
 998          * cannot create a directory through this interface and FEXCL says we
 999          * need to fail the request if we can't create it. If, however, only
1000          * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1001          * of opening a file that already exists. If it exists, we can do
1002          * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1003          * treated as FDIRECTORY.
1004          */
1005         if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1006             (FCREAT | FDIRECTORY | FEXCL)) {
1007                 return (EINVAL);
1008         }
1009 
1010         if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1011                 create = B_FALSE;
1012         } else if ((filemode & FCREAT) != 0) {
1013                 create = B_TRUE;
1014         } else {
1015                 create = B_FALSE;
1016         }
1017 
1018 top:
1019         if (create) {
1020                 enum vcexcl excl;
1021 
1022                 /*
1023                  * Wish to create a file.
1024                  */
1025                 vattr.va_type = VREG;
1026                 vattr.va_mode = createmode;
1027                 vattr.va_mask = AT_TYPE|AT_MODE;
1028                 if (filemode & FTRUNC) {
1029                         vattr.va_size = 0;
1030                         vattr.va_mask |= AT_SIZE;
1031                 }
1032                 if (filemode & FEXCL)
1033                         excl = EXCL;
1034                 else
1035                         excl = NONEXCL;
1036 
1037                 if (error =
1038                     vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1039                     (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1040                         return (error);
1041         } else {
1042                 /*
1043                  * Wish to open a file.  Just look it up.
1044                  */
1045                 if (error = lookupnameat(pnamep, seg, follow,
1046                     NULLVPP, &vp, startvp)) {
1047                         if ((error == ESTALE) &&
1048                             fs_need_estale_retry(estale_retry++))
1049                                 goto top;
1050                         return (error);
1051                 }
1052 
1053                 /*
1054                  * Get the attributes to check whether file is large.
1055                  * We do this only if the FOFFMAX flag is not set and
1056                  * only for regular files.
1057                  */
1058 
1059                 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1060                         vattr.va_mask = AT_SIZE;
1061                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1062                             CRED(), NULL))) {
1063                                 goto out;
1064                         }
1065                         if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1066                                 /*
1067                                  * Large File API - regular open fails
1068                                  * if FOFFMAX flag is set in file mode
1069                                  */
1070                                 error = EOVERFLOW;
1071                                 goto out;
1072                         }
1073                 }
1074                 /*
1075                  * Can't write directories, active texts, or
1076                  * read-only filesystems.  Can't truncate files
1077                  * on which mandatory locking is in effect.
1078                  */
1079                 if (filemode & (FWRITE|FTRUNC)) {
1080                         /*
1081                          * Allow writable directory if VDIROPEN flag is set.
1082                          */
1083                         if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1084                                 error = EISDIR;
1085                                 goto out;
1086                         }
1087                         if (ISROFILE(vp)) {
1088                                 error = EROFS;
1089                                 goto out;
1090                         }
1091                         /*
1092                          * Can't truncate files on which
1093                          * sysv mandatory locking is in effect.
1094                          */
1095                         if (filemode & FTRUNC) {
1096                                 vnode_t *rvp;
1097 
1098                                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1099                                         rvp = vp;
1100                                 if (rvp->v_filocks != NULL) {
1101                                         vattr.va_mask = AT_MODE;
1102                                         if ((error = VOP_GETATTR(vp,
1103                                             &vattr, 0, CRED(), NULL)) == 0 &&
1104                                             MANDLOCK(vp, vattr.va_mode))
1105                                                 error = EAGAIN;
1106                                 }
1107                         }
1108                         if (error)
1109                                 goto out;
1110                 }
1111                 /*
1112                  * Check permissions.
1113                  */
1114                 if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1115                         goto out;
1116 
1117                 /*
1118                  * Require FSEARCH and FDIRECTORY to return a directory. Require
1119                  * FEXEC to return a regular file.
1120                  */
1121                 if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1122                     vp->v_type != VDIR) {
1123                         error = ENOTDIR;
1124                         goto out;
1125                 }
1126                 if ((filemode & FEXEC) && vp->v_type != VREG) {
1127                         error = ENOEXEC;        /* XXX: error code? */
1128                         goto out;
1129                 }
1130         }
1131 
1132         /*
1133          * Do remaining checks for FNOFOLLOW and FNOLINKS.
1134          */
1135         if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1136                 error = ELOOP;
1137                 goto out;
1138         }
1139         if (filemode & FNOLINKS) {
1140                 vattr.va_mask = AT_NLINK;
1141                 if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1142                         goto out;
1143                 }
1144                 if (vattr.va_nlink != 1) {
1145                         error = EMLINK;
1146                         goto out;
1147                 }
1148         }
1149 
1150         /*
1151          * Opening a socket corresponding to the AF_UNIX pathname
1152          * in the filesystem name space is not supported.
1153          * However, VSOCK nodes in namefs are supported in order
1154          * to make fattach work for sockets.
1155          *
1156          * XXX This uses VOP_REALVP to distinguish between
1157          * an unopened namefs node (where VOP_REALVP returns a
1158          * different VSOCK vnode) and a VSOCK created by vn_create
1159          * in some file system (where VOP_REALVP would never return
1160          * a different vnode).
1161          */
1162         if (vp->v_type == VSOCK) {
1163                 struct vnode *nvp;
1164 
1165                 error = VOP_REALVP(vp, &nvp, NULL);
1166                 if (error != 0 || nvp == NULL || nvp == vp ||
1167                     nvp->v_type != VSOCK) {
1168                         error = EOPNOTSUPP;
1169                         goto out;
1170                 }
1171         }
1172 
1173         if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1174                 /* get share reservation */
1175                 shr.s_access = 0;
1176                 if (filemode & FWRITE)
1177                         shr.s_access |= F_WRACC;
1178                 if (filemode & FREAD)
1179                         shr.s_access |= F_RDACC;
1180                 shr.s_deny = 0;
1181                 shr.s_sysid = 0;
1182                 shr.s_pid = ttoproc(curthread)->p_pid;
1183                 shr_own.sl_pid = shr.s_pid;
1184                 shr_own.sl_id = fd;
1185                 shr.s_own_len = sizeof (shr_own);
1186                 shr.s_owner = (caddr_t)&shr_own;
1187                 error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1188                     NULL);
1189                 if (error)
1190                         goto out;
1191                 shrlock_done = 1;
1192 
1193                 /* nbmand conflict check if truncating file */
1194                 if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1195                         nbl_start_crit(vp, RW_READER);
1196                         in_crit = 1;
1197 
1198                         vattr.va_mask = AT_SIZE;
1199                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1200                                 goto out;
1201                         if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1202                             NULL)) {
1203                                 error = EACCES;
1204                                 goto out;
1205                         }
1206                 }
1207         }
1208 
1209         /*
1210          * Do opening protocol.
1211          */
1212         error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1213         if (error)
1214                 goto out;
1215         open_done = 1;
1216 
1217         /*
1218          * Truncate if required.
1219          */
1220         if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1221                 vattr.va_size = 0;
1222                 vattr.va_mask = AT_SIZE;
1223                 if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1224                         goto out;
1225         }
1226 
1227         /*
1228          * Turn on directio, if requested.
1229          */
1230         if (filemode & FDIRECT) {
1231                 if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1232                     CRED(), NULL, NULL)) != 0) {
1233                         /*
1234                          * On Linux, O_DIRECT returns EINVAL when the file
1235                          * system does not support directio, so we'll do the
1236                          * same.
1237                          */
1238                         error = EINVAL;
1239                         goto out;
1240                 }
1241         }
1242 out:
1243         ASSERT(vp->v_count > 0);
1244 
1245         if (in_crit) {
1246                 nbl_end_crit(vp);
1247                 in_crit = 0;
1248         }
1249         if (error) {
1250                 if (open_done) {
1251                         (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1252                             NULL);
1253                         open_done = 0;
1254                         shrlock_done = 0;
1255                 }
1256                 if (shrlock_done) {
1257                         (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1258                             NULL);
1259                         shrlock_done = 0;
1260                 }
1261 
1262                 /*
1263                  * The following clause was added to handle a problem
1264                  * with NFS consistency.  It is possible that a lookup
1265                  * of the file to be opened succeeded, but the file
1266                  * itself doesn't actually exist on the server.  This
1267                  * is chiefly due to the DNLC containing an entry for
1268                  * the file which has been removed on the server.  In
1269                  * this case, we just start over.  If there was some
1270                  * other cause for the ESTALE error, then the lookup
1271                  * of the file will fail and the error will be returned
1272                  * above instead of looping around from here.
1273                  */
1274                 VN_RELE(vp);
1275                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1276                         goto top;
1277         } else
1278                 *vpp = vp;
1279         return (error);
1280 }
1281 
1282 /*
1283  * The following two accessor functions are for the NFSv4 server.  Since there
1284  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1285  * vnode open counts correct when a client "upgrades" an open or does an
1286  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1287  * open mode (add or subtract read or write), but also change the share/deny
1288  * modes.  However, share reservations are not integrated with OPEN, yet, so
1289  * we need to handle each separately.  These functions are cleaner than having
1290  * the NFS server manipulate the counts directly, however, nobody else should
1291  * use these functions.
1292  */
1293 void
1294 vn_open_upgrade(
1295         vnode_t *vp,
1296         int filemode)
1297 {
1298         ASSERT(vp->v_type == VREG);
1299 
1300         if (filemode & FREAD)
1301                 atomic_inc_32(&vp->v_rdcnt);
1302         if (filemode & FWRITE)
1303                 atomic_inc_32(&vp->v_wrcnt);
1304 
1305 }
1306 
1307 void
1308 vn_open_downgrade(
1309         vnode_t *vp,
1310         int filemode)
1311 {
1312         ASSERT(vp->v_type == VREG);
1313 
1314         if (filemode & FREAD) {
1315                 ASSERT(vp->v_rdcnt > 0);
1316                 atomic_dec_32(&vp->v_rdcnt);
1317         }
1318         if (filemode & FWRITE) {
1319                 ASSERT(vp->v_wrcnt > 0);
1320                 atomic_dec_32(&vp->v_wrcnt);
1321         }
1322 
1323 }
1324 
/*
 * Create a vnode: convenience wrapper around vn_createat() with no
 * start vnode (lookup proceeds from the process's root/cwd).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
1340 
1341 /*
1342  * Create a vnode (makenode).
1343  */
1344 int
1345 vn_createat(
1346         char *pnamep,
1347         enum uio_seg seg,
1348         struct vattr *vap,
1349         enum vcexcl excl,
1350         int mode,
1351         struct vnode **vpp,
1352         enum create why,
1353         int flag,
1354         mode_t umask,
1355         struct vnode *startvp)
1356 {
1357         struct vnode *dvp;      /* ptr to parent dir vnode */
1358         struct vnode *vp = NULL;
1359         struct pathname pn;
1360         int error;
1361         int in_crit = 0;
1362         struct vattr vattr;
1363         enum symfollow follow;
1364         int estale_retry = 0;
1365         uint32_t auditing = AU_AUDITING();
1366 
1367         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1368 
1369         /* symlink interpretation */
1370         if ((flag & FNOFOLLOW) || excl == EXCL)
1371                 follow = NO_FOLLOW;
1372         else
1373                 follow = FOLLOW;
1374         flag &= ~(FNOFOLLOW|FNOLINKS);
1375 
1376 top:
1377         /*
1378          * Lookup directory.
1379          * If new object is a file, call lower level to create it.
1380          * Note that it is up to the lower level to enforce exclusive
1381          * creation, if the file is already there.
1382          * This allows the lower level to do whatever
1383          * locking or protocol that is needed to prevent races.
1384          * If the new object is directory call lower level to make
1385          * the new directory, with "." and "..".
1386          */
1387         if (error = pn_get(pnamep, seg, &pn))
1388                 return (error);
1389         if (auditing)
1390                 audit_vncreate_start();
1391         dvp = NULL;
1392         *vpp = NULL;
1393         /*
1394          * lookup will find the parent directory for the vnode.
1395          * When it is done the pn holds the name of the entry
1396          * in the directory.
1397          * If this is a non-exclusive create we also find the node itself.
1398          */
1399         error = lookuppnat(&pn, NULL, follow, &dvp,
1400             (excl == EXCL) ? NULLVPP : vpp, startvp);
1401         if (error) {
1402                 pn_free(&pn);
1403                 if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1404                         goto top;
1405                 if (why == CRMKDIR && error == EINVAL)
1406                         error = EEXIST;         /* SVID */
1407                 return (error);
1408         }
1409 
1410         if (why != CRMKNOD)
1411                 vap->va_mode &= ~VSVTX;
1412 
1413         /*
1414          * If default ACLs are defined for the directory don't apply the
1415          * umask if umask is passed.
1416          */
1417 
1418         if (umask) {
1419 
1420                 vsecattr_t vsec;
1421 
1422                 vsec.vsa_aclcnt = 0;
1423                 vsec.vsa_aclentp = NULL;
1424                 vsec.vsa_dfaclcnt = 0;
1425                 vsec.vsa_dfaclentp = NULL;
1426                 vsec.vsa_mask = VSA_DFACLCNT;
1427                 error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1428                 /*
1429                  * If error is ENOSYS then treat it as no error
1430                  * Don't want to force all file systems to support
1431                  * aclent_t style of ACL's.
1432                  */
1433                 if (error == ENOSYS)
1434                         error = 0;
1435                 if (error) {
1436                         if (*vpp != NULL)
1437                                 VN_RELE(*vpp);
1438                         goto out;
1439                 } else {
1440                         /*
1441                          * Apply the umask if no default ACLs.
1442                          */
1443                         if (vsec.vsa_dfaclcnt == 0)
1444                                 vap->va_mode &= ~umask;
1445 
1446                         /*
1447                          * VOP_GETSECATTR() may have allocated memory for
1448                          * ACLs we didn't request, so double-check and
1449                          * free it if necessary.
1450                          */
1451                         if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1452                                 kmem_free((caddr_t)vsec.vsa_aclentp,
1453                                     vsec.vsa_aclcnt * sizeof (aclent_t));
1454                         if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1455                                 kmem_free((caddr_t)vsec.vsa_dfaclentp,
1456                                     vsec.vsa_dfaclcnt * sizeof (aclent_t));
1457                 }
1458         }
1459 
1460         /*
1461          * In general we want to generate EROFS if the file system is
1462          * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1463          * documents the open system call, and it says that O_CREAT has no
1464          * effect if the file already exists.  Bug 1119649 states
1465          * that open(path, O_CREAT, ...) fails when attempting to open an
1466          * existing file on a read only file system.  Thus, the first part
1467          * of the following if statement has 3 checks:
1468          *      if the file exists &&
1469          *              it is being open with write access &&
1470          *              the file system is read only
1471          *      then generate EROFS
1472          */
1473         if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1474             (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1475                 if (*vpp)
1476                         VN_RELE(*vpp);
1477                 error = EROFS;
1478         } else if (excl == NONEXCL && *vpp != NULL) {
1479                 vnode_t *rvp;
1480 
1481                 /*
1482                  * File already exists.  If a mandatory lock has been
1483                  * applied, return error.
1484                  */
1485                 vp = *vpp;
1486                 if (VOP_REALVP(vp, &rvp, NULL) != 0)
1487                         rvp = vp;
1488                 if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1489                         nbl_start_crit(vp, RW_READER);
1490                         in_crit = 1;
1491                 }
1492                 if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1493                         vattr.va_mask = AT_MODE|AT_SIZE;
1494                         if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1495                                 goto out;
1496                         }
1497                         if (MANDLOCK(vp, vattr.va_mode)) {
1498                                 error = EAGAIN;
1499                                 goto out;
1500                         }
1501                         /*
1502                          * File cannot be truncated if non-blocking mandatory
1503                          * locks are currently on the file.
1504                          */
1505                         if ((vap->va_mask & AT_SIZE) && in_crit) {
1506                                 u_offset_t offset;
1507                                 ssize_t length;
1508 
1509                                 offset = vap->va_size > vattr.va_size ?
1510                                     vattr.va_size : vap->va_size;
1511                                 length = vap->va_size > vattr.va_size ?
1512                                     vap->va_size - vattr.va_size :
1513                                     vattr.va_size - vap->va_size;
1514                                 if (nbl_conflict(vp, NBL_WRITE, offset,
1515                                     length, 0, NULL)) {
1516                                         error = EACCES;
1517                                         goto out;
1518                                 }
1519                         }
1520                 }
1521 
1522                 /*
1523                  * If the file is the root of a VFS, we've crossed a
1524                  * mount point and the "containing" directory that we
1525                  * acquired above (dvp) is irrelevant because it's in
1526                  * a different file system.  We apply VOP_CREATE to the
1527                  * target itself instead of to the containing directory
1528                  * and supply a null path name to indicate (conventionally)
1529                  * the node itself as the "component" of interest.
1530                  *
1531                  * The call to VOP_CREATE() is necessary to ensure
1532                  * that the appropriate permission checks are made,
1533                  * i.e. EISDIR, EACCES, etc.  We already know that vpp
1534                  * exists since we are in the else condition where this
1535                  * was checked.
1536                  */
1537                 if (vp->v_flag & VROOT) {
1538                         ASSERT(why != CRMKDIR);
1539                         error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1540                             CRED(), flag, NULL, NULL);
1541                         /*
1542                          * If the create succeeded, it will have created a
1543                          * new reference on a new vnode (*vpp) in the child
1544                          * file system, so we want to drop our reference on
1545                          * the old (vp) upon exit.
1546                          */
1547                         goto out;
1548                 }
1549 
1550                 /*
1551                  * Large File API - non-large open (FOFFMAX flag not set)
1552                  * of regular file fails if the file size exceeds MAXOFF32_T.
1553                  */
1554                 if (why != CRMKDIR &&
1555                     !(flag & FOFFMAX) &&
1556                     (vp->v_type == VREG)) {
1557                         vattr.va_mask = AT_SIZE;
1558                         if ((error = VOP_GETATTR(vp, &vattr, 0,
1559                             CRED(), NULL))) {
1560                                 goto out;
1561                         }
1562                         if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1563                                 error = EOVERFLOW;
1564                                 goto out;
1565                         }
1566                 }
1567         }
1568 
1569         if (error == 0) {
1570                 /*
1571                  * Call mkdir() if specified, otherwise create().
1572                  */
1573                 int must_be_dir = pn_fixslash(&pn); /* trailing '/'? */
1574 
1575                 if (why == CRMKDIR)
1576                         /*
1577                          * N.B., if vn_createat() ever requests
1578                          * case-insensitive behavior then it will need
1579                          * to be passed to VOP_MKDIR().  VOP_CREATE()
1580                          * will already get it via "flag"
1581                          */
1582                         error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1583                             NULL, 0, NULL);
1584                 else if (!must_be_dir)
1585                         error = VOP_CREATE(dvp, pn.pn_path, vap,
1586                             excl, mode, vpp, CRED(), flag, NULL, NULL);
1587                 else
1588                         error = ENOTDIR;
1589         }
1590 
1591 out:
1592 
1593         if (auditing)
1594                 audit_vncreate_finish(*vpp, error);
1595         if (in_crit) {
1596                 nbl_end_crit(vp);
1597                 in_crit = 0;
1598         }
1599         if (vp != NULL) {
1600                 VN_RELE(vp);
1601                 vp = NULL;
1602         }
1603         pn_free(&pn);
1604         VN_RELE(dvp);
1605         /*
1606          * The following clause was added to handle a problem
1607          * with NFS consistency.  It is possible that a lookup
1608          * of the file to be created succeeded, but the file
1609          * itself doesn't actually exist on the server.  This
1610          * is chiefly due to the DNLC containing an entry for
1611          * the file which has been removed on the server.  In
1612          * this case, we just start over.  If there was some
1613          * other cause for the ESTALE error, then the lookup
1614          * of the file will fail and the error will be returned
1615          * above instead of looping around from here.
1616          */
1617         if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1618                 goto top;
1619         return (error);
1620 }
1621 
/*
 * Create hard link "to" naming the existing file "from".  Both lookups
 * start at the current directory and symlinks in "from" are not followed.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}
1627 
/*
 * Create a hard link "to" referring to the existing file "from".
 * fstartvp/tstartvp, when non-NULL, are the directories the respective
 * path lookups start from (the *at() semantics); "seg" says whether the
 * names live in user or kernel address space.  Returns 0 or an errno,
 * retrying the whole operation on ESTALE (see fs_need_estale_retry()).
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Resolve the existing source; "follow" controls symlink traversal. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Resolve only the target's parent directory (NULLVPP: the new
	 * name itself must not be looked up). */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.  fsids are compared
	 * rather than vfs pointers so loopback (lofs) mounts work.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.  A trailing '/' on the new name is stripped first.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	/* A stale file handle (e.g. stale NFS/DNLC data) gets a bounded retry. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1687 
/*
 * Rename "from" to "to", with both lookups starting at the current
 * directory.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}
1693 
/*
 * Rename fname to tname.  fdvp/tdvp, when non-NULL, are the starting
 * directories for the respective lookups (the *at() semantics).
 * Returns 0 or an errno; the whole operation is retried on ESTALE
 * (see fs_need_estale_retry()).
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* nbl critical regions entered? */
	vnode_t *fromvp, *fvp;		/* source dir / source entry */
	vnode_t *tovp, *targvp;		/* target dir / existing target entry */
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories.
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/*
	 * If a target entry already exists and is distinct from the
	 * source, check for an nbmand share-reservation conflict with
	 * removing it.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check the source itself for a rename conflict. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.  A trailing '/' on the target name is stripped.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	/* Exit nbl critical regions before dropping the vnode references. */
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1833 
1834 /*
1835  * Remove a file or directory.
1836  */
1837 int
1838 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1839 {
1840         return (vn_removeat(NULL, fnamep, seg, dirflag));
1841 }
1842 
/*
 * Remove the file or directory named by fnamep, with the lookup
 * starting at startvp when non-NULL (*at() semantics).  dirflag ==
 * RMDIRECTORY applies rmdir(2) rules, otherwise unlink(2) rules.
 * Handles the special case of removing a namefs mount by unmounting
 * it first.  Retries the whole operation on ESTALE.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* inside an nbl critical region? */
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		/* Hold the covered vnode across the unmount of the namefs. */
		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Capture the type now; vp may be released below. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * VOP_RMDIR needs the caller's current directory
			 * (e.g. to refuse removing it); grab a hold on it
			 * under p_lock.
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2030 
2031 /*
2032  * Utility function to compare equality of vnodes.
2033  * Compare the underlying real vnodes, if there are underlying vnodes.
2034  * This is a more thorough comparison than the VN_CMP() macro provides.
2035  */
2036 int
2037 vn_compare(vnode_t *vp1, vnode_t *vp2)
2038 {
2039         vnode_t *realvp;
2040 
2041         if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2042                 vp1 = realvp;
2043         if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2044                 vp2 = realvp;
2045         return (VN_CMP(vp1, vp2));
2046 }
2047 
2048 /*
2049  * The number of locks to hash into.  This value must be a power
2050  * of 2 minus 1 and should probably also be prime.
2051  */
2052 #define NUM_BUCKETS     1023
2053 
2054 struct  vn_vfslocks_bucket {
2055         kmutex_t vb_lock;
2056         vn_vfslocks_entry_t *vb_list;
2057         char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2058 };
2059 
2060 /*
2061  * Total number of buckets will be NUM_BUCKETS + 1 .
2062  */
2063 
2064 #pragma align   64(vn_vfslocks_buckets)
2065 static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2066 
2067 #define VN_VFSLOCKS_SHIFT       9
2068 
2069 #define VN_VFSLOCKS_HASH(vfsvpptr)      \
2070         ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2071 
2072 /*
2073  * vn_vfslocks_getlock() uses an HASH scheme to generate
2074  * rwstlock using vfs/vnode pointer passed to it.
2075  *
2076  * vn_vfslocks_rele() releases a reference in the
2077  * HASH table which allows the entry allocated by
2078  * vn_vfslocks_getlock() to be freed at a later
2079  * stage when the refcount drops to zero.
2080  */
2081 
2082 vn_vfslocks_entry_t *
2083 vn_vfslocks_getlock(void *vfsvpptr)
2084 {
2085         struct vn_vfslocks_bucket *bp;
2086         vn_vfslocks_entry_t *vep;
2087         vn_vfslocks_entry_t *tvep;
2088 
2089         ASSERT(vfsvpptr != NULL);
2090         bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2091 
2092         mutex_enter(&bp->vb_lock);
2093         for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2094                 if (vep->ve_vpvfs == vfsvpptr) {
2095                         vep->ve_refcnt++;
2096                         mutex_exit(&bp->vb_lock);
2097                         return (vep);
2098                 }
2099         }
2100         mutex_exit(&bp->vb_lock);
2101         vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2102         rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2103         vep->ve_vpvfs = (char *)vfsvpptr;
2104         vep->ve_refcnt = 1;
2105         mutex_enter(&bp->vb_lock);
2106         for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2107                 if (tvep->ve_vpvfs == vfsvpptr) {
2108                         tvep->ve_refcnt++;
2109                         mutex_exit(&bp->vb_lock);
2110 
2111                         /*
2112                          * There is already an entry in the hash
2113                          * destroy what we just allocated.
2114                          */
2115                         rwst_destroy(&vep->ve_lock);
2116                         kmem_free(vep, sizeof (*vep));
2117                         return (tvep);
2118                 }
2119         }
2120         vep->ve_next = bp->vb_list;
2121         bp->vb_list = vep;
2122         mutex_exit(&bp->vb_lock);
2123         return (vep);
2124 }
2125 
/*
 * Drop one reference on a hash entry obtained via vn_vfslocks_getlock().
 * When the count reaches zero the entry is unlinked from its bucket and
 * freed.  Panics on a negative refcount or if a zero-count entry is not
 * found in its bucket (both indicate reference-counting bugs).
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	/* Last reference: unlink the entry from the bucket and free it. */
	pvep = NULL;
	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2164 
2165 /*
2166  * vn_vfswlock_wait is used to implement a lock which is logically a writers
2167  * lock protecting the v_vfsmountedhere field.
2168  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2169  * except that it blocks to acquire the lock VVFSLOCK.
2170  *
2171  * traverse() and routines re-implementing part of traverse (e.g. autofs)
2172  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2173  * need the non-blocking version of the writers lock i.e. vn_vfswlock
2174  */
2175 int
2176 vn_vfswlock_wait(vnode_t *vp)
2177 {
2178         int retval;
2179         vn_vfslocks_entry_t *vpvfsentry;
2180         ASSERT(vp != NULL);
2181 
2182         vpvfsentry = vn_vfslocks_getlock(vp);
2183         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2184 
2185         if (retval == EINTR) {
2186                 vn_vfslocks_rele(vpvfsentry);
2187                 return (EINTR);
2188         }
2189         return (retval);
2190 }
2191 
2192 int
2193 vn_vfsrlock_wait(vnode_t *vp)
2194 {
2195         int retval;
2196         vn_vfslocks_entry_t *vpvfsentry;
2197         ASSERT(vp != NULL);
2198 
2199         vpvfsentry = vn_vfslocks_getlock(vp);
2200         retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2201 
2202         if (retval == EINTR) {
2203                 vn_vfslocks_rele(vpvfsentry);
2204                 return (EINTR);
2205         }
2206 
2207         return (retval);
2208 }
2209 
2210 
2211 /*
2212  * vn_vfswlock is used to implement a lock which is logically a writers lock
2213  * protecting the v_vfsmountedhere field.
2214  */
2215 int
2216 vn_vfswlock(vnode_t *vp)
2217 {
2218         vn_vfslocks_entry_t *vpvfsentry;
2219 
2220         /*
2221          * If vp is NULL then somebody is trying to lock the covered vnode
2222          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2223          * only happen when unmounting /.  Since that operation will fail
2224          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2225          */
2226         if (vp == NULL)
2227                 return (EBUSY);
2228 
2229         vpvfsentry = vn_vfslocks_getlock(vp);
2230 
2231         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2232                 return (0);
2233 
2234         vn_vfslocks_rele(vpvfsentry);
2235         return (EBUSY);
2236 }
2237 
2238 int
2239 vn_vfsrlock(vnode_t *vp)
2240 {
2241         vn_vfslocks_entry_t *vpvfsentry;
2242 
2243         /*
2244          * If vp is NULL then somebody is trying to lock the covered vnode
2245          * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2246          * only happen when unmounting /.  Since that operation will fail
2247          * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2248          */
2249         if (vp == NULL)
2250                 return (EBUSY);
2251 
2252         vpvfsentry = vn_vfslocks_getlock(vp);
2253 
2254         if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2255                 return (0);
2256 
2257         vn_vfslocks_rele(vpvfsentry);
2258         return (EBUSY);
2259 }
2260 
/*
 * Release the logical v_vfsmountedhere lock taken via vn_vfsrlock,
 * vn_vfswlock or their _wait variants.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice:
	 * 1. To release the reference taken by the vn_vfslocks_getlock()
	 *    call just below (used only to locate the entry).
	 * 2. To release the reference still held from the locking routine
	 *    (vn_vfsrlock/vn_vfswlock etc.) that this call balances.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2278 
2279 int
2280 vn_vfswlock_held(vnode_t *vp)
2281 {
2282         int held;
2283         vn_vfslocks_entry_t *vpvfsentry;
2284 
2285         ASSERT(vp != NULL);
2286 
2287         vpvfsentry = vn_vfslocks_getlock(vp);
2288         held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2289 
2290         vn_vfslocks_rele(vpvfsentry);
2291         return (held);
2292 }
2293 
2294 
2295 int
2296 vn_make_ops(
2297         const char *name,                       /* Name of file system */
2298         const fs_operation_def_t *templ,        /* Operation specification */
2299         vnodeops_t **actual)                    /* Return the vnodeops */
2300 {
2301         int unused_ops;
2302         int error;
2303 
2304         *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2305 
2306         (*actual)->vnop_name = name;
2307 
2308         error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2309         if (error) {
2310                 kmem_free(*actual, sizeof (vnodeops_t));
2311         }
2312 
2313 #if DEBUG
2314         if (unused_ops != 0)
2315                 cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2316                     "but not used", name, unused_ops);
2317 #endif
2318 
2319         return (error);
2320 }
2321 
2322 /*
2323  * Free the vnodeops created as a result of vn_make_ops()
2324  */
2325 void
2326 vn_freevnodeops(vnodeops_t *vnops)
2327 {
2328         kmem_free(vnops, sizeof (vnodeops_t));
2329 }
2330 
2331 /*
2332  * Vnode cache.
2333  */
2334 
2335 /* ARGSUSED */
2336 static int
2337 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2338 {
2339         struct vnode *vp;
2340 
2341         vp = buf;
2342 
2343         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2344         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2345         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2346         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2347         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2348         vp->v_path = vn_vpath_empty;
2349         vp->v_path_stamp = 0;
2350         vp->v_mpssdata = NULL;
2351         vp->v_vsd = NULL;
2352         vp->v_fopdata = NULL;
2353 
2354         return (0);
2355 }
2356 
2357 /* ARGSUSED */
2358 static void
2359 vn_cache_destructor(void *buf, void *cdrarg)
2360 {
2361         struct vnode *vp;
2362 
2363         vp = buf;
2364 
2365         rw_destroy(&vp->v_nbllock);
2366         cv_destroy(&vp->v_cv);
2367         mutex_destroy(&vp->v_vsd_lock);
2368         mutex_destroy(&vp->v_lock);
2369 }
2370 
/*
 * Create the global kmem cache from which all vnodes are allocated.
 * The assertion verifies that the configured alignment constants agree
 * with the rounded-up vnode size.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2381 
/*
 * Destroy the global vnode kmem cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2387 
2388 /*
2389  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2390  * cached by the file system and vnodes remain associated.
2391  */
2392 void
2393 vn_recycle(vnode_t *vp)
2394 {
2395         ASSERT(vp->v_pages == NULL);
2396         VERIFY(vp->v_path != NULL);
2397 
2398         /*
2399          * XXX - This really belongs in vn_reinit(), but we have some issues
2400          * with the counts.  Best to have it here for clean initialization.
2401          */
2402         vp->v_rdcnt = 0;
2403         vp->v_wrcnt = 0;
2404         vp->v_mmap_read = 0;
2405         vp->v_mmap_write = 0;
2406 
2407         /*
2408          * If FEM was in use, make sure everything gets cleaned up
2409          * NOTE: vp->v_femhead is initialized to NULL in the vnode
2410          * constructor.
2411          */
2412         if (vp->v_femhead) {
2413                 /* XXX - There should be a free_femhead() that does all this */
2414                 ASSERT(vp->v_femhead->femh_list == NULL);
2415                 mutex_destroy(&vp->v_femhead->femh_lock);
2416                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2417                 vp->v_femhead = NULL;
2418         }
2419         if (vp->v_path != vn_vpath_empty) {
2420                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2421                 vp->v_path = vn_vpath_empty;
2422         }
2423         vp->v_path_stamp = 0;
2424 
2425         if (vp->v_fopdata != NULL) {
2426                 free_fopdata(vp);
2427         }
2428         vp->v_mpssdata = NULL;
2429         vsd_free(vp);
2430 }
2431 
2432 /*
2433  * Used to reset the vnode fields including those that are directly accessible
2434  * as well as those which require an accessor function.
2435  *
2436  * Does not initialize:
2437  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2438  *      v_data (since FS-nodes and vnodes point to each other and should
2439  *              be updated simultaneously)
2440  *      v_op (in case someone needs to make a VOP call on this object)
2441  */
2442 void
2443 vn_reinit(vnode_t *vp)
2444 {
2445         vp->v_count = 1;
2446         vp->v_count_dnlc = 0;
2447         vp->v_vfsp = NULL;
2448         vp->v_stream = NULL;
2449         vp->v_vfsmountedhere = NULL;
2450         vp->v_flag = 0;
2451         vp->v_type = VNON;
2452         vp->v_rdev = NODEV;
2453 
2454         vp->v_filocks = NULL;
2455         vp->v_shrlocks = NULL;
2456         vp->v_pages = NULL;
2457 
2458         vp->v_locality = NULL;
2459         vp->v_xattrdir = NULL;
2460 
2461         /*
2462          * In a few specific instances, vn_reinit() is used to initialize
2463          * locally defined vnode_t instances.  Lacking the construction offered
2464          * by vn_alloc(), these vnodes require v_path initialization.
2465          */
2466         if (vp->v_path == NULL) {
2467                 vp->v_path = vn_vpath_empty;
2468         }
2469 
2470         /* Handles v_femhead, v_path, and the r/w/map counts */
2471         vn_recycle(vp);
2472 }
2473 
2474 vnode_t *
2475 vn_alloc(int kmflag)
2476 {
2477         vnode_t *vp;
2478 
2479         vp = kmem_cache_alloc(vn_cache, kmflag);
2480 
2481         if (vp != NULL) {
2482                 vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2483                 vp->v_fopdata = NULL;
2484                 vn_reinit(vp);
2485         }
2486 
2487         return (vp);
2488 }
2489 
2490 void
2491 vn_free(vnode_t *vp)
2492 {
2493         ASSERT(vp->v_shrlocks == NULL);
2494         ASSERT(vp->v_filocks == NULL);
2495 
2496         /*
2497          * Some file systems call vn_free() with v_count of zero,
2498          * some with v_count of 1.  In any case, the value should
2499          * never be anything else.
2500          */
2501         ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2502         ASSERT(vp->v_count_dnlc == 0);
2503         VERIFY(vp->v_path != NULL);
2504         if (vp->v_path != vn_vpath_empty) {
2505                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2506                 vp->v_path = vn_vpath_empty;
2507         }
2508 
2509         /* If FEM was in use, make sure everything gets cleaned up */
2510         if (vp->v_femhead) {
2511                 /* XXX - There should be a free_femhead() that does all this */
2512                 ASSERT(vp->v_femhead->femh_list == NULL);
2513                 mutex_destroy(&vp->v_femhead->femh_lock);
2514                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2515                 vp->v_femhead = NULL;
2516         }
2517 
2518         if (vp->v_fopdata != NULL) {
2519                 free_fopdata(vp);
2520         }
2521         vp->v_mpssdata = NULL;
2522         vsd_free(vp);
2523         kmem_cache_free(vn_cache, vp);
2524 }
2525 
2526 /*
2527  * vnode status changes, should define better states than 1, 0.
2528  */
2529 void
2530 vn_reclaim(vnode_t *vp)
2531 {
2532         vfs_t   *vfsp = vp->v_vfsp;
2533 
2534         if (vfsp == NULL ||
2535             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2536                 return;
2537         }
2538         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2539 }
2540 
2541 void
2542 vn_idle(vnode_t *vp)
2543 {
2544         vfs_t   *vfsp = vp->v_vfsp;
2545 
2546         if (vfsp == NULL ||
2547             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2548                 return;
2549         }
2550         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2551 }
2552 void
2553 vn_exists(vnode_t *vp)
2554 {
2555         vfs_t   *vfsp = vp->v_vfsp;
2556 
2557         if (vfsp == NULL ||
2558             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2559                 return;
2560         }
2561         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2562 }
2563 
2564 void
2565 vn_invalid(vnode_t *vp)
2566 {
2567         vfs_t   *vfsp = vp->v_vfsp;
2568 
2569         if (vfsp == NULL ||
2570             vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2571                 return;
2572         }
2573         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2574 }
2575 
2576 /* Vnode event notification */
2577 
2578 int
2579 vnevent_support(vnode_t *vp, caller_context_t *ct)
2580 {
2581         if (vp == NULL)
2582                 return (EINVAL);
2583 
2584         return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2585 }
2586 
2587 void
2588 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2589 {
2590         if (vp == NULL || vp->v_femhead == NULL) {
2591                 return;
2592         }
2593         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2594 }
2595 
2596 void
2597 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2598     caller_context_t *ct)
2599 {
2600         if (vp == NULL || vp->v_femhead == NULL) {
2601                 return;
2602         }
2603         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2604 }
2605 
2606 void
2607 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2608 {
2609         if (vp == NULL || vp->v_femhead == NULL) {
2610                 return;
2611         }
2612         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2613 }
2614 
2615 void
2616 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2617 {
2618         if (vp == NULL || vp->v_femhead == NULL) {
2619                 return;
2620         }
2621         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2622 }
2623 
2624 void
2625 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2626 {
2627         if (vp == NULL || vp->v_femhead == NULL) {
2628                 return;
2629         }
2630         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2631 }
2632 
2633 void
2634 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2635     caller_context_t *ct)
2636 {
2637         if (vp == NULL || vp->v_femhead == NULL) {
2638                 return;
2639         }
2640         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2641 }
2642 
2643 void
2644 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2645     caller_context_t *ct)
2646 {
2647         if (vp == NULL || vp->v_femhead == NULL) {
2648                 return;
2649         }
2650         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2651 }
2652 
2653 void
2654 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2655     caller_context_t *ct)
2656 {
2657         if (vp == NULL || vp->v_femhead == NULL) {
2658                 return;
2659         }
2660         (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2661 }
2662 
2663 void
2664 vnevent_create(vnode_t *vp, caller_context_t *ct)
2665 {
2666         if (vp == NULL || vp->v_femhead == NULL) {
2667                 return;
2668         }
2669         (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2670 }
2671 
2672 void
2673 vnevent_link(vnode_t *vp, caller_context_t *ct)
2674 {
2675         if (vp == NULL || vp->v_femhead == NULL) {
2676                 return;
2677         }
2678         (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2679 }
2680 
2681 void
2682 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2683 {
2684         if (vp == NULL || vp->v_femhead == NULL) {
2685                 return;
2686         }
2687         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2688 }
2689 
2690 void
2691 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2692 {
2693         if (vp == NULL || vp->v_femhead == NULL) {
2694                 return;
2695         }
2696         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2697 }
2698 
2699 /*
2700  * Vnode accessors.
2701  */
2702 
2703 int
2704 vn_is_readonly(vnode_t *vp)
2705 {
2706         return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2707 }
2708 
2709 int
2710 vn_has_flocks(vnode_t *vp)
2711 {
2712         return (vp->v_filocks != NULL);
2713 }
2714 
2715 int
2716 vn_has_mandatory_locks(vnode_t *vp, int mode)
2717 {
2718         return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2719 }
2720 
2721 int
2722 vn_has_cached_data(vnode_t *vp)
2723 {
2724         return (vp->v_pages != NULL);
2725 }
2726 
2727 /*
2728  * Return 0 if the vnode in question shouldn't be permitted into a zone via
2729  * zone_enter(2).
2730  */
2731 int
2732 vn_can_change_zones(vnode_t *vp)
2733 {
2734         struct vfssw *vswp;
2735         int allow = 1;
2736         vnode_t *rvp;
2737 
2738         if (nfs_global_client_only != 0)
2739                 return (1);
2740 
2741         /*
2742          * We always want to look at the underlying vnode if there is one.
2743          */
2744         if (VOP_REALVP(vp, &rvp, NULL) != 0)
2745                 rvp = vp;
2746         /*
2747          * Some pseudo filesystems (including doorfs) don't actually register
2748          * their vfsops_t, so the following may return NULL; we happily let
2749          * such vnodes switch zones.
2750          */
2751         vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2752         if (vswp != NULL) {
2753                 if (vswp->vsw_flag & VSW_NOTZONESAFE)
2754                         allow = 0;
2755                 vfs_unrefvfssw(vswp);
2756         }
2757         return (allow);
2758 }
2759 
2760 /*
2761  * Return nonzero if the vnode is a mount point, zero if not.
2762  */
2763 int
2764 vn_ismntpt(vnode_t *vp)
2765 {
2766         return (vp->v_vfsmountedhere != NULL);
2767 }
2768 
2769 /* Retrieve the vfs (if any) mounted on this vnode */
2770 vfs_t *
2771 vn_mountedvfs(vnode_t *vp)
2772 {
2773         return (vp->v_vfsmountedhere);
2774 }
2775 
2776 /*
2777  * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2778  */
2779 int
2780 vn_in_dnlc(vnode_t *vp)
2781 {
2782         return (vp->v_count_dnlc > 0);
2783 }
2784 
2785 /*
2786  * vn_has_other_opens() checks whether a particular file is opened by more than
2787  * just the caller and whether the open is for read and/or write.
2788  * This routine is for calling after the caller has already called VOP_OPEN()
2789  * and the caller wishes to know if they are the only one with it open for
2790  * the mode(s) specified.
2791  *
2792  * Vnode counts are only kept on regular files (v_type=VREG).
2793  */
2794 int
2795 vn_has_other_opens(
2796         vnode_t *vp,
2797         v_mode_t mode)
2798 {
2799 
2800         ASSERT(vp != NULL);
2801 
2802         switch (mode) {
2803         case V_WRITE:
2804                 if (vp->v_wrcnt > 1)
2805                         return (V_TRUE);
2806                 break;
2807         case V_RDORWR:
2808                 if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2809                         return (V_TRUE);
2810                 break;
2811         case V_RDANDWR:
2812                 if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2813                         return (V_TRUE);
2814                 break;
2815         case V_READ:
2816                 if (vp->v_rdcnt > 1)
2817                         return (V_TRUE);
2818                 break;
2819         }
2820 
2821         return (V_FALSE);
2822 }
2823 
2824 /*
2825  * vn_is_opened() checks whether a particular file is opened and
2826  * whether the open is for read and/or write.
2827  *
2828  * Vnode counts are only kept on regular files (v_type=VREG).
2829  */
2830 int
2831 vn_is_opened(
2832         vnode_t *vp,
2833         v_mode_t mode)
2834 {
2835 
2836         ASSERT(vp != NULL);
2837 
2838         switch (mode) {
2839         case V_WRITE:
2840                 if (vp->v_wrcnt)
2841                         return (V_TRUE);
2842                 break;
2843         case V_RDANDWR:
2844                 if (vp->v_rdcnt && vp->v_wrcnt)
2845                         return (V_TRUE);
2846                 break;
2847         case V_RDORWR:
2848                 if (vp->v_rdcnt || vp->v_wrcnt)
2849                         return (V_TRUE);
2850                 break;
2851         case V_READ:
2852                 if (vp->v_rdcnt)
2853                         return (V_TRUE);
2854                 break;
2855         }
2856 
2857         return (V_FALSE);
2858 }
2859 
2860 /*
2861  * vn_is_mapped() checks whether a particular file is mapped and whether
2862  * the file is mapped read and/or write.
2863  */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		/* adding 0 yields an atomic read of the 64-bit counter */
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	/* On LP64 the counters can be read directly with single loads. */
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}
2923 
2924 /*
2925  * Set the operations vector for a vnode.
2926  *
2927  * FEM ensures that the v_femhead pointer is filled in before the
2928  * v_op pointer is changed.  This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since we checked
 * the v_femhead pointer, then our update is ok - we are not racing with
2931  * FEM.
2932  */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);
	ASSERT(vnodeops != NULL);

	/* Snapshot v_op; the barrier orders it before the v_femhead check. */
	op = vp->v_op;
	membar_consumer();
	/*
	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
	 * in effect on the vnode and we need to have FEM deal with it.
	 */
	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
	    op) {
		fem_setvnops(vp, vnodeops);
	}
}
2953 
2954 /*
2955  * Retrieve the operations vector for a vnode
 * As with vn_setops() above, make sure we aren't racing with FEM.
2957  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2958  * make sense to the callers of this routine.
2959  */
vnodeops_t *
vn_getops(vnode_t *vp)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);

	/* Snapshot v_op; the barrier orders it before the checks below. */
	op = vp->v_op;
	membar_consumer();
	/*
	 * The snapshot is only trustworthy if no FEM monitor is installed
	 * and v_op did not change while we looked; otherwise ask FEM for
	 * the underlying vnodeops.
	 */
	if (vp->v_femhead == NULL && op == vp->v_op) {
		return (op);
	} else {
		return (fem_getvnops(vp));
	}
}
2975 
2976 /*
2977  * Returns non-zero (1) if the vnodeops matches that of the vnode.
2978  * Returns zero (0) if not.
2979  */
2980 int
2981 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2982 {
2983         return (vn_getops(vp) == vnodeops);
2984 }
2985 
2986 /*
2987  * Returns non-zero (1) if the specified operation matches the
 * corresponding operation for that vnode.
2989  * Returns zero (0) if not.
2990  */
2991 
2992 #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2993 
2994 int
2995 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2996 {
2997         const fs_operation_trans_def_t *otdp;
2998         fs_generic_func_p *loc = NULL;
2999         vnodeops_t      *vop = vn_getops(vp);
3000 
3001         ASSERT(vopname != NULL);
3002 
3003         for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3004                 if (MATCHNAME(otdp->name, vopname)) {
3005                         loc = (fs_generic_func_p *)
3006                             ((char *)(vop) + otdp->offset);
3007                         break;
3008                 }
3009         }
3010 
3011         return ((loc != NULL) && (*loc == funcp));
3012 }
3013 
3014 /*
3015  * fs_new_caller_id() needs to return a unique ID on a given local system.
3016  * The IDs do not need to survive across reboots.  These are primarily
3017  * used so that (FEM) monitors can detect particular callers (such as
3018  * the NFS server) to a given vnode/vfs operation.
3019  */
3020 u_longlong_t
3021 fs_new_caller_id()
3022 {
3023         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3024 
3025         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3026 }
3027 
3028 /*
3029  * The value stored in v_path is relative to rootdir, located in the global
3030  * zone.  Zones or chroot environments which reside deeper inside the VFS
3031  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3032  * what lies below their perceived root.  In order to keep v_path usable for
3033  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3034  *
3035  * An upper bound of max_vnode_path is placed upon v_path allocations to
3036  * prevent the system from going too wild at the behest of pathological
3037  * behavior from the operator.
3038  */
size_t max_vnode_path = 4 * MAXPATHLEN;	/* upper bound on v_path allocation */
3040 
3041 
3042 void
3043 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3044 {
3045         char *buf;
3046 
3047         mutex_enter(&vp->v_lock);
3048         /*
3049          * If the snapshot of v_path_stamp passed in via compare_stamp does not
3050          * match the present value on the vnode, it indicates that subsequent
3051          * changes have occurred.  The v_path value is not cleared in this case
3052          * since the new value may be valid.
3053          */
3054         if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3055                 mutex_exit(&vp->v_lock);
3056                 return;
3057         }
3058         buf = vp->v_path;
3059         vp->v_path = vn_vpath_empty;
3060         vp->v_path_stamp = 0;
3061         mutex_exit(&vp->v_lock);
3062         if (buf != vn_vpath_empty) {
3063                 kmem_free(buf, strlen(buf) + 1);
3064         }
3065 }
3066 
3067 static void
3068 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3069     boolean_t is_rename)
3070 {
3071         char *buf, *oldbuf;
3072         hrtime_t pstamp;
3073         size_t baselen, buflen = 0;
3074 
3075         /* Handle the vn_setpath_str case. */
3076         if (pvp == NULL) {
3077                 if (len + 1 > max_vnode_path) {
3078                         DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3079                             vnode_t *, vp, char *, name, size_t, len + 1);
3080                         return;
3081                 }
3082                 buf = kmem_alloc(len + 1, KM_SLEEP);
3083                 bcopy(name, buf, len);
3084                 buf[len] = '\0';
3085 
3086                 mutex_enter(&vp->v_lock);
3087                 oldbuf = vp->v_path;
3088                 vp->v_path = buf;
3089                 vp->v_path_stamp = gethrtime();
3090                 mutex_exit(&vp->v_lock);
3091                 if (oldbuf != vn_vpath_empty) {
3092                         kmem_free(oldbuf, strlen(oldbuf) + 1);
3093                 }
3094                 return;
3095         }
3096 
3097         /* Take snapshot of parent dir */
3098         mutex_enter(&pvp->v_lock);
3099 
3100         if ((pvp->v_flag & VTRAVERSE) != 0) {
3101                 /*
3102                  * When the parent vnode has VTRAVERSE set in its flags, normal
3103                  * assumptions about v_path calculation no longer apply.  The
3104                  * primary situation where this occurs is via the VFS tricks
3105                  * which procfs plays in order to allow /proc/PID/(root|cwd) to
3106                  * yield meaningful results.
3107                  *
3108                  * When this flag is set, v_path on the child must not be
3109                  * updated since the calculated value is likely to be
3110                  * incorrect, given the current context.
3111                  */
3112                 mutex_exit(&pvp->v_lock);
3113                 return;
3114         }
3115 
3116 retrybuf:
3117         if (pvp->v_path == vn_vpath_empty) {
3118                 /*
3119                  * Without v_path from the parent directory, generating a child
3120                  * path from the name is impossible.
3121                  */
3122                 if (len > 0) {
3123                         pstamp = pvp->v_path_stamp;
3124                         mutex_exit(&pvp->v_lock);
3125                         vn_clearpath(vp, pstamp);
3126                         return;
3127                 }
3128 
3129                 /*
3130                  * The only feasible case here is where a NUL lookup is being
3131                  * performed on rootdir prior to its v_path being populated.
3132                  */
3133                 ASSERT(pvp->v_path_stamp == 0);
3134                 baselen = 0;
3135                 pstamp = 0;
3136         } else {
3137                 pstamp = pvp->v_path_stamp;
3138                 baselen = strlen(pvp->v_path);
3139                 /* ignore a trailing slash if present */
3140                 if (pvp->v_path[baselen - 1] == '/') {
3141                         /* This should only the be case for rootdir */
3142                         ASSERT(baselen == 1 && pvp == rootdir);
3143                         baselen--;
3144                 }
3145         }
3146         mutex_exit(&pvp->v_lock);
3147 
3148         if (buflen != 0) {
3149                 /* Free the existing (mis-sized) buffer in case of retry */
3150                 kmem_free(buf, buflen);
3151         }
3152         /* base, '/', name and trailing NUL */
3153         buflen = baselen + len + 2;
3154         if (buflen > max_vnode_path) {
3155                 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3156                     vnode_t *, vp, char *, name, size_t, buflen);
3157                 return;
3158         }
3159         buf = kmem_alloc(buflen, KM_SLEEP);
3160 
3161         mutex_enter(&pvp->v_lock);
3162         if (pvp->v_path_stamp != pstamp) {
3163                 size_t vlen;
3164 
3165                 /*
3166                  * Since v_path_stamp changed on the parent, it is likely that
3167                  * v_path has been altered as well.  If the length does not
3168                  * exactly match what was previously measured, the buffer
3169                  * allocation must be repeated for proper sizing.
3170                  */
3171                 if (pvp->v_path == vn_vpath_empty) {
3172                         /* Give up if parent lack v_path */
3173                         mutex_exit(&pvp->v_lock);
3174                         kmem_free(buf, buflen);
3175                         return;
3176                 }
3177                 vlen = strlen(pvp->v_path);
3178                 if (pvp->v_path[vlen - 1] == '/') {
3179                         vlen--;
3180                 }
3181                 if (vlen != baselen) {
3182                         goto retrybuf;
3183                 }
3184         }
3185         bcopy(pvp->v_path, buf, baselen);
3186         mutex_exit(&pvp->v_lock);
3187 
3188         buf[baselen] = '/';
3189         baselen++;
3190         bcopy(name, &buf[baselen], len + 1);
3191 
3192         mutex_enter(&vp->v_lock);
3193         if (vp->v_path_stamp == 0) {
3194                 /* never-visited vnode can inherit stamp from parent */
3195                 ASSERT(vp->v_path == vn_vpath_empty);
3196                 vp->v_path_stamp = pstamp;
3197                 vp->v_path = buf;
3198                 mutex_exit(&vp->v_lock);
3199         } else if (vp->v_path_stamp < pstamp || is_rename) {
3200                 /*
3201                  * Install the updated path and stamp, ensuring that the v_path
3202                  * pointer is valid at all times for dtrace.
3203                  */
3204                 oldbuf = vp->v_path;
3205                 vp->v_path = buf;
3206                 vp->v_path_stamp = gethrtime();
3207                 mutex_exit(&vp->v_lock);
3208                 kmem_free(oldbuf, strlen(oldbuf) + 1);
3209         } else {
3210                 /*
3211                  * If the timestamp matches or is greater, it means another
3212                  * thread performed the update first while locks were dropped
3213                  * here to make the allocation.  We defer to the newer value.
3214                  */
3215                 mutex_exit(&vp->v_lock);
3216                 kmem_free(buf, buflen);
3217         }
3218         ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3219 }
3220 
3221 void
3222 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3223 {
3224         size_t len;
3225 
3226         /*
3227          * If the parent is older or empty, there's nothing further to do.
3228          */
3229         if (pvp->v_path == vn_vpath_empty ||
3230             pvp->v_path_stamp <= vp->v_path_stamp) {
3231                 return;
3232         }
3233 
3234         /*
3235          * Given the lack of appropriate context, meaningful updates to v_path
3236          * cannot be made for during lookups for the '.' or '..' entries.
3237          */
3238         len = strlen(name);
3239         if (len == 0 || (len == 1 && name[0] == '.') ||
3240             (len == 2 && name[0] == '.' && name[1] == '.')) {
3241                 return;
3242         }
3243 
3244         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3245 }
3246 
3247 /*
3248  * Given a starting vnode and a path, updates the path in the target vnode in
3249  * a safe manner.  If the vnode already has path information embedded, then the
3250  * cached path is left untouched.
3251  */
/* ARGSUSED */
void
vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
    size_t len)
{
	/* rootvp is unused (ARGSUSED); retained for interface stability. */
	vn_setpath_common(pvp, vp, name, len, B_FALSE);
}
3259 
3260 /*
3261  * Sets the path to the vnode to be the given string, regardless of current
3262  * context.  The string must be a complete path from rootdir.  This is only used
3263  * by fsop_root() for setting the path based on the mountpoint.
3264  */
void
vn_setpath_str(vnode_t *vp, const char *str, size_t len)
{
	/* pvp == NULL tells vn_setpath_common() to treat str as a full path */
	vn_setpath_common(NULL, vp, str, len, B_FALSE);
}
3270 
3271 /*
3272  * Called from within filesystem's vop_rename() to handle renames once the
3273  * target vnode is available.
3274  */
void
vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
{
	/* is_rename == B_TRUE forces replacement of the stale cached path */
	vn_setpath_common(pvp, vp, name, len, B_TRUE);
}
3280 
3281 /*
3282  * Similar to vn_setpath_str(), this function sets the path of the destination
3283  * vnode to the be the same as the source vnode.
3284  */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	hrtime_t stamp;
	size_t buflen;

	/* Measure the source path under its lock, then drop it to allocate */
	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty) {
		mutex_exit(&src->v_lock);
		return;
	}
	buflen = strlen(src->v_path) + 1;
	mutex_exit(&src->v_lock);

	buf = kmem_alloc(buflen, KM_SLEEP);

	/*
	 * Re-check under the lock: if the path vanished or changed length
	 * while it was dropped, simply give up rather than retrying.
	 */
	mutex_enter(&src->v_lock);
	if (src->v_path == vn_vpath_empty ||
	    strlen(src->v_path) + 1 != buflen) {
		mutex_exit(&src->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	bcopy(src->v_path, buf, buflen);
	stamp = src->v_path_stamp;
	mutex_exit(&src->v_lock);

	/* Install on the destination only if it has no path of its own yet */
	mutex_enter(&dst->v_lock);
	if (dst->v_path != vn_vpath_empty) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, buflen);
		return;
	}
	dst->v_path = buf;
	dst->v_path_stamp = stamp;
	mutex_exit(&dst->v_lock);
}
3323 
3324 
3325 /*
3326  * XXX Private interface for segvn routines that handle vnode
3327  * large page segments.
3328  *
3329  * return 1 if vp's file system VOP_PAGEIO() implementation
3330  * can be safely used instead of VOP_GETPAGE() for handling
3331  * pagefaults against regular non swap files. VOP_PAGEIO()
3332  * interface is considered safe here if its implementation
3333  * is very close to VOP_GETPAGE() implementation.
3334  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3335  * panic if there're file holes but instead returns an error.
3336  * Doesn't assume file won't be changed by user writes, etc.
3337  *
3338  * return 0 otherwise.
3339  *
3340  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3341  */
3342 int
3343 vn_vmpss_usepageio(vnode_t *vp)
3344 {
3345         vfs_t   *vfsp = vp->v_vfsp;
3346         char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3347         char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3348         char **fsok = pageio_ok_fss;
3349 
3350         if (fsname == NULL) {
3351                 return (0);
3352         }
3353 
3354         for (; *fsok; fsok++) {
3355                 if (strcmp(*fsok, fsname) == 0) {
3356                         return (1);
3357                 }
3358         }
3359         return (0);
3360 }
3361 
3362 /* VOP_XXX() macros call the corresponding fop_xxx() function */
3363 
/*
 * Wrapper for VOP_OPEN: maintains the per-vnode read/write open counts for
 * regular files around the filesystem's vop_open, and handles the case where
 * the filesystem returns a different vnode through vpp.
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp) {
			vn_copypath(vp, *vpp);
			/* move counts from the original vp to the new one */
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3433 
/*
 * Dispatch VOP_CLOSE and drop the per-vnode open counts taken when the
 * file was opened.  The counts are only maintained for regular files,
 * and are only dropped when the passed-in count is 1 so that duplicate
 * closes of the same file structure do not over-decrement.
 */
int
fop_close(
	vnode_t *vp,
	int flag,
	int count,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
	VOPSTATS_UPDATE(vp, close);
	/*
	 * Check passed in count to handle possible dups. Vnode counts are only
	 * kept on regular files
	 */
	if ((vp->v_type == VREG) && (count == 1))  {
		if (flag & FREAD) {
			ASSERT(vp->v_rdcnt > 0);
			atomic_dec_32(&vp->v_rdcnt);
		}
		if (flag & FWRITE) {
			ASSERT(vp->v_wrcnt > 0);
			atomic_dec_32(&vp->v_wrcnt);
		}
	}
	return (err);
}
3465 
3466 int
3467 fop_read(
3468         vnode_t *vp,
3469         uio_t *uiop,
3470         int ioflag,
3471         cred_t *cr,
3472         caller_context_t *ct)
3473 {
3474         int     err;
3475         ssize_t resid_start = uiop->uio_resid;
3476 
3477         VOPXID_MAP_CR(vp, cr);
3478 
3479         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3480         VOPSTATS_UPDATE_IO(vp, read,
3481             read_bytes, (resid_start - uiop->uio_resid));
3482         return (err);
3483 }
3484 
3485 int
3486 fop_write(
3487         vnode_t *vp,
3488         uio_t *uiop,
3489         int ioflag,
3490         cred_t *cr,
3491         caller_context_t *ct)
3492 {
3493         int     err;
3494         ssize_t resid_start = uiop->uio_resid;
3495 
3496         VOPXID_MAP_CR(vp, cr);
3497 
3498         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3499         VOPSTATS_UPDATE_IO(vp, write,
3500             write_bytes, (resid_start - uiop->uio_resid));
3501         return (err);
3502 }
3503 
3504 int
3505 fop_ioctl(
3506         vnode_t *vp,
3507         int cmd,
3508         intptr_t arg,
3509         int flag,
3510         cred_t *cr,
3511         int *rvalp,
3512         caller_context_t *ct)
3513 {
3514         int     err;
3515 
3516         VOPXID_MAP_CR(vp, cr);
3517 
3518         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3519         VOPSTATS_UPDATE(vp, ioctl);
3520         return (err);
3521 }
3522 
3523 int
3524 fop_setfl(
3525         vnode_t *vp,
3526         int oflags,
3527         int nflags,
3528         cred_t *cr,
3529         caller_context_t *ct)
3530 {
3531         int     err;
3532 
3533         VOPXID_MAP_CR(vp, cr);
3534 
3535         err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3536         VOPSTATS_UPDATE(vp, setfl);
3537         return (err);
3538 }
3539 
/*
 * Dispatch VOP_GETATTR, first normalizing the request for the target
 * filesystem's capabilities: strip the xvattr bit from va_mask when the
 * filesystem does not advertise VFSFT_XVATTR, and reject ATTR_NOACLCHECK
 * on filesystems that did not use ACE-mask access checks.
 */
int
fop_getattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}
3572 
/*
 * Dispatch VOP_SETATTR, first normalizing the request for the target
 * filesystem's capabilities (same xvattr / ATTR_NOACLCHECK gating as
 * fop_getattr()).
 */
int
fop_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}
3605 
3606 int
3607 fop_access(
3608         vnode_t *vp,
3609         int mode,
3610         int flags,
3611         cred_t *cr,
3612         caller_context_t *ct)
3613 {
3614         int     err;
3615 
3616         if ((flags & V_ACE_MASK) &&
3617             vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3618                 return (EINVAL);
3619         }
3620 
3621         VOPXID_MAP_CR(vp, cr);
3622 
3623         err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3624         VOPSTATS_UPDATE(vp, access);
3625         return (err);
3626 }
3627 
/*
 * Dispatch VOP_LOOKUP.  Extended-attribute directory lookups that have
 * not already resolved the sysattr dir are routed through
 * xattr_dir_lookup() instead of the filesystem's vop_lookup.  On
 * success the vnode path cache is refreshed from dvp + nm.
 */
int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = (*(dvp)->v_op->vop_lookup)
		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
	}
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		vn_updatepath(dvp, *vpp, nm);
	}

	return (ret);
}
3669 
/*
 * Dispatch VOP_CREATE.  Creating with an initial ACL (vsecp != NULL)
 * requires VFSFT_ACLONCREATE; case-insensitive creation requires one of
 * the case-handling features.  On success the new vnode's cached path
 * is derived from dvp + name.
 */
int
fop_create(
	vnode_t *dvp,
	char *name,
	vattr_t *vap,
	vcexcl_t excl,
	int mode,
	vnode_t **vpp,
	cred_t *cr,
	int flags,
	caller_context_t *ct,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_create)
	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		vn_updatepath(dvp, *vpp, name);
	}

	return (ret);
}
3709 
3710 int
3711 fop_remove(
3712         vnode_t *dvp,
3713         char *nm,
3714         cred_t *cr,
3715         caller_context_t *ct,
3716         int flags)
3717 {
3718         int     err;
3719 
3720         /*
3721          * If this file system doesn't support case-insensitive access
3722          * and said access is requested, fail quickly.
3723          */
3724         if (flags & FIGNORECASE &&
3725             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3726             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3727                 return (EINVAL);
3728 
3729         VOPXID_MAP_CR(dvp, cr);
3730 
3731         err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3732         VOPSTATS_UPDATE(dvp, remove);
3733         return (err);
3734 }
3735 
3736 int
3737 fop_link(
3738         vnode_t *tdvp,
3739         vnode_t *svp,
3740         char *tnm,
3741         cred_t *cr,
3742         caller_context_t *ct,
3743         int flags)
3744 {
3745         int     err;
3746 
3747         /*
3748          * If the target file system doesn't support case-insensitive access
3749          * and said access is requested, fail quickly.
3750          */
3751         if (flags & FIGNORECASE &&
3752             (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3753             vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3754                 return (EINVAL);
3755 
3756         VOPXID_MAP_CR(tdvp, cr);
3757 
3758         err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3759         VOPSTATS_UPDATE(tdvp, link);
3760         return (err);
3761 }
3762 
/*
 * Dispatch VOP_RENAME via the source directory's ops vector.
 *
 * NOTE(review): the case-insensitivity feature test and the vop
 * dispatch both use sdvp, while the xid credential mapping uses tdvp --
 * confirm this asymmetry is intentional.
 */
int
fop_rename(
	vnode_t *sdvp,
	char *snm,
	vnode_t *tdvp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}
3791 
/*
 * Dispatch VOP_MKDIR.  Same feature gating as fop_create(): an initial
 * ACL requires VFSFT_ACLONCREATE, case-insensitive creation requires a
 * case-handling feature.  On success the new directory vnode's cached
 * path is derived from dvp + dirname.
 */
int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_mkdir)
	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		vn_updatepath(dvp, *vpp, dirname);
	}

	return (ret);
}
3829 
3830 int
3831 fop_rmdir(
3832         vnode_t *dvp,
3833         char *nm,
3834         vnode_t *cdir,
3835         cred_t *cr,
3836         caller_context_t *ct,
3837         int flags)
3838 {
3839         int     err;
3840 
3841         /*
3842          * If this file system doesn't support case-insensitive access
3843          * and said access is requested, fail quickly.
3844          */
3845         if (flags & FIGNORECASE &&
3846             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3847             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3848                 return (EINVAL);
3849 
3850         VOPXID_MAP_CR(dvp, cr);
3851 
3852         err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3853         VOPSTATS_UPDATE(dvp, rmdir);
3854         return (err);
3855 }
3856 
3857 int
3858 fop_readdir(
3859         vnode_t *vp,
3860         uio_t *uiop,
3861         cred_t *cr,
3862         int *eofp,
3863         caller_context_t *ct,
3864         int flags)
3865 {
3866         int     err;
3867         ssize_t resid_start = uiop->uio_resid;
3868 
3869         /*
3870          * If this file system doesn't support retrieving directory
3871          * entry flags and said access is requested, fail quickly.
3872          */
3873         if (flags & V_RDDIR_ENTFLAGS &&
3874             vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3875                 return (EINVAL);
3876 
3877         VOPXID_MAP_CR(vp, cr);
3878 
3879         err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3880         VOPSTATS_UPDATE_IO(vp, readdir,
3881             readdir_bytes, (resid_start - uiop->uio_resid));
3882         return (err);
3883 }
3884 
/*
 * Dispatch VOP_SYMLINK.  On filesystems that support reparse points, a
 * target beginning with FS_REPARSE_TAG_STR is marked via
 * fs_reparse_mark(), which rewrites vap to point at a local xvattr so
 * the filesystem can tag the new symlink as a reparse point.
 */
int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}
3922 
3923 int
3924 fop_readlink(
3925         vnode_t *vp,
3926         uio_t *uiop,
3927         cred_t *cr,
3928         caller_context_t *ct)
3929 {
3930         int     err;
3931 
3932         VOPXID_MAP_CR(vp, cr);
3933 
3934         err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3935         VOPSTATS_UPDATE(vp, readlink);
3936         return (err);
3937 }
3938 
3939 int
3940 fop_fsync(
3941         vnode_t *vp,
3942         int syncflag,
3943         cred_t *cr,
3944         caller_context_t *ct)
3945 {
3946         int     err;
3947 
3948         VOPXID_MAP_CR(vp, cr);
3949 
3950         err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3951         VOPSTATS_UPDATE(vp, fsync);
3952         return (err);
3953 }
3954 
/*
 * Dispatch VOP_INACTIVE.  The statistics update must precede the vop
 * call: inactive may free the vnode, after which vp can no longer be
 * dereferenced.
 */
void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}
3968 
3969 int
3970 fop_fid(
3971         vnode_t *vp,
3972         fid_t *fidp,
3973         caller_context_t *ct)
3974 {
3975         int     err;
3976 
3977         err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3978         VOPSTATS_UPDATE(vp, fid);
3979         return (err);
3980 }
3981 
3982 int
3983 fop_rwlock(
3984         vnode_t *vp,
3985         int write_lock,
3986         caller_context_t *ct)
3987 {
3988         int     ret;
3989 
3990         ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3991         VOPSTATS_UPDATE(vp, rwlock);
3992         return (ret);
3993 }
3994 
3995 void
3996 fop_rwunlock(
3997         vnode_t *vp,
3998         int write_lock,
3999         caller_context_t *ct)
4000 {
4001         (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4002         VOPSTATS_UPDATE(vp, rwunlock);
4003 }
4004 
4005 int
4006 fop_seek(
4007         vnode_t *vp,
4008         offset_t ooff,
4009         offset_t *noffp,
4010         caller_context_t *ct)
4011 {
4012         int     err;
4013 
4014         err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4015         VOPSTATS_UPDATE(vp, seek);
4016         return (err);
4017 }
4018 
4019 int
4020 fop_cmp(
4021         vnode_t *vp1,
4022         vnode_t *vp2,
4023         caller_context_t *ct)
4024 {
4025         int     err;
4026 
4027         err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4028         VOPSTATS_UPDATE(vp1, cmp);
4029         return (err);
4030 }
4031 
4032 int
4033 fop_frlock(
4034         vnode_t *vp,
4035         int cmd,
4036         flock64_t *bfp,
4037         int flag,
4038         offset_t offset,
4039         struct flk_callback *flk_cbp,
4040         cred_t *cr,
4041         caller_context_t *ct)
4042 {
4043         int     err;
4044 
4045         VOPXID_MAP_CR(vp, cr);
4046 
4047         err = (*(vp)->v_op->vop_frlock)
4048             (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4049         VOPSTATS_UPDATE(vp, frlock);
4050         return (err);
4051 }
4052 
4053 int
4054 fop_space(
4055         vnode_t *vp,
4056         int cmd,
4057         flock64_t *bfp,
4058         int flag,
4059         offset_t offset,
4060         cred_t *cr,
4061         caller_context_t *ct)
4062 {
4063         int     err;
4064 
4065         VOPXID_MAP_CR(vp, cr);
4066 
4067         err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4068         VOPSTATS_UPDATE(vp, space);
4069         return (err);
4070 }
4071 
4072 int
4073 fop_realvp(
4074         vnode_t *vp,
4075         vnode_t **vpp,
4076         caller_context_t *ct)
4077 {
4078         int     err;
4079 
4080         err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4081         VOPSTATS_UPDATE(vp, realvp);
4082         return (err);
4083 }
4084 
4085 int
4086 fop_getpage(
4087         vnode_t *vp,
4088         offset_t off,
4089         size_t len,
4090         uint_t *protp,
4091         page_t **plarr,
4092         size_t plsz,
4093         struct seg *seg,
4094         caddr_t addr,
4095         enum seg_rw rw,
4096         cred_t *cr,
4097         caller_context_t *ct)
4098 {
4099         int     err;
4100 
4101         VOPXID_MAP_CR(vp, cr);
4102 
4103         err = (*(vp)->v_op->vop_getpage)
4104             (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4105         VOPSTATS_UPDATE(vp, getpage);
4106         return (err);
4107 }
4108 
4109 int
4110 fop_putpage(
4111         vnode_t *vp,
4112         offset_t off,
4113         size_t len,
4114         int flags,
4115         cred_t *cr,
4116         caller_context_t *ct)
4117 {
4118         int     err;
4119 
4120         VOPXID_MAP_CR(vp, cr);
4121 
4122         err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4123         VOPSTATS_UPDATE(vp, putpage);
4124         return (err);
4125 }
4126 
4127 int
4128 fop_map(
4129         vnode_t *vp,
4130         offset_t off,
4131         struct as *as,
4132         caddr_t *addrp,
4133         size_t len,
4134         uchar_t prot,
4135         uchar_t maxprot,
4136         uint_t flags,
4137         cred_t *cr,
4138         caller_context_t *ct)
4139 {
4140         int     err;
4141 
4142         VOPXID_MAP_CR(vp, cr);
4143 
4144         err = (*(vp)->v_op->vop_map)
4145             (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4146         VOPSTATS_UPDATE(vp, map);
4147         return (err);
4148 }
4149 
/*
 * Dispatch VOP_ADDMAP and, on success for regular files, add the number
 * of mapped pages to the vnode's mmap counters.  MAP_PRIVATE mappings
 * are always counted as reads (they can never be written back), and
 * PROT_EXEC is accounted under v_mmap_read as well.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* page count for this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4199 
/*
 * Dispatch VOP_DELMAP and, for regular files, undo the mmap counter
 * additions made by fop_addmap() using the same MAP_PRIVATE / maxprot
 * accounting rules with negated deltas.
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* page count for this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4254 
4255 
4256 int
4257 fop_poll(
4258         vnode_t *vp,
4259         short events,
4260         int anyyet,
4261         short *reventsp,
4262         struct pollhead **phpp,
4263         caller_context_t *ct)
4264 {
4265         int     err;
4266 
4267         err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4268         VOPSTATS_UPDATE(vp, poll);
4269         return (err);
4270 }
4271 
/*
 * Dispatch VOP_DUMP.  lbdn and dblks arrive as offset_t but are
 * ultimately handed to bdev_dump() as daddr_t and int respectively;
 * the round-trip cast comparisons reject any value that those
 * narrowing conversions would truncate.
 */
int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int	err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
	VOPSTATS_UPDATE(vp, dump);
	return (err);
}
4290 
4291 int
4292 fop_pathconf(
4293         vnode_t *vp,
4294         int cmd,
4295         ulong_t *valp,
4296         cred_t *cr,
4297         caller_context_t *ct)
4298 {
4299         int     err;
4300 
4301         VOPXID_MAP_CR(vp, cr);
4302 
4303         err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4304         VOPSTATS_UPDATE(vp, pathconf);
4305         return (err);
4306 }
4307 
4308 int
4309 fop_pageio(
4310         vnode_t *vp,
4311         struct page *pp,
4312         u_offset_t io_off,
4313         size_t io_len,
4314         int flags,
4315         cred_t *cr,
4316         caller_context_t *ct)
4317 {
4318         int     err;
4319 
4320         VOPXID_MAP_CR(vp, cr);
4321 
4322         err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4323         VOPSTATS_UPDATE(vp, pageio);
4324         return (err);
4325 }
4326 
4327 int
4328 fop_dumpctl(
4329         vnode_t *vp,
4330         int action,
4331         offset_t *blkp,
4332         caller_context_t *ct)
4333 {
4334         int     err;
4335         err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4336         VOPSTATS_UPDATE(vp, dumpctl);
4337         return (err);
4338 }
4339 
/*
 * Dispatch VOP_DISPOSE (free or invalidate a page).  The statistics
 * update must precede the vop call because disposing the page may tear
 * down the last reference to the vnode.
 */
void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}
4356 
/*
 * Dispatch VOP_SETSECATTR.  ATTR_NOACLCHECK is only honored on
 * filesystems that used an ACE mask with VOP_ACCESS().
 *
 * NOTE(review): here the xid credential mapping happens before the
 * ATTR_NOACLCHECK validation, while fop_getsecattr() validates first --
 * confirm the ordering difference is intentional.
 */
int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}
4381 
/*
 * Dispatch VOP_GETSECATTR.  ATTR_NOACLCHECK is only honored on
 * filesystems that used an ACE mask with VOP_ACCESS().
 */
int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}
4407 
4408 int
4409 fop_shrlock(
4410         vnode_t *vp,
4411         int cmd,
4412         struct shrlock *shr,
4413         int flag,
4414         cred_t *cr,
4415         caller_context_t *ct)
4416 {
4417         int     err;
4418 
4419         VOPXID_MAP_CR(vp, cr);
4420 
4421         err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4422         VOPSTATS_UPDATE(vp, shrlock);
4423         return (err);
4424 }
4425 
4426 int
4427 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4428     caller_context_t *ct)
4429 {
4430         int     err;
4431 
4432         err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4433         VOPSTATS_UPDATE(vp, vnevent);
4434         return (err);
4435 }
4436 
4437 int
4438 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4439     caller_context_t *ct)
4440 {
4441         int err;
4442 
4443         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4444                 return (ENOTSUP);
4445         err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4446         VOPSTATS_UPDATE(vp, reqzcbuf);
4447         return (err);
4448 }
4449 
4450 int
4451 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4452 {
4453         int err;
4454 
4455         if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4456                 return (ENOTSUP);
4457         err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4458         VOPSTATS_UPDATE(vp, retzcbuf);
4459         return (err);
4460 }
4461 
4462 /*
4463  * Default destructor
4464  *      Needed because NULL destructor means that the key is unused
4465  */
4466 /* ARGSUSED */
4467 void
4468 vsd_defaultdestructor(void *value)
4469 {}
4470 
4471 /*
4472  * Create a key (index into per vnode array)
4473  *      Locks out vsd_create, vsd_destroy, and vsd_free
4474  *      May allocate memory with lock held
4475  */
4476 void
4477 vsd_create(uint_t *keyp, void (*destructor)(void *))
4478 {
4479         int     i;
4480         uint_t  nkeys;
4481 
4482         /*
4483          * if key is allocated, do nothing
4484          */
4485         mutex_enter(&vsd_lock);
4486         if (*keyp) {
4487                 mutex_exit(&vsd_lock);
4488                 return;
4489         }
4490         /*
4491          * find an unused key
4492          */
4493         if (destructor == NULL)
4494                 destructor = vsd_defaultdestructor;
4495 
4496         for (i = 0; i < vsd_nkeys; ++i)
4497                 if (vsd_destructor[i] == NULL)
4498                         break;
4499 
4500         /*
4501          * if no unused keys, increase the size of the destructor array
4502          */
4503         if (i == vsd_nkeys) {
4504                 if ((nkeys = (vsd_nkeys << 1)) == 0)
4505                         nkeys = 1;
4506                 vsd_destructor =
4507                     (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4508                     (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4509                     (size_t)(nkeys * sizeof (void (*)(void *))));
4510                 vsd_nkeys = nkeys;
4511         }
4512 
4513         /*
4514          * allocate the next available unused key
4515          */
4516         vsd_destructor[i] = destructor;
4517         *keyp = i + 1;
4518 
4519         /* create vsd_list, if it doesn't exist */
4520         if (vsd_list == NULL) {
4521                 vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4522                 list_create(vsd_list, sizeof (struct vsd_node),
4523                     offsetof(struct vsd_node, vs_nodes));
4524         }
4525 
4526         mutex_exit(&vsd_lock);
4527 }
4528 
4529 /*
4530  * Destroy a key
4531  *
4532  * Assumes that the caller is preventing vsd_set and vsd_get
4533  * Locks out vsd_create, vsd_destroy, and vsd_free
4534  * May free memory with lock held
4535  */
4536 void
4537 vsd_destroy(uint_t *keyp)
4538 {
4539         uint_t key;
4540         struct vsd_node *vsd;
4541 
4542         /*
4543          * protect the key namespace and our destructor lists
4544          */
4545         mutex_enter(&vsd_lock);
4546         key = *keyp;
4547         *keyp = 0;
4548 
4549         ASSERT(key <= vsd_nkeys);
4550 
4551         /*
4552          * if the key is valid
4553          */
4554         if (key != 0) {
4555                 uint_t k = key - 1;
4556                 /*
4557                  * for every vnode with VSD, call key's destructor
4558                  */
4559                 for (vsd = list_head(vsd_list); vsd != NULL;
4560                     vsd = list_next(vsd_list, vsd)) {
4561                         /*
4562                          * no VSD for key in this vnode
4563                          */
4564                         if (key > vsd->vs_nkeys)
4565                                 continue;
4566                         /*
4567                          * call destructor for key
4568                          */
4569                         if (vsd->vs_value[k] && vsd_destructor[k])
4570                                 (*vsd_destructor[k])(vsd->vs_value[k]);
4571                         /*
4572                          * reset value for key
4573                          */
4574                         vsd->vs_value[k] = NULL;
4575                 }
4576                 /*
4577                  * actually free the key (NULL destructor == unused)
4578                  */
4579                 vsd_destructor[k] = NULL;
4580         }
4581 
4582         mutex_exit(&vsd_lock);
4583 }
4584 
4585 /*
4586  * Quickly return the per vnode value that was stored with the specified key
4587  * Assumes the caller is protecting key from vsd_create and vsd_destroy
4588  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4589  */
4590 void *
4591 vsd_get(vnode_t *vp, uint_t key)
4592 {
4593         struct vsd_node *vsd;
4594 
4595         ASSERT(vp != NULL);
4596         ASSERT(mutex_owned(&vp->v_vsd_lock));
4597 
4598         vsd = vp->v_vsd;
4599 
4600         if (key && vsd != NULL && key <= vsd->vs_nkeys)
4601                 return (vsd->vs_value[key - 1]);
4602         return (NULL);
4603 }
4604 
4605 /*
4606  * Set a per vnode value indexed with the specified key
4607  * Assumes the caller is holding v_vsd_lock to protect the vsd.
4608  */
4609 int
4610 vsd_set(vnode_t *vp, uint_t key, void *value)
4611 {
4612         struct vsd_node *vsd;
4613 
4614         ASSERT(vp != NULL);
4615         ASSERT(mutex_owned(&vp->v_vsd_lock));
4616 
4617         if (key == 0)
4618                 return (EINVAL);
4619 
4620         vsd = vp->v_vsd;
4621         if (vsd == NULL)
4622                 vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4623 
4624         /*
4625          * If the vsd was just allocated, vs_nkeys will be 0, so the following
4626          * code won't happen and we will continue down and allocate space for
4627          * the vs_value array.
4628          * If the caller is replacing one value with another, then it is up
4629          * to the caller to free/rele/destroy the previous value (if needed).
4630          */
4631         if (key <= vsd->vs_nkeys) {
4632                 vsd->vs_value[key - 1] = value;
4633                 return (0);
4634         }
4635 
4636         ASSERT(key <= vsd_nkeys);
4637 
4638         if (vsd->vs_nkeys == 0) {
4639                 mutex_enter(&vsd_lock);     /* lock out vsd_destroy() */
4640                 /*
4641                  * Link onto list of all VSD nodes.
4642                  */
4643                 list_insert_head(vsd_list, vsd);
4644                 mutex_exit(&vsd_lock);
4645         }
4646 
4647         /*
4648          * Allocate vnode local storage and set the value for key
4649          */
4650         vsd->vs_value = vsd_realloc(vsd->vs_value,
4651             vsd->vs_nkeys * sizeof (void *),
4652             key * sizeof (void *));
4653         vsd->vs_nkeys = key;
4654         vsd->vs_value[key - 1] = value;
4655 
4656         return (0);
4657 }
4658 
4659 /*
4660  * Called from vn_free() to run the destructor function for each vsd
4661  *      Locks out vsd_create and vsd_destroy
4662  *      Assumes that the destructor *DOES NOT* use vsd
4663  */
4664 void
4665 vsd_free(vnode_t *vp)
4666 {
4667         int i;
4668         struct vsd_node *vsd = vp->v_vsd;
4669 
4670         if (vsd == NULL)
4671                 return;
4672 
4673         if (vsd->vs_nkeys == 0) {
4674                 kmem_free(vsd, sizeof (*vsd));
4675                 vp->v_vsd = NULL;
4676                 return;
4677         }
4678 
4679         /*
4680          * lock out vsd_create and vsd_destroy, call
4681          * the destructor, and mark the value as destroyed.
4682          */
4683         mutex_enter(&vsd_lock);
4684 
4685         for (i = 0; i < vsd->vs_nkeys; i++) {
4686                 if (vsd->vs_value[i] && vsd_destructor[i])
4687                         (*vsd_destructor[i])(vsd->vs_value[i]);
4688                 vsd->vs_value[i] = NULL;
4689         }
4690 
4691         /*
4692          * remove from linked list of VSD nodes
4693          */
4694         list_remove(vsd_list, vsd);
4695 
4696         mutex_exit(&vsd_lock);
4697 
4698         /*
4699          * free up the VSD
4700          */
4701         kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4702         kmem_free(vsd, sizeof (struct vsd_node));
4703         vp->v_vsd = NULL;
4704 }
4705 
4706 /*
4707  * realloc
4708  */
4709 static void *
4710 vsd_realloc(void *old, size_t osize, size_t nsize)
4711 {
4712         void *new;
4713 
4714         new = kmem_zalloc(nsize, KM_SLEEP);
4715         if (old) {
4716                 bcopy(old, new, osize);
4717                 kmem_free(old, osize);
4718         }
4719         return (new);
4720 }
4721 
4722 /*
4723  * Setup the extensible system attribute for creating a reparse point.
4724  * The symlink data 'target' is validated for proper format of a reparse
4725  * string and a check also made to make sure the symlink data does not
4726  * point to an existing file.
4727  *
4728  * return 0 if ok else -1.
4729  */
4730 static int
4731 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4732 {
4733         xoptattr_t *xoap;
4734 
4735         if ((!target) || (!vap) || (!xvattr))
4736                 return (-1);
4737 
4738         /* validate reparse string */
4739         if (reparse_validate((const char *)target))
4740                 return (-1);
4741 
4742         xva_init(xvattr);
4743         xvattr->xva_vattr = *vap;
4744         xvattr->xva_vattr.va_mask |= AT_XVATTR;
4745         xoap = xva_getxoptattr(xvattr);
4746         ASSERT(xoap);
4747         XVA_SET_REQ(xvattr, XAT_REPARSE);
4748         xoap->xoa_reparse = 1;
4749 
4750         return (0);
4751 }
4752 
4753 /*
4754  * Function to check whether a symlink is a reparse point.
4755  * Return B_TRUE if it is a reparse point, else return B_FALSE
4756  */
4757 boolean_t
4758 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4759 {
4760         xvattr_t xvattr;
4761         xoptattr_t *xoap;
4762 
4763         if ((vp->v_type != VLNK) ||
4764             !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4765                 return (B_FALSE);
4766 
4767         xva_init(&xvattr);
4768         xoap = xva_getxoptattr(&xvattr);
4769         ASSERT(xoap);
4770         XVA_SET_REQ(&xvattr, XAT_REPARSE);
4771 
4772         if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4773                 return (B_FALSE);
4774 
4775         if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4776             (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4777                 return (B_FALSE);
4778 
4779         return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4780 }