Print this page
    
11679 vn_rele() and friends should VERIFY after mutex
Reviewed by: Dan McDonald <danmcd@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/vnode.c
          +++ new/usr/src/uts/common/fs/vnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  
    | 
      ↓ open down ↓ | 
    14 lines elided | 
    
      ↑ open up ↑ | 
  
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2020 Joyent, Inc.
       25 + * Copyright 2022 Spencer Evans-Cole.
  25   26   * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26   27   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  27   28   */
  28   29  
  29   30  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  30   31  /*        All Rights Reserved   */
  31   32  
  32   33  /*
  33   34   * University Copyright- Copyright (c) 1982, 1986, 1988
  34   35   * The Regents of the University of California
  35   36   * All Rights Reserved
  36   37   *
  37   38   * University Acknowledgment- Portions of this document are derived from
  38   39   * software developed by the University of California, Berkeley, and its
  39   40   * contributors.
  40   41   */
  41   42  
  42   43  #include <sys/types.h>
  43   44  #include <sys/param.h>
  44   45  #include <sys/t_lock.h>
  45   46  #include <sys/errno.h>
  46   47  #include <sys/cred.h>
  47   48  #include <sys/user.h>
  48   49  #include <sys/uio.h>
  49   50  #include <sys/file.h>
  50   51  #include <sys/pathname.h>
  51   52  #include <sys/vfs.h>
  52   53  #include <sys/vfs_opreg.h>
  53   54  #include <sys/vnode.h>
  54   55  #include <sys/filio.h>
  55   56  #include <sys/rwstlock.h>
  56   57  #include <sys/fem.h>
  57   58  #include <sys/stat.h>
  58   59  #include <sys/mode.h>
  59   60  #include <sys/conf.h>
  60   61  #include <sys/sysmacros.h>
  61   62  #include <sys/cmn_err.h>
  62   63  #include <sys/systm.h>
  63   64  #include <sys/kmem.h>
  64   65  #include <sys/debug.h>
  65   66  #include <c2/audit.h>
  66   67  #include <sys/acl.h>
  67   68  #include <sys/nbmlock.h>
  68   69  #include <sys/fcntl.h>
  69   70  #include <fs/fs_subr.h>
  70   71  #include <sys/taskq.h>
  71   72  #include <fs/fs_reparse.h>
  72   73  #include <sys/time.h>
  73   74  #include <sys/sdt.h>
  74   75  
  75   76  /* Determine if this vnode is a file that is read-only */
  76   77  #define ISROFILE(vp)    \
  77   78          ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  78   79              (vp)->v_type != VFIFO && vn_is_readonly(vp))
  79   80  
  80   81  /* Tunable via /etc/system; used only by admin/install */
  81   82  int nfs_global_client_only;
  82   83  
  83   84  /*
  84   85   * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  85   86   * number of entries as and parallel to the vfssw table.  (Arguably, it could
  86   87   * be part of the vfssw table.)  Once it's initialized, it's accessed using
  87   88   * the same fstype index that is used to index into the vfssw table.
  88   89   */
  89   90  vopstats_t **vopstats_fstype;
  90   91  
  91   92  /* vopstats initialization template used for fast initialization via bcopy() */
  92   93  static vopstats_t *vs_templatep;
  93   94  
  94   95  /* Kmem cache handle for vsk_anchor_t allocations */
  95   96  kmem_cache_t *vsk_anchor_cache;
  96   97  
  97   98  /* file events cleanup routine */
  98   99  extern void free_fopdata(vnode_t *);
  99  100  
 100  101  /*
 101  102   * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 102  103   * updates to vsktat_tree.
 103  104   */
 104  105  avl_tree_t      vskstat_tree;
 105  106  kmutex_t        vskstat_tree_lock;
 106  107  
 107  108  /* Global variable which enables/disables the vopstats collection */
 108  109  int vopstats_enabled = 1;
 109  110  
 110  111  /* Global used for empty/invalid v_path */
 111  112  char *vn_vpath_empty = "";
 112  113  
 113  114  /*
 114  115   * forward declarations for internal vnode specific data (vsd)
 115  116   */
 116  117  static void *vsd_realloc(void *, size_t, size_t);
 117  118  
 118  119  /*
 119  120   * forward declarations for reparse point functions
 120  121   */
 121  122  static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 122  123  
 123  124  /*
 124  125   * VSD -- VNODE SPECIFIC DATA
 125  126   * The v_data pointer is typically used by a file system to store a
 126  127   * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 127  128   * However, there are times when additional project private data needs
 128  129   * to be stored separately from the data (node) pointed to by v_data.
 129  130   * This additional data could be stored by the file system itself or
 130  131   * by a completely different kernel entity.  VSD provides a way for
 131  132   * callers to obtain a key and store a pointer to private data associated
 132  133   * with a vnode.
 133  134   *
 134  135   * Callers are responsible for protecting the vsd by holding v_vsd_lock
 135  136   * for calls to vsd_set() and vsd_get().
 136  137   */
 137  138  
 138  139  /*
 139  140   * vsd_lock protects:
 140  141   *   vsd_nkeys - creation and deletion of vsd keys
 141  142   *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 142  143   *   vsd_destructor - adding and removing destructors to the list
 143  144   */
 144  145  static kmutex_t         vsd_lock;
 145  146  static uint_t           vsd_nkeys;       /* size of destructor array */
 146  147  /* list of vsd_node's */
 147  148  static list_t *vsd_list = NULL;
 148  149  /* per-key destructor funcs */
 149  150  static void             (**vsd_destructor)(void *);
 150  151  
 151  152  /*
 152  153   * The following is the common set of actions needed to update the
 153  154   * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 154  155   * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 155  156   * recording of the bytes transferred.  Since the code is similar
 156  157   * but small, it is nearly a duplicate.  Consequently any changes
 157  158   * to one may need to be reflected in the other.
 158  159   * Rundown of the variables:
 159  160   * vp - Pointer to the vnode
 160  161   * counter - Partial name structure member to update in vopstats for counts
 161  162   * bytecounter - Partial name structure member to update in vopstats for bytes
 162  163   * bytesval - Value to update in vopstats for bytes
 163  164   * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 164  165   * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 165  166   */
 166  167  
/*
 * Bump the per-vfs count for this vnode op and fire the matching
 * __fsinfo_* DTrace probe; if a per-fstype vopstats structure is
 * attached to the vfs, bump its count as well.  Stats are skipped for
 * vnodes with no vfs/implp, vfs's without VFS_STATS, and VBAD vnodes.
 * See the variable rundown in the block comment above.
 */
#define VOPSTATS_UPDATE(vp, counter) {                                  \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
                (*stataddr)++;                                          \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                }                                                       \
        }                                                               \
}
 182  183  
/*
 * Same as VOPSTATS_UPDATE(), but additionally accumulates bytesval into
 * the given bytecounter (e.g. read_bytes/write_bytes) at both the
 * per-vfs and, when present, per-fstype level, and passes bytesval to
 * the __fsinfo_* DTrace probe.
 */
#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
                (*stataddr)++;                                          \
                vsp->bytecounter.value.ui64 += bytesval;                \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                        vsp->bytecounter.value.ui64 += bytesval;        \
                }                                                       \
        }                                                               \
}
 200  201  
/*
 * If the filesystem does not support XIDs map credential
 * If the vfsp is NULL, perhaps we should also map?
 *
 * NOTE: this macro may rebind its "cr" argument in the caller's scope:
 * when the vnode's vfs lacks the VFS_XID flag, cr is replaced with the
 * credential returned by crgetmapped().
 */
#define VOPXID_MAP_CR(vp, cr)   {                                       \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)            \
                cr = crgetmapped(cr);                                   \
        }
 210  211  
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
/*
 * Presumably indexed by the S_IFMT bits of a file mode shifted down to
 * a small index (the usual IFTOVT arrangement) -- TODO confirm against
 * the IFTOVT/VTTOIF macro definitions.  Unknown formats map to VNON.
 */
enum vtype iftovt_tab[] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

/* Indexed by vtype; vnode types with no stat(2) format map to 0. */
ushort_t vttoif_tab[] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
        S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
 224  225  
 225  226  /*
 226  227   * The system vnode cache.
 227  228   */
 228  229  
 229  230  kmem_cache_t *vn_cache;
 230  231  
 231  232  
/*
 * Vnode operations vector.
 *
 * Each entry names a vnode operation, gives the offset of its slot in
 * struct vnodeops, and supplies two fallback routines (most commonly
 * fs_nosys) -- presumably the defaults used when a filesystem does not
 * provide the operation; see fs_operation_trans_def_t for the exact
 * field semantics (NOTE(review): confirm against vfs_opreg.h).
 * The list is terminated by the NULL sentinel entry.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
        VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
            fs_nosys, fs_nosys,

        VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
            fs_nosys, fs_nosys,

        VOPNAME_READ, offsetof(struct vnodeops, vop_read),
            fs_nosys, fs_nosys,

        VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
            fs_nosys, fs_nosys,

        VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
            fs_nosys, fs_nosys,

        VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
            fs_setfl, fs_nosys,

        VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
            fs_nosys, fs_nosys,

        VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
            fs_nosys, fs_nosys,

        VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
            fs_nosys, fs_nosys,

        VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
            fs_nosys, fs_nosys,

        VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
            fs_nosys, fs_nosys,

        VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
            fs_nosys, fs_nosys,

        VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
            fs_nosys, fs_nosys,

        VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
            fs_nosys, fs_nosys,

        VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
            fs_nosys, fs_nosys,

        VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
            fs_nosys, fs_nosys,

        VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
            fs_nosys, fs_nosys,

        VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
            fs_nosys, fs_nosys,

        VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
            fs_nosys, fs_nosys,

        VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
            fs_nosys, fs_nosys,

        VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
            fs_nosys, fs_nosys,

        VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
            fs_nosys, fs_nosys,

        VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
            fs_rwlock, fs_rwlock,

        VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
            (fs_generic_func_p)(uintptr_t)fs_rwunlock,
            (fs_generic_func_p)(uintptr_t)fs_rwunlock,  /* no errors allowed */

        VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
            fs_nosys, fs_nosys,

        VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
            fs_cmp, fs_cmp,             /* no errors allowed */

        VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
            fs_frlock, fs_nosys,

        VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
            fs_nosys, fs_nosys,

        VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
            fs_nosys, fs_nosys,

        VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
            fs_nosys, fs_nosys,

        VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
            fs_nosys, fs_nosys,

        VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
            (fs_generic_func_p) fs_nosys_map,
            (fs_generic_func_p) fs_nosys_map,

        VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
            (fs_generic_func_p) fs_nosys_addmap,
            (fs_generic_func_p) fs_nosys_addmap,

        VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
            fs_nosys, fs_nosys,

        VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
            (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

        VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
            fs_nosys, fs_nosys,

        VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
            fs_pathconf, fs_nosys,

        VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
            fs_nosys, fs_nosys,

        VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
            fs_nosys, fs_nosys,

        VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
            (fs_generic_func_p)(uintptr_t)fs_dispose,
            (fs_generic_func_p)(uintptr_t)fs_nodispose,

        VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
            fs_nosys, fs_nosys,

        VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
            fs_fab_acl, fs_nosys,

        VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
            fs_shrlock, fs_nosys,

        VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
            (fs_generic_func_p) fs_vnevent_nosupport,
            (fs_generic_func_p) fs_vnevent_nosupport,

        VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
            fs_nosys, fs_nosys,

        VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
            fs_nosys, fs_nosys,

        NULL, 0, NULL, NULL
};
 382  383  
 383  384  /* Extensible attribute (xva) routines. */
 384  385  
 385  386  /*
 386  387   * Zero out the structure, set the size of the requested/returned bitmaps,
 387  388   * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 388  389   * to the returned attributes array.
 389  390   */
 390  391  void
 391  392  xva_init(xvattr_t *xvap)
 392  393  {
 393  394          bzero(xvap, sizeof (xvattr_t));
 394  395          xvap->xva_mapsize = XVA_MAPSIZE;
 395  396          xvap->xva_magic = XVA_MAGIC;
 396  397          xvap->xva_vattr.va_mask = AT_XVATTR;
 397  398          xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 398  399  }
 399  400  
 400  401  /*
 401  402   * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 402  403   * structure.  Otherwise, returns NULL.
 403  404   */
 404  405  xoptattr_t *
 405  406  xva_getxoptattr(xvattr_t *xvap)
 406  407  {
 407  408          xoptattr_t *xoap = NULL;
 408  409          if (xvap->xva_vattr.va_mask & AT_XVATTR)
 409  410                  xoap = &xvap->xva_xoptattrs;
 410  411          return (xoap);
 411  412  }
 412  413  
 413  414  /*
 414  415   * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 415  416   * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 416  417   * kstat name.
 417  418   */
 418  419  static int
 419  420  vska_compar(const void *n1, const void *n2)
 420  421  {
 421  422          int ret;
 422  423          ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 423  424          ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 424  425  
 425  426          if (p1 < p2) {
 426  427                  ret = -1;
 427  428          } else if (p1 > p2) {
 428  429                  ret = 1;
 429  430          } else {
 430  431                  ret = 0;
 431  432          }
 432  433  
 433  434          return (ret);
 434  435  }
 435  436  
 436  437  /*
 437  438   * Used to create a single template which will be bcopy()ed to a newly
 438  439   * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 439  440   */
 440  441  static vopstats_t *
 441  442  create_vopstats_template()
 442  443  {
 443  444          vopstats_t              *vsp;
 444  445  
 445  446          vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 446  447          bzero(vsp, sizeof (*vsp));      /* Start fresh */
 447  448  
 448  449          /* VOP_OPEN */
 449  450          kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 450  451          /* VOP_CLOSE */
 451  452          kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 452  453          /* VOP_READ I/O */
 453  454          kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 454  455          kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 455  456          /* VOP_WRITE I/O */
 456  457          kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 457  458          kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 458  459          /* VOP_IOCTL */
 459  460          kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 460  461          /* VOP_SETFL */
 461  462          kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 462  463          /* VOP_GETATTR */
 463  464          kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 464  465          /* VOP_SETATTR */
 465  466          kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 466  467          /* VOP_ACCESS */
 467  468          kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 468  469          /* VOP_LOOKUP */
 469  470          kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 470  471          /* VOP_CREATE */
 471  472          kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 472  473          /* VOP_REMOVE */
 473  474          kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 474  475          /* VOP_LINK */
 475  476          kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 476  477          /* VOP_RENAME */
 477  478          kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 478  479          /* VOP_MKDIR */
 479  480          kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 480  481          /* VOP_RMDIR */
 481  482          kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 482  483          /* VOP_READDIR I/O */
 483  484          kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 484  485          kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 485  486              KSTAT_DATA_UINT64);
 486  487          /* VOP_SYMLINK */
 487  488          kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 488  489          /* VOP_READLINK */
 489  490          kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 490  491          /* VOP_FSYNC */
 491  492          kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 492  493          /* VOP_INACTIVE */
 493  494          kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 494  495          /* VOP_FID */
 495  496          kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 496  497          /* VOP_RWLOCK */
 497  498          kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 498  499          /* VOP_RWUNLOCK */
 499  500          kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 500  501          /* VOP_SEEK */
 501  502          kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 502  503          /* VOP_CMP */
 503  504          kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 504  505          /* VOP_FRLOCK */
 505  506          kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 506  507          /* VOP_SPACE */
 507  508          kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 508  509          /* VOP_REALVP */
 509  510          kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 510  511          /* VOP_GETPAGE */
 511  512          kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 512  513          /* VOP_PUTPAGE */
 513  514          kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 514  515          /* VOP_MAP */
 515  516          kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 516  517          /* VOP_ADDMAP */
 517  518          kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 518  519          /* VOP_DELMAP */
 519  520          kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 520  521          /* VOP_POLL */
 521  522          kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 522  523          /* VOP_DUMP */
 523  524          kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 524  525          /* VOP_PATHCONF */
 525  526          kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 526  527          /* VOP_PAGEIO */
 527  528          kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 528  529          /* VOP_DUMPCTL */
 529  530          kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 530  531          /* VOP_DISPOSE */
 531  532          kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 532  533          /* VOP_SETSECATTR */
 533  534          kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 534  535          /* VOP_GETSECATTR */
 535  536          kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 536  537          /* VOP_SHRLOCK */
 537  538          kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 538  539          /* VOP_VNEVENT */
 539  540          kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 540  541          /* VOP_REQZCBUF */
 541  542          kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 542  543          /* VOP_RETZCBUF */
 543  544          kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 544  545  
 545  546          return (vsp);
 546  547  }
 547  548  
 548  549  /*
 549  550   * Creates a kstat structure associated with a vopstats structure.
 550  551   */
 551  552  kstat_t *
 552  553  new_vskstat(char *ksname, vopstats_t *vsp)
 553  554  {
 554  555          kstat_t         *ksp;
 555  556  
 556  557          if (!vopstats_enabled) {
 557  558                  return (NULL);
 558  559          }
 559  560  
 560  561          ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 561  562              sizeof (vopstats_t)/sizeof (kstat_named_t),
 562  563              KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 563  564          if (ksp) {
 564  565                  ksp->ks_data = vsp;
 565  566                  kstat_install(ksp);
 566  567          }
 567  568  
 568  569          return (ksp);
 569  570  }
 570  571  
 571  572  /*
 572  573   * Called from vfsinit() to initialize the support mechanisms for vopstats
 573  574   */
 574  575  void
 575  576  vopstats_startup()
 576  577  {
 577  578          if (!vopstats_enabled)
 578  579                  return;
 579  580  
 580  581          /*
 581  582           * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 582  583           * is necessary since we need to check if a kstat exists before we
 583  584           * attempt to create it.  Also, initialize its lock.
 584  585           */
 585  586          avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 586  587              offsetof(vsk_anchor_t, vsk_node));
 587  588          mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 588  589  
 589  590          vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 590  591              sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 591  592              NULL, NULL, 0);
 592  593  
 593  594          /*
 594  595           * Set up the array of pointers for the vopstats-by-FS-type.
 595  596           * The entries will be allocated/initialized as each file system
 596  597           * goes through modload/mod_installfs.
 597  598           */
 598  599          vopstats_fstype = (vopstats_t **)kmem_zalloc(
 599  600              (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 600  601  
 601  602          /* Set up the global vopstats initialization template */
 602  603          vs_templatep = create_vopstats_template();
 603  604  }
 604  605  
 605  606  /*
 606  607   * We need to have the all of the counters zeroed.
 607  608   * The initialization of the vopstats_t includes on the order of
 608  609   * 50 calls to kstat_named_init().  Rather that do that on every call,
 609  610   * we do it once in a template (vs_templatep) then bcopy it over.
 610  611   */
 611  612  void
 612  613  initialize_vopstats(vopstats_t *vsp)
 613  614  {
 614  615          if (vsp == NULL)
 615  616                  return;
 616  617  
 617  618          bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 618  619  }
 619  620  
 620  621  /*
 621  622   * If possible, determine which vopstats by fstype to use and
 622  623   * return a pointer to the caller.
 623  624   */
 624  625  vopstats_t *
 625  626  get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 626  627  {
 627  628          int             fstype = 0;     /* Index into vfssw[] */
 628  629          vopstats_t      *vsp = NULL;
 629  630  
 630  631          if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 631  632              !vopstats_enabled)
 632  633                  return (NULL);
 633  634          /*
 634  635           * Set up the fstype.  We go to so much trouble because all versions
 635  636           * of NFS use the same fstype in their vfs even though they have
 636  637           * distinct entries in the vfssw[] table.
 637  638           * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 638  639           */
 639  640          if (vswp) {
 640  641                  fstype = vswp - vfssw;  /* Gets us the index */
 641  642          } else {
 642  643                  fstype = vfsp->vfs_fstype;
 643  644          }
 644  645  
 645  646          /*
 646  647           * Point to the per-fstype vopstats. The only valid values are
 647  648           * non-zero positive values less than the number of vfssw[] table
 648  649           * entries.
 649  650           */
 650  651          if (fstype > 0 && fstype < nfstype) {
 651  652                  vsp = vopstats_fstype[fstype];
 652  653          }
 653  654  
 654  655          return (vsp);
 655  656  }
 656  657  
 657  658  /*
 658  659   * Generate a kstat name, create the kstat structure, and allocate a
 659  660   * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 660  661   * to the caller.  This must only be called from a mount.
 661  662   */
 662  663  vsk_anchor_t *
 663  664  get_vskstat_anchor(vfs_t *vfsp)
 664  665  {
 665  666          char            kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
 666  667          statvfs64_t     statvfsbuf;             /* Needed to find f_fsid */
 667  668          vsk_anchor_t    *vskp = NULL;           /* vfs <--> kstat anchor */
 668  669          kstat_t         *ksp;                   /* Ptr to new kstat */
 669  670          avl_index_t     where;                  /* Location in the AVL tree */
 670  671  
 671  672          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 672  673              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 673  674                  return (NULL);
 674  675  
 675  676          /* Need to get the fsid to build a kstat name */
 676  677          if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
 677  678                  /* Create a name for our kstats based on fsid */
 678  679                  (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
 679  680                      VOPSTATS_STR, statvfsbuf.f_fsid);
 680  681  
 681  682                  /* Allocate and initialize the vsk_anchor_t */
 682  683                  vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 683  684                  bzero(vskp, sizeof (*vskp));
 684  685                  vskp->vsk_fsid = statvfsbuf.f_fsid;
 685  686  
 686  687                  mutex_enter(&vskstat_tree_lock);
 687  688                  if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
 688  689                          avl_insert(&vskstat_tree, vskp, where);
 689  690                          mutex_exit(&vskstat_tree_lock);
 690  691  
 691  692                          /*
 692  693                           * Now that we've got the anchor in the AVL
 693  694                           * tree, we can create the kstat.
 694  695                           */
 695  696                          ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
 696  697                          if (ksp) {
 697  698                                  vskp->vsk_ksp = ksp;
 698  699                          }
 699  700                  } else {
  700  701                          /* Oops, found one! Release lock and memory. */
 701  702                          mutex_exit(&vskstat_tree_lock);
 702  703                          kmem_cache_free(vsk_anchor_cache, vskp);
 703  704                          vskp = NULL;
 704  705                  }
 705  706          }
 706  707          return (vskp);
 707  708  }
 708  709  
 709  710  /*
 710  711   * We're in the process of tearing down the vfs and need to cleanup
 711  712   * the data structures associated with the vopstats. Must only be called
 712  713   * from dounmount().
 713  714   */
 714  715  void
 715  716  teardown_vopstats(vfs_t *vfsp)
 716  717  {
 717  718          vsk_anchor_t    *vskap;
 718  719          avl_index_t     where;
 719  720  
 720  721          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 721  722              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 722  723                  return;
 723  724  
 724  725          /* This is a safe check since VFS_STATS must be set (see above) */
 725  726          if ((vskap = vfsp->vfs_vskap) == NULL)
 726  727                  return;
 727  728  
 728  729          /* Whack the pointer right away */
 729  730          vfsp->vfs_vskap = NULL;
 730  731  
 731  732          /* Lock the tree, remove the node, and delete the kstat */
 732  733          mutex_enter(&vskstat_tree_lock);
 733  734          if (avl_find(&vskstat_tree, vskap, &where)) {
 734  735                  avl_remove(&vskstat_tree, vskap);
 735  736          }
 736  737  
 737  738          if (vskap->vsk_ksp) {
 738  739                  kstat_delete(vskap->vsk_ksp);
 739  740          }
 740  741          mutex_exit(&vskstat_tree_lock);
 741  742  
 742  743          kmem_cache_free(vsk_anchor_cache, vskap);
 743  744  }
 744  745  
 745  746  /*
 746  747   * Read or write a vnode.  Called from kernel code.
 747  748   */
 748  749  int
 749  750  vn_rdwr(
 750  751          enum uio_rw rw,
 751  752          struct vnode *vp,
 752  753          caddr_t base,
 753  754          ssize_t len,
 754  755          offset_t offset,
 755  756          enum uio_seg seg,
 756  757          int ioflag,
 757  758          rlim64_t ulimit,        /* meaningful only if rw is UIO_WRITE */
 758  759          cred_t *cr,
 759  760          ssize_t *residp)
 760  761  {
 761  762          struct uio uio;
 762  763          struct iovec iov;
 763  764          int error;
 764  765          int in_crit = 0;
 765  766  
 766  767          if (rw == UIO_WRITE && ISROFILE(vp))
 767  768                  return (EROFS);
 768  769  
 769  770          if (len < 0)
 770  771                  return (EIO);
 771  772  
 772  773          VOPXID_MAP_CR(vp, cr);
 773  774  
 774  775          iov.iov_base = base;
 775  776          iov.iov_len = len;
 776  777          uio.uio_iov = &iov;
 777  778          uio.uio_iovcnt = 1;
 778  779          uio.uio_loffset = offset;
 779  780          uio.uio_segflg = (short)seg;
 780  781          uio.uio_resid = len;
 781  782          uio.uio_llimit = ulimit;
 782  783  
 783  784          /*
 784  785           * We have to enter the critical region before calling VOP_RWLOCK
 785  786           * to avoid a deadlock with ufs.
 786  787           */
 787  788          if (nbl_need_check(vp)) {
 788  789                  int svmand;
 789  790  
 790  791                  nbl_start_crit(vp, RW_READER);
 791  792                  in_crit = 1;
 792  793                  error = nbl_svmand(vp, cr, &svmand);
 793  794                  if (error != 0)
 794  795                          goto done;
 795  796                  if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
 796  797                      uio.uio_offset, uio.uio_resid, svmand, NULL)) {
 797  798                          error = EACCES;
 798  799                          goto done;
 799  800                  }
 800  801          }
 801  802  
 802  803          (void) VOP_RWLOCK(vp,
 803  804              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 804  805          if (rw == UIO_WRITE) {
 805  806                  uio.uio_fmode = FWRITE;
 806  807                  uio.uio_extflg = UIO_COPY_DEFAULT;
 807  808                  error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 808  809          } else {
 809  810                  uio.uio_fmode = FREAD;
 810  811                  uio.uio_extflg = UIO_COPY_CACHED;
 811  812                  error = VOP_READ(vp, &uio, ioflag, cr, NULL);
 812  813          }
 813  814          VOP_RWUNLOCK(vp,
 814  815              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 815  816          if (residp)
 816  817                  *residp = uio.uio_resid;
 817  818          else if (uio.uio_resid)
 818  819                  error = EIO;
 819  820  
 820  821  done:
 821  822          if (in_crit)
 822  823                  nbl_end_crit(vp);
 823  824          return (error);
 824  825  }
 825  826  
 826  827  /*
 827  828   * Release a vnode.  Call VOP_INACTIVE on last reference or
 828  829   * decrement reference count.
 829  830   *
  
     [ 795 lines elided ]
  
 830  831   * To avoid race conditions, the v_count is left at 1 for
 831  832   * the call to VOP_INACTIVE. This prevents another thread
 832  833   * from reclaiming and releasing the vnode *before* the
 833  834   * VOP_INACTIVE routine has a chance to destroy the vnode.
 834  835   * We can't have more than 1 thread calling VOP_INACTIVE
 835  836   * on a vnode.
 836  837   */
 837  838  void
 838  839  vn_rele(vnode_t *vp)
 839  840  {
 840      -        VERIFY(vp->v_count > 0);
 841  841          mutex_enter(&vp->v_lock);
 842  842          if (vp->v_count == 1) {
 843  843                  mutex_exit(&vp->v_lock);
 844  844                  VOP_INACTIVE(vp, CRED(), NULL);
 845  845                  return;
      846 +        } else {
      847 +                VERIFY(vp->v_count > 0);
 846  848          }
 847  849          VN_RELE_LOCKED(vp);
 848  850          mutex_exit(&vp->v_lock);
 849  851  }
 850  852  
 851  853  /*
 852  854   * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 853  855   * as a single reference, so v_count is not decremented until the last DNLC hold
 854  856   * is released. This makes it possible to distinguish vnodes that are referenced
 855  857   * only by the DNLC.
 856  858   */
 857  859  void
 858  860  vn_rele_dnlc(vnode_t *vp)
 859  861  {
 860      -        VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 861  862          mutex_enter(&vp->v_lock);
      863 +        VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 862  864          if (--vp->v_count_dnlc == 0) {
 863  865                  if (vp->v_count == 1) {
 864  866                          mutex_exit(&vp->v_lock);
 865  867                          VOP_INACTIVE(vp, CRED(), NULL);
 866  868                          return;
 867  869                  }
 868  870                  VN_RELE_LOCKED(vp);
 869  871          }
 870  872          mutex_exit(&vp->v_lock);
 871  873  }
 872  874  
  
     [ 1 line elided ]
  
 873  875  /*
 874  876   * Like vn_rele() except that it clears v_stream under v_lock.
 875  877   * This is used by sockfs when it dismantles the association between
 876  878   * the sockfs node and the vnode in the underlying file system.
 877  879   * v_lock has to be held to prevent a thread coming through the lookupname
 878  880   * path from accessing a stream head that is going away.
 879  881   */
 880  882  void
 881  883  vn_rele_stream(vnode_t *vp)
 882  884  {
 883      -        VERIFY(vp->v_count > 0);
 884  885          mutex_enter(&vp->v_lock);
 885  886          vp->v_stream = NULL;
 886  887          if (vp->v_count == 1) {
 887  888                  mutex_exit(&vp->v_lock);
 888  889                  VOP_INACTIVE(vp, CRED(), NULL);
 889  890                  return;
      891 +        } else {
      892 +                VERIFY(vp->v_count > 0);
 890  893          }
 891  894          VN_RELE_LOCKED(vp);
 892  895          mutex_exit(&vp->v_lock);
 893  896  }
 894  897  
 895  898  static void
 896  899  vn_rele_inactive(vnode_t *vp)
 897  900  {
 898  901          VOP_INACTIVE(vp, CRED(), NULL);
 899  902  }
 900  903  
 901  904  /*
 902  905   * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 903  906   * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
  
     [ 4 lines elided ]
  
 904  907   * the file system as a result of releasing the vnode. Note, file systems
 905  908   * already have to handle the race where the vnode is incremented before the
 906  909   * inactive routine is called and does its locking.
 907  910   *
 908  911   * Warning: Excessive use of this routine can lead to performance problems.
 909  912   * This is because taskqs throttle back allocation if too many are created.
 910  913   */
 911  914  void
 912  915  vn_rele_async(vnode_t *vp, taskq_t *taskq)
 913  916  {
 914      -        VERIFY(vp->v_count > 0);
 915  917          mutex_enter(&vp->v_lock);
 916  918          if (vp->v_count == 1) {
 917  919                  mutex_exit(&vp->v_lock);
 918  920                  VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 919  921                      vp, TQ_SLEEP) != TASKQID_INVALID);
 920  922                  return;
      923 +        } else {
      924 +                VERIFY(vp->v_count > 0);
 921  925          }
 922  926          VN_RELE_LOCKED(vp);
 923  927          mutex_exit(&vp->v_lock);
 924  928  }
 925  929  
 926  930  int
 927  931  vn_open(
 928  932          char *pnamep,
 929  933          enum uio_seg seg,
 930  934          int filemode,
 931  935          int createmode,
 932  936          struct vnode **vpp,
 933  937          enum create crwhy,
 934  938          mode_t umask)
 935  939  {
 936  940          return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 937  941              umask, NULL, -1));
 938  942  }
 939  943  
 940  944  
 941  945  /*
 942  946   * Open/create a vnode.
 943  947   * This may be callable by the kernel, the only known use
 944  948   * of user context being that the current user credentials
 945  949   * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 946  950   */
 947  951  int
 948  952  vn_openat(
 949  953          char *pnamep,
 950  954          enum uio_seg seg,
 951  955          int filemode,
 952  956          int createmode,
 953  957          struct vnode **vpp,
 954  958          enum create crwhy,
 955  959          mode_t umask,
 956  960          struct vnode *startvp,
 957  961          int fd)
 958  962  {
 959  963          struct vnode *vp;
 960  964          int mode;
 961  965          int accessflags;
 962  966          int error;
 963  967          int in_crit = 0;
 964  968          int open_done = 0;
 965  969          int shrlock_done = 0;
 966  970          struct vattr vattr;
 967  971          enum symfollow follow;
 968  972          int estale_retry = 0;
 969  973          struct shrlock shr;
 970  974          struct shr_locowner shr_own;
 971  975          boolean_t create;
 972  976  
 973  977          mode = 0;
 974  978          accessflags = 0;
 975  979          if (filemode & FREAD)
 976  980                  mode |= VREAD;
 977  981          if (filemode & (FWRITE|FTRUNC))
 978  982                  mode |= VWRITE;
 979  983          if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
 980  984                  mode |= VEXEC;
 981  985  
 982  986          /* symlink interpretation */
 983  987          if (filemode & FNOFOLLOW)
 984  988                  follow = NO_FOLLOW;
 985  989          else
 986  990                  follow = FOLLOW;
 987  991  
 988  992          if (filemode & FAPPEND)
 989  993                  accessflags |= V_APPEND;
 990  994  
 991  995          /*
 992  996           * We need to handle the case of FCREAT | FDIRECTORY and the case of
 993  997           * FEXCL. If all three are specified, then we always fail because we
 994  998           * cannot create a directory through this interface and FEXCL says we
 995  999           * need to fail the request if we can't create it. If, however, only
 996 1000           * FCREAT | FDIRECTORY are specified, then we can treat this as the case
 997 1001           * of opening a file that already exists. If it exists, we can do
 998 1002           * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
 999 1003           * treated as FDIRECTORY.
1000 1004           */
1001 1005          if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1002 1006              (FCREAT | FDIRECTORY | FEXCL)) {
1003 1007                  return (EINVAL);
1004 1008          }
1005 1009  
1006 1010          if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1007 1011                  create = B_FALSE;
1008 1012          } else if ((filemode & FCREAT) != 0) {
1009 1013                  create = B_TRUE;
1010 1014          } else {
1011 1015                  create = B_FALSE;
1012 1016          }
1013 1017  
1014 1018  top:
1015 1019          if (create) {
1016 1020                  enum vcexcl excl;
1017 1021  
1018 1022                  /*
1019 1023                   * Wish to create a file.
1020 1024                   */
1021 1025                  vattr.va_type = VREG;
1022 1026                  vattr.va_mode = createmode;
1023 1027                  vattr.va_mask = AT_TYPE|AT_MODE;
1024 1028                  if (filemode & FTRUNC) {
1025 1029                          vattr.va_size = 0;
1026 1030                          vattr.va_mask |= AT_SIZE;
1027 1031                  }
1028 1032                  if (filemode & FEXCL)
1029 1033                          excl = EXCL;
1030 1034                  else
1031 1035                          excl = NONEXCL;
1032 1036  
1033 1037                  if (error =
1034 1038                      vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1035 1039                      (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1036 1040                          return (error);
1037 1041          } else {
1038 1042                  /*
1039 1043                   * Wish to open a file.  Just look it up.
1040 1044                   */
1041 1045                  if (error = lookupnameat(pnamep, seg, follow,
1042 1046                      NULLVPP, &vp, startvp)) {
1043 1047                          if ((error == ESTALE) &&
1044 1048                              fs_need_estale_retry(estale_retry++))
1045 1049                                  goto top;
1046 1050                          return (error);
1047 1051                  }
1048 1052  
1049 1053                  /*
1050 1054                   * Get the attributes to check whether file is large.
1051 1055                   * We do this only if the FOFFMAX flag is not set and
1052 1056                   * only for regular files.
1053 1057                   */
1054 1058  
1055 1059                  if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1056 1060                          vattr.va_mask = AT_SIZE;
1057 1061                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1058 1062                              CRED(), NULL))) {
1059 1063                                  goto out;
1060 1064                          }
1061 1065                          if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1062 1066                                  /*
1063 1067                                   * Large File API - regular open fails
1064 1068                                   * if FOFFMAX flag is set in file mode
1065 1069                                   */
1066 1070                                  error = EOVERFLOW;
1067 1071                                  goto out;
1068 1072                          }
1069 1073                  }
1070 1074                  /*
1071 1075                   * Can't write directories, active texts, or
1072 1076                   * read-only filesystems.  Can't truncate files
1073 1077                   * on which mandatory locking is in effect.
1074 1078                   */
1075 1079                  if (filemode & (FWRITE|FTRUNC)) {
1076 1080                          /*
1077 1081                           * Allow writable directory if VDIROPEN flag is set.
1078 1082                           */
1079 1083                          if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1080 1084                                  error = EISDIR;
1081 1085                                  goto out;
1082 1086                          }
1083 1087                          if (ISROFILE(vp)) {
1084 1088                                  error = EROFS;
1085 1089                                  goto out;
1086 1090                          }
1087 1091                          /*
1088 1092                           * Can't truncate files on which
1089 1093                           * sysv mandatory locking is in effect.
1090 1094                           */
1091 1095                          if (filemode & FTRUNC) {
1092 1096                                  vnode_t *rvp;
1093 1097  
1094 1098                                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1095 1099                                          rvp = vp;
1096 1100                                  if (rvp->v_filocks != NULL) {
1097 1101                                          vattr.va_mask = AT_MODE;
1098 1102                                          if ((error = VOP_GETATTR(vp,
1099 1103                                              &vattr, 0, CRED(), NULL)) == 0 &&
1100 1104                                              MANDLOCK(vp, vattr.va_mode))
1101 1105                                                  error = EAGAIN;
1102 1106                                  }
1103 1107                          }
1104 1108                          if (error)
1105 1109                                  goto out;
1106 1110                  }
1107 1111                  /*
1108 1112                   * Check permissions.
1109 1113                   */
1110 1114                  if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1111 1115                          goto out;
1112 1116  
1113 1117                  /*
1114 1118                   * Require FSEARCH and FDIRECTORY to return a directory. Require
1115 1119                   * FEXEC to return a regular file.
1116 1120                   */
1117 1121                  if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1118 1122                      vp->v_type != VDIR) {
1119 1123                          error = ENOTDIR;
1120 1124                          goto out;
1121 1125                  }
1122 1126                  if ((filemode & FEXEC) && vp->v_type != VREG) {
1123 1127                          error = ENOEXEC;        /* XXX: error code? */
1124 1128                          goto out;
1125 1129                  }
1126 1130          }
1127 1131  
1128 1132          /*
1129 1133           * Do remaining checks for FNOFOLLOW and FNOLINKS.
1130 1134           */
1131 1135          if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1132 1136                  error = ELOOP;
1133 1137                  goto out;
1134 1138          }
1135 1139          if (filemode & FNOLINKS) {
1136 1140                  vattr.va_mask = AT_NLINK;
1137 1141                  if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1138 1142                          goto out;
1139 1143                  }
1140 1144                  if (vattr.va_nlink != 1) {
1141 1145                          error = EMLINK;
1142 1146                          goto out;
1143 1147                  }
1144 1148          }
1145 1149  
1146 1150          /*
1147 1151           * Opening a socket corresponding to the AF_UNIX pathname
1148 1152           * in the filesystem name space is not supported.
1149 1153           * However, VSOCK nodes in namefs are supported in order
1150 1154           * to make fattach work for sockets.
1151 1155           *
1152 1156           * XXX This uses VOP_REALVP to distinguish between
1153 1157           * an unopened namefs node (where VOP_REALVP returns a
1154 1158           * different VSOCK vnode) and a VSOCK created by vn_create
1155 1159           * in some file system (where VOP_REALVP would never return
1156 1160           * a different vnode).
1157 1161           */
1158 1162          if (vp->v_type == VSOCK) {
1159 1163                  struct vnode *nvp;
1160 1164  
1161 1165                  error = VOP_REALVP(vp, &nvp, NULL);
1162 1166                  if (error != 0 || nvp == NULL || nvp == vp ||
1163 1167                      nvp->v_type != VSOCK) {
1164 1168                          error = EOPNOTSUPP;
1165 1169                          goto out;
1166 1170                  }
1167 1171          }
1168 1172  
1169 1173          if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1170 1174                  /* get share reservation */
1171 1175                  shr.s_access = 0;
1172 1176                  if (filemode & FWRITE)
1173 1177                          shr.s_access |= F_WRACC;
1174 1178                  if (filemode & FREAD)
1175 1179                          shr.s_access |= F_RDACC;
1176 1180                  shr.s_deny = 0;
1177 1181                  shr.s_sysid = 0;
1178 1182                  shr.s_pid = ttoproc(curthread)->p_pid;
1179 1183                  shr_own.sl_pid = shr.s_pid;
1180 1184                  shr_own.sl_id = fd;
1181 1185                  shr.s_own_len = sizeof (shr_own);
1182 1186                  shr.s_owner = (caddr_t)&shr_own;
1183 1187                  error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1184 1188                      NULL);
1185 1189                  if (error)
1186 1190                          goto out;
1187 1191                  shrlock_done = 1;
1188 1192  
1189 1193                  /* nbmand conflict check if truncating file */
1190 1194                  if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1191 1195                          nbl_start_crit(vp, RW_READER);
1192 1196                          in_crit = 1;
1193 1197  
1194 1198                          vattr.va_mask = AT_SIZE;
1195 1199                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1196 1200                                  goto out;
1197 1201                          if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1198 1202                              NULL)) {
1199 1203                                  error = EACCES;
1200 1204                                  goto out;
1201 1205                          }
1202 1206                  }
1203 1207          }
1204 1208  
1205 1209          /*
1206 1210           * Do opening protocol.
1207 1211           */
1208 1212          error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1209 1213          if (error)
1210 1214                  goto out;
1211 1215          open_done = 1;
1212 1216  
1213 1217          /*
1214 1218           * Truncate if required.
1215 1219           */
1216 1220          if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1217 1221                  vattr.va_size = 0;
1218 1222                  vattr.va_mask = AT_SIZE;
1219 1223                  if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1220 1224                          goto out;
1221 1225          }
1222 1226  
1223 1227          /*
1224 1228           * Turn on directio, if requested.
1225 1229           */
1226 1230          if (filemode & FDIRECT) {
1227 1231                  if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1228 1232                      CRED(), NULL, NULL)) != 0) {
1229 1233                          /*
1230 1234                           * On Linux, O_DIRECT returns EINVAL when the file
1231 1235                           * system does not support directio, so we'll do the
1232 1236                           * same.
1233 1237                           */
1234 1238                          error = EINVAL;
1235 1239                          goto out;
1236 1240                  }
1237 1241          }
1238 1242  out:
1239 1243          ASSERT(vp->v_count > 0);
1240 1244  
1241 1245          if (in_crit) {
1242 1246                  nbl_end_crit(vp);
1243 1247                  in_crit = 0;
1244 1248          }
1245 1249          if (error) {
1246 1250                  if (open_done) {
1247 1251                          (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1248 1252                              NULL);
1249 1253                          open_done = 0;
1250 1254                          shrlock_done = 0;
1251 1255                  }
1252 1256                  if (shrlock_done) {
1253 1257                          (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1254 1258                              NULL);
1255 1259                          shrlock_done = 0;
1256 1260                  }
1257 1261  
1258 1262                  /*
1259 1263                   * The following clause was added to handle a problem
1260 1264                   * with NFS consistency.  It is possible that a lookup
1261 1265                   * of the file to be opened succeeded, but the file
1262 1266                   * itself doesn't actually exist on the server.  This
1263 1267                   * is chiefly due to the DNLC containing an entry for
1264 1268                   * the file which has been removed on the server.  In
1265 1269                   * this case, we just start over.  If there was some
1266 1270                   * other cause for the ESTALE error, then the lookup
1267 1271                   * of the file will fail and the error will be returned
1268 1272                   * above instead of looping around from here.
1269 1273                   */
1270 1274                  VN_RELE(vp);
1271 1275                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1272 1276                          goto top;
1273 1277          } else
1274 1278                  *vpp = vp;
1275 1279          return (error);
1276 1280  }
1277 1281  
1278 1282  /*
1279 1283   * The following two accessor functions are for the NFSv4 server.  Since there
1280 1284   * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1281 1285   * vnode open counts correct when a client "upgrades" an open or does an
1282 1286   * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1283 1287   * open mode (add or subtract read or write), but also change the share/deny
1284 1288   * modes.  However, share reservations are not integrated with OPEN, yet, so
1285 1289   * we need to handle each separately.  These functions are cleaner than having
1286 1290   * the NFS server manipulate the counts directly, however, nobody else should
1287 1291   * use these functions.
1288 1292   */
1289 1293  void
1290 1294  vn_open_upgrade(
1291 1295          vnode_t *vp,
1292 1296          int filemode)
1293 1297  {
1294 1298          ASSERT(vp->v_type == VREG);
1295 1299  
1296 1300          if (filemode & FREAD)
1297 1301                  atomic_inc_32(&vp->v_rdcnt);
1298 1302          if (filemode & FWRITE)
1299 1303                  atomic_inc_32(&vp->v_wrcnt);
1300 1304  
1301 1305  }
1302 1306  
1303 1307  void
1304 1308  vn_open_downgrade(
1305 1309          vnode_t *vp,
1306 1310          int filemode)
1307 1311  {
1308 1312          ASSERT(vp->v_type == VREG);
1309 1313  
1310 1314          if (filemode & FREAD) {
1311 1315                  ASSERT(vp->v_rdcnt > 0);
1312 1316                  atomic_dec_32(&vp->v_rdcnt);
1313 1317          }
1314 1318          if (filemode & FWRITE) {
1315 1319                  ASSERT(vp->v_wrcnt > 0);
1316 1320                  atomic_dec_32(&vp->v_wrcnt);
1317 1321          }
1318 1322  
1319 1323  }
1320 1324  
1321 1325  int
1322 1326  vn_create(
1323 1327          char *pnamep,
1324 1328          enum uio_seg seg,
1325 1329          struct vattr *vap,
1326 1330          enum vcexcl excl,
1327 1331          int mode,
1328 1332          struct vnode **vpp,
1329 1333          enum create why,
1330 1334          int flag,
1331 1335          mode_t umask)
1332 1336  {
1333 1337          return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1334 1338              umask, NULL));
1335 1339  }
1336 1340  
1337 1341  /*
1338 1342   * Create a vnode (makenode).
1339 1343   */
/*
 * Create a file system object (file, directory, or device node) named
 * by pnamep, which resides in the address space indicated by seg.  On
 * success a held vnode for the object is returned through *vpp and the
 * caller is responsible for its VN_RELE().
 *
 *	vap	- attributes for the new object; the caller must supply
 *		  at least AT_TYPE and AT_MODE (asserted below)
 *	excl	- EXCL fails if the entry already exists; NONEXCL
 *		  returns the existing object after permission checks
 *	mode	- access bits (e.g. VWRITE) verified against an
 *		  already-existing file
 *	why	- CRCREAT, CRMKDIR or CRMKNOD; selects VOP_CREATE vs.
 *		  VOP_MKDIR and adjusts mode handling
 *	flag	- open flags; FNOFOLLOW and FNOLINKS are consumed here,
 *		  the rest are passed down to the file system
 *	umask	- mode mask to apply, unless the parent directory has
 *		  default ACLs
 *	startvp	- when non-NULL, the directory anchoring relative
 *		  lookups (the *at() family of system calls)
 *
 * Returns 0 or an errno value.  ESTALE (stale DNLC/NFS state) causes a
 * bounded number of internal retries from "top".
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;	/* inside an nbl critical region? */
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	/* Strip the sticky bit unless this is a mknod-style create. */
	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {

		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * VOP_GETSECATTR() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *		it is being open with write access &&
	 *		the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (VOP_REALVP(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				u_offset_t offset;
				ssize_t length;

				/*
				 * The implied truncation covers the byte
				 * range between the current size and the
				 * requested size, in either direction.
				 */
				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The call to VOP_CREATE() is necessary to ensure
		 * that the appropriate permission checks are made,
		 * i.e. EISDIR, EACCES, etc.  We already know that vpp
		 * exists since we are in the else condition where this
		 * was checked.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created a
			 * new reference on a new vnode (*vpp) in the child
			 * file system, so we want to drop our reference on
			 * the old (vp) upon exit.
			 */
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to VOP_MKDIR().  VOP_CREATE()
			 * will already get it via "flag"
			 */
			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = VOP_CREATE(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1617 1621  
1618 1622  int
1619 1623  vn_link(char *from, char *to, enum uio_seg seg)
1620 1624  {
1621 1625          return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1622 1626  }
1623 1627  
1624 1628  int
1625 1629  vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1626 1630      vnode_t *tstartvp, char *to, enum uio_seg seg)
1627 1631  {
1628 1632          struct vnode *fvp;              /* from vnode ptr */
1629 1633          struct vnode *tdvp;             /* to directory vnode ptr */
1630 1634          struct pathname pn;
1631 1635          int error;
1632 1636          struct vattr vattr;
1633 1637          dev_t fsid;
1634 1638          int estale_retry = 0;
1635 1639          uint32_t auditing = AU_AUDITING();
1636 1640  
1637 1641  top:
1638 1642          fvp = tdvp = NULL;
1639 1643          if (error = pn_get(to, seg, &pn))
1640 1644                  return (error);
1641 1645          if (auditing && fstartvp != NULL)
1642 1646                  audit_setfsat_path(1);
1643 1647          if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1644 1648                  goto out;
1645 1649          if (auditing && tstartvp != NULL)
1646 1650                  audit_setfsat_path(3);
1647 1651          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1648 1652                  goto out;
1649 1653          /*
1650 1654           * Make sure both source vnode and target directory vnode are
1651 1655           * in the same vfs and that it is writeable.
1652 1656           */
1653 1657          vattr.va_mask = AT_FSID;
1654 1658          if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1655 1659                  goto out;
1656 1660          fsid = vattr.va_fsid;
1657 1661          vattr.va_mask = AT_FSID;
1658 1662          if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1659 1663                  goto out;
1660 1664          if (fsid != vattr.va_fsid) {
1661 1665                  error = EXDEV;
1662 1666                  goto out;
1663 1667          }
1664 1668          if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1665 1669                  error = EROFS;
1666 1670                  goto out;
1667 1671          }
1668 1672          /*
1669 1673           * Do the link.
1670 1674           */
1671 1675          (void) pn_fixslash(&pn);
1672 1676          error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1673 1677  out:
1674 1678          pn_free(&pn);
1675 1679          if (fvp)
1676 1680                  VN_RELE(fvp);
1677 1681          if (tdvp)
1678 1682                  VN_RELE(tdvp);
1679 1683          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1680 1684                  goto top;
1681 1685          return (error);
1682 1686  }
1683 1687  
1684 1688  int
1685 1689  vn_rename(char *from, char *to, enum uio_seg seg)
1686 1690  {
1687 1691          return (vn_renameat(NULL, from, NULL, to, seg));
1688 1692  }
1689 1693  
/*
 * Rename the object named fname to tname.  fdvp and tdvp, when
 * non-NULL, anchor relative lookups of the respective pathnames
 * (renameat(2) style).  Both names must resolve within the same file
 * system -- compared by fsid so lofs mounts work -- and the target's
 * file system must be writable.  Returns 0 or an errno value; ESTALE
 * is retried from "top" a bounded number of times.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* inside nbl critical regions? */
	vnode_t *fromvp, *fvp;		/* source dir, source entry */
	vnode_t *tovp, *targvp;		/* target dir, existing target */
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure "from" vp is not a mount point.
	 * Note, lookup did traverse() already, so
	 * we'll be looking at the mounted FS root.
	 * (but allow files like mnttab)
	 */
	if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
		error = EBUSY;
		goto out;
	}

	/*
	 * If the rename would clobber an existing target, check for
	 * nbmand lock conflicts against removing it before committing
	 * to the operation.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check for nbmand conflicts against the source. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1829 1833  
1830 1834  /*
1831 1835   * Remove a file or directory.
1832 1836   */
1833 1837  int
1834 1838  vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1835 1839  {
1836 1840          return (vn_removeat(NULL, fnamep, seg, dirflag));
1837 1841  }
1838 1842  
/*
 * Remove the file or directory named by fnamep; relative lookups are
 * anchored at startvp when it is non-NULL.  dirflag distinguishes
 * rmdir(2) (RMDIRECTORY) from unlink(2) semantics.  If the target is
 * the root of an unlinkable (namefs) mount, the mount is undone first
 * and the covered vnode is removed instead.  Returns 0 or an errno
 * value; ESTALE is retried from "top" a bounded number of times.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* inside an nbl critical region? */
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail to operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * noone has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/*
			 * VOP_RMDIR() takes the caller's current working
			 * directory; hold it (under p_lock) across the call.
			 */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
2026 2030  
2027 2031  /*
2028 2032   * Utility function to compare equality of vnodes.
2029 2033   * Compare the underlying real vnodes, if there are underlying vnodes.
2030 2034   * This is a more thorough comparison than the VN_CMP() macro provides.
2031 2035   */
2032 2036  int
2033 2037  vn_compare(vnode_t *vp1, vnode_t *vp2)
2034 2038  {
2035 2039          vnode_t *realvp;
2036 2040  
2037 2041          if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2038 2042                  vp1 = realvp;
2039 2043          if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2040 2044                  vp2 = realvp;
2041 2045          return (VN_CMP(vp1, vp2));
2042 2046  }
2043 2047  
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket: a mutex protecting a linked list of
 * vn_vfslocks_entry_t.  pad[] sizes each bucket to exactly 64 bytes
 * which, together with the 64-byte alignment pragma below, gives each
 * bucket its own 64-byte unit (presumably to keep buckets off a
 * shared cache line -- confirm against target line size).
 */
struct	vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

/*
 * Hash a vfs/vnode pointer into a bucket index: discard the low-order
 * bits first (presumably because allocated objects are aligned and
 * those bits carry little variation), then mask with NUM_BUCKETS,
 * which works as a mask because it is 2^n - 1.
 */
#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2067 2071  
2068 2072  /*
2069 2073   * vn_vfslocks_getlock() uses an HASH scheme to generate
2070 2074   * rwstlock using vfs/vnode pointer passed to it.
2071 2075   *
2072 2076   * vn_vfslocks_rele() releases a reference in the
2073 2077   * HASH table which allows the entry allocated by
2074 2078   * vn_vfslocks_getlock() to be freed at a later
2075 2079   * stage when the refcount drops to zero.
2076 2080   */
2077 2081  
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: entry already hashed; just take another reference. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	/*
	 * Not found.  Drop the bucket lock so the KM_SLEEP allocation may
	 * block, then re-scan under the lock, since another thread may
	 * have inserted an entry for the same pointer in the meantime.
	 */
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	/* Still absent: insert the new entry at the head of the chain. */
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2121 2125  
/*
 * Drop a reference on a vn_vfslocks entry; on the last reference the
 * entry is unhashed and freed.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	/* The int32_t cast catches underflow from an extra release. */
	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	/*
	 * Last reference: unlink the entry from its hash chain and free
	 * it.  pvep trails vep so the predecessor's ve_next (or the
	 * bucket head) can be repaired.
	 */
	pvep = NULL;
	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (pvep == NULL)
					bp->vb_list = vep->ve_next;
				else {
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2160 2164  
2161 2165  /*
2162 2166   * vn_vfswlock_wait is used to implement a lock which is logically a writers
2163 2167   * lock protecting the v_vfsmountedhere field.
2164 2168   * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2165 2169   * except that it blocks to acquire the lock VVFSLOCK.
2166 2170   *
2167 2171   * traverse() and routines re-implementing part of traverse (e.g. autofs)
2168 2172   * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2169 2173   * need the non-blocking version of the writers lock i.e. vn_vfswlock
2170 2174   */
2171 2175  int
2172 2176  vn_vfswlock_wait(vnode_t *vp)
2173 2177  {
2174 2178          int retval;
2175 2179          vn_vfslocks_entry_t *vpvfsentry;
2176 2180          ASSERT(vp != NULL);
2177 2181  
2178 2182          vpvfsentry = vn_vfslocks_getlock(vp);
2179 2183          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2180 2184  
2181 2185          if (retval == EINTR) {
2182 2186                  vn_vfslocks_rele(vpvfsentry);
2183 2187                  return (EINTR);
2184 2188          }
2185 2189          return (retval);
2186 2190  }
2187 2191  
2188 2192  int
2189 2193  vn_vfsrlock_wait(vnode_t *vp)
2190 2194  {
2191 2195          int retval;
2192 2196          vn_vfslocks_entry_t *vpvfsentry;
2193 2197          ASSERT(vp != NULL);
2194 2198  
2195 2199          vpvfsentry = vn_vfslocks_getlock(vp);
2196 2200          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2197 2201  
2198 2202          if (retval == EINTR) {
2199 2203                  vn_vfslocks_rele(vpvfsentry);
2200 2204                  return (EINTR);
2201 2205          }
2202 2206  
2203 2207          return (retval);
2204 2208  }
2205 2209  
2206 2210  
2207 2211  /*
2208 2212   * vn_vfswlock is used to implement a lock which is logically a writers lock
2209 2213   * protecting the v_vfsmountedhere field.
2210 2214   */
2211 2215  int
2212 2216  vn_vfswlock(vnode_t *vp)
2213 2217  {
2214 2218          vn_vfslocks_entry_t *vpvfsentry;
2215 2219  
2216 2220          /*
2217 2221           * If vp is NULL then somebody is trying to lock the covered vnode
2218 2222           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2219 2223           * only happen when unmounting /.  Since that operation will fail
2220 2224           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2221 2225           */
2222 2226          if (vp == NULL)
2223 2227                  return (EBUSY);
2224 2228  
2225 2229          vpvfsentry = vn_vfslocks_getlock(vp);
2226 2230  
2227 2231          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2228 2232                  return (0);
2229 2233  
2230 2234          vn_vfslocks_rele(vpvfsentry);
2231 2235          return (EBUSY);
2232 2236  }
2233 2237  
2234 2238  int
2235 2239  vn_vfsrlock(vnode_t *vp)
2236 2240  {
2237 2241          vn_vfslocks_entry_t *vpvfsentry;
2238 2242  
2239 2243          /*
2240 2244           * If vp is NULL then somebody is trying to lock the covered vnode
2241 2245           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2242 2246           * only happen when unmounting /.  Since that operation will fail
2243 2247           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2244 2248           */
2245 2249          if (vp == NULL)
2246 2250                  return (EBUSY);
2247 2251  
2248 2252          vpvfsentry = vn_vfslocks_getlock(vp);
2249 2253  
2250 2254          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2251 2255                  return (0);
2252 2256  
2253 2257          vn_vfslocks_rele(vpvfsentry);
2254 2258          return (EBUSY);
2255 2259  }
2256 2260  
/*
 * Release the vfs lock taken via vn_vfswlock()/vn_vfsrlock() and friends.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release the reference taken here by vn_vfslocks_getlock()
	 *    (needed only to find the entry so the rwstlock can be exited).
	 * 2. To release the reference taken by the locking routines
	 *    (vn_vfsrlock/vn_vfswlock etc.) when the lock was acquired.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}
2274 2278  
2275 2279  int
2276 2280  vn_vfswlock_held(vnode_t *vp)
2277 2281  {
2278 2282          int held;
2279 2283          vn_vfslocks_entry_t *vpvfsentry;
2280 2284  
2281 2285          ASSERT(vp != NULL);
2282 2286  
2283 2287          vpvfsentry = vn_vfslocks_getlock(vp);
2284 2288          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2285 2289  
2286 2290          vn_vfslocks_rele(vpvfsentry);
2287 2291          return (held);
2288 2292  }
2289 2293  
2290 2294  
2291 2295  int
2292 2296  vn_make_ops(
2293 2297          const char *name,                       /* Name of file system */
2294 2298          const fs_operation_def_t *templ,        /* Operation specification */
2295 2299          vnodeops_t **actual)                    /* Return the vnodeops */
2296 2300  {
2297 2301          int unused_ops;
2298 2302          int error;
2299 2303  
2300 2304          *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2301 2305  
2302 2306          (*actual)->vnop_name = name;
2303 2307  
2304 2308          error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2305 2309          if (error) {
2306 2310                  kmem_free(*actual, sizeof (vnodeops_t));
2307 2311          }
2308 2312  
2309 2313  #if DEBUG
2310 2314          if (unused_ops != 0)
2311 2315                  cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2312 2316                      "but not used", name, unused_ops);
2313 2317  #endif
2314 2318  
2315 2319          return (error);
2316 2320  }
2317 2321  
2318 2322  /*
2319 2323   * Free the vnodeops created as a result of vn_make_ops()
2320 2324   */
2321 2325  void
2322 2326  vn_freevnodeops(vnodeops_t *vnops)
2323 2327  {
2324 2328          kmem_free(vnops, sizeof (vnodeops_t));
2325 2329  }
2326 2330  
2327 2331  /*
2328 2332   * Vnode cache.
2329 2333   */
2330 2334  
/*
 * kmem cache constructor for vnodes: set up the synchronization
 * objects and pointer fields that persist for the cached lifetime of
 * the buffer.  Paired with vn_cache_destructor().
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = vn_vpath_empty;
	vp->v_path_stamp = 0;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	return (0);
}
2352 2356  
/*
 * kmem cache destructor: tear down the synchronization objects in the
 * reverse of the order vn_cache_constructor() created them.
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}
2366 2370  
/*
 * Create the global vnode kmem cache used by vn_alloc()/vn_free().
 */
void
vn_create_cache(void)
{
	/*
	 * Sanity check that VNODE_ALIGN_LOG2 agrees with the rounded-up
	 * size of a vnode.
	 */
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2377 2381  
/*
 * Destroy the vnode cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2383 2387  
2384 2388  /*
2385 2389   * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2386 2390   * cached by the file system and vnodes remain associated.
2387 2391   */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(vp->v_pages == NULL);
	VERIFY(vp->v_path != NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	/* Release any cached path and reset to the shared empty string. */
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}
	vp->v_path_stamp = 0;

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
2427 2431  
2428 2432  /*
2429 2433   * Used to reset the vnode fields including those that are directly accessible
2430 2434   * as well as those which require an accessor function.
2431 2435   *
2432 2436   * Does not initialize:
2433 2437   *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2434 2438   *      v_data (since FS-nodes and vnodes point to each other and should
2435 2439   *              be updated simultaneously)
2436 2440   *      v_op (in case someone needs to make a VOP call on this object)
2437 2441   */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;	/* fresh vnodes start with one hold */
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	vp->v_pages = NULL;

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/*
	 * In a few specific instances, vn_reinit() is used to initialize
	 * locally defined vnode_t instances.  Lacking the construction offered
	 * by vn_alloc(), these vnodes require v_path initialization.
	 */
	if (vp->v_path == NULL) {
		vp->v_path = vn_vpath_empty;
	}

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
2469 2473  
2470 2474  vnode_t *
2471 2475  vn_alloc(int kmflag)
2472 2476  {
2473 2477          vnode_t *vp;
2474 2478  
2475 2479          vp = kmem_cache_alloc(vn_cache, kmflag);
2476 2480  
2477 2481          if (vp != NULL) {
2478 2482                  vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2479 2483                  vp->v_fopdata = NULL;
2480 2484                  vn_reinit(vp);
2481 2485          }
2482 2486  
2483 2487          return (vp);
2484 2488  }
2485 2489  
/*
 * Return a vnode to the cache, releasing any path, FEM, and VSD state
 * it still carries.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	VERIFY(vp->v_path != NULL);
	if (vp->v_path != vn_vpath_empty) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = vn_vpath_empty;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2521 2525  
2522 2526  /*
2523 2527   * vnode status changes, should define better states than 1, 0.
2524 2528   */
2525 2529  void
2526 2530  vn_reclaim(vnode_t *vp)
2527 2531  {
2528 2532          vfs_t   *vfsp = vp->v_vfsp;
2529 2533  
2530 2534          if (vfsp == NULL ||
2531 2535              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2532 2536                  return;
2533 2537          }
2534 2538          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2535 2539  }
2536 2540  
2537 2541  void
2538 2542  vn_idle(vnode_t *vp)
2539 2543  {
2540 2544          vfs_t   *vfsp = vp->v_vfsp;
2541 2545  
2542 2546          if (vfsp == NULL ||
2543 2547              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2544 2548                  return;
2545 2549          }
2546 2550          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2547 2551  }
2548 2552  void
2549 2553  vn_exists(vnode_t *vp)
2550 2554  {
2551 2555          vfs_t   *vfsp = vp->v_vfsp;
2552 2556  
2553 2557          if (vfsp == NULL ||
2554 2558              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2555 2559                  return;
2556 2560          }
2557 2561          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2558 2562  }
2559 2563  
2560 2564  void
2561 2565  vn_invalid(vnode_t *vp)
2562 2566  {
2563 2567          vfs_t   *vfsp = vp->v_vfsp;
2564 2568  
2565 2569          if (vfsp == NULL ||
2566 2570              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2567 2571                  return;
2568 2572          }
2569 2573          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2570 2574  }
2571 2575  
2572 2576  /* Vnode event notification */
2573 2577  
2574 2578  int
2575 2579  vnevent_support(vnode_t *vp, caller_context_t *ct)
2576 2580  {
2577 2581          if (vp == NULL)
2578 2582                  return (EINVAL);
2579 2583  
2580 2584          return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2581 2585  }
2582 2586  
2583 2587  void
2584 2588  vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2585 2589  {
2586 2590          if (vp == NULL || vp->v_femhead == NULL) {
2587 2591                  return;
2588 2592          }
2589 2593          (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2590 2594  }
2591 2595  
2592 2596  void
2593 2597  vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2594 2598      caller_context_t *ct)
2595 2599  {
2596 2600          if (vp == NULL || vp->v_femhead == NULL) {
2597 2601                  return;
2598 2602          }
2599 2603          (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2600 2604  }
2601 2605  
2602 2606  void
2603 2607  vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
2604 2608  {
2605 2609          if (vp == NULL || vp->v_femhead == NULL) {
2606 2610                  return;
2607 2611          }
2608 2612          (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2609 2613  }
2610 2614  
2611 2615  void
2612 2616  vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2613 2617  {
2614 2618          if (vp == NULL || vp->v_femhead == NULL) {
2615 2619                  return;
2616 2620          }
2617 2621          (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2618 2622  }
2619 2623  
2620 2624  void
2621 2625  vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2622 2626  {
2623 2627          if (vp == NULL || vp->v_femhead == NULL) {
2624 2628                  return;
2625 2629          }
2626 2630          (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2627 2631  }
2628 2632  
2629 2633  void
2630 2634  vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2631 2635      caller_context_t *ct)
2632 2636  {
2633 2637          if (vp == NULL || vp->v_femhead == NULL) {
2634 2638                  return;
2635 2639          }
2636 2640          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2637 2641  }
2638 2642  
2639 2643  void
2640 2644  vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2641 2645      caller_context_t *ct)
2642 2646  {
2643 2647          if (vp == NULL || vp->v_femhead == NULL) {
2644 2648                  return;
2645 2649          }
2646 2650          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2647 2651  }
2648 2652  
2649 2653  void
2650 2654  vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2651 2655      caller_context_t *ct)
2652 2656  {
2653 2657          if (vp == NULL || vp->v_femhead == NULL) {
2654 2658                  return;
2655 2659          }
2656 2660          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2657 2661  }
2658 2662  
2659 2663  void
2660 2664  vnevent_create(vnode_t *vp, caller_context_t *ct)
2661 2665  {
2662 2666          if (vp == NULL || vp->v_femhead == NULL) {
2663 2667                  return;
2664 2668          }
2665 2669          (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2666 2670  }
2667 2671  
2668 2672  void
2669 2673  vnevent_link(vnode_t *vp, caller_context_t *ct)
2670 2674  {
2671 2675          if (vp == NULL || vp->v_femhead == NULL) {
2672 2676                  return;
2673 2677          }
2674 2678          (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2675 2679  }
2676 2680  
2677 2681  void
2678 2682  vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2679 2683  {
2680 2684          if (vp == NULL || vp->v_femhead == NULL) {
2681 2685                  return;
2682 2686          }
2683 2687          (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2684 2688  }
2685 2689  
2686 2690  void
2687 2691  vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2688 2692  {
2689 2693          if (vp == NULL || vp->v_femhead == NULL) {
2690 2694                  return;
2691 2695          }
2692 2696          (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2693 2697  }
2694 2698  
2695 2699  /*
2696 2700   * Vnode accessors.
2697 2701   */
2698 2702  
2699 2703  int
2700 2704  vn_is_readonly(vnode_t *vp)
2701 2705  {
2702 2706          return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2703 2707  }
2704 2708  
2705 2709  int
2706 2710  vn_has_flocks(vnode_t *vp)
2707 2711  {
2708 2712          return (vp->v_filocks != NULL);
2709 2713  }
2710 2714  
2711 2715  int
2712 2716  vn_has_mandatory_locks(vnode_t *vp, int mode)
2713 2717  {
2714 2718          return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2715 2719  }
2716 2720  
2717 2721  int
2718 2722  vn_has_cached_data(vnode_t *vp)
2719 2723  {
2720 2724          return (vp->v_pages != NULL);
2721 2725  }
2722 2726  
2723 2727  /*
2724 2728   * Return 0 if the vnode in question shouldn't be permitted into a zone via
2725 2729   * zone_enter(2).
2726 2730   */
2727 2731  int
2728 2732  vn_can_change_zones(vnode_t *vp)
2729 2733  {
2730 2734          struct vfssw *vswp;
2731 2735          int allow = 1;
2732 2736          vnode_t *rvp;
2733 2737  
2734 2738          if (nfs_global_client_only != 0)
2735 2739                  return (1);
2736 2740  
2737 2741          /*
2738 2742           * We always want to look at the underlying vnode if there is one.
2739 2743           */
2740 2744          if (VOP_REALVP(vp, &rvp, NULL) != 0)
2741 2745                  rvp = vp;
2742 2746          /*
2743 2747           * Some pseudo filesystems (including doorfs) don't actually register
2744 2748           * their vfsops_t, so the following may return NULL; we happily let
2745 2749           * such vnodes switch zones.
2746 2750           */
2747 2751          vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2748 2752          if (vswp != NULL) {
2749 2753                  if (vswp->vsw_flag & VSW_NOTZONESAFE)
2750 2754                          allow = 0;
2751 2755                  vfs_unrefvfssw(vswp);
2752 2756          }
2753 2757          return (allow);
2754 2758  }
2755 2759  
2756 2760  /*
2757 2761   * Return nonzero if the vnode is a mount point, zero if not.
2758 2762   */
2759 2763  int
2760 2764  vn_ismntpt(vnode_t *vp)
2761 2765  {
2762 2766          return (vp->v_vfsmountedhere != NULL);
2763 2767  }
2764 2768  
/*
 * Retrieve the vfs (if any) mounted on this vnode; NULL when the vnode
 * is not a mount point.
 */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}
2771 2775  
2772 2776  /*
2773 2777   * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2774 2778   */
2775 2779  int
2776 2780  vn_in_dnlc(vnode_t *vp)
2777 2781  {
2778 2782          return (vp->v_count_dnlc > 0);
2779 2783  }
2780 2784  
2781 2785  /*
2782 2786   * vn_has_other_opens() checks whether a particular file is opened by more than
2783 2787   * just the caller and whether the open is for read and/or write.
2784 2788   * This routine is for calling after the caller has already called VOP_OPEN()
2785 2789   * and the caller wishes to know if they are the only one with it open for
2786 2790   * the mode(s) specified.
2787 2791   *
2788 2792   * Vnode counts are only kept on regular files (v_type=VREG).
2789 2793   */
2790 2794  int
2791 2795  vn_has_other_opens(
2792 2796          vnode_t *vp,
2793 2797          v_mode_t mode)
2794 2798  {
2795 2799  
2796 2800          ASSERT(vp != NULL);
2797 2801  
2798 2802          switch (mode) {
2799 2803          case V_WRITE:
2800 2804                  if (vp->v_wrcnt > 1)
2801 2805                          return (V_TRUE);
2802 2806                  break;
2803 2807          case V_RDORWR:
2804 2808                  if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2805 2809                          return (V_TRUE);
2806 2810                  break;
2807 2811          case V_RDANDWR:
2808 2812                  if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2809 2813                          return (V_TRUE);
2810 2814                  break;
2811 2815          case V_READ:
2812 2816                  if (vp->v_rdcnt > 1)
2813 2817                          return (V_TRUE);
2814 2818                  break;
2815 2819          }
2816 2820  
2817 2821          return (V_FALSE);
2818 2822  }
2819 2823  
2820 2824  /*
2821 2825   * vn_is_opened() checks whether a particular file is opened and
2822 2826   * whether the open is for read and/or write.
2823 2827   *
2824 2828   * Vnode counts are only kept on regular files (v_type=VREG).
2825 2829   */
2826 2830  int
2827 2831  vn_is_opened(
2828 2832          vnode_t *vp,
2829 2833          v_mode_t mode)
2830 2834  {
2831 2835  
2832 2836          ASSERT(vp != NULL);
2833 2837  
2834 2838          switch (mode) {
2835 2839          case V_WRITE:
2836 2840                  if (vp->v_wrcnt)
2837 2841                          return (V_TRUE);
2838 2842                  break;
2839 2843          case V_RDANDWR:
2840 2844                  if (vp->v_rdcnt && vp->v_wrcnt)
2841 2845                          return (V_TRUE);
2842 2846                  break;
2843 2847          case V_RDORWR:
2844 2848                  if (vp->v_rdcnt || vp->v_wrcnt)
2845 2849                          return (V_TRUE);
2846 2850                  break;
2847 2851          case V_READ:
2848 2852                  if (vp->v_rdcnt)
2849 2853                          return (V_TRUE);
2850 2854                  break;
2851 2855          }
2852 2856  
2853 2857          return (V_FALSE);
2854 2858  }
2855 2859  
2856 2860  /*
2857 2861   * vn_is_mapped() checks whether a particular file is mapped and whether
2858 2862   * the file is mapped read and/or write.
2859 2863   */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	/* On 64-bit kernels plain loads of the 64-bit counts suffice. */
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}
2919 2923  
2920 2924  /*
2921 2925   * Set the operations vector for a vnode.
2922 2926   *
2923 2927   * FEM ensures that the v_femhead pointer is filled in before the
2924 2928   * v_op pointer is changed.  This means that if the v_femhead pointer
2925 2929   * is NULL, and the v_op field hasn't changed since before which checked
2926 2930   * the v_femhead pointer; then our update is ok - we are not racing with
2927 2931   * FEM.
2928 2932   */
/*
 * Install `vnodeops' as the operations vector for `vp'.
 *
 * The lockless fast path is only valid when FEM is not active on the
 * vnode: v_op is snapshotted, then v_femhead is checked, and the swap
 * is performed with compare-and-swap so that any concurrent FEM
 * installation is detected and the update is handed to fem_setvnops().
 */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
        vnodeops_t      *op;

        ASSERT(vp != NULL);
        ASSERT(vnodeops != NULL);

        /* Snapshot v_op before reading v_femhead (ordering per membar). */
        op = vp->v_op;
        membar_consumer();
        /*
         * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
         * the compare-and-swap on vp->v_op.  If either fails, then FEM is
         * in effect on the vnode and we need to have FEM deal with it.
         */
        if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
            op) {
                fem_setvnops(vp, vnodeops);
        }
}
2949 2953  
2950 2954  /*
2951 2955   * Retrieve the operations vector for a vnode
2952 2956   * As with vn_setops(above); make sure we aren't racing with FEM.
2953 2957   * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2954 2958   * make sense to the callers of this routine.
2955 2959   */
/*
 * Return the operations vector for `vp', guarding against a race with a
 * FEM installation (see vn_setops()).  If v_femhead is set, or v_op
 * changed between the two reads, the true vector is fetched via
 * fem_getvnops() since FEM substitutes an internal vnodeops.
 */
vnodeops_t *
vn_getops(vnode_t *vp)
{
        vnodeops_t      *op;

        ASSERT(vp != NULL);

        /* Snapshot v_op, then re-check it after reading v_femhead. */
        op = vp->v_op;
        membar_consumer();
        if (vp->v_femhead == NULL && op == vp->v_op) {
                return (op);
        } else {
                return (fem_getvnops(vp));
        }
}
2971 2975  
2972 2976  /*
2973 2977   * Returns non-zero (1) if the vnodeops matches that of the vnode.
2974 2978   * Returns zero (0) if not.
2975 2979   */
2976 2980  int
2977 2981  vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2978 2982  {
2979 2983          return (vn_getops(vp) == vnodeops);
2980 2984  }
2981 2985  
2982 2986  /*
2983 2987   * Returns non-zero (1) if the specified operation matches the
2984 2988   * corresponding operation for that the vnode.
2985 2989   * Returns zero (0) if not.
2986 2990   */
2987 2991  
2988 2992  #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2989 2993  
2990 2994  int
2991 2995  vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2992 2996  {
2993 2997          const fs_operation_trans_def_t *otdp;
2994 2998          fs_generic_func_p *loc = NULL;
2995 2999          vnodeops_t      *vop = vn_getops(vp);
2996 3000  
2997 3001          ASSERT(vopname != NULL);
2998 3002  
2999 3003          for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3000 3004                  if (MATCHNAME(otdp->name, vopname)) {
3001 3005                          loc = (fs_generic_func_p *)
3002 3006                              ((char *)(vop) + otdp->offset);
3003 3007                          break;
3004 3008                  }
3005 3009          }
3006 3010  
3007 3011          return ((loc != NULL) && (*loc == funcp));
3008 3012  }
3009 3013  
3010 3014  /*
3011 3015   * fs_new_caller_id() needs to return a unique ID on a given local system.
3012 3016   * The IDs do not need to survive across reboots.  These are primarily
3013 3017   * used so that (FEM) monitors can detect particular callers (such as
3014 3018   * the NFS server) to a given vnode/vfs operation.
3015 3019   */
3016 3020  u_longlong_t
3017 3021  fs_new_caller_id()
3018 3022  {
3019 3023          static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3020 3024  
3021 3025          return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3022 3026  }
3023 3027  
3024 3028  /*
3025 3029   * The value stored in v_path is relative to rootdir, located in the global
3026 3030   * zone.  Zones or chroot environments which reside deeper inside the VFS
3027 3031   * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3028 3032   * what lies below their perceived root.  In order to keep v_path usable for
3029 3033   * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3030 3034   *
3031 3035   * An upper bound of max_vnode_path is placed upon v_path allocations to
3032 3036   * prevent the system from going too wild at the behest of pathological
3033 3037   * behavior from the operator.
3034 3038   */
3035 3039  size_t max_vnode_path = 4 * MAXPATHLEN;
3036 3040  
3037 3041  
3038 3042  void
3039 3043  vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3040 3044  {
3041 3045          char *buf;
3042 3046  
3043 3047          mutex_enter(&vp->v_lock);
3044 3048          /*
3045 3049           * If the snapshot of v_path_stamp passed in via compare_stamp does not
3046 3050           * match the present value on the vnode, it indicates that subsequent
3047 3051           * changes have occurred.  The v_path value is not cleared in this case
3048 3052           * since the new value may be valid.
3049 3053           */
3050 3054          if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3051 3055                  mutex_exit(&vp->v_lock);
3052 3056                  return;
3053 3057          }
3054 3058          buf = vp->v_path;
3055 3059          vp->v_path = vn_vpath_empty;
3056 3060          vp->v_path_stamp = 0;
3057 3061          mutex_exit(&vp->v_lock);
3058 3062          if (buf != vn_vpath_empty) {
3059 3063                  kmem_free(buf, strlen(buf) + 1);
3060 3064          }
3061 3065  }
3062 3066  
3063 3067  static void
3064 3068  vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3065 3069      boolean_t is_rename)
3066 3070  {
3067 3071          char *buf, *oldbuf;
3068 3072          hrtime_t pstamp;
3069 3073          size_t baselen, buflen = 0;
3070 3074  
3071 3075          /* Handle the vn_setpath_str case. */
3072 3076          if (pvp == NULL) {
3073 3077                  if (len + 1 > max_vnode_path) {
3074 3078                          DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3075 3079                              vnode_t *, vp, char *, name, size_t, len + 1);
3076 3080                          return;
3077 3081                  }
3078 3082                  buf = kmem_alloc(len + 1, KM_SLEEP);
3079 3083                  bcopy(name, buf, len);
3080 3084                  buf[len] = '\0';
3081 3085  
3082 3086                  mutex_enter(&vp->v_lock);
3083 3087                  oldbuf = vp->v_path;
3084 3088                  vp->v_path = buf;
3085 3089                  vp->v_path_stamp = gethrtime();
3086 3090                  mutex_exit(&vp->v_lock);
3087 3091                  if (oldbuf != vn_vpath_empty) {
3088 3092                          kmem_free(oldbuf, strlen(oldbuf) + 1);
3089 3093                  }
3090 3094                  return;
3091 3095          }
3092 3096  
3093 3097          /* Take snapshot of parent dir */
3094 3098          mutex_enter(&pvp->v_lock);
3095 3099  
3096 3100          if ((pvp->v_flag & VTRAVERSE) != 0) {
3097 3101                  /*
3098 3102                   * When the parent vnode has VTRAVERSE set in its flags, normal
3099 3103                   * assumptions about v_path calculation no longer apply.  The
3100 3104                   * primary situation where this occurs is via the VFS tricks
3101 3105                   * which procfs plays in order to allow /proc/PID/(root|cwd) to
3102 3106                   * yield meaningful results.
3103 3107                   *
3104 3108                   * When this flag is set, v_path on the child must not be
3105 3109                   * updated since the calculated value is likely to be
3106 3110                   * incorrect, given the current context.
3107 3111                   */
3108 3112                  mutex_exit(&pvp->v_lock);
3109 3113                  return;
3110 3114          }
3111 3115  
3112 3116  retrybuf:
3113 3117          if (pvp->v_path == vn_vpath_empty) {
3114 3118                  /*
3115 3119                   * Without v_path from the parent directory, generating a child
3116 3120                   * path from the name is impossible.
3117 3121                   */
3118 3122                  if (len > 0) {
3119 3123                          pstamp = pvp->v_path_stamp;
3120 3124                          mutex_exit(&pvp->v_lock);
3121 3125                          vn_clearpath(vp, pstamp);
3122 3126                          return;
3123 3127                  }
3124 3128  
3125 3129                  /*
3126 3130                   * The only feasible case here is where a NUL lookup is being
3127 3131                   * performed on rootdir prior to its v_path being populated.
3128 3132                   */
3129 3133                  ASSERT(pvp->v_path_stamp == 0);
3130 3134                  baselen = 0;
3131 3135                  pstamp = 0;
3132 3136          } else {
3133 3137                  pstamp = pvp->v_path_stamp;
3134 3138                  baselen = strlen(pvp->v_path);
3135 3139                  /* ignore a trailing slash if present */
3136 3140                  if (pvp->v_path[baselen - 1] == '/') {
3137 3141                          /* This should only the be case for rootdir */
3138 3142                          ASSERT(baselen == 1 && pvp == rootdir);
3139 3143                          baselen--;
3140 3144                  }
3141 3145          }
3142 3146          mutex_exit(&pvp->v_lock);
3143 3147  
3144 3148          if (buflen != 0) {
3145 3149                  /* Free the existing (mis-sized) buffer in case of retry */
3146 3150                  kmem_free(buf, buflen);
3147 3151          }
3148 3152          /* base, '/', name and trailing NUL */
3149 3153          buflen = baselen + len + 2;
3150 3154          if (buflen > max_vnode_path) {
3151 3155                  DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3152 3156                      vnode_t *, vp, char *, name, size_t, buflen);
3153 3157                  return;
3154 3158          }
3155 3159          buf = kmem_alloc(buflen, KM_SLEEP);
3156 3160  
3157 3161          mutex_enter(&pvp->v_lock);
3158 3162          if (pvp->v_path_stamp != pstamp) {
3159 3163                  size_t vlen;
3160 3164  
3161 3165                  /*
3162 3166                   * Since v_path_stamp changed on the parent, it is likely that
3163 3167                   * v_path has been altered as well.  If the length does not
3164 3168                   * exactly match what was previously measured, the buffer
3165 3169                   * allocation must be repeated for proper sizing.
3166 3170                   */
3167 3171                  if (pvp->v_path == vn_vpath_empty) {
3168 3172                          /* Give up if parent lack v_path */
3169 3173                          mutex_exit(&pvp->v_lock);
3170 3174                          kmem_free(buf, buflen);
3171 3175                          return;
3172 3176                  }
3173 3177                  vlen = strlen(pvp->v_path);
3174 3178                  if (pvp->v_path[vlen - 1] == '/') {
3175 3179                          vlen--;
3176 3180                  }
3177 3181                  if (vlen != baselen) {
3178 3182                          goto retrybuf;
3179 3183                  }
3180 3184          }
3181 3185          bcopy(pvp->v_path, buf, baselen);
3182 3186          mutex_exit(&pvp->v_lock);
3183 3187  
3184 3188          buf[baselen] = '/';
3185 3189          baselen++;
3186 3190          bcopy(name, &buf[baselen], len + 1);
3187 3191  
3188 3192          mutex_enter(&vp->v_lock);
3189 3193          if (vp->v_path_stamp == 0) {
3190 3194                  /* never-visited vnode can inherit stamp from parent */
3191 3195                  ASSERT(vp->v_path == vn_vpath_empty);
3192 3196                  vp->v_path_stamp = pstamp;
3193 3197                  vp->v_path = buf;
3194 3198                  mutex_exit(&vp->v_lock);
3195 3199          } else if (vp->v_path_stamp < pstamp || is_rename) {
3196 3200                  /*
3197 3201                   * Install the updated path and stamp, ensuring that the v_path
3198 3202                   * pointer is valid at all times for dtrace.
3199 3203                   */
3200 3204                  oldbuf = vp->v_path;
3201 3205                  vp->v_path = buf;
3202 3206                  vp->v_path_stamp = gethrtime();
3203 3207                  mutex_exit(&vp->v_lock);
3204 3208                  kmem_free(oldbuf, strlen(oldbuf) + 1);
3205 3209          } else {
3206 3210                  /*
3207 3211                   * If the timestamp matches or is greater, it means another
3208 3212                   * thread performed the update first while locks were dropped
3209 3213                   * here to make the allocation.  We defer to the newer value.
3210 3214                   */
3211 3215                  mutex_exit(&vp->v_lock);
3212 3216                  kmem_free(buf, buflen);
3213 3217          }
3214 3218          ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3215 3219  }
3216 3220  
3217 3221  void
3218 3222  vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3219 3223  {
3220 3224          size_t len;
3221 3225  
3222 3226          /*
3223 3227           * If the parent is older or empty, there's nothing further to do.
3224 3228           */
3225 3229          if (pvp->v_path == vn_vpath_empty ||
3226 3230              pvp->v_path_stamp <= vp->v_path_stamp) {
3227 3231                  return;
3228 3232          }
3229 3233  
3230 3234          /*
3231 3235           * Given the lack of appropriate context, meaningful updates to v_path
3232 3236           * cannot be made for during lookups for the '.' or '..' entries.
3233 3237           */
3234 3238          len = strlen(name);
3235 3239          if (len == 0 || (len == 1 && name[0] == '.') ||
3236 3240              (len == 2 && name[0] == '.' && name[1] == '.')) {
3237 3241                  return;
3238 3242          }
3239 3243  
3240 3244          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3241 3245  }
3242 3246  
3243 3247  /*
3244 3248   * Given a starting vnode and a path, updates the path in the target vnode in
3245 3249   * a safe manner.  If the vnode already has path information embedded, then the
3246 3250   * cached path is left untouched.
3247 3251   */
3248 3252  /* ARGSUSED */
3249 3253  void
3250 3254  vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3251 3255      size_t len)
3252 3256  {
3253 3257          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3254 3258  }
3255 3259  
3256 3260  /*
3257 3261   * Sets the path to the vnode to be the given string, regardless of current
3258 3262   * context.  The string must be a complete path from rootdir.  This is only used
3259 3263   * by fsop_root() for setting the path based on the mountpoint.
3260 3264   */
3261 3265  void
3262 3266  vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3263 3267  {
3264 3268          vn_setpath_common(NULL, vp, str, len, B_FALSE);
3265 3269  }
3266 3270  
3267 3271  /*
3268 3272   * Called from within filesystem's vop_rename() to handle renames once the
3269 3273   * target vnode is available.
3270 3274   */
3271 3275  void
3272 3276  vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3273 3277  {
3274 3278          vn_setpath_common(pvp, vp, name, len, B_TRUE);
3275 3279  }
3276 3280  
3277 3281  /*
3278 3282   * Similar to vn_setpath_str(), this function sets the path of the destination
3279 3283   * vnode to the be the same as the source vnode.
3280 3284   */
/*
 * Copy the cached v_path (and its stamp) from `src' to `dst'.  The copy
 * is skipped when src has no cached path or dst already has one.  Since
 * kmem_alloc(KM_SLEEP) cannot be called with v_lock held, src is
 * re-checked after the allocation and the copy abandoned if its path
 * changed size in the meantime.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
        char *buf;
        hrtime_t stamp;
        size_t buflen;

        mutex_enter(&src->v_lock);
        if (src->v_path == vn_vpath_empty) {
                mutex_exit(&src->v_lock);
                return;
        }
        buflen = strlen(src->v_path) + 1;
        mutex_exit(&src->v_lock);

        /* Allocate with no locks held. */
        buf = kmem_alloc(buflen, KM_SLEEP);

        mutex_enter(&src->v_lock);
        /* Re-check: v_path may have been cleared or resized while unlocked. */
        if (src->v_path == vn_vpath_empty ||
            strlen(src->v_path) + 1 != buflen) {
                mutex_exit(&src->v_lock);
                kmem_free(buf, buflen);
                return;
        }
        bcopy(src->v_path, buf, buflen);
        stamp = src->v_path_stamp;
        mutex_exit(&src->v_lock);

        mutex_enter(&dst->v_lock);
        /* An existing destination path is left untouched. */
        if (dst->v_path != vn_vpath_empty) {
                mutex_exit(&dst->v_lock);
                kmem_free(buf, buflen);
                return;
        }
        dst->v_path = buf;
        dst->v_path_stamp = stamp;
        mutex_exit(&dst->v_lock);
}
3319 3323  
3320 3324  
3321 3325  /*
3322 3326   * XXX Private interface for segvn routines that handle vnode
3323 3327   * large page segments.
3324 3328   *
3325 3329   * return 1 if vp's file system VOP_PAGEIO() implementation
3326 3330   * can be safely used instead of VOP_GETPAGE() for handling
3327 3331   * pagefaults against regular non swap files. VOP_PAGEIO()
3328 3332   * interface is considered safe here if its implementation
3329 3333   * is very close to VOP_GETPAGE() implementation.
3330 3334   * e.g. It zero's out the part of the page beyond EOF. Doesn't
3331 3335   * panic if there're file holes but instead returns an error.
3332 3336   * Doesn't assume file won't be changed by user writes, etc.
3333 3337   *
3334 3338   * return 0 otherwise.
3335 3339   *
3336 3340   * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3337 3341   */
3338 3342  int
3339 3343  vn_vmpss_usepageio(vnode_t *vp)
3340 3344  {
3341 3345          vfs_t   *vfsp = vp->v_vfsp;
3342 3346          char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3343 3347          char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3344 3348          char **fsok = pageio_ok_fss;
3345 3349  
3346 3350          if (fsname == NULL) {
3347 3351                  return (0);
3348 3352          }
3349 3353  
3350 3354          for (; *fsok; fsok++) {
3351 3355                  if (strcmp(*fsok, fsname) == 0) {
3352 3356                          return (1);
3353 3357                  }
3354 3358          }
3355 3359          return (0);
3356 3360  }
3357 3361  
3358 3362  /* VOP_XXX() macros call the corresponding fop_xxx() function */
3359 3363  
/*
 * VOP_OPEN() entry point.  In addition to dispatching to the
 * filesystem's vop_open, this maintains the per-vnode open counts
 * (v_rdcnt/v_wrcnt) — including migrating them when the filesystem
 * returns a different vnode in *vpp — and rolls them back on error.
 */
int
fop_open(
        vnode_t **vpp,
        int mode,
        cred_t *cr,
        caller_context_t *ct)
{
        int ret;
        /* Saved copy: vop_open may replace *vpp (or trash it on error). */
        vnode_t *vp = *vpp;

        VN_HOLD(vp);
        /*
         * Adding to the vnode counts before calling open
         * avoids the need for a mutex. It circumvents a race
         * condition where a query made on the vnode counts results in a
         * false negative. The inquirer goes away believing the file is
         * not open when there is an open on the file already under way.
         *
         * The counts are meant to prevent NFS from granting a delegation
         * when it would be dangerous to do so.
         *
         * The vnode counts are only kept on regular files
         */
        if ((*vpp)->v_type == VREG) {
                if (mode & FREAD)
                        atomic_inc_32(&(*vpp)->v_rdcnt);
                if (mode & FWRITE)
                        atomic_inc_32(&(*vpp)->v_wrcnt);
        }

        VOPXID_MAP_CR(vp, cr);

        ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

        if (ret) {
                /*
                 * Use the saved vp just in case the vnode ptr got trashed
                 * by the error.
                 */
                VOPSTATS_UPDATE(vp, open);
                if ((vp->v_type == VREG) && (mode & FREAD))
                        atomic_dec_32(&vp->v_rdcnt);
                if ((vp->v_type == VREG) && (mode & FWRITE))
                        atomic_dec_32(&vp->v_wrcnt);
        } else {
                /*
                 * Some filesystems will return a different vnode,
                 * but the same path was still used to open it.
                 * So if we do change the vnode and need to
                 * copy over the path, do so here, rather than special
                 * casing each filesystem. Adjust the vnode counts to
                 * reflect the vnode switch.
                 */
                VOPSTATS_UPDATE(*vpp, open);
                if (*vpp != vp) {
                        /* Move the counts from the old vnode to the new one. */
                        vn_copypath(vp, *vpp);
                        if (((*vpp)->v_type == VREG) && (mode & FREAD))
                                atomic_inc_32(&(*vpp)->v_rdcnt);
                        if ((vp->v_type == VREG) && (mode & FREAD))
                                atomic_dec_32(&vp->v_rdcnt);
                        if (((*vpp)->v_type == VREG) && (mode & FWRITE))
                                atomic_inc_32(&(*vpp)->v_wrcnt);
                        if ((vp->v_type == VREG) && (mode & FWRITE))
                                atomic_dec_32(&vp->v_wrcnt);
                }
        }
        VN_RELE(vp);
        return (ret);
}
3429 3433  
3430 3434  int
3431 3435  fop_close(
3432 3436          vnode_t *vp,
3433 3437          int flag,
3434 3438          int count,
3435 3439          offset_t offset,
3436 3440          cred_t *cr,
3437 3441          caller_context_t *ct)
3438 3442  {
3439 3443          int err;
3440 3444  
3441 3445          VOPXID_MAP_CR(vp, cr);
3442 3446  
3443 3447          err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3444 3448          VOPSTATS_UPDATE(vp, close);
3445 3449          /*
3446 3450           * Check passed in count to handle possible dups. Vnode counts are only
3447 3451           * kept on regular files
3448 3452           */
3449 3453          if ((vp->v_type == VREG) && (count == 1))  {
3450 3454                  if (flag & FREAD) {
3451 3455                          ASSERT(vp->v_rdcnt > 0);
3452 3456                          atomic_dec_32(&vp->v_rdcnt);
3453 3457                  }
3454 3458                  if (flag & FWRITE) {
3455 3459                          ASSERT(vp->v_wrcnt > 0);
3456 3460                          atomic_dec_32(&vp->v_wrcnt);
3457 3461                  }
3458 3462          }
3459 3463          return (err);
3460 3464  }
3461 3465  
3462 3466  int
3463 3467  fop_read(
3464 3468          vnode_t *vp,
3465 3469          uio_t *uiop,
3466 3470          int ioflag,
3467 3471          cred_t *cr,
3468 3472          caller_context_t *ct)
3469 3473  {
3470 3474          int     err;
3471 3475          ssize_t resid_start = uiop->uio_resid;
3472 3476  
3473 3477          VOPXID_MAP_CR(vp, cr);
3474 3478  
3475 3479          err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3476 3480          VOPSTATS_UPDATE_IO(vp, read,
3477 3481              read_bytes, (resid_start - uiop->uio_resid));
3478 3482          return (err);
3479 3483  }
3480 3484  
3481 3485  int
3482 3486  fop_write(
3483 3487          vnode_t *vp,
3484 3488          uio_t *uiop,
3485 3489          int ioflag,
3486 3490          cred_t *cr,
3487 3491          caller_context_t *ct)
3488 3492  {
3489 3493          int     err;
3490 3494          ssize_t resid_start = uiop->uio_resid;
3491 3495  
3492 3496          VOPXID_MAP_CR(vp, cr);
3493 3497  
3494 3498          err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3495 3499          VOPSTATS_UPDATE_IO(vp, write,
3496 3500              write_bytes, (resid_start - uiop->uio_resid));
3497 3501          return (err);
3498 3502  }
3499 3503  
3500 3504  int
3501 3505  fop_ioctl(
3502 3506          vnode_t *vp,
3503 3507          int cmd,
3504 3508          intptr_t arg,
3505 3509          int flag,
3506 3510          cred_t *cr,
3507 3511          int *rvalp,
3508 3512          caller_context_t *ct)
3509 3513  {
3510 3514          int     err;
3511 3515  
3512 3516          VOPXID_MAP_CR(vp, cr);
3513 3517  
3514 3518          err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3515 3519          VOPSTATS_UPDATE(vp, ioctl);
3516 3520          return (err);
3517 3521  }
3518 3522  
3519 3523  int
3520 3524  fop_setfl(
3521 3525          vnode_t *vp,
3522 3526          int oflags,
3523 3527          int nflags,
3524 3528          cred_t *cr,
3525 3529          caller_context_t *ct)
3526 3530  {
3527 3531          int     err;
3528 3532  
3529 3533          VOPXID_MAP_CR(vp, cr);
3530 3534  
3531 3535          err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3532 3536          VOPSTATS_UPDATE(vp, setfl);
3533 3537          return (err);
3534 3538  }
3535 3539  
3536 3540  int
3537 3541  fop_getattr(
3538 3542          vnode_t *vp,
3539 3543          vattr_t *vap,
3540 3544          int flags,
3541 3545          cred_t *cr,
3542 3546          caller_context_t *ct)
3543 3547  {
3544 3548          int     err;
3545 3549  
3546 3550          VOPXID_MAP_CR(vp, cr);
3547 3551  
3548 3552          /*
3549 3553           * If this file system doesn't understand the xvattr extensions
3550 3554           * then turn off the xvattr bit.
3551 3555           */
3552 3556          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3553 3557                  vap->va_mask &= ~AT_XVATTR;
3554 3558          }
3555 3559  
3556 3560          /*
3557 3561           * We're only allowed to skip the ACL check iff we used a 32 bit
3558 3562           * ACE mask with VOP_ACCESS() to determine permissions.
3559 3563           */
3560 3564          if ((flags & ATTR_NOACLCHECK) &&
3561 3565              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3562 3566                  return (EINVAL);
3563 3567          }
3564 3568          err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3565 3569          VOPSTATS_UPDATE(vp, getattr);
3566 3570          return (err);
3567 3571  }
3568 3572  
3569 3573  int
3570 3574  fop_setattr(
3571 3575          vnode_t *vp,
3572 3576          vattr_t *vap,
3573 3577          int flags,
3574 3578          cred_t *cr,
3575 3579          caller_context_t *ct)
3576 3580  {
3577 3581          int     err;
3578 3582  
3579 3583          VOPXID_MAP_CR(vp, cr);
3580 3584  
3581 3585          /*
3582 3586           * If this file system doesn't understand the xvattr extensions
3583 3587           * then turn off the xvattr bit.
3584 3588           */
3585 3589          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3586 3590                  vap->va_mask &= ~AT_XVATTR;
3587 3591          }
3588 3592  
3589 3593          /*
3590 3594           * We're only allowed to skip the ACL check iff we used a 32 bit
3591 3595           * ACE mask with VOP_ACCESS() to determine permissions.
3592 3596           */
3593 3597          if ((flags & ATTR_NOACLCHECK) &&
3594 3598              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3595 3599                  return (EINVAL);
3596 3600          }
3597 3601          err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3598 3602          VOPSTATS_UPDATE(vp, setattr);
3599 3603          return (err);
3600 3604  }
3601 3605  
3602 3606  int
3603 3607  fop_access(
3604 3608          vnode_t *vp,
3605 3609          int mode,
3606 3610          int flags,
3607 3611          cred_t *cr,
3608 3612          caller_context_t *ct)
3609 3613  {
3610 3614          int     err;
3611 3615  
3612 3616          if ((flags & V_ACE_MASK) &&
3613 3617              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3614 3618                  return (EINVAL);
3615 3619          }
3616 3620  
3617 3621          VOPXID_MAP_CR(vp, cr);
3618 3622  
3619 3623          err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3620 3624          VOPSTATS_UPDATE(vp, access);
3621 3625          return (err);
3622 3626  }
3623 3627  
/*
 * VOP_LOOKUP() entry point.  Validates case-insensitivity support,
 * routes extended-attribute directory lookups through
 * xattr_dir_lookup(), and on success updates the result vnode's cached
 * v_path from the parent.
 */
int
fop_lookup(
        vnode_t *dvp,
        char *nm,
        vnode_t **vpp,
        pathname_t *pnp,
        int flags,
        vnode_t *rdir,
        cred_t *cr,
        caller_context_t *ct,
        int *deflags,           /* Returned per-dirent flags */
        pathname_t *ppnp)       /* Returned case-preserved name in directory */
{
        int ret;

        /*
         * If this file system doesn't support case-insensitive access
         * and said access is requested, fail quickly.  It is required
         * that if the vfs supports case-insensitive lookup, it also
         * supports extended dirent flags.
         */
        if (flags & FIGNORECASE &&
            (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
            vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
                return (EINVAL);

        VOPXID_MAP_CR(dvp, cr);

        /* xattr dir lookups are handled generically unless the fs opts in. */
        if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
                ret = xattr_dir_lookup(dvp, vpp, flags, cr);
        } else {
                ret = (*(dvp)->v_op->vop_lookup)
                    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
        }
        if (ret == 0 && *vpp) {
                VOPSTATS_UPDATE(*vpp, lookup);
                /* Opportunistically refresh the child's cached v_path. */
                vn_updatepath(dvp, *vpp, nm);
        }

        return (ret);
}
3665 3669  
/*
 * VOP_CREATE() entry point.  Rejects create-time ACLs and
 * case-insensitive access on filesystems that lack the corresponding
 * features, then dispatches to the filesystem and updates the new
 * vnode's cached v_path on success.
 */
int
fop_create(
        vnode_t *dvp,
        char *name,
        vattr_t *vap,
        vcexcl_t excl,
        int mode,
        vnode_t **vpp,
        cred_t *cr,
        int flags,
        caller_context_t *ct,
        vsecattr_t *vsecp)      /* ACL to set during create */
{
        int ret;

        /* A create-time ACL requires explicit filesystem support. */
        if (vsecp != NULL &&
            vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
                return (EINVAL);
        }
        /*
         * If this file system doesn't support case-insensitive access
         * and said access is requested, fail quickly.
         */
        if (flags & FIGNORECASE &&
            (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
            vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
                return (EINVAL);

        VOPXID_MAP_CR(dvp, cr);

        ret = (*(dvp)->v_op->vop_create)
            (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
        if (ret == 0 && *vpp) {
                VOPSTATS_UPDATE(*vpp, create);
                /* Cache the new entry's path under its parent. */
                vn_updatepath(dvp, *vpp, name);
        }

        return (ret);
}
3705 3709  
/*
 * Wrapper for VOP_REMOVE: remove the entry nm from directory dvp.
 * Rejects FIGNORECASE requests on file systems without case-insensitive
 * support.
 */
int
fop_remove(
	vnode_t *dvp,
	char *nm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, remove);
	return (err);
}
3731 3735  
/*
 * Wrapper for VOP_LINK: create hard link tnm in directory tdvp referring
 * to svp.  Feature checks are made against the *target* directory's vfs.
 */
int
fop_link(
	vnode_t *tdvp,
	vnode_t *svp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the target file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(tdvp, link);
	return (err);
}
3758 3762  
/*
 * Wrapper for VOP_RENAME: rename sdvp/snm to tdvp/tnm.
 *
 * NOTE(review): the FIGNORECASE feature test below inspects only the
 * source directory's vfs (sdvp), not tdvp — presumably because rename
 * is same-vfs only; confirm against VOP_RENAME's contract.
 */
int
fop_rename(
	vnode_t *sdvp,
	char *snm,
	vnode_t *tdvp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}
3787 3791  
/*
 * Wrapper for VOP_MKDIR: create directory dirname in dvp.
 * Mirrors fop_create's early EINVAL checks for ACL-on-create and
 * case-insensitive support; on success records vopstats against the new
 * directory vnode and updates its cached pathname.
 */
int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	/* ACL supplied but fs cannot apply one at create time. */
	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_mkdir)
	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
	/* Only touch *vpp when the fs actually returned a vnode. */
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		vn_updatepath(dvp, *vpp, dirname);
	}

	return (ret);
}
3825 3829  
/*
 * Wrapper for VOP_RMDIR: remove directory nm from dvp.  cdir is the
 * caller's current directory, passed through for the fs's "." / cwd
 * sanity checks.
 */
int
fop_rmdir(
	vnode_t *dvp,
	char *nm,
	vnode_t *cdir,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, rmdir);
	return (err);
}
3852 3856  
/*
 * Wrapper for VOP_READDIR: read directory entries from vp into uiop.
 * The byte count charged to vopstats is the uio residual consumed by
 * the underlying fs (resid_start - uio_resid).
 */
int
fop_readdir(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	int *eofp,
	caller_context_t *ct,
	int flags)
{
	int	err;
	ssize_t resid_start = uiop->uio_resid;	/* snapshot before the call */

	/*
	 * If this file system doesn't support retrieving directory
	 * entry flags and said access is requested, fail quickly.
	 */
	if (flags & V_RDDIR_ENTFLAGS &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
		return (EINVAL);

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
	VOPSTATS_UPDATE_IO(vp, readdir,
	    readdir_bytes, (resid_start - uiop->uio_resid));
	return (err);
}
3880 3884  
/*
 * Wrapper for VOP_SYMLINK: create symlink linkname in dvp pointing at
 * target.  If the fs supports reparse points and the target carries the
 * reparse tag prefix, the vattr is swapped for an xvattr marked as a
 * reparse point before the call.
 */
int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;	/* local copy used only for reparse marking */

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		/* fs_reparse_mark() returns 0 on success */
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}
3918 3922  
3919 3923  int
3920 3924  fop_readlink(
3921 3925          vnode_t *vp,
3922 3926          uio_t *uiop,
3923 3927          cred_t *cr,
3924 3928          caller_context_t *ct)
3925 3929  {
3926 3930          int     err;
3927 3931  
3928 3932          VOPXID_MAP_CR(vp, cr);
3929 3933  
3930 3934          err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3931 3935          VOPSTATS_UPDATE(vp, readlink);
3932 3936          return (err);
3933 3937  }
3934 3938  
3935 3939  int
3936 3940  fop_fsync(
3937 3941          vnode_t *vp,
3938 3942          int syncflag,
3939 3943          cred_t *cr,
3940 3944          caller_context_t *ct)
3941 3945  {
3942 3946          int     err;
3943 3947  
3944 3948          VOPXID_MAP_CR(vp, cr);
3945 3949  
3946 3950          err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3947 3951          VOPSTATS_UPDATE(vp, fsync);
3948 3952          return (err);
3949 3953  }
3950 3954  
/*
 * Wrapper for VOP_INACTIVE: called when the last reference to vp is
 * released.  The fs may free the vnode inside the call, so vopstats
 * MUST be recorded first — do not reorder.
 */
void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}
3964 3968  
3965 3969  int
3966 3970  fop_fid(
3967 3971          vnode_t *vp,
3968 3972          fid_t *fidp,
3969 3973          caller_context_t *ct)
3970 3974  {
3971 3975          int     err;
3972 3976  
3973 3977          err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3974 3978          VOPSTATS_UPDATE(vp, fid);
3975 3979          return (err);
3976 3980  }
3977 3981  
3978 3982  int
3979 3983  fop_rwlock(
3980 3984          vnode_t *vp,
3981 3985          int write_lock,
3982 3986          caller_context_t *ct)
3983 3987  {
3984 3988          int     ret;
3985 3989  
3986 3990          ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3987 3991          VOPSTATS_UPDATE(vp, rwlock);
3988 3992          return (ret);
3989 3993  }
3990 3994  
3991 3995  void
3992 3996  fop_rwunlock(
3993 3997          vnode_t *vp,
3994 3998          int write_lock,
3995 3999          caller_context_t *ct)
3996 4000  {
3997 4001          (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3998 4002          VOPSTATS_UPDATE(vp, rwunlock);
3999 4003  }
4000 4004  
4001 4005  int
4002 4006  fop_seek(
4003 4007          vnode_t *vp,
4004 4008          offset_t ooff,
4005 4009          offset_t *noffp,
4006 4010          caller_context_t *ct)
4007 4011  {
4008 4012          int     err;
4009 4013  
4010 4014          err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4011 4015          VOPSTATS_UPDATE(vp, seek);
4012 4016          return (err);
4013 4017  }
4014 4018  
4015 4019  int
4016 4020  fop_cmp(
4017 4021          vnode_t *vp1,
4018 4022          vnode_t *vp2,
4019 4023          caller_context_t *ct)
4020 4024  {
4021 4025          int     err;
4022 4026  
4023 4027          err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4024 4028          VOPSTATS_UPDATE(vp1, cmp);
4025 4029          return (err);
4026 4030  }
4027 4031  
/*
 * Wrapper for VOP_FRLOCK: file/record locking (F_GETLK/F_SETLK/etc.)
 * on vp, with optional callback list flk_cbp for blocking-lock events.
 */
int
fop_frlock(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_frlock)
	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
	VOPSTATS_UPDATE(vp, frlock);
	return (err);
}
4048 4052  
/*
 * Wrapper for VOP_SPACE: manipulate file storage (e.g. F_FREESP to
 * free/allocate a byte range described by bfp).
 */
int
fop_space(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
	VOPSTATS_UPDATE(vp, space);
	return (err);
}
4067 4071  
4068 4072  int
4069 4073  fop_realvp(
4070 4074          vnode_t *vp,
4071 4075          vnode_t **vpp,
4072 4076          caller_context_t *ct)
4073 4077  {
4074 4078          int     err;
4075 4079  
4076 4080          err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4077 4081          VOPSTATS_UPDATE(vp, realvp);
4078 4082          return (err);
4079 4083  }
4080 4084  
/*
 * Wrapper for VOP_GETPAGE: fault in pages of vp covering [off, off+len)
 * for segment seg at addr, returning them in plarr (plsz bytes of page
 * pointers).  rw describes the intended access.
 */
int
fop_getpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getpage)
	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
	VOPSTATS_UPDATE(vp, getpage);
	return (err);
}
4104 4108  
/*
 * Wrapper for VOP_PUTPAGE: write back / release pages of vp covering
 * [off, off+len) according to flags (B_ASYNC, B_INVAL, ...).
 */
int
fop_putpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, putpage);
	return (err);
}
4122 4126  
/*
 * Wrapper for VOP_MAP: establish a memory mapping of vp at offset off
 * into address space as.  The fs may adjust *addrp.
 */
int
fop_map(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_map)
	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
	VOPSTATS_UPDATE(vp, map);
	return (err);
}
4145 4149  
/*
 * Wrapper for VOP_ADDMAP: account a new mapping of vp.  On success for
 * regular files, bump the per-vnode mmap page counters (v_mmap_read /
 * v_mmap_write) by the page-rounded length; fop_delmap() performs the
 * matching decrements.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages spanned by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			/* PROT_EXEC is accounted as a read mapping */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
4195 4199  
/*
 * Wrapper for VOP_DELMAP: tear down a mapping of vp and reverse the
 * counter accounting done by fop_addmap().  The EAGAIN special case
 * below exists because NFS calls delmap twice (see comment in body).
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages spanned by this mapping */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			/* PROT_EXEC was accounted as a read mapping */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4250 4254  
4251 4255  
4252 4256  int
4253 4257  fop_poll(
4254 4258          vnode_t *vp,
4255 4259          short events,
4256 4260          int anyyet,
4257 4261          short *reventsp,
4258 4262          struct pollhead **phpp,
4259 4263          caller_context_t *ct)
4260 4264  {
4261 4265          int     err;
4262 4266  
4263 4267          err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4264 4268          VOPSTATS_UPDATE(vp, poll);
4265 4269          return (err);
4266 4270  }
4267 4271  
/*
 * Wrapper for VOP_DUMP: dump dblks blocks starting at logical block
 * lbdn from addr to vp (crash-dump path).  The narrowing round-trip
 * checks guard against truncation when the fs hands the values to
 * bdev_dump(), which takes daddr_t/int.
 */
int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int	err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
	VOPSTATS_UPDATE(vp, dump);
	return (err);
}
4286 4290  
4287 4291  int
4288 4292  fop_pathconf(
4289 4293          vnode_t *vp,
4290 4294          int cmd,
4291 4295          ulong_t *valp,
4292 4296          cred_t *cr,
4293 4297          caller_context_t *ct)
4294 4298  {
4295 4299          int     err;
4296 4300  
4297 4301          VOPXID_MAP_CR(vp, cr);
4298 4302  
4299 4303          err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4300 4304          VOPSTATS_UPDATE(vp, pathconf);
4301 4305          return (err);
4302 4306  }
4303 4307  
/*
 * Wrapper for VOP_PAGEIO: perform direct page I/O on pp for vp over
 * [io_off, io_off+io_len), bypassing the usual getpage/putpage paths.
 */
int
fop_pageio(
	vnode_t *vp,
	struct page *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, pageio);
	return (err);
}
4322 4326  
4323 4327  int
4324 4328  fop_dumpctl(
4325 4329          vnode_t *vp,
4326 4330          int action,
4327 4331          offset_t *blkp,
4328 4332          caller_context_t *ct)
4329 4333  {
4330 4334          int     err;
4331 4335          err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4332 4336          VOPSTATS_UPDATE(vp, dumpctl);
4333 4337          return (err);
4334 4338  }
4335 4339  
/*
 * Wrapper for VOP_DISPOSE: free or invalidate page pp of vp.  As with
 * fop_inactive(), the call may result in the vnode being lost, so
 * vopstats MUST be recorded before the vop call — do not reorder.
 */
void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}
4352 4356  
/*
 * Wrapper for VOP_SETSECATTR: set security attributes (ACL) vsap on vp.
 * ATTR_NOACLCHECK is only honored on file systems that used an ACE mask
 * with VOP_ACCESS; otherwise it is rejected with EINVAL.
 */
int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}
4377 4381  
/*
 * Wrapper for VOP_GETSECATTR: retrieve security attributes (ACL) of vp
 * into vsap.  Same ATTR_NOACLCHECK gating as fop_setsecattr().
 */
int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}
4403 4407  
4404 4408  int
4405 4409  fop_shrlock(
4406 4410          vnode_t *vp,
4407 4411          int cmd,
4408 4412          struct shrlock *shr,
4409 4413          int flag,
4410 4414          cred_t *cr,
4411 4415          caller_context_t *ct)
4412 4416  {
4413 4417          int     err;
4414 4418  
4415 4419          VOPXID_MAP_CR(vp, cr);
4416 4420  
4417 4421          err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4418 4422          VOPSTATS_UPDATE(vp, shrlock);
4419 4423          return (err);
4420 4424  }
4421 4425  
4422 4426  int
4423 4427  fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4424 4428      caller_context_t *ct)
4425 4429  {
4426 4430          int     err;
4427 4431  
4428 4432          err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4429 4433          VOPSTATS_UPDATE(vp, vnevent);
4430 4434          return (err);
4431 4435  }
4432 4436  
/*
 * Wrapper for VOP_REQZCBUF: request a zero-copy buffer from vp's fs for
 * the given I/O direction.  ENOTSUP if the fs lacks zero-copy support.
 */
int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}
4445 4449  
/*
 * Wrapper for VOP_RETZCBUF: return a zero-copy buffer previously
 * obtained via fop_reqzcbuf().  ENOTSUP if the fs lacks the feature.
 */
int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);
	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}
4457 4461  
/*
 * Default destructor
 *	Needed because NULL destructor means that the key is unused
 *	(vsd_create() installs this when the caller passes NULL).
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}

/*
 * Create a key (index into per vnode array)
 *	Locks out vsd_create, vsd_destroy, and vsd_free
 *	May allocate memory with lock held
 *
 * On return *keyp holds the 1-based key; a *keyp that is already
 * nonzero means the key was previously allocated and is left alone.
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int	i;
	uint_t	nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	/* a NULL destructor slot marks an unused key */
	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 * (doubling; 0 -> 1 on first use)
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;	/* keys are 1-based; 0 means "not allocated" */

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 *
 * Runs the key's destructor on every vnode that has a value for it,
 * clears those values, then releases the key slot for reuse.
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;	/* caller's key handle is invalidated immediately */

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;	/* 0-based slot index */
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}
4580 4584  
4581 4585  /*
4582 4586   * Quickly return the per vnode value that was stored with the specified key
4583 4587   * Assumes the caller is protecting key from vsd_create and vsd_destroy
4584 4588   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4585 4589   */
4586 4590  void *
4587 4591  vsd_get(vnode_t *vp, uint_t key)
4588 4592  {
4589 4593          struct vsd_node *vsd;
4590 4594  
4591 4595          ASSERT(vp != NULL);
4592 4596          ASSERT(mutex_owned(&vp->v_vsd_lock));
4593 4597  
4594 4598          vsd = vp->v_vsd;
4595 4599  
4596 4600          if (key && vsd != NULL && key <= vsd->vs_nkeys)
4597 4601                  return (vsd->vs_value[key - 1]);
4598 4602          return (NULL);
4599 4603  }
4600 4604  
/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 *
 * Returns 0 on success, or EINVAL if key is 0 (i.e. not a key handed out
 * by vsd_create()).  If a previous value existed for this key it is simply
 * overwritten; releasing the old value is the caller's responsibility.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	/* key 0 is never a valid allocated key */
	if (key == 0)
		return (EINVAL);

	/* allocate this vnode's VSD node lazily, on first vsd_set() */
	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	/* a valid key can never exceed the highest key created so far */
	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
4654 4658  
/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	/* no VSD was ever attached to this vnode */
	if (vsd == NULL)
		return;

	/*
	 * A node with vs_nkeys == 0 was allocated by vsd_set() but never
	 * had a value stored, so it was never linked onto vsd_list;
	 * just free it without taking vsd_lock.
	 */
	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		/* a key may have no value set, or no destructor registered */
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}
4701 4705  
4702 4706  /*
4703 4707   * realloc
4704 4708   */
4705 4709  static void *
4706 4710  vsd_realloc(void *old, size_t osize, size_t nsize)
4707 4711  {
4708 4712          void *new;
4709 4713  
4710 4714          new = kmem_zalloc(nsize, KM_SLEEP);
4711 4715          if (old) {
4712 4716                  bcopy(old, new, osize);
4713 4717                  kmem_free(old, osize);
4714 4718          }
4715 4719          return (new);
4716 4720  }
4717 4721  
4718 4722  /*
4719 4723   * Setup the extensible system attribute for creating a reparse point.
4720 4724   * The symlink data 'target' is validated for proper format of a reparse
4721 4725   * string and a check also made to make sure the symlink data does not
4722 4726   * point to an existing file.
4723 4727   *
4724 4728   * return 0 if ok else -1.
4725 4729   */
4726 4730  static int
4727 4731  fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4728 4732  {
4729 4733          xoptattr_t *xoap;
4730 4734  
4731 4735          if ((!target) || (!vap) || (!xvattr))
4732 4736                  return (-1);
4733 4737  
4734 4738          /* validate reparse string */
4735 4739          if (reparse_validate((const char *)target))
4736 4740                  return (-1);
4737 4741  
4738 4742          xva_init(xvattr);
4739 4743          xvattr->xva_vattr = *vap;
4740 4744          xvattr->xva_vattr.va_mask |= AT_XVATTR;
4741 4745          xoap = xva_getxoptattr(xvattr);
4742 4746          ASSERT(xoap);
4743 4747          XVA_SET_REQ(xvattr, XAT_REPARSE);
4744 4748          xoap->xoa_reparse = 1;
4745 4749  
4746 4750          return (0);
4747 4751  }
4748 4752  
4749 4753  /*
4750 4754   * Function to check whether a symlink is a reparse point.
4751 4755   * Return B_TRUE if it is a reparse point, else return B_FALSE
4752 4756   */
4753 4757  boolean_t
4754 4758  vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4755 4759  {
4756 4760          xvattr_t xvattr;
4757 4761          xoptattr_t *xoap;
4758 4762  
4759 4763          if ((vp->v_type != VLNK) ||
4760 4764              !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4761 4765                  return (B_FALSE);
4762 4766  
4763 4767          xva_init(&xvattr);
4764 4768          xoap = xva_getxoptattr(&xvattr);
4765 4769          ASSERT(xoap);
4766 4770          XVA_SET_REQ(&xvattr, XAT_REPARSE);
4767 4771  
4768 4772          if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4769 4773                  return (B_FALSE);
4770 4774  
4771 4775          if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4772 4776              (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4773 4777                  return (B_FALSE);
4774 4778  
4775 4779          return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4776 4780  }
  
    | 
      ↓ open down ↓ | 
    3846 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX