spencer-try1 Wdiff usr/src/uts/common/fs/vnode.c

Print this page

Spencer's first if/else try

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/vnode.c
          +++ new/usr/src/uts/common/fs/vnode.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2020 Joyent, Inc.
  25   25   * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26   26   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  27   27   * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  28   28   */
  29   29  
  30   30  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  31   31  /*        All Rights Reserved   */
  32   32  
  33   33  /*
  34   34   * University Copyright- Copyright (c) 1982, 1986, 1988
  35   35   * The Regents of the University of California
  36   36   * All Rights Reserved
  37   37   *
  38   38   * University Acknowledgment- Portions of this document are derived from
  39   39   * software developed by the University of California, Berkeley, and its
  40   40   * contributors.
  41   41   */
  42   42  
  43   43  #include <sys/types.h>
  44   44  #include <sys/param.h>
  45   45  #include <sys/t_lock.h>
  46   46  #include <sys/errno.h>
  47   47  #include <sys/cred.h>
  48   48  #include <sys/user.h>
  49   49  #include <sys/uio.h>
  50   50  #include <sys/file.h>
  51   51  #include <sys/pathname.h>
  52   52  #include <sys/vfs.h>
  53   53  #include <sys/vfs_opreg.h>
  54   54  #include <sys/vnode.h>
  55   55  #include <sys/filio.h>
  56   56  #include <sys/rwstlock.h>
  57   57  #include <sys/fem.h>
  58   58  #include <sys/stat.h>
  59   59  #include <sys/mode.h>
  60   60  #include <sys/conf.h>
  61   61  #include <sys/sysmacros.h>
  62   62  #include <sys/cmn_err.h>
  63   63  #include <sys/systm.h>
  64   64  #include <sys/kmem.h>
  65   65  #include <sys/debug.h>
  66   66  #include <c2/audit.h>
  67   67  #include <sys/acl.h>
  68   68  #include <sys/nbmlock.h>
  69   69  #include <sys/fcntl.h>
  70   70  #include <fs/fs_subr.h>
  71   71  #include <sys/taskq.h>
  72   72  #include <fs/fs_reparse.h>
  73   73  #include <sys/time.h>
  74   74  #include <sys/sdt.h>
  75   75  
  76   76  /* Determine if this vnode is a file that is read-only */
  77   77  #define ISROFILE(vp)    \
  78   78          ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  79   79              (vp)->v_type != VFIFO && vn_is_readonly(vp))
  80   80  
  81   81  /* Tunable via /etc/system; used only by admin/install */
  82   82  int nfs_global_client_only;
  83   83  
  84   84  /*
  85   85   * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  86   86   * number of entries as and parallel to the vfssw table.  (Arguably, it could
  87   87   * be part of the vfssw table.)  Once it's initialized, it's accessed using
  88   88   * the same fstype index that is used to index into the vfssw table.
  89   89   */
  90   90  vopstats_t **vopstats_fstype;
  91   91  
  92   92  /* vopstats initialization template used for fast initialization via bcopy() */
  93   93  static vopstats_t *vs_templatep;
  94   94  
  95   95  /* Kmem cache handle for vsk_anchor_t allocations */
  96   96  kmem_cache_t *vsk_anchor_cache;
  97   97  
  98   98  /* file events cleanup routine */
  99   99  extern void free_fopdata(vnode_t *);
 100  100  
/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
 105  105  avl_tree_t      vskstat_tree;
 106  106  kmutex_t        vskstat_tree_lock;
 107  107  
 108  108  /* Global variable which enables/disables the vopstats collection */
 109  109  int vopstats_enabled = 1;
 110  110  
 111  111  /* Global used for empty/invalid v_path */
 112  112  char *vn_vpath_empty = "";
 113  113  
 114  114  /*
 115  115   * forward declarations for internal vnode specific data (vsd)
 116  116   */
 117  117  static void *vsd_realloc(void *, size_t, size_t);
 118  118  
 119  119  /*
 120  120   * forward declarations for reparse point functions
 121  121   */
 122  122  static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 123  123  
 124  124  /*
 125  125   * VSD -- VNODE SPECIFIC DATA
 126  126   * The v_data pointer is typically used by a file system to store a
 127  127   * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 128  128   * However, there are times when additional project private data needs
 129  129   * to be stored separately from the data (node) pointed to by v_data.
 130  130   * This additional data could be stored by the file system itself or
 131  131   * by a completely different kernel entity.  VSD provides a way for
 132  132   * callers to obtain a key and store a pointer to private data associated
 133  133   * with a vnode.
 134  134   *
 135  135   * Callers are responsible for protecting the vsd by holding v_vsd_lock
 136  136   * for calls to vsd_set() and vsd_get().
 137  137   */
 138  138  
 139  139  /*
 140  140   * vsd_lock protects:
 141  141   *   vsd_nkeys - creation and deletion of vsd keys
 142  142   *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 143  143   *   vsd_destructor - adding and removing destructors to the list
 144  144   */
 145  145  static kmutex_t         vsd_lock;
 146  146  static uint_t           vsd_nkeys;       /* size of destructor array */
 147  147  /* list of vsd_node's */
 148  148  static list_t *vsd_list = NULL;
 149  149  /* per-key destructor funcs */
 150  150  static void             (**vsd_destructor)(void *);
 151  151  
 152  152  /*
 153  153   * The following is the common set of actions needed to update the
 154  154   * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 155  155   * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 156  156   * recording of the bytes transferred.  Since the code is similar
 157  157   * but small, it is nearly a duplicate.  Consequently any changes
 158  158   * to one may need to be reflected in the other.
 159  159   * Rundown of the variables:
 160  160   * vp - Pointer to the vnode
 161  161   * counter - Partial name structure member to update in vopstats for counts
 162  162   * bytecounter - Partial name structure member to update in vopstats for bytes
 163  163   * bytesval - Value to update in vopstats for bytes
 164  164   * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 165  165   * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 166  166   */
 167  167  
 168  168  #define VOPSTATS_UPDATE(vp, counter) {                                  \
 169  169          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 170  170          if (vfsp && vfsp->vfs_implp &&                                  \
 171  171              (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
 172  172                  vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
 173  173                  uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
 174  174                  extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 175  175                      size_t, uint64_t *);                                \
 176  176                  __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
 177  177                  (*stataddr)++;                                          \
 178  178                  if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
 179  179                          vsp->n##counter.value.ui64++;                   \
 180  180                  }                                                       \
 181  181          }                                                               \
 182  182  }
 183  183  
 184  184  #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
 185  185          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 186  186          if (vfsp && vfsp->vfs_implp &&                                  \
 187  187              (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
 188  188                  vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
 189  189                  uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
 190  190                  extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 191  191                      size_t, uint64_t *);                                \
 192  192                  __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 193  193                  (*stataddr)++;                                          \
 194  194                  vsp->bytecounter.value.ui64 += bytesval;                \
 195  195                  if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
 196  196                          vsp->n##counter.value.ui64++;                   \
 197  197                          vsp->bytecounter.value.ui64 += bytesval;        \
 198  198                  }                                                       \
 199  199          }                                                               \
 200  200  }
 201  201  
 202  202  /*
 203  203   * If the filesystem does not support XIDs map credential
 204  204   * If the vfsp is NULL, perhaps we should also map?
 205  205   */
 206  206  #define VOPXID_MAP_CR(vp, cr)   {                                       \
 207  207          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 208  208          if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)            \
 209  209                  cr = crgetmapped(cr);                                   \
 210  210          }
 211  211  
 212  212  #define VOP_LATENCY_10MS        10000000
 213  213  #define VOP_LATENCY_100MS       100000000
 214  214  #define VOP_LATENCY_1S          1000000000
 215  215  #define VOP_LATENCY_10S         10000000000
 216  216  
 217  217  /*
 218  218   * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 219  219   * numerical order of S_IFMT and vnode types.)
 220  220   */
 221  221  enum vtype iftovt_tab[] = {
 222  222          VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 223  223          VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 224  224  };
 225  225  
 226  226  ushort_t vttoif_tab[] = {
 227  227          0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 228  228          S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 229  229  };
 230  230  
 231  231  /*
 232  232   * The system vnode cache.
 233  233   */
 234  234  
 235  235  kmem_cache_t *vn_cache;
 236  236  
 237  237  
 238  238  /*
 239  239   * Vnode operations vector.
 240  240   */
 241  241  
 242  242  static const fs_operation_trans_def_t vn_ops_table[] = {
 243  243          VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
 244  244              fs_nosys, fs_nosys,
 245  245  
 246  246          VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
 247  247              fs_nosys, fs_nosys,
 248  248  
 249  249          VOPNAME_READ, offsetof(struct vnodeops, vop_read),
 250  250              fs_nosys, fs_nosys,
 251  251  
 252  252          VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
 253  253              fs_nosys, fs_nosys,
 254  254  
 255  255          VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
 256  256              fs_nosys, fs_nosys,
 257  257  
 258  258          VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
 259  259              fs_setfl, fs_nosys,
 260  260  
 261  261          VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
 262  262              fs_nosys, fs_nosys,
 263  263  
 264  264          VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
 265  265              fs_nosys, fs_nosys,
 266  266  
 267  267          VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
 268  268              fs_nosys, fs_nosys,
 269  269  
 270  270          VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
 271  271              fs_nosys, fs_nosys,
 272  272  
 273  273          VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
 274  274              fs_nosys, fs_nosys,
 275  275  
 276  276          VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
 277  277              fs_nosys, fs_nosys,
 278  278  
 279  279          VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
 280  280              fs_nosys, fs_nosys,
 281  281  
 282  282          VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
 283  283              fs_nosys, fs_nosys,
 284  284  
 285  285          VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
 286  286              fs_nosys, fs_nosys,
 287  287  
 288  288          VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
 289  289              fs_nosys, fs_nosys,
 290  290  
 291  291          VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
 292  292              fs_nosys, fs_nosys,
 293  293  
 294  294          VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
 295  295              fs_nosys, fs_nosys,
 296  296  
 297  297          VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
 298  298              fs_nosys, fs_nosys,
 299  299  
 300  300          VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
 301  301              fs_nosys, fs_nosys,
 302  302  
 303  303          VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
 304  304              fs_nosys, fs_nosys,
 305  305  
 306  306          VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
 307  307              fs_nosys, fs_nosys,
 308  308  
 309  309          VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
 310  310              fs_rwlock, fs_rwlock,
 311  311  
 312  312          VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
 313  313              (fs_generic_func_p)(uintptr_t)fs_rwunlock,
 314  314              (fs_generic_func_p)(uintptr_t)fs_rwunlock,  /* no errors allowed */
 315  315  
 316  316          VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
 317  317              fs_nosys, fs_nosys,
 318  318  
 319  319          VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
 320  320              fs_cmp, fs_cmp,             /* no errors allowed */
 321  321  
 322  322          VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
 323  323              fs_frlock, fs_nosys,
 324  324  
 325  325          VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
 326  326              fs_nosys, fs_nosys,
 327  327  
 328  328          VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
 329  329              fs_nosys, fs_nosys,
 330  330  
 331  331          VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
 332  332              fs_nosys, fs_nosys,
 333  333  
 334  334          VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
 335  335              fs_nosys, fs_nosys,
 336  336  
 337  337          VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
 338  338              (fs_generic_func_p) fs_nosys_map,
 339  339              (fs_generic_func_p) fs_nosys_map,
 340  340  
 341  341          VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
 342  342              (fs_generic_func_p) fs_nosys_addmap,
 343  343              (fs_generic_func_p) fs_nosys_addmap,
 344  344  
 345  345          VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
 346  346              fs_nosys, fs_nosys,
 347  347  
 348  348          VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
 349  349              (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
 350  350  
 351  351          VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
 352  352              fs_nosys, fs_nosys,
 353  353  
 354  354          VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
 355  355              fs_pathconf, fs_nosys,
 356  356  
 357  357          VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
 358  358              fs_nosys, fs_nosys,
 359  359  
 360  360          VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
 361  361              fs_nosys, fs_nosys,
 362  362  
 363  363          VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
 364  364              (fs_generic_func_p)(uintptr_t)fs_dispose,
 365  365              (fs_generic_func_p)(uintptr_t)fs_nodispose,
 366  366  
 367  367          VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
 368  368              fs_nosys, fs_nosys,
 369  369  
 370  370          VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
 371  371              fs_fab_acl, fs_nosys,
 372  372  
 373  373          VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
 374  374              fs_shrlock, fs_nosys,
 375  375  
 376  376          VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
 377  377              (fs_generic_func_p) fs_vnevent_nosupport,
 378  378              (fs_generic_func_p) fs_vnevent_nosupport,
 379  379  
 380  380          VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
 381  381              fs_nosys, fs_nosys,
 382  382  
 383  383          VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
 384  384              fs_nosys, fs_nosys,
 385  385  
 386  386          NULL, 0, NULL, NULL
 387  387  };
 388  388  
 389  389  /* Extensible attribute (xva) routines. */
 390  390  
 391  391  /*
 392  392   * Zero out the structure, set the size of the requested/returned bitmaps,
 393  393   * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 394  394   * to the returned attributes array.
 395  395   */
 396  396  void
 397  397  xva_init(xvattr_t *xvap)
 398  398  {
 399  399          bzero(xvap, sizeof (xvattr_t));
 400  400          xvap->xva_mapsize = XVA_MAPSIZE;
 401  401          xvap->xva_magic = XVA_MAGIC;
 402  402          xvap->xva_vattr.va_mask = AT_XVATTR;
 403  403          xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 404  404  }
 405  405  
 406  406  /*
 407  407   * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 408  408   * structure.  Otherwise, returns NULL.
 409  409   */
 410  410  xoptattr_t *
 411  411  xva_getxoptattr(xvattr_t *xvap)
 412  412  {
 413  413          xoptattr_t *xoap = NULL;
 414  414          if (xvap->xva_vattr.va_mask & AT_XVATTR)
 415  415                  xoap = &xvap->xva_xoptattrs;
 416  416          return (xoap);
 417  417  }
 418  418  
 419  419  /*
 420  420   * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 421  421   * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 422  422   * kstat name.
 423  423   */
 424  424  static int
 425  425  vska_compar(const void *n1, const void *n2)
 426  426  {
 427  427          int ret;
 428  428          ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 429  429          ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 430  430  
 431  431          if (p1 < p2) {
 432  432                  ret = -1;
 433  433          } else if (p1 > p2) {
 434  434                  ret = 1;
 435  435          } else {
 436  436                  ret = 0;
 437  437          }
 438  438  
 439  439          return (ret);
 440  440  }
 441  441  
 442  442  /*
 443  443   * Used to create a single template which will be bcopy()ed to a newly
 444  444   * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 445  445   */
 446  446  static vopstats_t *
 447  447  create_vopstats_template()
 448  448  {
 449  449          vopstats_t              *vsp;
 450  450  
 451  451          vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 452  452          bzero(vsp, sizeof (*vsp));      /* Start fresh */
 453  453  
 454  454          /* VOP_OPEN */
 455  455          kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 456  456          /* VOP_CLOSE */
 457  457          kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 458  458          /* VOP_READ I/O */
 459  459          kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 460  460          kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 461  461          /* VOP_WRITE I/O */
 462  462          kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 463  463          kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 464  464          /* VOP_IOCTL */
 465  465          kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 466  466          /* VOP_SETFL */
 467  467          kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 468  468          /* VOP_GETATTR */
 469  469          kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 470  470          /* VOP_SETATTR */
 471  471          kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 472  472          /* VOP_ACCESS */
 473  473          kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 474  474          /* VOP_LOOKUP */
 475  475          kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 476  476          /* VOP_CREATE */
 477  477          kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 478  478          /* VOP_REMOVE */
 479  479          kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 480  480          /* VOP_LINK */
 481  481          kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 482  482          /* VOP_RENAME */
 483  483          kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 484  484          /* VOP_MKDIR */
 485  485          kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 486  486          /* VOP_RMDIR */
 487  487          kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 488  488          /* VOP_READDIR I/O */
 489  489          kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 490  490          kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 491  491              KSTAT_DATA_UINT64);
 492  492          /* VOP_SYMLINK */
 493  493          kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 494  494          /* VOP_READLINK */
 495  495          kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 496  496          /* VOP_FSYNC */
 497  497          kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 498  498          /* VOP_INACTIVE */
 499  499          kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 500  500          /* VOP_FID */
 501  501          kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 502  502          /* VOP_RWLOCK */
 503  503          kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 504  504          /* VOP_RWUNLOCK */
 505  505          kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 506  506          /* VOP_SEEK */
 507  507          kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 508  508          /* VOP_CMP */
 509  509          kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 510  510          /* VOP_FRLOCK */
 511  511          kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 512  512          /* VOP_SPACE */
 513  513          kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 514  514          /* VOP_REALVP */
 515  515          kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 516  516          /* VOP_GETPAGE */
 517  517          kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 518  518          /* VOP_PUTPAGE */
 519  519          kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 520  520          /* VOP_MAP */
 521  521          kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 522  522          /* VOP_ADDMAP */
 523  523          kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 524  524          /* VOP_DELMAP */
 525  525          kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 526  526          /* VOP_POLL */
 527  527          kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 528  528          /* VOP_DUMP */
 529  529          kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 530  530          /* VOP_PATHCONF */
 531  531          kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 532  532          /* VOP_PAGEIO */
 533  533          kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 534  534          /* VOP_DUMPCTL */
 535  535          kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 536  536          /* VOP_DISPOSE */
 537  537          kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 538  538          /* VOP_SETSECATTR */
 539  539          kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 540  540          /* VOP_GETSECATTR */
 541  541          kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 542  542          /* VOP_SHRLOCK */
 543  543          kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 544  544          /* VOP_VNEVENT */
 545  545          kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 546  546          /* VOP_REQZCBUF */
 547  547          kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 548  548          /* VOP_RETZCBUF */
 549  549          kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 550  550  
 551  551          return (vsp);
 552  552  }
 553  553  
 554  554  /*
 555  555   * Creates a kstat structure associated with a vopstats structure.
 556  556   */
 557  557  kstat_t *
 558  558  new_vskstat(char *ksname, vopstats_t *vsp)
 559  559  {
 560  560          kstat_t         *ksp;
 561  561  
 562  562          if (!vopstats_enabled) {
 563  563                  return (NULL);
 564  564          }
 565  565  
 566  566          ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 567  567              sizeof (vopstats_t)/sizeof (kstat_named_t),
 568  568              KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 569  569          if (ksp) {
 570  570                  ksp->ks_data = vsp;
 571  571                  kstat_install(ksp);
 572  572          }
 573  573  
 574  574          return (ksp);
 575  575  }
 576  576  
 577  577  /*
 578  578   * Called from vfsinit() to initialize the support mechanisms for vopstats
 579  579   */
 580  580  void
 581  581  vopstats_startup()
 582  582  {
 583  583          if (!vopstats_enabled)
 584  584                  return;
 585  585  
 586  586          /*
 587  587           * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 588  588           * is necessary since we need to check if a kstat exists before we
 589  589           * attempt to create it.  Also, initialize its lock.
 590  590           */
 591  591          avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 592  592              offsetof(vsk_anchor_t, vsk_node));
 593  593          mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 594  594  
 595  595          vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 596  596              sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 597  597              NULL, NULL, 0);
 598  598  
 599  599          /*
 600  600           * Set up the array of pointers for the vopstats-by-FS-type.
 601  601           * The entries will be allocated/initialized as each file system
 602  602           * goes through modload/mod_installfs.
 603  603           */
 604  604          vopstats_fstype = (vopstats_t **)kmem_zalloc(
 605  605              (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 606  606  
 607  607          /* Set up the global vopstats initialization template */
 608  608          vs_templatep = create_vopstats_template();
 609  609  }
 610  610  
 611  611  /*
 612  612   * We need to have the all of the counters zeroed.
 613  613   * The initialization of the vopstats_t includes on the order of
 614  614   * 50 calls to kstat_named_init().  Rather that do that on every call,
 615  615   * we do it once in a template (vs_templatep) then bcopy it over.
 616  616   */
 617  617  void
 618  618  initialize_vopstats(vopstats_t *vsp)
 619  619  {
 620  620          if (vsp == NULL)
 621  621                  return;
 622  622  
 623  623          bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 624  624  }
 625  625  
 626  626  /*
 627  627   * If possible, determine which vopstats by fstype to use and
 628  628   * return a pointer to the caller.
 629  629   */
 630  630  vopstats_t *
 631  631  get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 632  632  {
 633  633          int             fstype = 0;     /* Index into vfssw[] */
 634  634          vopstats_t      *vsp = NULL;
 635  635  
 636  636          if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 637  637              !vopstats_enabled)
 638  638                  return (NULL);
 639  639          /*
 640  640           * Set up the fstype.  We go to so much trouble because all versions
 641  641           * of NFS use the same fstype in their vfs even though they have
 642  642           * distinct entries in the vfssw[] table.
 643  643           * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 644  644           */
 645  645          if (vswp) {
 646  646                  fstype = vswp - vfssw;  /* Gets us the index */
 647  647          } else {
 648  648                  fstype = vfsp->vfs_fstype;
 649  649          }
 650  650  
 651  651          /*
 652  652           * Point to the per-fstype vopstats. The only valid values are
 653  653           * non-zero positive values less than the number of vfssw[] table
 654  654           * entries.
 655  655           */
 656  656          if (fstype > 0 && fstype < nfstype) {
 657  657                  vsp = vopstats_fstype[fstype];
 658  658          }
 659  659  
 660  660          return (vsp);
 661  661  }
 662  662  
 663  663  /*
 664  664   * Generate a kstat name, create the kstat structure, and allocate a
 665  665   * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 666  666   * to the caller.  This must only be called from a mount.
 667  667   */
 668  668  vsk_anchor_t *
 669  669  get_vskstat_anchor(vfs_t *vfsp)
 670  670  {
 671  671          char            kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
 672  672          statvfs64_t     statvfsbuf;             /* Needed to find f_fsid */
 673  673          vsk_anchor_t    *vskp = NULL;           /* vfs <--> kstat anchor */
 674  674          kstat_t         *ksp;                   /* Ptr to new kstat */
 675  675          avl_index_t     where;                  /* Location in the AVL tree */
 676  676  
 677  677          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 678  678              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 679  679                  return (NULL);
 680  680  
 681  681          /* Need to get the fsid to build a kstat name */
 682  682          if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
 683  683                  /* Create a name for our kstats based on fsid */
 684  684                  (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
 685  685                      VOPSTATS_STR, statvfsbuf.f_fsid);
 686  686  
 687  687                  /* Allocate and initialize the vsk_anchor_t */
 688  688                  vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 689  689                  bzero(vskp, sizeof (*vskp));
 690  690                  vskp->vsk_fsid = statvfsbuf.f_fsid;
 691  691  
 692  692                  mutex_enter(&vskstat_tree_lock);
 693  693                  if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
 694  694                          avl_insert(&vskstat_tree, vskp, where);
 695  695                          mutex_exit(&vskstat_tree_lock);
 696  696  
 697  697                          /*
 698  698                           * Now that we've got the anchor in the AVL
 699  699                           * tree, we can create the kstat.
 700  700                           */
 701  701                          ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
 702  702                          if (ksp) {
 703  703                                  vskp->vsk_ksp = ksp;
 704  704                          }
 705  705                  } else {
 706  706                          /* Oops, found one! Release memory and lock. */
 707  707                          mutex_exit(&vskstat_tree_lock);
 708  708                          kmem_cache_free(vsk_anchor_cache, vskp);
 709  709                          vskp = NULL;
 710  710                  }
 711  711          }
 712  712          return (vskp);
 713  713  }
 714  714  
 715  715  /*
 716  716   * We're in the process of tearing down the vfs and need to cleanup
 717  717   * the data structures associated with the vopstats. Must only be called
 718  718   * from dounmount().
 719  719   */
 720  720  void
 721  721  teardown_vopstats(vfs_t *vfsp)
 722  722  {
 723  723          vsk_anchor_t    *vskap;
 724  724          avl_index_t     where;
 725  725  
 726  726          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 727  727              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 728  728                  return;
 729  729  
 730  730          /* This is a safe check since VFS_STATS must be set (see above) */
 731  731          if ((vskap = vfsp->vfs_vskap) == NULL)
 732  732                  return;
 733  733  
 734  734          /* Whack the pointer right away */
 735  735          vfsp->vfs_vskap = NULL;
 736  736  
 737  737          /* Lock the tree, remove the node, and delete the kstat */
 738  738          mutex_enter(&vskstat_tree_lock);
 739  739          if (avl_find(&vskstat_tree, vskap, &where)) {
 740  740                  avl_remove(&vskstat_tree, vskap);
 741  741          }
 742  742  
 743  743          if (vskap->vsk_ksp) {
 744  744                  kstat_delete(vskap->vsk_ksp);
 745  745          }
 746  746          mutex_exit(&vskstat_tree_lock);
 747  747  
 748  748          kmem_cache_free(vsk_anchor_cache, vskap);
 749  749  }
 750  750  
 751  751  /*
 752  752   * Read or write a vnode.  Called from kernel code.
 753  753   */
 754  754  int
 755  755  vn_rdwr(
 756  756          enum uio_rw rw,
 757  757          struct vnode *vp,
 758  758          caddr_t base,
 759  759          ssize_t len,
 760  760          offset_t offset,
 761  761          enum uio_seg seg,
 762  762          int ioflag,
 763  763          rlim64_t ulimit,        /* meaningful only if rw is UIO_WRITE */
 764  764          cred_t *cr,
 765  765          ssize_t *residp)
 766  766  {
 767  767          struct uio uio;
 768  768          struct iovec iov;
 769  769          int error;
 770  770          int in_crit = 0;
 771  771  
 772  772          if (rw == UIO_WRITE && ISROFILE(vp))
 773  773                  return (EROFS);
 774  774  
 775  775          if (len < 0)
 776  776                  return (EIO);
 777  777  
 778  778          VOPXID_MAP_CR(vp, cr);
 779  779  
 780  780          iov.iov_base = base;
 781  781          iov.iov_len = len;
 782  782          uio.uio_iov = &iov;
 783  783          uio.uio_iovcnt = 1;
 784  784          uio.uio_loffset = offset;
 785  785          uio.uio_segflg = (short)seg;
 786  786          uio.uio_resid = len;
 787  787          uio.uio_llimit = ulimit;
 788  788  
 789  789          /*
 790  790           * We have to enter the critical region before calling VOP_RWLOCK
 791  791           * to avoid a deadlock with ufs.
 792  792           */
 793  793          if (nbl_need_check(vp)) {
 794  794                  int svmand;
 795  795  
 796  796                  nbl_start_crit(vp, RW_READER);
 797  797                  in_crit = 1;
 798  798                  error = nbl_svmand(vp, cr, &svmand);
 799  799                  if (error != 0)
 800  800                          goto done;
 801  801                  if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
 802  802                      uio.uio_offset, uio.uio_resid, svmand, NULL)) {
 803  803                          error = EACCES;
 804  804                          goto done;
 805  805                  }
 806  806          }
 807  807  
 808  808          (void) VOP_RWLOCK(vp,
 809  809              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 810  810          if (rw == UIO_WRITE) {
 811  811                  uio.uio_fmode = FWRITE;
 812  812                  uio.uio_extflg = UIO_COPY_DEFAULT;
 813  813                  error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 814  814          } else {
 815  815                  uio.uio_fmode = FREAD;
 816  816                  uio.uio_extflg = UIO_COPY_CACHED;
 817  817                  error = VOP_READ(vp, &uio, ioflag, cr, NULL);
 818  818          }
 819  819          VOP_RWUNLOCK(vp,
 820  820              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 821  821          if (residp)
 822  822                  *residp = uio.uio_resid;
 823  823          else if (uio.uio_resid)
 824  824                  error = EIO;
 825  825  
 826  826  done:
 827  827          if (in_crit)
 828  828                  nbl_end_crit(vp);
 829  829          return (error);
 830  830  }
 831  831  
 832  832  /*
 833  833   * Release a vnode.  Call VOP_INACTIVE on last reference or
 834  834   * decrement reference count.
 835  835   *

↓ open down ↓

835 lines elided

↑ open up ↑

 836  836   * To avoid race conditions, the v_count is left at 1 for
 837  837   * the call to VOP_INACTIVE. This prevents another thread
 838  838   * from reclaiming and releasing the vnode *before* the
 839  839   * VOP_INACTIVE routine has a chance to destroy the vnode.
 840  840   * We can't have more than 1 thread calling VOP_INACTIVE
 841  841   * on a vnode.
 842  842   */
 843  843  void
 844  844  vn_rele(vnode_t *vp)
 845  845  {
 846      -        VERIFY(vp->v_count > 0);
 847  846          mutex_enter(&vp->v_lock);
 848  847          if (vp->v_count == 1) {
 849  848                  mutex_exit(&vp->v_lock);
 850  849                  VOP_INACTIVE(vp, CRED(), NULL);
 851  850                  return;
 852  851          }
      852 +        else{
      853 +                VERIFY(vp->v_count > 0);
      854 +        }
 853  855          VN_RELE_LOCKED(vp);
 854  856          mutex_exit(&vp->v_lock);
 855  857  }
 856  858  
 857  859  void
 858  860  vn_phantom_rele(vnode_t *vp)
 859  861  {
 860      -        VERIFY(vp->v_count > 0);
 861      -
 862  862          mutex_enter(&vp->v_lock);
 863      -        VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
      863 +
 864  864          vp->v_phantom_count--;
 865  865          DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
 866  866          if (vp->v_count == 1) {
 867  867                  ASSERT0(vp->v_phantom_count);
 868  868                  mutex_exit(&vp->v_lock);
 869  869                  VOP_INACTIVE(vp, CRED(), NULL);
 870  870                  return;
      871 +        }else{
      872 +                VERIFY(vp->v_count > 0);
      873 +                VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
 871  874          }
 872  875          VN_RELE_LOCKED(vp);
 873  876          mutex_exit(&vp->v_lock);
 874  877  }
 875  878  
 876  879  /*
 877  880   * Return the number of non-phantom holds. Things such as portfs will use
 878  881   * phantom holds to prevent it from blocking filesystems from mounting over
 879  882   * watched directories.
 880  883   */

 881  884  uint_t
 882  885  vn_count(vnode_t *vp)
 883  886  {
 884  887          ASSERT(MUTEX_HELD(&vp->v_lock));
 885  888          return (vp->v_count - vp->v_phantom_count);
 886  889  }

↓ open down ↓

6 lines elided

↑ open up ↑

 887  890  
 888  891  /*
 889  892   * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 890  893   * as a single reference, so v_count is not decremented until the last DNLC hold
 891  894   * is released. This makes it possible to distinguish vnodes that are referenced
 892  895   * only by the DNLC.
 893  896   */
 894  897  void
 895  898  vn_rele_dnlc(vnode_t *vp)
 896  899  {
 897      -        VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 898  900          mutex_enter(&vp->v_lock);
      901 +
 899  902          if (--vp->v_count_dnlc == 0) {
 900  903                  if (vp->v_count == 1) {
 901  904                          mutex_exit(&vp->v_lock);
 902  905                          VOP_INACTIVE(vp, CRED(), NULL);
 903  906                          return;
 904  907                  }
 905  908                  VN_RELE_LOCKED(vp);
      909 +        }else{
      910 +                VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 906  911          }
 907  912          mutex_exit(&vp->v_lock);
 908  913  }
 909  914  
 910  915  /*
 911  916   * Like vn_rele() except that it clears v_stream under v_lock.
 912  917   * This is used by sockfs when it dismantles the association between
 913  918   * the sockfs node and the vnode in the underlying file system.
 914  919   * v_lock has to be held to prevent a thread coming through the lookupname
 915  920   * path from accessing a stream head that is going away.
 916  921   */
 917  922  void
 918  923  vn_rele_stream(vnode_t *vp)
 919  924  {
 920      -        VERIFY(vp->v_count > 0);
 921  925          mutex_enter(&vp->v_lock);
      926 +
 922  927          vp->v_stream = NULL;
 923  928          if (vp->v_count == 1) {
 924  929                  mutex_exit(&vp->v_lock);
 925  930                  VOP_INACTIVE(vp, CRED(), NULL);
 926  931                  return;
 927  932          }
      933 +        else{
      934 +                VERIFY(vp->v_count > 0);
      935 +        }
 928  936          VN_RELE_LOCKED(vp);
 929  937          mutex_exit(&vp->v_lock);
 930  938  }
 931  939  
 932  940  static void
 933  941  vn_rele_inactive(vnode_t *vp)
 934  942  {
 935  943          VOP_INACTIVE(vp, CRED(), NULL);
 936  944  }
 937  945

 938  946  /*
 939  947   * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 940  948   * asynchronously using a taskq. This can avoid deadlocks caused by re-entering

↓ open down ↓

3 lines elided

↑ open up ↑

 941  949   * the file system as a result of releasing the vnode. Note, file systems
 942  950   * already have to handle the race where the vnode is incremented before the
 943  951   * inactive routine is called and does its locking.
 944  952   *
 945  953   * Warning: Excessive use of this routine can lead to performance problems.
 946  954   * This is because taskqs throttle back allocation if too many are created.
 947  955   */
 948  956  void
 949  957  vn_rele_async(vnode_t *vp, taskq_t *taskq)
 950  958  {
 951      -        VERIFY(vp->v_count > 0);
 952  959          mutex_enter(&vp->v_lock);
 953  960          if (vp->v_count == 1) {
 954  961                  mutex_exit(&vp->v_lock);
 955  962                  VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 956  963                      vp, TQ_SLEEP) != TASKQID_INVALID);
 957  964                  return;
 958  965          }
      966 +        else{
      967 +                VERIFY(vp->v_count > 0);
      968 +        }
 959  969          VN_RELE_LOCKED(vp);
 960  970          mutex_exit(&vp->v_lock);
 961  971  }
 962  972  
 963  973  int
 964  974  vn_open(
 965  975          char *pnamep,
 966  976          enum uio_seg seg,
 967  977          int filemode,
 968  978          int createmode,

 969  979          struct vnode **vpp,
 970  980          enum create crwhy,
 971  981          mode_t umask)
 972  982  {
 973  983          return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 974  984              umask, NULL, -1));
 975  985  }
 976  986  
 977  987  
 978  988  /*
 979  989   * Open/create a vnode.
 980  990   * This may be callable by the kernel, the only known use
 981  991   * of user context being that the current user credentials
 982  992   * are used for permissions.  crwhy is defined iff filemode & FCREAT.
            *
            * pnamep, seg and startvp are passed through to lookupnameat() (plain
            * open) or vn_createat() (create).  filemode is the F* open flag set;
            * createmode becomes va_mode of a file that is created.  crwhy and
            * umask are only handed to vn_createat().  fd is recorded as the owner
            * id (sl_id) of the F_SHARE_NBMAND share reservation that is taken on
            * regular files for which nbl_need_check() is true.
            *
            * Returns 0 with the vnode stored in *vpp, or an errno value.  Once a
            * vnode has been obtained, every failure goes through "out", which
            * undoes the open / share reservation and releases the vnode.  An
            * ESTALE failure restarts from "top" for as long as
            * fs_need_estale_retry() allows.
 983  993   */
 984  994  int
 985  995  vn_openat(
 986  996          char *pnamep,
 987  997          enum uio_seg seg,
 988  998          int filemode,
 989  999          int createmode,
 990 1000          struct vnode **vpp,
 991 1001          enum create crwhy,
 992 1002          mode_t umask,
 993 1003          struct vnode *startvp,
 994 1004          int fd)
 995 1005  {
 996 1006          struct vnode *vp;
 997 1007          int mode;
 998 1008          int accessflags;
 999 1009          int error;
1000 1010          int in_crit = 0;
1001 1011          int open_done = 0;
1002 1012          int shrlock_done = 0;
1003 1013          struct vattr vattr;
1004 1014          enum symfollow follow;
1005 1015          int estale_retry = 0;
1006 1016          struct shrlock shr;
1007 1017          struct shr_locowner shr_own;
1008 1018          boolean_t create;
1009 1019  
                   /* Translate the open flags into the V* access bits to check. */
1010 1020          mode = 0;
1011 1021          accessflags = 0;
1012 1022          if (filemode & FREAD)
1013 1023                  mode |= VREAD;
1014 1024          if (filemode & (FWRITE|FTRUNC))
1015 1025                  mode |= VWRITE;
1016 1026          if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
1017 1027                  mode |= VEXEC;
1018 1028  
1019 1029          /* symlink interpretation */
1020 1030          if (filemode & FNOFOLLOW)
1021 1031                  follow = NO_FOLLOW;
1022 1032          else
1023 1033                  follow = FOLLOW;
1024 1034  
1025 1035          if (filemode & FAPPEND)
1026 1036                  accessflags |= V_APPEND;
1027 1037  
1028 1038          /*
1029 1039           * We need to handle the case of FCREAT | FDIRECTORY and the case of
1030 1040           * FEXCL. If all three are specified, then we always fail because we
1031 1041           * cannot create a directory through this interface and FEXCL says we
1032 1042           * need to fail the request if we can't create it. If, however, only
1033 1043           * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1034 1044           * of opening a file that already exists. If it exists, we can do
1035 1045           * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1036 1046           * treated as FDIRECTORY.
1037 1047           */
1038 1048          if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1039 1049              (FCREAT | FDIRECTORY | FEXCL)) {
1040 1050                  return (EINVAL);
1041 1051          }
1042 1052  
1043 1053          if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1044 1054                  create = B_FALSE;
1045 1055          } else if ((filemode & FCREAT) != 0) {
1046 1056                  create = B_TRUE;
1047 1057          } else {
1048 1058                  create = B_FALSE;
1049 1059          }
1050 1060  
1051 1061  top:
1052 1062          if (create) {
1053 1063                  enum vcexcl excl;
1054 1064  
1055 1065                  /*
1056 1066                   * Wish to create a file.
1057 1067                   */
1058 1068                  vattr.va_type = VREG;
1059 1069                  vattr.va_mode = createmode;
1060 1070                  vattr.va_mask = AT_TYPE|AT_MODE;
1061 1071                  if (filemode & FTRUNC) {
1062 1072                          vattr.va_size = 0;
1063 1073                          vattr.va_mask |= AT_SIZE;
1064 1074                  }
1065 1075                  if (filemode & FEXCL)
1066 1076                          excl = EXCL;
1067 1077                  else
1068 1078                          excl = NONEXCL;
1069 1079  
1070 1080                  if (error =
1071 1081                      vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1072 1082                      (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1073 1083                          return (error);
1074 1084          } else {
1075 1085                  /*
1076 1086                   * Wish to open a file.  Just look it up.
1077 1087                   */
1078 1088                  if (error = lookupnameat(pnamep, seg, follow,
1079 1089                      NULLVPP, &vp, startvp)) {
1080 1090                          if ((error == ESTALE) &&
1081 1091                              fs_need_estale_retry(estale_retry++))
1082 1092                                  goto top;
1083 1093                          return (error);
1084 1094                  }
1085 1095  
1086 1096                  /*
1087 1097                   * Get the attributes to check whether file is large.
1088 1098                   * We do this only if the FOFFMAX flag is not set and
1089 1099                   * only for regular files.
1090 1100                   */
1091 1101  
1092 1102                  if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1093 1103                          vattr.va_mask = AT_SIZE;
1094 1104                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1095 1105                              CRED(), NULL))) {
1096 1106                                  goto out;
1097 1107                          }
1098 1108                          if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1099 1109                                  /*
1100 1110                                   * Large File API - regular open fails
1101 1111                                   * if FOFFMAX flag is not set in file mode
                                            * and the file is larger than MAXOFF32_T.
1102 1112                                   */
1103 1113                                  error = EOVERFLOW;
1104 1114                                  goto out;
1105 1115                          }
1106 1116                  }
1107 1117                  /*
1108 1118                   * Can't write directories, active texts, or
1109 1119                   * read-only filesystems.  Can't truncate files
1110 1120                   * on which mandatory locking is in effect.
1111 1121                   */
1112 1122                  if (filemode & (FWRITE|FTRUNC)) {
1113 1123                          /*
1114 1124                           * Allow writable directory if VDIROPEN flag is set.
1115 1125                           */
1116 1126                          if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1117 1127                                  error = EISDIR;
1118 1128                                  goto out;
1119 1129                          }
1120 1130                          if (ISROFILE(vp)) {
1121 1131                                  error = EROFS;
1122 1132                                  goto out;
1123 1133                          }
1124 1134                          /*
1125 1135                           * Can't truncate files on which
1126 1136                           * sysv mandatory locking is in effect.
1127 1137                           */
1128 1138                          if (filemode & FTRUNC) {
1129 1139                                  vnode_t *rvp;
1130 1140  
1131 1141                                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1132 1142                                          rvp = vp;
1133 1143                                  if (rvp->v_filocks != NULL) {
1134 1144                                          vattr.va_mask = AT_MODE;
1135 1145                                          if ((error = VOP_GETATTR(vp,
1136 1146                                              &vattr, 0, CRED(), NULL)) == 0 &&
1137 1147                                              MANDLOCK(vp, vattr.va_mode))
1138 1148                                                  error = EAGAIN;
1139 1149                                  }
1140 1150                          }
1141 1151                          if (error)
1142 1152                                  goto out;
1143 1153                  }
1144 1154                  /*
1145 1155                   * Check permissions.
1146 1156                   */
1147 1157                  if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1148 1158                          goto out;
1149 1159  
1150 1160                  /*
1151 1161                   * Require FSEARCH and FDIRECTORY to return a directory. Require
1152 1162                   * FEXEC to return a regular file.
1153 1163                   */
1154 1164                  if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1155 1165                      vp->v_type != VDIR) {
1156 1166                          error = ENOTDIR;
1157 1167                          goto out;
1158 1168                  }
1159 1169                  if ((filemode & FEXEC) && vp->v_type != VREG) {
1160 1170                          error = ENOEXEC;        /* XXX: error code? */
1161 1171                          goto out;
1162 1172                  }
1163 1173          }
1164 1174  
1165 1175          /*
1166 1176           * Do remaining checks for FNOFOLLOW and FNOLINKS.
1167 1177           */
1168 1178          if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1169 1179                  /*
1170 1180                   * The __FLXPATH flag is a private interface for use by the lx
1171 1181                   * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
1172 1182                   * when a symbolic link is encountered, returns a file
1173 1183                   * descriptor which references it.
1174 1184                   * See uts/common/brand/lx/syscall/lx_open.c
1175 1185                   *
1176 1186                   * When this flag is set, VOP_OPEN() is not called (for a
1177 1187                   * symlink, most filesystems will return ENOSYS anyway)
1178 1188                   * and the link's vnode is returned to be linked to the
1179 1189                   * file descriptor.
1180 1190                   */
1181 1191                  if ((filemode & __FLXPATH) == 0)
1182 1192                          error = ELOOP;
1183 1193                  goto out;
1184 1194          }
1185 1195          if (filemode & FNOLINKS) {
1186 1196                  vattr.va_mask = AT_NLINK;
1187 1197                  if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1188 1198                          goto out;
1189 1199                  }
1190 1200                  if (vattr.va_nlink != 1) {
1191 1201                          error = EMLINK;
1192 1202                          goto out;
1193 1203                  }
1194 1204          }
1195 1205  
1196 1206          /*
1197 1207           * Opening a socket corresponding to the AF_UNIX pathname
1198 1208           * in the filesystem name space is not supported.
1199 1209           * However, VSOCK nodes in namefs are supported in order
1200 1210           * to make fattach work for sockets.
1201 1211           *
1202 1212           * XXX This uses VOP_REALVP to distinguish between
1203 1213           * an unopened namefs node (where VOP_REALVP returns a
1204 1214           * different VSOCK vnode) and a VSOCK created by vn_create
1205 1215           * in some file system (where VOP_REALVP would never return
1206 1216           * a different vnode).
1207 1217           */
1208 1218          if (vp->v_type == VSOCK) {
1209 1219                  struct vnode *nvp;
1210 1220  
1211 1221                  error = VOP_REALVP(vp, &nvp, NULL);
1212 1222                  if (error != 0 || nvp == NULL || nvp == vp ||
1213 1223                      nvp->v_type != VSOCK) {
1214 1224                          error = EOPNOTSUPP;
1215 1225                          goto out;
1216 1226                  }
1217 1227          }
1218 1228  
1219 1229          if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1220 1230                  /* get share reservation */
1221 1231                  shr.s_access = 0;
1222 1232                  if (filemode & FWRITE)
1223 1233                          shr.s_access |= F_WRACC;
1224 1234                  if (filemode & FREAD)
1225 1235                          shr.s_access |= F_RDACC;
1226 1236                  shr.s_deny = 0;
1227 1237                  shr.s_sysid = 0;
1228 1238                  shr.s_pid = ttoproc(curthread)->p_pid;
1229 1239                  shr_own.sl_pid = shr.s_pid;
1230 1240                  shr_own.sl_id = fd;
1231 1241                  shr.s_own_len = sizeof (shr_own);
1232 1242                  shr.s_owner = (caddr_t)&shr_own;
1233 1243                  error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1234 1244                      NULL);
1235 1245                  if (error)
1236 1246                          goto out;
1237 1247                  shrlock_done = 1;
1238 1248  
1239 1249                  /* nbmand conflict check if truncating file */
1240 1250                  if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1241 1251                          nbl_start_crit(vp, RW_READER);
1242 1252                          in_crit = 1;
1243 1253  
1244 1254                          vattr.va_mask = AT_SIZE;
1245 1255                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1246 1256                                  goto out;
1247 1257                          if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1248 1258                              NULL)) {
1249 1259                                  error = EACCES;
1250 1260                                  goto out;
1251 1261                          }
1252 1262                  }
1253 1263          }
1254 1264  
1255 1265          /*
1256 1266           * Do opening protocol.
1257 1267           */
1258 1268          error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1259 1269          if (error)
1260 1270                  goto out;
1261 1271          open_done = 1;
1262 1272  
1263 1273          /*
1264 1274           * Truncate if required.
1265 1275           */
1266 1276          if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1267 1277                  vattr.va_size = 0;
1268 1278                  vattr.va_mask = AT_SIZE;
1269 1279                  if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1270 1280                          goto out;
1271 1281          }
1272 1282  
1273 1283          /*
1274 1284           * Turn on directio, if requested.
1275 1285           */
1276 1286          if (filemode & FDIRECT) {
1277 1287                  if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1278 1288                      CRED(), NULL, NULL)) != 0) {
1279 1289                          /*
1280 1290                           * On Linux, O_DIRECT returns EINVAL when the file
1281 1291                           * system does not support directio, so we'll do the
1282 1292                           * same.
1283 1293                           */
1284 1294                          error = EINVAL;
1285 1295                          goto out;
1286 1296                  }
1287 1297          }
1288 1298  out:
1289 1299          ASSERT(vp->v_count > 0);
1290 1300  
1291 1301          if (in_crit) {
1292 1302                  nbl_end_crit(vp);
1293 1303                  in_crit = 0;
1294 1304          }
1295 1305          if (error) {
                           /*
                            * open_done also clears shrlock_done, so the F_UNSHARE
                            * below is skipped once VOP_CLOSE has been called
                            * (presumably close drops the share reservation itself
                            * -- confirm against the filesystem close routines).
                            */
1296 1306                  if (open_done) {
1297 1307                          (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1298 1308                              NULL);
1299 1309                          open_done = 0;
1300 1310                          shrlock_done = 0;
1301 1311                  }
1302 1312                  if (shrlock_done) {
1303 1313                          (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1304 1314                              NULL);
1305 1315                          shrlock_done = 0;
1306 1316                  }
1307 1317  
1308 1318                  /*
1309 1319                   * The following clause was added to handle a problem
1310 1320                   * with NFS consistency.  It is possible that a lookup
1311 1321                   * of the file to be opened succeeded, but the file
1312 1322                   * itself doesn't actually exist on the server.  This
1313 1323                   * is chiefly due to the DNLC containing an entry for
1314 1324                   * the file which has been removed on the server.  In
1315 1325                   * this case, we just start over.  If there was some
1316 1326                   * other cause for the ESTALE error, then the lookup
1317 1327                   * of the file will fail and the error will be returned
1318 1328                   * above instead of looping around from here.
1319 1329                   */
1320 1330                  VN_RELE(vp);
1321 1331                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1322 1332                          goto top;
1323 1333          } else
1324 1334                  *vpp = vp;
1325 1335          return (error);
1326 1336  }
1327 1337  
1328 1338  /*
1329 1339   * The following two accessor functions are for the NFSv4 server.  Since there
1330 1340   * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1331 1341   * vnode open counts correct when a client "upgrades" an open or does an
1332 1342   * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1333 1343   * open mode (add or subtract read or write), but also change the share/deny
1334 1344   * modes.  However, share reservations are not integrated with OPEN, yet, so
1335 1345   * we need to handle each separately.  These functions are cleaner than having
1336 1346   * the NFS server manipulate the counts directly, however, nobody else should
1337 1347   * use these functions.
1338 1348   */
1339 1349  void
1340 1350  vn_open_upgrade(
1341 1351          vnode_t *vp,
1342 1352          int filemode)
1343 1353  {
1344 1354          ASSERT(vp->v_type == VREG);
1345 1355  
1346 1356          if (filemode & FREAD)
1347 1357                  atomic_inc_32(&vp->v_rdcnt);
1348 1358          if (filemode & FWRITE)
1349 1359                  atomic_inc_32(&vp->v_wrcnt);
1350 1360  
1351 1361  }
1352 1362  
1353 1363  void
1354 1364  vn_open_downgrade(
1355 1365          vnode_t *vp,
1356 1366          int filemode)
1357 1367  {
1358 1368          ASSERT(vp->v_type == VREG);
1359 1369  
1360 1370          if (filemode & FREAD) {
1361 1371                  ASSERT(vp->v_rdcnt > 0);
1362 1372                  atomic_dec_32(&vp->v_rdcnt);
1363 1373          }
1364 1374          if (filemode & FWRITE) {
1365 1375                  ASSERT(vp->v_wrcnt > 0);
1366 1376                  atomic_dec_32(&vp->v_wrcnt);
1367 1377          }
1368 1378  
1369 1379  }
1370 1380  
1371 1381  int
1372 1382  vn_create(
1373 1383          char *pnamep,
1374 1384          enum uio_seg seg,
1375 1385          struct vattr *vap,
1376 1386          enum vcexcl excl,
1377 1387          int mode,
1378 1388          struct vnode **vpp,
1379 1389          enum create why,
1380 1390          int flag,
1381 1391          mode_t umask)
1382 1392  {
1383 1393          return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1384 1394              umask, NULL));
1385 1395  }
1386 1396  
1387 1397  /*
1388 1398   * Create a vnode (makenode).
1389 1399   */
1390 1400  int
1391 1401  vn_createat(
1392 1402          char *pnamep,
1393 1403          enum uio_seg seg,
1394 1404          struct vattr *vap,
1395 1405          enum vcexcl excl,
1396 1406          int mode,
1397 1407          struct vnode **vpp,
1398 1408          enum create why,
1399 1409          int flag,
1400 1410          mode_t umask,
1401 1411          struct vnode *startvp)
1402 1412  {
1403 1413          struct vnode *dvp;      /* ptr to parent dir vnode */
1404 1414          struct vnode *vp = NULL;
1405 1415          struct pathname pn;
1406 1416          int error;
1407 1417          int in_crit = 0;
1408 1418          struct vattr vattr;
1409 1419          enum symfollow follow;
1410 1420          int estale_retry = 0;
1411 1421          uint32_t auditing = AU_AUDITING();
1412 1422  
1413 1423          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1414 1424  
1415 1425          /* symlink interpretation */
1416 1426          if ((flag & FNOFOLLOW) || excl == EXCL)
1417 1427                  follow = NO_FOLLOW;
1418 1428          else
1419 1429                  follow = FOLLOW;
1420 1430          flag &= ~(FNOFOLLOW|FNOLINKS);
1421 1431  
1422 1432  top:
1423 1433          /*
1424 1434           * Lookup directory.
1425 1435           * If new object is a file, call lower level to create it.
1426 1436           * Note that it is up to the lower level to enforce exclusive
1427 1437           * creation, if the file is already there.
1428 1438           * This allows the lower level to do whatever
1429 1439           * locking or protocol that is needed to prevent races.
1430 1440           * If the new object is directory call lower level to make
1431 1441           * the new directory, with "." and "..".
1432 1442           */
1433 1443          if (error = pn_get(pnamep, seg, &pn))
1434 1444                  return (error);
1435 1445          if (auditing)
1436 1446                  audit_vncreate_start();
1437 1447          dvp = NULL;
1438 1448          *vpp = NULL;
1439 1449          /*
1440 1450           * lookup will find the parent directory for the vnode.
1441 1451           * When it is done the pn holds the name of the entry
1442 1452           * in the directory.
1443 1453           * If this is a non-exclusive create we also find the node itself.
1444 1454           */
1445 1455          error = lookuppnat(&pn, NULL, follow, &dvp,
1446 1456              (excl == EXCL) ? NULLVPP : vpp, startvp);
1447 1457          if (error) {
1448 1458                  pn_free(&pn);
1449 1459                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1450 1460                          goto top;
1451 1461                  if (why == CRMKDIR && error == EINVAL)
1452 1462                          error = EEXIST;         /* SVID */
1453 1463                  return (error);
1454 1464          }
1455 1465  
1456 1466          if (why != CRMKNOD)
1457 1467                  vap->va_mode &= ~VSVTX;
1458 1468  
1459 1469          /*
1460 1470           * If default ACLs are defined for the directory don't apply the
1461 1471           * umask if umask is passed.
1462 1472           */
1463 1473  
1464 1474          if (umask) {
1465 1475  
1466 1476                  vsecattr_t vsec;
1467 1477  
1468 1478                  vsec.vsa_aclcnt = 0;
1469 1479                  vsec.vsa_aclentp = NULL;
1470 1480                  vsec.vsa_dfaclcnt = 0;
1471 1481                  vsec.vsa_dfaclentp = NULL;
1472 1482                  vsec.vsa_mask = VSA_DFACLCNT;
1473 1483                  error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1474 1484                  /*
1475 1485                   * If error is ENOSYS then treat it as no error
1476 1486                   * Don't want to force all file systems to support
1477 1487                   * aclent_t style of ACL's.
1478 1488                   */
1479 1489                  if (error == ENOSYS)
1480 1490                          error = 0;
1481 1491                  if (error) {
1482 1492                          if (*vpp != NULL)
1483 1493                                  VN_RELE(*vpp);
1484 1494                          goto out;
1485 1495                  } else {
1486 1496                          /*
1487 1497                           * Apply the umask if no default ACLs.
1488 1498                           */
1489 1499                          if (vsec.vsa_dfaclcnt == 0)
1490 1500                                  vap->va_mode &= ~umask;
1491 1501  
1492 1502                          /*
1493 1503                           * VOP_GETSECATTR() may have allocated memory for
1494 1504                           * ACLs we didn't request, so double-check and
1495 1505                           * free it if necessary.
1496 1506                           */
1497 1507                          if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1498 1508                                  kmem_free((caddr_t)vsec.vsa_aclentp,
1499 1509                                      vsec.vsa_aclcnt * sizeof (aclent_t));
1500 1510                          if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1501 1511                                  kmem_free((caddr_t)vsec.vsa_dfaclentp,
1502 1512                                      vsec.vsa_dfaclcnt * sizeof (aclent_t));
1503 1513                  }
1504 1514          }
1505 1515  
1506 1516          /*
1507 1517           * In general we want to generate EROFS if the file system is
1508 1518           * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1509 1519           * documents the open system call, and it says that O_CREAT has no
1510 1520           * effect if the file already exists.  Bug 1119649 states
1511 1521           * that open(path, O_CREAT, ...) fails when attempting to open an
1512 1522           * existing file on a read only file system.  Thus, the first part
1513 1523           * of the following if statement has 3 checks:
1514 1524           *      if the file exists &&
1515 1525           *              it is being open with write access &&
1516 1526           *              the file system is read only
1517 1527           *      then generate EROFS
1518 1528           */
1519 1529          if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1520 1530              (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1521 1531                  if (*vpp)
1522 1532                          VN_RELE(*vpp);
1523 1533                  error = EROFS;
1524 1534          } else if (excl == NONEXCL && *vpp != NULL) {
1525 1535                  vnode_t *rvp;
1526 1536  
1527 1537                  /*
1528 1538                   * File already exists.  If a mandatory lock has been
1529 1539                   * applied, return error.
1530 1540                   */
1531 1541                  vp = *vpp;
1532 1542                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1533 1543                          rvp = vp;
1534 1544                  if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1535 1545                          nbl_start_crit(vp, RW_READER);
1536 1546                          in_crit = 1;
1537 1547                  }
1538 1548                  if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1539 1549                          vattr.va_mask = AT_MODE|AT_SIZE;
1540 1550                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1541 1551                                  goto out;
1542 1552                          }
1543 1553                          if (MANDLOCK(vp, vattr.va_mode)) {
1544 1554                                  error = EAGAIN;
1545 1555                                  goto out;
1546 1556                          }
1547 1557                          /*
1548 1558                           * File cannot be truncated if non-blocking mandatory
1549 1559                           * locks are currently on the file.
1550 1560                           */
1551 1561                          if ((vap->va_mask & AT_SIZE) && in_crit) {
1552 1562                                  u_offset_t offset;
1553 1563                                  ssize_t length;
1554 1564  
1555 1565                                  offset = vap->va_size > vattr.va_size ?
1556 1566                                      vattr.va_size : vap->va_size;
1557 1567                                  length = vap->va_size > vattr.va_size ?
1558 1568                                      vap->va_size - vattr.va_size :
1559 1569                                      vattr.va_size - vap->va_size;
1560 1570                                  if (nbl_conflict(vp, NBL_WRITE, offset,
1561 1571                                      length, 0, NULL)) {
1562 1572                                          error = EACCES;
1563 1573                                          goto out;
1564 1574                                  }
1565 1575                          }
1566 1576                  }
1567 1577  
1568 1578                  /*
1569 1579                   * If the file is the root of a VFS, we've crossed a
1570 1580                   * mount point and the "containing" directory that we
1571 1581                   * acquired above (dvp) is irrelevant because it's in
1572 1582                   * a different file system.  We apply VOP_CREATE to the
1573 1583                   * target itself instead of to the containing directory
1574 1584                   * and supply a null path name to indicate (conventionally)
1575 1585                   * the node itself as the "component" of interest.
1576 1586                   *
1577 1587                   * The call to VOP_CREATE() is necessary to ensure
1578 1588                   * that the appropriate permission checks are made,
1579 1589                   * i.e. EISDIR, EACCES, etc.  We already know that vpp
1580 1590                   * exists since we are in the else condition where this
1581 1591                   * was checked.
1582 1592                   */
1583 1593                  if (vp->v_flag & VROOT) {
1584 1594                          ASSERT(why != CRMKDIR);
1585 1595                          error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1586 1596                              CRED(), flag, NULL, NULL);
1587 1597                          /*
1588 1598                           * If the create succeeded, it will have created a
1589 1599                           * new reference on a new vnode (*vpp) in the child
1590 1600                           * file system, so we want to drop our reference on
1591 1601                           * the old (vp) upon exit.
1592 1602                           */
1593 1603                          goto out;
1594 1604                  }
1595 1605  
1596 1606                  /*
1597 1607                   * Large File API - non-large open (FOFFMAX flag not set)
1598 1608                   * of regular file fails if the file size exceeds MAXOFF32_T.
1599 1609                   */
1600 1610                  if (why != CRMKDIR &&
1601 1611                      !(flag & FOFFMAX) &&
1602 1612                      (vp->v_type == VREG)) {
1603 1613                          vattr.va_mask = AT_SIZE;
1604 1614                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1605 1615                              CRED(), NULL))) {
1606 1616                                  goto out;
1607 1617                          }
1608 1618                          if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1609 1619                                  error = EOVERFLOW;
1610 1620                                  goto out;
1611 1621                          }
1612 1622                  }
1613 1623          }
1614 1624  
1615 1625          if (error == 0) {
1616 1626                  /*
1617 1627                   * Call mkdir() if specified, otherwise create().
1618 1628                   */
1619 1629                  int must_be_dir = pn_fixslash(&pn);     /* trailing '/'? */
1620 1630  
1621 1631                  if (why == CRMKDIR)
1622 1632                          /*
1623 1633                           * N.B., if vn_createat() ever requests
1624 1634                           * case-insensitive behavior then it will need
1625 1635                           * to be passed to VOP_MKDIR().  VOP_CREATE()
1626 1636                           * will already get it via "flag"
1627 1637                           */
1628 1638                          error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1629 1639                              NULL, 0, NULL);
1630 1640                  else if (!must_be_dir)
1631 1641                          error = VOP_CREATE(dvp, pn.pn_path, vap,
1632 1642                              excl, mode, vpp, CRED(), flag, NULL, NULL);
1633 1643                  else
1634 1644                          error = ENOTDIR;
1635 1645          }
1636 1646  
1637 1647  out:
1638 1648  
1639 1649          if (auditing)
1640 1650                  audit_vncreate_finish(*vpp, error);
1641 1651          if (in_crit) {
1642 1652                  nbl_end_crit(vp);
1643 1653                  in_crit = 0;
1644 1654          }
1645 1655          if (vp != NULL) {
1646 1656                  VN_RELE(vp);
1647 1657                  vp = NULL;
1648 1658          }
1649 1659          pn_free(&pn);
1650 1660          VN_RELE(dvp);
1651 1661          /*
1652 1662           * The following clause was added to handle a problem
1653 1663           * with NFS consistency.  It is possible that a lookup
1654 1664           * of the file to be created succeeded, but the file
1655 1665           * itself doesn't actually exist on the server.  This
1656 1666           * is chiefly due to the DNLC containing an entry for
1657 1667           * the file which has been removed on the server.  In
1658 1668           * this case, we just start over.  If there was some
1659 1669           * other cause for the ESTALE error, then the lookup
1660 1670           * of the file will fail and the error will be returned
1661 1671           * above instead of looping around from here.
1662 1672           */
1663 1673          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1664 1674                  goto top;
1665 1675          return (error);
1666 1676  }
1667 1677  
1668 1678  int
1669 1679  vn_link(char *from, char *to, enum uio_seg seg)
1670 1680  {
1671 1681          return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1672 1682  }
1673 1683  
1674 1684  int
1675 1685  vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1676 1686      vnode_t *tstartvp, char *to, enum uio_seg seg)
1677 1687  {
1678 1688          struct vnode *fvp;              /* from vnode ptr */
1679 1689          struct vnode *tdvp;             /* to directory vnode ptr */
1680 1690          struct pathname pn;
1681 1691          int error;
1682 1692          struct vattr vattr;
1683 1693          dev_t fsid;
1684 1694          int estale_retry = 0;
1685 1695          uint32_t auditing = AU_AUDITING();
1686 1696  
1687 1697  top:
1688 1698          fvp = tdvp = NULL;
1689 1699          if (error = pn_get(to, seg, &pn))
1690 1700                  return (error);
1691 1701          if (auditing && fstartvp != NULL)
1692 1702                  audit_setfsat_path(1);
1693 1703          if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1694 1704                  goto out;
1695 1705          if (auditing && tstartvp != NULL)
1696 1706                  audit_setfsat_path(3);
1697 1707          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1698 1708                  goto out;
1699 1709          /*
1700 1710           * Make sure both source vnode and target directory vnode are
1701 1711           * in the same vfs and that it is writeable.
1702 1712           */
1703 1713          vattr.va_mask = AT_FSID;
1704 1714          if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1705 1715                  goto out;
1706 1716          fsid = vattr.va_fsid;
1707 1717          vattr.va_mask = AT_FSID;
1708 1718          if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1709 1719                  goto out;
1710 1720          if (fsid != vattr.va_fsid) {
1711 1721                  error = EXDEV;
1712 1722                  goto out;
1713 1723          }
1714 1724          if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1715 1725                  error = EROFS;
1716 1726                  goto out;
1717 1727          }
1718 1728          /*
1719 1729           * Do the link.
1720 1730           */
1721 1731          (void) pn_fixslash(&pn);
1722 1732          error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1723 1733  out:
1724 1734          pn_free(&pn);
1725 1735          if (fvp)
1726 1736                  VN_RELE(fvp);
1727 1737          if (tdvp)
1728 1738                  VN_RELE(tdvp);
1729 1739          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1730 1740                  goto top;
1731 1741          return (error);
1732 1742  }
1733 1743  
1734 1744  int
1735 1745  vn_rename(char *from, char *to, enum uio_seg seg)
1736 1746  {
1737 1747          return (vn_renameat(NULL, from, NULL, to, seg));
1738 1748  }
1739 1749  
1740 1750  int
1741 1751  vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1742 1752      char *tname, enum uio_seg seg)
1743 1753  {
1744 1754          int error;
1745 1755          struct vattr vattr;
1746 1756          struct pathname fpn;            /* from pathname */
1747 1757          struct pathname tpn;            /* to pathname */
1748 1758          dev_t fsid;
1749 1759          int in_crit_src, in_crit_targ;
1750 1760          vnode_t *fromvp, *fvp;
1751 1761          vnode_t *tovp, *targvp;
1752 1762          int estale_retry = 0;
1753 1763          uint32_t auditing = AU_AUDITING();
1754 1764  
1755 1765  top:
1756 1766          fvp = fromvp = tovp = targvp = NULL;
1757 1767          in_crit_src = in_crit_targ = 0;
1758 1768          /*
1759 1769           * Get to and from pathnames.
1760 1770           */
1761 1771          if (error = pn_get(fname, seg, &fpn))
1762 1772                  return (error);
1763 1773          if (error = pn_get(tname, seg, &tpn)) {
1764 1774                  pn_free(&fpn);
1765 1775                  return (error);
1766 1776          }
1767 1777  
1768 1778          /*
1769 1779           * First we need to resolve the correct directories
1770 1780           * The passed in directories may only be a starting point,
1771 1781           * but we need the real directories the file(s) live in.
1772 1782           * For example the fname may be something like usr/lib/sparc
1773 1783           * and we were passed in the / directory, but we need to
1774 1784           * use the lib directory for the rename.
1775 1785           */
1776 1786  
1777 1787          if (auditing && fdvp != NULL)
1778 1788                  audit_setfsat_path(1);
1779 1789          /*
1780 1790           * Lookup to and from directories.
1781 1791           */
1782 1792          if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1783 1793                  goto out;
1784 1794          }
1785 1795  
1786 1796          /*
1787 1797           * Make sure there is an entry.
1788 1798           */
1789 1799          if (fvp == NULL) {
1790 1800                  error = ENOENT;
1791 1801                  goto out;
1792 1802          }
1793 1803  
1794 1804          if (auditing && tdvp != NULL)
1795 1805                  audit_setfsat_path(3);
1796 1806          if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1797 1807                  goto out;
1798 1808          }
1799 1809  
1800 1810          /*
1801 1811           * Make sure both the from vnode directory and the to directory
1802 1812           * are in the same vfs and the to directory is writable.
1803 1813           * We check fsid's, not vfs pointers, so loopback fs works.
1804 1814           */
1805 1815          if (fromvp != tovp) {
1806 1816                  vattr.va_mask = AT_FSID;
1807 1817                  if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1808 1818                          goto out;
1809 1819                  fsid = vattr.va_fsid;
1810 1820                  vattr.va_mask = AT_FSID;
1811 1821                  if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1812 1822                          goto out;
1813 1823                  if (fsid != vattr.va_fsid) {
1814 1824                          error = EXDEV;
1815 1825                          goto out;
1816 1826                  }
1817 1827          }
1818 1828  
1819 1829          if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1820 1830                  error = EROFS;
1821 1831                  goto out;
1822 1832          }
1823 1833  
1824 1834          /*
1825 1835           * Make sure "from" vp is not a mount point.
1826 1836           * Note, lookup did traverse() already, so
1827 1837           * we'll be looking at the mounted FS root.
1828 1838           * (but allow files like mnttab)
1829 1839           */
1830 1840          if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1831 1841                  error = EBUSY;
1832 1842                  goto out;
1833 1843          }
1834 1844  
1835 1845          if (targvp && (fvp != targvp)) {
1836 1846                  nbl_start_crit(targvp, RW_READER);
1837 1847                  in_crit_targ = 1;
1838 1848                  if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1839 1849                          error = EACCES;
1840 1850                          goto out;
1841 1851                  }
1842 1852          }
1843 1853  
1844 1854          if (nbl_need_check(fvp)) {
1845 1855                  nbl_start_crit(fvp, RW_READER);
1846 1856                  in_crit_src = 1;
1847 1857                  if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1848 1858                          error = EACCES;
1849 1859                          goto out;
1850 1860                  }
1851 1861          }
1852 1862  
1853 1863          /*
1854 1864           * Do the rename.
1855 1865           */
1856 1866          (void) pn_fixslash(&tpn);
1857 1867          error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1858 1868              NULL, 0);
1859 1869  
1860 1870  out:
1861 1871          pn_free(&fpn);
1862 1872          pn_free(&tpn);
1863 1873          if (in_crit_src)
1864 1874                  nbl_end_crit(fvp);
1865 1875          if (in_crit_targ)
1866 1876                  nbl_end_crit(targvp);
1867 1877          if (fromvp)
1868 1878                  VN_RELE(fromvp);
1869 1879          if (tovp)
1870 1880                  VN_RELE(tovp);
1871 1881          if (targvp)
1872 1882                  VN_RELE(targvp);
1873 1883          if (fvp)
1874 1884                  VN_RELE(fvp);
1875 1885          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1876 1886                  goto top;
1877 1887          return (error);
1878 1888  }
1879 1889  
1880 1890  /*
1881 1891   * Remove a file or directory.
1882 1892   */
1883 1893  int
1884 1894  vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1885 1895  {
1886 1896          return (vn_removeat(NULL, fnamep, seg, dirflag));
1887 1897  }
1888 1898  
1889 1899  int
1890 1900  vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1891 1901  {
1892 1902          struct vnode *vp;               /* entry vnode */
1893 1903          struct vnode *dvp;              /* ptr to parent dir vnode */
1894 1904          struct vnode *coveredvp;
1895 1905          struct pathname pn;             /* name of entry */
1896 1906          enum vtype vtype;
1897 1907          int error;
1898 1908          struct vfs *vfsp;
1899 1909          struct vfs *dvfsp;      /* ptr to parent dir vfs */
1900 1910          int in_crit = 0;
1901 1911          int estale_retry = 0;
1902 1912  
1903 1913  top:
1904 1914          if (error = pn_get(fnamep, seg, &pn))
1905 1915                  return (error);
1906 1916          dvp = vp = NULL;
1907 1917          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1908 1918                  pn_free(&pn);
1909 1919                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1910 1920                          goto top;
1911 1921                  return (error);
1912 1922          }
1913 1923  
1914 1924          /*
1915 1925           * Make sure there is an entry.
1916 1926           */
1917 1927          if (vp == NULL) {
1918 1928                  error = ENOENT;
1919 1929                  goto out;
1920 1930          }
1921 1931  
1922 1932          vfsp = vp->v_vfsp;
1923 1933          dvfsp = dvp->v_vfsp;
1924 1934  
1925 1935          /*
1926 1936           * If the named file is the root of a mounted filesystem, fail,
1927 1937           * unless it's marked unlinkable.  In that case, unmount the
1928 1938           * filesystem and proceed to unlink the covered vnode.  (If the
1929 1939           * covered vnode is a directory, use rmdir instead of unlink,
1930 1940           * to avoid file system corruption.)
1931 1941           */
1932 1942          if (vp->v_flag & VROOT) {
1933 1943                  if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1934 1944                          error = EBUSY;
1935 1945                          goto out;
1936 1946                  }
1937 1947  
1938 1948                  /*
1939 1949                   * Namefs specific code starts here.
1940 1950                   */
1941 1951  
1942 1952                  if (dirflag == RMDIRECTORY) {
1943 1953                          /*
1944 1954                           * User called rmdir(2) on a file that has
1945 1955                           * been namefs mounted on top of.  Since
1946 1956                           * namefs doesn't allow directories to
1947 1957                           * be mounted on other files we know
1948 1958                           * vp is not of type VDIR so fail to operation.
1949 1959                           */
1950 1960                          error = ENOTDIR;
1951 1961                          goto out;
1952 1962                  }
1953 1963  
1954 1964                  /*
1955 1965                   * If VROOT is still set after grabbing vp->v_lock,
1956 1966                   * noone has finished nm_unmount so far and coveredvp
1957 1967                   * is valid.
1958 1968                   * If we manage to grab vn_vfswlock(coveredvp) before releasing
1959 1969                   * vp->v_lock, any race window is eliminated.
1960 1970                   */
1961 1971  
1962 1972                  mutex_enter(&vp->v_lock);
1963 1973                  if ((vp->v_flag & VROOT) == 0) {
1964 1974                          /* Someone beat us to the unmount */
1965 1975                          mutex_exit(&vp->v_lock);
1966 1976                          error = EBUSY;
1967 1977                          goto out;
1968 1978                  }
1969 1979                  vfsp = vp->v_vfsp;
1970 1980                  coveredvp = vfsp->vfs_vnodecovered;
1971 1981                  ASSERT(coveredvp);
1972 1982                  /*
1973 1983                   * Note: Implementation of vn_vfswlock shows that ordering of
1974 1984                   * v_lock / vn_vfswlock is not an issue here.
1975 1985                   */
1976 1986                  error = vn_vfswlock(coveredvp);
1977 1987                  mutex_exit(&vp->v_lock);
1978 1988  
1979 1989                  if (error)
1980 1990                          goto out;
1981 1991  
1982 1992                  VN_HOLD(coveredvp);
1983 1993                  VN_RELE(vp);
1984 1994                  error = dounmount(vfsp, 0, CRED());
1985 1995  
1986 1996                  /*
1987 1997                   * Unmounted the namefs file system; now get
1988 1998                   * the object it was mounted over.
1989 1999                   */
1990 2000                  vp = coveredvp;
1991 2001                  /*
1992 2002                   * If namefs was mounted over a directory, then
1993 2003                   * we want to use rmdir() instead of unlink().
1994 2004                   */
1995 2005                  if (vp->v_type == VDIR)
1996 2006                          dirflag = RMDIRECTORY;
1997 2007  
1998 2008                  if (error)
1999 2009                          goto out;
2000 2010          }
2001 2011  
2002 2012          /*
2003 2013           * Make sure filesystem is writeable.
2004 2014           * We check the parent directory's vfs in case this is an lofs vnode.
2005 2015           */
2006 2016          if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
2007 2017                  error = EROFS;
2008 2018                  goto out;
2009 2019          }
2010 2020  
2011 2021          vtype = vp->v_type;
2012 2022  
2013 2023          /*
2014 2024           * If there is the possibility of an nbmand share reservation, make
2015 2025           * sure it's okay to remove the file.  Keep a reference to the
2016 2026           * vnode, so that we can exit the nbl critical region after
2017 2027           * calling VOP_REMOVE.
2018 2028           * If there is no possibility of an nbmand share reservation,
2019 2029           * release the vnode reference now.  Filesystems like NFS may
2020 2030           * behave differently if there is an extra reference, so get rid of
2021 2031           * this one.  Fortunately, we can't have nbmand mounts on NFS
2022 2032           * filesystems.
2023 2033           */
2024 2034          if (nbl_need_check(vp)) {
2025 2035                  nbl_start_crit(vp, RW_READER);
2026 2036                  in_crit = 1;
2027 2037                  if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
2028 2038                          error = EACCES;
2029 2039                          goto out;
2030 2040                  }
2031 2041          } else {
2032 2042                  VN_RELE(vp);
2033 2043                  vp = NULL;
2034 2044          }
2035 2045  
2036 2046          if (dirflag == RMDIRECTORY) {
2037 2047                  /*
2038 2048                   * Caller is using rmdir(2), which can only be applied to
2039 2049                   * directories.
2040 2050                   */
2041 2051                  if (vtype != VDIR) {
2042 2052                          error = ENOTDIR;
2043 2053                  } else {
2044 2054                          vnode_t *cwd;
2045 2055                          proc_t *pp = curproc;
2046 2056  
2047 2057                          mutex_enter(&pp->p_lock);
2048 2058                          cwd = PTOU(pp)->u_cdir;
2049 2059                          VN_HOLD(cwd);
2050 2060                          mutex_exit(&pp->p_lock);
2051 2061                          error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
2052 2062                              NULL, 0);
2053 2063                          VN_RELE(cwd);
2054 2064                  }
2055 2065          } else {
2056 2066                  /*
2057 2067                   * Unlink(2) can be applied to anything.
2058 2068                   */
2059 2069                  error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
2060 2070          }
2061 2071  
2062 2072  out:
2063 2073          pn_free(&pn);
2064 2074          if (in_crit) {
2065 2075                  nbl_end_crit(vp);
2066 2076                  in_crit = 0;
2067 2077          }
2068 2078          if (vp != NULL)
2069 2079                  VN_RELE(vp);
2070 2080          if (dvp != NULL)
2071 2081                  VN_RELE(dvp);
2072 2082          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
2073 2083                  goto top;
2074 2084          return (error);
2075 2085  }
2076 2086  
2077 2087  /*
2078 2088   * Utility function to compare equality of vnodes.
2079 2089   * Compare the underlying real vnodes, if there are underlying vnodes.
2080 2090   * This is a more thorough comparison than the VN_CMP() macro provides.
2081 2091   */
2082 2092  int
2083 2093  vn_compare(vnode_t *vp1, vnode_t *vp2)
2084 2094  {
2085 2095          vnode_t *realvp;
2086 2096  
2087 2097          if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2088 2098                  vp1 = realvp;
2089 2099          if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2090 2100                  vp2 = realvp;
2091 2101          return (VN_CMP(vp1, vp2));
2092 2102  }
2093 2103  
/*
 * The number of locks to hash into.  VN_VFSLOCKS_HASH() uses this value as
 * a bit mask, so it must be a power of 2 minus 1.  It has also been
 * suggested that the value should be prime; note that the current value
 * (1023 = 3 * 11 * 31) is not.
 */
#define NUM_BUCKETS     1023

/*
 * One hash bucket: vb_lock is taken around every walk or update of vb_list.
 * The pad array is sized so that the three members add up to 64 bytes.
 */
struct  vn_vfslocks_bucket {
        kmutex_t vb_lock;
        vn_vfslocks_entry_t *vb_list;
        char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * The total number of buckets is NUM_BUCKETS + 1, one for every value the
 * hash mask can produce.
 */

#pragma align   64(vn_vfslocks_buckets)
static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2112 2122  
/*
 * Number of low-order pointer bits discarded before the bucket mask is
 * applied.
 */
#define VN_VFSLOCKS_SHIFT       9

/*
 * Map a vfs/vnode pointer to a bucket index in the range [0, NUM_BUCKETS].
 *
 * The pointer is converted to an unsigned integer before shifting: the
 * result of right-shifting a negative signed value is implementation-
 * defined in C, and a pointer with its top bit set is negative once
 * converted to intptr_t.  Wherever that signed shift was arithmetic the
 * selected bucket is unchanged, since only the bits immediately above
 * VN_VFSLOCKS_SHIFT survive the mask.
 */
#define VN_VFSLOCKS_HASH(vfsvpptr)      \
        ((((uintptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2117 2127  
2118 2128  /*
2119 2129   * vn_vfslocks_getlock() uses an HASH scheme to generate
2120 2130   * rwstlock using vfs/vnode pointer passed to it.
2121 2131   *
2122 2132   * vn_vfslocks_rele() releases a reference in the
2123 2133   * HASH table which allows the entry allocated by
2124 2134   * vn_vfslocks_getlock() to be freed at a later
2125 2135   * stage when the refcount drops to zero.
2126 2136   */
2127 2137  
2128 2138  vn_vfslocks_entry_t *
2129 2139  vn_vfslocks_getlock(void *vfsvpptr)
2130 2140  {
2131 2141          struct vn_vfslocks_bucket *bp;
2132 2142          vn_vfslocks_entry_t *vep;
2133 2143          vn_vfslocks_entry_t *tvep;
2134 2144  
2135 2145          ASSERT(vfsvpptr != NULL);
2136 2146          bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2137 2147  
2138 2148          mutex_enter(&bp->vb_lock);
2139 2149          for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2140 2150                  if (vep->ve_vpvfs == vfsvpptr) {
2141 2151                          vep->ve_refcnt++;
2142 2152                          mutex_exit(&bp->vb_lock);
2143 2153                          return (vep);
2144 2154                  }
2145 2155          }
2146 2156          mutex_exit(&bp->vb_lock);
2147 2157          vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2148 2158          rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2149 2159          vep->ve_vpvfs = (char *)vfsvpptr;
2150 2160          vep->ve_refcnt = 1;
2151 2161          mutex_enter(&bp->vb_lock);
2152 2162          for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2153 2163                  if (tvep->ve_vpvfs == vfsvpptr) {
2154 2164                          tvep->ve_refcnt++;
2155 2165                          mutex_exit(&bp->vb_lock);
2156 2166  
2157 2167                          /*
2158 2168                           * There is already an entry in the hash
2159 2169                           * destroy what we just allocated.
2160 2170                           */
2161 2171                          rwst_destroy(&vep->ve_lock);
2162 2172                          kmem_free(vep, sizeof (*vep));
2163 2173                          return (tvep);
2164 2174                  }
2165 2175          }
2166 2176          vep->ve_next = bp->vb_list;
2167 2177          bp->vb_list = vep;
2168 2178          mutex_exit(&bp->vb_lock);
2169 2179          return (vep);
2170 2180  }
2171 2181  
2172 2182  void
2173 2183  vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2174 2184  {
2175 2185          struct vn_vfslocks_bucket *bp;
2176 2186          vn_vfslocks_entry_t *vep;
2177 2187          vn_vfslocks_entry_t *pvep;
2178 2188  
2179 2189          ASSERT(vepent != NULL);
2180 2190          ASSERT(vepent->ve_vpvfs != NULL);
2181 2191  
2182 2192          bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2183 2193  
2184 2194          mutex_enter(&bp->vb_lock);
2185 2195          vepent->ve_refcnt--;
2186 2196  
2187 2197          if ((int32_t)vepent->ve_refcnt < 0)
2188 2198                  cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2189 2199  
2190 2200          pvep = NULL;
2191 2201          if (vepent->ve_refcnt == 0) {
2192 2202                  for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2193 2203                          if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2194 2204                                  if (pvep == NULL)
2195 2205                                          bp->vb_list = vep->ve_next;
2196 2206                                  else {
2197 2207                                          pvep->ve_next = vep->ve_next;
2198 2208                                  }
2199 2209                                  mutex_exit(&bp->vb_lock);
2200 2210                                  rwst_destroy(&vep->ve_lock);
2201 2211                                  kmem_free(vep, sizeof (*vep));
2202 2212                                  return;
2203 2213                          }
2204 2214                          pvep = vep;
2205 2215                  }
2206 2216                  cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2207 2217          }
2208 2218          mutex_exit(&bp->vb_lock);
2209 2219  }
2210 2220  
2211 2221  /*
2212 2222   * vn_vfswlock_wait is used to implement a lock which is logically a writers
2213 2223   * lock protecting the v_vfsmountedhere field.
2214 2224   * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2215 2225   * except that it blocks to acquire the lock VVFSLOCK.
2216 2226   *
2217 2227   * traverse() and routines re-implementing part of traverse (e.g. autofs)
2218 2228   * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2219 2229   * need the non-blocking version of the writers lock i.e. vn_vfswlock
2220 2230   */
2221 2231  int
2222 2232  vn_vfswlock_wait(vnode_t *vp)
2223 2233  {
2224 2234          int retval;
2225 2235          vn_vfslocks_entry_t *vpvfsentry;
2226 2236          ASSERT(vp != NULL);
2227 2237  
2228 2238          vpvfsentry = vn_vfslocks_getlock(vp);
2229 2239          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2230 2240  
2231 2241          if (retval == EINTR) {
2232 2242                  vn_vfslocks_rele(vpvfsentry);
2233 2243                  return (EINTR);
2234 2244          }
2235 2245          return (retval);
2236 2246  }
2237 2247  
2238 2248  int
2239 2249  vn_vfsrlock_wait(vnode_t *vp)
2240 2250  {
2241 2251          int retval;
2242 2252          vn_vfslocks_entry_t *vpvfsentry;
2243 2253          ASSERT(vp != NULL);
2244 2254  
2245 2255          vpvfsentry = vn_vfslocks_getlock(vp);
2246 2256          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2247 2257  
2248 2258          if (retval == EINTR) {
2249 2259                  vn_vfslocks_rele(vpvfsentry);
2250 2260                  return (EINTR);
2251 2261          }
2252 2262  
2253 2263          return (retval);
2254 2264  }
2255 2265  
2256 2266  
2257 2267  /*
2258 2268   * vn_vfswlock is used to implement a lock which is logically a writers lock
2259 2269   * protecting the v_vfsmountedhere field.
2260 2270   */
2261 2271  int
2262 2272  vn_vfswlock(vnode_t *vp)
2263 2273  {
2264 2274          vn_vfslocks_entry_t *vpvfsentry;
2265 2275  
2266 2276          /*
2267 2277           * If vp is NULL then somebody is trying to lock the covered vnode
2268 2278           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2269 2279           * only happen when unmounting /.  Since that operation will fail
2270 2280           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2271 2281           */
2272 2282          if (vp == NULL)
2273 2283                  return (EBUSY);
2274 2284  
2275 2285          vpvfsentry = vn_vfslocks_getlock(vp);
2276 2286  
2277 2287          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2278 2288                  return (0);
2279 2289  
2280 2290          vn_vfslocks_rele(vpvfsentry);
2281 2291          return (EBUSY);
2282 2292  }
2283 2293  
2284 2294  int
2285 2295  vn_vfsrlock(vnode_t *vp)
2286 2296  {
2287 2297          vn_vfslocks_entry_t *vpvfsentry;
2288 2298  
2289 2299          /*
2290 2300           * If vp is NULL then somebody is trying to lock the covered vnode
2291 2301           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2292 2302           * only happen when unmounting /.  Since that operation will fail
2293 2303           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2294 2304           */
2295 2305          if (vp == NULL)
2296 2306                  return (EBUSY);
2297 2307  
2298 2308          vpvfsentry = vn_vfslocks_getlock(vp);
2299 2309  
2300 2310          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2301 2311                  return (0);
2302 2312  
2303 2313          vn_vfslocks_rele(vpvfsentry);
2304 2314          return (EBUSY);
2305 2315  }
2306 2316  
2307 2317  void
2308 2318  vn_vfsunlock(vnode_t *vp)
2309 2319  {
2310 2320          vn_vfslocks_entry_t *vpvfsentry;
2311 2321  
2312 2322          /*
2313 2323           * ve_refcnt needs to be decremented twice.
2314 2324           * 1. To release refernce after a call to vn_vfslocks_getlock()
2315 2325           * 2. To release the reference from the locking routines like
2316 2326           *    vn_vfsrlock/vn_vfswlock etc,.
2317 2327           */
2318 2328          vpvfsentry = vn_vfslocks_getlock(vp);
2319 2329          vn_vfslocks_rele(vpvfsentry);
2320 2330  
2321 2331          rwst_exit(&vpvfsentry->ve_lock);
2322 2332          vn_vfslocks_rele(vpvfsentry);
2323 2333  }
2324 2334  
2325 2335  int
2326 2336  vn_vfswlock_held(vnode_t *vp)
2327 2337  {
2328 2338          int held;
2329 2339          vn_vfslocks_entry_t *vpvfsentry;
2330 2340  
2331 2341          ASSERT(vp != NULL);
2332 2342  
2333 2343          vpvfsentry = vn_vfslocks_getlock(vp);
2334 2344          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2335 2345  
2336 2346          vn_vfslocks_rele(vpvfsentry);
2337 2347          return (held);
2338 2348  }
2339 2349  
2340 2350  
2341 2351  int
2342 2352  vn_make_ops(
2343 2353          const char *name,                       /* Name of file system */
2344 2354          const fs_operation_def_t *templ,        /* Operation specification */
2345 2355          vnodeops_t **actual)                    /* Return the vnodeops */
2346 2356  {
2347 2357          int unused_ops;
2348 2358          int error;
2349 2359  
2350 2360          *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2351 2361  
2352 2362          (*actual)->vnop_name = name;
2353 2363  
2354 2364          error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2355 2365          if (error) {
2356 2366                  kmem_free(*actual, sizeof (vnodeops_t));
2357 2367          }
2358 2368  
2359 2369  #if DEBUG
2360 2370          if (unused_ops != 0)
2361 2371                  cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2362 2372                      "but not used", name, unused_ops);
2363 2373  #endif
2364 2374  
2365 2375          return (error);
2366 2376  }
2367 2377  
2368 2378  /*
2369 2379   * Free the vnodeops created as a result of vn_make_ops()
2370 2380   */
2371 2381  void
2372 2382  vn_freevnodeops(vnodeops_t *vnops)
2373 2383  {
2374 2384          kmem_free(vnops, sizeof (vnodeops_t));
2375 2385  }
2376 2386  
2377 2387  /*
2378 2388   * Vnode cache.
2379 2389   */
2380 2390  
2381 2391  /* ARGSUSED */
2382 2392  static int
2383 2393  vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2384 2394  {
2385 2395          struct vnode *vp;
2386 2396  
2387 2397          vp = buf;
2388 2398  
2389 2399          mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2390 2400          mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2391 2401          cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2392 2402          rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2393 2403          vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2394 2404          vp->v_path = vn_vpath_empty;
2395 2405          vp->v_path_stamp = 0;
2396 2406          vp->v_mpssdata = NULL;
2397 2407          vp->v_vsd = NULL;
2398 2408          vp->v_fopdata = NULL;
2399 2409  
2400 2410          return (0);
2401 2411  }
2402 2412  
2403 2413  /* ARGSUSED */
2404 2414  static void
2405 2415  vn_cache_destructor(void *buf, void *cdrarg)
2406 2416  {
2407 2417          struct vnode *vp;
2408 2418  
2409 2419          vp = buf;
2410 2420  
2411 2421          rw_destroy(&vp->v_nbllock);
2412 2422          cv_destroy(&vp->v_cv);
2413 2423          mutex_destroy(&vp->v_vsd_lock);
2414 2424          mutex_destroy(&vp->v_lock);
2415 2425  }
2416 2426  
2417 2427  void
2418 2428  vn_create_cache(void)
2419 2429  {
2420 2430          /* LINTED */
2421 2431          ASSERT((1 << VNODE_ALIGN_LOG2) ==
2422 2432              P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2423 2433          vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2424 2434              VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2425 2435              NULL, 0);
2426 2436  }
2427 2437  
2428 2438  void
2429 2439  vn_destroy_cache(void)
2430 2440  {
2431 2441          kmem_cache_destroy(vn_cache);
2432 2442  }
2433 2443  
2434 2444  /*
2435 2445   * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2436 2446   * cached by the file system and vnodes remain associated.
2437 2447   */
2438 2448  void
2439 2449  vn_recycle(vnode_t *vp)
2440 2450  {
2441 2451          ASSERT(vp->v_pages == NULL);
2442 2452          VERIFY(vp->v_path != NULL);
2443 2453  
2444 2454          /*
2445 2455           * XXX - This really belongs in vn_reinit(), but we have some issues
2446 2456           * with the counts.  Best to have it here for clean initialization.
2447 2457           */
2448 2458          vp->v_rdcnt = 0;
2449 2459          vp->v_wrcnt = 0;
2450 2460          vp->v_mmap_read = 0;
2451 2461          vp->v_mmap_write = 0;
2452 2462  
2453 2463          /*
2454 2464           * If FEM was in use, make sure everything gets cleaned up
2455 2465           * NOTE: vp->v_femhead is initialized to NULL in the vnode
2456 2466           * constructor.
2457 2467           */
2458 2468          if (vp->v_femhead) {
2459 2469                  /* XXX - There should be a free_femhead() that does all this */
2460 2470                  ASSERT(vp->v_femhead->femh_list == NULL);
2461 2471                  mutex_destroy(&vp->v_femhead->femh_lock);
2462 2472                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2463 2473                  vp->v_femhead = NULL;
2464 2474          }
2465 2475          if (vp->v_path != vn_vpath_empty) {
2466 2476                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2467 2477                  vp->v_path = vn_vpath_empty;
2468 2478          }
2469 2479          vp->v_path_stamp = 0;
2470 2480  
2471 2481          if (vp->v_fopdata != NULL) {
2472 2482                  free_fopdata(vp);
2473 2483          }
2474 2484          vp->v_mpssdata = NULL;
2475 2485          vsd_free(vp);
2476 2486  }
2477 2487  
2478 2488  /*
2479 2489   * Used to reset the vnode fields including those that are directly accessible
2480 2490   * as well as those which require an accessor function.
2481 2491   *
2482 2492   * Does not initialize:
2483 2493   *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2484 2494   *      v_data (since FS-nodes and vnodes point to each other and should
2485 2495   *              be updated simultaneously)
2486 2496   *      v_op (in case someone needs to make a VOP call on this object)
2487 2497   */
2488 2498  void
2489 2499  vn_reinit(vnode_t *vp)
2490 2500  {
2491 2501          vp->v_count = 1;
2492 2502          vp->v_count_dnlc = 0;
2493 2503          vp->v_phantom_count = 0;
2494 2504          vp->v_vfsp = NULL;
2495 2505          vp->v_stream = NULL;
2496 2506          vp->v_vfsmountedhere = NULL;
2497 2507          vp->v_flag = 0;
2498 2508          vp->v_type = VNON;
2499 2509          vp->v_rdev = NODEV;
2500 2510  
2501 2511          vp->v_filocks = NULL;
2502 2512          vp->v_shrlocks = NULL;
2503 2513          vp->v_pages = NULL;
2504 2514  
2505 2515          vp->v_locality = NULL;
2506 2516          vp->v_xattrdir = NULL;
2507 2517  
2508 2518          /*
2509 2519           * In a few specific instances, vn_reinit() is used to initialize
2510 2520           * locally defined vnode_t instances.  Lacking the construction offered
2511 2521           * by vn_alloc(), these vnodes require v_path initialization.
2512 2522           */
2513 2523          if (vp->v_path == NULL) {
2514 2524                  vp->v_path = vn_vpath_empty;
2515 2525          }
2516 2526  
2517 2527          /* Handles v_femhead, v_path, and the r/w/map counts */
2518 2528          vn_recycle(vp);
2519 2529  }
2520 2530  
2521 2531  vnode_t *
2522 2532  vn_alloc(int kmflag)
2523 2533  {
2524 2534          vnode_t *vp;
2525 2535  
2526 2536          vp = kmem_cache_alloc(vn_cache, kmflag);
2527 2537  
2528 2538          if (vp != NULL) {
2529 2539                  vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2530 2540                  vp->v_fopdata = NULL;
2531 2541                  vn_reinit(vp);
2532 2542          }
2533 2543  
2534 2544          return (vp);
2535 2545  }
2536 2546  
2537 2547  void
2538 2548  vn_free(vnode_t *vp)
2539 2549  {
2540 2550          ASSERT(vp->v_shrlocks == NULL);
2541 2551          ASSERT(vp->v_filocks == NULL);
2542 2552  
2543 2553          /*
2544 2554           * Some file systems call vn_free() with v_count of zero,
2545 2555           * some with v_count of 1.  In any case, the value should
2546 2556           * never be anything else.
2547 2557           */
2548 2558          ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2549 2559          ASSERT(vp->v_count_dnlc == 0);
2550 2560          ASSERT0(vp->v_phantom_count);
2551 2561          VERIFY(vp->v_path != NULL);
2552 2562          if (vp->v_path != vn_vpath_empty) {
2553 2563                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2554 2564                  vp->v_path = vn_vpath_empty;
2555 2565          }
2556 2566  
2557 2567          /* If FEM was in use, make sure everything gets cleaned up */
2558 2568          if (vp->v_femhead) {
2559 2569                  /* XXX - There should be a free_femhead() that does all this */
2560 2570                  ASSERT(vp->v_femhead->femh_list == NULL);
2561 2571                  mutex_destroy(&vp->v_femhead->femh_lock);
2562 2572                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2563 2573                  vp->v_femhead = NULL;
2564 2574          }
2565 2575  
2566 2576          if (vp->v_fopdata != NULL) {
2567 2577                  free_fopdata(vp);
2568 2578          }
2569 2579          vp->v_mpssdata = NULL;
2570 2580          vsd_free(vp);
2571 2581          kmem_cache_free(vn_cache, vp);
2572 2582  }
2573 2583  
2574 2584  /*
2575 2585   * vnode status changes, should define better states than 1, 0.
2576 2586   */
2577 2587  void
2578 2588  vn_reclaim(vnode_t *vp)
2579 2589  {
2580 2590          vfs_t   *vfsp = vp->v_vfsp;
2581 2591  
2582 2592          if (vfsp == NULL ||
2583 2593              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2584 2594                  return;
2585 2595          }
2586 2596          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2587 2597  }
2588 2598  
2589 2599  void
2590 2600  vn_idle(vnode_t *vp)
2591 2601  {
2592 2602          vfs_t   *vfsp = vp->v_vfsp;
2593 2603  
2594 2604          if (vfsp == NULL ||
2595 2605              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2596 2606                  return;
2597 2607          }
2598 2608          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2599 2609  }
2600 2610  void
2601 2611  vn_exists(vnode_t *vp)
2602 2612  {
2603 2613          vfs_t   *vfsp = vp->v_vfsp;
2604 2614  
2605 2615          if (vfsp == NULL ||
2606 2616              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2607 2617                  return;
2608 2618          }
2609 2619          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2610 2620  }
2611 2621  
2612 2622  void
2613 2623  vn_invalid(vnode_t *vp)
2614 2624  {
2615 2625          vfs_t   *vfsp = vp->v_vfsp;
2616 2626  
2617 2627          if (vfsp == NULL ||
2618 2628              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2619 2629                  return;
2620 2630          }
2621 2631          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2622 2632  }
2623 2633  
2624 2634  /* Vnode event notification */
2625 2635  
2626 2636  int
2627 2637  vnevent_support(vnode_t *vp, caller_context_t *ct)
2628 2638  {
2629 2639          if (vp == NULL)
2630 2640                  return (EINVAL);
2631 2641  
2632 2642          return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2633 2643  }
2634 2644  
2635 2645  void
2636 2646  vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2637 2647  {
2638 2648          if (vp == NULL || vp->v_femhead == NULL) {
2639 2649                  return;
2640 2650          }
2641 2651          (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2642 2652          (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2643 2653  }
2644 2654  
2645 2655  void
2646 2656  vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2647 2657      caller_context_t *ct)
2648 2658  {
2649 2659          if (vp == NULL || vp->v_femhead == NULL) {
2650 2660                  return;
2651 2661          }
2652 2662          (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2653 2663  }
2654 2664  
2655 2665  void
2656 2666  vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2657 2667      caller_context_t *ct)
2658 2668  {
2659 2669          if (vp == NULL || vp->v_femhead == NULL) {
2660 2670                  return;
2661 2671          }
2662 2672          (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2663 2673  }
2664 2674  
2665 2675  void
2666 2676  vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2667 2677  {
2668 2678          if (vp == NULL || vp->v_femhead == NULL) {
2669 2679                  return;
2670 2680          }
2671 2681          (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2672 2682  }
2673 2683  
2674 2684  void
2675 2685  vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2676 2686  {
2677 2687          if (vp == NULL || vp->v_femhead == NULL) {
2678 2688                  return;
2679 2689          }
2680 2690          (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2681 2691  }
2682 2692  
2683 2693  void
2684 2694  vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2685 2695      caller_context_t *ct)
2686 2696  {
2687 2697          if (vp == NULL || vp->v_femhead == NULL) {
2688 2698                  return;
2689 2699          }
2690 2700          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2691 2701  }
2692 2702  
2693 2703  void
2694 2704  vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2695 2705      caller_context_t *ct)
2696 2706  {
2697 2707          if (vp == NULL || vp->v_femhead == NULL) {
2698 2708                  return;
2699 2709          }
2700 2710          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2701 2711  }
2702 2712  
2703 2713  void
2704 2714  vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2705 2715      caller_context_t *ct)
2706 2716  {
2707 2717          if (vp == NULL || vp->v_femhead == NULL) {
2708 2718                  return;
2709 2719          }
2710 2720          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2711 2721  }
2712 2722  
2713 2723  void
2714 2724  vnevent_create(vnode_t *vp, caller_context_t *ct)
2715 2725  {
2716 2726          if (vp == NULL || vp->v_femhead == NULL) {
2717 2727                  return;
2718 2728          }
2719 2729          (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2720 2730  }
2721 2731  
2722 2732  void
2723 2733  vnevent_link(vnode_t *vp, caller_context_t *ct)
2724 2734  {
2725 2735          if (vp == NULL || vp->v_femhead == NULL) {
2726 2736                  return;
2727 2737          }
2728 2738          (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2729 2739  }
2730 2740  
2731 2741  void
2732 2742  vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2733 2743  {
2734 2744          if (vp == NULL || vp->v_femhead == NULL) {
2735 2745                  return;
2736 2746          }
2737 2747          (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2738 2748  }
2739 2749  
2740 2750  void
2741 2751  vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2742 2752  {
2743 2753          if (vp == NULL || vp->v_femhead == NULL) {
2744 2754                  return;
2745 2755          }
2746 2756          (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2747 2757  }
2748 2758  
2749 2759  void
2750 2760  vnevent_resize(vnode_t *vp, caller_context_t *ct)
2751 2761  {
2752 2762          if (vp == NULL || vp->v_femhead == NULL) {
2753 2763                  return;
2754 2764          }
2755 2765          (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2756 2766  }
2757 2767  
2758 2768  /*
2759 2769   * Vnode accessors.
2760 2770   */
2761 2771  
2762 2772  int
2763 2773  vn_is_readonly(vnode_t *vp)
2764 2774  {
2765 2775          return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2766 2776  }
2767 2777  
2768 2778  int
2769 2779  vn_has_flocks(vnode_t *vp)
2770 2780  {
2771 2781          return (vp->v_filocks != NULL);
2772 2782  }
2773 2783  
2774 2784  int
2775 2785  vn_has_mandatory_locks(vnode_t *vp, int mode)
2776 2786  {
2777 2787          return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2778 2788  }
2779 2789  
2780 2790  int
2781 2791  vn_has_cached_data(vnode_t *vp)
2782 2792  {
2783 2793          return (vp->v_pages != NULL);
2784 2794  }
2785 2795  
2786 2796  /*
2787 2797   * Return 0 if the vnode in question shouldn't be permitted into a zone via
2788 2798   * zone_enter(2).
2789 2799   */
2790 2800  int
2791 2801  vn_can_change_zones(vnode_t *vp)
2792 2802  {
2793 2803          struct vfssw *vswp;
2794 2804          int allow = 1;
2795 2805          vnode_t *rvp;
2796 2806  
2797 2807          if (nfs_global_client_only != 0)
2798 2808                  return (1);
2799 2809  
2800 2810          /*
2801 2811           * We always want to look at the underlying vnode if there is one.
2802 2812           */
2803 2813          if (VOP_REALVP(vp, &rvp, NULL) != 0)
2804 2814                  rvp = vp;
2805 2815          /*
2806 2816           * Some pseudo filesystems (including doorfs) don't actually register
2807 2817           * their vfsops_t, so the following may return NULL; we happily let
2808 2818           * such vnodes switch zones.
2809 2819           */
2810 2820          vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2811 2821          if (vswp != NULL) {
2812 2822                  if (vswp->vsw_flag & VSW_NOTZONESAFE)
2813 2823                          allow = 0;
2814 2824                  vfs_unrefvfssw(vswp);
2815 2825          }
2816 2826          return (allow);
2817 2827  }
2818 2828  
2819 2829  /*
2820 2830   * Return nonzero if the vnode is a mount point, zero if not.
2821 2831   */
2822 2832  int
2823 2833  vn_ismntpt(vnode_t *vp)
2824 2834  {
2825 2835          return (vp->v_vfsmountedhere != NULL);
2826 2836  }
2827 2837  
2828 2838  /* Retrieve the vfs (if any) mounted on this vnode */
2829 2839  vfs_t *
2830 2840  vn_mountedvfs(vnode_t *vp)
2831 2841  {
2832 2842          return (vp->v_vfsmountedhere);
2833 2843  }
2834 2844  
2835 2845  /*
2836 2846   * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2837 2847   */
2838 2848  int
2839 2849  vn_in_dnlc(vnode_t *vp)
2840 2850  {
2841 2851          return (vp->v_count_dnlc > 0);
2842 2852  }
2843 2853  
2844 2854  /*
2845 2855   * vn_has_other_opens() checks whether a particular file is opened by more than
2846 2856   * just the caller and whether the open is for read and/or write.
2847 2857   * This routine is for calling after the caller has already called VOP_OPEN()
2848 2858   * and the caller wishes to know if they are the only one with it open for
2849 2859   * the mode(s) specified.
2850 2860   *
2851 2861   * Vnode counts are only kept on regular files (v_type=VREG).
2852 2862   */
2853 2863  int
2854 2864  vn_has_other_opens(
2855 2865          vnode_t *vp,
2856 2866          v_mode_t mode)
2857 2867  {
2858 2868  
2859 2869          ASSERT(vp != NULL);
2860 2870  
2861 2871          switch (mode) {
2862 2872          case V_WRITE:
2863 2873                  if (vp->v_wrcnt > 1)
2864 2874                          return (V_TRUE);
2865 2875                  break;
2866 2876          case V_RDORWR:
2867 2877                  if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2868 2878                          return (V_TRUE);
2869 2879                  break;
2870 2880          case V_RDANDWR:
2871 2881                  if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2872 2882                          return (V_TRUE);
2873 2883                  break;
2874 2884          case V_READ:
2875 2885                  if (vp->v_rdcnt > 1)
2876 2886                          return (V_TRUE);
2877 2887                  break;
2878 2888          }
2879 2889  
2880 2890          return (V_FALSE);
2881 2891  }
2882 2892  
2883 2893  /*
2884 2894   * vn_is_opened() checks whether a particular file is opened and
2885 2895   * whether the open is for read and/or write.
2886 2896   *
2887 2897   * Vnode counts are only kept on regular files (v_type=VREG).
2888 2898   */
2889 2899  int
2890 2900  vn_is_opened(
2891 2901          vnode_t *vp,
2892 2902          v_mode_t mode)
2893 2903  {
2894 2904  
2895 2905          ASSERT(vp != NULL);
2896 2906  
2897 2907          switch (mode) {
2898 2908          case V_WRITE:
2899 2909                  if (vp->v_wrcnt)
2900 2910                          return (V_TRUE);
2901 2911                  break;
2902 2912          case V_RDANDWR:
2903 2913                  if (vp->v_rdcnt && vp->v_wrcnt)
2904 2914                          return (V_TRUE);
2905 2915                  break;
2906 2916          case V_RDORWR:
2907 2917                  if (vp->v_rdcnt || vp->v_wrcnt)
2908 2918                          return (V_TRUE);
2909 2919                  break;
2910 2920          case V_READ:
2911 2921                  if (vp->v_rdcnt)
2912 2922                          return (V_TRUE);
2913 2923                  break;
2914 2924          }
2915 2925  
2916 2926          return (V_FALSE);
2917 2927  }
2918 2928  
2919 2929  /*
2920 2930   * vn_is_mapped() checks whether a particular file is mapped and whether
2921 2931   * the file is mapped read and/or write.
2922 2932   */
2923 2933  int
2924 2934  vn_is_mapped(
2925 2935          vnode_t *vp,
2926 2936          v_mode_t mode)
2927 2937  {
2928 2938  
2929 2939          ASSERT(vp != NULL);
2930 2940  
2931 2941  #if !defined(_LP64)
2932 2942          switch (mode) {
2933 2943          /*
2934 2944           * The atomic_add_64_nv functions force atomicity in the
2935 2945           * case of 32 bit architectures. Otherwise the 64 bit values
2936 2946           * require two fetches. The value of the fields may be
2937 2947           * (potentially) changed between the first fetch and the
2938 2948           * second
2939 2949           */
2940 2950          case V_WRITE:
2941 2951                  if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2942 2952                          return (V_TRUE);
2943 2953                  break;
2944 2954          case V_RDANDWR:
2945 2955                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2946 2956                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2947 2957                          return (V_TRUE);
2948 2958                  break;
2949 2959          case V_RDORWR:
2950 2960                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2951 2961                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2952 2962                          return (V_TRUE);
2953 2963                  break;
2954 2964          case V_READ:
2955 2965                  if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2956 2966                          return (V_TRUE);
2957 2967                  break;
2958 2968          }
2959 2969  #else
2960 2970          switch (mode) {
2961 2971          case V_WRITE:
2962 2972                  if (vp->v_mmap_write)
2963 2973                          return (V_TRUE);
2964 2974                  break;
2965 2975          case V_RDANDWR:
2966 2976                  if (vp->v_mmap_read && vp->v_mmap_write)
2967 2977                          return (V_TRUE);
2968 2978                  break;
2969 2979          case V_RDORWR:
2970 2980                  if (vp->v_mmap_read || vp->v_mmap_write)
2971 2981                          return (V_TRUE);
2972 2982                  break;
2973 2983          case V_READ:
2974 2984                  if (vp->v_mmap_read)
2975 2985                          return (V_TRUE);
2976 2986                  break;
2977 2987          }
2978 2988  #endif
2979 2989  
2980 2990          return (V_FALSE);
2981 2991  }
2982 2992  
2983 2993  /*
2984 2994   * Set the operations vector for a vnode.
2985 2995   *
2986 2996   * FEM ensures that the v_femhead pointer is filled in before the
2987 2997   * v_op pointer is changed.  This means that if the v_femhead pointer
2988 2998   * is NULL, and the v_op field hasn't changed since before which checked
2989 2999   * the v_femhead pointer; then our update is ok - we are not racing with
2990 3000   * FEM.
2991 3001   */
2992 3002  void
2993 3003  vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2994 3004  {
2995 3005          vnodeops_t      *op;
2996 3006  
2997 3007          ASSERT(vp != NULL);
2998 3008          ASSERT(vnodeops != NULL);
2999 3009  
3000 3010          op = vp->v_op;
3001 3011          membar_consumer();
3002 3012          /*
3003 3013           * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
3004 3014           * the compare-and-swap on vp->v_op.  If either fails, then FEM is
3005 3015           * in effect on the vnode and we need to have FEM deal with it.
3006 3016           */
3007 3017          if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
3008 3018              op) {
3009 3019                  fem_setvnops(vp, vnodeops);
3010 3020          }
3011 3021  }
3012 3022  
3013 3023  /*
3014 3024   * Retrieve the operations vector for a vnode
3015 3025   * As with vn_setops(above); make sure we aren't racing with FEM.
3016 3026   * FEM sets the v_op to a special, internal, vnodeops that wouldn't
3017 3027   * make sense to the callers of this routine.
3018 3028   */
3019 3029  vnodeops_t *
3020 3030  vn_getops(vnode_t *vp)
3021 3031  {
3022 3032          vnodeops_t      *op;
3023 3033  
3024 3034          ASSERT(vp != NULL);
3025 3035  
3026 3036          op = vp->v_op;
3027 3037          membar_consumer();
3028 3038          if (vp->v_femhead == NULL && op == vp->v_op) {
3029 3039                  return (op);
3030 3040          } else {
3031 3041                  return (fem_getvnops(vp));
3032 3042          }
3033 3043  }
3034 3044  
3035 3045  /*
3036 3046   * Returns non-zero (1) if the vnodeops matches that of the vnode.
3037 3047   * Returns zero (0) if not.
3038 3048   */
3039 3049  int
3040 3050  vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
3041 3051  {
3042 3052          return (vn_getops(vp) == vnodeops);
3043 3053  }
3044 3054  
3045 3055  /*
3046 3056   * Returns non-zero (1) if the specified operation matches the
3047 3057   * corresponding operation for that the vnode.
3048 3058   * Returns zero (0) if not.
3049 3059   */
3050 3060  
3051 3061  #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
3052 3062  
3053 3063  int
3054 3064  vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
3055 3065  {
3056 3066          const fs_operation_trans_def_t *otdp;
3057 3067          fs_generic_func_p *loc = NULL;
3058 3068          vnodeops_t      *vop = vn_getops(vp);
3059 3069  
3060 3070          ASSERT(vopname != NULL);
3061 3071  
3062 3072          for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3063 3073                  if (MATCHNAME(otdp->name, vopname)) {
3064 3074                          loc = (fs_generic_func_p *)
3065 3075                              ((char *)(vop) + otdp->offset);
3066 3076                          break;
3067 3077                  }
3068 3078          }
3069 3079  
3070 3080          return ((loc != NULL) && (*loc == funcp));
3071 3081  }
3072 3082  
3073 3083  /*
3074 3084   * fs_new_caller_id() needs to return a unique ID on a given local system.
3075 3085   * The IDs do not need to survive across reboots.  These are primarily
3076 3086   * used so that (FEM) monitors can detect particular callers (such as
3077 3087   * the NFS server) to a given vnode/vfs operation.
3078 3088   */
3079 3089  u_longlong_t
3080 3090  fs_new_caller_id()
3081 3091  {
3082 3092          static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3083 3093  
3084 3094          return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3085 3095  }
3086 3096  
3087 3097  /*
3088 3098   * The value stored in v_path is relative to rootdir, located in the global
3089 3099   * zone.  Zones or chroot environments which reside deeper inside the VFS
3090 3100   * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3091 3101   * what lies below their perceived root.  In order to keep v_path usable for
3092 3102   * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3093 3103   *
3094 3104   * An upper bound of max_vnode_path is placed upon v_path allocations to
3095 3105   * prevent the system from going too wild at the behest of pathological
3096 3106   * behavior from the operator.
3097 3107   */
3098 3108  size_t max_vnode_path = 4 * MAXPATHLEN;
3099 3109  
3100 3110  
3101 3111  void
3102 3112  vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3103 3113  {
3104 3114          char *buf;
3105 3115  
3106 3116          mutex_enter(&vp->v_lock);
3107 3117          /*
3108 3118           * If the snapshot of v_path_stamp passed in via compare_stamp does not
3109 3119           * match the present value on the vnode, it indicates that subsequent
3110 3120           * changes have occurred.  The v_path value is not cleared in this case
3111 3121           * since the new value may be valid.
3112 3122           */
3113 3123          if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3114 3124                  mutex_exit(&vp->v_lock);
3115 3125                  return;
3116 3126          }
3117 3127          buf = vp->v_path;
3118 3128          vp->v_path = vn_vpath_empty;
3119 3129          vp->v_path_stamp = 0;
3120 3130          mutex_exit(&vp->v_lock);
3121 3131          if (buf != vn_vpath_empty) {
3122 3132                  kmem_free(buf, strlen(buf) + 1);
3123 3133          }
3124 3134  }
3125 3135  
3126 3136  static void
3127 3137  vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3128 3138      boolean_t is_rename)
3129 3139  {
3130 3140          char *buf, *oldbuf;
3131 3141          hrtime_t pstamp;
3132 3142          size_t baselen, buflen = 0;
3133 3143  
3134 3144          /* Handle the vn_setpath_str case. */
3135 3145          if (pvp == NULL) {
3136 3146                  if (len + 1 > max_vnode_path) {
3137 3147                          DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3138 3148                              vnode_t *, vp, char *, name, size_t, len + 1);
3139 3149                          return;
3140 3150                  }
3141 3151                  buf = kmem_alloc(len + 1, KM_SLEEP);
3142 3152                  bcopy(name, buf, len);
3143 3153                  buf[len] = '\0';
3144 3154  
3145 3155                  mutex_enter(&vp->v_lock);
3146 3156                  oldbuf = vp->v_path;
3147 3157                  vp->v_path = buf;
3148 3158                  vp->v_path_stamp = gethrtime();
3149 3159                  mutex_exit(&vp->v_lock);
3150 3160                  if (oldbuf != vn_vpath_empty) {
3151 3161                          kmem_free(oldbuf, strlen(oldbuf) + 1);
3152 3162                  }
3153 3163                  return;
3154 3164          }
3155 3165  
3156 3166          /* Take snapshot of parent dir */
3157 3167          mutex_enter(&pvp->v_lock);
3158 3168  
3159 3169          if ((pvp->v_flag & VTRAVERSE) != 0) {
3160 3170                  /*
3161 3171                   * When the parent vnode has VTRAVERSE set in its flags, normal
3162 3172                   * assumptions about v_path calculation no longer apply.  The
3163 3173                   * primary situation where this occurs is via the VFS tricks
3164 3174                   * which procfs plays in order to allow /proc/PID/(root|cwd) to
3165 3175                   * yield meaningful results.
3166 3176                   *
3167 3177                   * When this flag is set, v_path on the child must not be
3168 3178                   * updated since the calculated value is likely to be
3169 3179                   * incorrect, given the current context.
3170 3180                   */
3171 3181                  mutex_exit(&pvp->v_lock);
3172 3182                  return;
3173 3183          }
3174 3184  
3175 3185  retrybuf:
3176 3186          if (pvp->v_path == vn_vpath_empty) {
3177 3187                  /*
3178 3188                   * Without v_path from the parent directory, generating a child
3179 3189                   * path from the name is impossible.
3180 3190                   */
3181 3191                  if (len > 0) {
3182 3192                          pstamp = pvp->v_path_stamp;
3183 3193                          mutex_exit(&pvp->v_lock);
3184 3194                          vn_clearpath(vp, pstamp);
3185 3195                          return;
3186 3196                  }
3187 3197  
3188 3198                  /*
3189 3199                   * The only feasible case here is where a NUL lookup is being
3190 3200                   * performed on rootdir prior to its v_path being populated.
3191 3201                   */
3192 3202                  ASSERT(pvp->v_path_stamp == 0);
3193 3203                  baselen = 0;
3194 3204                  pstamp = 0;
3195 3205          } else {
3196 3206                  pstamp = pvp->v_path_stamp;
3197 3207                  baselen = strlen(pvp->v_path);
3198 3208                  /* ignore a trailing slash if present */
3199 3209                  if (pvp->v_path[baselen - 1] == '/') {
3200 3210                          /* This should only the be case for rootdir */
3201 3211                          ASSERT(baselen == 1 && pvp == rootdir);
3202 3212                          baselen--;
3203 3213                  }
3204 3214          }
3205 3215          mutex_exit(&pvp->v_lock);
3206 3216  
3207 3217          if (buflen != 0) {
3208 3218                  /* Free the existing (mis-sized) buffer in case of retry */
3209 3219                  kmem_free(buf, buflen);
3210 3220          }
3211 3221          /* base, '/', name and trailing NUL */
3212 3222          buflen = baselen + len + 2;
3213 3223          if (buflen > max_vnode_path) {
3214 3224                  DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3215 3225                      vnode_t *, vp, char *, name, size_t, buflen);
3216 3226                  return;
3217 3227          }
3218 3228          buf = kmem_alloc(buflen, KM_SLEEP);
3219 3229  
3220 3230          mutex_enter(&pvp->v_lock);
3221 3231          if (pvp->v_path_stamp != pstamp) {
3222 3232                  size_t vlen;
3223 3233  
3224 3234                  /*
3225 3235                   * Since v_path_stamp changed on the parent, it is likely that
3226 3236                   * v_path has been altered as well.  If the length does not
3227 3237                   * exactly match what was previously measured, the buffer
3228 3238                   * allocation must be repeated for proper sizing.
3229 3239                   */
3230 3240                  if (pvp->v_path == vn_vpath_empty) {
3231 3241                          /* Give up if parent lack v_path */
3232 3242                          mutex_exit(&pvp->v_lock);
3233 3243                          kmem_free(buf, buflen);
3234 3244                          return;
3235 3245                  }
3236 3246                  vlen = strlen(pvp->v_path);
3237 3247                  if (pvp->v_path[vlen - 1] == '/') {
3238 3248                          vlen--;
3239 3249                  }
3240 3250                  if (vlen != baselen) {
3241 3251                          goto retrybuf;
3242 3252                  }
3243 3253          }
3244 3254          bcopy(pvp->v_path, buf, baselen);
3245 3255          mutex_exit(&pvp->v_lock);
3246 3256  
3247 3257          buf[baselen] = '/';
3248 3258          baselen++;
3249 3259          bcopy(name, &buf[baselen], len + 1);
3250 3260  
3251 3261          mutex_enter(&vp->v_lock);
3252 3262          if (vp->v_path_stamp == 0) {
3253 3263                  /* never-visited vnode can inherit stamp from parent */
3254 3264                  ASSERT(vp->v_path == vn_vpath_empty);
3255 3265                  vp->v_path_stamp = pstamp;
3256 3266                  vp->v_path = buf;
3257 3267                  mutex_exit(&vp->v_lock);
3258 3268          } else if (vp->v_path_stamp < pstamp || is_rename) {
3259 3269                  /*
3260 3270                   * Install the updated path and stamp, ensuring that the v_path
3261 3271                   * pointer is valid at all times for dtrace.
3262 3272                   */
3263 3273                  oldbuf = vp->v_path;
3264 3274                  vp->v_path = buf;
3265 3275                  vp->v_path_stamp = gethrtime();
3266 3276                  mutex_exit(&vp->v_lock);
3267 3277                  kmem_free(oldbuf, strlen(oldbuf) + 1);
3268 3278          } else {
3269 3279                  /*
3270 3280                   * If the timestamp matches or is greater, it means another
3271 3281                   * thread performed the update first while locks were dropped
3272 3282                   * here to make the allocation.  We defer to the newer value.
3273 3283                   */
3274 3284                  mutex_exit(&vp->v_lock);
3275 3285                  kmem_free(buf, buflen);
3276 3286          }
3277 3287          ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3278 3288  }
3279 3289  
3280 3290  void
3281 3291  vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3282 3292  {
3283 3293          size_t len;
3284 3294  
3285 3295          /*
3286 3296           * If the parent is older or empty, there's nothing further to do.
3287 3297           */
3288 3298          if (pvp->v_path == vn_vpath_empty ||
3289 3299              pvp->v_path_stamp <= vp->v_path_stamp) {
3290 3300                  return;
3291 3301          }
3292 3302  
3293 3303          /*
3294 3304           * Given the lack of appropriate context, meaningful updates to v_path
3295 3305           * cannot be made for during lookups for the '.' or '..' entries.
3296 3306           */
3297 3307          len = strlen(name);
3298 3308          if (len == 0 || (len == 1 && name[0] == '.') ||
3299 3309              (len == 2 && name[0] == '.' && name[1] == '.')) {
3300 3310                  return;
3301 3311          }
3302 3312  
3303 3313          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3304 3314  }
3305 3315  
3306 3316  /*
3307 3317   * Given a starting vnode and a path, updates the path in the target vnode in
3308 3318   * a safe manner.  If the vnode already has path information embedded, then the
3309 3319   * cached path is left untouched.
3310 3320   */
3311 3321  /* ARGSUSED */
3312 3322  void
3313 3323  vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3314 3324      size_t len)
3315 3325  {
3316 3326          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3317 3327  }
3318 3328  
3319 3329  /*
3320 3330   * Sets the path to the vnode to be the given string, regardless of current
3321 3331   * context.  The string must be a complete path from rootdir.  This is only used
3322 3332   * by fsop_root() for setting the path based on the mountpoint.
3323 3333   */
3324 3334  void
3325 3335  vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3326 3336  {
3327 3337          vn_setpath_common(NULL, vp, str, len, B_FALSE);
3328 3338  }
3329 3339  
3330 3340  /*
3331 3341   * Called from within filesystem's vop_rename() to handle renames once the
3332 3342   * target vnode is available.
3333 3343   */
3334 3344  void
3335 3345  vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3336 3346  {
3337 3347          vn_setpath_common(pvp, vp, name, len, B_TRUE);
3338 3348  }
3339 3349  
3340 3350  /*
3341 3351   * Similar to vn_setpath_str(), this function sets the path of the destination
3342 3352   * vnode to the be the same as the source vnode.
3343 3353   */
3344 3354  void
3345 3355  vn_copypath(struct vnode *src, struct vnode *dst)
3346 3356  {
3347 3357          char *buf;
3348 3358          hrtime_t stamp;
3349 3359          size_t buflen;
3350 3360  
3351 3361          mutex_enter(&src->v_lock);
3352 3362          if (src->v_path == vn_vpath_empty) {
3353 3363                  mutex_exit(&src->v_lock);
3354 3364                  return;
3355 3365          }
3356 3366          buflen = strlen(src->v_path) + 1;
3357 3367          mutex_exit(&src->v_lock);
3358 3368  
3359 3369          buf = kmem_alloc(buflen, KM_SLEEP);
3360 3370  
3361 3371          mutex_enter(&src->v_lock);
3362 3372          if (src->v_path == vn_vpath_empty ||
3363 3373              strlen(src->v_path) + 1 != buflen) {
3364 3374                  mutex_exit(&src->v_lock);
3365 3375                  kmem_free(buf, buflen);
3366 3376                  return;
3367 3377          }
3368 3378          bcopy(src->v_path, buf, buflen);
3369 3379          stamp = src->v_path_stamp;
3370 3380          mutex_exit(&src->v_lock);
3371 3381  
3372 3382          mutex_enter(&dst->v_lock);
3373 3383          if (dst->v_path != vn_vpath_empty) {
3374 3384                  mutex_exit(&dst->v_lock);
3375 3385                  kmem_free(buf, buflen);
3376 3386                  return;
3377 3387          }
3378 3388          dst->v_path = buf;
3379 3389          dst->v_path_stamp = stamp;
3380 3390          mutex_exit(&dst->v_lock);
3381 3391  }
3382 3392  
3383 3393  
3384 3394  /*
3385 3395   * XXX Private interface for segvn routines that handle vnode
3386 3396   * large page segments.
3387 3397   *
3388 3398   * return 1 if vp's file system VOP_PAGEIO() implementation
3389 3399   * can be safely used instead of VOP_GETPAGE() for handling
3390 3400   * pagefaults against regular non swap files. VOP_PAGEIO()
3391 3401   * interface is considered safe here if its implementation
3392 3402   * is very close to VOP_GETPAGE() implementation.
3393 3403   * e.g. It zero's out the part of the page beyond EOF. Doesn't
3394 3404   * panic if there're file holes but instead returns an error.
3395 3405   * Doesn't assume file won't be changed by user writes, etc.
3396 3406   *
3397 3407   * return 0 otherwise.
3398 3408   *
3399 3409   * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3400 3410   */
3401 3411  int
3402 3412  vn_vmpss_usepageio(vnode_t *vp)
3403 3413  {
3404 3414          vfs_t   *vfsp = vp->v_vfsp;
3405 3415          char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3406 3416          char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3407 3417          char **fsok = pageio_ok_fss;
3408 3418  
3409 3419          if (fsname == NULL) {
3410 3420                  return (0);
3411 3421          }
3412 3422  
3413 3423          for (; *fsok; fsok++) {
3414 3424                  if (strcmp(*fsok, fsname) == 0) {
3415 3425                          return (1);
3416 3426                  }
3417 3427          }
3418 3428          return (0);
3419 3429  }
3420 3430  
3421 3431  /* VOP_XXX() macros call the corresponding fop_xxx() function */
3422 3432  
3423 3433  int
3424 3434  fop_open(
3425 3435          vnode_t **vpp,
3426 3436          int mode,
3427 3437          cred_t *cr,
3428 3438          caller_context_t *ct)
3429 3439  {
3430 3440          int ret;
3431 3441          vnode_t *vp = *vpp;
3432 3442  
3433 3443          VN_HOLD(vp);
3434 3444          /*
3435 3445           * Adding to the vnode counts before calling open
3436 3446           * avoids the need for a mutex. It circumvents a race
3437 3447           * condition where a query made on the vnode counts results in a
3438 3448           * false negative. The inquirer goes away believing the file is
3439 3449           * not open when there is an open on the file already under way.
3440 3450           *
3441 3451           * The counts are meant to prevent NFS from granting a delegation
3442 3452           * when it would be dangerous to do so.
3443 3453           *
3444 3454           * The vnode counts are only kept on regular files
3445 3455           */
3446 3456          if ((*vpp)->v_type == VREG) {
3447 3457                  if (mode & FREAD)
3448 3458                          atomic_inc_32(&(*vpp)->v_rdcnt);
3449 3459                  if (mode & FWRITE)
3450 3460                          atomic_inc_32(&(*vpp)->v_wrcnt);
3451 3461          }
3452 3462  
3453 3463          VOPXID_MAP_CR(vp, cr);
3454 3464  
3455 3465          ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3456 3466  
3457 3467          if (ret) {
3458 3468                  /*
3459 3469                   * Use the saved vp just in case the vnode ptr got trashed
3460 3470                   * by the error.
3461 3471                   */
3462 3472                  VOPSTATS_UPDATE(vp, open);
3463 3473                  if ((vp->v_type == VREG) && (mode & FREAD))
3464 3474                          atomic_dec_32(&vp->v_rdcnt);
3465 3475                  if ((vp->v_type == VREG) && (mode & FWRITE))
3466 3476                          atomic_dec_32(&vp->v_wrcnt);
3467 3477          } else {
3468 3478                  /*
3469 3479                   * Some filesystems will return a different vnode,
3470 3480                   * but the same path was still used to open it.
3471 3481                   * So if we do change the vnode and need to
3472 3482                   * copy over the path, do so here, rather than special
3473 3483                   * casing each filesystem. Adjust the vnode counts to
3474 3484                   * reflect the vnode switch.
3475 3485                   */
3476 3486                  VOPSTATS_UPDATE(*vpp, open);
3477 3487                  if (*vpp != vp) {
3478 3488                          vn_copypath(vp, *vpp);
3479 3489                          if (((*vpp)->v_type == VREG) && (mode & FREAD))
3480 3490                                  atomic_inc_32(&(*vpp)->v_rdcnt);
3481 3491                          if ((vp->v_type == VREG) && (mode & FREAD))
3482 3492                                  atomic_dec_32(&vp->v_rdcnt);
3483 3493                          if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3484 3494                                  atomic_inc_32(&(*vpp)->v_wrcnt);
3485 3495                          if ((vp->v_type == VREG) && (mode & FWRITE))
3486 3496                                  atomic_dec_32(&vp->v_wrcnt);
3487 3497                  }
3488 3498          }
3489 3499          VN_RELE(vp);
3490 3500          return (ret);
3491 3501  }
3492 3502  
3493 3503  int
3494 3504  fop_close(
3495 3505          vnode_t *vp,
3496 3506          int flag,
3497 3507          int count,
3498 3508          offset_t offset,
3499 3509          cred_t *cr,
3500 3510          caller_context_t *ct)
3501 3511  {
3502 3512          int err;
3503 3513  
3504 3514          VOPXID_MAP_CR(vp, cr);
3505 3515  
3506 3516          err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3507 3517          VOPSTATS_UPDATE(vp, close);
3508 3518          /*
3509 3519           * Check passed in count to handle possible dups. Vnode counts are only
3510 3520           * kept on regular files
3511 3521           */
3512 3522          if ((vp->v_type == VREG) && (count == 1))  {
3513 3523                  if (flag & FREAD) {
3514 3524                          ASSERT(vp->v_rdcnt > 0);
3515 3525                          atomic_dec_32(&vp->v_rdcnt);
3516 3526                  }
3517 3527                  if (flag & FWRITE) {
3518 3528                          ASSERT(vp->v_wrcnt > 0);
3519 3529                          atomic_dec_32(&vp->v_wrcnt);
3520 3530                  }
3521 3531          }
3522 3532          return (err);
3523 3533  }
3524 3534  
3525 3535  int
3526 3536  fop_read(
3527 3537          vnode_t *vp,
3528 3538          uio_t *uiop,
3529 3539          int ioflag,
3530 3540          cred_t *cr,
3531 3541          caller_context_t *ct)
3532 3542  {
3533 3543          ssize_t resid_start = uiop->uio_resid;
3534 3544          zone_t  *zonep = curzone;
3535 3545          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3536 3546  
3537 3547          hrtime_t start = 0, lat;
3538 3548          ssize_t len;
3539 3549          int err;
3540 3550  
3541 3551          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3542 3552              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3543 3553                  start = gethrtime();
3544 3554  
3545 3555                  mutex_enter(&zonep->zone_vfs_lock);
3546 3556                  kstat_runq_enter(&zonep->zone_vfs_rwstats);
3547 3557                  mutex_exit(&zonep->zone_vfs_lock);
3548 3558          }
3549 3559  
3550 3560          VOPXID_MAP_CR(vp, cr);
3551 3561  
3552 3562          err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3553 3563          len = resid_start - uiop->uio_resid;
3554 3564  
3555 3565          VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3556 3566  
3557 3567          if (start != 0) {
3558 3568                  mutex_enter(&zonep->zone_vfs_lock);
3559 3569                  zonep->zone_vfs_rwstats.reads++;
3560 3570                  zonep->zone_vfs_rwstats.nread += len;
3561 3571                  kstat_runq_exit(&zonep->zone_vfs_rwstats);
3562 3572                  mutex_exit(&zonep->zone_vfs_lock);
3563 3573  
3564 3574                  lat = gethrtime() - start;
3565 3575  
3566 3576                  if (lat >= VOP_LATENCY_10MS) {
3567 3577                          if (lat < VOP_LATENCY_100MS)
3568 3578                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3569 3579                          else if (lat < VOP_LATENCY_1S) {
3570 3580                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3571 3581                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3572 3582                          } else if (lat < VOP_LATENCY_10S) {
3573 3583                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3574 3584                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3575 3585                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3576 3586                          } else {
3577 3587                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3578 3588                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3579 3589                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3580 3590                                  atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3581 3591                          }
3582 3592                  }
3583 3593          }
3584 3594  
3585 3595          return (err);
3586 3596  }
3587 3597  
3588 3598  int
3589 3599  fop_write(
3590 3600          vnode_t *vp,
3591 3601          uio_t *uiop,
3592 3602          int ioflag,
3593 3603          cred_t *cr,
3594 3604          caller_context_t *ct)
3595 3605  {
3596 3606          ssize_t resid_start = uiop->uio_resid;
3597 3607          zone_t  *zonep = curzone;
3598 3608          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3599 3609  
3600 3610          hrtime_t start = 0, lat;
3601 3611          ssize_t len;
3602 3612          int     err;
3603 3613  
3604 3614          /*
3605 3615           * For the purposes of VFS kstat consumers, the "waitq" calculation is
3606 3616           * repurposed as the active queue for VFS write operations.  There's no
3607 3617           * actual wait queue for VFS operations.
3608 3618           */
3609 3619          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3610 3620              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3611 3621                  start = gethrtime();
3612 3622  
3613 3623                  mutex_enter(&zonep->zone_vfs_lock);
3614 3624                  kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3615 3625                  mutex_exit(&zonep->zone_vfs_lock);
3616 3626          }
3617 3627  
3618 3628          VOPXID_MAP_CR(vp, cr);
3619 3629  
3620 3630          err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3621 3631          len = resid_start - uiop->uio_resid;
3622 3632  
3623 3633          VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3624 3634  
3625 3635          if (start != 0) {
3626 3636                  mutex_enter(&zonep->zone_vfs_lock);
3627 3637                  zonep->zone_vfs_rwstats.writes++;
3628 3638                  zonep->zone_vfs_rwstats.nwritten += len;
3629 3639                  kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3630 3640                  mutex_exit(&zonep->zone_vfs_lock);
3631 3641  
3632 3642                  lat = gethrtime() - start;
3633 3643  
3634 3644                  if (lat >= VOP_LATENCY_10MS) {
3635 3645                          if (lat < VOP_LATENCY_100MS)
3636 3646                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3637 3647                          else if (lat < VOP_LATENCY_1S) {
3638 3648                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3639 3649                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3640 3650                          } else if (lat < VOP_LATENCY_10S) {
3641 3651                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3642 3652                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3643 3653                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3644 3654                          } else {
3645 3655                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3646 3656                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3647 3657                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3648 3658                                  atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3649 3659                          }
3650 3660                  }
3651 3661          }
3652 3662  
3653 3663          return (err);
3654 3664  }
3655 3665  
3656 3666  int
3657 3667  fop_ioctl(
3658 3668          vnode_t *vp,
3659 3669          int cmd,
3660 3670          intptr_t arg,
3661 3671          int flag,
3662 3672          cred_t *cr,
3663 3673          int *rvalp,
3664 3674          caller_context_t *ct)
3665 3675  {
3666 3676          int     err;
3667 3677  
3668 3678          VOPXID_MAP_CR(vp, cr);
3669 3679  
3670 3680          err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3671 3681          VOPSTATS_UPDATE(vp, ioctl);
3672 3682          return (err);
3673 3683  }
3674 3684  
3675 3685  int
3676 3686  fop_setfl(
3677 3687          vnode_t *vp,
3678 3688          int oflags,
3679 3689          int nflags,
3680 3690          cred_t *cr,
3681 3691          caller_context_t *ct)
3682 3692  {
3683 3693          int     err;
3684 3694  
3685 3695          VOPXID_MAP_CR(vp, cr);
3686 3696  
3687 3697          err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3688 3698          VOPSTATS_UPDATE(vp, setfl);
3689 3699          return (err);
3690 3700  }
3691 3701  
3692 3702  int
3693 3703  fop_getattr(
3694 3704          vnode_t *vp,
3695 3705          vattr_t *vap,
3696 3706          int flags,
3697 3707          cred_t *cr,
3698 3708          caller_context_t *ct)
3699 3709  {
3700 3710          int     err;
3701 3711  
3702 3712          VOPXID_MAP_CR(vp, cr);
3703 3713  
3704 3714          /*
3705 3715           * If this file system doesn't understand the xvattr extensions
3706 3716           * then turn off the xvattr bit.
3707 3717           */
3708 3718          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3709 3719                  vap->va_mask &= ~AT_XVATTR;
3710 3720          }
3711 3721  
3712 3722          /*
3713 3723           * We're only allowed to skip the ACL check iff we used a 32 bit
3714 3724           * ACE mask with VOP_ACCESS() to determine permissions.
3715 3725           */
3716 3726          if ((flags & ATTR_NOACLCHECK) &&
3717 3727              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3718 3728                  return (EINVAL);
3719 3729          }
3720 3730          err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3721 3731          VOPSTATS_UPDATE(vp, getattr);
3722 3732          return (err);
3723 3733  }
3724 3734  
3725 3735  int
3726 3736  fop_setattr(
3727 3737          vnode_t *vp,
3728 3738          vattr_t *vap,
3729 3739          int flags,
3730 3740          cred_t *cr,
3731 3741          caller_context_t *ct)
3732 3742  {
3733 3743          int     err;
3734 3744  
3735 3745          VOPXID_MAP_CR(vp, cr);
3736 3746  
3737 3747          /*
3738 3748           * If this file system doesn't understand the xvattr extensions
3739 3749           * then turn off the xvattr bit.
3740 3750           */
3741 3751          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3742 3752                  vap->va_mask &= ~AT_XVATTR;
3743 3753          }
3744 3754  
3745 3755          /*
3746 3756           * We're only allowed to skip the ACL check iff we used a 32 bit
3747 3757           * ACE mask with VOP_ACCESS() to determine permissions.
3748 3758           */
3749 3759          if ((flags & ATTR_NOACLCHECK) &&
3750 3760              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3751 3761                  return (EINVAL);
3752 3762          }
3753 3763          err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3754 3764          VOPSTATS_UPDATE(vp, setattr);
3755 3765          return (err);
3756 3766  }
3757 3767  
3758 3768  int
3759 3769  fop_access(
3760 3770          vnode_t *vp,
3761 3771          int mode,
3762 3772          int flags,
3763 3773          cred_t *cr,
3764 3774          caller_context_t *ct)
3765 3775  {
3766 3776          int     err;
3767 3777  
3768 3778          if ((flags & V_ACE_MASK) &&
3769 3779              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3770 3780                  return (EINVAL);
3771 3781          }
3772 3782  
3773 3783          VOPXID_MAP_CR(vp, cr);
3774 3784  
3775 3785          err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3776 3786          VOPSTATS_UPDATE(vp, access);
3777 3787          return (err);
3778 3788  }
3779 3789  
3780 3790  int
3781 3791  fop_lookup(
3782 3792          vnode_t *dvp,
3783 3793          char *nm,
3784 3794          vnode_t **vpp,
3785 3795          pathname_t *pnp,
3786 3796          int flags,
3787 3797          vnode_t *rdir,
3788 3798          cred_t *cr,
3789 3799          caller_context_t *ct,
3790 3800          int *deflags,           /* Returned per-dirent flags */
3791 3801          pathname_t *ppnp)       /* Returned case-preserved name in directory */
3792 3802  {
3793 3803          int ret;
3794 3804  
3795 3805          /*
3796 3806           * If this file system doesn't support case-insensitive access
3797 3807           * and said access is requested, fail quickly.  It is required
3798 3808           * that if the vfs supports case-insensitive lookup, it also
3799 3809           * supports extended dirent flags.
3800 3810           */
3801 3811          if (flags & FIGNORECASE &&
3802 3812              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3803 3813              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3804 3814                  return (EINVAL);
3805 3815  
3806 3816          VOPXID_MAP_CR(dvp, cr);
3807 3817  
3808 3818          if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3809 3819                  ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3810 3820          } else {
3811 3821                  ret = (*(dvp)->v_op->vop_lookup)
3812 3822                      (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3813 3823          }
3814 3824          if (ret == 0 && *vpp) {
3815 3825                  VOPSTATS_UPDATE(*vpp, lookup);
3816 3826                  vn_updatepath(dvp, *vpp, nm);
3817 3827          }
3818 3828  
3819 3829          return (ret);
3820 3830  }
3821 3831  
3822 3832  int
3823 3833  fop_create(
3824 3834          vnode_t *dvp,
3825 3835          char *name,
3826 3836          vattr_t *vap,
3827 3837          vcexcl_t excl,
3828 3838          int mode,
3829 3839          vnode_t **vpp,
3830 3840          cred_t *cr,
3831 3841          int flags,
3832 3842          caller_context_t *ct,
3833 3843          vsecattr_t *vsecp)      /* ACL to set during create */
3834 3844  {
3835 3845          int ret;
3836 3846  
3837 3847          if (vsecp != NULL &&
3838 3848              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3839 3849                  return (EINVAL);
3840 3850          }
3841 3851          /*
3842 3852           * If this file system doesn't support case-insensitive access
3843 3853           * and said access is requested, fail quickly.
3844 3854           */
3845 3855          if (flags & FIGNORECASE &&
3846 3856              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3847 3857              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3848 3858                  return (EINVAL);
3849 3859  
3850 3860          VOPXID_MAP_CR(dvp, cr);
3851 3861  
3852 3862          ret = (*(dvp)->v_op->vop_create)
3853 3863              (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3854 3864          if (ret == 0 && *vpp) {
3855 3865                  VOPSTATS_UPDATE(*vpp, create);
3856 3866                  vn_updatepath(dvp, *vpp, name);
3857 3867          }
3858 3868  
3859 3869          return (ret);
3860 3870  }
3861 3871  
3862 3872  int
3863 3873  fop_remove(
3864 3874          vnode_t *dvp,
3865 3875          char *nm,
3866 3876          cred_t *cr,
3867 3877          caller_context_t *ct,
3868 3878          int flags)
3869 3879  {
3870 3880          int     err;
3871 3881  
3872 3882          /*
3873 3883           * If this file system doesn't support case-insensitive access
3874 3884           * and said access is requested, fail quickly.
3875 3885           */
3876 3886          if (flags & FIGNORECASE &&
3877 3887              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3878 3888              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3879 3889                  return (EINVAL);
3880 3890  
3881 3891          VOPXID_MAP_CR(dvp, cr);
3882 3892  
3883 3893          err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3884 3894          VOPSTATS_UPDATE(dvp, remove);
3885 3895          return (err);
3886 3896  }
3887 3897  
3888 3898  int
3889 3899  fop_link(
3890 3900          vnode_t *tdvp,
3891 3901          vnode_t *svp,
3892 3902          char *tnm,
3893 3903          cred_t *cr,
3894 3904          caller_context_t *ct,
3895 3905          int flags)
3896 3906  {
3897 3907          int     err;
3898 3908  
3899 3909          /*
3900 3910           * If the target file system doesn't support case-insensitive access
3901 3911           * and said access is requested, fail quickly.
3902 3912           */
3903 3913          if (flags & FIGNORECASE &&
3904 3914              (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3905 3915              vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3906 3916                  return (EINVAL);
3907 3917  
3908 3918          VOPXID_MAP_CR(tdvp, cr);
3909 3919  
3910 3920          err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3911 3921          VOPSTATS_UPDATE(tdvp, link);
3912 3922          return (err);
3913 3923  }
3914 3924  
3915 3925  int
3916 3926  fop_rename(
3917 3927          vnode_t *sdvp,
3918 3928          char *snm,
3919 3929          vnode_t *tdvp,
3920 3930          char *tnm,
3921 3931          cred_t *cr,
3922 3932          caller_context_t *ct,
3923 3933          int flags)
3924 3934  {
3925 3935          int     err;
3926 3936  
3927 3937          /*
3928 3938           * If the file system involved does not support
3929 3939           * case-insensitive access and said access is requested, fail
3930 3940           * quickly.
3931 3941           */
3932 3942          if (flags & FIGNORECASE &&
3933 3943              ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3934 3944              vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3935 3945                  return (EINVAL);
3936 3946  
3937 3947          VOPXID_MAP_CR(tdvp, cr);
3938 3948  
3939 3949          err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3940 3950          VOPSTATS_UPDATE(sdvp, rename);
3941 3951          return (err);
3942 3952  }
3943 3953  
3944 3954  int
3945 3955  fop_mkdir(
3946 3956          vnode_t *dvp,
3947 3957          char *dirname,
3948 3958          vattr_t *vap,
3949 3959          vnode_t **vpp,
3950 3960          cred_t *cr,
3951 3961          caller_context_t *ct,
3952 3962          int flags,
3953 3963          vsecattr_t *vsecp)      /* ACL to set during create */
3954 3964  {
3955 3965          int ret;
3956 3966  
3957 3967          if (vsecp != NULL &&
3958 3968              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3959 3969                  return (EINVAL);
3960 3970          }
3961 3971          /*
3962 3972           * If this file system doesn't support case-insensitive access
3963 3973           * and said access is requested, fail quickly.
3964 3974           */
3965 3975          if (flags & FIGNORECASE &&
3966 3976              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3967 3977              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3968 3978                  return (EINVAL);
3969 3979  
3970 3980          VOPXID_MAP_CR(dvp, cr);
3971 3981  
3972 3982          ret = (*(dvp)->v_op->vop_mkdir)
3973 3983              (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3974 3984          if (ret == 0 && *vpp) {
3975 3985                  VOPSTATS_UPDATE(*vpp, mkdir);
3976 3986                  vn_updatepath(dvp, *vpp, dirname);
3977 3987          }
3978 3988  
3979 3989          return (ret);
3980 3990  }
3981 3991  
3982 3992  int
3983 3993  fop_rmdir(
3984 3994          vnode_t *dvp,
3985 3995          char *nm,
3986 3996          vnode_t *cdir,
3987 3997          cred_t *cr,
3988 3998          caller_context_t *ct,
3989 3999          int flags)
3990 4000  {
3991 4001          int     err;
3992 4002  
3993 4003          /*
3994 4004           * If this file system doesn't support case-insensitive access
3995 4005           * and said access is requested, fail quickly.
3996 4006           */
3997 4007          if (flags & FIGNORECASE &&
3998 4008              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3999 4009              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4000 4010                  return (EINVAL);
4001 4011  
4002 4012          VOPXID_MAP_CR(dvp, cr);
4003 4013  
4004 4014          err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
4005 4015          VOPSTATS_UPDATE(dvp, rmdir);
4006 4016          return (err);
4007 4017  }
4008 4018  
4009 4019  int
4010 4020  fop_readdir(
4011 4021          vnode_t *vp,
4012 4022          uio_t *uiop,
4013 4023          cred_t *cr,
4014 4024          int *eofp,
4015 4025          caller_context_t *ct,
4016 4026          int flags)
4017 4027  {
4018 4028          int     err;
4019 4029          ssize_t resid_start = uiop->uio_resid;
4020 4030  
4021 4031          /*
4022 4032           * If this file system doesn't support retrieving directory
4023 4033           * entry flags and said access is requested, fail quickly.
4024 4034           */
4025 4035          if (flags & V_RDDIR_ENTFLAGS &&
4026 4036              vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
4027 4037                  return (EINVAL);
4028 4038  
4029 4039          VOPXID_MAP_CR(vp, cr);
4030 4040  
4031 4041          err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
4032 4042          VOPSTATS_UPDATE_IO(vp, readdir,
4033 4043              readdir_bytes, (resid_start - uiop->uio_resid));
4034 4044          return (err);
4035 4045  }
4036 4046  
4037 4047  int
4038 4048  fop_symlink(
4039 4049          vnode_t *dvp,
4040 4050          char *linkname,
4041 4051          vattr_t *vap,
4042 4052          char *target,
4043 4053          cred_t *cr,
4044 4054          caller_context_t *ct,
4045 4055          int flags)
4046 4056  {
4047 4057          int     err;
4048 4058          xvattr_t xvattr;
4049 4059  
4050 4060          /*
4051 4061           * If this file system doesn't support case-insensitive access
4052 4062           * and said access is requested, fail quickly.
4053 4063           */
4054 4064          if (flags & FIGNORECASE &&
4055 4065              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
4056 4066              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4057 4067                  return (EINVAL);
4058 4068  
4059 4069          VOPXID_MAP_CR(dvp, cr);
4060 4070  
4061 4071          /* check for reparse point */
4062 4072          if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
4063 4073              (strncmp(target, FS_REPARSE_TAG_STR,
4064 4074              strlen(FS_REPARSE_TAG_STR)) == 0)) {
4065 4075                  if (!fs_reparse_mark(target, vap, &xvattr))
4066 4076                          vap = (vattr_t *)&xvattr;
4067 4077          }
4068 4078  
4069 4079          err = (*(dvp)->v_op->vop_symlink)
4070 4080              (dvp, linkname, vap, target, cr, ct, flags);
4071 4081          VOPSTATS_UPDATE(dvp, symlink);
4072 4082          return (err);
4073 4083  }
4074 4084  
4075 4085  int
4076 4086  fop_readlink(
4077 4087          vnode_t *vp,
4078 4088          uio_t *uiop,
4079 4089          cred_t *cr,
4080 4090          caller_context_t *ct)
4081 4091  {
4082 4092          int     err;
4083 4093  
4084 4094          VOPXID_MAP_CR(vp, cr);
4085 4095  
4086 4096          err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
4087 4097          VOPSTATS_UPDATE(vp, readlink);
4088 4098          return (err);
4089 4099  }
4090 4100  
4091 4101  int
4092 4102  fop_fsync(
4093 4103          vnode_t *vp,
4094 4104          int syncflag,
4095 4105          cred_t *cr,
4096 4106          caller_context_t *ct)
4097 4107  {
4098 4108          int     err;
4099 4109  
4100 4110          VOPXID_MAP_CR(vp, cr);
4101 4111  
4102 4112          err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
4103 4113          VOPSTATS_UPDATE(vp, fsync);
4104 4114          return (err);
4105 4115  }
4106 4116  
4107 4117  void
4108 4118  fop_inactive(
4109 4119          vnode_t *vp,
4110 4120          cred_t *cr,
4111 4121          caller_context_t *ct)
4112 4122  {
4113 4123          /* Need to update stats before vop call since we may lose the vnode */
4114 4124          VOPSTATS_UPDATE(vp, inactive);
4115 4125  
4116 4126          VOPXID_MAP_CR(vp, cr);
4117 4127  
4118 4128          (*(vp)->v_op->vop_inactive)(vp, cr, ct);
4119 4129  }
4120 4130  
4121 4131  int
4122 4132  fop_fid(
4123 4133          vnode_t *vp,
4124 4134          fid_t *fidp,
4125 4135          caller_context_t *ct)
4126 4136  {
4127 4137          int     err;
4128 4138  
4129 4139          err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4130 4140          VOPSTATS_UPDATE(vp, fid);
4131 4141          return (err);
4132 4142  }
4133 4143  
4134 4144  int
4135 4145  fop_rwlock(
4136 4146          vnode_t *vp,
4137 4147          int write_lock,
4138 4148          caller_context_t *ct)
4139 4149  {
4140 4150          int     ret;
4141 4151  
4142 4152          ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4143 4153          VOPSTATS_UPDATE(vp, rwlock);
4144 4154          return (ret);
4145 4155  }
4146 4156  
4147 4157  void
4148 4158  fop_rwunlock(
4149 4159          vnode_t *vp,
4150 4160          int write_lock,
4151 4161          caller_context_t *ct)
4152 4162  {
4153 4163          (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4154 4164          VOPSTATS_UPDATE(vp, rwunlock);
4155 4165  }
4156 4166  
4157 4167  int
4158 4168  fop_seek(
4159 4169          vnode_t *vp,
4160 4170          offset_t ooff,
4161 4171          offset_t *noffp,
4162 4172          caller_context_t *ct)
4163 4173  {
4164 4174          int     err;
4165 4175  
4166 4176          err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4167 4177          VOPSTATS_UPDATE(vp, seek);
4168 4178          return (err);
4169 4179  }
4170 4180  
4171 4181  int
4172 4182  fop_cmp(
4173 4183          vnode_t *vp1,
4174 4184          vnode_t *vp2,
4175 4185          caller_context_t *ct)
4176 4186  {
4177 4187          int     err;
4178 4188  
4179 4189          err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4180 4190          VOPSTATS_UPDATE(vp1, cmp);
4181 4191          return (err);
4182 4192  }
4183 4193  
4184 4194  int
4185 4195  fop_frlock(
4186 4196          vnode_t *vp,
4187 4197          int cmd,
4188 4198          flock64_t *bfp,
4189 4199          int flag,
4190 4200          offset_t offset,
4191 4201          struct flk_callback *flk_cbp,
4192 4202          cred_t *cr,
4193 4203          caller_context_t *ct)
4194 4204  {
4195 4205          int     err;
4196 4206  
4197 4207          VOPXID_MAP_CR(vp, cr);
4198 4208  
4199 4209          err = (*(vp)->v_op->vop_frlock)
4200 4210              (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4201 4211          VOPSTATS_UPDATE(vp, frlock);
4202 4212          return (err);
4203 4213  }
4204 4214  
4205 4215  int
4206 4216  fop_space(
4207 4217          vnode_t *vp,
4208 4218          int cmd,
4209 4219          flock64_t *bfp,
4210 4220          int flag,
4211 4221          offset_t offset,
4212 4222          cred_t *cr,
4213 4223          caller_context_t *ct)
4214 4224  {
4215 4225          int     err;
4216 4226  
4217 4227          VOPXID_MAP_CR(vp, cr);
4218 4228  
4219 4229          err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4220 4230          VOPSTATS_UPDATE(vp, space);
4221 4231          return (err);
4222 4232  }
4223 4233  
4224 4234  int
4225 4235  fop_realvp(
4226 4236          vnode_t *vp,
4227 4237          vnode_t **vpp,
4228 4238          caller_context_t *ct)
4229 4239  {
4230 4240          int     err;
4231 4241  
4232 4242          err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4233 4243          VOPSTATS_UPDATE(vp, realvp);
4234 4244          return (err);
4235 4245  }
4236 4246  
4237 4247  int
4238 4248  fop_getpage(
4239 4249          vnode_t *vp,
4240 4250          offset_t off,
4241 4251          size_t len,
4242 4252          uint_t *protp,
4243 4253          page_t **plarr,
4244 4254          size_t plsz,
4245 4255          struct seg *seg,
4246 4256          caddr_t addr,
4247 4257          enum seg_rw rw,
4248 4258          cred_t *cr,
4249 4259          caller_context_t *ct)
4250 4260  {
4251 4261          int     err;
4252 4262  
4253 4263          VOPXID_MAP_CR(vp, cr);
4254 4264  
4255 4265          err = (*(vp)->v_op->vop_getpage)
4256 4266              (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4257 4267          VOPSTATS_UPDATE(vp, getpage);
4258 4268          return (err);
4259 4269  }
4260 4270  
4261 4271  int
4262 4272  fop_putpage(
4263 4273          vnode_t *vp,
4264 4274          offset_t off,
4265 4275          size_t len,
4266 4276          int flags,
4267 4277          cred_t *cr,
4268 4278          caller_context_t *ct)
4269 4279  {
4270 4280          int     err;
4271 4281  
4272 4282          VOPXID_MAP_CR(vp, cr);
4273 4283  
4274 4284          err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4275 4285          VOPSTATS_UPDATE(vp, putpage);
4276 4286          return (err);
4277 4287  }
4278 4288  
4279 4289  int
4280 4290  fop_map(
4281 4291          vnode_t *vp,
4282 4292          offset_t off,
4283 4293          struct as *as,
4284 4294          caddr_t *addrp,
4285 4295          size_t len,
4286 4296          uchar_t prot,
4287 4297          uchar_t maxprot,
4288 4298          uint_t flags,
4289 4299          cred_t *cr,
4290 4300          caller_context_t *ct)
4291 4301  {
4292 4302          int     err;
4293 4303  
4294 4304          VOPXID_MAP_CR(vp, cr);
4295 4305  
4296 4306          err = (*(vp)->v_op->vop_map)
4297 4307              (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4298 4308          VOPSTATS_UPDATE(vp, map);
4299 4309          return (err);
4300 4310  }
4301 4311  
4302 4312  int
4303 4313  fop_addmap(
4304 4314          vnode_t *vp,
4305 4315          offset_t off,
4306 4316          struct as *as,
4307 4317          caddr_t addr,
4308 4318          size_t len,
4309 4319          uchar_t prot,
4310 4320          uchar_t maxprot,
4311 4321          uint_t flags,
4312 4322          cred_t *cr,
4313 4323          caller_context_t *ct)
4314 4324  {
4315 4325          int error;
4316 4326          u_longlong_t delta;
4317 4327  
4318 4328          VOPXID_MAP_CR(vp, cr);
4319 4329  
4320 4330          error = (*(vp)->v_op->vop_addmap)
4321 4331              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4322 4332  
4323 4333          if ((!error) && (vp->v_type == VREG)) {
4324 4334                  delta = (u_longlong_t)btopr(len);
4325 4335                  /*
4326 4336                   * If file is declared MAP_PRIVATE, it can't be written back
4327 4337                   * even if open for write. Handle as read.
4328 4338                   */
4329 4339                  if (flags & MAP_PRIVATE) {
4330 4340                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4331 4341                              (int64_t)delta);
4332 4342                  } else {
4333 4343                          /*
4334 4344                           * atomic_add_64 forces the fetch of a 64 bit value to
4335 4345                           * be atomic on 32 bit machines
4336 4346                           */
4337 4347                          if (maxprot & PROT_WRITE)
4338 4348                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4339 4349                                      (int64_t)delta);
4340 4350                          if (maxprot & PROT_READ)
4341 4351                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4342 4352                                      (int64_t)delta);
4343 4353                          if (maxprot & PROT_EXEC)
4344 4354                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4345 4355                                      (int64_t)delta);
4346 4356                  }
4347 4357          }
4348 4358          VOPSTATS_UPDATE(vp, addmap);
4349 4359          return (error);
4350 4360  }
4351 4361  
4352 4362  int
4353 4363  fop_delmap(
4354 4364          vnode_t *vp,
4355 4365          offset_t off,
4356 4366          struct as *as,
4357 4367          caddr_t addr,
4358 4368          size_t len,
4359 4369          uint_t prot,
4360 4370          uint_t maxprot,
4361 4371          uint_t flags,
4362 4372          cred_t *cr,
4363 4373          caller_context_t *ct)
4364 4374  {
4365 4375          int error;
4366 4376          u_longlong_t delta;
4367 4377  
4368 4378          VOPXID_MAP_CR(vp, cr);
4369 4379  
4370 4380          error = (*(vp)->v_op->vop_delmap)
4371 4381              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4372 4382  
4373 4383          /*
4374 4384           * NFS calls into delmap twice, the first time
4375 4385           * it simply establishes a callback mechanism and returns EAGAIN
4376 4386           * while the real work is being done upon the second invocation.
4377 4387           * We have to detect this here and only decrement the counts upon
4378 4388           * the second delmap request.
4379 4389           */
4380 4390          if ((error != EAGAIN) && (vp->v_type == VREG)) {
4381 4391  
4382 4392                  delta = (u_longlong_t)btopr(len);
4383 4393  
4384 4394                  if (flags & MAP_PRIVATE) {
4385 4395                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4386 4396                              (int64_t)(-delta));
4387 4397                  } else {
4388 4398                          /*
4389 4399                           * atomic_add_64 forces the fetch of a 64 bit value
4390 4400                           * to be atomic on 32 bit machines
4391 4401                           */
4392 4402                          if (maxprot & PROT_WRITE)
4393 4403                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4394 4404                                      (int64_t)(-delta));
4395 4405                          if (maxprot & PROT_READ)
4396 4406                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4397 4407                                      (int64_t)(-delta));
4398 4408                          if (maxprot & PROT_EXEC)
4399 4409                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4400 4410                                      (int64_t)(-delta));
4401 4411                  }
4402 4412          }
4403 4413          VOPSTATS_UPDATE(vp, delmap);
4404 4414          return (error);
4405 4415  }
4406 4416  
4407 4417  
4408 4418  int
4409 4419  fop_poll(
4410 4420          vnode_t *vp,
4411 4421          short events,
4412 4422          int anyyet,
4413 4423          short *reventsp,
4414 4424          struct pollhead **phpp,
4415 4425          caller_context_t *ct)
4416 4426  {
4417 4427          int     err;
4418 4428  
4419 4429          err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4420 4430          VOPSTATS_UPDATE(vp, poll);
4421 4431          return (err);
4422 4432  }
4423 4433  
4424 4434  int
4425 4435  fop_dump(
4426 4436          vnode_t *vp,
4427 4437          caddr_t addr,
4428 4438          offset_t lbdn,
4429 4439          offset_t dblks,
4430 4440          caller_context_t *ct)
4431 4441  {
4432 4442          int     err;
4433 4443  
4434 4444          /* ensure lbdn and dblks can be passed safely to bdev_dump */
4435 4445          if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4436 4446                  return (EIO);
4437 4447  
4438 4448          err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4439 4449          VOPSTATS_UPDATE(vp, dump);
4440 4450          return (err);
4441 4451  }
4442 4452  
4443 4453  int
4444 4454  fop_pathconf(
4445 4455          vnode_t *vp,
4446 4456          int cmd,
4447 4457          ulong_t *valp,
4448 4458          cred_t *cr,
4449 4459          caller_context_t *ct)
4450 4460  {
4451 4461          int     err;
4452 4462  
4453 4463          VOPXID_MAP_CR(vp, cr);
4454 4464  
4455 4465          err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4456 4466          VOPSTATS_UPDATE(vp, pathconf);
4457 4467          return (err);
4458 4468  }
4459 4469  
4460 4470  int
4461 4471  fop_pageio(
4462 4472          vnode_t *vp,
4463 4473          struct page *pp,
4464 4474          u_offset_t io_off,
4465 4475          size_t io_len,
4466 4476          int flags,
4467 4477          cred_t *cr,
4468 4478          caller_context_t *ct)
4469 4479  {
4470 4480          int     err;
4471 4481  
4472 4482          VOPXID_MAP_CR(vp, cr);
4473 4483  
4474 4484          err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4475 4485          VOPSTATS_UPDATE(vp, pageio);
4476 4486          return (err);
4477 4487  }
4478 4488  
4479 4489  int
4480 4490  fop_dumpctl(
4481 4491          vnode_t *vp,
4482 4492          int action,
4483 4493          offset_t *blkp,
4484 4494          caller_context_t *ct)
4485 4495  {
4486 4496          int     err;
4487 4497          err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4488 4498          VOPSTATS_UPDATE(vp, dumpctl);
4489 4499          return (err);
4490 4500  }
4491 4501  
4492 4502  void
4493 4503  fop_dispose(
4494 4504          vnode_t *vp,
4495 4505          page_t *pp,
4496 4506          int flag,
4497 4507          int dn,
4498 4508          cred_t *cr,
4499 4509          caller_context_t *ct)
4500 4510  {
4501 4511          /* Must do stats first since it's possible to lose the vnode */
4502 4512          VOPSTATS_UPDATE(vp, dispose);
4503 4513  
4504 4514          VOPXID_MAP_CR(vp, cr);
4505 4515  
4506 4516          (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4507 4517  }
4508 4518  
4509 4519  int
4510 4520  fop_setsecattr(
4511 4521          vnode_t *vp,
4512 4522          vsecattr_t *vsap,
4513 4523          int flag,
4514 4524          cred_t *cr,
4515 4525          caller_context_t *ct)
4516 4526  {
4517 4527          int     err;
4518 4528  
4519 4529          VOPXID_MAP_CR(vp, cr);
4520 4530  
4521 4531          /*
4522 4532           * We're only allowed to skip the ACL check iff we used a 32 bit
4523 4533           * ACE mask with VOP_ACCESS() to determine permissions.
4524 4534           */
4525 4535          if ((flag & ATTR_NOACLCHECK) &&
4526 4536              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4527 4537                  return (EINVAL);
4528 4538          }
4529 4539          err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4530 4540          VOPSTATS_UPDATE(vp, setsecattr);
4531 4541          return (err);
4532 4542  }
4533 4543  
4534 4544  int
4535 4545  fop_getsecattr(
4536 4546          vnode_t *vp,
4537 4547          vsecattr_t *vsap,
4538 4548          int flag,
4539 4549          cred_t *cr,
4540 4550          caller_context_t *ct)
4541 4551  {
4542 4552          int     err;
4543 4553  
4544 4554          /*
4545 4555           * We're only allowed to skip the ACL check iff we used a 32 bit
4546 4556           * ACE mask with VOP_ACCESS() to determine permissions.
4547 4557           */
4548 4558          if ((flag & ATTR_NOACLCHECK) &&
4549 4559              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4550 4560                  return (EINVAL);
4551 4561          }
4552 4562  
4553 4563          VOPXID_MAP_CR(vp, cr);
4554 4564  
4555 4565          err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4556 4566          VOPSTATS_UPDATE(vp, getsecattr);
4557 4567          return (err);
4558 4568  }
4559 4569  
4560 4570  int
4561 4571  fop_shrlock(
4562 4572          vnode_t *vp,
4563 4573          int cmd,
4564 4574          struct shrlock *shr,
4565 4575          int flag,
4566 4576          cred_t *cr,
4567 4577          caller_context_t *ct)
4568 4578  {
4569 4579          int     err;
4570 4580  
4571 4581          VOPXID_MAP_CR(vp, cr);
4572 4582  
4573 4583          err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4574 4584          VOPSTATS_UPDATE(vp, shrlock);
4575 4585          return (err);
4576 4586  }
4577 4587  
4578 4588  int
4579 4589  fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4580 4590      caller_context_t *ct)
4581 4591  {
4582 4592          int     err;
4583 4593  
4584 4594          err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4585 4595          VOPSTATS_UPDATE(vp, vnevent);
4586 4596          return (err);
4587 4597  }
4588 4598  
4589 4599  int
4590 4600  fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4591 4601      caller_context_t *ct)
4592 4602  {
4593 4603          int err;
4594 4604  
4595 4605          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4596 4606                  return (ENOTSUP);
4597 4607          err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4598 4608          VOPSTATS_UPDATE(vp, reqzcbuf);
4599 4609          return (err);
4600 4610  }
4601 4611  
4602 4612  int
4603 4613  fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4604 4614  {
4605 4615          int err;
4606 4616  
4607 4617          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4608 4618                  return (ENOTSUP);
4609 4619          err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4610 4620          VOPSTATS_UPDATE(vp, retzcbuf);
4611 4621          return (err);
4612 4622  }
4613 4623  
/*
 * Do-nothing destructor.
 *      A NULL entry in the destructor table marks a key as unused, so a
 *      key created without a destructor needs a non-NULL placeholder.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4622 4632  
4623 4633  /*
4624 4634   * Create a key (index into per vnode array)
4625 4635   *      Locks out vsd_create, vsd_destroy, and vsd_free
4626 4636   *      May allocate memory with lock held
4627 4637   */
4628 4638  void
4629 4639  vsd_create(uint_t *keyp, void (*destructor)(void *))
4630 4640  {
4631 4641          int     i;
4632 4642          uint_t  nkeys;
4633 4643  
4634 4644          /*
4635 4645           * if key is allocated, do nothing
4636 4646           */
4637 4647          mutex_enter(&vsd_lock);
4638 4648          if (*keyp) {
4639 4649                  mutex_exit(&vsd_lock);
4640 4650                  return;
4641 4651          }
4642 4652          /*
4643 4653           * find an unused key
4644 4654           */
4645 4655          if (destructor == NULL)
4646 4656                  destructor = vsd_defaultdestructor;
4647 4657  
4648 4658          for (i = 0; i < vsd_nkeys; ++i)
4649 4659                  if (vsd_destructor[i] == NULL)
4650 4660                          break;
4651 4661  
4652 4662          /*
4653 4663           * if no unused keys, increase the size of the destructor array
4654 4664           */
4655 4665          if (i == vsd_nkeys) {
4656 4666                  if ((nkeys = (vsd_nkeys << 1)) == 0)
4657 4667                          nkeys = 1;
4658 4668                  vsd_destructor =
4659 4669                      (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4660 4670                      (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4661 4671                      (size_t)(nkeys * sizeof (void (*)(void *))));
4662 4672                  vsd_nkeys = nkeys;
4663 4673          }
4664 4674  
4665 4675          /*
4666 4676           * allocate the next available unused key
4667 4677           */
4668 4678          vsd_destructor[i] = destructor;
4669 4679          *keyp = i + 1;
4670 4680  
4671 4681          /* create vsd_list, if it doesn't exist */
4672 4682          if (vsd_list == NULL) {
4673 4683                  vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4674 4684                  list_create(vsd_list, sizeof (struct vsd_node),
4675 4685                      offsetof(struct vsd_node, vs_nodes));
4676 4686          }
4677 4687  
4678 4688          mutex_exit(&vsd_lock);
4679 4689  }
4680 4690  
4681 4691  /*
4682 4692   * Destroy a key
4683 4693   *
4684 4694   * Assumes that the caller is preventing vsd_set and vsd_get
4685 4695   * Locks out vsd_create, vsd_destroy, and vsd_free
4686 4696   * May free memory with lock held
4687 4697   */
4688 4698  void
4689 4699  vsd_destroy(uint_t *keyp)
4690 4700  {
4691 4701          uint_t key;
4692 4702          struct vsd_node *vsd;
4693 4703  
4694 4704          /*
4695 4705           * protect the key namespace and our destructor lists
4696 4706           */
4697 4707          mutex_enter(&vsd_lock);
4698 4708          key = *keyp;
4699 4709          *keyp = 0;
4700 4710  
4701 4711          ASSERT(key <= vsd_nkeys);
4702 4712  
4703 4713          /*
4704 4714           * if the key is valid
4705 4715           */
4706 4716          if (key != 0) {
4707 4717                  uint_t k = key - 1;
4708 4718                  /*
4709 4719                   * for every vnode with VSD, call key's destructor
4710 4720                   */
4711 4721                  for (vsd = list_head(vsd_list); vsd != NULL;
4712 4722                      vsd = list_next(vsd_list, vsd)) {
4713 4723                          /*
4714 4724                           * no VSD for key in this vnode
4715 4725                           */
4716 4726                          if (key > vsd->vs_nkeys)
4717 4727                                  continue;
4718 4728                          /*
4719 4729                           * call destructor for key
4720 4730                           */
4721 4731                          if (vsd->vs_value[k] && vsd_destructor[k])
4722 4732                                  (*vsd_destructor[k])(vsd->vs_value[k]);
4723 4733                          /*
4724 4734                           * reset value for key
4725 4735                           */
4726 4736                          vsd->vs_value[k] = NULL;
4727 4737                  }
4728 4738                  /*
4729 4739                   * actually free the key (NULL destructor == unused)
4730 4740                   */
4731 4741                  vsd_destructor[k] = NULL;
4732 4742          }
4733 4743  
4734 4744          mutex_exit(&vsd_lock);
4735 4745  }
4736 4746  
4737 4747  /*
4738 4748   * Quickly return the per vnode value that was stored with the specified key
4739 4749   * Assumes the caller is protecting key from vsd_create and vsd_destroy
4740 4750   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4741 4751   */
4742 4752  void *
4743 4753  vsd_get(vnode_t *vp, uint_t key)
4744 4754  {
4745 4755          struct vsd_node *vsd;
4746 4756  
4747 4757          ASSERT(vp != NULL);
4748 4758          ASSERT(mutex_owned(&vp->v_vsd_lock));
4749 4759  
4750 4760          vsd = vp->v_vsd;
4751 4761  
4752 4762          if (key && vsd != NULL && key <= vsd->vs_nkeys)
4753 4763                  return (vsd->vs_value[key - 1]);
4754 4764          return (NULL);
4755 4765  }
4756 4766  
4757 4767  /*
4758 4768   * Set a per vnode value indexed with the specified key
4759 4769   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4760 4770   */
4761 4771  int
4762 4772  vsd_set(vnode_t *vp, uint_t key, void *value)
4763 4773  {
4764 4774          struct vsd_node *vsd;
4765 4775  
4766 4776          ASSERT(vp != NULL);
4767 4777          ASSERT(mutex_owned(&vp->v_vsd_lock));
4768 4778  
4769 4779          if (key == 0)
4770 4780                  return (EINVAL);
4771 4781  
4772 4782          vsd = vp->v_vsd;
4773 4783          if (vsd == NULL)
4774 4784                  vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4775 4785  
4776 4786          /*
4777 4787           * If the vsd was just allocated, vs_nkeys will be 0, so the following
4778 4788           * code won't happen and we will continue down and allocate space for
4779 4789           * the vs_value array.
4780 4790           * If the caller is replacing one value with another, then it is up
4781 4791           * to the caller to free/rele/destroy the previous value (if needed).
4782 4792           */
4783 4793          if (key <= vsd->vs_nkeys) {
4784 4794                  vsd->vs_value[key - 1] = value;
4785 4795                  return (0);
4786 4796          }
4787 4797  
4788 4798          ASSERT(key <= vsd_nkeys);
4789 4799  
4790 4800          if (vsd->vs_nkeys == 0) {
4791 4801                  mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4792 4802                  /*
4793 4803                   * Link onto list of all VSD nodes.
4794 4804                   */
4795 4805                  list_insert_head(vsd_list, vsd);
4796 4806                  mutex_exit(&vsd_lock);
4797 4807          }
4798 4808  
4799 4809          /*
4800 4810           * Allocate vnode local storage and set the value for key
4801 4811           */
4802 4812          vsd->vs_value = vsd_realloc(vsd->vs_value,
4803 4813              vsd->vs_nkeys * sizeof (void *),
4804 4814              key * sizeof (void *));
4805 4815          vsd->vs_nkeys = key;
4806 4816          vsd->vs_value[key - 1] = value;
4807 4817  
4808 4818          return (0);
4809 4819  }
4810 4820  
4811 4821  /*
4812 4822   * Called from vn_free() to run the destructor function for each vsd
4813 4823   *      Locks out vsd_create and vsd_destroy
4814 4824   *      Assumes that the destructor *DOES NOT* use vsd
4815 4825   */
4816 4826  void
4817 4827  vsd_free(vnode_t *vp)
4818 4828  {
4819 4829          int i;
4820 4830          struct vsd_node *vsd = vp->v_vsd;
4821 4831  
4822 4832          if (vsd == NULL)
4823 4833                  return;
4824 4834  
4825 4835          if (vsd->vs_nkeys == 0) {
4826 4836                  kmem_free(vsd, sizeof (*vsd));
4827 4837                  vp->v_vsd = NULL;
4828 4838                  return;
4829 4839          }
4830 4840  
4831 4841          /*
4832 4842           * lock out vsd_create and vsd_destroy, call
4833 4843           * the destructor, and mark the value as destroyed.
4834 4844           */
4835 4845          mutex_enter(&vsd_lock);
4836 4846  
4837 4847          for (i = 0; i < vsd->vs_nkeys; i++) {
4838 4848                  if (vsd->vs_value[i] && vsd_destructor[i])
4839 4849                          (*vsd_destructor[i])(vsd->vs_value[i]);
4840 4850                  vsd->vs_value[i] = NULL;
4841 4851          }
4842 4852  
4843 4853          /*
4844 4854           * remove from linked list of VSD nodes
4845 4855           */
4846 4856          list_remove(vsd_list, vsd);
4847 4857  
4848 4858          mutex_exit(&vsd_lock);
4849 4859  
4850 4860          /*
4851 4861           * free up the VSD
4852 4862           */
4853 4863          kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4854 4864          kmem_free(vsd, sizeof (struct vsd_node));
4855 4865          vp->v_vsd = NULL;
4856 4866  }
4857 4867  
4858 4868  /*
4859 4869   * realloc
4860 4870   */
4861 4871  static void *
4862 4872  vsd_realloc(void *old, size_t osize, size_t nsize)
4863 4873  {
4864 4874          void *new;
4865 4875  
4866 4876          new = kmem_zalloc(nsize, KM_SLEEP);
4867 4877          if (old) {
4868 4878                  bcopy(old, new, osize);
4869 4879                  kmem_free(old, osize);
4870 4880          }
4871 4881          return (new);
4872 4882  }
4873 4883  
4874 4884  /*
4875 4885   * Setup the extensible system attribute for creating a reparse point.
4876 4886   * The symlink data 'target' is validated for proper format of a reparse
4877 4887   * string and a check also made to make sure the symlink data does not
4878 4888   * point to an existing file.
4879 4889   *
4880 4890   * return 0 if ok else -1.
4881 4891   */
4882 4892  static int
4883 4893  fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4884 4894  {
4885 4895          xoptattr_t *xoap;
4886 4896  
4887 4897          if ((!target) || (!vap) || (!xvattr))
4888 4898                  return (-1);
4889 4899  
4890 4900          /* validate reparse string */
4891 4901          if (reparse_validate((const char *)target))
4892 4902                  return (-1);
4893 4903  
4894 4904          xva_init(xvattr);
4895 4905          xvattr->xva_vattr = *vap;
4896 4906          xvattr->xva_vattr.va_mask |= AT_XVATTR;
4897 4907          xoap = xva_getxoptattr(xvattr);
4898 4908          ASSERT(xoap);
4899 4909          XVA_SET_REQ(xvattr, XAT_REPARSE);
4900 4910          xoap->xoa_reparse = 1;
4901 4911  
4902 4912          return (0);
4903 4913  }
4904 4914  
4905 4915  /*
4906 4916   * Function to check whether a symlink is a reparse point.
4907 4917   * Return B_TRUE if it is a reparse point, else return B_FALSE
4908 4918   */
4909 4919  boolean_t
4910 4920  vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4911 4921  {
4912 4922          xvattr_t xvattr;
4913 4923          xoptattr_t *xoap;
4914 4924  
4915 4925          if ((vp->v_type != VLNK) ||
4916 4926              !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4917 4927                  return (B_FALSE);
4918 4928  
4919 4929          xva_init(&xvattr);
4920 4930          xoap = xva_getxoptattr(&xvattr);
4921 4931          ASSERT(xoap);
4922 4932          XVA_SET_REQ(&xvattr, XAT_REPARSE);
4923 4933  
4924 4934          if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4925 4935                  return (B_FALSE);
4926 4936  
4927 4937          if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4928 4938              (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4929 4939                  return (B_FALSE);
4930 4940  
4931 4941          return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4932 4942  }

↓ open down ↓

3964 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX