omnios-smartos Wdiff usr/src/uts/common/fs/vnode.c

Print this page

11679 vn_rele() and friends should VERIFY after mutex
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/vnode.c
          +++ new/usr/src/uts/common/fs/vnode.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.

↓ open down ↓

14 lines elided

↑ open up ↑

  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2020 Joyent, Inc.
       25 + * Copyright 2022 Spencer Evans-Cole.
  25   26   * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  26   27   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  27   28   * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  28   29   */
  29   30  
  30   31  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  31   32  /*        All Rights Reserved   */
  32   33  
  33   34  /*
  34   35   * University Copyright- Copyright (c) 1982, 1986, 1988

  35   36   * The Regents of the University of California
  36   37   * All Rights Reserved
  37   38   *
  38   39   * University Acknowledgment- Portions of this document are derived from
  39   40   * software developed by the University of California, Berkeley, and its
  40   41   * contributors.
  41   42   */
  42   43  
  43   44  #include <sys/types.h>
  44   45  #include <sys/param.h>
  45   46  #include <sys/t_lock.h>
  46   47  #include <sys/errno.h>
  47   48  #include <sys/cred.h>
  48   49  #include <sys/user.h>
  49   50  #include <sys/uio.h>
  50   51  #include <sys/file.h>
  51   52  #include <sys/pathname.h>
  52   53  #include <sys/vfs.h>
  53   54  #include <sys/vfs_opreg.h>
  54   55  #include <sys/vnode.h>
  55   56  #include <sys/filio.h>
  56   57  #include <sys/rwstlock.h>
  57   58  #include <sys/fem.h>
  58   59  #include <sys/stat.h>
  59   60  #include <sys/mode.h>
  60   61  #include <sys/conf.h>
  61   62  #include <sys/sysmacros.h>
  62   63  #include <sys/cmn_err.h>
  63   64  #include <sys/systm.h>
  64   65  #include <sys/kmem.h>
  65   66  #include <sys/debug.h>
  66   67  #include <c2/audit.h>
  67   68  #include <sys/acl.h>
  68   69  #include <sys/nbmlock.h>
  69   70  #include <sys/fcntl.h>
  70   71  #include <fs/fs_subr.h>
  71   72  #include <sys/taskq.h>
  72   73  #include <fs/fs_reparse.h>
  73   74  #include <sys/time.h>
  74   75  #include <sys/sdt.h>
  75   76  
  76   77  /* Determine if this vnode is a file that is read-only */
  77   78  #define ISROFILE(vp)    \
  78   79          ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  79   80              (vp)->v_type != VFIFO && vn_is_readonly(vp))
  80   81  
  81   82  /* Tunable via /etc/system; used only by admin/install */
  82   83  int nfs_global_client_only;
  83   84  
  84   85  /*
  85   86   * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  86   87   * number of entries as and parallel to the vfssw table.  (Arguably, it could
  87   88   * be part of the vfssw table.)  Once it's initialized, it's accessed using
  88   89   * the same fstype index that is used to index into the vfssw table.
  89   90   */
  90   91  vopstats_t **vopstats_fstype;
  91   92  
  92   93  /* vopstats initialization template used for fast initialization via bcopy() */
  93   94  static vopstats_t *vs_templatep;
  94   95  
  95   96  /* Kmem cache handle for vsk_anchor_t allocations */
  96   97  kmem_cache_t *vsk_anchor_cache;
  97   98  
  98   99  /* file events cleanup routine */
  99  100  extern void free_fopdata(vnode_t *);
 100  101  
 101  102  /*
 102  103   * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 103  104   * updates to vskstat_tree.
 104  105   */
 105  106  avl_tree_t      vskstat_tree;
 106  107  kmutex_t        vskstat_tree_lock;
 107  108  
 108  109  /* Global variable which enables/disables the vopstats collection */
 109  110  int vopstats_enabled = 1;
 110  111  
 111  112  /* Global used for empty/invalid v_path */
 112  113  char *vn_vpath_empty = "";
 113  114  
 114  115  /*
 115  116   * forward declarations for internal vnode specific data (vsd)
 116  117   */
 117  118  static void *vsd_realloc(void *, size_t, size_t);
 118  119  
 119  120  /*
 120  121   * forward declarations for reparse point functions
 121  122   */
 122  123  static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 123  124  
 124  125  /*
 125  126   * VSD -- VNODE SPECIFIC DATA
 126  127   * The v_data pointer is typically used by a file system to store a
 127  128   * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 128  129   * However, there are times when additional project private data needs
 129  130   * to be stored separately from the data (node) pointed to by v_data.
 130  131   * This additional data could be stored by the file system itself or
 131  132   * by a completely different kernel entity.  VSD provides a way for
 132  133   * callers to obtain a key and store a pointer to private data associated
 133  134   * with a vnode.
 134  135   *
 135  136   * Callers are responsible for protecting the vsd by holding v_vsd_lock
 136  137   * for calls to vsd_set() and vsd_get().
 137  138   */
 138  139  
 139  140  /*
 140  141   * vsd_lock protects:
 141  142   *   vsd_nkeys - creation and deletion of vsd keys
 142  143   *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 143  144   *   vsd_destructor - adding and removing destructors to the list
 144  145   */
 145  146  static kmutex_t         vsd_lock;
 146  147  static uint_t           vsd_nkeys;       /* size of destructor array */
 147  148  /* list of vsd_node's */
 148  149  static list_t *vsd_list = NULL;
 149  150  /* per-key destructor funcs */
 150  151  static void             (**vsd_destructor)(void *);
 151  152  
 152  153  /*
 153  154   * The following is the common set of actions needed to update the
 154  155   * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 155  156   * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 156  157   * recording of the bytes transferred.  Since the code is similar
 157  158   * but small, it is nearly a duplicate.  Consequently any changes
 158  159   * to one may need to be reflected in the other.
 159  160   * Rundown of the variables:
 160  161   * vp - Pointer to the vnode
 161  162   * counter - Partial name structure member to update in vopstats for counts
 162  163   * bytecounter - Partial name structure member to update in vopstats for bytes
 163  164   * bytesval - Value to update in vopstats for bytes
 164  165   * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 165  166   * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 166  167   */
 167  168  
 168  169  #define VOPSTATS_UPDATE(vp, counter) {                                  \
 169  170          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 170  171          if (vfsp && vfsp->vfs_implp &&                                  \
 171  172              (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
 172  173                  vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
 173  174                  uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
 174  175                  extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 175  176                      size_t, uint64_t *);                                \
 176  177                  __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
 177  178                  (*stataddr)++;                                          \
 178  179                  if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
 179  180                          vsp->n##counter.value.ui64++;                   \
 180  181                  }                                                       \
 181  182          }                                                               \
 182  183  }
 183  184  
 184  185  #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
 185  186          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 186  187          if (vfsp && vfsp->vfs_implp &&                                  \
 187  188              (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
 188  189                  vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
 189  190                  uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
 190  191                  extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 191  192                      size_t, uint64_t *);                                \
 192  193                  __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 193  194                  (*stataddr)++;                                          \
 194  195                  vsp->bytecounter.value.ui64 += bytesval;                \
 195  196                  if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
 196  197                          vsp->n##counter.value.ui64++;                   \
 197  198                          vsp->bytecounter.value.ui64 += bytesval;        \
 198  199                  }                                                       \
 199  200          }                                                               \
 200  201  }
 201  202  
 202  203  /*
 203  204   * If the filesystem does not support XIDs map credential
 204  205   * If the vfsp is NULL, perhaps we should also map?
 205  206   */
 206  207  #define VOPXID_MAP_CR(vp, cr)   {                                       \
 207  208          vfs_t *vfsp = (vp)->v_vfsp;                                     \
 208  209          if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)            \
 209  210                  cr = crgetmapped(cr);                                   \
 210  211          }
 211  212  
 212  213  #define VOP_LATENCY_10MS        10000000
 213  214  #define VOP_LATENCY_100MS       100000000
 214  215  #define VOP_LATENCY_1S          1000000000
 215  216  
 216  217  /*
 217  218   * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 218  219   * numerical order of S_IFMT and vnode types.)
 219  220   */
 220  221  enum vtype iftovt_tab[] = {
 221  222          VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 222  223          VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 223  224  };
 224  225  
 225  226  ushort_t vttoif_tab[] = {
 226  227          0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 227  228          S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 228  229  };
 229  230  
 230  231  /*
 231  232   * The system vnode cache.
 232  233   */
 233  234  
 234  235  kmem_cache_t *vn_cache;
 235  236  
 236  237  
 237  238  /*
 238  239   * Vnode operations vector.
 239  240   */
 240  241  
 241  242  static const fs_operation_trans_def_t vn_ops_table[] = {
 242  243          VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
 243  244              fs_nosys, fs_nosys,
 244  245  
 245  246          VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
 246  247              fs_nosys, fs_nosys,
 247  248  
 248  249          VOPNAME_READ, offsetof(struct vnodeops, vop_read),
 249  250              fs_nosys, fs_nosys,
 250  251  
 251  252          VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
 252  253              fs_nosys, fs_nosys,
 253  254  
 254  255          VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
 255  256              fs_nosys, fs_nosys,
 256  257  
 257  258          VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
 258  259              fs_setfl, fs_nosys,
 259  260  
 260  261          VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
 261  262              fs_nosys, fs_nosys,
 262  263  
 263  264          VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
 264  265              fs_nosys, fs_nosys,
 265  266  
 266  267          VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
 267  268              fs_nosys, fs_nosys,
 268  269  
 269  270          VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
 270  271              fs_nosys, fs_nosys,
 271  272  
 272  273          VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
 273  274              fs_nosys, fs_nosys,
 274  275  
 275  276          VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
 276  277              fs_nosys, fs_nosys,
 277  278  
 278  279          VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
 279  280              fs_nosys, fs_nosys,
 280  281  
 281  282          VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
 282  283              fs_nosys, fs_nosys,
 283  284  
 284  285          VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
 285  286              fs_nosys, fs_nosys,
 286  287  
 287  288          VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
 288  289              fs_nosys, fs_nosys,
 289  290  
 290  291          VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
 291  292              fs_nosys, fs_nosys,
 292  293  
 293  294          VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
 294  295              fs_nosys, fs_nosys,
 295  296  
 296  297          VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
 297  298              fs_nosys, fs_nosys,
 298  299  
 299  300          VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
 300  301              fs_nosys, fs_nosys,
 301  302  
 302  303          VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
 303  304              fs_nosys, fs_nosys,
 304  305  
 305  306          VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
 306  307              fs_nosys, fs_nosys,
 307  308  
 308  309          VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
 309  310              fs_rwlock, fs_rwlock,
 310  311  
 311  312          VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
 312  313              (fs_generic_func_p)(uintptr_t)fs_rwunlock,
 313  314              (fs_generic_func_p)(uintptr_t)fs_rwunlock,  /* no errors allowed */
 314  315  
 315  316          VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
 316  317              fs_nosys, fs_nosys,
 317  318  
 318  319          VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
 319  320              fs_cmp, fs_cmp,             /* no errors allowed */
 320  321  
 321  322          VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
 322  323              fs_frlock, fs_nosys,
 323  324  
 324  325          VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
 325  326              fs_nosys, fs_nosys,
 326  327  
 327  328          VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
 328  329              fs_nosys, fs_nosys,
 329  330  
 330  331          VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
 331  332              fs_nosys, fs_nosys,
 332  333  
 333  334          VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
 334  335              fs_nosys, fs_nosys,
 335  336  
 336  337          VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
 337  338              (fs_generic_func_p) fs_nosys_map,
 338  339              (fs_generic_func_p) fs_nosys_map,
 339  340  
 340  341          VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
 341  342              (fs_generic_func_p) fs_nosys_addmap,
 342  343              (fs_generic_func_p) fs_nosys_addmap,
 343  344  
 344  345          VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
 345  346              fs_nosys, fs_nosys,
 346  347  
 347  348          VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
 348  349              (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
 349  350  
 350  351          VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
 351  352              fs_nosys, fs_nosys,
 352  353  
 353  354          VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
 354  355              fs_pathconf, fs_nosys,
 355  356  
 356  357          VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
 357  358              fs_nosys, fs_nosys,
 358  359  
 359  360          VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
 360  361              fs_nosys, fs_nosys,
 361  362  
 362  363          VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
 363  364              (fs_generic_func_p)(uintptr_t)fs_dispose,
 364  365              (fs_generic_func_p)(uintptr_t)fs_nodispose,
 365  366  
 366  367          VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
 367  368              fs_nosys, fs_nosys,
 368  369  
 369  370          VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
 370  371              fs_fab_acl, fs_nosys,
 371  372  
 372  373          VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
 373  374              fs_shrlock, fs_nosys,
 374  375  
 375  376          VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
 376  377              (fs_generic_func_p) fs_vnevent_nosupport,
 377  378              (fs_generic_func_p) fs_vnevent_nosupport,
 378  379  
 379  380          VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
 380  381              fs_nosys, fs_nosys,
 381  382  
 382  383          VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
 383  384              fs_nosys, fs_nosys,
 384  385  
 385  386          NULL, 0, NULL, NULL
 386  387  };
 387  388  
 388  389  /* Extensible attribute (xva) routines. */
 389  390  
 390  391  /*
 391  392   * Zero out the structure, set the size of the requested/returned bitmaps,
 392  393   * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 393  394   * to the returned attributes array.
 394  395   */
 395  396  void
 396  397  xva_init(xvattr_t *xvap)
 397  398  {
 398  399          bzero(xvap, sizeof (xvattr_t));
 399  400          xvap->xva_mapsize = XVA_MAPSIZE;
 400  401          xvap->xva_magic = XVA_MAGIC;
 401  402          xvap->xva_vattr.va_mask = AT_XVATTR;
 402  403          xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 403  404  }
 404  405  
 405  406  /*
 406  407   * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 407  408   * structure.  Otherwise, returns NULL.
 408  409   */
 409  410  xoptattr_t *
 410  411  xva_getxoptattr(xvattr_t *xvap)
 411  412  {
 412  413          xoptattr_t *xoap = NULL;
 413  414          if (xvap->xva_vattr.va_mask & AT_XVATTR)
 414  415                  xoap = &xvap->xva_xoptattrs;
 415  416          return (xoap);
 416  417  }
 417  418  
 418  419  /*
 419  420   * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 420  421   * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 421  422   * kstat name.
 422  423   */
 423  424  static int
 424  425  vska_compar(const void *n1, const void *n2)
 425  426  {
 426  427          int ret;
 427  428          ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 428  429          ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 429  430  
 430  431          if (p1 < p2) {
 431  432                  ret = -1;
 432  433          } else if (p1 > p2) {
 433  434                  ret = 1;
 434  435          } else {
 435  436                  ret = 0;
 436  437          }
 437  438  
 438  439          return (ret);
 439  440  }
 440  441  
 441  442  /*
 442  443   * Used to create a single template which will be bcopy()ed to a newly
 443  444   * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 444  445   */
 445  446  static vopstats_t *
 446  447  create_vopstats_template()
 447  448  {
 448  449          vopstats_t              *vsp;
 449  450  
 450  451          vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 451  452          bzero(vsp, sizeof (*vsp));      /* Start fresh */
 452  453  
 453  454          /* VOP_OPEN */
 454  455          kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 455  456          /* VOP_CLOSE */
 456  457          kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 457  458          /* VOP_READ I/O */
 458  459          kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 459  460          kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 460  461          /* VOP_WRITE I/O */
 461  462          kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 462  463          kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 463  464          /* VOP_IOCTL */
 464  465          kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 465  466          /* VOP_SETFL */
 466  467          kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 467  468          /* VOP_GETATTR */
 468  469          kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 469  470          /* VOP_SETATTR */
 470  471          kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 471  472          /* VOP_ACCESS */
 472  473          kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 473  474          /* VOP_LOOKUP */
 474  475          kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 475  476          /* VOP_CREATE */
 476  477          kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 477  478          /* VOP_REMOVE */
 478  479          kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 479  480          /* VOP_LINK */
 480  481          kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 481  482          /* VOP_RENAME */
 482  483          kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 483  484          /* VOP_MKDIR */
 484  485          kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 485  486          /* VOP_RMDIR */
 486  487          kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 487  488          /* VOP_READDIR I/O */
 488  489          kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 489  490          kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 490  491              KSTAT_DATA_UINT64);
 491  492          /* VOP_SYMLINK */
 492  493          kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 493  494          /* VOP_READLINK */
 494  495          kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 495  496          /* VOP_FSYNC */
 496  497          kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 497  498          /* VOP_INACTIVE */
 498  499          kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 499  500          /* VOP_FID */
 500  501          kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 501  502          /* VOP_RWLOCK */
 502  503          kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 503  504          /* VOP_RWUNLOCK */
 504  505          kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 505  506          /* VOP_SEEK */
 506  507          kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 507  508          /* VOP_CMP */
 508  509          kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 509  510          /* VOP_FRLOCK */
 510  511          kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 511  512          /* VOP_SPACE */
 512  513          kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 513  514          /* VOP_REALVP */
 514  515          kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 515  516          /* VOP_GETPAGE */
 516  517          kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 517  518          /* VOP_PUTPAGE */
 518  519          kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 519  520          /* VOP_MAP */
 520  521          kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 521  522          /* VOP_ADDMAP */
 522  523          kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 523  524          /* VOP_DELMAP */
 524  525          kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 525  526          /* VOP_POLL */
 526  527          kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 527  528          /* VOP_DUMP */
 528  529          kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 529  530          /* VOP_PATHCONF */
 530  531          kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 531  532          /* VOP_PAGEIO */
 532  533          kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 533  534          /* VOP_DUMPCTL */
 534  535          kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 535  536          /* VOP_DISPOSE */
 536  537          kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 537  538          /* VOP_SETSECATTR */
 538  539          kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 539  540          /* VOP_GETSECATTR */
 540  541          kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 541  542          /* VOP_SHRLOCK */
 542  543          kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 543  544          /* VOP_VNEVENT */
 544  545          kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 545  546          /* VOP_REQZCBUF */
 546  547          kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 547  548          /* VOP_RETZCBUF */
 548  549          kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 549  550  
 550  551          return (vsp);
 551  552  }
 552  553  
 553  554  /*
 554  555   * Creates a kstat structure associated with a vopstats structure.
 555  556   */
 556  557  kstat_t *
 557  558  new_vskstat(char *ksname, vopstats_t *vsp)
 558  559  {
 559  560          kstat_t         *ksp;
 560  561  
 561  562          if (!vopstats_enabled) {
 562  563                  return (NULL);
 563  564          }
 564  565  
 565  566          ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 566  567              sizeof (vopstats_t)/sizeof (kstat_named_t),
 567  568              KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 568  569          if (ksp) {
 569  570                  ksp->ks_data = vsp;
 570  571                  kstat_install(ksp);
 571  572          }
 572  573  
 573  574          return (ksp);
 574  575  }
 575  576  
 576  577  /*
 577  578   * Called from vfsinit() to initialize the support mechanisms for vopstats
 578  579   */
 579  580  void
 580  581  vopstats_startup()
 581  582  {
 582  583          if (!vopstats_enabled)
 583  584                  return;
 584  585  
 585  586          /*
 586  587           * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 587  588           * is necessary since we need to check if a kstat exists before we
 588  589           * attempt to create it.  Also, initialize its lock.
 589  590           */
 590  591          avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 591  592              offsetof(vsk_anchor_t, vsk_node));
 592  593          mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 593  594  
 594  595          vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 595  596              sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 596  597              NULL, NULL, 0);
 597  598  
 598  599          /*
 599  600           * Set up the array of pointers for the vopstats-by-FS-type.
 600  601           * The entries will be allocated/initialized as each file system
 601  602           * goes through modload/mod_installfs.
 602  603           */
 603  604          vopstats_fstype = (vopstats_t **)kmem_zalloc(
 604  605              (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 605  606  
 606  607          /* Set up the global vopstats initialization template */
 607  608          vs_templatep = create_vopstats_template();
 608  609  }
 609  610  
 610  611  /*
 611  612   * We need to have the all of the counters zeroed.
 612  613   * The initialization of the vopstats_t includes on the order of
 613  614   * 50 calls to kstat_named_init().  Rather that do that on every call,
 614  615   * we do it once in a template (vs_templatep) then bcopy it over.
 615  616   */
 616  617  void
 617  618  initialize_vopstats(vopstats_t *vsp)
 618  619  {
 619  620          if (vsp == NULL)
 620  621                  return;
 621  622  
 622  623          bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 623  624  }
 624  625  
 625  626  /*
 626  627   * If possible, determine which vopstats by fstype to use and
 627  628   * return a pointer to the caller.
 628  629   */
 629  630  vopstats_t *
 630  631  get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 631  632  {
 632  633          int             fstype = 0;     /* Index into vfssw[] */
 633  634          vopstats_t      *vsp = NULL;
 634  635  
 635  636          if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 636  637              !vopstats_enabled)
 637  638                  return (NULL);
 638  639          /*
 639  640           * Set up the fstype.  We go to so much trouble because all versions
 640  641           * of NFS use the same fstype in their vfs even though they have
 641  642           * distinct entries in the vfssw[] table.
 642  643           * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 643  644           */
 644  645          if (vswp) {
 645  646                  fstype = vswp - vfssw;  /* Gets us the index */
 646  647          } else {
 647  648                  fstype = vfsp->vfs_fstype;
 648  649          }
 649  650  
 650  651          /*
 651  652           * Point to the per-fstype vopstats. The only valid values are
 652  653           * non-zero positive values less than the number of vfssw[] table
 653  654           * entries.
 654  655           */
 655  656          if (fstype > 0 && fstype < nfstype) {
 656  657                  vsp = vopstats_fstype[fstype];
 657  658          }
 658  659  
 659  660          return (vsp);
 660  661  }
 661  662  
 662  663  /*
 663  664   * Generate a kstat name, create the kstat structure, and allocate a
 664  665   * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 665  666   * to the caller.  This must only be called from a mount.
            *
            * Returns NULL when vfsp is NULL, has no vfs_implp, does not have
            * VFS_STATS set, when vopstats are disabled, when VFS_STATVFS() fails,
            * or when an anchor with the same fsid is already in vskstat_tree.
            * The returned anchor's vsk_ksp stays zeroed (from the bzero() below)
            * if new_vskstat() does not hand back a kstat.
 666  667   */
 667  668  vsk_anchor_t *
 668  669  get_vskstat_anchor(vfs_t *vfsp)
 669  670  {
 670  671          char            kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
 671  672          statvfs64_t     statvfsbuf;             /* Needed to find f_fsid */
 672  673          vsk_anchor_t    *vskp = NULL;           /* vfs <--> kstat anchor */
 673  674          kstat_t         *ksp;                   /* Ptr to new kstat */
 674  675          avl_index_t     where;                  /* Location in the AVL tree */
 675  676  
 676  677          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 677  678              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 678  679                  return (NULL);
 679  680  
 680  681          /* Need to get the fsid to build a kstat name */
 681  682          if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
 682  683                  /* Create a name for our kstats based on fsid */
 683  684                  (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
 684  685                      VOPSTATS_STR, statvfsbuf.f_fsid);
 685  686  
 686  687                  /* Allocate and initialize the vsk_anchor_t */
 687  688                  vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 688  689                  bzero(vskp, sizeof (*vskp));
 689  690                  vskp->vsk_fsid = statvfsbuf.f_fsid;
 690  691  
                           /*
                            * The freshly zeroed anchor doubles as the search key; only
                            * vsk_fsid is filled in at this point, so the tree comparator
                            * (defined elsewhere) presumably orders anchors by fsid.
                            */
 691  692                  mutex_enter(&vskstat_tree_lock);
 692  693                  if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
 693  694                          avl_insert(&vskstat_tree, vskp, where);
 694  695                          mutex_exit(&vskstat_tree_lock);
 695  696  
 696  697                          /*
 697  698                           * Now that we've got the anchor in the AVL
 698  699                           * tree, we can create the kstat.
                                    * Note that this happens after vskstat_tree_lock
                                    * has been dropped.
 699  700                           */
 700  701                          ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
 701  702                          if (ksp) {
 702  703                                  vskp->vsk_ksp = ksp;
 703  704                          }
 704  705                  } else {
 705  706                          /* Oops, found one! Release memory and lock. */
 706  707                          mutex_exit(&vskstat_tree_lock);
 707  708                          kmem_cache_free(vsk_anchor_cache, vskp);
 708  709                          vskp = NULL;
 709  710                  }
 710  711          }
 711  712          return (vskp);
 712  713  }
 713  714  
 714  715  /*
 715  716   * We're in the process of tearing down the vfs and need to cleanup
 716  717   * the data structures associated with the vopstats. Must only be called
 717  718   * from dounmount().
            *
            * This is a no-op when vfsp is NULL, has no vfs_implp, does not have
            * VFS_STATS set, when vopstats are disabled, or when the vfs has no
            * anchor (vfs_vskap is NULL).  Otherwise the anchor is detached from the
            * vfs, removed from vskstat_tree if it is found there, its kstat (if any)
            * is deleted, and the anchor is returned to vsk_anchor_cache.
 718  719   */
 719  720  void
 720  721  teardown_vopstats(vfs_t *vfsp)
 721  722  {
 722  723          vsk_anchor_t    *vskap;
 723  724          avl_index_t     where;
 724  725  
 725  726          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 726  727              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 727  728                  return;
 728  729  
 729  730          /* This is a safe check since VFS_STATS must be set (see above) */
 730  731          if ((vskap = vfsp->vfs_vskap) == NULL)
 731  732                  return;
 732  733  
 733  734          /* Whack the pointer right away */
 734  735          vfsp->vfs_vskap = NULL;
 735  736  
 736  737          /* Lock the tree, remove the node, and delete the kstat */
 737  738          mutex_enter(&vskstat_tree_lock);
 738  739          if (avl_find(&vskstat_tree, vskap, &where)) {
 739  740                  avl_remove(&vskstat_tree, vskap);
 740  741          }
 741  742  
                   /* The kstat is deleted while vskstat_tree_lock is still held. */
 742  743          if (vskap->vsk_ksp) {
 743  744                  kstat_delete(vskap->vsk_ksp);
 744  745          }
 745  746          mutex_exit(&vskstat_tree_lock);
 746  747  
 747  748          kmem_cache_free(vsk_anchor_cache, vskap);
 748  749  }
 749  750  
 750  751  /*
 751  752   * Read or write a vnode.  Called from kernel code.
            *
            * Builds a single-iovec uio describing len bytes at base, starting at
            * file offset `offset', and hands it to VOP_READ() or VOP_WRITE()
            * according to rw, bracketed by VOP_RWLOCK()/VOP_RWUNLOCK().
            *
            * Returns EROFS for a write to a vnode for which ISROFILE() is true, EIO
            * for a negative len, an error from nbl_svmand(), EACCES when
            * nbl_conflict() reports a conflict, or otherwise the VOP_READ()/
            * VOP_WRITE() result.  If residp is non-NULL the residual count is stored
            * through it; if it is NULL, any nonzero residual turns the result into
            * EIO.
 752  753   */
 753  754  int
 754  755  vn_rdwr(
 755  756          enum uio_rw rw,
 756  757          struct vnode *vp,
 757  758          caddr_t base,
 758  759          ssize_t len,
 759  760          offset_t offset,
 760  761          enum uio_seg seg,
 761  762          int ioflag,
 762  763          rlim64_t ulimit,        /* meaningful only if rw is UIO_WRITE */
 763  764          cred_t *cr,
 764  765          ssize_t *residp)
 765  766  {
 766  767          struct uio uio;
 767  768          struct iovec iov;
 768  769          int error;
 769  770          int in_crit = 0;
 770  771  
 771  772          if (rw == UIO_WRITE && ISROFILE(vp))
 772  773                  return (EROFS);
 773  774  
 774  775          if (len < 0)
 775  776                  return (EIO);
 776  777  
 777  778          VOPXID_MAP_CR(vp, cr);
 778  779  
                   /* Describe the caller's buffer as a one-element iovec/uio pair. */
 779  780          iov.iov_base = base;
 780  781          iov.iov_len = len;
 781  782          uio.uio_iov = &iov;
 782  783          uio.uio_iovcnt = 1;
 783  784          uio.uio_loffset = offset;
 784  785          uio.uio_segflg = (short)seg;
 785  786          uio.uio_resid = len;
 786  787          uio.uio_llimit = ulimit;
 787  788  
 788  789          /*
 789  790           * We have to enter the critical region before calling VOP_RWLOCK
 790  791           * to avoid a deadlock with ufs.
 791  792           */
 792  793          if (nbl_need_check(vp)) {
 793  794                  int svmand;
 794  795  
 795  796                  nbl_start_crit(vp, RW_READER);
 796  797                  in_crit = 1;
 797  798                  error = nbl_svmand(vp, cr, &svmand);
 798  799                  if (error != 0)
 799  800                          goto done;
 800  801                  if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
 801  802                      uio.uio_offset, uio.uio_resid, svmand, NULL)) {
 802  803                          error = EACCES;
 803  804                          goto done;
 804  805                  }
 805  806          }
 806  807  
                   /* The return value of VOP_RWLOCK() is deliberately discarded. */
 807  808          (void) VOP_RWLOCK(vp,
 808  809              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
 809  810          if (rw == UIO_WRITE) {
 810  811                  uio.uio_fmode = FWRITE;
 811  812                  uio.uio_extflg = UIO_COPY_DEFAULT;
 812  813                  error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
 813  814          } else {
 814  815                  uio.uio_fmode = FREAD;
 815  816                  uio.uio_extflg = UIO_COPY_CACHED;
 816  817                  error = VOP_READ(vp, &uio, ioflag, cr, NULL);
 817  818          }
 818  819          VOP_RWUNLOCK(vp,
 819  820              rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
                   /*
                    * A caller that did not ask for the residual cannot cope with a
                    * short transfer, so report one as EIO (this overrides whatever
                    * the VOP returned).
                    */
 820  821          if (residp)
 821  822                  *residp = uio.uio_resid;
 822  823          else if (uio.uio_resid)
 823  824                  error = EIO;
 824  825  
 825  826  done:
 826  827          if (in_crit)
 827  828                  nbl_end_crit(vp);
 828  829          return (error);
 829  830  }
 830  831  
 831  832  /*
 832  833   * Release a vnode.  Call VOP_INACTIVE on last reference or
 833  834   * decrement reference count.
 834  835   *
 835  836   * To avoid race conditions, the v_count is left at 1 for
 836  837   * the call to VOP_INACTIVE. This prevents another thread
 837  838   * from reclaiming and releasing the vnode *before* the
 838  839   * VOP_INACTIVE routine has a chance to destroy the vnode.
 839  840   * We can't have more than 1 thread calling VOP_INACTIVE
 840  841   * on a vnode.
            *
            * In the new side of this diff the v_count sanity check (VERIFY) is
            * made with v_lock held, on the path that goes on to
            * VN_RELE_LOCKED(), instead of before the lock is taken.
 841  842   */
 842  843  void
 843  844  vn_rele(vnode_t *vp)
 844  845  {
 845      -        VERIFY(vp->v_count > 0);
 846  846          mutex_enter(&vp->v_lock);
 847  847          if (vp->v_count == 1) {
                           /* Last reference: drop the lock, v_count stays at 1. */
 848  848                  mutex_exit(&vp->v_lock);
 849  849                  VOP_INACTIVE(vp, CRED(), NULL);
 850  850                  return;
      851 +        } else {
      852 +                VERIFY(vp->v_count > 0);
 851  853          }
 852  854          VN_RELE_LOCKED(vp);
 853  855          mutex_exit(&vp->v_lock);
 854  856  }
 855  857  
           /*
            * Release a phantom hold on a vnode.  Under v_lock, v_phantom_count is
            * decremented and then one v_count reference is dropped the same way
            * vn_rele() does it: VOP_INACTIVE() if this was the last reference,
            * VN_RELE_LOCKED() otherwise.
            */
 856  858  void
 857  859  vn_phantom_rele(vnode_t *vp)
 858  860  {
 859      -        VERIFY(vp->v_count > 0);
 860      -
 861  861          mutex_enter(&vp->v_lock);
                   /* Phantom holds are counted in v_count as well as v_phantom_count. */
 862  862          VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
 863  863          vp->v_phantom_count--;
 864  864          DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
 865  865          if (vp->v_count == 1) {
                           /* The last reference was phantom, so none may remain. */
 866  866                  ASSERT0(vp->v_phantom_count);
 867  867                  mutex_exit(&vp->v_lock);
 868  868                  VOP_INACTIVE(vp, CRED(), NULL);
 869  869                  return;
      870 +        } else {
      871 +                VERIFY(vp->v_count > 0);
 870  872          }
 871  873          VN_RELE_LOCKED(vp);
 872  874          mutex_exit(&vp->v_lock);
 873  875  }
 874  876  
 875  877  /*
 876  878   * Return the number of non-phantom holds. Things such as portfs will use
 877  879   * phantom holds to prevent it from blocking filesystems from mounting over
 878  880   * watched directories.
            *
            * The caller must hold vp->v_lock (asserted below).  The result is
            * v_count minus v_phantom_count.
 879  881   */

 880  882  uint_t
 881  883  vn_count(vnode_t *vp)
 882  884  {
 883  885          ASSERT(MUTEX_HELD(&vp->v_lock));
 884  886          return (vp->v_count - vp->v_phantom_count);
 885  887  }

↓ open down ↓

6 lines elided

↑ open up ↑

 886  888  
 887  889  /*
 888  890   * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 889  891   * as a single reference, so v_count is not decremented until the last DNLC hold
 890  892   * is released. This makes it possible to distinguish vnodes that are referenced
 891  893   * only by the DNLC.
            *
            * In the new side of this diff both counts are VERIFY'd positive with
            * v_lock held rather than before the lock is taken.
 892  894   */
 893  895  void
 894  896  vn_rele_dnlc(vnode_t *vp)
 895  897  {
 896      -        VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 897  898          mutex_enter(&vp->v_lock);
      899 +        VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
                   /* Only the last DNLC hold gives up the shared v_count reference. */
 898  900          if (--vp->v_count_dnlc == 0) {
 899  901                  if (vp->v_count == 1) {
 900  902                          mutex_exit(&vp->v_lock);
 901  903                          VOP_INACTIVE(vp, CRED(), NULL);
 902  904                          return;
 903  905                  }
 904  906                  VN_RELE_LOCKED(vp);
 905  907          }
 906  908          mutex_exit(&vp->v_lock);
 907  909  }

 908  910

↓ open down ↓

1 lines elided

↑ open up ↑

 909  911  /*
 910  912   * Like vn_rele() except that it clears v_stream under v_lock.
 911  913   * This is used by sockfs when it dismantles the association between
 912  914   * the sockfs node and the vnode in the underlying file system.
 913  915   * v_lock has to be held to prevent a thread coming through the lookupname
 914  916   * path from accessing a stream head that is going away.
            *
            * v_stream is cleared unconditionally, before it is decided whether this
            * is the last reference.
 915  917   */
 916  918  void
 917  919  vn_rele_stream(vnode_t *vp)
 918  920  {
 919      -        VERIFY(vp->v_count > 0);
 920  921          mutex_enter(&vp->v_lock);
 921  922          vp->v_stream = NULL;
 922  923          if (vp->v_count == 1) {
                           /* Last reference: v_count is left at 1 for VOP_INACTIVE. */
 923  924                  mutex_exit(&vp->v_lock);
 924  925                  VOP_INACTIVE(vp, CRED(), NULL);
 925  926                  return;
      927 +        } else {
      928 +                VERIFY(vp->v_count > 0);
 926  929          }
 927  930          VN_RELE_LOCKED(vp);
 928  931          mutex_exit(&vp->v_lock);
 929  932  }
 930  933  
           /*
            * Call VOP_INACTIVE() on vp with the current credentials.  This is the
            * function vn_rele_async() dispatches to its taskq.
            */
 931  934  static void
 932  935  vn_rele_inactive(vnode_t *vp)
 933  936  {
 934  937          VOP_INACTIVE(vp, CRED(), NULL);
 935  938  }

 936  939  
 937  940  /*
 938  941   * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 939  942   * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 940  943   * the file system as a result of releasing the vnode. Note, file systems
 941  944   * already have to handle the race where the vnode is incremented before the
 942  945   * inactive routine is called and does its locking.
 943  946   *
 944  947   * Warning: Excessive use of this routine can lead to performance problems.
 945  948   * This is because taskqs throttle back allocation if too many are created.
            *
            * The dispatch uses TQ_SLEEP and is VERIFY'd not to return
            * TASKQID_INVALID.
 946  949   */
 947  950  void
 948  951  vn_rele_async(vnode_t *vp, taskq_t *taskq)
 949  952  {
 950      -        VERIFY(vp->v_count > 0);
 951  953          mutex_enter(&vp->v_lock);
 952  954          if (vp->v_count == 1) {
                           /* Last reference: hand VOP_INACTIVE off to the taskq. */
 953  955                  mutex_exit(&vp->v_lock);
 954  956                  VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
 955  957                      vp, TQ_SLEEP) != TASKQID_INVALID);
 956  958                  return;
      959 +        } else {
      960 +                VERIFY(vp->v_count > 0);
 957  961          }
 958  962          VN_RELE_LOCKED(vp);
 959  963          mutex_exit(&vp->v_lock);
 960  964  }
 961  965  
           /*
            * Open/create a vnode by path name.  Thin wrapper around vn_openat()
            * that passes a NULL startvp and an fd of -1; all other arguments are
            * forwarded unchanged and vn_openat()'s return value is returned.
            */
 962  966  int
 963  967  vn_open(
 964  968          char *pnamep,
 965  969          enum uio_seg seg,
 966  970          int filemode,

 967  971          int createmode,
 968  972          struct vnode **vpp,
 969  973          enum create crwhy,
 970  974          mode_t umask)
 971  975  {
 972  976          return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 973  977              umask, NULL, -1));
 974  978  }
 975  979  
 976  980  
 977  981  /*
 978  982   * Open/create a vnode.
 979  983   * This may be callable by the kernel, the only known use
 980  984   * of user context being that the current user credentials
 981  985   * are used for permissions.  crwhy is defined iff filemode & FCREAT.
            *
            * On success the opened vnode is stored in *vpp and 0 is returned.  On
            * failure an errno value is returned, any vnode obtained here is
            * released with VN_RELE(), and *vpp is left untouched.  ESTALE failures
            * are retried from the top for as long as fs_need_estale_retry() allows.
            * startvp is passed through to vn_createat()/lookupnameat(); fd is only
            * used as the owner id of the nbmand share reservation.
 982  986   */
 983  987  int
 984  988  vn_openat(
 985  989          char *pnamep,
 986  990          enum uio_seg seg,
 987  991          int filemode,
 988  992          int createmode,
 989  993          struct vnode **vpp,
 990  994          enum create crwhy,
 991  995          mode_t umask,
 992  996          struct vnode *startvp,
 993  997          int fd)
 994  998  {
 995  999          struct vnode *vp;
 996 1000          int mode;
 997 1001          int accessflags;
 998 1002          int error;
 999 1003          int in_crit = 0;
1000 1004          int open_done = 0;
1001 1005          int shrlock_done = 0;
1002 1006          struct vattr vattr;
1003 1007          enum symfollow follow;
1004 1008          int estale_retry = 0;
1005 1009          struct shrlock shr;
1006 1010          struct shr_locowner shr_own;
1007 1011          boolean_t create;
1008 1012  
                   /* Translate the open flags into the V* access bits to check. */
1009 1013          mode = 0;
1010 1014          accessflags = 0;
1011 1015          if (filemode & FREAD)
1012 1016                  mode |= VREAD;
1013 1017          if (filemode & (FWRITE|FTRUNC))
1014 1018                  mode |= VWRITE;
1015 1019          if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
1016 1020                  mode |= VEXEC;
1017 1021  
1018 1022          /* symlink interpretation */
1019 1023          if (filemode & FNOFOLLOW)
1020 1024                  follow = NO_FOLLOW;
1021 1025          else
1022 1026                  follow = FOLLOW;
1023 1027  
1024 1028          if (filemode & FAPPEND)
1025 1029                  accessflags |= V_APPEND;
1026 1030  
1027 1031          /*
1028 1032           * We need to handle the case of FCREAT | FDIRECTORY and the case of
1029 1033           * FEXCL. If all three are specified, then we always fail because we
1030 1034           * cannot create a directory through this interface and FEXCL says we
1031 1035           * need to fail the request if we can't create it. If, however, only
1032 1036           * FCREAT | FDIRECTORY are specified, then we can treat this as the case
1033 1037           * of opening a file that already exists. If it exists, we can do
1034 1038           * something and if not, we fail. Effectively FCREAT | FDIRECTORY is
1035 1039           * treated as FDIRECTORY.
1036 1040           */
1037 1041          if ((filemode & (FCREAT | FDIRECTORY | FEXCL)) ==
1038 1042              (FCREAT | FDIRECTORY | FEXCL)) {
1039 1043                  return (EINVAL);
1040 1044          }
1041 1045  
1042 1046          if ((filemode & (FCREAT | FDIRECTORY)) == (FCREAT | FDIRECTORY)) {
1043 1047                  create = B_FALSE;
1044 1048          } else if ((filemode & FCREAT) != 0) {
1045 1049                  create = B_TRUE;
1046 1050          } else {
1047 1051                  create = B_FALSE;
1048 1052          }
1049 1053  
1050 1054  top:
1051 1055          if (create) {
1052 1056                  enum vcexcl excl;
1053 1057  
1054 1058                  /*
1055 1059                   * Wish to create a file.
1056 1060                   */
1057 1061                  vattr.va_type = VREG;
1058 1062                  vattr.va_mode = createmode;
1059 1063                  vattr.va_mask = AT_TYPE|AT_MODE;
1060 1064                  if (filemode & FTRUNC) {
1061 1065                          vattr.va_size = 0;
1062 1066                          vattr.va_mask |= AT_SIZE;
1063 1067                  }
1064 1068                  if (filemode & FEXCL)
1065 1069                          excl = EXCL;
1066 1070                  else
1067 1071                          excl = NONEXCL;
1068 1072  
1069 1073                  if (error =
1070 1074                      vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
1071 1075                      (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
1072 1076                          return (error);
1073 1077          } else {
1074 1078                  /*
1075 1079                   * Wish to open a file.  Just look it up.
1076 1080                   */
1077 1081                  if (error = lookupnameat(pnamep, seg, follow,
1078 1082                      NULLVPP, &vp, startvp)) {
1079 1083                          if ((error == ESTALE) &&
1080 1084                              fs_need_estale_retry(estale_retry++))
1081 1085                                  goto top;
1082 1086                          return (error);
1083 1087                  }
1084 1088  
1085 1089                  /*
1086 1090                   * Get the attributes to check whether file is large.
1087 1091                   * We do this only if the FOFFMAX flag is not set and
1088 1092                   * only for regular files.
1089 1093                   */
1090 1094  
1091 1095                  if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
1092 1096                          vattr.va_mask = AT_SIZE;
1093 1097                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1094 1098                              CRED(), NULL))) {
1095 1099                                  goto out;
1096 1100                          }
1097 1101                          if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
1098 1102                                  /*
1099 1103                                   * Large File API - regular open fails
1100 1104                                   * if FOFFMAX flag is not set in file mode
1101 1105                                   */
1102 1106                                  error = EOVERFLOW;
1103 1107                                  goto out;
1104 1108                          }
1105 1109                  }
1106 1110                  /*
1107 1111                   * Can't write directories, active texts, or
1108 1112                   * read-only filesystems.  Can't truncate files
1109 1113                   * on which mandatory locking is in effect.
1110 1114                   */
1111 1115                  if (filemode & (FWRITE|FTRUNC)) {
1112 1116                          /*
1113 1117                           * Allow writable directory if VDIROPEN flag is set.
1114 1118                           */
1115 1119                          if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
1116 1120                                  error = EISDIR;
1117 1121                                  goto out;
1118 1122                          }
1119 1123                          if (ISROFILE(vp)) {
1120 1124                                  error = EROFS;
1121 1125                                  goto out;
1122 1126                          }
1123 1127                          /*
1124 1128                           * Can't truncate files on which
1125 1129                           * sysv mandatory locking is in effect.
1126 1130                           */
1127 1131                          if (filemode & FTRUNC) {
1128 1132                                  vnode_t *rvp;
1129 1133  
1130 1134                                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1131 1135                                          rvp = vp;
1132 1136                                  if (rvp->v_filocks != NULL) {
1133 1137                                          vattr.va_mask = AT_MODE;
1134 1138                                          if ((error = VOP_GETATTR(vp,
1135 1139                                              &vattr, 0, CRED(), NULL)) == 0 &&
1136 1140                                              MANDLOCK(vp, vattr.va_mode))
1137 1141                                                  error = EAGAIN;
1138 1142                                  }
1139 1143                          }
1140 1144                          if (error)
1141 1145                                  goto out;
1142 1146                  }
1143 1147                  /*
1144 1148                   * Check permissions.
1145 1149                   */
1146 1150                  if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
1147 1151                          goto out;
1148 1152  
1149 1153                  /*
1150 1154                   * Require FSEARCH and FDIRECTORY to return a directory. Require
1151 1155                   * FEXEC to return a regular file.
1152 1156                   */
1153 1157                  if ((filemode & (FSEARCH|FDIRECTORY)) != 0 &&
1154 1158                      vp->v_type != VDIR) {
1155 1159                          error = ENOTDIR;
1156 1160                          goto out;
1157 1161                  }
1158 1162                  if ((filemode & FEXEC) && vp->v_type != VREG) {
1159 1163                          error = ENOEXEC;        /* XXX: error code? */
1160 1164                          goto out;
1161 1165                  }
1162 1166          }
1163 1167  
1164 1168          /*
1165 1169           * Do remaining checks for FNOFOLLOW and FNOLINKS.
1166 1170           */
1167 1171          if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
1168 1172                  /*
1169 1173                   * The __FLXPATH flag is a private interface for use by the lx
1170 1174                   * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
1171 1175                   * when a symbolic link is encountered, returns a file
1172 1176                   * descriptor which references it.
1173 1177                   * See uts/common/brand/lx/syscall/lx_open.c
1174 1178                   *
1175 1179                   * When this flag is set, VOP_OPEN() is not called (for a
1176 1180                   * symlink, most filesystems will return ENOSYS anyway)
1177 1181                   * and the link's vnode is returned to be linked to the
1178 1182                   * file descriptor.
1179 1183                   */
1180 1184                  if ((filemode & __FLXPATH) == 0)
1181 1185                          error = ELOOP;
1182 1186                  goto out;
1183 1187          }
1184 1188          if (filemode & FNOLINKS) {
1185 1189                  vattr.va_mask = AT_NLINK;
1186 1190                  if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
1187 1191                          goto out;
1188 1192                  }
1189 1193                  if (vattr.va_nlink != 1) {
1190 1194                          error = EMLINK;
1191 1195                          goto out;
1192 1196                  }
1193 1197          }
1194 1198  
1195 1199          /*
1196 1200           * Opening a socket corresponding to the AF_UNIX pathname
1197 1201           * in the filesystem name space is not supported.
1198 1202           * However, VSOCK nodes in namefs are supported in order
1199 1203           * to make fattach work for sockets.
1200 1204           *
1201 1205           * XXX This uses VOP_REALVP to distinguish between
1202 1206           * an unopened namefs node (where VOP_REALVP returns a
1203 1207           * different VSOCK vnode) and a VSOCK created by vn_create
1204 1208           * in some file system (where VOP_REALVP would never return
1205 1209           * a different vnode).
1206 1210           */
1207 1211          if (vp->v_type == VSOCK) {
1208 1212                  struct vnode *nvp;
1209 1213  
1210 1214                  error = VOP_REALVP(vp, &nvp, NULL);
1211 1215                  if (error != 0 || nvp == NULL || nvp == vp ||
1212 1216                      nvp->v_type != VSOCK) {
1213 1217                          error = EOPNOTSUPP;
1214 1218                          goto out;
1215 1219                  }
1216 1220          }
1217 1221  
1218 1222          if ((vp->v_type == VREG) && nbl_need_check(vp)) {
1219 1223                  /* get share reservation */
1220 1224                  shr.s_access = 0;
1221 1225                  if (filemode & FWRITE)
1222 1226                          shr.s_access |= F_WRACC;
1223 1227                  if (filemode & FREAD)
1224 1228                          shr.s_access |= F_RDACC;
1225 1229                  shr.s_deny = 0;
1226 1230                  shr.s_sysid = 0;
1227 1231                  shr.s_pid = ttoproc(curthread)->p_pid;
1228 1232                  shr_own.sl_pid = shr.s_pid;
1229 1233                  shr_own.sl_id = fd;
1230 1234                  shr.s_own_len = sizeof (shr_own);
1231 1235                  shr.s_owner = (caddr_t)&shr_own;
1232 1236                  error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
1233 1237                      NULL);
1234 1238                  if (error)
1235 1239                          goto out;
1236 1240                  shrlock_done = 1;
1237 1241  
1238 1242                  /* nbmand conflict check if truncating file */
1239 1243                  if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1240 1244                          nbl_start_crit(vp, RW_READER);
1241 1245                          in_crit = 1;
1242 1246  
1243 1247                          vattr.va_mask = AT_SIZE;
1244 1248                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
1245 1249                                  goto out;
1246 1250                          if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
1247 1251                              NULL)) {
1248 1252                                  error = EACCES;
1249 1253                                  goto out;
1250 1254                          }
1251 1255                  }
1252 1256          }
1253 1257  
1254 1258          /*
1255 1259           * Do opening protocol.
1256 1260           */
1257 1261          error = VOP_OPEN(&vp, filemode, CRED(), NULL);
1258 1262          if (error)
1259 1263                  goto out;
1260 1264          open_done = 1;
1261 1265  
1262 1266          /*
1263 1267           * Truncate if required.
1264 1268           */
1265 1269          if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
1266 1270                  vattr.va_size = 0;
1267 1271                  vattr.va_mask = AT_SIZE;
1268 1272                  if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
1269 1273                          goto out;
1270 1274          }
1271 1275  
1272 1276          /*
1273 1277           * Turn on directio, if requested.
1274 1278           */
1275 1279          if (filemode & FDIRECT) {
1276 1280                  if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
1277 1281                      CRED(), NULL, NULL)) != 0) {
1278 1282                          /*
1279 1283                           * On Linux, O_DIRECT returns EINVAL when the file
1280 1284                           * system does not support directio, so we'll do the
1281 1285                           * same.
1282 1286                           */
1283 1287                          error = EINVAL;
1284 1288                          goto out;
1285 1289                  }
1286 1290          }
1287 1291  out:
1288 1292          ASSERT(vp->v_count > 0);
1289 1293  
1290 1294          if (in_crit) {
1291 1295                  nbl_end_crit(vp);
1292 1296                  in_crit = 0;
1293 1297          }
1294 1298          if (error) {
                           /*
                            * Undo whatever was set up.  After VOP_CLOSE() the share
                            * reservation is not released separately (shrlock_done is
                            * cleared); presumably the close path drops it -- confirm
                            * against the file system's VOP_CLOSE implementation.
                            */
1295 1299                  if (open_done) {
1296 1300                          (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
1297 1301                              NULL);
1298 1302                          open_done = 0;
1299 1303                          shrlock_done = 0;
1300 1304                  }
1301 1305                  if (shrlock_done) {
1302 1306                          (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
1303 1307                              NULL);
1304 1308                          shrlock_done = 0;
1305 1309                  }
1306 1310  
1307 1311                  /*
1308 1312                   * The following clause was added to handle a problem
1309 1313                   * with NFS consistency.  It is possible that a lookup
1310 1314                   * of the file to be opened succeeded, but the file
1311 1315                   * itself doesn't actually exist on the server.  This
1312 1316                   * is chiefly due to the DNLC containing an entry for
1313 1317                   * the file which has been removed on the server.  In
1314 1318                   * this case, we just start over.  If there was some
1315 1319                   * other cause for the ESTALE error, then the lookup
1316 1320                   * of the file will fail and the error will be returned
1317 1321                   * above instead of looping around from here.
1318 1322                   */
1319 1323                  VN_RELE(vp);
1320 1324                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1321 1325                          goto top;
1322 1326          } else
1323 1327                  *vpp = vp;
1324 1328          return (error);
1325 1329  }
1326 1330  
1327 1331  /*
1328 1332   * The following two accessor functions are for the NFSv4 server.  Since there
1329 1333   * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1330 1334   * vnode open counts correct when a client "upgrades" an open or does an
1331 1335   * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1332 1336   * open mode (add or subtract read or write), but also change the share/deny
1333 1337   * modes.  However, share reservations are not integrated with OPEN, yet, so
1334 1338   * we need to handle each separately.  These functions are cleaner than having
1335 1339   * the NFS server manipulate the counts directly, however, nobody else should
1336 1340   * use these functions.
            *
            * vn_open_upgrade() atomically increments v_rdcnt when FREAD is set in
            * filemode and v_wrcnt when FWRITE is set.  The vnode is asserted to be
            * a regular file.
1337 1341   */
1338 1342  void
1339 1343  vn_open_upgrade(
1340 1344          vnode_t *vp,
1341 1345          int filemode)
1342 1346  {
1343 1347          ASSERT(vp->v_type == VREG);
1344 1348  
1345 1349          if (filemode & FREAD)
1346 1350                  atomic_inc_32(&vp->v_rdcnt);
1347 1351          if (filemode & FWRITE)
1348 1352                  atomic_inc_32(&vp->v_wrcnt);
1349 1353  
1350 1354  }
1351 1355  
           /*
            * Atomically decrement v_rdcnt when FREAD is set in filemode and v_wrcnt
            * when FWRITE is set.  The vnode is asserted to be a regular file and
            * each count is asserted to be positive before it is decremented (the
            * assertion and the decrement are separate steps).
            */
1352 1356  void
1353 1357  vn_open_downgrade(
1354 1358          vnode_t *vp,
1355 1359          int filemode)
1356 1360  {
1357 1361          ASSERT(vp->v_type == VREG);
1358 1362  
1359 1363          if (filemode & FREAD) {
1360 1364                  ASSERT(vp->v_rdcnt > 0);
1361 1365                  atomic_dec_32(&vp->v_rdcnt);
1362 1366          }
1363 1367          if (filemode & FWRITE) {
1364 1368                  ASSERT(vp->v_wrcnt > 0);
1365 1369                  atomic_dec_32(&vp->v_wrcnt);
1366 1370          }
1367 1371  
1368 1372  }
1369 1373  
           /*
            * Create a vnode by path name.  Thin wrapper around vn_createat() that
            * passes a NULL startvp; all other arguments are forwarded unchanged and
            * vn_createat()'s return value is returned.
            */
1370 1374  int
1371 1375  vn_create(
1372 1376          char *pnamep,
1373 1377          enum uio_seg seg,
1374 1378          struct vattr *vap,
1375 1379          enum vcexcl excl,
1376 1380          int mode,
1377 1381          struct vnode **vpp,
1378 1382          enum create why,
1379 1383          int flag,
1380 1384          mode_t umask)
1381 1385  {
1382 1386          return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1383 1387              umask, NULL));
1384 1388  }
1385 1389  
1386 1390  /*
1387 1391   * Create a vnode (makenode).
1388 1392   */
1389 1393  int
1390 1394  vn_createat(
1391 1395          char *pnamep,
1392 1396          enum uio_seg seg,
1393 1397          struct vattr *vap,
1394 1398          enum vcexcl excl,
1395 1399          int mode,
1396 1400          struct vnode **vpp,
1397 1401          enum create why,
1398 1402          int flag,
1399 1403          mode_t umask,
1400 1404          struct vnode *startvp)
1401 1405  {
1402 1406          struct vnode *dvp;      /* ptr to parent dir vnode */
1403 1407          struct vnode *vp = NULL;
1404 1408          struct pathname pn;
1405 1409          int error;
1406 1410          int in_crit = 0;
1407 1411          struct vattr vattr;
1408 1412          enum symfollow follow;
1409 1413          int estale_retry = 0;
1410 1414          uint32_t auditing = AU_AUDITING();
1411 1415  
1412 1416          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1413 1417  
1414 1418          /* symlink interpretation */
1415 1419          if ((flag & FNOFOLLOW) || excl == EXCL)
1416 1420                  follow = NO_FOLLOW;
1417 1421          else
1418 1422                  follow = FOLLOW;
1419 1423          flag &= ~(FNOFOLLOW|FNOLINKS);
1420 1424  
1421 1425  top:
1422 1426          /*
1423 1427           * Lookup directory.
1424 1428           * If new object is a file, call lower level to create it.
1425 1429           * Note that it is up to the lower level to enforce exclusive
1426 1430           * creation, if the file is already there.
1427 1431           * This allows the lower level to do whatever
1428 1432           * locking or protocol that is needed to prevent races.
1429 1433           * If the new object is directory call lower level to make
1430 1434           * the new directory, with "." and "..".
1431 1435           */
1432 1436          if (error = pn_get(pnamep, seg, &pn))
1433 1437                  return (error);
1434 1438          if (auditing)
1435 1439                  audit_vncreate_start();
1436 1440          dvp = NULL;
1437 1441          *vpp = NULL;
1438 1442          /*
1439 1443           * lookup will find the parent directory for the vnode.
1440 1444           * When it is done the pn holds the name of the entry
1441 1445           * in the directory.
1442 1446           * If this is a non-exclusive create we also find the node itself.
1443 1447           */
1444 1448          error = lookuppnat(&pn, NULL, follow, &dvp,
1445 1449              (excl == EXCL) ? NULLVPP : vpp, startvp);
1446 1450          if (error) {
1447 1451                  pn_free(&pn);
1448 1452                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1449 1453                          goto top;
1450 1454                  if (why == CRMKDIR && error == EINVAL)
1451 1455                          error = EEXIST;         /* SVID */
1452 1456                  return (error);
1453 1457          }
1454 1458  
1455 1459          if (why != CRMKNOD)
1456 1460                  vap->va_mode &= ~VSVTX;
1457 1461  
1458 1462          /*
1459 1463           * If default ACLs are defined for the directory don't apply the
1460 1464           * umask if umask is passed.
1461 1465           */
1462 1466  
1463 1467          if (umask) {
1464 1468  
1465 1469                  vsecattr_t vsec;
1466 1470  
1467 1471                  vsec.vsa_aclcnt = 0;
1468 1472                  vsec.vsa_aclentp = NULL;
1469 1473                  vsec.vsa_dfaclcnt = 0;
1470 1474                  vsec.vsa_dfaclentp = NULL;
1471 1475                  vsec.vsa_mask = VSA_DFACLCNT;
1472 1476                  error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1473 1477                  /*
1474 1478                   * If error is ENOSYS then treat it as no error
1475 1479                   * Don't want to force all file systems to support
1476 1480                   * aclent_t style of ACL's.
1477 1481                   */
1478 1482                  if (error == ENOSYS)
1479 1483                          error = 0;
1480 1484                  if (error) {
1481 1485                          if (*vpp != NULL)
1482 1486                                  VN_RELE(*vpp);
1483 1487                          goto out;
1484 1488                  } else {
1485 1489                          /*
1486 1490                           * Apply the umask if no default ACLs.
1487 1491                           */
1488 1492                          if (vsec.vsa_dfaclcnt == 0)
1489 1493                                  vap->va_mode &= ~umask;
1490 1494  
1491 1495                          /*
1492 1496                           * VOP_GETSECATTR() may have allocated memory for
1493 1497                           * ACLs we didn't request, so double-check and
1494 1498                           * free it if necessary.
1495 1499                           */
1496 1500                          if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1497 1501                                  kmem_free((caddr_t)vsec.vsa_aclentp,
1498 1502                                      vsec.vsa_aclcnt * sizeof (aclent_t));
1499 1503                          if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1500 1504                                  kmem_free((caddr_t)vsec.vsa_dfaclentp,
1501 1505                                      vsec.vsa_dfaclcnt * sizeof (aclent_t));
1502 1506                  }
1503 1507          }
1504 1508  
1505 1509          /*
1506 1510           * In general we want to generate EROFS if the file system is
1507 1511           * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1508 1512           * documents the open system call, and it says that O_CREAT has no
1509 1513           * effect if the file already exists.  Bug 1119649 states
1510 1514           * that open(path, O_CREAT, ...) fails when attempting to open an
1511 1515           * existing file on a read only file system.  Thus, the first part
1512 1516           * of the following if statement has 3 checks:
1513 1517           *      if the file exists &&
1514 1518           *              it is being open with write access &&
1515 1519           *              the file system is read only
1516 1520           *      then generate EROFS
1517 1521           */
1518 1522          if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1519 1523              (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1520 1524                  if (*vpp)
1521 1525                          VN_RELE(*vpp);
1522 1526                  error = EROFS;
1523 1527          } else if (excl == NONEXCL && *vpp != NULL) {
1524 1528                  vnode_t *rvp;
1525 1529  
1526 1530                  /*
1527 1531                   * File already exists.  If a mandatory lock has been
1528 1532                   * applied, return error.
1529 1533                   */
1530 1534                  vp = *vpp;
1531 1535                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1532 1536                          rvp = vp;
1533 1537                  if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1534 1538                          nbl_start_crit(vp, RW_READER);
1535 1539                          in_crit = 1;
1536 1540                  }
1537 1541                  if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1538 1542                          vattr.va_mask = AT_MODE|AT_SIZE;
1539 1543                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1540 1544                                  goto out;
1541 1545                          }
1542 1546                          if (MANDLOCK(vp, vattr.va_mode)) {
1543 1547                                  error = EAGAIN;
1544 1548                                  goto out;
1545 1549                          }
1546 1550                          /*
1547 1551                           * File cannot be truncated if non-blocking mandatory
1548 1552                           * locks are currently on the file.
1549 1553                           */
1550 1554                          if ((vap->va_mask & AT_SIZE) && in_crit) {
1551 1555                                  u_offset_t offset;
1552 1556                                  ssize_t length;
1553 1557  
1554 1558                                  offset = vap->va_size > vattr.va_size ?
1555 1559                                      vattr.va_size : vap->va_size;
1556 1560                                  length = vap->va_size > vattr.va_size ?
1557 1561                                      vap->va_size - vattr.va_size :
1558 1562                                      vattr.va_size - vap->va_size;
1559 1563                                  if (nbl_conflict(vp, NBL_WRITE, offset,
1560 1564                                      length, 0, NULL)) {
1561 1565                                          error = EACCES;
1562 1566                                          goto out;
1563 1567                                  }
1564 1568                          }
1565 1569                  }
1566 1570  
1567 1571                  /*
1568 1572                   * If the file is the root of a VFS, we've crossed a
1569 1573                   * mount point and the "containing" directory that we
1570 1574                   * acquired above (dvp) is irrelevant because it's in
1571 1575                   * a different file system.  We apply VOP_CREATE to the
1572 1576                   * target itself instead of to the containing directory
1573 1577                   * and supply a null path name to indicate (conventionally)
1574 1578                   * the node itself as the "component" of interest.
1575 1579                   *
1576 1580                   * The call to VOP_CREATE() is necessary to ensure
1577 1581                   * that the appropriate permission checks are made,
1578 1582                   * i.e. EISDIR, EACCES, etc.  We already know that vpp
1579 1583                   * exists since we are in the else condition where this
1580 1584                   * was checked.
1581 1585                   */
1582 1586                  if (vp->v_flag & VROOT) {
1583 1587                          ASSERT(why != CRMKDIR);
1584 1588                          error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1585 1589                              CRED(), flag, NULL, NULL);
1586 1590                          /*
1587 1591                           * If the create succeeded, it will have created a
1588 1592                           * new reference on a new vnode (*vpp) in the child
1589 1593                           * file system, so we want to drop our reference on
1590 1594                           * the old (vp) upon exit.
1591 1595                           */
1592 1596                          goto out;
1593 1597                  }
1594 1598  
1595 1599                  /*
1596 1600                   * Large File API - non-large open (FOFFMAX flag not set)
1597 1601                   * of regular file fails if the file size exceeds MAXOFF32_T.
1598 1602                   */
1599 1603                  if (why != CRMKDIR &&
1600 1604                      !(flag & FOFFMAX) &&
1601 1605                      (vp->v_type == VREG)) {
1602 1606                          vattr.va_mask = AT_SIZE;
1603 1607                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1604 1608                              CRED(), NULL))) {
1605 1609                                  goto out;
1606 1610                          }
1607 1611                          if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1608 1612                                  error = EOVERFLOW;
1609 1613                                  goto out;
1610 1614                          }
1611 1615                  }
1612 1616          }
1613 1617  
1614 1618          if (error == 0) {
1615 1619                  /*
1616 1620                   * Call mkdir() if specified, otherwise create().
1617 1621                   */
1618 1622                  int must_be_dir = pn_fixslash(&pn);     /* trailing '/'? */
1619 1623  
1620 1624                  if (why == CRMKDIR)
1621 1625                          /*
1622 1626                           * N.B., if vn_createat() ever requests
1623 1627                           * case-insensitive behavior then it will need
1624 1628                           * to be passed to VOP_MKDIR().  VOP_CREATE()
1625 1629                           * will already get it via "flag"
1626 1630                           */
1627 1631                          error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1628 1632                              NULL, 0, NULL);
1629 1633                  else if (!must_be_dir)
1630 1634                          error = VOP_CREATE(dvp, pn.pn_path, vap,
1631 1635                              excl, mode, vpp, CRED(), flag, NULL, NULL);
1632 1636                  else
1633 1637                          error = ENOTDIR;
1634 1638          }
1635 1639  
1636 1640  out:
1637 1641  
1638 1642          if (auditing)
1639 1643                  audit_vncreate_finish(*vpp, error);
1640 1644          if (in_crit) {
1641 1645                  nbl_end_crit(vp);
1642 1646                  in_crit = 0;
1643 1647          }
1644 1648          if (vp != NULL) {
1645 1649                  VN_RELE(vp);
1646 1650                  vp = NULL;
1647 1651          }
1648 1652          pn_free(&pn);
1649 1653          VN_RELE(dvp);
1650 1654          /*
1651 1655           * The following clause was added to handle a problem
1652 1656           * with NFS consistency.  It is possible that a lookup
1653 1657           * of the file to be created succeeded, but the file
1654 1658           * itself doesn't actually exist on the server.  This
1655 1659           * is chiefly due to the DNLC containing an entry for
1656 1660           * the file which has been removed on the server.  In
1657 1661           * this case, we just start over.  If there was some
1658 1662           * other cause for the ESTALE error, then the lookup
1659 1663           * of the file will fail and the error will be returned
1660 1664           * above instead of looping around from here.
1661 1665           */
1662 1666          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1663 1667                  goto top;
1664 1668          return (error);
1665 1669  }
1666 1670  
1667 1671  int
1668 1672  vn_link(char *from, char *to, enum uio_seg seg)
1669 1673  {
1670 1674          return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1671 1675  }
1672 1676  
1673 1677  int
1674 1678  vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1675 1679      vnode_t *tstartvp, char *to, enum uio_seg seg)
1676 1680  {
1677 1681          struct vnode *fvp;              /* from vnode ptr */
1678 1682          struct vnode *tdvp;             /* to directory vnode ptr */
1679 1683          struct pathname pn;
1680 1684          int error;
1681 1685          struct vattr vattr;
1682 1686          dev_t fsid;
1683 1687          int estale_retry = 0;
1684 1688          uint32_t auditing = AU_AUDITING();
1685 1689  
1686 1690  top:
1687 1691          fvp = tdvp = NULL;
1688 1692          if (error = pn_get(to, seg, &pn))
1689 1693                  return (error);
1690 1694          if (auditing && fstartvp != NULL)
1691 1695                  audit_setfsat_path(1);
1692 1696          if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1693 1697                  goto out;
1694 1698          if (auditing && tstartvp != NULL)
1695 1699                  audit_setfsat_path(3);
1696 1700          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1697 1701                  goto out;
1698 1702          /*
1699 1703           * Make sure both source vnode and target directory vnode are
1700 1704           * in the same vfs and that it is writeable.
1701 1705           */
1702 1706          vattr.va_mask = AT_FSID;
1703 1707          if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1704 1708                  goto out;
1705 1709          fsid = vattr.va_fsid;
1706 1710          vattr.va_mask = AT_FSID;
1707 1711          if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1708 1712                  goto out;
1709 1713          if (fsid != vattr.va_fsid) {
1710 1714                  error = EXDEV;
1711 1715                  goto out;
1712 1716          }
1713 1717          if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1714 1718                  error = EROFS;
1715 1719                  goto out;
1716 1720          }
1717 1721          /*
1718 1722           * Do the link.
1719 1723           */
1720 1724          (void) pn_fixslash(&pn);
1721 1725          error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1722 1726  out:
1723 1727          pn_free(&pn);
1724 1728          if (fvp)
1725 1729                  VN_RELE(fvp);
1726 1730          if (tdvp)
1727 1731                  VN_RELE(tdvp);
1728 1732          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1729 1733                  goto top;
1730 1734          return (error);
1731 1735  }
1732 1736  
1733 1737  int
1734 1738  vn_rename(char *from, char *to, enum uio_seg seg)
1735 1739  {
1736 1740          return (vn_renameat(NULL, from, NULL, to, seg));
1737 1741  }
1738 1742  
1739 1743  int
1740 1744  vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
1741 1745      char *tname, enum uio_seg seg)
1742 1746  {
1743 1747          int error;
1744 1748          struct vattr vattr;
1745 1749          struct pathname fpn;            /* from pathname */
1746 1750          struct pathname tpn;            /* to pathname */
1747 1751          dev_t fsid;
1748 1752          int in_crit_src, in_crit_targ;
1749 1753          vnode_t *fromvp, *fvp;
1750 1754          vnode_t *tovp, *targvp;
1751 1755          int estale_retry = 0;
1752 1756          uint32_t auditing = AU_AUDITING();
1753 1757  
1754 1758  top:
1755 1759          fvp = fromvp = tovp = targvp = NULL;
1756 1760          in_crit_src = in_crit_targ = 0;
1757 1761          /*
1758 1762           * Get to and from pathnames.
1759 1763           */
1760 1764          if (error = pn_get(fname, seg, &fpn))
1761 1765                  return (error);
1762 1766          if (error = pn_get(tname, seg, &tpn)) {
1763 1767                  pn_free(&fpn);
1764 1768                  return (error);
1765 1769          }
1766 1770  
1767 1771          /*
1768 1772           * First we need to resolve the correct directories
1769 1773           * The passed in directories may only be a starting point,
1770 1774           * but we need the real directories the file(s) live in.
1771 1775           * For example the fname may be something like usr/lib/sparc
1772 1776           * and we were passed in the / directory, but we need to
1773 1777           * use the lib directory for the rename.
1774 1778           */
1775 1779  
1776 1780          if (auditing && fdvp != NULL)
1777 1781                  audit_setfsat_path(1);
1778 1782          /*
1779 1783           * Lookup to and from directories.
1780 1784           */
1781 1785          if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
1782 1786                  goto out;
1783 1787          }
1784 1788  
1785 1789          /*
1786 1790           * Make sure there is an entry.
1787 1791           */
1788 1792          if (fvp == NULL) {
1789 1793                  error = ENOENT;
1790 1794                  goto out;
1791 1795          }
1792 1796  
1793 1797          if (auditing && tdvp != NULL)
1794 1798                  audit_setfsat_path(3);
1795 1799          if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
1796 1800                  goto out;
1797 1801          }
1798 1802  
1799 1803          /*
1800 1804           * Make sure both the from vnode directory and the to directory
1801 1805           * are in the same vfs and the to directory is writable.
1802 1806           * We check fsid's, not vfs pointers, so loopback fs works.
1803 1807           */
1804 1808          if (fromvp != tovp) {
1805 1809                  vattr.va_mask = AT_FSID;
1806 1810                  if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
1807 1811                          goto out;
1808 1812                  fsid = vattr.va_fsid;
1809 1813                  vattr.va_mask = AT_FSID;
1810 1814                  if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
1811 1815                          goto out;
1812 1816                  if (fsid != vattr.va_fsid) {
1813 1817                          error = EXDEV;
1814 1818                          goto out;
1815 1819                  }
1816 1820          }
1817 1821  
1818 1822          if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
1819 1823                  error = EROFS;
1820 1824                  goto out;
1821 1825          }
1822 1826  
1823 1827          /*
1824 1828           * Make sure "from" vp is not a mount point.
1825 1829           * Note, lookup did traverse() already, so
1826 1830           * we'll be looking at the mounted FS root.
1827 1831           * (but allow files like mnttab)
1828 1832           */
1829 1833          if ((fvp->v_flag & VROOT) != 0 && fvp->v_type == VDIR) {
1830 1834                  error = EBUSY;
1831 1835                  goto out;
1832 1836          }
1833 1837  
1834 1838          if (targvp && (fvp != targvp)) {
1835 1839                  nbl_start_crit(targvp, RW_READER);
1836 1840                  in_crit_targ = 1;
1837 1841                  if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1838 1842                          error = EACCES;
1839 1843                          goto out;
1840 1844                  }
1841 1845          }
1842 1846  
1843 1847          if (nbl_need_check(fvp)) {
1844 1848                  nbl_start_crit(fvp, RW_READER);
1845 1849                  in_crit_src = 1;
1846 1850                  if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
1847 1851                          error = EACCES;
1848 1852                          goto out;
1849 1853                  }
1850 1854          }
1851 1855  
1852 1856          /*
1853 1857           * Do the rename.
1854 1858           */
1855 1859          (void) pn_fixslash(&tpn);
1856 1860          error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
1857 1861              NULL, 0);
1858 1862  
1859 1863  out:
1860 1864          pn_free(&fpn);
1861 1865          pn_free(&tpn);
1862 1866          if (in_crit_src)
1863 1867                  nbl_end_crit(fvp);
1864 1868          if (in_crit_targ)
1865 1869                  nbl_end_crit(targvp);
1866 1870          if (fromvp)
1867 1871                  VN_RELE(fromvp);
1868 1872          if (tovp)
1869 1873                  VN_RELE(tovp);
1870 1874          if (targvp)
1871 1875                  VN_RELE(targvp);
1872 1876          if (fvp)
1873 1877                  VN_RELE(fvp);
1874 1878          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1875 1879                  goto top;
1876 1880          return (error);
1877 1881  }
1878 1882  
1879 1883  /*
1880 1884   * Remove a file or directory.
1881 1885   */
1882 1886  int
1883 1887  vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1884 1888  {
1885 1889          return (vn_removeat(NULL, fnamep, seg, dirflag));
1886 1890  }
1887 1891  
1888 1892  int
1889 1893  vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
1890 1894  {
1891 1895          struct vnode *vp;               /* entry vnode */
1892 1896          struct vnode *dvp;              /* ptr to parent dir vnode */
1893 1897          struct vnode *coveredvp;
1894 1898          struct pathname pn;             /* name of entry */
1895 1899          enum vtype vtype;
1896 1900          int error;
1897 1901          struct vfs *vfsp;
1898 1902          struct vfs *dvfsp;      /* ptr to parent dir vfs */
1899 1903          int in_crit = 0;
1900 1904          int estale_retry = 0;
1901 1905  
1902 1906  top:
1903 1907          if (error = pn_get(fnamep, seg, &pn))
1904 1908                  return (error);
1905 1909          dvp = vp = NULL;
1906 1910          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
1907 1911                  pn_free(&pn);
1908 1912                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1909 1913                          goto top;
1910 1914                  return (error);
1911 1915          }
1912 1916  
1913 1917          /*
1914 1918           * Make sure there is an entry.
1915 1919           */
1916 1920          if (vp == NULL) {
1917 1921                  error = ENOENT;
1918 1922                  goto out;
1919 1923          }
1920 1924  
1921 1925          vfsp = vp->v_vfsp;
1922 1926          dvfsp = dvp->v_vfsp;
1923 1927  
1924 1928          /*
1925 1929           * If the named file is the root of a mounted filesystem, fail,
1926 1930           * unless it's marked unlinkable.  In that case, unmount the
1927 1931           * filesystem and proceed to unlink the covered vnode.  (If the
1928 1932           * covered vnode is a directory, use rmdir instead of unlink,
1929 1933           * to avoid file system corruption.)
1930 1934           */
1931 1935          if (vp->v_flag & VROOT) {
1932 1936                  if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
1933 1937                          error = EBUSY;
1934 1938                          goto out;
1935 1939                  }
1936 1940  
1937 1941                  /*
1938 1942                   * Namefs specific code starts here.
1939 1943                   */
1940 1944  
1941 1945                  if (dirflag == RMDIRECTORY) {
1942 1946                          /*
1943 1947                           * User called rmdir(2) on a file that has
1944 1948                           * been namefs mounted on top of.  Since
1945 1949                           * namefs doesn't allow directories to
1946 1950                           * be mounted on other files we know
1947 1951                           * vp is not of type VDIR so fail to operation.
1948 1952                           */
1949 1953                          error = ENOTDIR;
1950 1954                          goto out;
1951 1955                  }
1952 1956  
1953 1957                  /*
1954 1958                   * If VROOT is still set after grabbing vp->v_lock,
1955 1959                   * noone has finished nm_unmount so far and coveredvp
1956 1960                   * is valid.
1957 1961                   * If we manage to grab vn_vfswlock(coveredvp) before releasing
1958 1962                   * vp->v_lock, any race window is eliminated.
1959 1963                   */
1960 1964  
1961 1965                  mutex_enter(&vp->v_lock);
1962 1966                  if ((vp->v_flag & VROOT) == 0) {
1963 1967                          /* Someone beat us to the unmount */
1964 1968                          mutex_exit(&vp->v_lock);
1965 1969                          error = EBUSY;
1966 1970                          goto out;
1967 1971                  }
1968 1972                  vfsp = vp->v_vfsp;
1969 1973                  coveredvp = vfsp->vfs_vnodecovered;
1970 1974                  ASSERT(coveredvp);
1971 1975                  /*
1972 1976                   * Note: Implementation of vn_vfswlock shows that ordering of
1973 1977                   * v_lock / vn_vfswlock is not an issue here.
1974 1978                   */
1975 1979                  error = vn_vfswlock(coveredvp);
1976 1980                  mutex_exit(&vp->v_lock);
1977 1981  
1978 1982                  if (error)
1979 1983                          goto out;
1980 1984  
1981 1985                  VN_HOLD(coveredvp);
1982 1986                  VN_RELE(vp);
1983 1987                  error = dounmount(vfsp, 0, CRED());
1984 1988  
1985 1989                  /*
1986 1990                   * Unmounted the namefs file system; now get
1987 1991                   * the object it was mounted over.
1988 1992                   */
1989 1993                  vp = coveredvp;
1990 1994                  /*
1991 1995                   * If namefs was mounted over a directory, then
1992 1996                   * we want to use rmdir() instead of unlink().
1993 1997                   */
1994 1998                  if (vp->v_type == VDIR)
1995 1999                          dirflag = RMDIRECTORY;
1996 2000  
1997 2001                  if (error)
1998 2002                          goto out;
1999 2003          }
2000 2004  
2001 2005          /*
2002 2006           * Make sure filesystem is writeable.
2003 2007           * We check the parent directory's vfs in case this is an lofs vnode.
2004 2008           */
2005 2009          if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
2006 2010                  error = EROFS;
2007 2011                  goto out;
2008 2012          }
2009 2013  
2010 2014          vtype = vp->v_type;
2011 2015  
2012 2016          /*
2013 2017           * If there is the possibility of an nbmand share reservation, make
2014 2018           * sure it's okay to remove the file.  Keep a reference to the
2015 2019           * vnode, so that we can exit the nbl critical region after
2016 2020           * calling VOP_REMOVE.
2017 2021           * If there is no possibility of an nbmand share reservation,
2018 2022           * release the vnode reference now.  Filesystems like NFS may
2019 2023           * behave differently if there is an extra reference, so get rid of
2020 2024           * this one.  Fortunately, we can't have nbmand mounts on NFS
2021 2025           * filesystems.
2022 2026           */
2023 2027          if (nbl_need_check(vp)) {
2024 2028                  nbl_start_crit(vp, RW_READER);
2025 2029                  in_crit = 1;
2026 2030                  if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
2027 2031                          error = EACCES;
2028 2032                          goto out;
2029 2033                  }
2030 2034          } else {
2031 2035                  VN_RELE(vp);
2032 2036                  vp = NULL;
2033 2037          }
2034 2038  
2035 2039          if (dirflag == RMDIRECTORY) {
2036 2040                  /*
2037 2041                   * Caller is using rmdir(2), which can only be applied to
2038 2042                   * directories.
2039 2043                   */
2040 2044                  if (vtype != VDIR) {
2041 2045                          error = ENOTDIR;
2042 2046                  } else {
2043 2047                          vnode_t *cwd;
2044 2048                          proc_t *pp = curproc;
2045 2049  
2046 2050                          mutex_enter(&pp->p_lock);
2047 2051                          cwd = PTOU(pp)->u_cdir;
2048 2052                          VN_HOLD(cwd);
2049 2053                          mutex_exit(&pp->p_lock);
2050 2054                          error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
2051 2055                              NULL, 0);
2052 2056                          VN_RELE(cwd);
2053 2057                  }
2054 2058          } else {
2055 2059                  /*
2056 2060                   * Unlink(2) can be applied to anything.
2057 2061                   */
2058 2062                  error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
2059 2063          }
2060 2064  
2061 2065  out:
2062 2066          pn_free(&pn);
2063 2067          if (in_crit) {
2064 2068                  nbl_end_crit(vp);
2065 2069                  in_crit = 0;
2066 2070          }
2067 2071          if (vp != NULL)
2068 2072                  VN_RELE(vp);
2069 2073          if (dvp != NULL)
2070 2074                  VN_RELE(dvp);
2071 2075          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
2072 2076                  goto top;
2073 2077          return (error);
2074 2078  }
2075 2079  
2076 2080  /*
2077 2081   * Utility function to compare equality of vnodes.
2078 2082   * Compare the underlying real vnodes, if there are underlying vnodes.
2079 2083   * This is a more thorough comparison than the VN_CMP() macro provides.
2080 2084   */
2081 2085  int
2082 2086  vn_compare(vnode_t *vp1, vnode_t *vp2)
2083 2087  {
2084 2088          vnode_t *realvp;
2085 2089  
2086 2090          if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
2087 2091                  vp1 = realvp;
2088 2092          if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
2089 2093                  vp2 = realvp;
2090 2094          return (VN_CMP(vp1, vp2));
2091 2095  }
2092 2096  
2093 2097  /*
2094 2098   * Hash mask for the lock buckets: VN_VFSLOCKS_HASH() ANDs with it, so it must
2095 2099   * be a power of 2 minus 1.  (The old note also suggested it be prime.)
2096 2100   */
2097 2101  #define NUM_BUCKETS     1023
2098 2102  
2099 2103  struct  vn_vfslocks_bucket {
2100 2104          kmutex_t vb_lock;       /* held while walking/changing vb_list */
2101 2105          vn_vfslocks_entry_t *vb_list;   /* chain of entries in this bucket */
2102 2106          char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
2103 2107  };
2104 2108  
2105 2109  /*
2106 2110   * The bucket array has NUM_BUCKETS + 1 entries (indices 0..NUM_BUCKETS).
2107 2111   */
2108 2112  
2109 2113  #pragma align   64(vn_vfslocks_buckets)
2110 2114  static  struct vn_vfslocks_bucket       vn_vfslocks_buckets[NUM_BUCKETS + 1];
2111 2115  
2112 2116  #define VN_VFSLOCKS_SHIFT       9
2113 2117  
2114 2118  #define VN_VFSLOCKS_HASH(vfsvpptr)      \
2115 2119          ((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2116 2120  
2117 2121  /*
2118 2122   * vn_vfslocks_getlock() uses an HASH scheme to generate
2119 2123   * rwstlock using vfs/vnode pointer passed to it.
2120 2124   *
2121 2125   * vn_vfslocks_rele() releases a reference in the
2122 2126   * HASH table which allows the entry allocated by
2123 2127   * vn_vfslocks_getlock() to be freed at a later
2124 2128   * stage when the refcount drops to zero.
2125 2129   */
2126 2130  
2127 2131  vn_vfslocks_entry_t *
2128 2132  vn_vfslocks_getlock(void *vfsvpptr)
2129 2133  {
2130 2134          struct vn_vfslocks_bucket *bp;
2131 2135          vn_vfslocks_entry_t *vep;
2132 2136          vn_vfslocks_entry_t *tvep;
2133 2137  
2134 2138          ASSERT(vfsvpptr != NULL);
2135 2139          bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
2136 2140  
2137 2141          mutex_enter(&bp->vb_lock);
2138 2142          for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2139 2143                  if (vep->ve_vpvfs == vfsvpptr) {
2140 2144                          vep->ve_refcnt++;
2141 2145                          mutex_exit(&bp->vb_lock);
2142 2146                          return (vep);
2143 2147                  }
2144 2148          }
2145 2149          mutex_exit(&bp->vb_lock);
2146 2150          vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
2147 2151          rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
2148 2152          vep->ve_vpvfs = (char *)vfsvpptr;
2149 2153          vep->ve_refcnt = 1;
2150 2154          mutex_enter(&bp->vb_lock);
2151 2155          for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
2152 2156                  if (tvep->ve_vpvfs == vfsvpptr) {
2153 2157                          tvep->ve_refcnt++;
2154 2158                          mutex_exit(&bp->vb_lock);
2155 2159  
2156 2160                          /*
2157 2161                           * There is already an entry in the hash
2158 2162                           * destroy what we just allocated.
2159 2163                           */
2160 2164                          rwst_destroy(&vep->ve_lock);
2161 2165                          kmem_free(vep, sizeof (*vep));
2162 2166                          return (tvep);
2163 2167                  }
2164 2168          }
2165 2169          vep->ve_next = bp->vb_list;
2166 2170          bp->vb_list = vep;
2167 2171          mutex_exit(&bp->vb_lock);
2168 2172          return (vep);
2169 2173  }
2170 2174  
2171 2175  void
2172 2176  vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
2173 2177  {
2174 2178          struct vn_vfslocks_bucket *bp;
2175 2179          vn_vfslocks_entry_t *vep;
2176 2180          vn_vfslocks_entry_t *pvep;
2177 2181  
2178 2182          ASSERT(vepent != NULL);
2179 2183          ASSERT(vepent->ve_vpvfs != NULL);
2180 2184  
2181 2185          bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
2182 2186  
2183 2187          mutex_enter(&bp->vb_lock);
2184 2188          vepent->ve_refcnt--;
2185 2189  
2186 2190          if ((int32_t)vepent->ve_refcnt < 0)
2187 2191                  cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
2188 2192  
2189 2193          pvep = NULL;
2190 2194          if (vepent->ve_refcnt == 0) {
2191 2195                  for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
2192 2196                          if (vep->ve_vpvfs == vepent->ve_vpvfs) {
2193 2197                                  if (pvep == NULL)
2194 2198                                          bp->vb_list = vep->ve_next;
2195 2199                                  else {
2196 2200                                          pvep->ve_next = vep->ve_next;
2197 2201                                  }
2198 2202                                  mutex_exit(&bp->vb_lock);
2199 2203                                  rwst_destroy(&vep->ve_lock);
2200 2204                                  kmem_free(vep, sizeof (*vep));
2201 2205                                  return;
2202 2206                          }
2203 2207                          pvep = vep;
2204 2208                  }
2205 2209                  cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
2206 2210          }
2207 2211          mutex_exit(&bp->vb_lock);
2208 2212  }
2209 2213  
2210 2214  /*
2211 2215   * vn_vfswlock_wait is used to implement a lock which is logically a writers
2212 2216   * lock protecting the v_vfsmountedhere field.
2213 2217   * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2214 2218   * except that it blocks to acquire the lock VVFSLOCK.
2215 2219   *
2216 2220   * traverse() and routines re-implementing part of traverse (e.g. autofs)
2217 2221   * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2218 2222   * need the non-blocking version of the writers lock i.e. vn_vfswlock
2219 2223   */
2220 2224  int
2221 2225  vn_vfswlock_wait(vnode_t *vp)
2222 2226  {
2223 2227          int retval;
2224 2228          vn_vfslocks_entry_t *vpvfsentry;
2225 2229          ASSERT(vp != NULL);
2226 2230  
2227 2231          vpvfsentry = vn_vfslocks_getlock(vp);
2228 2232          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2229 2233  
2230 2234          if (retval == EINTR) {
2231 2235                  vn_vfslocks_rele(vpvfsentry);
2232 2236                  return (EINTR);
2233 2237          }
2234 2238          return (retval);
2235 2239  }
2236 2240  
2237 2241  int
2238 2242  vn_vfsrlock_wait(vnode_t *vp)
2239 2243  {
2240 2244          int retval;
2241 2245          vn_vfslocks_entry_t *vpvfsentry;
2242 2246          ASSERT(vp != NULL);
2243 2247  
2244 2248          vpvfsentry = vn_vfslocks_getlock(vp);
2245 2249          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2246 2250  
2247 2251          if (retval == EINTR) {
2248 2252                  vn_vfslocks_rele(vpvfsentry);
2249 2253                  return (EINTR);
2250 2254          }
2251 2255  
2252 2256          return (retval);
2253 2257  }
2254 2258  
2255 2259  
2256 2260  /*
2257 2261   * vn_vfswlock is used to implement a lock which is logically a writers lock
2258 2262   * protecting the v_vfsmountedhere field.
2259 2263   */
2260 2264  int
2261 2265  vn_vfswlock(vnode_t *vp)
2262 2266  {
2263 2267          vn_vfslocks_entry_t *vpvfsentry;
2264 2268  
2265 2269          /*
2266 2270           * If vp is NULL then somebody is trying to lock the covered vnode
2267 2271           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2268 2272           * only happen when unmounting /.  Since that operation will fail
2269 2273           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2270 2274           */
2271 2275          if (vp == NULL)
2272 2276                  return (EBUSY);
2273 2277  
2274 2278          vpvfsentry = vn_vfslocks_getlock(vp);
2275 2279  
2276 2280          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2277 2281                  return (0);
2278 2282  
2279 2283          vn_vfslocks_rele(vpvfsentry);
2280 2284          return (EBUSY);
2281 2285  }
2282 2286  
2283 2287  int
2284 2288  vn_vfsrlock(vnode_t *vp)
2285 2289  {
2286 2290          vn_vfslocks_entry_t *vpvfsentry;
2287 2291  
2288 2292          /*
2289 2293           * If vp is NULL then somebody is trying to lock the covered vnode
2290 2294           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2291 2295           * only happen when unmounting /.  Since that operation will fail
2292 2296           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2293 2297           */
2294 2298          if (vp == NULL)
2295 2299                  return (EBUSY);
2296 2300  
2297 2301          vpvfsentry = vn_vfslocks_getlock(vp);
2298 2302  
2299 2303          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2300 2304                  return (0);
2301 2305  
2302 2306          vn_vfslocks_rele(vpvfsentry);
2303 2307          return (EBUSY);
2304 2308  }
2305 2309  
2306 2310  void
2307 2311  vn_vfsunlock(vnode_t *vp)
2308 2312  {
2309 2313          vn_vfslocks_entry_t *vpvfsentry;
2310 2314  
2311 2315          /*
2312 2316           * ve_refcnt needs to be decremented twice.
2313 2317           * 1. To release refernce after a call to vn_vfslocks_getlock()
2314 2318           * 2. To release the reference from the locking routines like
2315 2319           *    vn_vfsrlock/vn_vfswlock etc,.
2316 2320           */
2317 2321          vpvfsentry = vn_vfslocks_getlock(vp);
2318 2322          vn_vfslocks_rele(vpvfsentry);
2319 2323  
2320 2324          rwst_exit(&vpvfsentry->ve_lock);
2321 2325          vn_vfslocks_rele(vpvfsentry);
2322 2326  }
2323 2327  
2324 2328  int
2325 2329  vn_vfswlock_held(vnode_t *vp)
2326 2330  {
2327 2331          int held;
2328 2332          vn_vfslocks_entry_t *vpvfsentry;
2329 2333  
2330 2334          ASSERT(vp != NULL);
2331 2335  
2332 2336          vpvfsentry = vn_vfslocks_getlock(vp);
2333 2337          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2334 2338  
2335 2339          vn_vfslocks_rele(vpvfsentry);
2336 2340          return (held);
2337 2341  }
2338 2342  
2339 2343  
2340 2344  int
2341 2345  vn_make_ops(
2342 2346          const char *name,                       /* Name of file system */
2343 2347          const fs_operation_def_t *templ,        /* Operation specification */
2344 2348          vnodeops_t **actual)                    /* Return the vnodeops */
2345 2349  {
2346 2350          int unused_ops;
2347 2351          int error;
2348 2352  
2349 2353          *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2350 2354  
2351 2355          (*actual)->vnop_name = name;
2352 2356  
2353 2357          error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2354 2358          if (error) {
2355 2359                  kmem_free(*actual, sizeof (vnodeops_t));
2356 2360          }
2357 2361  
2358 2362  #if DEBUG
2359 2363          if (unused_ops != 0)
2360 2364                  cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2361 2365                      "but not used", name, unused_ops);
2362 2366  #endif
2363 2367  
2364 2368          return (error);
2365 2369  }
2366 2370  
2367 2371  /*
2368 2372   * Free the vnodeops created as a result of vn_make_ops()
2369 2373   */
2370 2374  void
2371 2375  vn_freevnodeops(vnodeops_t *vnops)
2372 2376  {
2373 2377          kmem_free(vnops, sizeof (vnodeops_t));
2374 2378  }
2375 2379  
2376 2380  /*
2377 2381   * Vnode cache.
2378 2382   */
2379 2383  
2380 2384  /* ARGSUSED */
2381 2385  static int
2382 2386  vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2383 2387  {
2384 2388          struct vnode *vp;
2385 2389  
2386 2390          vp = buf;
2387 2391  
2388 2392          mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2389 2393          mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2390 2394          cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2391 2395          rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2392 2396          vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2393 2397          vp->v_path = vn_vpath_empty;
2394 2398          vp->v_path_stamp = 0;
2395 2399          vp->v_mpssdata = NULL;
2396 2400          vp->v_vsd = NULL;
2397 2401          vp->v_fopdata = NULL;
2398 2402  
2399 2403          return (0);
2400 2404  }
2401 2405  
2402 2406  /* ARGSUSED */
2403 2407  static void
2404 2408  vn_cache_destructor(void *buf, void *cdrarg)
2405 2409  {
2406 2410          struct vnode *vp;
2407 2411  
2408 2412          vp = buf;
2409 2413  
2410 2414          rw_destroy(&vp->v_nbllock);
2411 2415          cv_destroy(&vp->v_cv);
2412 2416          mutex_destroy(&vp->v_vsd_lock);
2413 2417          mutex_destroy(&vp->v_lock);
2414 2418  }
2415 2419  
2416 2420  void
2417 2421  vn_create_cache(void)
2418 2422  {
2419 2423          /* LINTED */
2420 2424          ASSERT((1 << VNODE_ALIGN_LOG2) ==
2421 2425              P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2422 2426          vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2423 2427              VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2424 2428              NULL, 0);
2425 2429  }
2426 2430  
2427 2431  void
2428 2432  vn_destroy_cache(void)
2429 2433  {
2430 2434          kmem_cache_destroy(vn_cache);
2431 2435  }
2432 2436  
2433 2437  /*
2434 2438   * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2435 2439   * cached by the file system and vnodes remain associated.
2436 2440   */
2437 2441  void
2438 2442  vn_recycle(vnode_t *vp)
2439 2443  {
2440 2444          ASSERT(vp->v_pages == NULL);
2441 2445          VERIFY(vp->v_path != NULL);
2442 2446  
2443 2447          /*
2444 2448           * XXX - This really belongs in vn_reinit(), but we have some issues
2445 2449           * with the counts.  Best to have it here for clean initialization.
2446 2450           */
2447 2451          vp->v_rdcnt = 0;
2448 2452          vp->v_wrcnt = 0;
2449 2453          vp->v_mmap_read = 0;
2450 2454          vp->v_mmap_write = 0;
2451 2455  
2452 2456          /*
2453 2457           * If FEM was in use, make sure everything gets cleaned up
2454 2458           * NOTE: vp->v_femhead is initialized to NULL in the vnode
2455 2459           * constructor.
2456 2460           */
2457 2461          if (vp->v_femhead) {
2458 2462                  /* XXX - There should be a free_femhead() that does all this */
2459 2463                  ASSERT(vp->v_femhead->femh_list == NULL);
2460 2464                  mutex_destroy(&vp->v_femhead->femh_lock);
2461 2465                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2462 2466                  vp->v_femhead = NULL;
2463 2467          }
2464 2468          if (vp->v_path != vn_vpath_empty) {
2465 2469                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2466 2470                  vp->v_path = vn_vpath_empty;
2467 2471          }
2468 2472          vp->v_path_stamp = 0;
2469 2473  
2470 2474          if (vp->v_fopdata != NULL) {
2471 2475                  free_fopdata(vp);
2472 2476          }
2473 2477          vp->v_mpssdata = NULL;
2474 2478          vsd_free(vp);
2475 2479  }
2476 2480  
2477 2481  /*
2478 2482   * Used to reset the vnode fields including those that are directly accessible
2479 2483   * as well as those which require an accessor function.
2480 2484   *
2481 2485   * Does not initialize:
2482 2486   *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2483 2487   *      v_data (since FS-nodes and vnodes point to each other and should
2484 2488   *              be updated simultaneously)
2485 2489   *      v_op (in case someone needs to make a VOP call on this object)
2486 2490   */
2487 2491  void
2488 2492  vn_reinit(vnode_t *vp)
2489 2493  {
2490 2494          vp->v_count = 1;
2491 2495          vp->v_count_dnlc = 0;
2492 2496          vp->v_phantom_count = 0;
2493 2497          vp->v_vfsp = NULL;
2494 2498          vp->v_stream = NULL;
2495 2499          vp->v_vfsmountedhere = NULL;
2496 2500          vp->v_flag = 0;
2497 2501          vp->v_type = VNON;
2498 2502          vp->v_rdev = NODEV;
2499 2503  
2500 2504          vp->v_filocks = NULL;
2501 2505          vp->v_shrlocks = NULL;
2502 2506          vp->v_pages = NULL;
2503 2507  
2504 2508          vp->v_locality = NULL;
2505 2509          vp->v_xattrdir = NULL;
2506 2510  
2507 2511          /*
2508 2512           * In a few specific instances, vn_reinit() is used to initialize
2509 2513           * locally defined vnode_t instances.  Lacking the construction offered
2510 2514           * by vn_alloc(), these vnodes require v_path initialization.
2511 2515           */
2512 2516          if (vp->v_path == NULL) {
2513 2517                  vp->v_path = vn_vpath_empty;
2514 2518          }
2515 2519  
2516 2520          /* Handles v_femhead, v_path, and the r/w/map counts */
2517 2521          vn_recycle(vp);
2518 2522  }
2519 2523  
2520 2524  vnode_t *
2521 2525  vn_alloc(int kmflag)
2522 2526  {
2523 2527          vnode_t *vp;
2524 2528  
2525 2529          vp = kmem_cache_alloc(vn_cache, kmflag);
2526 2530  
2527 2531          if (vp != NULL) {
2528 2532                  vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2529 2533                  vp->v_fopdata = NULL;
2530 2534                  vn_reinit(vp);
2531 2535          }
2532 2536  
2533 2537          return (vp);
2534 2538  }
2535 2539  
2536 2540  void
2537 2541  vn_free(vnode_t *vp)
2538 2542  {
2539 2543          ASSERT(vp->v_shrlocks == NULL);
2540 2544          ASSERT(vp->v_filocks == NULL);
2541 2545  
2542 2546          /*
2543 2547           * Some file systems call vn_free() with v_count of zero,
2544 2548           * some with v_count of 1.  In any case, the value should
2545 2549           * never be anything else.
2546 2550           */
2547 2551          ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2548 2552          ASSERT(vp->v_count_dnlc == 0);
2549 2553          ASSERT0(vp->v_phantom_count);
2550 2554          VERIFY(vp->v_path != NULL);
2551 2555          if (vp->v_path != vn_vpath_empty) {
2552 2556                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2553 2557                  vp->v_path = vn_vpath_empty;
2554 2558          }
2555 2559  
2556 2560          /* If FEM was in use, make sure everything gets cleaned up */
2557 2561          if (vp->v_femhead) {
2558 2562                  /* XXX - There should be a free_femhead() that does all this */
2559 2563                  ASSERT(vp->v_femhead->femh_list == NULL);
2560 2564                  mutex_destroy(&vp->v_femhead->femh_lock);
2561 2565                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2562 2566                  vp->v_femhead = NULL;
2563 2567          }
2564 2568  
2565 2569          if (vp->v_fopdata != NULL) {
2566 2570                  free_fopdata(vp);
2567 2571          }
2568 2572          vp->v_mpssdata = NULL;
2569 2573          vsd_free(vp);
2570 2574          kmem_cache_free(vn_cache, vp);
2571 2575  }
2572 2576  
2573 2577  /*
2574 2578   * vnode status changes, should define better states than 1, 0.
2575 2579   */
2576 2580  void
2577 2581  vn_reclaim(vnode_t *vp)
2578 2582  {
2579 2583          vfs_t   *vfsp = vp->v_vfsp;
2580 2584  
2581 2585          if (vfsp == NULL ||
2582 2586              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2583 2587                  return;
2584 2588          }
2585 2589          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2586 2590  }
2587 2591  
2588 2592  void
2589 2593  vn_idle(vnode_t *vp)
2590 2594  {
2591 2595          vfs_t   *vfsp = vp->v_vfsp;
2592 2596  
2593 2597          if (vfsp == NULL ||
2594 2598              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2595 2599                  return;
2596 2600          }
2597 2601          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2598 2602  }
2599 2603  void
2600 2604  vn_exists(vnode_t *vp)
2601 2605  {
2602 2606          vfs_t   *vfsp = vp->v_vfsp;
2603 2607  
2604 2608          if (vfsp == NULL ||
2605 2609              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2606 2610                  return;
2607 2611          }
2608 2612          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2609 2613  }
2610 2614  
2611 2615  void
2612 2616  vn_invalid(vnode_t *vp)
2613 2617  {
2614 2618          vfs_t   *vfsp = vp->v_vfsp;
2615 2619  
2616 2620          if (vfsp == NULL ||
2617 2621              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2618 2622                  return;
2619 2623          }
2620 2624          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2621 2625  }
2622 2626  
2623 2627  /* Vnode event notification */
2624 2628  
2625 2629  int
2626 2630  vnevent_support(vnode_t *vp, caller_context_t *ct)
2627 2631  {
2628 2632          if (vp == NULL)
2629 2633                  return (EINVAL);
2630 2634  
2631 2635          return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2632 2636  }
2633 2637  
2634 2638  void
2635 2639  vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2636 2640  {
2637 2641          if (vp == NULL || vp->v_femhead == NULL) {
2638 2642                  return;
2639 2643          }
2640 2644          (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2641 2645          (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2642 2646  }
2643 2647  
2644 2648  void
2645 2649  vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2646 2650      caller_context_t *ct)
2647 2651  {
2648 2652          if (vp == NULL || vp->v_femhead == NULL) {
2649 2653                  return;
2650 2654          }
2651 2655          (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2652 2656  }
2653 2657  
2654 2658  void
2655 2659  vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2656 2660      caller_context_t *ct)
2657 2661  {
2658 2662          if (vp == NULL || vp->v_femhead == NULL) {
2659 2663                  return;
2660 2664          }
2661 2665          (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2662 2666  }
2663 2667  
2664 2668  void
2665 2669  vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2666 2670  {
2667 2671          if (vp == NULL || vp->v_femhead == NULL) {
2668 2672                  return;
2669 2673          }
2670 2674          (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2671 2675  }
2672 2676  
2673 2677  void
2674 2678  vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2675 2679  {
2676 2680          if (vp == NULL || vp->v_femhead == NULL) {
2677 2681                  return;
2678 2682          }
2679 2683          (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2680 2684  }
2681 2685  
2682 2686  void
2683 2687  vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2684 2688      caller_context_t *ct)
2685 2689  {
2686 2690          if (vp == NULL || vp->v_femhead == NULL) {
2687 2691                  return;
2688 2692          }
2689 2693          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2690 2694  }
2691 2695  
2692 2696  void
2693 2697  vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2694 2698      caller_context_t *ct)
2695 2699  {
2696 2700          if (vp == NULL || vp->v_femhead == NULL) {
2697 2701                  return;
2698 2702          }
2699 2703          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2700 2704  }
2701 2705  
2702 2706  void
2703 2707  vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2704 2708      caller_context_t *ct)
2705 2709  {
2706 2710          if (vp == NULL || vp->v_femhead == NULL) {
2707 2711                  return;
2708 2712          }
2709 2713          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2710 2714  }
2711 2715  
2712 2716  void
2713 2717  vnevent_create(vnode_t *vp, caller_context_t *ct)
2714 2718  {
2715 2719          if (vp == NULL || vp->v_femhead == NULL) {
2716 2720                  return;
2717 2721          }
2718 2722          (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2719 2723  }
2720 2724  
2721 2725  void
2722 2726  vnevent_link(vnode_t *vp, caller_context_t *ct)
2723 2727  {
2724 2728          if (vp == NULL || vp->v_femhead == NULL) {
2725 2729                  return;
2726 2730          }
2727 2731          (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2728 2732  }
2729 2733  
2730 2734  void
2731 2735  vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2732 2736  {
2733 2737          if (vp == NULL || vp->v_femhead == NULL) {
2734 2738                  return;
2735 2739          }
2736 2740          (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2737 2741  }
2738 2742  
2739 2743  void
2740 2744  vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2741 2745  {
2742 2746          if (vp == NULL || vp->v_femhead == NULL) {
2743 2747                  return;
2744 2748          }
2745 2749          (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2746 2750  }
2747 2751  
2748 2752  void
2749 2753  vnevent_resize(vnode_t *vp, caller_context_t *ct)
2750 2754  {
2751 2755          if (vp == NULL || vp->v_femhead == NULL) {
2752 2756                  return;
2753 2757          }
2754 2758          (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2755 2759  }
2756 2760  
2757 2761  /*
2758 2762   * Vnode accessors.
2759 2763   */
2760 2764  
2761 2765  int
2762 2766  vn_is_readonly(vnode_t *vp)
2763 2767  {
2764 2768          return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2765 2769  }
2766 2770  
2767 2771  int
2768 2772  vn_has_flocks(vnode_t *vp)
2769 2773  {
2770 2774          return (vp->v_filocks != NULL);
2771 2775  }
2772 2776  
2773 2777  int
2774 2778  vn_has_mandatory_locks(vnode_t *vp, int mode)
2775 2779  {
2776 2780          return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2777 2781  }
2778 2782  
2779 2783  int
2780 2784  vn_has_cached_data(vnode_t *vp)
2781 2785  {
2782 2786          return (vp->v_pages != NULL);
2783 2787  }
2784 2788  
2785 2789  /*
2786 2790   * Return 0 if the vnode in question shouldn't be permitted into a zone via
2787 2791   * zone_enter(2).
2788 2792   */
2789 2793  int
2790 2794  vn_can_change_zones(vnode_t *vp)
2791 2795  {
2792 2796          struct vfssw *vswp;
2793 2797          int allow = 1;
2794 2798          vnode_t *rvp;
2795 2799  
2796 2800          if (nfs_global_client_only != 0)
2797 2801                  return (1);
2798 2802  
2799 2803          /*
2800 2804           * We always want to look at the underlying vnode if there is one.
2801 2805           */
2802 2806          if (VOP_REALVP(vp, &rvp, NULL) != 0)
2803 2807                  rvp = vp;
2804 2808          /*
2805 2809           * Some pseudo filesystems (including doorfs) don't actually register
2806 2810           * their vfsops_t, so the following may return NULL; we happily let
2807 2811           * such vnodes switch zones.
2808 2812           */
2809 2813          vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2810 2814          if (vswp != NULL) {
2811 2815                  if (vswp->vsw_flag & VSW_NOTZONESAFE)
2812 2816                          allow = 0;
2813 2817                  vfs_unrefvfssw(vswp);
2814 2818          }
2815 2819          return (allow);
2816 2820  }
2817 2821  
2818 2822  /*
2819 2823   * Return nonzero if the vnode is a mount point, zero if not.
2820 2824   */
2821 2825  int
2822 2826  vn_ismntpt(vnode_t *vp)
2823 2827  {
2824 2828          return (vp->v_vfsmountedhere != NULL);
2825 2829  }
2826 2830  
2827 2831  /* Retrieve the vfs (if any) mounted on this vnode */
2828 2832  vfs_t *
2829 2833  vn_mountedvfs(vnode_t *vp)
2830 2834  {
2831 2835          return (vp->v_vfsmountedhere);
2832 2836  }
2833 2837  
2834 2838  /*
2835 2839   * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2836 2840   */
2837 2841  int
2838 2842  vn_in_dnlc(vnode_t *vp)
2839 2843  {
2840 2844          return (vp->v_count_dnlc > 0);
2841 2845  }
2842 2846  
2843 2847  /*
2844 2848   * vn_has_other_opens() checks whether a particular file is opened by more than
2845 2849   * just the caller and whether the open is for read and/or write.
2846 2850   * This routine is for calling after the caller has already called VOP_OPEN()
2847 2851   * and the caller wishes to know if they are the only one with it open for
2848 2852   * the mode(s) specified.
2849 2853   *
2850 2854   * Vnode counts are only kept on regular files (v_type=VREG).
2851 2855   */
2852 2856  int
2853 2857  vn_has_other_opens(
2854 2858          vnode_t *vp,
2855 2859          v_mode_t mode)
2856 2860  {
2857 2861  
2858 2862          ASSERT(vp != NULL);
2859 2863  
2860 2864          switch (mode) {
2861 2865          case V_WRITE:
2862 2866                  if (vp->v_wrcnt > 1)
2863 2867                          return (V_TRUE);
2864 2868                  break;
2865 2869          case V_RDORWR:
2866 2870                  if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2867 2871                          return (V_TRUE);
2868 2872                  break;
2869 2873          case V_RDANDWR:
2870 2874                  if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2871 2875                          return (V_TRUE);
2872 2876                  break;
2873 2877          case V_READ:
2874 2878                  if (vp->v_rdcnt > 1)
2875 2879                          return (V_TRUE);
2876 2880                  break;
2877 2881          }
2878 2882  
2879 2883          return (V_FALSE);
2880 2884  }
2881 2885  
2882 2886  /*
2883 2887   * vn_is_opened() checks whether a particular file is opened and
2884 2888   * whether the open is for read and/or write.
2885 2889   *
2886 2890   * Vnode counts are only kept on regular files (v_type=VREG).
2887 2891   */
2888 2892  int
2889 2893  vn_is_opened(
2890 2894          vnode_t *vp,
2891 2895          v_mode_t mode)
2892 2896  {
2893 2897  
2894 2898          ASSERT(vp != NULL);
2895 2899  
2896 2900          switch (mode) {
2897 2901          case V_WRITE:
2898 2902                  if (vp->v_wrcnt)
2899 2903                          return (V_TRUE);
2900 2904                  break;
2901 2905          case V_RDANDWR:
2902 2906                  if (vp->v_rdcnt && vp->v_wrcnt)
2903 2907                          return (V_TRUE);
2904 2908                  break;
2905 2909          case V_RDORWR:
2906 2910                  if (vp->v_rdcnt || vp->v_wrcnt)
2907 2911                          return (V_TRUE);
2908 2912                  break;
2909 2913          case V_READ:
2910 2914                  if (vp->v_rdcnt)
2911 2915                          return (V_TRUE);
2912 2916                  break;
2913 2917          }
2914 2918  
2915 2919          return (V_FALSE);
2916 2920  }
2917 2921  
2918 2922  /*
2919 2923   * vn_is_mapped() checks whether a particular file is mapped and whether
2920 2924   * the file is mapped read and/or write.
2921 2925   */
2922 2926  int
2923 2927  vn_is_mapped(
2924 2928          vnode_t *vp,
2925 2929          v_mode_t mode)
2926 2930  {
2927 2931  
2928 2932          ASSERT(vp != NULL);
2929 2933  
2930 2934  #if !defined(_LP64)
2931 2935          switch (mode) {
2932 2936          /*
2933 2937           * The atomic_add_64_nv functions force atomicity in the
2934 2938           * case of 32 bit architectures. Otherwise the 64 bit values
2935 2939           * require two fetches. The value of the fields may be
2936 2940           * (potentially) changed between the first fetch and the
2937 2941           * second
2938 2942           */
2939 2943          case V_WRITE:
2940 2944                  if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2941 2945                          return (V_TRUE);
2942 2946                  break;
2943 2947          case V_RDANDWR:
2944 2948                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2945 2949                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2946 2950                          return (V_TRUE);
2947 2951                  break;
2948 2952          case V_RDORWR:
2949 2953                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2950 2954                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2951 2955                          return (V_TRUE);
2952 2956                  break;
2953 2957          case V_READ:
2954 2958                  if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2955 2959                          return (V_TRUE);
2956 2960                  break;
2957 2961          }
2958 2962  #else
2959 2963          switch (mode) {
2960 2964          case V_WRITE:
2961 2965                  if (vp->v_mmap_write)
2962 2966                          return (V_TRUE);
2963 2967                  break;
2964 2968          case V_RDANDWR:
2965 2969                  if (vp->v_mmap_read && vp->v_mmap_write)
2966 2970                          return (V_TRUE);
2967 2971                  break;
2968 2972          case V_RDORWR:
2969 2973                  if (vp->v_mmap_read || vp->v_mmap_write)
2970 2974                          return (V_TRUE);
2971 2975                  break;
2972 2976          case V_READ:
2973 2977                  if (vp->v_mmap_read)
2974 2978                          return (V_TRUE);
2975 2979                  break;
2976 2980          }
2977 2981  #endif
2978 2982  
2979 2983          return (V_FALSE);
2980 2984  }
2981 2985  
2982 2986  /*
2983 2987   * Set the operations vector for a vnode.
2984 2988   *
2985 2989   * FEM ensures that the v_femhead pointer is filled in before the
2986 2990   * v_op pointer is changed.  This means that if the v_femhead pointer
2987 2991   * is NULL, and the v_op field hasn't changed since before which checked
2988 2992   * the v_femhead pointer; then our update is ok - we are not racing with
2989 2993   * FEM.
2990 2994   */
2991 2995  void
2992 2996  vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2993 2997  {
2994 2998          vnodeops_t      *op;
2995 2999  
2996 3000          ASSERT(vp != NULL);
2997 3001          ASSERT(vnodeops != NULL);
2998 3002  
2999 3003          op = vp->v_op;
3000 3004          membar_consumer();
3001 3005          /*
3002 3006           * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
3003 3007           * the compare-and-swap on vp->v_op.  If either fails, then FEM is
3004 3008           * in effect on the vnode and we need to have FEM deal with it.
3005 3009           */
3006 3010          if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
3007 3011              op) {
3008 3012                  fem_setvnops(vp, vnodeops);
3009 3013          }
3010 3014  }
3011 3015  
3012 3016  /*
3013 3017   * Retrieve the operations vector for a vnode
3014 3018   * As with vn_setops(above); make sure we aren't racing with FEM.
3015 3019   * FEM sets the v_op to a special, internal, vnodeops that wouldn't
3016 3020   * make sense to the callers of this routine.
3017 3021   */
3018 3022  vnodeops_t *
3019 3023  vn_getops(vnode_t *vp)
3020 3024  {
3021 3025          vnodeops_t      *op;
3022 3026  
3023 3027          ASSERT(vp != NULL);
3024 3028  
3025 3029          op = vp->v_op;
3026 3030          membar_consumer();
3027 3031          if (vp->v_femhead == NULL && op == vp->v_op) {
3028 3032                  return (op);
3029 3033          } else {
3030 3034                  return (fem_getvnops(vp));
3031 3035          }
3032 3036  }
3033 3037  
3034 3038  /*
3035 3039   * Returns non-zero (1) if the vnodeops matches that of the vnode.
3036 3040   * Returns zero (0) if not.
3037 3041   */
3038 3042  int
3039 3043  vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
3040 3044  {
3041 3045          return (vn_getops(vp) == vnodeops);
3042 3046  }
3043 3047  
3044 3048  /*
3045 3049   * Returns non-zero (1) if the specified operation matches the
3046 3050   * corresponding operation for that the vnode.
3047 3051   * Returns zero (0) if not.
3048 3052   */
3049 3053  
3050 3054  #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
3051 3055  
3052 3056  int
3053 3057  vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
3054 3058  {
3055 3059          const fs_operation_trans_def_t *otdp;
3056 3060          fs_generic_func_p *loc = NULL;
3057 3061          vnodeops_t      *vop = vn_getops(vp);
3058 3062  
3059 3063          ASSERT(vopname != NULL);
3060 3064  
3061 3065          for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
3062 3066                  if (MATCHNAME(otdp->name, vopname)) {
3063 3067                          loc = (fs_generic_func_p *)
3064 3068                              ((char *)(vop) + otdp->offset);
3065 3069                          break;
3066 3070                  }
3067 3071          }
3068 3072  
3069 3073          return ((loc != NULL) && (*loc == funcp));
3070 3074  }
3071 3075  
3072 3076  /*
3073 3077   * fs_new_caller_id() needs to return a unique ID on a given local system.
3074 3078   * The IDs do not need to survive across reboots.  These are primarily
3075 3079   * used so that (FEM) monitors can detect particular callers (such as
3076 3080   * the NFS server) to a given vnode/vfs operation.
3077 3081   */
3078 3082  u_longlong_t
3079 3083  fs_new_caller_id()
3080 3084  {
3081 3085          static uint64_t next_caller_id = 0LL; /* First call returns 1 */
3082 3086  
3083 3087          return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
3084 3088  }
3085 3089  
3086 3090  /*
3087 3091   * The value stored in v_path is relative to rootdir, located in the global
3088 3092   * zone.  Zones or chroot environments which reside deeper inside the VFS
3089 3093   * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
3090 3094   * what lies below their perceived root.  In order to keep v_path usable for
3091 3095   * these child environments, its allocations are allowed to exceed MAXPATHLEN.
3092 3096   *
3093 3097   * An upper bound of max_vnode_path is placed upon v_path allocations to
3094 3098   * prevent the system from going too wild at the behest of pathological
3095 3099   * behavior from the operator.
            *
            * The bound is compared against the full size of the buffer which would be
            * allocated for a path, including its terminating NUL.  When a candidate
            * path exceeds it, vn_setpath_common() fires a DTrace probe and returns
            * without changing v_path.
3096 3100   */
3097 3101  size_t max_vnode_path = 4 * MAXPATHLEN;
3098 3102  
3099 3103  
3100 3104  void
3101 3105  vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
3102 3106  {
3103 3107          char *buf;
3104 3108  
3105 3109          mutex_enter(&vp->v_lock);
3106 3110          /*
3107 3111           * If the snapshot of v_path_stamp passed in via compare_stamp does not
3108 3112           * match the present value on the vnode, it indicates that subsequent
3109 3113           * changes have occurred.  The v_path value is not cleared in this case
3110 3114           * since the new value may be valid.
3111 3115           */
3112 3116          if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3113 3117                  mutex_exit(&vp->v_lock);
3114 3118                  return;
3115 3119          }
3116 3120          buf = vp->v_path;
3117 3121          vp->v_path = vn_vpath_empty;
3118 3122          vp->v_path_stamp = 0;
3119 3123          mutex_exit(&vp->v_lock);
3120 3124          if (buf != vn_vpath_empty) {
3121 3125                  kmem_free(buf, strlen(buf) + 1);
3122 3126          }
3123 3127  }
3124 3128  
3125 3129  static void
3126 3130  vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3127 3131      boolean_t is_rename)
3128 3132  {
3129 3133          char *buf, *oldbuf;
3130 3134          hrtime_t pstamp;
3131 3135          size_t baselen, buflen = 0;
3132 3136  
3133 3137          /* Handle the vn_setpath_str case. */
3134 3138          if (pvp == NULL) {
3135 3139                  if (len + 1 > max_vnode_path) {
3136 3140                          DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3137 3141                              vnode_t *, vp, char *, name, size_t, len + 1);
3138 3142                          return;
3139 3143                  }
3140 3144                  buf = kmem_alloc(len + 1, KM_SLEEP);
3141 3145                  bcopy(name, buf, len);
3142 3146                  buf[len] = '\0';
3143 3147  
3144 3148                  mutex_enter(&vp->v_lock);
3145 3149                  oldbuf = vp->v_path;
3146 3150                  vp->v_path = buf;
3147 3151                  vp->v_path_stamp = gethrtime();
3148 3152                  mutex_exit(&vp->v_lock);
3149 3153                  if (oldbuf != vn_vpath_empty) {
3150 3154                          kmem_free(oldbuf, strlen(oldbuf) + 1);
3151 3155                  }
3152 3156                  return;
3153 3157          }
3154 3158  
3155 3159          /* Take snapshot of parent dir */
3156 3160          mutex_enter(&pvp->v_lock);
3157 3161  
3158 3162          if ((pvp->v_flag & VTRAVERSE) != 0) {
3159 3163                  /*
3160 3164                   * When the parent vnode has VTRAVERSE set in its flags, normal
3161 3165                   * assumptions about v_path calculation no longer apply.  The
3162 3166                   * primary situation where this occurs is via the VFS tricks
3163 3167                   * which procfs plays in order to allow /proc/PID/(root|cwd) to
3164 3168                   * yield meaningful results.
3165 3169                   *
3166 3170                   * When this flag is set, v_path on the child must not be
3167 3171                   * updated since the calculated value is likely to be
3168 3172                   * incorrect, given the current context.
3169 3173                   */
3170 3174                  mutex_exit(&pvp->v_lock);
3171 3175                  return;
3172 3176          }
3173 3177  
3174 3178  retrybuf:
3175 3179          if (pvp->v_path == vn_vpath_empty) {
3176 3180                  /*
3177 3181                   * Without v_path from the parent directory, generating a child
3178 3182                   * path from the name is impossible.
3179 3183                   */
3180 3184                  if (len > 0) {
3181 3185                          pstamp = pvp->v_path_stamp;
3182 3186                          mutex_exit(&pvp->v_lock);
3183 3187                          vn_clearpath(vp, pstamp);
3184 3188                          return;
3185 3189                  }
3186 3190  
3187 3191                  /*
3188 3192                   * The only feasible case here is where a NUL lookup is being
3189 3193                   * performed on rootdir prior to its v_path being populated.
3190 3194                   */
3191 3195                  ASSERT(pvp->v_path_stamp == 0);
3192 3196                  baselen = 0;
3193 3197                  pstamp = 0;
3194 3198          } else {
3195 3199                  pstamp = pvp->v_path_stamp;
3196 3200                  baselen = strlen(pvp->v_path);
3197 3201                  /* ignore a trailing slash if present */
3198 3202                  if (pvp->v_path[baselen - 1] == '/') {
3199 3203                          /* This should only the be case for rootdir */
3200 3204                          ASSERT(baselen == 1 && pvp == rootdir);
3201 3205                          baselen--;
3202 3206                  }
3203 3207          }
3204 3208          mutex_exit(&pvp->v_lock);
3205 3209  
3206 3210          if (buflen != 0) {
3207 3211                  /* Free the existing (mis-sized) buffer in case of retry */
3208 3212                  kmem_free(buf, buflen);
3209 3213          }
3210 3214          /* base, '/', name and trailing NUL */
3211 3215          buflen = baselen + len + 2;
3212 3216          if (buflen > max_vnode_path) {
3213 3217                  DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3214 3218                      vnode_t *, vp, char *, name, size_t, buflen);
3215 3219                  return;
3216 3220          }
3217 3221          buf = kmem_alloc(buflen, KM_SLEEP);
3218 3222  
3219 3223          mutex_enter(&pvp->v_lock);
3220 3224          if (pvp->v_path_stamp != pstamp) {
3221 3225                  size_t vlen;
3222 3226  
3223 3227                  /*
3224 3228                   * Since v_path_stamp changed on the parent, it is likely that
3225 3229                   * v_path has been altered as well.  If the length does not
3226 3230                   * exactly match what was previously measured, the buffer
3227 3231                   * allocation must be repeated for proper sizing.
3228 3232                   */
3229 3233                  if (pvp->v_path == vn_vpath_empty) {
3230 3234                          /* Give up if parent lack v_path */
3231 3235                          mutex_exit(&pvp->v_lock);
3232 3236                          kmem_free(buf, buflen);
3233 3237                          return;
3234 3238                  }
3235 3239                  vlen = strlen(pvp->v_path);
3236 3240                  if (pvp->v_path[vlen - 1] == '/') {
3237 3241                          vlen--;
3238 3242                  }
3239 3243                  if (vlen != baselen) {
3240 3244                          goto retrybuf;
3241 3245                  }
3242 3246          }
3243 3247          bcopy(pvp->v_path, buf, baselen);
3244 3248          mutex_exit(&pvp->v_lock);
3245 3249  
3246 3250          buf[baselen] = '/';
3247 3251          baselen++;
3248 3252          bcopy(name, &buf[baselen], len + 1);
3249 3253  
3250 3254          mutex_enter(&vp->v_lock);
3251 3255          if (vp->v_path_stamp == 0) {
3252 3256                  /* never-visited vnode can inherit stamp from parent */
3253 3257                  ASSERT(vp->v_path == vn_vpath_empty);
3254 3258                  vp->v_path_stamp = pstamp;
3255 3259                  vp->v_path = buf;
3256 3260                  mutex_exit(&vp->v_lock);
3257 3261          } else if (vp->v_path_stamp < pstamp || is_rename) {
3258 3262                  /*
3259 3263                   * Install the updated path and stamp, ensuring that the v_path
3260 3264                   * pointer is valid at all times for dtrace.
3261 3265                   */
3262 3266                  oldbuf = vp->v_path;
3263 3267                  vp->v_path = buf;
3264 3268                  vp->v_path_stamp = gethrtime();
3265 3269                  mutex_exit(&vp->v_lock);
3266 3270                  kmem_free(oldbuf, strlen(oldbuf) + 1);
3267 3271          } else {
3268 3272                  /*
3269 3273                   * If the timestamp matches or is greater, it means another
3270 3274                   * thread performed the update first while locks were dropped
3271 3275                   * here to make the allocation.  We defer to the newer value.
3272 3276                   */
3273 3277                  mutex_exit(&vp->v_lock);
3274 3278                  kmem_free(buf, buflen);
3275 3279          }
3276 3280          ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3277 3281  }
3278 3282  
3279 3283  void
3280 3284  vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3281 3285  {
3282 3286          size_t len;
3283 3287  
3284 3288          /*
3285 3289           * If the parent is older or empty, there's nothing further to do.
3286 3290           */
3287 3291          if (pvp->v_path == vn_vpath_empty ||
3288 3292              pvp->v_path_stamp <= vp->v_path_stamp) {
3289 3293                  return;
3290 3294          }
3291 3295  
3292 3296          /*
3293 3297           * Given the lack of appropriate context, meaningful updates to v_path
3294 3298           * cannot be made for during lookups for the '.' or '..' entries.
3295 3299           */
3296 3300          len = strlen(name);
3297 3301          if (len == 0 || (len == 1 && name[0] == '.') ||
3298 3302              (len == 2 && name[0] == '.' && name[1] == '.')) {
3299 3303                  return;
3300 3304          }
3301 3305  
3302 3306          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3303 3307  }
3304 3308  
3305 3309  /*
3306 3310   * Given a parent vnode and a component name, updates the cached path in the
3307 3311   * target vnode in a safe manner.  A vnode which has never had a path recorded
3308 3312   * receives the new one; an existing cached path is only replaced when its
            * stamp is older than the parent's, otherwise it is left untouched.
            *
            * The rootvp argument is unused.  If pvp is NULL, name is taken to be the
            * complete path and replaces any existing one.  When pvp is non-NULL, len + 1
            * bytes are copied from name, so it must be NUL-terminated at name[len].
3309 3313   */
3310 3314  /* ARGSUSED */
3311 3315  void
3312 3316  vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3313 3317      size_t len)
3314 3318  {
3315 3319          vn_setpath_common(pvp, vp, name, len, B_FALSE);
3316 3320  }
3317 3321  
3318 3322  /*
3319 3323   * Sets the path to the vnode to be the given string, regardless of current
3320 3324   * context.  The string must be a complete path from rootdir.  This is only used
3321 3325   * by fsop_root() for setting the path based on the mountpoint.
            *
            * len is the length of str, excluding any terminating NUL; a private,
            * NUL-terminated copy is stored and the previous path (if any) is freed.
            * If len + 1 exceeds max_vnode_path, the request is dropped and the existing
            * path is left in place.
3322 3326   */
3323 3327  void
3324 3328  vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3325 3329  {
3326 3330          vn_setpath_common(NULL, vp, str, len, B_FALSE);
3327 3331  }
3328 3332  
3329 3333  /*
3330 3334   * Called from within filesystem's vop_rename() to handle renames once the
3331 3335   * target vnode is available.
            *
            * The new path is built from the path of pvp (the new parent) and name.
            * Unlike vn_setpath(), an existing path on vp is replaced even when its
            * stamp is not older than the parent's.  len + 1 bytes are copied from
            * name, so it must be NUL-terminated at name[len].
3332 3336   */
3333 3337  void
3334 3338  vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3335 3339  {
3336 3340          vn_setpath_common(pvp, vp, name, len, B_TRUE);
3337 3341  }
3338 3342  
3339 3343  /*
3340 3344   * Similar to vn_setpath_str(), this function sets the path of the destination
3341 3345   * vnode to the be the same as the source vnode.
3342 3346   */
3343 3347  void
3344 3348  vn_copypath(struct vnode *src, struct vnode *dst)
3345 3349  {
3346 3350          char *buf;
3347 3351          hrtime_t stamp;
3348 3352          size_t buflen;
3349 3353  
3350 3354          mutex_enter(&src->v_lock);
3351 3355          if (src->v_path == vn_vpath_empty) {
3352 3356                  mutex_exit(&src->v_lock);
3353 3357                  return;
3354 3358          }
3355 3359          buflen = strlen(src->v_path) + 1;
3356 3360          mutex_exit(&src->v_lock);
3357 3361  
3358 3362          buf = kmem_alloc(buflen, KM_SLEEP);
3359 3363  
3360 3364          mutex_enter(&src->v_lock);
3361 3365          if (src->v_path == vn_vpath_empty ||
3362 3366              strlen(src->v_path) + 1 != buflen) {
3363 3367                  mutex_exit(&src->v_lock);
3364 3368                  kmem_free(buf, buflen);
3365 3369                  return;
3366 3370          }
3367 3371          bcopy(src->v_path, buf, buflen);
3368 3372          stamp = src->v_path_stamp;
3369 3373          mutex_exit(&src->v_lock);
3370 3374  
3371 3375          mutex_enter(&dst->v_lock);
3372 3376          if (dst->v_path != vn_vpath_empty) {
3373 3377                  mutex_exit(&dst->v_lock);
3374 3378                  kmem_free(buf, buflen);
3375 3379                  return;
3376 3380          }
3377 3381          dst->v_path = buf;
3378 3382          dst->v_path_stamp = stamp;
3379 3383          mutex_exit(&dst->v_lock);
3380 3384  }
3381 3385  
3382 3386  
3383 3387  /*
3384 3388   * XXX Private interface for segvn routines that handle vnode
3385 3389   * large page segments.
3386 3390   *
3387 3391   * return 1 if vp's file system VOP_PAGEIO() implementation
3388 3392   * can be safely used instead of VOP_GETPAGE() for handling
3389 3393   * pagefaults against regular non swap files. VOP_PAGEIO()
3390 3394   * interface is considered safe here if its implementation
3391 3395   * is very close to VOP_GETPAGE() implementation.
3392 3396   * e.g. It zero's out the part of the page beyond EOF. Doesn't
3393 3397   * panic if there're file holes but instead returns an error.
3394 3398   * Doesn't assume file won't be changed by user writes, etc.
3395 3399   *
3396 3400   * return 0 otherwise.
3397 3401   *
3398 3402   * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3399 3403   */
3400 3404  int
3401 3405  vn_vmpss_usepageio(vnode_t *vp)
3402 3406  {
3403 3407          vfs_t   *vfsp = vp->v_vfsp;
3404 3408          char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3405 3409          char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3406 3410          char **fsok = pageio_ok_fss;
3407 3411  
3408 3412          if (fsname == NULL) {
3409 3413                  return (0);
3410 3414          }
3411 3415  
3412 3416          for (; *fsok; fsok++) {
3413 3417                  if (strcmp(*fsok, fsname) == 0) {
3414 3418                          return (1);
3415 3419                  }
3416 3420          }
3417 3421          return (0);
3418 3422  }
3419 3423  
3420 3424  /* VOP_XXX() macros call the corresponding fop_xxx() function */
3421 3425  
3422 3426  int
3423 3427  fop_open(
3424 3428          vnode_t **vpp,
3425 3429          int mode,
3426 3430          cred_t *cr,
3427 3431          caller_context_t *ct)
3428 3432  {
3429 3433          int ret;
3430 3434          vnode_t *vp = *vpp;
3431 3435  
3432 3436          VN_HOLD(vp);
3433 3437          /*
3434 3438           * Adding to the vnode counts before calling open
3435 3439           * avoids the need for a mutex. It circumvents a race
3436 3440           * condition where a query made on the vnode counts results in a
3437 3441           * false negative. The inquirer goes away believing the file is
3438 3442           * not open when there is an open on the file already under way.
3439 3443           *
3440 3444           * The counts are meant to prevent NFS from granting a delegation
3441 3445           * when it would be dangerous to do so.
3442 3446           *
3443 3447           * The vnode counts are only kept on regular files
3444 3448           */
3445 3449          if ((*vpp)->v_type == VREG) {
3446 3450                  if (mode & FREAD)
3447 3451                          atomic_inc_32(&(*vpp)->v_rdcnt);
3448 3452                  if (mode & FWRITE)
3449 3453                          atomic_inc_32(&(*vpp)->v_wrcnt);
3450 3454          }
3451 3455  
3452 3456          VOPXID_MAP_CR(vp, cr);
3453 3457  
3454 3458          ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3455 3459  
3456 3460          if (ret) {
3457 3461                  /*
3458 3462                   * Use the saved vp just in case the vnode ptr got trashed
3459 3463                   * by the error.
3460 3464                   */
3461 3465                  VOPSTATS_UPDATE(vp, open);
3462 3466                  if ((vp->v_type == VREG) && (mode & FREAD))
3463 3467                          atomic_dec_32(&vp->v_rdcnt);
3464 3468                  if ((vp->v_type == VREG) && (mode & FWRITE))
3465 3469                          atomic_dec_32(&vp->v_wrcnt);
3466 3470          } else {
3467 3471                  /*
3468 3472                   * Some filesystems will return a different vnode,
3469 3473                   * but the same path was still used to open it.
3470 3474                   * So if we do change the vnode and need to
3471 3475                   * copy over the path, do so here, rather than special
3472 3476                   * casing each filesystem. Adjust the vnode counts to
3473 3477                   * reflect the vnode switch.
3474 3478                   */
3475 3479                  VOPSTATS_UPDATE(*vpp, open);
3476 3480                  if (*vpp != vp) {
3477 3481                          vn_copypath(vp, *vpp);
3478 3482                          if (((*vpp)->v_type == VREG) && (mode & FREAD))
3479 3483                                  atomic_inc_32(&(*vpp)->v_rdcnt);
3480 3484                          if ((vp->v_type == VREG) && (mode & FREAD))
3481 3485                                  atomic_dec_32(&vp->v_rdcnt);
3482 3486                          if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3483 3487                                  atomic_inc_32(&(*vpp)->v_wrcnt);
3484 3488                          if ((vp->v_type == VREG) && (mode & FWRITE))
3485 3489                                  atomic_dec_32(&vp->v_wrcnt);
3486 3490                  }
3487 3491          }
3488 3492          VN_RELE(vp);
3489 3493          return (ret);
3490 3494  }
3491 3495  
3492 3496  int
3493 3497  fop_close(
3494 3498          vnode_t *vp,
3495 3499          int flag,
3496 3500          int count,
3497 3501          offset_t offset,
3498 3502          cred_t *cr,
3499 3503          caller_context_t *ct)
3500 3504  {
3501 3505          int err;
3502 3506  
3503 3507          VOPXID_MAP_CR(vp, cr);
3504 3508  
3505 3509          err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3506 3510          VOPSTATS_UPDATE(vp, close);
3507 3511          /*
3508 3512           * Check passed in count to handle possible dups. Vnode counts are only
3509 3513           * kept on regular files
3510 3514           */
3511 3515          if ((vp->v_type == VREG) && (count == 1))  {
3512 3516                  if (flag & FREAD) {
3513 3517                          ASSERT(vp->v_rdcnt > 0);
3514 3518                          atomic_dec_32(&vp->v_rdcnt);
3515 3519                  }
3516 3520                  if (flag & FWRITE) {
3517 3521                          ASSERT(vp->v_wrcnt > 0);
3518 3522                          atomic_dec_32(&vp->v_wrcnt);
3519 3523                  }
3520 3524          }
3521 3525          return (err);
3522 3526  }
3523 3527  
3524 3528  int
3525 3529  fop_read(
3526 3530          vnode_t *vp,
3527 3531          uio_t *uiop,
3528 3532          int ioflag,
3529 3533          cred_t *cr,
3530 3534          caller_context_t *ct)
3531 3535  {
3532 3536          ssize_t resid_start = uiop->uio_resid;
3533 3537          zone_t  *zonep = curzone;
3534 3538          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3535 3539  
3536 3540          hrtime_t start = 0, lat;
3537 3541          ssize_t len;
3538 3542          int err;
3539 3543  
3540 3544          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3541 3545              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3542 3546                  start = gethrtime();
3543 3547  
3544 3548                  mutex_enter(&zonep->zone_vfs_lock);
3545 3549                  kstat_runq_enter(&zonep->zone_vfs_rwstats);
3546 3550                  mutex_exit(&zonep->zone_vfs_lock);
3547 3551          }
3548 3552  
3549 3553          VOPXID_MAP_CR(vp, cr);
3550 3554  
3551 3555          err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3552 3556          len = resid_start - uiop->uio_resid;
3553 3557  
3554 3558          VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3555 3559  
3556 3560          if (start != 0) {
3557 3561                  mutex_enter(&zonep->zone_vfs_lock);
3558 3562                  zonep->zone_vfs_rwstats.reads++;
3559 3563                  zonep->zone_vfs_rwstats.nread += len;
3560 3564                  kstat_runq_exit(&zonep->zone_vfs_rwstats);
3561 3565                  mutex_exit(&zonep->zone_vfs_lock);
3562 3566  
3563 3567                  lat = gethrtime() - start;
3564 3568  
3565 3569                  if (lat >= VOP_LATENCY_10MS) {
3566 3570                          if (lat < VOP_LATENCY_100MS)
3567 3571                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3568 3572                          else if (lat < VOP_LATENCY_1S) {
3569 3573                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3570 3574                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3571 3575                          } else {
3572 3576                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3573 3577                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3574 3578                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3575 3579                          }
3576 3580                  }
3577 3581          }
3578 3582  
3579 3583          return (err);
3580 3584  }
3581 3585  
3582 3586  int
3583 3587  fop_write(
3584 3588          vnode_t *vp,
3585 3589          uio_t *uiop,
3586 3590          int ioflag,
3587 3591          cred_t *cr,
3588 3592          caller_context_t *ct)
3589 3593  {
3590 3594          ssize_t resid_start = uiop->uio_resid;
3591 3595          zone_t  *zonep = curzone;
3592 3596          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3593 3597  
3594 3598          hrtime_t start = 0, lat;
3595 3599          ssize_t len;
3596 3600          int     err;
3597 3601  
3598 3602          /*
3599 3603           * For the purposes of VFS kstat consumers, the "waitq" calculation is
3600 3604           * repurposed as the active queue for VFS write operations.  There's no
3601 3605           * actual wait queue for VFS operations.
3602 3606           */
3603 3607          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3604 3608              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3605 3609                  start = gethrtime();
3606 3610  
3607 3611                  mutex_enter(&zonep->zone_vfs_lock);
3608 3612                  kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3609 3613                  mutex_exit(&zonep->zone_vfs_lock);
3610 3614          }
3611 3615  
3612 3616          VOPXID_MAP_CR(vp, cr);
3613 3617  
3614 3618          err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3615 3619          len = resid_start - uiop->uio_resid;
3616 3620  
3617 3621          VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3618 3622  
3619 3623          if (start != 0) {
3620 3624                  mutex_enter(&zonep->zone_vfs_lock);
3621 3625                  zonep->zone_vfs_rwstats.writes++;
3622 3626                  zonep->zone_vfs_rwstats.nwritten += len;
3623 3627                  kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3624 3628                  mutex_exit(&zonep->zone_vfs_lock);
3625 3629  
3626 3630                  lat = gethrtime() - start;
3627 3631  
3628 3632                  if (lat >= VOP_LATENCY_10MS) {
3629 3633                          if (lat < VOP_LATENCY_100MS)
3630 3634                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3631 3635                          else if (lat < VOP_LATENCY_1S) {
3632 3636                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3633 3637                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3634 3638                          } else {
3635 3639                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3636 3640                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3637 3641                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3638 3642                          }
3639 3643                  }
3640 3644          }
3641 3645  
3642 3646          return (err);
3643 3647  }
3644 3648  
3645 3649  int
3646 3650  fop_ioctl(
3647 3651          vnode_t *vp,
3648 3652          int cmd,
3649 3653          intptr_t arg,
3650 3654          int flag,
3651 3655          cred_t *cr,
3652 3656          int *rvalp,
3653 3657          caller_context_t *ct)
3654 3658  {
3655 3659          int     err;
3656 3660  
3657 3661          VOPXID_MAP_CR(vp, cr);
3658 3662  
3659 3663          err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3660 3664          VOPSTATS_UPDATE(vp, ioctl);
3661 3665          return (err);
3662 3666  }
3663 3667  
3664 3668  int
3665 3669  fop_setfl(
3666 3670          vnode_t *vp,
3667 3671          int oflags,
3668 3672          int nflags,
3669 3673          cred_t *cr,
3670 3674          caller_context_t *ct)
3671 3675  {
3672 3676          int     err;
3673 3677  
3674 3678          VOPXID_MAP_CR(vp, cr);
3675 3679  
3676 3680          err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3677 3681          VOPSTATS_UPDATE(vp, setfl);
3678 3682          return (err);
3679 3683  }
3680 3684  
3681 3685  int
3682 3686  fop_getattr(
3683 3687          vnode_t *vp,
3684 3688          vattr_t *vap,
3685 3689          int flags,
3686 3690          cred_t *cr,
3687 3691          caller_context_t *ct)
3688 3692  {
3689 3693          int     err;
3690 3694  
3691 3695          VOPXID_MAP_CR(vp, cr);
3692 3696  
3693 3697          /*
3694 3698           * If this file system doesn't understand the xvattr extensions
3695 3699           * then turn off the xvattr bit.
3696 3700           */
3697 3701          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3698 3702                  vap->va_mask &= ~AT_XVATTR;
3699 3703          }
3700 3704  
3701 3705          /*
3702 3706           * We're only allowed to skip the ACL check iff we used a 32 bit
3703 3707           * ACE mask with VOP_ACCESS() to determine permissions.
3704 3708           */
3705 3709          if ((flags & ATTR_NOACLCHECK) &&
3706 3710              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3707 3711                  return (EINVAL);
3708 3712          }
3709 3713          err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3710 3714          VOPSTATS_UPDATE(vp, getattr);
3711 3715          return (err);
3712 3716  }
3713 3717  
3714 3718  int
3715 3719  fop_setattr(
3716 3720          vnode_t *vp,
3717 3721          vattr_t *vap,
3718 3722          int flags,
3719 3723          cred_t *cr,
3720 3724          caller_context_t *ct)
3721 3725  {
3722 3726          int     err;
3723 3727  
3724 3728          VOPXID_MAP_CR(vp, cr);
3725 3729  
3726 3730          /*
3727 3731           * If this file system doesn't understand the xvattr extensions
3728 3732           * then turn off the xvattr bit.
3729 3733           */
3730 3734          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3731 3735                  vap->va_mask &= ~AT_XVATTR;
3732 3736          }
3733 3737  
3734 3738          /*
3735 3739           * We're only allowed to skip the ACL check iff we used a 32 bit
3736 3740           * ACE mask with VOP_ACCESS() to determine permissions.
3737 3741           */
3738 3742          if ((flags & ATTR_NOACLCHECK) &&
3739 3743              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3740 3744                  return (EINVAL);
3741 3745          }
3742 3746          err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3743 3747          VOPSTATS_UPDATE(vp, setattr);
3744 3748          return (err);
3745 3749  }
3746 3750  
3747 3751  int
3748 3752  fop_access(
3749 3753          vnode_t *vp,
3750 3754          int mode,
3751 3755          int flags,
3752 3756          cred_t *cr,
3753 3757          caller_context_t *ct)
3754 3758  {
3755 3759          int     err;
3756 3760  
3757 3761          if ((flags & V_ACE_MASK) &&
3758 3762              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3759 3763                  return (EINVAL);
3760 3764          }
3761 3765  
3762 3766          VOPXID_MAP_CR(vp, cr);
3763 3767  
3764 3768          err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3765 3769          VOPSTATS_UPDATE(vp, access);
3766 3770          return (err);
3767 3771  }
3768 3772  
3769 3773  int
3770 3774  fop_lookup(
3771 3775          vnode_t *dvp,
3772 3776          char *nm,
3773 3777          vnode_t **vpp,
3774 3778          pathname_t *pnp,
3775 3779          int flags,
3776 3780          vnode_t *rdir,
3777 3781          cred_t *cr,
3778 3782          caller_context_t *ct,
3779 3783          int *deflags,           /* Returned per-dirent flags */
3780 3784          pathname_t *ppnp)       /* Returned case-preserved name in directory */
3781 3785  {
3782 3786          int ret;
3783 3787  
3784 3788          /*
3785 3789           * If this file system doesn't support case-insensitive access
3786 3790           * and said access is requested, fail quickly.  It is required
3787 3791           * that if the vfs supports case-insensitive lookup, it also
3788 3792           * supports extended dirent flags.
3789 3793           */
3790 3794          if (flags & FIGNORECASE &&
3791 3795              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3792 3796              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3793 3797                  return (EINVAL);
3794 3798  
3795 3799          VOPXID_MAP_CR(dvp, cr);
3796 3800  
3797 3801          if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3798 3802                  ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3799 3803          } else {
3800 3804                  ret = (*(dvp)->v_op->vop_lookup)
3801 3805                      (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3802 3806          }
3803 3807          if (ret == 0 && *vpp) {
3804 3808                  VOPSTATS_UPDATE(*vpp, lookup);
3805 3809                  vn_updatepath(dvp, *vpp, nm);
3806 3810          }
3807 3811  
3808 3812          return (ret);
3809 3813  }
3810 3814  
3811 3815  int
3812 3816  fop_create(
3813 3817          vnode_t *dvp,
3814 3818          char *name,
3815 3819          vattr_t *vap,
3816 3820          vcexcl_t excl,
3817 3821          int mode,
3818 3822          vnode_t **vpp,
3819 3823          cred_t *cr,
3820 3824          int flags,
3821 3825          caller_context_t *ct,
3822 3826          vsecattr_t *vsecp)      /* ACL to set during create */
3823 3827  {
3824 3828          int ret;
3825 3829  
3826 3830          if (vsecp != NULL &&
3827 3831              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3828 3832                  return (EINVAL);
3829 3833          }
3830 3834          /*
3831 3835           * If this file system doesn't support case-insensitive access
3832 3836           * and said access is requested, fail quickly.
3833 3837           */
3834 3838          if (flags & FIGNORECASE &&
3835 3839              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3836 3840              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3837 3841                  return (EINVAL);
3838 3842  
3839 3843          VOPXID_MAP_CR(dvp, cr);
3840 3844  
3841 3845          ret = (*(dvp)->v_op->vop_create)
3842 3846              (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3843 3847          if (ret == 0 && *vpp) {
3844 3848                  VOPSTATS_UPDATE(*vpp, create);
3845 3849                  vn_updatepath(dvp, *vpp, name);
3846 3850          }
3847 3851  
3848 3852          return (ret);
3849 3853  }
3850 3854  
3851 3855  int
3852 3856  fop_remove(
3853 3857          vnode_t *dvp,
3854 3858          char *nm,
3855 3859          cred_t *cr,
3856 3860          caller_context_t *ct,
3857 3861          int flags)
3858 3862  {
3859 3863          int     err;
3860 3864  
3861 3865          /*
3862 3866           * If this file system doesn't support case-insensitive access
3863 3867           * and said access is requested, fail quickly.
3864 3868           */
3865 3869          if (flags & FIGNORECASE &&
3866 3870              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3867 3871              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3868 3872                  return (EINVAL);
3869 3873  
3870 3874          VOPXID_MAP_CR(dvp, cr);
3871 3875  
3872 3876          err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3873 3877          VOPSTATS_UPDATE(dvp, remove);
3874 3878          return (err);
3875 3879  }
3876 3880  
3877 3881  int
3878 3882  fop_link(
3879 3883          vnode_t *tdvp,
3880 3884          vnode_t *svp,
3881 3885          char *tnm,
3882 3886          cred_t *cr,
3883 3887          caller_context_t *ct,
3884 3888          int flags)
3885 3889  {
3886 3890          int     err;
3887 3891  
3888 3892          /*
3889 3893           * If the target file system doesn't support case-insensitive access
3890 3894           * and said access is requested, fail quickly.
3891 3895           */
3892 3896          if (flags & FIGNORECASE &&
3893 3897              (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3894 3898              vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3895 3899                  return (EINVAL);
3896 3900  
3897 3901          VOPXID_MAP_CR(tdvp, cr);
3898 3902  
3899 3903          err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3900 3904          VOPSTATS_UPDATE(tdvp, link);
3901 3905          return (err);
3902 3906  }
3903 3907  
3904 3908  int
3905 3909  fop_rename(
3906 3910          vnode_t *sdvp,
3907 3911          char *snm,
3908 3912          vnode_t *tdvp,
3909 3913          char *tnm,
3910 3914          cred_t *cr,
3911 3915          caller_context_t *ct,
3912 3916          int flags)
3913 3917  {
3914 3918          int     err;
3915 3919  
3916 3920          /*
3917 3921           * If the file system involved does not support
3918 3922           * case-insensitive access and said access is requested, fail
3919 3923           * quickly.
3920 3924           */
3921 3925          if (flags & FIGNORECASE &&
3922 3926              ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3923 3927              vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3924 3928                  return (EINVAL);
3925 3929  
3926 3930          VOPXID_MAP_CR(tdvp, cr);
3927 3931  
3928 3932          err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3929 3933          VOPSTATS_UPDATE(sdvp, rename);
3930 3934          return (err);
3931 3935  }
3932 3936  
3933 3937  int
3934 3938  fop_mkdir(
3935 3939          vnode_t *dvp,
3936 3940          char *dirname,
3937 3941          vattr_t *vap,
3938 3942          vnode_t **vpp,
3939 3943          cred_t *cr,
3940 3944          caller_context_t *ct,
3941 3945          int flags,
3942 3946          vsecattr_t *vsecp)      /* ACL to set during create */
3943 3947  {
3944 3948          int ret;
3945 3949  
3946 3950          if (vsecp != NULL &&
3947 3951              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3948 3952                  return (EINVAL);
3949 3953          }
3950 3954          /*
3951 3955           * If this file system doesn't support case-insensitive access
3952 3956           * and said access is requested, fail quickly.
3953 3957           */
3954 3958          if (flags & FIGNORECASE &&
3955 3959              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3956 3960              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3957 3961                  return (EINVAL);
3958 3962  
3959 3963          VOPXID_MAP_CR(dvp, cr);
3960 3964  
3961 3965          ret = (*(dvp)->v_op->vop_mkdir)
3962 3966              (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3963 3967          if (ret == 0 && *vpp) {
3964 3968                  VOPSTATS_UPDATE(*vpp, mkdir);
3965 3969                  vn_updatepath(dvp, *vpp, dirname);
3966 3970          }
3967 3971  
3968 3972          return (ret);
3969 3973  }
3970 3974  
3971 3975  int
3972 3976  fop_rmdir(
3973 3977          vnode_t *dvp,
3974 3978          char *nm,
3975 3979          vnode_t *cdir,
3976 3980          cred_t *cr,
3977 3981          caller_context_t *ct,
3978 3982          int flags)
3979 3983  {
3980 3984          int     err;
3981 3985  
3982 3986          /*
3983 3987           * If this file system doesn't support case-insensitive access
3984 3988           * and said access is requested, fail quickly.
3985 3989           */
3986 3990          if (flags & FIGNORECASE &&
3987 3991              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3988 3992              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3989 3993                  return (EINVAL);
3990 3994  
3991 3995          VOPXID_MAP_CR(dvp, cr);
3992 3996  
3993 3997          err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3994 3998          VOPSTATS_UPDATE(dvp, rmdir);
3995 3999          return (err);
3996 4000  }
3997 4001  
3998 4002  int
3999 4003  fop_readdir(
4000 4004          vnode_t *vp,
4001 4005          uio_t *uiop,
4002 4006          cred_t *cr,
4003 4007          int *eofp,
4004 4008          caller_context_t *ct,
4005 4009          int flags)
4006 4010  {
4007 4011          int     err;
4008 4012          ssize_t resid_start = uiop->uio_resid;
4009 4013  
4010 4014          /*
4011 4015           * If this file system doesn't support retrieving directory
4012 4016           * entry flags and said access is requested, fail quickly.
4013 4017           */
4014 4018          if (flags & V_RDDIR_ENTFLAGS &&
4015 4019              vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
4016 4020                  return (EINVAL);
4017 4021  
4018 4022          VOPXID_MAP_CR(vp, cr);
4019 4023  
4020 4024          err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
4021 4025          VOPSTATS_UPDATE_IO(vp, readdir,
4022 4026              readdir_bytes, (resid_start - uiop->uio_resid));
4023 4027          return (err);
4024 4028  }
4025 4029  
4026 4030  int
4027 4031  fop_symlink(
4028 4032          vnode_t *dvp,
4029 4033          char *linkname,
4030 4034          vattr_t *vap,
4031 4035          char *target,
4032 4036          cred_t *cr,
4033 4037          caller_context_t *ct,
4034 4038          int flags)
4035 4039  {
4036 4040          int     err;
4037 4041          xvattr_t xvattr;
4038 4042  
4039 4043          /*
4040 4044           * If this file system doesn't support case-insensitive access
4041 4045           * and said access is requested, fail quickly.
4042 4046           */
4043 4047          if (flags & FIGNORECASE &&
4044 4048              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
4045 4049              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
4046 4050                  return (EINVAL);
4047 4051  
4048 4052          VOPXID_MAP_CR(dvp, cr);
4049 4053  
4050 4054          /* check for reparse point */
4051 4055          if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
4052 4056              (strncmp(target, FS_REPARSE_TAG_STR,
4053 4057              strlen(FS_REPARSE_TAG_STR)) == 0)) {
4054 4058                  if (!fs_reparse_mark(target, vap, &xvattr))
4055 4059                          vap = (vattr_t *)&xvattr;
4056 4060          }
4057 4061  
4058 4062          err = (*(dvp)->v_op->vop_symlink)
4059 4063              (dvp, linkname, vap, target, cr, ct, flags);
4060 4064          VOPSTATS_UPDATE(dvp, symlink);
4061 4065          return (err);
4062 4066  }
4063 4067  
4064 4068  int
4065 4069  fop_readlink(
4066 4070          vnode_t *vp,
4067 4071          uio_t *uiop,
4068 4072          cred_t *cr,
4069 4073          caller_context_t *ct)
4070 4074  {
4071 4075          int     err;
4072 4076  
4073 4077          VOPXID_MAP_CR(vp, cr);
4074 4078  
4075 4079          err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
4076 4080          VOPSTATS_UPDATE(vp, readlink);
4077 4081          return (err);
4078 4082  }
4079 4083  
4080 4084  int
4081 4085  fop_fsync(
4082 4086          vnode_t *vp,
4083 4087          int syncflag,
4084 4088          cred_t *cr,
4085 4089          caller_context_t *ct)
4086 4090  {
4087 4091          int     err;
4088 4092  
4089 4093          VOPXID_MAP_CR(vp, cr);
4090 4094  
4091 4095          err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
4092 4096          VOPSTATS_UPDATE(vp, fsync);
4093 4097          return (err);
4094 4098  }
4095 4099  
4096 4100  void
4097 4101  fop_inactive(
4098 4102          vnode_t *vp,
4099 4103          cred_t *cr,
4100 4104          caller_context_t *ct)
4101 4105  {
4102 4106          /* Need to update stats before vop call since we may lose the vnode */
4103 4107          VOPSTATS_UPDATE(vp, inactive);
4104 4108  
4105 4109          VOPXID_MAP_CR(vp, cr);
4106 4110  
4107 4111          (*(vp)->v_op->vop_inactive)(vp, cr, ct);
4108 4112  }
4109 4113  
4110 4114  int
4111 4115  fop_fid(
4112 4116          vnode_t *vp,
4113 4117          fid_t *fidp,
4114 4118          caller_context_t *ct)
4115 4119  {
4116 4120          int     err;
4117 4121  
4118 4122          err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4119 4123          VOPSTATS_UPDATE(vp, fid);
4120 4124          return (err);
4121 4125  }
4122 4126  
4123 4127  int
4124 4128  fop_rwlock(
4125 4129          vnode_t *vp,
4126 4130          int write_lock,
4127 4131          caller_context_t *ct)
4128 4132  {
4129 4133          int     ret;
4130 4134  
4131 4135          ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4132 4136          VOPSTATS_UPDATE(vp, rwlock);
4133 4137          return (ret);
4134 4138  }
4135 4139  
4136 4140  void
4137 4141  fop_rwunlock(
4138 4142          vnode_t *vp,
4139 4143          int write_lock,
4140 4144          caller_context_t *ct)
4141 4145  {
4142 4146          (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4143 4147          VOPSTATS_UPDATE(vp, rwunlock);
4144 4148  }
4145 4149  
4146 4150  int
4147 4151  fop_seek(
4148 4152          vnode_t *vp,
4149 4153          offset_t ooff,
4150 4154          offset_t *noffp,
4151 4155          caller_context_t *ct)
4152 4156  {
4153 4157          int     err;
4154 4158  
4155 4159          err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4156 4160          VOPSTATS_UPDATE(vp, seek);
4157 4161          return (err);
4158 4162  }
4159 4163  
4160 4164  int
4161 4165  fop_cmp(
4162 4166          vnode_t *vp1,
4163 4167          vnode_t *vp2,
4164 4168          caller_context_t *ct)
4165 4169  {
4166 4170          int     err;
4167 4171  
4168 4172          err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4169 4173          VOPSTATS_UPDATE(vp1, cmp);
4170 4174          return (err);
4171 4175  }
4172 4176  
4173 4177  int
4174 4178  fop_frlock(
4175 4179          vnode_t *vp,
4176 4180          int cmd,
4177 4181          flock64_t *bfp,
4178 4182          int flag,
4179 4183          offset_t offset,
4180 4184          struct flk_callback *flk_cbp,
4181 4185          cred_t *cr,
4182 4186          caller_context_t *ct)
4183 4187  {
4184 4188          int     err;
4185 4189  
4186 4190          VOPXID_MAP_CR(vp, cr);
4187 4191  
4188 4192          err = (*(vp)->v_op->vop_frlock)
4189 4193              (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4190 4194          VOPSTATS_UPDATE(vp, frlock);
4191 4195          return (err);
4192 4196  }
4193 4197  
4194 4198  int
4195 4199  fop_space(
4196 4200          vnode_t *vp,
4197 4201          int cmd,
4198 4202          flock64_t *bfp,
4199 4203          int flag,
4200 4204          offset_t offset,
4201 4205          cred_t *cr,
4202 4206          caller_context_t *ct)
4203 4207  {
4204 4208          int     err;
4205 4209  
4206 4210          VOPXID_MAP_CR(vp, cr);
4207 4211  
4208 4212          err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4209 4213          VOPSTATS_UPDATE(vp, space);
4210 4214          return (err);
4211 4215  }
4212 4216  
4213 4217  int
4214 4218  fop_realvp(
4215 4219          vnode_t *vp,
4216 4220          vnode_t **vpp,
4217 4221          caller_context_t *ct)
4218 4222  {
4219 4223          int     err;
4220 4224  
4221 4225          err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4222 4226          VOPSTATS_UPDATE(vp, realvp);
4223 4227          return (err);
4224 4228  }
4225 4229  
4226 4230  int
4227 4231  fop_getpage(
4228 4232          vnode_t *vp,
4229 4233          offset_t off,
4230 4234          size_t len,
4231 4235          uint_t *protp,
4232 4236          page_t **plarr,
4233 4237          size_t plsz,
4234 4238          struct seg *seg,
4235 4239          caddr_t addr,
4236 4240          enum seg_rw rw,
4237 4241          cred_t *cr,
4238 4242          caller_context_t *ct)
4239 4243  {
4240 4244          int     err;
4241 4245  
4242 4246          VOPXID_MAP_CR(vp, cr);
4243 4247  
4244 4248          err = (*(vp)->v_op->vop_getpage)
4245 4249              (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4246 4250          VOPSTATS_UPDATE(vp, getpage);
4247 4251          return (err);
4248 4252  }
4249 4253  
4250 4254  int
4251 4255  fop_putpage(
4252 4256          vnode_t *vp,
4253 4257          offset_t off,
4254 4258          size_t len,
4255 4259          int flags,
4256 4260          cred_t *cr,
4257 4261          caller_context_t *ct)
4258 4262  {
4259 4263          int     err;
4260 4264  
4261 4265          VOPXID_MAP_CR(vp, cr);
4262 4266  
4263 4267          err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4264 4268          VOPSTATS_UPDATE(vp, putpage);
4265 4269          return (err);
4266 4270  }
4267 4271  
4268 4272  int
4269 4273  fop_map(
4270 4274          vnode_t *vp,
4271 4275          offset_t off,
4272 4276          struct as *as,
4273 4277          caddr_t *addrp,
4274 4278          size_t len,
4275 4279          uchar_t prot,
4276 4280          uchar_t maxprot,
4277 4281          uint_t flags,
4278 4282          cred_t *cr,
4279 4283          caller_context_t *ct)
4280 4284  {
4281 4285          int     err;
4282 4286  
4283 4287          VOPXID_MAP_CR(vp, cr);
4284 4288  
4285 4289          err = (*(vp)->v_op->vop_map)
4286 4290              (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4287 4291          VOPSTATS_UPDATE(vp, map);
4288 4292          return (err);
4289 4293  }
4290 4294  
4291 4295  int
4292 4296  fop_addmap(
4293 4297          vnode_t *vp,
4294 4298          offset_t off,
4295 4299          struct as *as,
4296 4300          caddr_t addr,
4297 4301          size_t len,
4298 4302          uchar_t prot,
4299 4303          uchar_t maxprot,
4300 4304          uint_t flags,
4301 4305          cred_t *cr,
4302 4306          caller_context_t *ct)
4303 4307  {
4304 4308          int error;
4305 4309          u_longlong_t delta;
4306 4310  
4307 4311          VOPXID_MAP_CR(vp, cr);
4308 4312  
4309 4313          error = (*(vp)->v_op->vop_addmap)
4310 4314              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4311 4315  
4312 4316          if ((!error) && (vp->v_type == VREG)) {
4313 4317                  delta = (u_longlong_t)btopr(len);
4314 4318                  /*
4315 4319                   * If file is declared MAP_PRIVATE, it can't be written back
4316 4320                   * even if open for write. Handle as read.
4317 4321                   */
4318 4322                  if (flags & MAP_PRIVATE) {
4319 4323                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4320 4324                              (int64_t)delta);
4321 4325                  } else {
4322 4326                          /*
4323 4327                           * atomic_add_64 forces the fetch of a 64 bit value to
4324 4328                           * be atomic on 32 bit machines
4325 4329                           */
4326 4330                          if (maxprot & PROT_WRITE)
4327 4331                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4328 4332                                      (int64_t)delta);
4329 4333                          if (maxprot & PROT_READ)
4330 4334                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4331 4335                                      (int64_t)delta);
4332 4336                          if (maxprot & PROT_EXEC)
4333 4337                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4334 4338                                      (int64_t)delta);
4335 4339                  }
4336 4340          }
4337 4341          VOPSTATS_UPDATE(vp, addmap);
4338 4342          return (error);
4339 4343  }
4340 4344  
4341 4345  int
4342 4346  fop_delmap(
4343 4347          vnode_t *vp,
4344 4348          offset_t off,
4345 4349          struct as *as,
4346 4350          caddr_t addr,
4347 4351          size_t len,
4348 4352          uint_t prot,
4349 4353          uint_t maxprot,
4350 4354          uint_t flags,
4351 4355          cred_t *cr,
4352 4356          caller_context_t *ct)
4353 4357  {
4354 4358          int error;
4355 4359          u_longlong_t delta;
4356 4360  
4357 4361          VOPXID_MAP_CR(vp, cr);
4358 4362  
4359 4363          error = (*(vp)->v_op->vop_delmap)
4360 4364              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4361 4365  
4362 4366          /*
4363 4367           * NFS calls into delmap twice, the first time
4364 4368           * it simply establishes a callback mechanism and returns EAGAIN
4365 4369           * while the real work is being done upon the second invocation.
4366 4370           * We have to detect this here and only decrement the counts upon
4367 4371           * the second delmap request.
4368 4372           */
4369 4373          if ((error != EAGAIN) && (vp->v_type == VREG)) {
4370 4374  
4371 4375                  delta = (u_longlong_t)btopr(len);
4372 4376  
4373 4377                  if (flags & MAP_PRIVATE) {
4374 4378                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4375 4379                              (int64_t)(-delta));
4376 4380                  } else {
4377 4381                          /*
4378 4382                           * atomic_add_64 forces the fetch of a 64 bit value
4379 4383                           * to be atomic on 32 bit machines
4380 4384                           */
4381 4385                          if (maxprot & PROT_WRITE)
4382 4386                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4383 4387                                      (int64_t)(-delta));
4384 4388                          if (maxprot & PROT_READ)
4385 4389                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4386 4390                                      (int64_t)(-delta));
4387 4391                          if (maxprot & PROT_EXEC)
4388 4392                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4389 4393                                      (int64_t)(-delta));
4390 4394                  }
4391 4395          }
4392 4396          VOPSTATS_UPDATE(vp, delmap);
4393 4397          return (error);
4394 4398  }
4395 4399  
4396 4400  
4397 4401  int
4398 4402  fop_poll(
4399 4403          vnode_t *vp,
4400 4404          short events,
4401 4405          int anyyet,
4402 4406          short *reventsp,
4403 4407          struct pollhead **phpp,
4404 4408          caller_context_t *ct)
4405 4409  {
4406 4410          int     err;
4407 4411  
4408 4412          err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4409 4413          VOPSTATS_UPDATE(vp, poll);
4410 4414          return (err);
4411 4415  }
4412 4416  
4413 4417  int
4414 4418  fop_dump(
4415 4419          vnode_t *vp,
4416 4420          caddr_t addr,
4417 4421          offset_t lbdn,
4418 4422          offset_t dblks,
4419 4423          caller_context_t *ct)
4420 4424  {
4421 4425          int     err;
4422 4426  
4423 4427          /* ensure lbdn and dblks can be passed safely to bdev_dump */
4424 4428          if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4425 4429                  return (EIO);
4426 4430  
4427 4431          err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4428 4432          VOPSTATS_UPDATE(vp, dump);
4429 4433          return (err);
4430 4434  }
4431 4435  
4432 4436  int
4433 4437  fop_pathconf(
4434 4438          vnode_t *vp,
4435 4439          int cmd,
4436 4440          ulong_t *valp,
4437 4441          cred_t *cr,
4438 4442          caller_context_t *ct)
4439 4443  {
4440 4444          int     err;
4441 4445  
4442 4446          VOPXID_MAP_CR(vp, cr);
4443 4447  
4444 4448          err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4445 4449          VOPSTATS_UPDATE(vp, pathconf);
4446 4450          return (err);
4447 4451  }
4448 4452  
4449 4453  int
4450 4454  fop_pageio(
4451 4455          vnode_t *vp,
4452 4456          struct page *pp,
4453 4457          u_offset_t io_off,
4454 4458          size_t io_len,
4455 4459          int flags,
4456 4460          cred_t *cr,
4457 4461          caller_context_t *ct)
4458 4462  {
4459 4463          int     err;
4460 4464  
4461 4465          VOPXID_MAP_CR(vp, cr);
4462 4466  
4463 4467          err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4464 4468          VOPSTATS_UPDATE(vp, pageio);
4465 4469          return (err);
4466 4470  }
4467 4471  
4468 4472  int
4469 4473  fop_dumpctl(
4470 4474          vnode_t *vp,
4471 4475          int action,
4472 4476          offset_t *blkp,
4473 4477          caller_context_t *ct)
4474 4478  {
4475 4479          int     err;
4476 4480          err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4477 4481          VOPSTATS_UPDATE(vp, dumpctl);
4478 4482          return (err);
4479 4483  }
4480 4484  
4481 4485  void
4482 4486  fop_dispose(
4483 4487          vnode_t *vp,
4484 4488          page_t *pp,
4485 4489          int flag,
4486 4490          int dn,
4487 4491          cred_t *cr,
4488 4492          caller_context_t *ct)
4489 4493  {
4490 4494          /* Must do stats first since it's possible to lose the vnode */
4491 4495          VOPSTATS_UPDATE(vp, dispose);
4492 4496  
4493 4497          VOPXID_MAP_CR(vp, cr);
4494 4498  
4495 4499          (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4496 4500  }
4497 4501  
4498 4502  int
4499 4503  fop_setsecattr(
4500 4504          vnode_t *vp,
4501 4505          vsecattr_t *vsap,
4502 4506          int flag,
4503 4507          cred_t *cr,
4504 4508          caller_context_t *ct)
4505 4509  {
4506 4510          int     err;
4507 4511  
4508 4512          VOPXID_MAP_CR(vp, cr);
4509 4513  
4510 4514          /*
4511 4515           * We're only allowed to skip the ACL check iff we used a 32 bit
4512 4516           * ACE mask with VOP_ACCESS() to determine permissions.
4513 4517           */
4514 4518          if ((flag & ATTR_NOACLCHECK) &&
4515 4519              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4516 4520                  return (EINVAL);
4517 4521          }
4518 4522          err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4519 4523          VOPSTATS_UPDATE(vp, setsecattr);
4520 4524          return (err);
4521 4525  }
4522 4526  
4523 4527  int
4524 4528  fop_getsecattr(
4525 4529          vnode_t *vp,
4526 4530          vsecattr_t *vsap,
4527 4531          int flag,
4528 4532          cred_t *cr,
4529 4533          caller_context_t *ct)
4530 4534  {
4531 4535          int     err;
4532 4536  
4533 4537          /*
4534 4538           * We're only allowed to skip the ACL check iff we used a 32 bit
4535 4539           * ACE mask with VOP_ACCESS() to determine permissions.
4536 4540           */
4537 4541          if ((flag & ATTR_NOACLCHECK) &&
4538 4542              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4539 4543                  return (EINVAL);
4540 4544          }
4541 4545  
4542 4546          VOPXID_MAP_CR(vp, cr);
4543 4547  
4544 4548          err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4545 4549          VOPSTATS_UPDATE(vp, getsecattr);
4546 4550          return (err);
4547 4551  }
4548 4552  
4549 4553  int
4550 4554  fop_shrlock(
4551 4555          vnode_t *vp,
4552 4556          int cmd,
4553 4557          struct shrlock *shr,
4554 4558          int flag,
4555 4559          cred_t *cr,
4556 4560          caller_context_t *ct)
4557 4561  {
4558 4562          int     err;
4559 4563  
4560 4564          VOPXID_MAP_CR(vp, cr);
4561 4565  
4562 4566          err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4563 4567          VOPSTATS_UPDATE(vp, shrlock);
4564 4568          return (err);
4565 4569  }
4566 4570  
4567 4571  int
4568 4572  fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4569 4573      caller_context_t *ct)
4570 4574  {
4571 4575          int     err;
4572 4576  
4573 4577          err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4574 4578          VOPSTATS_UPDATE(vp, vnevent);
4575 4579          return (err);
4576 4580  }
4577 4581  
4578 4582  int
4579 4583  fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4580 4584      caller_context_t *ct)
4581 4585  {
4582 4586          int err;
4583 4587  
4584 4588          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4585 4589                  return (ENOTSUP);
4586 4590          err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4587 4591          VOPSTATS_UPDATE(vp, reqzcbuf);
4588 4592          return (err);
4589 4593  }
4590 4594  
4591 4595  int
4592 4596  fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4593 4597  {
4594 4598          int err;
4595 4599  
4596 4600          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4597 4601                  return (ENOTSUP);
4598 4602          err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4599 4603          VOPSTATS_UPDATE(vp, retzcbuf);
4600 4604          return (err);
4601 4605  }
4602 4606  
/*
 * Do-nothing destructor
 *      Stands in for a caller-supplied NULL destructor, because a NULL
 *      entry in the destructor table means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4611 4615  
4612 4616  /*
4613 4617   * Create a key (index into per vnode array)
4614 4618   *      Locks out vsd_create, vsd_destroy, and vsd_free
4615 4619   *      May allocate memory with lock held
4616 4620   */
4617 4621  void
4618 4622  vsd_create(uint_t *keyp, void (*destructor)(void *))
4619 4623  {
4620 4624          int     i;
4621 4625          uint_t  nkeys;
4622 4626  
4623 4627          /*
4624 4628           * if key is allocated, do nothing
4625 4629           */
4626 4630          mutex_enter(&vsd_lock);
4627 4631          if (*keyp) {
4628 4632                  mutex_exit(&vsd_lock);
4629 4633                  return;
4630 4634          }
4631 4635          /*
4632 4636           * find an unused key
4633 4637           */
4634 4638          if (destructor == NULL)
4635 4639                  destructor = vsd_defaultdestructor;
4636 4640  
4637 4641          for (i = 0; i < vsd_nkeys; ++i)
4638 4642                  if (vsd_destructor[i] == NULL)
4639 4643                          break;
4640 4644  
4641 4645          /*
4642 4646           * if no unused keys, increase the size of the destructor array
4643 4647           */
4644 4648          if (i == vsd_nkeys) {
4645 4649                  if ((nkeys = (vsd_nkeys << 1)) == 0)
4646 4650                          nkeys = 1;
4647 4651                  vsd_destructor =
4648 4652                      (void (**)(void *))vsd_realloc((void *)vsd_destructor,
4649 4653                      (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
4650 4654                      (size_t)(nkeys * sizeof (void (*)(void *))));
4651 4655                  vsd_nkeys = nkeys;
4652 4656          }
4653 4657  
4654 4658          /*
4655 4659           * allocate the next available unused key
4656 4660           */
4657 4661          vsd_destructor[i] = destructor;
4658 4662          *keyp = i + 1;
4659 4663  
4660 4664          /* create vsd_list, if it doesn't exist */
4661 4665          if (vsd_list == NULL) {
4662 4666                  vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
4663 4667                  list_create(vsd_list, sizeof (struct vsd_node),
4664 4668                      offsetof(struct vsd_node, vs_nodes));
4665 4669          }
4666 4670  
4667 4671          mutex_exit(&vsd_lock);
4668 4672  }
4669 4673  
4670 4674  /*
4671 4675   * Destroy a key
4672 4676   *
4673 4677   * Assumes that the caller is preventing vsd_set and vsd_get
4674 4678   * Locks out vsd_create, vsd_destroy, and vsd_free
4675 4679   * May free memory with lock held
4676 4680   */
4677 4681  void
4678 4682  vsd_destroy(uint_t *keyp)
4679 4683  {
4680 4684          uint_t key;
4681 4685          struct vsd_node *vsd;
4682 4686  
4683 4687          /*
4684 4688           * protect the key namespace and our destructor lists
4685 4689           */
4686 4690          mutex_enter(&vsd_lock);
4687 4691          key = *keyp;
4688 4692          *keyp = 0;
4689 4693  
4690 4694          ASSERT(key <= vsd_nkeys);
4691 4695  
4692 4696          /*
4693 4697           * if the key is valid
4694 4698           */
4695 4699          if (key != 0) {
4696 4700                  uint_t k = key - 1;
4697 4701                  /*
4698 4702                   * for every vnode with VSD, call key's destructor
4699 4703                   */
4700 4704                  for (vsd = list_head(vsd_list); vsd != NULL;
4701 4705                      vsd = list_next(vsd_list, vsd)) {
4702 4706                          /*
4703 4707                           * no VSD for key in this vnode
4704 4708                           */
4705 4709                          if (key > vsd->vs_nkeys)
4706 4710                                  continue;
4707 4711                          /*
4708 4712                           * call destructor for key
4709 4713                           */
4710 4714                          if (vsd->vs_value[k] && vsd_destructor[k])
4711 4715                                  (*vsd_destructor[k])(vsd->vs_value[k]);
4712 4716                          /*
4713 4717                           * reset value for key
4714 4718                           */
4715 4719                          vsd->vs_value[k] = NULL;
4716 4720                  }
4717 4721                  /*
4718 4722                   * actually free the key (NULL destructor == unused)
4719 4723                   */
4720 4724                  vsd_destructor[k] = NULL;
4721 4725          }
4722 4726  
4723 4727          mutex_exit(&vsd_lock);
4724 4728  }
4725 4729  
4726 4730  /*
4727 4731   * Quickly return the per vnode value that was stored with the specified key
4728 4732   * Assumes the caller is protecting key from vsd_create and vsd_destroy
4729 4733   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4730 4734   */
4731 4735  void *
4732 4736  vsd_get(vnode_t *vp, uint_t key)
4733 4737  {
4734 4738          struct vsd_node *vsd;
4735 4739  
4736 4740          ASSERT(vp != NULL);
4737 4741          ASSERT(mutex_owned(&vp->v_vsd_lock));
4738 4742  
4739 4743          vsd = vp->v_vsd;
4740 4744  
4741 4745          if (key && vsd != NULL && key <= vsd->vs_nkeys)
4742 4746                  return (vsd->vs_value[key - 1]);
4743 4747          return (NULL);
4744 4748  }
4745 4749  
4746 4750  /*
4747 4751   * Set a per vnode value indexed with the specified key
4748 4752   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4749 4753   */
4750 4754  int
4751 4755  vsd_set(vnode_t *vp, uint_t key, void *value)
4752 4756  {
4753 4757          struct vsd_node *vsd;
4754 4758  
4755 4759          ASSERT(vp != NULL);
4756 4760          ASSERT(mutex_owned(&vp->v_vsd_lock));
4757 4761  
4758 4762          if (key == 0)
4759 4763                  return (EINVAL);
4760 4764  
4761 4765          vsd = vp->v_vsd;
4762 4766          if (vsd == NULL)
4763 4767                  vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);
4764 4768  
4765 4769          /*
4766 4770           * If the vsd was just allocated, vs_nkeys will be 0, so the following
4767 4771           * code won't happen and we will continue down and allocate space for
4768 4772           * the vs_value array.
4769 4773           * If the caller is replacing one value with another, then it is up
4770 4774           * to the caller to free/rele/destroy the previous value (if needed).
4771 4775           */
4772 4776          if (key <= vsd->vs_nkeys) {
4773 4777                  vsd->vs_value[key - 1] = value;
4774 4778                  return (0);
4775 4779          }
4776 4780  
4777 4781          ASSERT(key <= vsd_nkeys);
4778 4782  
4779 4783          if (vsd->vs_nkeys == 0) {
4780 4784                  mutex_enter(&vsd_lock); /* lock out vsd_destroy() */
4781 4785                  /*
4782 4786                   * Link onto list of all VSD nodes.
4783 4787                   */
4784 4788                  list_insert_head(vsd_list, vsd);
4785 4789                  mutex_exit(&vsd_lock);
4786 4790          }
4787 4791  
4788 4792          /*
4789 4793           * Allocate vnode local storage and set the value for key
4790 4794           */
4791 4795          vsd->vs_value = vsd_realloc(vsd->vs_value,
4792 4796              vsd->vs_nkeys * sizeof (void *),
4793 4797              key * sizeof (void *));
4794 4798          vsd->vs_nkeys = key;
4795 4799          vsd->vs_value[key - 1] = value;
4796 4800  
4797 4801          return (0);
4798 4802  }
4799 4803  
4800 4804  /*
4801 4805   * Called from vn_free() to run the destructor function for each vsd
4802 4806   *      Locks out vsd_create and vsd_destroy
4803 4807   *      Assumes that the destructor *DOES NOT* use vsd
4804 4808   */
4805 4809  void
4806 4810  vsd_free(vnode_t *vp)
4807 4811  {
4808 4812          int i;
4809 4813          struct vsd_node *vsd = vp->v_vsd;
4810 4814  
4811 4815          if (vsd == NULL)
4812 4816                  return;
4813 4817  
4814 4818          if (vsd->vs_nkeys == 0) {
4815 4819                  kmem_free(vsd, sizeof (*vsd));
4816 4820                  vp->v_vsd = NULL;
4817 4821                  return;
4818 4822          }
4819 4823  
4820 4824          /*
4821 4825           * lock out vsd_create and vsd_destroy, call
4822 4826           * the destructor, and mark the value as destroyed.
4823 4827           */
4824 4828          mutex_enter(&vsd_lock);
4825 4829  
4826 4830          for (i = 0; i < vsd->vs_nkeys; i++) {
4827 4831                  if (vsd->vs_value[i] && vsd_destructor[i])
4828 4832                          (*vsd_destructor[i])(vsd->vs_value[i]);
4829 4833                  vsd->vs_value[i] = NULL;
4830 4834          }
4831 4835  
4832 4836          /*
4833 4837           * remove from linked list of VSD nodes
4834 4838           */
4835 4839          list_remove(vsd_list, vsd);
4836 4840  
4837 4841          mutex_exit(&vsd_lock);
4838 4842  
4839 4843          /*
4840 4844           * free up the VSD
4841 4845           */
4842 4846          kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
4843 4847          kmem_free(vsd, sizeof (struct vsd_node));
4844 4848          vp->v_vsd = NULL;
4845 4849  }
4846 4850  
4847 4851  /*
4848 4852   * realloc
4849 4853   */
4850 4854  static void *
4851 4855  vsd_realloc(void *old, size_t osize, size_t nsize)
4852 4856  {
4853 4857          void *new;
4854 4858  
4855 4859          new = kmem_zalloc(nsize, KM_SLEEP);
4856 4860          if (old) {
4857 4861                  bcopy(old, new, osize);
4858 4862                  kmem_free(old, osize);
4859 4863          }
4860 4864          return (new);
4861 4865  }
4862 4866  
4863 4867  /*
4864 4868   * Setup the extensible system attribute for creating a reparse point.
4865 4869   * The symlink data 'target' is validated for proper format of a reparse
4866 4870   * string and a check also made to make sure the symlink data does not
4867 4871   * point to an existing file.
4868 4872   *
4869 4873   * return 0 if ok else -1.
4870 4874   */
4871 4875  static int
4872 4876  fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4873 4877  {
4874 4878          xoptattr_t *xoap;
4875 4879  
4876 4880          if ((!target) || (!vap) || (!xvattr))
4877 4881                  return (-1);
4878 4882  
4879 4883          /* validate reparse string */
4880 4884          if (reparse_validate((const char *)target))
4881 4885                  return (-1);
4882 4886  
4883 4887          xva_init(xvattr);
4884 4888          xvattr->xva_vattr = *vap;
4885 4889          xvattr->xva_vattr.va_mask |= AT_XVATTR;
4886 4890          xoap = xva_getxoptattr(xvattr);
4887 4891          ASSERT(xoap);
4888 4892          XVA_SET_REQ(xvattr, XAT_REPARSE);
4889 4893          xoap->xoa_reparse = 1;
4890 4894  
4891 4895          return (0);
4892 4896  }
4893 4897  
4894 4898  /*
4895 4899   * Function to check whether a symlink is a reparse point.
4896 4900   * Return B_TRUE if it is a reparse point, else return B_FALSE
4897 4901   */
4898 4902  boolean_t
4899 4903  vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4900 4904  {
4901 4905          xvattr_t xvattr;
4902 4906          xoptattr_t *xoap;
4903 4907  
4904 4908          if ((vp->v_type != VLNK) ||
4905 4909              !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4906 4910                  return (B_FALSE);
4907 4911  
4908 4912          xva_init(&xvattr);
4909 4913          xoap = xva_getxoptattr(&xvattr);
4910 4914          ASSERT(xoap);
4911 4915          XVA_SET_REQ(&xvattr, XAT_REPARSE);
4912 4916  
4913 4917          if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4914 4918                  return (B_FALSE);
4915 4919  
4916 4920          if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4917 4921              (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4918 4922                  return (B_FALSE);
4919 4923  
4920 4924          return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4921 4925  }

↓ open down ↓

3955 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX