Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/vnode.c
          +++ new/usr/src/uts/common/fs/vnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2016, Joyent, Inc.
       24 + * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  /*
  31   31   * University Copyright- Copyright (c) 1982, 1986, 1988
  32   32   * The Regents of the University of California
  33   33   * All Rights Reserved
  34   34   *
  35   35   * University Acknowledgment- Portions of this document are derived from
  36   36   * software developed by the University of California, Berkeley, and its
  37   37   * contributors.
  38   38   */
  39   39  
  40   40  #include <sys/types.h>
  41   41  #include <sys/param.h>
  42   42  #include <sys/t_lock.h>
  43   43  #include <sys/errno.h>
  44   44  #include <sys/cred.h>
  45   45  #include <sys/user.h>
  46   46  #include <sys/uio.h>
  47   47  #include <sys/file.h>
  48   48  #include <sys/pathname.h>
  49   49  #include <sys/vfs.h>
  50   50  #include <sys/vfs_opreg.h>
  51   51  #include <sys/vnode.h>
  52   52  #include <sys/rwstlock.h>
  53   53  #include <sys/fem.h>
  54   54  #include <sys/stat.h>
  55   55  #include <sys/mode.h>
  56   56  #include <sys/conf.h>
  57   57  #include <sys/sysmacros.h>
  58   58  #include <sys/cmn_err.h>
  
    | 
      ↓ open down ↓ | 
    24 lines elided | 
    
      ↑ open up ↑ | 
  
  59   59  #include <sys/systm.h>
  60   60  #include <sys/kmem.h>
  61   61  #include <sys/debug.h>
  62   62  #include <c2/audit.h>
  63   63  #include <sys/acl.h>
  64   64  #include <sys/nbmlock.h>
  65   65  #include <sys/fcntl.h>
  66   66  #include <fs/fs_subr.h>
  67   67  #include <sys/taskq.h>
  68   68  #include <fs/fs_reparse.h>
  69      -#include <sys/time.h>
  70      -#include <sys/sdt.h>
  71   69  
/*
 * Determine if this vnode is a file that is read-only: it must not be a
 * device node (VCHR/VBLK) or a FIFO, and vn_is_readonly() must report it
 * as residing on a read-only filesystem (or otherwise non-writable).
 */
#define ISROFILE(vp)    \
        ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
            (vp)->v_type != VFIFO && vn_is_readonly(vp))
  76   74  
  77   75  /* Tunable via /etc/system; used only by admin/install */
  78   76  int nfs_global_client_only;
  79   77  
  80   78  /*
  81   79   * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  82   80   * number of entries as and parallel to the vfssw table.  (Arguably, it could
  83   81   * be part of the vfssw table.)  Once it's initialized, it's accessed using
  84   82   * the same fstype index that is used to index into the vfssw table.
  85   83   */
  86   84  vopstats_t **vopstats_fstype;
  87   85  
  88   86  /* vopstats initialization template used for fast initialization via bcopy() */
  89   87  static vopstats_t *vs_templatep;
  90   88  
  91   89  /* Kmem cache handle for vsk_anchor_t allocations */
  92   90  kmem_cache_t *vsk_anchor_cache;
  93   91  
  94   92  /* file events cleanup routine */
  95   93  extern void free_fopdata(vnode_t *);
  96   94  
  
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
  
  97   95  /*
  98   96   * Root of AVL tree for the kstats associated with vopstats.  Lock protects
  99   97   * updates to vsktat_tree.
 100   98   */
 101   99  avl_tree_t      vskstat_tree;
 102  100  kmutex_t        vskstat_tree_lock;
 103  101  
 104  102  /* Global variable which enables/disables the vopstats collection */
 105  103  int vopstats_enabled = 1;
 106  104  
 107      -/* Global used for empty/invalid v_path */
 108      -char *vn_vpath_empty = "";
 109      -
 110  105  /*
 111  106   * forward declarations for internal vnode specific data (vsd)
 112  107   */
 113  108  static void *vsd_realloc(void *, size_t, size_t);
 114  109  
 115  110  /*
 116  111   * forward declarations for reparse point functions
 117  112   */
 118  113  static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 119  114  
 120  115  /*
 121  116   * VSD -- VNODE SPECIFIC DATA
 122  117   * The v_data pointer is typically used by a file system to store a
 123  118   * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 124  119   * However, there are times when additional project private data needs
 125  120   * to be stored separately from the data (node) pointed to by v_data.
 126  121   * This additional data could be stored by the file system itself or
 127  122   * by a completely different kernel entity.  VSD provides a way for
 128  123   * callers to obtain a key and store a pointer to private data associated
 129  124   * with a vnode.
 130  125   *
 131  126   * Callers are responsible for protecting the vsd by holding v_vsd_lock
 132  127   * for calls to vsd_set() and vsd_get().
 133  128   */
 134  129  
 135  130  /*
 136  131   * vsd_lock protects:
 137  132   *   vsd_nkeys - creation and deletion of vsd keys
 138  133   *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 139  134   *   vsd_destructor - adding and removing destructors to the list
 140  135   */
 141  136  static kmutex_t         vsd_lock;
 142  137  static uint_t           vsd_nkeys;       /* size of destructor array */
 143  138  /* list of vsd_node's */
 144  139  static list_t *vsd_list = NULL;
 145  140  /* per-key destructor funcs */
 146  141  static void             (**vsd_destructor)(void *);
 147  142  
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 * Rundown of the variables:
 * vp - Pointer to the vnode
 * counter - Partial name structure member to update in vopstats for counts
 * bytecounter - Partial name structure member to update in vopstats for bytes
 * bytesval - Value to update in vopstats for bytes
 * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 */

/*
 * Bump the per-vfs n<counter> kstat and fire the fsinfo:::<counter> DTrace
 * probe; also bump the per-fstype copy when one is attached to the vfs.
 * A no-op when the vfs lacks VFS_STATS or the vnode is VBAD.
 */
#define VOPSTATS_UPDATE(vp, counter) {                                  \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
                (*stataddr)++;                                          \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                }                                                       \
        }                                                               \
}
 179  174  
/*
 * As VOPSTATS_UPDATE(), but additionally accumulates bytesval into the
 * per-vfs (and, when present, per-fstype) bytecounter, and passes the
 * byte count to the fsinfo:::<counter> DTrace probe.  Used for vnode
 * ops that transfer data (read, write, readdir).
 */
#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
                (*stataddr)++;                                          \
                vsp->bytecounter.value.ui64 += bytesval;                \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                        vsp->bytecounter.value.ui64 += bytesval;        \
                }                                                       \
        }                                                               \
}
 197  192  
 198  193  /*
 199  194   * If the filesystem does not support XIDs map credential
 200  195   * If the vfsp is NULL, perhaps we should also map?
  
    | 
      ↓ open down ↓ | 
    81 lines elided | 
    
      ↑ open up ↑ | 
  
 201  196   */
/*
 * Replace cr with its mapped credential (crgetmapped()) when the vnode's
 * filesystem does not support XIDs (VFS_XID clear).  A NULL vfsp is left
 * unmapped; see the question raised in the comment above.
 */
#define VOPXID_MAP_CR(vp, cr)   {                                       \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)            \
                cr = crgetmapped(cr);                                   \
        }
 207  202  
 208  203  #define VOP_LATENCY_10MS        10000000
 209  204  #define VOP_LATENCY_100MS       100000000
 210  205  #define VOP_LATENCY_1S          1000000000
 211      -#define VOP_LATENCY_10S         10000000000
 212  206  
 213  207  /*
 214  208   * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 215  209   * numerical order of S_IFMT and vnode types.)
 216  210   */
 217  211  enum vtype iftovt_tab[] = {
 218  212          VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 219  213          VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 220  214  };
 221  215  
 222  216  ushort_t vttoif_tab[] = {
 223  217          0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 224  218          S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 225  219  };
 226  220  
 227  221  /*
 228  222   * The system vnode cache.
 229  223   */
 230  224  
 231  225  kmem_cache_t *vn_cache;
 232  226  
 233  227  
/*
 * Vnode operations vector.
 *
 * Each entry is: the operation's registration name, the offset of its
 * slot in struct vnodeops, and two default routines -- the first is used
 * when a filesystem registers an error for the op, the second when the
 * filesystem supplies no implementation at all.  The table is terminated
 * by a NULL-name sentinel entry.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
        VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
            fs_nosys, fs_nosys,

        VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
            fs_nosys, fs_nosys,

        VOPNAME_READ, offsetof(struct vnodeops, vop_read),
            fs_nosys, fs_nosys,

        VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
            fs_nosys, fs_nosys,

        VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
            fs_nosys, fs_nosys,

        VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
            fs_setfl, fs_nosys,

        VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
            fs_nosys, fs_nosys,

        VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
            fs_nosys, fs_nosys,

        VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
            fs_nosys, fs_nosys,

        VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
            fs_nosys, fs_nosys,

        VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
            fs_nosys, fs_nosys,

        VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
            fs_nosys, fs_nosys,

        VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
            fs_nosys, fs_nosys,

        VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
            fs_nosys, fs_nosys,

        VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
            fs_nosys, fs_nosys,

        VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
            fs_nosys, fs_nosys,

        VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
            fs_nosys, fs_nosys,

        VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
            fs_nosys, fs_nosys,

        VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
            fs_nosys, fs_nosys,

        VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
            fs_nosys, fs_nosys,

        VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
            fs_nosys, fs_nosys,

        VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
            fs_nosys, fs_nosys,

        VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
            fs_rwlock, fs_rwlock,

        VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
            (fs_generic_func_p) fs_rwunlock,
            (fs_generic_func_p) fs_rwunlock,    /* no errors allowed */

        VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
            fs_nosys, fs_nosys,

        VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
            fs_cmp, fs_cmp,             /* no errors allowed */

        VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
            fs_frlock, fs_nosys,

        VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
            fs_nosys, fs_nosys,

        VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
            fs_nosys, fs_nosys,

        VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
            fs_nosys, fs_nosys,

        VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
            fs_nosys, fs_nosys,

        VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
            (fs_generic_func_p) fs_nosys_map,
            (fs_generic_func_p) fs_nosys_map,

        VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
            (fs_generic_func_p) fs_nosys_addmap,
            (fs_generic_func_p) fs_nosys_addmap,

        VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
            fs_nosys, fs_nosys,

        VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
            (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

        VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
            fs_nosys, fs_nosys,

        VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
            fs_pathconf, fs_nosys,

        VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
            fs_nosys, fs_nosys,

        VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
            fs_nosys, fs_nosys,

        VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
            (fs_generic_func_p) fs_dispose,
            (fs_generic_func_p) fs_nodispose,

        VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
            fs_nosys, fs_nosys,

        VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
            fs_fab_acl, fs_nosys,

        VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
            fs_shrlock, fs_nosys,

        VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
            (fs_generic_func_p) fs_vnevent_nosupport,
            (fs_generic_func_p) fs_vnevent_nosupport,

        VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
            fs_nosys, fs_nosys,

        VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
            fs_nosys, fs_nosys,

        NULL, 0, NULL, NULL
};
 384  378  
 385  379  /* Extensible attribute (xva) routines. */
 386  380  
 387  381  /*
 388  382   * Zero out the structure, set the size of the requested/returned bitmaps,
 389  383   * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 390  384   * to the returned attributes array.
 391  385   */
 392  386  void
 393  387  xva_init(xvattr_t *xvap)
 394  388  {
 395  389          bzero(xvap, sizeof (xvattr_t));
 396  390          xvap->xva_mapsize = XVA_MAPSIZE;
 397  391          xvap->xva_magic = XVA_MAGIC;
 398  392          xvap->xva_vattr.va_mask = AT_XVATTR;
 399  393          xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 400  394  }
 401  395  
 402  396  /*
 403  397   * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 404  398   * structure.  Otherwise, returns NULL.
 405  399   */
 406  400  xoptattr_t *
 407  401  xva_getxoptattr(xvattr_t *xvap)
 408  402  {
 409  403          xoptattr_t *xoap = NULL;
 410  404          if (xvap->xva_vattr.va_mask & AT_XVATTR)
 411  405                  xoap = &xvap->xva_xoptattrs;
 412  406          return (xoap);
 413  407  }
 414  408  
 415  409  /*
 416  410   * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 417  411   * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 418  412   * kstat name.
 419  413   */
 420  414  static int
 421  415  vska_compar(const void *n1, const void *n2)
 422  416  {
 423  417          int ret;
 424  418          ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 425  419          ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 426  420  
 427  421          if (p1 < p2) {
 428  422                  ret = -1;
 429  423          } else if (p1 > p2) {
 430  424                  ret = 1;
 431  425          } else {
 432  426                  ret = 0;
 433  427          }
 434  428  
 435  429          return (ret);
 436  430  }
 437  431  
 438  432  /*
 439  433   * Used to create a single template which will be bcopy()ed to a newly
 440  434   * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 441  435   */
 442  436  static vopstats_t *
 443  437  create_vopstats_template()
 444  438  {
 445  439          vopstats_t              *vsp;
 446  440  
 447  441          vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 448  442          bzero(vsp, sizeof (*vsp));      /* Start fresh */
 449  443  
 450  444          /* VOP_OPEN */
 451  445          kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 452  446          /* VOP_CLOSE */
 453  447          kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 454  448          /* VOP_READ I/O */
 455  449          kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 456  450          kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 457  451          /* VOP_WRITE I/O */
 458  452          kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 459  453          kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 460  454          /* VOP_IOCTL */
 461  455          kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 462  456          /* VOP_SETFL */
 463  457          kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 464  458          /* VOP_GETATTR */
 465  459          kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 466  460          /* VOP_SETATTR */
 467  461          kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 468  462          /* VOP_ACCESS */
 469  463          kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 470  464          /* VOP_LOOKUP */
 471  465          kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 472  466          /* VOP_CREATE */
 473  467          kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 474  468          /* VOP_REMOVE */
 475  469          kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 476  470          /* VOP_LINK */
 477  471          kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 478  472          /* VOP_RENAME */
 479  473          kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 480  474          /* VOP_MKDIR */
 481  475          kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 482  476          /* VOP_RMDIR */
 483  477          kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 484  478          /* VOP_READDIR I/O */
 485  479          kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 486  480          kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 487  481              KSTAT_DATA_UINT64);
 488  482          /* VOP_SYMLINK */
 489  483          kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 490  484          /* VOP_READLINK */
 491  485          kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 492  486          /* VOP_FSYNC */
 493  487          kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 494  488          /* VOP_INACTIVE */
 495  489          kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 496  490          /* VOP_FID */
 497  491          kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 498  492          /* VOP_RWLOCK */
 499  493          kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 500  494          /* VOP_RWUNLOCK */
 501  495          kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 502  496          /* VOP_SEEK */
 503  497          kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 504  498          /* VOP_CMP */
 505  499          kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 506  500          /* VOP_FRLOCK */
 507  501          kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 508  502          /* VOP_SPACE */
 509  503          kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 510  504          /* VOP_REALVP */
 511  505          kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 512  506          /* VOP_GETPAGE */
 513  507          kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 514  508          /* VOP_PUTPAGE */
 515  509          kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 516  510          /* VOP_MAP */
 517  511          kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 518  512          /* VOP_ADDMAP */
 519  513          kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 520  514          /* VOP_DELMAP */
 521  515          kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 522  516          /* VOP_POLL */
 523  517          kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 524  518          /* VOP_DUMP */
 525  519          kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 526  520          /* VOP_PATHCONF */
 527  521          kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 528  522          /* VOP_PAGEIO */
 529  523          kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 530  524          /* VOP_DUMPCTL */
 531  525          kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 532  526          /* VOP_DISPOSE */
 533  527          kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 534  528          /* VOP_SETSECATTR */
 535  529          kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 536  530          /* VOP_GETSECATTR */
 537  531          kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 538  532          /* VOP_SHRLOCK */
 539  533          kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 540  534          /* VOP_VNEVENT */
 541  535          kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 542  536          /* VOP_REQZCBUF */
 543  537          kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 544  538          /* VOP_RETZCBUF */
 545  539          kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 546  540  
 547  541          return (vsp);
 548  542  }
 549  543  
 550  544  /*
 551  545   * Creates a kstat structure associated with a vopstats structure.
 552  546   */
 553  547  kstat_t *
 554  548  new_vskstat(char *ksname, vopstats_t *vsp)
 555  549  {
 556  550          kstat_t         *ksp;
 557  551  
 558  552          if (!vopstats_enabled) {
 559  553                  return (NULL);
 560  554          }
 561  555  
 562  556          ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 563  557              sizeof (vopstats_t)/sizeof (kstat_named_t),
 564  558              KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 565  559          if (ksp) {
 566  560                  ksp->ks_data = vsp;
 567  561                  kstat_install(ksp);
 568  562          }
 569  563  
 570  564          return (ksp);
 571  565  }
 572  566  
/*
 * Called from vfsinit() to initialize the support mechanisms for vopstats:
 * the AVL tree of per-vfs anchors, its lock, the anchor kmem cache, the
 * per-fstype vopstats pointer array, and the initialization template.
 * A no-op when vopstats collection is disabled via /etc/system.
 */
void
vopstats_startup()
{
        if (!vopstats_enabled)
                return;

        /*
         * Creates the AVL tree which holds per-vfs vopstat anchors.  This
         * is necessary since we need to check if a kstat exists before we
         * attempt to create it.  Also, initialize its lock.
         */
        avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
            offsetof(vsk_anchor_t, vsk_node));
        mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);

        /* Cache backing the vfs<->kstat anchors allocated at mount time */
        vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
            sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
            NULL, NULL, 0);

        /*
         * Set up the array of pointers for the vopstats-by-FS-type.
         * The entries will be allocated/initialized as each file system
         * goes through modload/mod_installfs.
         */
        vopstats_fstype = (vopstats_t **)kmem_zalloc(
            (sizeof (vopstats_t *) * nfstype), KM_SLEEP);

        /* Set up the global vopstats initialization template */
        vs_templatep = create_vopstats_template();
}
 606  600  
 607  601  /*
 608  602   * We need to have the all of the counters zeroed.
 609  603   * The initialization of the vopstats_t includes on the order of
 610  604   * 50 calls to kstat_named_init().  Rather that do that on every call,
 611  605   * we do it once in a template (vs_templatep) then bcopy it over.
 612  606   */
 613  607  void
 614  608  initialize_vopstats(vopstats_t *vsp)
 615  609  {
 616  610          if (vsp == NULL)
 617  611                  return;
 618  612  
 619  613          bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 620  614  }
 621  615  
 622  616  /*
 623  617   * If possible, determine which vopstats by fstype to use and
 624  618   * return a pointer to the caller.
 625  619   */
 626  620  vopstats_t *
 627  621  get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 628  622  {
 629  623          int             fstype = 0;     /* Index into vfssw[] */
 630  624          vopstats_t      *vsp = NULL;
 631  625  
 632  626          if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 633  627              !vopstats_enabled)
 634  628                  return (NULL);
 635  629          /*
 636  630           * Set up the fstype.  We go to so much trouble because all versions
 637  631           * of NFS use the same fstype in their vfs even though they have
 638  632           * distinct entries in the vfssw[] table.
 639  633           * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 640  634           */
 641  635          if (vswp) {
 642  636                  fstype = vswp - vfssw;  /* Gets us the index */
 643  637          } else {
 644  638                  fstype = vfsp->vfs_fstype;
 645  639          }
 646  640  
 647  641          /*
 648  642           * Point to the per-fstype vopstats. The only valid values are
 649  643           * non-zero positive values less than the number of vfssw[] table
 650  644           * entries.
 651  645           */
 652  646          if (fstype > 0 && fstype < nfstype) {
 653  647                  vsp = vopstats_fstype[fstype];
 654  648          }
 655  649  
 656  650          return (vsp);
 657  651  }
 658  652  
 659  653  /*
 660  654   * Generate a kstat name, create the kstat structure, and allocate a
 661  655   * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 662  656   * to the caller.  This must only be called from a mount.
 663  657   */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char		kstatstr[KSTAT_STRLEN];	/* kstat name for vopstats */
	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
	kstat_t		*ksp;			/* Ptr to new kstat */
	avl_index_t	where;			/* Location in the AVL tree */

	/*
	 * Bail out unless this vfs is fully set up and has per-mount
	 * statistics enabled (both the VFS_STATS flag and the global
	 * vopstats_enabled tunable must be on).
	 */
	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;	/* AVL key */

		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			/* No entry for this fsid yet: claim the slot. */
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	/* NULL on any failure path; teardown_vopstats() undoes this. */
	return (vskp);
}
 710  704  
 711  705  /*
 712  706   * We're in the process of tearing down the vfs and need to cleanup
 713  707   * the data structures associated with the vopstats. Must only be called
 714  708   * from dounmount().
 715  709   */
void
teardown_vopstats(vfs_t *vfsp)
{
	vsk_anchor_t	*vskap;
	avl_index_t	where;

	/* Same gating conditions as get_vskstat_anchor(). */
	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return;

	/* This is a safe check since VFS_STATS must be set (see above) */
	if ((vskap = vfsp->vfs_vskap) == NULL)
		return;

	/* Whack the pointer right away */
	vfsp->vfs_vskap = NULL;

	/* Lock the tree, remove the node, and delete the kstat */
	mutex_enter(&vskstat_tree_lock);
	if (avl_find(&vskstat_tree, vskap, &where)) {
		avl_remove(&vskstat_tree, vskap);
	}

	/* The anchor may have no kstat if new_vskstat() failed at mount. */
	if (vskap->vsk_ksp) {
		kstat_delete(vskap->vsk_ksp);
	}
	mutex_exit(&vskstat_tree_lock);

	kmem_cache_free(vsk_anchor_cache, vskap);
}
 746  740  
 747  741  /*
 748  742   * Read or write a vnode.  Called from kernel code.
 749  743   */
/*
 * Read from or write to a vnode on behalf of the kernel.
 *
 * Builds a single-iovec uio describing [offset, offset+len) in `seg'
 * address space, takes the vnode's rwlock, and issues VOP_READ or
 * VOP_WRITE.  If `residp' is non-NULL the untransferred byte count is
 * returned through it; otherwise a short transfer is reported as EIO.
 *
 * Returns 0 on success or an errno value (EROFS for a write to a
 * read-only file, EIO for negative length, EACCES on an NBMAND
 * conflict, or whatever the VOP returns).
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;	/* nonzero once inside the nbl critical region */

	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	VOPXID_MAP_CR(vp, cr);

	/* Describe the caller's buffer as a one-element uio. */
	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		/* Fail with EACCES if an NBMAND lock covers the range. */
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	(void) VOP_RWLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
	}
	VOP_RWUNLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (residp)
		*residp = uio.uio_resid;	/* caller handles shortfall */
	else if (uio.uio_resid)
		error = EIO;			/* short transfer is an error */

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
 827  821  
 828  822  /*
 829  823   * Release a vnode.  Call VOP_INACTIVE on last reference or
 830  824   * decrement reference count.
 831  825   *
 832  826   * To avoid race conditions, the v_count is left at 1 for
 833  827   * the call to VOP_INACTIVE. This prevents another thread
 834  828   * from reclaiming and releasing the vnode *before* the
 835  829   * VOP_INACTIVE routine has a chance to destroy the vnode.
 836  830   * We can't have more than 1 thread calling VOP_INACTIVE
 837  831   * on a vnode.
 838  832   */
 839  833  void
 840  834  vn_rele(vnode_t *vp)
 841  835  {
 842  836          VERIFY(vp->v_count > 0);
 843  837          mutex_enter(&vp->v_lock);
 844  838          if (vp->v_count == 1) {
 845  839                  mutex_exit(&vp->v_lock);
 846  840                  VOP_INACTIVE(vp, CRED(), NULL);
 847  841                  return;
 848  842          }
 849  843          vp->v_count--;
 850  844          mutex_exit(&vp->v_lock);
 851  845  }
 852  846  
 853  847  /*
 854  848   * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 855  849   * as a single reference, so v_count is not decremented until the last DNLC hold
 856  850   * is released. This makes it possible to distinguish vnodes that are referenced
 857  851   * only by the DNLC.
 858  852   */
void
vn_rele_dnlc(vnode_t *vp)
{
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	mutex_enter(&vp->v_lock);
	/*
	 * All DNLC holds together pin only one v_count reference, so
	 * v_count is touched only when the last DNLC hold goes away.
	 */
	if (--vp->v_count_dnlc == 0) {
		if (vp->v_count == 1) {
			/* Last reference overall: inactivate (v_count
			 * stays at 1, as in vn_rele()). */
			mutex_exit(&vp->v_lock);
			VOP_INACTIVE(vp, CRED(), NULL);
			return;
		}
		vp->v_count--;
	}
	mutex_exit(&vp->v_lock);
}
 874  868  
 875  869  /*
 876  870   * Like vn_rele() except that it clears v_stream under v_lock.
 877  871   * This is used by sockfs when it dismantels the association between
 878  872   * the sockfs node and the vnode in the underlaying file system.
 879  873   * v_lock has to be held to prevent a thread coming through the lookupname
 880  874   * path from accessing a stream head that is going away.
 881  875   */
 882  876  void
 883  877  vn_rele_stream(vnode_t *vp)
 884  878  {
 885  879          VERIFY(vp->v_count > 0);
 886  880          mutex_enter(&vp->v_lock);
 887  881          vp->v_stream = NULL;
 888  882          if (vp->v_count == 1) {
 889  883                  mutex_exit(&vp->v_lock);
 890  884                  VOP_INACTIVE(vp, CRED(), NULL);
 891  885                  return;
 892  886          }
 893  887          vp->v_count--;
 894  888          mutex_exit(&vp->v_lock);
 895  889  }
 896  890  
/*
 * Taskq trampoline for vn_rele_async(): runs the deferred VOP_INACTIVE.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 902  896  
 903  897  /*
 904  898   * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 905  899   * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 906  900   * the file system as a result of releasing the vnode. Note, file systems
 907  901   * already have to handle the race where the vnode is incremented before the
 908  902   * inactive routine is called and does its locking.
 909  903   *
 910  904   * Warning: Excessive use of this routine can lead to performance problems.
 911  905   * This is because taskqs throttle back allocation if too many are created.
 912  906   */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		/*
		 * Last hold: defer VOP_INACTIVE to the taskq instead of
		 * calling it here, so releasing the vnode cannot
		 * re-enter the file system on this thread.  TQ_SLEEP
		 * means the dispatch may block but should not fail;
		 * VERIFY the dispatch anyway.
		 */
		mutex_exit(&vp->v_lock);
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
 927  921  
 928  922  int
 929  923  vn_open(
 930  924          char *pnamep,
 931  925          enum uio_seg seg,
 932  926          int filemode,
 933  927          int createmode,
 934  928          struct vnode **vpp,
 935  929          enum create crwhy,
 936  930          mode_t umask)
 937  931  {
 938  932          return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 939  933              umask, NULL, -1));
 940  934  }
 941  935  
 942  936  
 943  937  /*
 944  938   * Open/create a vnode.
 945  939   * This may be callable by the kernel, the only known use
 946  940   * of user context being that the current user credentials
 947  941   * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 948  942   */
int
vn_openat(
	char *pnamep,		/* pathname to open */
	enum uio_seg seg,	/* address space pnamep lives in */
	int filemode,		/* FREAD/FWRITE/FCREAT/FTRUNC/... flags */
	int createmode,		/* permission bits, used iff FCREAT */
	struct vnode **vpp,	/* out: held vnode on success */
	enum create crwhy,	/* create reason, defined iff FCREAT */
	mode_t umask,		/* mode mask applied unless default ACLs */
	struct vnode *startvp,	/* dir for relative lookups, or NULL */
	int fd)			/* caller's fd, used as share-lock id */
{
	struct vnode *vp;
	int mode;		/* VREAD/VWRITE/VEXEC access to check */
	int accessflags;
	int error;
	int in_crit = 0;	/* inside nbl critical region */
	int open_done = 0;	/* VOP_OPEN succeeded; close on error */
	int shrlock_done = 0;	/* share reservation held; undo on error */
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;	/* bounded retries for stale NFS handles */
	struct shrlock shr;
	struct shr_locowner shr_own;

	/* Translate open flags into the access mode to check. */
	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT) {
		enum vcexcl excl;

		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		/*
		 * FTRUNC/FEXCL are consumed here; the underlying create
		 * must not see them again.
		 */
		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				/* Check locks on the real (underlying)
				 * vnode when vp is a layered one. */
				if (VOP_REALVP(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = VOP_GETATTR(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FSEARCH to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FSEARCH) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses VOP_REALVP to distinguish between
	 * an unopened namefs node (where VOP_REALVP returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where VOP_REALVP would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = VOP_REALVP(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;	/* caller's fd identifies the owner */
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 * (FCREAT|FTRUNC was already handled by vn_createat() above.)
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	/* Unwind in reverse order of acquisition. */
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
1237 1231  
1238 1232  /*
1239 1233   * The following two accessor functions are for the NFSv4 server.  Since there
1240 1234   * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1241 1235   * vnode open counts correct when a client "upgrades" an open or does an
1242 1236   * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1243 1237   * open mode (add or subtract read or write), but also change the share/deny
1244 1238   * modes.  However, share reservations are not integrated with OPEN, yet, so
1245 1239   * we need to handle each separately.  These functions are cleaner than having
1246 1240   * the NFS server manipulate the counts directly, however, nobody else should
1247 1241   * use these functions.
1248 1242   */
1249 1243  void
1250 1244  vn_open_upgrade(
1251 1245          vnode_t *vp,
1252 1246          int filemode)
1253 1247  {
1254 1248          ASSERT(vp->v_type == VREG);
1255 1249  
1256 1250          if (filemode & FREAD)
1257 1251                  atomic_inc_32(&vp->v_rdcnt);
1258 1252          if (filemode & FWRITE)
1259 1253                  atomic_inc_32(&vp->v_wrcnt);
1260 1254  
1261 1255  }
1262 1256  
1263 1257  void
1264 1258  vn_open_downgrade(
1265 1259          vnode_t *vp,
1266 1260          int filemode)
1267 1261  {
1268 1262          ASSERT(vp->v_type == VREG);
1269 1263  
1270 1264          if (filemode & FREAD) {
1271 1265                  ASSERT(vp->v_rdcnt > 0);
1272 1266                  atomic_dec_32(&vp->v_rdcnt);
1273 1267          }
1274 1268          if (filemode & FWRITE) {
1275 1269                  ASSERT(vp->v_wrcnt > 0);
1276 1270                  atomic_dec_32(&vp->v_wrcnt);
1277 1271          }
1278 1272  
1279 1273  }
1280 1274  
1281 1275  int
1282 1276  vn_create(
1283 1277          char *pnamep,
1284 1278          enum uio_seg seg,
1285 1279          struct vattr *vap,
1286 1280          enum vcexcl excl,
1287 1281          int mode,
1288 1282          struct vnode **vpp,
1289 1283          enum create why,
1290 1284          int flag,
1291 1285          mode_t umask)
1292 1286  {
1293 1287          return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
1294 1288              umask, NULL));
1295 1289  }
1296 1290  
1297 1291  /*
1298 1292   * Create a vnode (makenode).
1299 1293   */
1300 1294  int
1301 1295  vn_createat(
1302 1296          char *pnamep,
1303 1297          enum uio_seg seg,
1304 1298          struct vattr *vap,
1305 1299          enum vcexcl excl,
1306 1300          int mode,
1307 1301          struct vnode **vpp,
1308 1302          enum create why,
1309 1303          int flag,
1310 1304          mode_t umask,
1311 1305          struct vnode *startvp)
1312 1306  {
1313 1307          struct vnode *dvp;      /* ptr to parent dir vnode */
1314 1308          struct vnode *vp = NULL;
1315 1309          struct pathname pn;
1316 1310          int error;
1317 1311          int in_crit = 0;
1318 1312          struct vattr vattr;
1319 1313          enum symfollow follow;
1320 1314          int estale_retry = 0;
1321 1315          uint32_t auditing = AU_AUDITING();
1322 1316  
1323 1317          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1324 1318  
1325 1319          /* symlink interpretation */
1326 1320          if ((flag & FNOFOLLOW) || excl == EXCL)
1327 1321                  follow = NO_FOLLOW;
1328 1322          else
1329 1323                  follow = FOLLOW;
1330 1324          flag &= ~(FNOFOLLOW|FNOLINKS);
1331 1325  
1332 1326  top:
1333 1327          /*
1334 1328           * Lookup directory.
1335 1329           * If new object is a file, call lower level to create it.
1336 1330           * Note that it is up to the lower level to enforce exclusive
1337 1331           * creation, if the file is already there.
1338 1332           * This allows the lower level to do whatever
1339 1333           * locking or protocol that is needed to prevent races.
1340 1334           * If the new object is directory call lower level to make
1341 1335           * the new directory, with "." and "..".
1342 1336           */
1343 1337          if (error = pn_get(pnamep, seg, &pn))
1344 1338                  return (error);
1345 1339          if (auditing)
1346 1340                  audit_vncreate_start();
1347 1341          dvp = NULL;
1348 1342          *vpp = NULL;
1349 1343          /*
1350 1344           * lookup will find the parent directory for the vnode.
1351 1345           * When it is done the pn holds the name of the entry
1352 1346           * in the directory.
1353 1347           * If this is a non-exclusive create we also find the node itself.
1354 1348           */
1355 1349          error = lookuppnat(&pn, NULL, follow, &dvp,
1356 1350              (excl == EXCL) ? NULLVPP : vpp, startvp);
1357 1351          if (error) {
1358 1352                  pn_free(&pn);
1359 1353                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1360 1354                          goto top;
1361 1355                  if (why == CRMKDIR && error == EINVAL)
1362 1356                          error = EEXIST;         /* SVID */
1363 1357                  return (error);
1364 1358          }
1365 1359  
1366 1360          if (why != CRMKNOD)
1367 1361                  vap->va_mode &= ~VSVTX;
1368 1362  
1369 1363          /*
1370 1364           * If default ACLs are defined for the directory don't apply the
1371 1365           * umask if umask is passed.
1372 1366           */
1373 1367  
1374 1368          if (umask) {
1375 1369  
1376 1370                  vsecattr_t vsec;
1377 1371  
1378 1372                  vsec.vsa_aclcnt = 0;
1379 1373                  vsec.vsa_aclentp = NULL;
1380 1374                  vsec.vsa_dfaclcnt = 0;
1381 1375                  vsec.vsa_dfaclentp = NULL;
1382 1376                  vsec.vsa_mask = VSA_DFACLCNT;
1383 1377                  error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1384 1378                  /*
1385 1379                   * If error is ENOSYS then treat it as no error
1386 1380                   * Don't want to force all file systems to support
1387 1381                   * aclent_t style of ACL's.
1388 1382                   */
1389 1383                  if (error == ENOSYS)
1390 1384                          error = 0;
1391 1385                  if (error) {
1392 1386                          if (*vpp != NULL)
1393 1387                                  VN_RELE(*vpp);
1394 1388                          goto out;
1395 1389                  } else {
1396 1390                          /*
1397 1391                           * Apply the umask if no default ACLs.
1398 1392                           */
1399 1393                          if (vsec.vsa_dfaclcnt == 0)
1400 1394                                  vap->va_mode &= ~umask;
1401 1395  
1402 1396                          /*
1403 1397                           * VOP_GETSECATTR() may have allocated memory for
1404 1398                           * ACLs we didn't request, so double-check and
1405 1399                           * free it if necessary.
1406 1400                           */
1407 1401                          if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1408 1402                                  kmem_free((caddr_t)vsec.vsa_aclentp,
1409 1403                                      vsec.vsa_aclcnt * sizeof (aclent_t));
1410 1404                          if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1411 1405                                  kmem_free((caddr_t)vsec.vsa_dfaclentp,
1412 1406                                      vsec.vsa_dfaclcnt * sizeof (aclent_t));
1413 1407                  }
1414 1408          }
1415 1409  
1416 1410          /*
1417 1411           * In general we want to generate EROFS if the file system is
1418 1412           * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1419 1413           * documents the open system call, and it says that O_CREAT has no
1420 1414           * effect if the file already exists.  Bug 1119649 states
1421 1415           * that open(path, O_CREAT, ...) fails when attempting to open an
1422 1416           * existing file on a read only file system.  Thus, the first part
1423 1417           * of the following if statement has 3 checks:
1424 1418           *      if the file exists &&
1425 1419           *              it is being open with write access &&
1426 1420           *              the file system is read only
1427 1421           *      then generate EROFS
1428 1422           */
1429 1423          if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1430 1424              (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1431 1425                  if (*vpp)
1432 1426                          VN_RELE(*vpp);
1433 1427                  error = EROFS;
1434 1428          } else if (excl == NONEXCL && *vpp != NULL) {
1435 1429                  vnode_t *rvp;
1436 1430  
1437 1431                  /*
1438 1432                   * File already exists.  If a mandatory lock has been
1439 1433                   * applied, return error.
1440 1434                   */
1441 1435                  vp = *vpp;
1442 1436                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1443 1437                          rvp = vp;
1444 1438                  if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1445 1439                          nbl_start_crit(vp, RW_READER);
1446 1440                          in_crit = 1;
1447 1441                  }
1448 1442                  if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1449 1443                          vattr.va_mask = AT_MODE|AT_SIZE;
1450 1444                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1451 1445                                  goto out;
1452 1446                          }
1453 1447                          if (MANDLOCK(vp, vattr.va_mode)) {
1454 1448                                  error = EAGAIN;
1455 1449                                  goto out;
1456 1450                          }
1457 1451                          /*
1458 1452                           * File cannot be truncated if non-blocking mandatory
1459 1453                           * locks are currently on the file.
1460 1454                           */
1461 1455                          if ((vap->va_mask & AT_SIZE) && in_crit) {
1462 1456                                  u_offset_t offset;
1463 1457                                  ssize_t length;
1464 1458  
1465 1459                                  offset = vap->va_size > vattr.va_size ?
1466 1460                                      vattr.va_size : vap->va_size;
1467 1461                                  length = vap->va_size > vattr.va_size ?
1468 1462                                      vap->va_size - vattr.va_size :
1469 1463                                      vattr.va_size - vap->va_size;
1470 1464                                  if (nbl_conflict(vp, NBL_WRITE, offset,
1471 1465                                      length, 0, NULL)) {
1472 1466                                          error = EACCES;
1473 1467                                          goto out;
1474 1468                                  }
1475 1469                          }
1476 1470                  }
1477 1471  
1478 1472                  /*
1479 1473                   * If the file is the root of a VFS, we've crossed a
1480 1474                   * mount point and the "containing" directory that we
1481 1475                   * acquired above (dvp) is irrelevant because it's in
1482 1476                   * a different file system.  We apply VOP_CREATE to the
1483 1477                   * target itself instead of to the containing directory
1484 1478                   * and supply a null path name to indicate (conventionally)
1485 1479                   * the node itself as the "component" of interest.
1486 1480                   *
1487 1481                   * The intercession of the file system is necessary to
1488 1482                   * ensure that the appropriate permission checks are
1489 1483                   * done.
1490 1484                   */
1491 1485                  if (vp->v_flag & VROOT) {
1492 1486                          ASSERT(why != CRMKDIR);
1493 1487                          error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1494 1488                              CRED(), flag, NULL, NULL);
1495 1489                          /*
1496 1490                           * If the create succeeded, it will have created
1497 1491                           * a new reference to the vnode.  Give up the
1498 1492                           * original reference.  The assertion should not
1499 1493                           * get triggered because NBMAND locks only apply to
1500 1494                           * VREG files.  And if in_crit is non-zero for some
1501 1495                           * reason, detect that here, rather than when we
                         * dereference a null vp.
1503 1497                           */
1504 1498                          ASSERT(in_crit == 0);
1505 1499                          VN_RELE(vp);
1506 1500                          vp = NULL;
1507 1501                          goto out;
1508 1502                  }
1509 1503  
1510 1504                  /*
1511 1505                   * Large File API - non-large open (FOFFMAX flag not set)
1512 1506                   * of regular file fails if the file size exceeds MAXOFF32_T.
1513 1507                   */
1514 1508                  if (why != CRMKDIR &&
1515 1509                      !(flag & FOFFMAX) &&
1516 1510                      (vp->v_type == VREG)) {
1517 1511                          vattr.va_mask = AT_SIZE;
1518 1512                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1519 1513                              CRED(), NULL))) {
1520 1514                                  goto out;
1521 1515                          }
1522 1516                          if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1523 1517                                  error = EOVERFLOW;
1524 1518                                  goto out;
1525 1519                          }
1526 1520                  }
1527 1521          }
1528 1522  
1529 1523          if (error == 0) {
1530 1524                  /*
1531 1525                   * Call mkdir() if specified, otherwise create().
1532 1526                   */
1533 1527                  int must_be_dir = pn_fixslash(&pn);     /* trailing '/'? */
1534 1528  
1535 1529                  if (why == CRMKDIR)
1536 1530                          /*
1537 1531                           * N.B., if vn_createat() ever requests
1538 1532                           * case-insensitive behavior then it will need
1539 1533                           * to be passed to VOP_MKDIR().  VOP_CREATE()
1540 1534                           * will already get it via "flag"
1541 1535                           */
1542 1536                          error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1543 1537                              NULL, 0, NULL);
1544 1538                  else if (!must_be_dir)
1545 1539                          error = VOP_CREATE(dvp, pn.pn_path, vap,
1546 1540                              excl, mode, vpp, CRED(), flag, NULL, NULL);
1547 1541                  else
1548 1542                          error = ENOTDIR;
1549 1543          }
1550 1544  
1551 1545  out:
1552 1546  
1553 1547          if (auditing)
1554 1548                  audit_vncreate_finish(*vpp, error);
1555 1549          if (in_crit) {
1556 1550                  nbl_end_crit(vp);
1557 1551                  in_crit = 0;
1558 1552          }
1559 1553          if (vp != NULL) {
1560 1554                  VN_RELE(vp);
1561 1555                  vp = NULL;
1562 1556          }
1563 1557          pn_free(&pn);
1564 1558          VN_RELE(dvp);
1565 1559          /*
1566 1560           * The following clause was added to handle a problem
1567 1561           * with NFS consistency.  It is possible that a lookup
1568 1562           * of the file to be created succeeded, but the file
1569 1563           * itself doesn't actually exist on the server.  This
1570 1564           * is chiefly due to the DNLC containing an entry for
1571 1565           * the file which has been removed on the server.  In
1572 1566           * this case, we just start over.  If there was some
1573 1567           * other cause for the ESTALE error, then the lookup
1574 1568           * of the file will fail and the error will be returned
1575 1569           * above instead of looping around from here.
1576 1570           */
1577 1571          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1578 1572                  goto top;
1579 1573          return (error);
1580 1574  }
1581 1575  
1582 1576  int
1583 1577  vn_link(char *from, char *to, enum uio_seg seg)
1584 1578  {
1585 1579          return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1586 1580  }
1587 1581  
1588 1582  int
1589 1583  vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
1590 1584      vnode_t *tstartvp, char *to, enum uio_seg seg)
1591 1585  {
1592 1586          struct vnode *fvp;              /* from vnode ptr */
1593 1587          struct vnode *tdvp;             /* to directory vnode ptr */
1594 1588          struct pathname pn;
1595 1589          int error;
1596 1590          struct vattr vattr;
1597 1591          dev_t fsid;
1598 1592          int estale_retry = 0;
1599 1593          uint32_t auditing = AU_AUDITING();
1600 1594  
1601 1595  top:
1602 1596          fvp = tdvp = NULL;
1603 1597          if (error = pn_get(to, seg, &pn))
1604 1598                  return (error);
1605 1599          if (auditing && fstartvp != NULL)
1606 1600                  audit_setfsat_path(1);
1607 1601          if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
1608 1602                  goto out;
1609 1603          if (auditing && tstartvp != NULL)
1610 1604                  audit_setfsat_path(3);
1611 1605          if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
1612 1606                  goto out;
1613 1607          /*
1614 1608           * Make sure both source vnode and target directory vnode are
1615 1609           * in the same vfs and that it is writeable.
1616 1610           */
1617 1611          vattr.va_mask = AT_FSID;
1618 1612          if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
1619 1613                  goto out;
1620 1614          fsid = vattr.va_fsid;
1621 1615          vattr.va_mask = AT_FSID;
1622 1616          if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
1623 1617                  goto out;
1624 1618          if (fsid != vattr.va_fsid) {
1625 1619                  error = EXDEV;
1626 1620                  goto out;
1627 1621          }
1628 1622          if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
1629 1623                  error = EROFS;
1630 1624                  goto out;
1631 1625          }
1632 1626          /*
1633 1627           * Do the link.
1634 1628           */
1635 1629          (void) pn_fixslash(&pn);
1636 1630          error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
1637 1631  out:
1638 1632          pn_free(&pn);
1639 1633          if (fvp)
1640 1634                  VN_RELE(fvp);
1641 1635          if (tdvp)
1642 1636                  VN_RELE(tdvp);
1643 1637          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1644 1638                  goto top;
1645 1639          return (error);
  
    | 
      ↓ open down ↓ | 
    1424 lines elided | 
    
      ↑ open up ↑ | 
  
1646 1640  }
1647 1641  
1648 1642  int
1649 1643  vn_rename(char *from, char *to, enum uio_seg seg)
1650 1644  {
1651 1645          return (vn_renameat(NULL, from, NULL, to, seg));
1652 1646  }
1653 1647  
/*
 * Rename the file named by "fname" to "tname".  The source and target
 * paths are resolved relative to the optional start vnodes fdvp/tdvp
 * (*at() semantics).  Both parent directories must be in the same file
 * system (compared by fsid so loopback fs works) and the target's file
 * system must be writable.  Returns 0 or an errno value.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
                char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* nbl critical regions entered? */
	vnode_t *fromvp, *fvp;		/* source dir / source file */
	vnode_t *tovp, *targvp;		/* target dir / existing target */
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories.
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * If an existing target is being renamed over (and is not the
	 * source itself), make sure no non-blocking mandatory locks or
	 * share reservations forbid removing it.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check the source file for nbmand conflicts. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	/* Exit critical regions before the final VN_RELEs. */
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	/*
	 * Retry on a spurious ESTALE (e.g. a stale DNLC entry), bounded
	 * by fs_need_estale_retry().
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1782 1776  
1783 1777  /*
1784 1778   * Remove a file or directory.
1785 1779   */
1786 1780  int
1787 1781  vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1788 1782  {
1789 1783          return (vn_removeat(NULL, fnamep, seg, dirflag));
1790 1784  }
1791 1785  
/*
 * Remove the file or directory named by fnamep, resolved relative to
 * the optional start vnode (*at() semantics).  dirflag selects
 * rmdir(2) (RMDIRECTORY) vs. unlink(2) semantics.  A namefs mount on
 * the named file is unmounted first and the covered vnode is removed
 * instead.  Returns 0 or an errno value.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/* Hold p_lock while sampling u_cdir. */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	/*
	 * Retry on a spurious ESTALE (e.g. a stale DNLC entry), bounded
	 * by fs_need_estale_retry().
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1979 1973  
1980 1974  /*
1981 1975   * Utility function to compare equality of vnodes.
1982 1976   * Compare the underlying real vnodes, if there are underlying vnodes.
1983 1977   * This is a more thorough comparison than the VN_CMP() macro provides.
1984 1978   */
1985 1979  int
1986 1980  vn_compare(vnode_t *vp1, vnode_t *vp2)
1987 1981  {
1988 1982          vnode_t *realvp;
1989 1983  
1990 1984          if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1991 1985                  vp1 = realvp;
1992 1986          if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1993 1987                  vp2 = realvp;
1994 1988          return (VN_CMP(vp1, vp2));
1995 1989  }
1996 1990  
/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket: a mutex protecting a singly-linked list of
 * vn_vfslocks_entry_t.  The pad grows the struct to 64 bytes so each
 * bucket occupies its own cache line and buckets don't false-share.
 */
struct	vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

/*
 * Hash a vfs/vnode pointer into a bucket index: shift off the low bits
 * (pointers are aligned, so low bits carry little entropy) and mask
 * with NUM_BUCKETS, which works as a mask because it is 2^n - 1.
 */
#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2020 2014  
/*
 * vn_vfslocks_getlock() uses a HASH scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * HASH table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

/*
 * Look up (or create) the lock entry hashed from the given vfs/vnode
 * pointer, returning it with its reference count incremented.  The
 * caller must eventually drop the reference with vn_vfslocks_rele().
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: the entry already exists in the bucket. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	/*
	 * Not found.  Drop the bucket lock before the KM_SLEEP
	 * allocation (which may block), build a candidate entry, then
	 * re-take the lock and re-scan: another thread may have
	 * inserted the same key while we slept.
	 */
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	/* We won the race: link our entry at the head of the bucket. */
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2074 2068  
/*
 * Drop one reference on a lock entry obtained from
 * vn_vfslocks_getlock().  When the reference count reaches zero the
 * entry is unlinked from its hash bucket and freed.  Panics on a
 * negative refcount or if the entry is missing from its bucket.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;	/* trails vep during the list walk */

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		/* Last reference: unlink the entry from its bucket. */
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * pvep is set by a prior iteration
					 * whenever vep is not the list head.
					 */
					/* LINTED */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2113 2107  
2114 2108  /*
2115 2109   * vn_vfswlock_wait is used to implement a lock which is logically a writers
2116 2110   * lock protecting the v_vfsmountedhere field.
2117 2111   * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2118 2112   * except that it blocks to acquire the lock VVFSLOCK.
2119 2113   *
2120 2114   * traverse() and routines re-implementing part of traverse (e.g. autofs)
2121 2115   * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2122 2116   * need the non-blocking version of the writers lock i.e. vn_vfswlock
2123 2117   */
2124 2118  int
2125 2119  vn_vfswlock_wait(vnode_t *vp)
2126 2120  {
2127 2121          int retval;
2128 2122          vn_vfslocks_entry_t *vpvfsentry;
2129 2123          ASSERT(vp != NULL);
2130 2124  
2131 2125          vpvfsentry = vn_vfslocks_getlock(vp);
2132 2126          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2133 2127  
2134 2128          if (retval == EINTR) {
2135 2129                  vn_vfslocks_rele(vpvfsentry);
2136 2130                  return (EINTR);
2137 2131          }
2138 2132          return (retval);
2139 2133  }
2140 2134  
2141 2135  int
2142 2136  vn_vfsrlock_wait(vnode_t *vp)
2143 2137  {
2144 2138          int retval;
2145 2139          vn_vfslocks_entry_t *vpvfsentry;
2146 2140          ASSERT(vp != NULL);
2147 2141  
2148 2142          vpvfsentry = vn_vfslocks_getlock(vp);
2149 2143          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2150 2144  
2151 2145          if (retval == EINTR) {
2152 2146                  vn_vfslocks_rele(vpvfsentry);
2153 2147                  return (EINTR);
2154 2148          }
2155 2149  
2156 2150          return (retval);
2157 2151  }
2158 2152  
2159 2153  
2160 2154  /*
2161 2155   * vn_vfswlock is used to implement a lock which is logically a writers lock
2162 2156   * protecting the v_vfsmountedhere field.
2163 2157   */
2164 2158  int
2165 2159  vn_vfswlock(vnode_t *vp)
2166 2160  {
2167 2161          vn_vfslocks_entry_t *vpvfsentry;
2168 2162  
2169 2163          /*
2170 2164           * If vp is NULL then somebody is trying to lock the covered vnode
2171 2165           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2172 2166           * only happen when unmounting /.  Since that operation will fail
2173 2167           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2174 2168           */
2175 2169          if (vp == NULL)
2176 2170                  return (EBUSY);
2177 2171  
2178 2172          vpvfsentry = vn_vfslocks_getlock(vp);
2179 2173  
2180 2174          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2181 2175                  return (0);
2182 2176  
2183 2177          vn_vfslocks_rele(vpvfsentry);
2184 2178          return (EBUSY);
2185 2179  }
2186 2180  
2187 2181  int
2188 2182  vn_vfsrlock(vnode_t *vp)
2189 2183  {
2190 2184          vn_vfslocks_entry_t *vpvfsentry;
2191 2185  
2192 2186          /*
2193 2187           * If vp is NULL then somebody is trying to lock the covered vnode
2194 2188           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2195 2189           * only happen when unmounting /.  Since that operation will fail
2196 2190           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2197 2191           */
2198 2192          if (vp == NULL)
2199 2193                  return (EBUSY);
2200 2194  
2201 2195          vpvfsentry = vn_vfslocks_getlock(vp);
2202 2196  
2203 2197          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2204 2198                  return (0);
2205 2199  
2206 2200          vn_vfslocks_rele(vpvfsentry);
2207 2201          return (EBUSY);
2208 2202  }
2209 2203  
2210 2204  void
2211 2205  vn_vfsunlock(vnode_t *vp)
2212 2206  {
2213 2207          vn_vfslocks_entry_t *vpvfsentry;
2214 2208  
2215 2209          /*
2216 2210           * ve_refcnt needs to be decremented twice.
2217 2211           * 1. To release the reference after a call to vn_vfslocks_getlock()
2218 2212           * 2. To release the reference from the locking routines like
2219 2213           *    vn_vfsrlock/vn_vfswlock etc.
2220 2214           */
2221 2215          vpvfsentry = vn_vfslocks_getlock(vp);
2222 2216          vn_vfslocks_rele(vpvfsentry);
2223 2217  
2224 2218          rwst_exit(&vpvfsentry->ve_lock);
2225 2219          vn_vfslocks_rele(vpvfsentry);
2226 2220  }
2227 2221  
2228 2222  int
2229 2223  vn_vfswlock_held(vnode_t *vp)
2230 2224  {
2231 2225          int held;
2232 2226          vn_vfslocks_entry_t *vpvfsentry;
2233 2227  
2234 2228          ASSERT(vp != NULL);
2235 2229  
2236 2230          vpvfsentry = vn_vfslocks_getlock(vp);
2237 2231          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2238 2232  
2239 2233          vn_vfslocks_rele(vpvfsentry);
2240 2234          return (held);
2241 2235  }
2242 2236  
2243 2237  
2244 2238  int
2245 2239  vn_make_ops(
2246 2240          const char *name,                       /* Name of file system */
2247 2241          const fs_operation_def_t *templ,        /* Operation specification */
2248 2242          vnodeops_t **actual)                    /* Return the vnodeops */
2249 2243  {
2250 2244          int unused_ops;
2251 2245          int error;
2252 2246  
2253 2247          *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2254 2248  
2255 2249          (*actual)->vnop_name = name;
2256 2250  
2257 2251          error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2258 2252          if (error) {
2259 2253                  kmem_free(*actual, sizeof (vnodeops_t));
2260 2254          }
2261 2255  
2262 2256  #if DEBUG
2263 2257          if (unused_ops != 0)
2264 2258                  cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2265 2259                      "but not used", name, unused_ops);
2266 2260  #endif
2267 2261  
2268 2262          return (error);
2269 2263  }
2270 2264  
2271 2265  /*
2272 2266   * Free the vnodeops created as a result of vn_make_ops()
2273 2267   */
2274 2268  void
2275 2269  vn_freevnodeops(vnodeops_t *vnops)
2276 2270  {
2277 2271          kmem_free(vnops, sizeof (vnodeops_t));
2278 2272  }
2279 2273  
2280 2274  /*
2281 2275   * Vnode cache.
2282 2276   */
2283 2277  
2284 2278  /* ARGSUSED */
2285 2279  static int
2286 2280  vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
  
    | 
      ↓ open down ↓ | 
    620 lines elided | 
    
      ↑ open up ↑ | 
  
2287 2281  {
2288 2282          struct vnode *vp;
2289 2283  
2290 2284          vp = buf;
2291 2285  
2292 2286          mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2293 2287          mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2294 2288          cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2295 2289          rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2296 2290          vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2297      -        vp->v_path = vn_vpath_empty;
2298      -        vp->v_path_stamp = 0;
     2291 +        vp->v_path = NULL;
2299 2292          vp->v_mpssdata = NULL;
2300 2293          vp->v_vsd = NULL;
2301 2294          vp->v_fopdata = NULL;
2302 2295  
2303 2296          return (0);
2304 2297  }
2305 2298  
2306 2299  /* ARGSUSED */
2307 2300  static void
2308 2301  vn_cache_destructor(void *buf, void *cdrarg)
2309 2302  {
2310 2303          struct vnode *vp;
2311 2304  
2312 2305          vp = buf;
2313 2306  
2314 2307          rw_destroy(&vp->v_nbllock);
2315 2308          cv_destroy(&vp->v_cv);
2316 2309          mutex_destroy(&vp->v_vsd_lock);
2317 2310          mutex_destroy(&vp->v_lock);
2318 2311  }
2319 2312  
2320 2313  void
2321 2314  vn_create_cache(void)
2322 2315  {
2323 2316          /* LINTED */
2324 2317          ASSERT((1 << VNODE_ALIGN_LOG2) ==
2325 2318              P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2326 2319          vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2327 2320              VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2328 2321              NULL, 0);
2329 2322  }
2330 2323  
2331 2324  void
2332 2325  vn_destroy_cache(void)
2333 2326  {
2334 2327          kmem_cache_destroy(vn_cache);
  
    | 
      ↓ open down ↓ | 
    26 lines elided | 
    
      ↑ open up ↑ | 
  
2335 2328  }
2336 2329  
2337 2330  /*
2338 2331   * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2339 2332   * cached by the file system and vnodes remain associated.
2340 2333   */
2341 2334  void
2342 2335  vn_recycle(vnode_t *vp)
2343 2336  {
2344 2337          ASSERT(vp->v_pages == NULL);
2345      -        VERIFY(vp->v_path != NULL);
2346 2338  
2347 2339          /*
2348 2340           * XXX - This really belongs in vn_reinit(), but we have some issues
2349 2341           * with the counts.  Best to have it here for clean initialization.
2350 2342           */
2351 2343          vp->v_rdcnt = 0;
2352 2344          vp->v_wrcnt = 0;
2353 2345          vp->v_mmap_read = 0;
2354 2346          vp->v_mmap_write = 0;
2355 2347  
2356 2348          /*
2357 2349           * If FEM was in use, make sure everything gets cleaned up
  
    | 
      ↓ open down ↓ | 
    2 lines elided | 
    
      ↑ open up ↑ | 
  
2358 2350           * NOTE: vp->v_femhead is initialized to NULL in the vnode
2359 2351           * constructor.
2360 2352           */
2361 2353          if (vp->v_femhead) {
2362 2354                  /* XXX - There should be a free_femhead() that does all this */
2363 2355                  ASSERT(vp->v_femhead->femh_list == NULL);
2364 2356                  mutex_destroy(&vp->v_femhead->femh_lock);
2365 2357                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2366 2358                  vp->v_femhead = NULL;
2367 2359          }
2368      -        if (vp->v_path != vn_vpath_empty) {
     2360 +        if (vp->v_path) {
2369 2361                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2370      -                vp->v_path = vn_vpath_empty;
     2362 +                vp->v_path = NULL;
2371 2363          }
2372      -        vp->v_path_stamp = 0;
2373 2364  
2374 2365          if (vp->v_fopdata != NULL) {
2375 2366                  free_fopdata(vp);
2376 2367          }
2377 2368          vp->v_mpssdata = NULL;
2378 2369          vsd_free(vp);
2379 2370  }
2380 2371  
2381 2372  /*
2382 2373   * Used to reset the vnode fields including those that are directly accessible
2383 2374   * as well as those which require an accessor function.
2384 2375   *
2385 2376   * Does not initialize:
2386 2377   *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2387 2378   *      v_data (since FS-nodes and vnodes point to each other and should
2388 2379   *              be updated simultaneously)
2389 2380   *      v_op (in case someone needs to make a VOP call on this object)
2390 2381   */
2391 2382  void
2392 2383  vn_reinit(vnode_t *vp)
2393 2384  {
2394 2385          vp->v_count = 1;
2395 2386          vp->v_count_dnlc = 0;
2396 2387          vp->v_vfsp = NULL;
2397 2388          vp->v_stream = NULL;
2398 2389          vp->v_vfsmountedhere = NULL;
2399 2390          vp->v_flag = 0;
2400 2391          vp->v_type = VNON;
2401 2392          vp->v_rdev = NODEV;
2402 2393  
2403 2394          vp->v_filocks = NULL;
2404 2395          vp->v_shrlocks = NULL;
2405 2396          vp->v_pages = NULL;
2406 2397  
2407 2398          vp->v_locality = NULL;
2408 2399          vp->v_xattrdir = NULL;
2409 2400  
2410 2401          /* Handles v_femhead, v_path, and the r/w/map counts */
2411 2402          vn_recycle(vp);
2412 2403  }
2413 2404  
2414 2405  vnode_t *
2415 2406  vn_alloc(int kmflag)
2416 2407  {
2417 2408          vnode_t *vp;
2418 2409  
2419 2410          vp = kmem_cache_alloc(vn_cache, kmflag);
2420 2411  
2421 2412          if (vp != NULL) {
2422 2413                  vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2423 2414                  vp->v_fopdata = NULL;
2424 2415                  vn_reinit(vp);
2425 2416          }
2426 2417  
2427 2418          return (vp);
2428 2419  }
2429 2420  
2430 2421  void
2431 2422  vn_free(vnode_t *vp)
2432 2423  {
  
    | 
      ↓ open down ↓ | 
    50 lines elided | 
    
      ↑ open up ↑ | 
  
2433 2424          ASSERT(vp->v_shrlocks == NULL);
2434 2425          ASSERT(vp->v_filocks == NULL);
2435 2426  
2436 2427          /*
2437 2428           * Some file systems call vn_free() with v_count of zero,
2438 2429           * some with v_count of 1.  In any case, the value should
2439 2430           * never be anything else.
2440 2431           */
2441 2432          ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2442 2433          ASSERT(vp->v_count_dnlc == 0);
2443      -        VERIFY(vp->v_path != NULL);
2444      -        if (vp->v_path != vn_vpath_empty) {
     2434 +        if (vp->v_path != NULL) {
2445 2435                  kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2446      -                vp->v_path = vn_vpath_empty;
     2436 +                vp->v_path = NULL;
2447 2437          }
2448 2438  
2449 2439          /* If FEM was in use, make sure everything gets cleaned up */
2450 2440          if (vp->v_femhead) {
2451 2441                  /* XXX - There should be a free_femhead() that does all this */
2452 2442                  ASSERT(vp->v_femhead->femh_list == NULL);
2453 2443                  mutex_destroy(&vp->v_femhead->femh_lock);
2454 2444                  kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2455 2445                  vp->v_femhead = NULL;
2456 2446          }
2457 2447  
2458 2448          if (vp->v_fopdata != NULL) {
2459 2449                  free_fopdata(vp);
2460 2450          }
2461 2451          vp->v_mpssdata = NULL;
2462 2452          vsd_free(vp);
2463 2453          kmem_cache_free(vn_cache, vp);
2464 2454  }
2465 2455  
2466 2456  /*
2467 2457   * vnode status changes, should define better states than 1, 0.
2468 2458   */
2469 2459  void
2470 2460  vn_reclaim(vnode_t *vp)
2471 2461  {
2472 2462          vfs_t   *vfsp = vp->v_vfsp;
2473 2463  
2474 2464          if (vfsp == NULL ||
2475 2465              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2476 2466                  return;
2477 2467          }
2478 2468          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2479 2469  }
2480 2470  
2481 2471  void
2482 2472  vn_idle(vnode_t *vp)
2483 2473  {
2484 2474          vfs_t   *vfsp = vp->v_vfsp;
2485 2475  
2486 2476          if (vfsp == NULL ||
2487 2477              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2488 2478                  return;
2489 2479          }
2490 2480          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2491 2481  }
2492 2482  void
2493 2483  vn_exists(vnode_t *vp)
2494 2484  {
2495 2485          vfs_t   *vfsp = vp->v_vfsp;
2496 2486  
2497 2487          if (vfsp == NULL ||
2498 2488              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2499 2489                  return;
2500 2490          }
2501 2491          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2502 2492  }
2503 2493  
2504 2494  void
2505 2495  vn_invalid(vnode_t *vp)
2506 2496  {
2507 2497          vfs_t   *vfsp = vp->v_vfsp;
2508 2498  
2509 2499          if (vfsp == NULL ||
2510 2500              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2511 2501                  return;
2512 2502          }
2513 2503          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2514 2504  }
2515 2505  
2516 2506  /* Vnode event notification */
2517 2507  
2518 2508  int
2519 2509  vnevent_support(vnode_t *vp, caller_context_t *ct)
2520 2510  {
2521 2511          if (vp == NULL)
2522 2512                  return (EINVAL);
2523 2513  
2524 2514          return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2525 2515  }
2526 2516  
2527 2517  void
2528 2518  vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2529 2519  {
2530 2520          if (vp == NULL || vp->v_femhead == NULL) {
2531 2521                  return;
2532 2522          }
2533 2523          (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2534 2524          (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2535 2525  }
2536 2526  
2537 2527  void
2538 2528  vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2539 2529      caller_context_t *ct)
2540 2530  {
2541 2531          if (vp == NULL || vp->v_femhead == NULL) {
2542 2532                  return;
2543 2533          }
2544 2534          (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2545 2535  }
2546 2536  
2547 2537  void
2548 2538  vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2549 2539      caller_context_t *ct)
2550 2540  {
2551 2541          if (vp == NULL || vp->v_femhead == NULL) {
2552 2542                  return;
2553 2543          }
2554 2544          (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2555 2545  }
2556 2546  
2557 2547  void
2558 2548  vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2559 2549  {
2560 2550          if (vp == NULL || vp->v_femhead == NULL) {
2561 2551                  return;
2562 2552          }
2563 2553          (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2564 2554  }
2565 2555  
2566 2556  void
2567 2557  vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2568 2558  {
2569 2559          if (vp == NULL || vp->v_femhead == NULL) {
2570 2560                  return;
2571 2561          }
2572 2562          (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2573 2563  }
2574 2564  
2575 2565  void
2576 2566  vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2577 2567      caller_context_t *ct)
2578 2568  {
2579 2569          if (vp == NULL || vp->v_femhead == NULL) {
2580 2570                  return;
2581 2571          }
2582 2572          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2583 2573  }
2584 2574  
2585 2575  void
2586 2576  vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2587 2577      caller_context_t *ct)
2588 2578  {
2589 2579          if (vp == NULL || vp->v_femhead == NULL) {
2590 2580                  return;
2591 2581          }
2592 2582          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2593 2583  }
2594 2584  
2595 2585  void
2596 2586  vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2597 2587      caller_context_t *ct)
2598 2588  {
2599 2589          if (vp == NULL || vp->v_femhead == NULL) {
2600 2590                  return;
2601 2591          }
2602 2592          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2603 2593  }
2604 2594  
2605 2595  void
2606 2596  vnevent_create(vnode_t *vp, caller_context_t *ct)
2607 2597  {
2608 2598          if (vp == NULL || vp->v_femhead == NULL) {
2609 2599                  return;
2610 2600          }
2611 2601          (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2612 2602  }
2613 2603  
2614 2604  void
2615 2605  vnevent_link(vnode_t *vp, caller_context_t *ct)
2616 2606  {
2617 2607          if (vp == NULL || vp->v_femhead == NULL) {
2618 2608                  return;
2619 2609          }
2620 2610          (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2621 2611  }
2622 2612  
2623 2613  void
2624 2614  vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2625 2615  {
2626 2616          if (vp == NULL || vp->v_femhead == NULL) {
2627 2617                  return;
2628 2618          }
2629 2619          (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2630 2620  }
2631 2621  
2632 2622  void
2633 2623  vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2634 2624  {
2635 2625          if (vp == NULL || vp->v_femhead == NULL) {
2636 2626                  return;
2637 2627          }
2638 2628          (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2639 2629  }
2640 2630  
2641 2631  void
2642 2632  vnevent_resize(vnode_t *vp, caller_context_t *ct)
2643 2633  {
2644 2634          if (vp == NULL || vp->v_femhead == NULL) {
2645 2635                  return;
2646 2636          }
2647 2637          (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2648 2638  }
2649 2639  
2650 2640  /*
2651 2641   * Vnode accessors.
2652 2642   */
2653 2643  
2654 2644  int
2655 2645  vn_is_readonly(vnode_t *vp)
2656 2646  {
2657 2647          return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2658 2648  }
2659 2649  
2660 2650  int
2661 2651  vn_has_flocks(vnode_t *vp)
2662 2652  {
2663 2653          return (vp->v_filocks != NULL);
2664 2654  }
2665 2655  
2666 2656  int
2667 2657  vn_has_mandatory_locks(vnode_t *vp, int mode)
2668 2658  {
2669 2659          return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2670 2660  }
2671 2661  
2672 2662  int
2673 2663  vn_has_cached_data(vnode_t *vp)
2674 2664  {
2675 2665          return (vp->v_pages != NULL);
2676 2666  }
2677 2667  
2678 2668  /*
2679 2669   * Return 0 if the vnode in question shouldn't be permitted into a zone via
2680 2670   * zone_enter(2).
2681 2671   */
2682 2672  int
2683 2673  vn_can_change_zones(vnode_t *vp)
2684 2674  {
2685 2675          struct vfssw *vswp;
2686 2676          int allow = 1;
2687 2677          vnode_t *rvp;
2688 2678  
2689 2679          if (nfs_global_client_only != 0)
2690 2680                  return (1);
2691 2681  
2692 2682          /*
2693 2683           * We always want to look at the underlying vnode if there is one.
2694 2684           */
2695 2685          if (VOP_REALVP(vp, &rvp, NULL) != 0)
2696 2686                  rvp = vp;
2697 2687          /*
2698 2688           * Some pseudo filesystems (including doorfs) don't actually register
2699 2689           * their vfsops_t, so the following may return NULL; we happily let
2700 2690           * such vnodes switch zones.
2701 2691           */
2702 2692          vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
2703 2693          if (vswp != NULL) {
2704 2694                  if (vswp->vsw_flag & VSW_NOTZONESAFE)
2705 2695                          allow = 0;
2706 2696                  vfs_unrefvfssw(vswp);
2707 2697          }
2708 2698          return (allow);
2709 2699  }
2710 2700  
2711 2701  /*
2712 2702   * Return nonzero if the vnode is a mount point, zero if not.
2713 2703   */
2714 2704  int
2715 2705  vn_ismntpt(vnode_t *vp)
2716 2706  {
2717 2707          return (vp->v_vfsmountedhere != NULL);
2718 2708  }
2719 2709  
2720 2710  /* Retrieve the vfs (if any) mounted on this vnode */
2721 2711  vfs_t *
2722 2712  vn_mountedvfs(vnode_t *vp)
2723 2713  {
2724 2714          return (vp->v_vfsmountedhere);
2725 2715  }
2726 2716  
2727 2717  /*
2728 2718   * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2729 2719   */
2730 2720  int
2731 2721  vn_in_dnlc(vnode_t *vp)
2732 2722  {
2733 2723          return (vp->v_count_dnlc > 0);
2734 2724  }
2735 2725  
2736 2726  /*
2737 2727   * vn_has_other_opens() checks whether a particular file is opened by more than
2738 2728   * just the caller and whether the open is for read and/or write.
2739 2729   * This routine is for calling after the caller has already called VOP_OPEN()
2740 2730   * and the caller wishes to know if they are the only one with it open for
2741 2731   * the mode(s) specified.
2742 2732   *
2743 2733   * Vnode counts are only kept on regular files (v_type=VREG).
2744 2734   */
2745 2735  int
2746 2736  vn_has_other_opens(
2747 2737          vnode_t *vp,
2748 2738          v_mode_t mode)
2749 2739  {
2750 2740  
2751 2741          ASSERT(vp != NULL);
2752 2742  
2753 2743          switch (mode) {
2754 2744          case V_WRITE:
2755 2745                  if (vp->v_wrcnt > 1)
2756 2746                          return (V_TRUE);
2757 2747                  break;
2758 2748          case V_RDORWR:
2759 2749                  if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2760 2750                          return (V_TRUE);
2761 2751                  break;
2762 2752          case V_RDANDWR:
2763 2753                  if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2764 2754                          return (V_TRUE);
2765 2755                  break;
2766 2756          case V_READ:
2767 2757                  if (vp->v_rdcnt > 1)
2768 2758                          return (V_TRUE);
2769 2759                  break;
2770 2760          }
2771 2761  
2772 2762          return (V_FALSE);
2773 2763  }
2774 2764  
2775 2765  /*
2776 2766   * vn_is_opened() checks whether a particular file is opened and
2777 2767   * whether the open is for read and/or write.
2778 2768   *
2779 2769   * Vnode counts are only kept on regular files (v_type=VREG).
2780 2770   */
2781 2771  int
2782 2772  vn_is_opened(
2783 2773          vnode_t *vp,
2784 2774          v_mode_t mode)
2785 2775  {
2786 2776  
2787 2777          ASSERT(vp != NULL);
2788 2778  
2789 2779          switch (mode) {
2790 2780          case V_WRITE:
2791 2781                  if (vp->v_wrcnt)
2792 2782                          return (V_TRUE);
2793 2783                  break;
2794 2784          case V_RDANDWR:
2795 2785                  if (vp->v_rdcnt && vp->v_wrcnt)
2796 2786                          return (V_TRUE);
2797 2787                  break;
2798 2788          case V_RDORWR:
2799 2789                  if (vp->v_rdcnt || vp->v_wrcnt)
2800 2790                          return (V_TRUE);
2801 2791                  break;
2802 2792          case V_READ:
2803 2793                  if (vp->v_rdcnt)
2804 2794                          return (V_TRUE);
2805 2795                  break;
2806 2796          }
2807 2797  
2808 2798          return (V_FALSE);
2809 2799  }
2810 2800  
2811 2801  /*
2812 2802   * vn_is_mapped() checks whether a particular file is mapped and whether
2813 2803   * the file is mapped read and/or write.
2814 2804   */
2815 2805  int
2816 2806  vn_is_mapped(
2817 2807          vnode_t *vp,
2818 2808          v_mode_t mode)
2819 2809  {
2820 2810  
2821 2811          ASSERT(vp != NULL);
2822 2812  
2823 2813  #if !defined(_LP64)
2824 2814          switch (mode) {
2825 2815          /*
2826 2816           * The atomic_add_64_nv functions force atomicity in the
2827 2817           * case of 32 bit architectures. Otherwise the 64 bit values
2828 2818           * require two fetches. The value of the fields may be
2829 2819           * (potentially) changed between the first fetch and the
2830 2820           * second
2831 2821           */
2832 2822          case V_WRITE:
2833 2823                  if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
2834 2824                          return (V_TRUE);
2835 2825                  break;
2836 2826          case V_RDANDWR:
2837 2827                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
2838 2828                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2839 2829                          return (V_TRUE);
2840 2830                  break;
2841 2831          case V_RDORWR:
2842 2832                  if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
2843 2833                      (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
2844 2834                          return (V_TRUE);
2845 2835                  break;
2846 2836          case V_READ:
2847 2837                  if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
2848 2838                          return (V_TRUE);
2849 2839                  break;
2850 2840          }
2851 2841  #else
2852 2842          switch (mode) {
2853 2843          case V_WRITE:
2854 2844                  if (vp->v_mmap_write)
2855 2845                          return (V_TRUE);
2856 2846                  break;
2857 2847          case V_RDANDWR:
2858 2848                  if (vp->v_mmap_read && vp->v_mmap_write)
2859 2849                          return (V_TRUE);
2860 2850                  break;
2861 2851          case V_RDORWR:
2862 2852                  if (vp->v_mmap_read || vp->v_mmap_write)
2863 2853                          return (V_TRUE);
2864 2854                  break;
2865 2855          case V_READ:
2866 2856                  if (vp->v_mmap_read)
2867 2857                          return (V_TRUE);
2868 2858                  break;
2869 2859          }
2870 2860  #endif
2871 2861  
2872 2862          return (V_FALSE);
2873 2863  }
2874 2864  
2875 2865  /*
2876 2866   * Set the operations vector for a vnode.
2877 2867   *
2878 2868   * FEM ensures that the v_femhead pointer is filled in before the
2879 2869   * v_op pointer is changed.  This means that if the v_femhead pointer
2880 2870   * is NULL, and the v_op field hasn't changed since we checked
2881 2871   * the v_femhead pointer, then our update is ok - we are not racing with
2882 2872   * FEM.
2883 2873   */
2884 2874  void
2885 2875  vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
2886 2876  {
2887 2877          vnodeops_t      *op;
2888 2878  
2889 2879          ASSERT(vp != NULL);
2890 2880          ASSERT(vnodeops != NULL);
2891 2881  
2892 2882          op = vp->v_op;
2893 2883          membar_consumer();
2894 2884          /*
2895 2885           * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
2896 2886           * the compare-and-swap on vp->v_op.  If either fails, then FEM is
2897 2887           * in effect on the vnode and we need to have FEM deal with it.
2898 2888           */
2899 2889          if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
2900 2890              op) {
2901 2891                  fem_setvnops(vp, vnodeops);
2902 2892          }
2903 2893  }
2904 2894  
2905 2895  /*
2906 2896   * Retrieve the operations vector for a vnode
2907 2897   * As with vn_setops(above); make sure we aren't racing with FEM.
2908 2898   * FEM sets the v_op to a special, internal, vnodeops that wouldn't
2909 2899   * make sense to the callers of this routine.
2910 2900   */
2911 2901  vnodeops_t *
2912 2902  vn_getops(vnode_t *vp)
2913 2903  {
2914 2904          vnodeops_t      *op;
2915 2905  
2916 2906          ASSERT(vp != NULL);
2917 2907  
2918 2908          op = vp->v_op;
2919 2909          membar_consumer();
2920 2910          if (vp->v_femhead == NULL && op == vp->v_op) {
2921 2911                  return (op);
2922 2912          } else {
2923 2913                  return (fem_getvnops(vp));
2924 2914          }
2925 2915  }
2926 2916  
2927 2917  /*
2928 2918   * Returns non-zero (1) if the vnodeops matches that of the vnode.
2929 2919   * Returns zero (0) if not.
2930 2920   */
2931 2921  int
2932 2922  vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2933 2923  {
2934 2924          return (vn_getops(vp) == vnodeops);
2935 2925  }
2936 2926  
2937 2927  /*
2938 2928   * Returns non-zero (1) if the specified operation matches the
2939 2929   * corresponding operation for the vnode.
2940 2930   * Returns zero (0) if not.
2941 2931   */
2942 2932  
2943 2933  #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2944 2934  
2945 2935  int
2946 2936  vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2947 2937  {
2948 2938          const fs_operation_trans_def_t *otdp;
2949 2939          fs_generic_func_p *loc = NULL;
2950 2940          vnodeops_t      *vop = vn_getops(vp);
2951 2941  
2952 2942          ASSERT(vopname != NULL);
2953 2943  
2954 2944          for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2955 2945                  if (MATCHNAME(otdp->name, vopname)) {
2956 2946                          loc = (fs_generic_func_p *)
2957 2947                              ((char *)(vop) + otdp->offset);
2958 2948                          break;
2959 2949                  }
2960 2950          }
2961 2951  
2962 2952          return ((loc != NULL) && (*loc == funcp));
2963 2953  }
2964 2954  
2965 2955  /*
2966 2956   * fs_new_caller_id() needs to return a unique ID on a given local system.
2967 2957   * The IDs do not need to survive across reboots.  These are primarily
2968 2958   * used so that (FEM) monitors can detect particular callers (such as
2969 2959   * the NFS server) to a given vnode/vfs operation.
  
    | 
      ↓ open down ↓ | 
    513 lines elided | 
    
      ↑ open up ↑ | 
  
2970 2960   */
2971 2961  u_longlong_t
2972 2962  fs_new_caller_id()
2973 2963  {
2974 2964          static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2975 2965  
2976 2966          return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2977 2967  }
2978 2968  
2979 2969  /*
2980      - * The value stored in v_path is relative to rootdir, located in the global
2981      - * zone.  Zones or chroot environments which reside deeper inside the VFS
2982      - * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2983      - * what lies below their perceived root.  In order to keep v_path usable for
2984      - * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2985      - *
2986      - * An upper bound of max_vnode_path is placed upon v_path allocations to
2987      - * prevent the system from going too wild at the behest of pathological
2988      - * behavior from the operator.
     2970 + * Given a starting vnode and a path, updates the path in the target vnode in
     2971 + * a safe manner.  If the vnode already has path information embedded, then the
     2972 + * cached path is left untouched.
2989 2973   */
     2974 +
2990 2975  size_t max_vnode_path = 4 * MAXPATHLEN;
2991 2976  
2992      -
2993 2977  void
2994      -vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
     2978 +vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
     2979 +    const char *path, size_t plen)
2995 2980  {
2996      -        char *buf;
     2981 +        char    *rpath;
     2982 +        vnode_t *base;
     2983 +        size_t  rpathlen, rpathalloc;
     2984 +        int     doslash = 1;
2997 2985  
2998      -        mutex_enter(&vp->v_lock);
     2986 +        if (*path == '/') {
     2987 +                base = rootvp;
     2988 +                path++;
     2989 +                plen--;
     2990 +        } else {
     2991 +                base = startvp;
     2992 +        }
     2993 +
2999 2994          /*
3000      -         * If the snapshot of v_path_stamp passed in via compare_stamp does not
3001      -         * match the present value on the vnode, it indicates that subsequent
3002      -         * changes have occurred.  The v_path value is not cleared in this case
3003      -         * since the new value may be valid.
     2995 +         * We cannot grab base->v_lock while we hold vp->v_lock because of
     2996 +         * the potential for deadlock.
3004 2997           */
3005      -        if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3006      -                mutex_exit(&vp->v_lock);
     2998 +        mutex_enter(&base->v_lock);
     2999 +        if (base->v_path == NULL) {
     3000 +                mutex_exit(&base->v_lock);
3007 3001                  return;
3008 3002          }
3009      -        buf = vp->v_path;
3010      -        vp->v_path = vn_vpath_empty;
3011      -        vp->v_path_stamp = 0;
3012      -        mutex_exit(&vp->v_lock);
3013      -        if (buf != vn_vpath_empty) {
3014      -                kmem_free(buf, strlen(buf) + 1);
3015      -        }
3016      -}
3017 3003  
3018      -static void
3019      -vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3020      -    boolean_t is_rename)
3021      -{
3022      -        char *buf, *oldbuf;
3023      -        hrtime_t pstamp;
3024      -        size_t baselen, buflen = 0;
     3004 +        rpathlen = strlen(base->v_path);
     3005 +        rpathalloc = rpathlen + plen + 1;
     3006 +        /* Avoid adding a slash if there's already one there */
     3007 +        if (base->v_path[rpathlen-1] == '/')
     3008 +                doslash = 0;
     3009 +        else
     3010 +                rpathalloc++;
3025 3011  
3026      -        /* Handle the vn_setpath_str case. */
3027      -        if (pvp == NULL) {
3028      -                if (len + 1 > max_vnode_path) {
3029      -                        DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3030      -                            vnode_t *, vp, char *, name, size_t, len + 1);
3031      -                        return;
3032      -                }
3033      -                buf = kmem_alloc(len + 1, KM_SLEEP);
3034      -                bcopy(name, buf, len);
3035      -                buf[len] = '\0';
     3012 +        /*
     3013 +         * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
     3014 +         * so we must do this dance.  If, by chance, something changes the path,
     3015 +         * just give up since there is no real harm.
     3016 +         */
     3017 +        mutex_exit(&base->v_lock);
3036 3018  
3037      -                mutex_enter(&vp->v_lock);
3038      -                oldbuf = vp->v_path;
3039      -                vp->v_path = buf;
3040      -                vp->v_path_stamp = gethrtime();
3041      -                mutex_exit(&vp->v_lock);
3042      -                if (oldbuf != vn_vpath_empty) {
3043      -                        kmem_free(oldbuf, strlen(oldbuf) + 1);
3044      -                }
     3019 +        /* Paths should stay within reason */
     3020 +        if (rpathalloc > max_vnode_path)
3045 3021                  return;
3046      -        }
3047 3022  
3048      -        /* Take snapshot of parent dir */
3049      -        mutex_enter(&pvp->v_lock);
3050      -retrybuf:
3051      -        if (pvp->v_path == vn_vpath_empty) {
3052      -                /*
3053      -                 * Without v_path from the parent directory, generating a child
3054      -                 * path from the name is impossible.
3055      -                 */
3056      -                if (len > 0) {
3057      -                        pstamp = pvp->v_path_stamp;
3058      -                        mutex_exit(&pvp->v_lock);
3059      -                        vn_clearpath(vp, pstamp);
3060      -                        return;
3061      -                }
     3023 +        rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3062 3024  
3063      -                /*
3064      -                 * The only feasible case here is where a NUL lookup is being
3065      -                 * performed on rootdir prior to its v_path being populated.
3066      -                 */
3067      -                ASSERT(pvp->v_path_stamp == 0);
3068      -                baselen = 0;
3069      -                pstamp = 0;
3070      -        } else {
3071      -                pstamp = pvp->v_path_stamp;
3072      -                baselen = strlen(pvp->v_path);
3073      -                /* ignore a trailing slash if present */
3074      -                if (pvp->v_path[baselen - 1] == '/') {
3075      -                        /* This should only be the case for rootdir */
3076      -                        ASSERT(baselen == 1 && pvp == rootdir);
3077      -                        baselen--;
3078      -                }
3079      -        }
3080      -        mutex_exit(&pvp->v_lock);
3081      -
3082      -        if (buflen != 0) {
3083      -                /* Free the existing (mis-sized) buffer in case of retry */
3084      -                kmem_free(buf, buflen);
3085      -        }
3086      -        /* base, '/', name and trailing NUL */
3087      -        buflen = baselen + len + 2;
3088      -        if (buflen > max_vnode_path) {
3089      -                DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3090      -                    vnode_t *, vp, char *, name, size_t, buflen);
     3025 +        mutex_enter(&base->v_lock);
     3026 +        if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
     3027 +                mutex_exit(&base->v_lock);
     3028 +                kmem_free(rpath, rpathalloc);
3091 3029                  return;
3092 3030          }
3093      -        buf = kmem_alloc(buflen, KM_SLEEP);
     3031 +        bcopy(base->v_path, rpath, rpathlen);
     3032 +        mutex_exit(&base->v_lock);
3094 3033  
3095      -        mutex_enter(&pvp->v_lock);
3096      -        if (pvp->v_path_stamp != pstamp) {
3097      -                size_t vlen;
     3034 +        if (doslash)
     3035 +                rpath[rpathlen++] = '/';
     3036 +        bcopy(path, rpath + rpathlen, plen);
     3037 +        rpath[rpathlen + plen] = '\0';
3098 3038  
3099      -                /*
3100      -                 * Since v_path_stamp changed on the parent, it is likely that
3101      -                 * v_path has been altered as well.  If the length does not
3102      -                 * exactly match what was previously measured, the buffer
3103      -                 * allocation must be repeated for proper sizing.
3104      -                 */
3105      -                if (pvp->v_path == vn_vpath_empty) {
3106      -                        /* Give up if parent lacks v_path */
3107      -                        mutex_exit(&pvp->v_lock);
3108      -                        kmem_free(buf, buflen);
3109      -                        return;
3110      -                }
3111      -                vlen = strlen(pvp->v_path);
3112      -                if (pvp->v_path[vlen - 1] == '/') {
3113      -                        vlen--;
3114      -                }
3115      -                if (vlen != baselen) {
3116      -                        goto retrybuf;
3117      -                }
3118      -        }
3119      -        bcopy(pvp->v_path, buf, baselen);
3120      -        mutex_exit(&pvp->v_lock);
3121      -
3122      -        buf[baselen] = '/';
3123      -        baselen++;
3124      -        bcopy(name, &buf[baselen], len + 1);
3125      -
3126 3039          mutex_enter(&vp->v_lock);
3127      -        if (vp->v_path_stamp == 0) {
3128      -                /* never-visited vnode can inherit stamp from parent */
3129      -                ASSERT(vp->v_path == vn_vpath_empty);
3130      -                vp->v_path_stamp = pstamp;
3131      -                vp->v_path = buf;
     3040 +        if (vp->v_path != NULL) {
3132 3041                  mutex_exit(&vp->v_lock);
3133      -        } else if (vp->v_path_stamp < pstamp || is_rename) {
3134      -                /*
3135      -                 * Install the updated path and stamp, ensuring that the v_path
3136      -                 * pointer is valid at all times for dtrace.
3137      -                 */
3138      -                oldbuf = vp->v_path;
3139      -                vp->v_path = buf;
3140      -                vp->v_path_stamp = gethrtime();
3141      -                mutex_exit(&vp->v_lock);
3142      -                kmem_free(oldbuf, strlen(oldbuf) + 1);
     3042 +                kmem_free(rpath, rpathalloc);
3143 3043          } else {
3144      -                /*
3145      -                 * If the timestamp matches or is greater, it means another
3146      -                 * thread performed the update first while locks were dropped
3147      -                 * here to make the allocation.  We defer to the newer value.
3148      -                 */
     3044 +                vp->v_path = rpath;
3149 3045                  mutex_exit(&vp->v_lock);
3150      -                kmem_free(buf, buflen);
3151 3046          }
3152      -        ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3153 3047  }
3154 3048  
     3049 +/*
     3050 + * Sets the path to the vnode to be the given string, regardless of current
     3051 + * context.  The string must be a complete path from rootdir.  This is only used
     3052 + * by fsop_root() for setting the path based on the mountpoint.
     3053 + */
3155 3054  void
3156      -vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
     3055 +vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3157 3056  {
3158      -        size_t len;
     3057 +        char *buf = kmem_alloc(len + 1, KM_SLEEP);
3159 3058  
3160      -        /*
3161      -         * If the parent is older or empty, there's nothing further to do.
3162      -         */
3163      -        if (pvp->v_path == vn_vpath_empty ||
3164      -            pvp->v_path_stamp <= vp->v_path_stamp) {
     3059 +        mutex_enter(&vp->v_lock);
     3060 +        if (vp->v_path != NULL) {
     3061 +                mutex_exit(&vp->v_lock);
     3062 +                kmem_free(buf, len + 1);
3165 3063                  return;
3166 3064          }
3167 3065  
3168      -        /*
3169      -         * Given the lack of appropriate context, meaningful updates to v_path
3170      -         * cannot be made during lookups for the '.' or '..' entries.
3171      -         */
3172      -        len = strlen(name);
3173      -        if (len == 0 || (len == 1 && name[0] == '.') ||
3174      -            (len == 2 && name[0] == '.' && name[1] == '.')) {
3175      -                return;
3176      -        }
     3066 +        vp->v_path = buf;
     3067 +        bcopy(str, vp->v_path, len);
     3068 +        vp->v_path[len] = '\0';
3177 3069  
3178      -        vn_setpath_common(pvp, vp, name, len, B_FALSE);
     3070 +        mutex_exit(&vp->v_lock);
3179 3071  }
3180 3072  
3181 3073  /*
3182      - * Given a starting vnode and a path, updates the path in the target vnode in
3183      - * a safe manner.  If the vnode already has path information embedded, then the
3184      - * cached path is left untouched.
3185      - */
3186      -/* ARGSUSED */
3187      -void
3188      -vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3189      -    size_t len)
3190      -{
3191      -        vn_setpath_common(pvp, vp, name, len, B_FALSE);
3192      -}
3193      -
3194      -/*
3195      - * Sets the path to the vnode to be the given string, regardless of current
3196      - * context.  The string must be a complete path from rootdir.  This is only used
3197      - * by fsop_root() for setting the path based on the mountpoint.
3198      - */
3199      -void
3200      -vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3201      -{
3202      -        vn_setpath_common(NULL, vp, str, len, B_FALSE);
3203      -}
3204      -
3205      -/*
3206 3074   * Called from within filesystem's vop_rename() to handle renames once the
3207 3075   * target vnode is available.
3208 3076   */
3209 3077  void
3210      -vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
     3078 +vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3211 3079  {
3212      -        vn_setpath_common(pvp, vp, name, len, B_TRUE);
     3080 +        char *tmp;
     3081 +
     3082 +        mutex_enter(&vp->v_lock);
     3083 +        tmp = vp->v_path;
     3084 +        vp->v_path = NULL;
     3085 +        mutex_exit(&vp->v_lock);
     3086 +        vn_setpath(rootdir, dvp, vp, nm, len);
     3087 +        if (tmp != NULL)
     3088 +                kmem_free(tmp, strlen(tmp) + 1);
3213 3089  }
3214 3090  
3215 3091  /*
3216 3092   * Similar to vn_setpath_str(), this function sets the path of the destination
3217 3093   * vnode to the be the same as the source vnode.
3218 3094   */
3219 3095  void
3220 3096  vn_copypath(struct vnode *src, struct vnode *dst)
3221 3097  {
3222 3098          char *buf;
3223      -        hrtime_t stamp;
3224      -        size_t buflen;
     3099 +        int alloc;
3225 3100  
3226 3101          mutex_enter(&src->v_lock);
3227      -        if (src->v_path == vn_vpath_empty) {
     3102 +        if (src->v_path == NULL) {
3228 3103                  mutex_exit(&src->v_lock);
3229 3104                  return;
3230 3105          }
3231      -        buflen = strlen(src->v_path) + 1;
3232      -        mutex_exit(&src->v_lock);
     3106 +        alloc = strlen(src->v_path) + 1;
3233 3107  
3234      -        buf = kmem_alloc(buflen, KM_SLEEP);
3235      -
     3108 +        /* avoid kmem_alloc() with lock held */
     3109 +        mutex_exit(&src->v_lock);
     3110 +        buf = kmem_alloc(alloc, KM_SLEEP);
3236 3111          mutex_enter(&src->v_lock);
3237      -        if (src->v_path == vn_vpath_empty ||
3238      -            strlen(src->v_path) + 1 != buflen) {
     3112 +        if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3239 3113                  mutex_exit(&src->v_lock);
3240      -                kmem_free(buf, buflen);
     3114 +                kmem_free(buf, alloc);
3241 3115                  return;
3242 3116          }
3243      -        bcopy(src->v_path, buf, buflen);
3244      -        stamp = src->v_path_stamp;
     3117 +        bcopy(src->v_path, buf, alloc);
3245 3118          mutex_exit(&src->v_lock);
3246 3119  
3247 3120          mutex_enter(&dst->v_lock);
3248      -        if (dst->v_path != vn_vpath_empty) {
     3121 +        if (dst->v_path != NULL) {
3249 3122                  mutex_exit(&dst->v_lock);
3250      -                kmem_free(buf, buflen);
     3123 +                kmem_free(buf, alloc);
3251 3124                  return;
3252 3125          }
3253 3126          dst->v_path = buf;
3254      -        dst->v_path_stamp = stamp;
3255 3127          mutex_exit(&dst->v_lock);
3256 3128  }
3257 3129  
3258      -
3259 3130  /*
3260 3131   * XXX Private interface for segvn routines that handle vnode
3261 3132   * large page segments.
3262 3133   *
3263 3134   * return 1 if vp's file system VOP_PAGEIO() implementation
3264 3135   * can be safely used instead of VOP_GETPAGE() for handling
3265 3136   * pagefaults against regular non swap files. VOP_PAGEIO()
3266 3137   * interface is considered safe here if its implementation
3267 3138   * is very close to VOP_GETPAGE() implementation.
3268 3139   * e.g. It zero's out the part of the page beyond EOF. Doesn't
3269 3140   * panic if there're file holes but instead returns an error.
3270 3141   * Doesn't assume file won't be changed by user writes, etc.
3271 3142   *
3272 3143   * return 0 otherwise.
3273 3144   *
3274 3145   * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3275 3146   */
3276 3147  int
3277 3148  vn_vmpss_usepageio(vnode_t *vp)
3278 3149  {
3279 3150          vfs_t   *vfsp = vp->v_vfsp;
3280 3151          char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3281 3152          char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3282 3153          char **fsok = pageio_ok_fss;
3283 3154  
3284 3155          if (fsname == NULL) {
3285 3156                  return (0);
3286 3157          }
3287 3158  
3288 3159          for (; *fsok; fsok++) {
3289 3160                  if (strcmp(*fsok, fsname) == 0) {
3290 3161                          return (1);
3291 3162                  }
3292 3163          }
3293 3164          return (0);
3294 3165  }
3295 3166  
3296 3167  /* VOP_XXX() macros call the corresponding fop_xxx() function */
3297 3168  
3298 3169  int
3299 3170  fop_open(
3300 3171          vnode_t **vpp,
3301 3172          int mode,
3302 3173          cred_t *cr,
3303 3174          caller_context_t *ct)
3304 3175  {
3305 3176          int ret;
3306 3177          vnode_t *vp = *vpp;
3307 3178  
3308 3179          VN_HOLD(vp);
3309 3180          /*
3310 3181           * Adding to the vnode counts before calling open
3311 3182           * avoids the need for a mutex. It circumvents a race
3312 3183           * condition where a query made on the vnode counts results in a
3313 3184           * false negative. The inquirer goes away believing the file is
3314 3185           * not open when there is an open on the file already under way.
3315 3186           *
3316 3187           * The counts are meant to prevent NFS from granting a delegation
3317 3188           * when it would be dangerous to do so.
3318 3189           *
3319 3190           * The vnode counts are only kept on regular files
3320 3191           */
3321 3192          if ((*vpp)->v_type == VREG) {
3322 3193                  if (mode & FREAD)
3323 3194                          atomic_inc_32(&(*vpp)->v_rdcnt);
3324 3195                  if (mode & FWRITE)
3325 3196                          atomic_inc_32(&(*vpp)->v_wrcnt);
3326 3197          }
3327 3198  
3328 3199          VOPXID_MAP_CR(vp, cr);
3329 3200  
3330 3201          ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
3331 3202  
3332 3203          if (ret) {
3333 3204                  /*
3334 3205                   * Use the saved vp just in case the vnode ptr got trashed
3335 3206                   * by the error.
3336 3207                   */
3337 3208                  VOPSTATS_UPDATE(vp, open);
3338 3209                  if ((vp->v_type == VREG) && (mode & FREAD))
3339 3210                          atomic_dec_32(&vp->v_rdcnt);
3340 3211                  if ((vp->v_type == VREG) && (mode & FWRITE))
3341 3212                          atomic_dec_32(&vp->v_wrcnt);
3342 3213          } else {
3343 3214                  /*
3344 3215                   * Some filesystems will return a different vnode,
3345 3216                   * but the same path was still used to open it.
3346 3217                   * So if we do change the vnode and need to
3347 3218                   * copy over the path, do so here, rather than special
3348 3219                   * casing each filesystem. Adjust the vnode counts to
3349 3220                   * reflect the vnode switch.
3350 3221                   */
3351 3222                  VOPSTATS_UPDATE(*vpp, open);
3352 3223                  if (*vpp != vp && *vpp != NULL) {
3353 3224                          vn_copypath(vp, *vpp);
3354 3225                          if (((*vpp)->v_type == VREG) && (mode & FREAD))
3355 3226                                  atomic_inc_32(&(*vpp)->v_rdcnt);
3356 3227                          if ((vp->v_type == VREG) && (mode & FREAD))
3357 3228                                  atomic_dec_32(&vp->v_rdcnt);
3358 3229                          if (((*vpp)->v_type == VREG) && (mode & FWRITE))
3359 3230                                  atomic_inc_32(&(*vpp)->v_wrcnt);
3360 3231                          if ((vp->v_type == VREG) && (mode & FWRITE))
3361 3232                                  atomic_dec_32(&vp->v_wrcnt);
3362 3233                  }
3363 3234          }
3364 3235          VN_RELE(vp);
3365 3236          return (ret);
3366 3237  }
3367 3238  
3368 3239  int
3369 3240  fop_close(
3370 3241          vnode_t *vp,
3371 3242          int flag,
3372 3243          int count,
3373 3244          offset_t offset,
3374 3245          cred_t *cr,
3375 3246          caller_context_t *ct)
3376 3247  {
3377 3248          int err;
3378 3249  
3379 3250          VOPXID_MAP_CR(vp, cr);
3380 3251  
3381 3252          err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3382 3253          VOPSTATS_UPDATE(vp, close);
3383 3254          /*
3384 3255           * Check passed in count to handle possible dups. Vnode counts are only
3385 3256           * kept on regular files
3386 3257           */
3387 3258          if ((vp->v_type == VREG) && (count == 1))  {
3388 3259                  if (flag & FREAD) {
3389 3260                          ASSERT(vp->v_rdcnt > 0);
3390 3261                          atomic_dec_32(&vp->v_rdcnt);
3391 3262                  }
3392 3263                  if (flag & FWRITE) {
3393 3264                          ASSERT(vp->v_wrcnt > 0);
3394 3265                          atomic_dec_32(&vp->v_wrcnt);
3395 3266                  }
3396 3267          }
3397 3268          return (err);
3398 3269  }
3399 3270  
3400 3271  int
3401 3272  fop_read(
3402 3273          vnode_t *vp,
3403 3274          uio_t *uiop,
3404 3275          int ioflag,
3405 3276          cred_t *cr,
3406 3277          caller_context_t *ct)
3407 3278  {
3408 3279          ssize_t resid_start = uiop->uio_resid;
3409 3280          zone_t  *zonep = curzone;
3410 3281          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3411 3282  
3412 3283          hrtime_t start = 0, lat;
3413 3284          ssize_t len;
3414 3285          int err;
3415 3286  
3416 3287          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3417 3288              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3418 3289                  start = gethrtime();
3419 3290  
3420 3291                  mutex_enter(&zonep->zone_vfs_lock);
3421 3292                  kstat_runq_enter(&zonep->zone_vfs_rwstats);
3422 3293                  mutex_exit(&zonep->zone_vfs_lock);
3423 3294          }
3424 3295  
3425 3296          VOPXID_MAP_CR(vp, cr);
3426 3297  
3427 3298          err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3428 3299          len = resid_start - uiop->uio_resid;
3429 3300  
3430 3301          VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3431 3302  
3432 3303          if (start != 0) {
3433 3304                  mutex_enter(&zonep->zone_vfs_lock);
3434 3305                  zonep->zone_vfs_rwstats.reads++;
3435 3306                  zonep->zone_vfs_rwstats.nread += len;
3436 3307                  kstat_runq_exit(&zonep->zone_vfs_rwstats);
  
    | 
      ↓ open down ↓ | 
    168 lines elided | 
    
      ↑ open up ↑ | 
  
3437 3308                  mutex_exit(&zonep->zone_vfs_lock);
3438 3309  
3439 3310                  lat = gethrtime() - start;
3440 3311  
3441 3312                  if (lat >= VOP_LATENCY_10MS) {
3442 3313                          if (lat < VOP_LATENCY_100MS)
3443 3314                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3444 3315                          else if (lat < VOP_LATENCY_1S) {
3445 3316                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3446 3317                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3447      -                        } else if (lat < VOP_LATENCY_10S) {
3448      -                                atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3449      -                                atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3450      -                                atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3451 3318                          } else {
3452 3319                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3453 3320                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3454 3321                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3455      -                                atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3456 3322                          }
3457 3323                  }
3458 3324          }
3459 3325  
3460 3326          return (err);
3461 3327  }
3462 3328  
3463 3329  int
3464 3330  fop_write(
3465 3331          vnode_t *vp,
3466 3332          uio_t *uiop,
3467 3333          int ioflag,
3468 3334          cred_t *cr,
3469 3335          caller_context_t *ct)
3470 3336  {
3471 3337          ssize_t resid_start = uiop->uio_resid;
3472 3338          zone_t  *zonep = curzone;
3473 3339          zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3474 3340  
3475 3341          hrtime_t start = 0, lat;
3476 3342          ssize_t len;
3477 3343          int     err;
3478 3344  
3479 3345          /*
3480 3346           * For the purposes of VFS kstat consumers, the "waitq" calculation is
3481 3347           * repurposed as the active queue for VFS write operations.  There's no
3482 3348           * actual wait queue for VFS operations.
3483 3349           */
3484 3350          if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3485 3351              vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3486 3352                  start = gethrtime();
3487 3353  
3488 3354                  mutex_enter(&zonep->zone_vfs_lock);
3489 3355                  kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3490 3356                  mutex_exit(&zonep->zone_vfs_lock);
3491 3357          }
3492 3358  
3493 3359          VOPXID_MAP_CR(vp, cr);
3494 3360  
3495 3361          err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3496 3362          len = resid_start - uiop->uio_resid;
3497 3363  
3498 3364          VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3499 3365  
3500 3366          if (start != 0) {
3501 3367                  mutex_enter(&zonep->zone_vfs_lock);
3502 3368                  zonep->zone_vfs_rwstats.writes++;
3503 3369                  zonep->zone_vfs_rwstats.nwritten += len;
3504 3370                  kstat_waitq_exit(&zonep->zone_vfs_rwstats);
  
    | 
      ↓ open down ↓ | 
    39 lines elided | 
    
      ↑ open up ↑ | 
  
3505 3371                  mutex_exit(&zonep->zone_vfs_lock);
3506 3372  
3507 3373                  lat = gethrtime() - start;
3508 3374  
3509 3375                  if (lat >= VOP_LATENCY_10MS) {
3510 3376                          if (lat < VOP_LATENCY_100MS)
3511 3377                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3512 3378                          else if (lat < VOP_LATENCY_1S) {
3513 3379                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3514 3380                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3515      -                        } else if (lat < VOP_LATENCY_10S) {
3516      -                                atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3517      -                                atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3518      -                                atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3519 3381                          } else {
3520 3382                                  atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3521 3383                                  atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3522 3384                                  atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3523      -                                atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3524 3385                          }
3525 3386                  }
3526 3387          }
3527 3388  
3528 3389          return (err);
3529 3390  }
3530 3391  
3531 3392  int
3532 3393  fop_ioctl(
3533 3394          vnode_t *vp,
3534 3395          int cmd,
3535 3396          intptr_t arg,
3536 3397          int flag,
3537 3398          cred_t *cr,
3538 3399          int *rvalp,
3539 3400          caller_context_t *ct)
3540 3401  {
3541 3402          int     err;
3542 3403  
3543 3404          VOPXID_MAP_CR(vp, cr);
3544 3405  
3545 3406          err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3546 3407          VOPSTATS_UPDATE(vp, ioctl);
3547 3408          return (err);
3548 3409  }
3549 3410  
3550 3411  int
3551 3412  fop_setfl(
3552 3413          vnode_t *vp,
3553 3414          int oflags,
3554 3415          int nflags,
3555 3416          cred_t *cr,
3556 3417          caller_context_t *ct)
3557 3418  {
3558 3419          int     err;
3559 3420  
3560 3421          VOPXID_MAP_CR(vp, cr);
3561 3422  
3562 3423          err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3563 3424          VOPSTATS_UPDATE(vp, setfl);
3564 3425          return (err);
3565 3426  }
3566 3427  
3567 3428  int
3568 3429  fop_getattr(
3569 3430          vnode_t *vp,
3570 3431          vattr_t *vap,
3571 3432          int flags,
3572 3433          cred_t *cr,
3573 3434          caller_context_t *ct)
3574 3435  {
3575 3436          int     err;
3576 3437  
3577 3438          VOPXID_MAP_CR(vp, cr);
3578 3439  
3579 3440          /*
3580 3441           * If this file system doesn't understand the xvattr extensions
3581 3442           * then turn off the xvattr bit.
3582 3443           */
3583 3444          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3584 3445                  vap->va_mask &= ~AT_XVATTR;
3585 3446          }
3586 3447  
3587 3448          /*
3588 3449           * We're only allowed to skip the ACL check iff we used a 32 bit
3589 3450           * ACE mask with VOP_ACCESS() to determine permissions.
3590 3451           */
3591 3452          if ((flags & ATTR_NOACLCHECK) &&
3592 3453              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3593 3454                  return (EINVAL);
3594 3455          }
3595 3456          err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
3596 3457          VOPSTATS_UPDATE(vp, getattr);
3597 3458          return (err);
3598 3459  }
3599 3460  
3600 3461  int
3601 3462  fop_setattr(
3602 3463          vnode_t *vp,
3603 3464          vattr_t *vap,
3604 3465          int flags,
3605 3466          cred_t *cr,
3606 3467          caller_context_t *ct)
3607 3468  {
3608 3469          int     err;
3609 3470  
3610 3471          VOPXID_MAP_CR(vp, cr);
3611 3472  
3612 3473          /*
3613 3474           * If this file system doesn't understand the xvattr extensions
3614 3475           * then turn off the xvattr bit.
3615 3476           */
3616 3477          if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
3617 3478                  vap->va_mask &= ~AT_XVATTR;
3618 3479          }
3619 3480  
3620 3481          /*
3621 3482           * We're only allowed to skip the ACL check iff we used a 32 bit
3622 3483           * ACE mask with VOP_ACCESS() to determine permissions.
3623 3484           */
3624 3485          if ((flags & ATTR_NOACLCHECK) &&
3625 3486              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3626 3487                  return (EINVAL);
3627 3488          }
3628 3489          err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
3629 3490          VOPSTATS_UPDATE(vp, setattr);
3630 3491          return (err);
3631 3492  }
3632 3493  
3633 3494  int
3634 3495  fop_access(
3635 3496          vnode_t *vp,
3636 3497          int mode,
3637 3498          int flags,
3638 3499          cred_t *cr,
3639 3500          caller_context_t *ct)
3640 3501  {
3641 3502          int     err;
3642 3503  
3643 3504          if ((flags & V_ACE_MASK) &&
3644 3505              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3645 3506                  return (EINVAL);
3646 3507          }
3647 3508  
3648 3509          VOPXID_MAP_CR(vp, cr);
3649 3510  
3650 3511          err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3651 3512          VOPSTATS_UPDATE(vp, access);
3652 3513          return (err);
3653 3514  }
3654 3515  
3655 3516  int
3656 3517  fop_lookup(
3657 3518          vnode_t *dvp,
3658 3519          char *nm,
3659 3520          vnode_t **vpp,
3660 3521          pathname_t *pnp,
3661 3522          int flags,
3662 3523          vnode_t *rdir,
3663 3524          cred_t *cr,
3664 3525          caller_context_t *ct,
3665 3526          int *deflags,           /* Returned per-dirent flags */
3666 3527          pathname_t *ppnp)       /* Returned case-preserved name in directory */
3667 3528  {
3668 3529          int ret;
3669 3530  
3670 3531          /*
3671 3532           * If this file system doesn't support case-insensitive access
3672 3533           * and said access is requested, fail quickly.  It is required
3673 3534           * that if the vfs supports case-insensitive lookup, it also
3674 3535           * supports extended dirent flags.
3675 3536           */
3676 3537          if (flags & FIGNORECASE &&
3677 3538              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3678 3539              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3679 3540                  return (EINVAL);
3680 3541  
  
    | 
      ↓ open down ↓ | 
    147 lines elided | 
    
      ↑ open up ↑ | 
  
3681 3542          VOPXID_MAP_CR(dvp, cr);
3682 3543  
3683 3544          if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3684 3545                  ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3685 3546          } else {
3686 3547                  ret = (*(dvp)->v_op->vop_lookup)
3687 3548                      (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3688 3549          }
3689 3550          if (ret == 0 && *vpp) {
3690 3551                  VOPSTATS_UPDATE(*vpp, lookup);
3691      -                vn_updatepath(dvp, *vpp, nm);
     3552 +                if ((*vpp)->v_path == NULL) {
     3553 +                        vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
     3554 +                }
3692 3555          }
3693 3556  
3694 3557          return (ret);
3695 3558  }
3696 3559  
3697 3560  int
3698 3561  fop_create(
3699 3562          vnode_t *dvp,
3700 3563          char *name,
3701 3564          vattr_t *vap,
3702 3565          vcexcl_t excl,
3703 3566          int mode,
3704 3567          vnode_t **vpp,
3705 3568          cred_t *cr,
3706 3569          int flags,
3707 3570          caller_context_t *ct,
3708 3571          vsecattr_t *vsecp)      /* ACL to set during create */
3709 3572  {
3710 3573          int ret;
3711 3574  
3712 3575          if (vsecp != NULL &&
3713 3576              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3714 3577                  return (EINVAL);
3715 3578          }
3716 3579          /*
3717 3580           * If this file system doesn't support case-insensitive access
3718 3581           * and said access is requested, fail quickly.
3719 3582           */
3720 3583          if (flags & FIGNORECASE &&
  
    | 
      ↓ open down ↓ | 
    19 lines elided | 
    
      ↑ open up ↑ | 
  
3721 3584              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3722 3585              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3723 3586                  return (EINVAL);
3724 3587  
3725 3588          VOPXID_MAP_CR(dvp, cr);
3726 3589  
3727 3590          ret = (*(dvp)->v_op->vop_create)
3728 3591              (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3729 3592          if (ret == 0 && *vpp) {
3730 3593                  VOPSTATS_UPDATE(*vpp, create);
3731      -                vn_updatepath(dvp, *vpp, name);
     3594 +                if ((*vpp)->v_path == NULL) {
     3595 +                        vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
     3596 +                }
3732 3597          }
3733 3598  
3734 3599          return (ret);
3735 3600  }
3736 3601  
3737 3602  int
3738 3603  fop_remove(
3739 3604          vnode_t *dvp,
3740 3605          char *nm,
3741 3606          cred_t *cr,
3742 3607          caller_context_t *ct,
3743 3608          int flags)
3744 3609  {
3745 3610          int     err;
3746 3611  
3747 3612          /*
3748 3613           * If this file system doesn't support case-insensitive access
3749 3614           * and said access is requested, fail quickly.
3750 3615           */
3751 3616          if (flags & FIGNORECASE &&
3752 3617              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3753 3618              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3754 3619                  return (EINVAL);
3755 3620  
3756 3621          VOPXID_MAP_CR(dvp, cr);
3757 3622  
3758 3623          err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3759 3624          VOPSTATS_UPDATE(dvp, remove);
3760 3625          return (err);
3761 3626  }
3762 3627  
3763 3628  int
3764 3629  fop_link(
3765 3630          vnode_t *tdvp,
3766 3631          vnode_t *svp,
3767 3632          char *tnm,
3768 3633          cred_t *cr,
3769 3634          caller_context_t *ct,
3770 3635          int flags)
3771 3636  {
3772 3637          int     err;
3773 3638  
3774 3639          /*
3775 3640           * If the target file system doesn't support case-insensitive access
3776 3641           * and said access is requested, fail quickly.
3777 3642           */
3778 3643          if (flags & FIGNORECASE &&
3779 3644              (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3780 3645              vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3781 3646                  return (EINVAL);
3782 3647  
3783 3648          VOPXID_MAP_CR(tdvp, cr);
3784 3649  
3785 3650          err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3786 3651          VOPSTATS_UPDATE(tdvp, link);
3787 3652          return (err);
3788 3653  }
3789 3654  
3790 3655  int
3791 3656  fop_rename(
3792 3657          vnode_t *sdvp,
3793 3658          char *snm,
3794 3659          vnode_t *tdvp,
3795 3660          char *tnm,
3796 3661          cred_t *cr,
3797 3662          caller_context_t *ct,
3798 3663          int flags)
3799 3664  {
3800 3665          int     err;
3801 3666  
3802 3667          /*
3803 3668           * If the file system involved does not support
3804 3669           * case-insensitive access and said access is requested, fail
3805 3670           * quickly.
3806 3671           */
3807 3672          if (flags & FIGNORECASE &&
3808 3673              ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3809 3674              vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3810 3675                  return (EINVAL);
3811 3676  
3812 3677          VOPXID_MAP_CR(tdvp, cr);
3813 3678  
3814 3679          err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3815 3680          VOPSTATS_UPDATE(sdvp, rename);
3816 3681          return (err);
3817 3682  }
3818 3683  
3819 3684  int
3820 3685  fop_mkdir(
3821 3686          vnode_t *dvp,
3822 3687          char *dirname,
3823 3688          vattr_t *vap,
3824 3689          vnode_t **vpp,
3825 3690          cred_t *cr,
3826 3691          caller_context_t *ct,
3827 3692          int flags,
3828 3693          vsecattr_t *vsecp)      /* ACL to set during create */
3829 3694  {
3830 3695          int ret;
3831 3696  
3832 3697          if (vsecp != NULL &&
3833 3698              vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3834 3699                  return (EINVAL);
3835 3700          }
3836 3701          /*
3837 3702           * If this file system doesn't support case-insensitive access
3838 3703           * and said access is requested, fail quickly.
3839 3704           */
3840 3705          if (flags & FIGNORECASE &&
  
    | 
      ↓ open down ↓ | 
    99 lines elided | 
    
      ↑ open up ↑ | 
  
3841 3706              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3842 3707              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3843 3708                  return (EINVAL);
3844 3709  
3845 3710          VOPXID_MAP_CR(dvp, cr);
3846 3711  
3847 3712          ret = (*(dvp)->v_op->vop_mkdir)
3848 3713              (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3849 3714          if (ret == 0 && *vpp) {
3850 3715                  VOPSTATS_UPDATE(*vpp, mkdir);
3851      -                vn_updatepath(dvp, *vpp, dirname);
     3716 +                if ((*vpp)->v_path == NULL) {
     3717 +                        vn_setpath(rootdir, dvp, *vpp, dirname,
     3718 +                            strlen(dirname));
     3719 +                }
3852 3720          }
3853 3721  
3854 3722          return (ret);
3855 3723  }
3856 3724  
3857 3725  int
3858 3726  fop_rmdir(
3859 3727          vnode_t *dvp,
3860 3728          char *nm,
3861 3729          vnode_t *cdir,
3862 3730          cred_t *cr,
3863 3731          caller_context_t *ct,
3864 3732          int flags)
3865 3733  {
3866 3734          int     err;
3867 3735  
3868 3736          /*
3869 3737           * If this file system doesn't support case-insensitive access
3870 3738           * and said access is requested, fail quickly.
3871 3739           */
3872 3740          if (flags & FIGNORECASE &&
3873 3741              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3874 3742              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3875 3743                  return (EINVAL);
3876 3744  
3877 3745          VOPXID_MAP_CR(dvp, cr);
3878 3746  
3879 3747          err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3880 3748          VOPSTATS_UPDATE(dvp, rmdir);
3881 3749          return (err);
3882 3750  }
3883 3751  
3884 3752  int
3885 3753  fop_readdir(
3886 3754          vnode_t *vp,
3887 3755          uio_t *uiop,
3888 3756          cred_t *cr,
3889 3757          int *eofp,
3890 3758          caller_context_t *ct,
3891 3759          int flags)
3892 3760  {
3893 3761          int     err;
3894 3762          ssize_t resid_start = uiop->uio_resid;
3895 3763  
3896 3764          /*
3897 3765           * If this file system doesn't support retrieving directory
3898 3766           * entry flags and said access is requested, fail quickly.
3899 3767           */
3900 3768          if (flags & V_RDDIR_ENTFLAGS &&
3901 3769              vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3902 3770                  return (EINVAL);
3903 3771  
3904 3772          VOPXID_MAP_CR(vp, cr);
3905 3773  
3906 3774          err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3907 3775          VOPSTATS_UPDATE_IO(vp, readdir,
3908 3776              readdir_bytes, (resid_start - uiop->uio_resid));
3909 3777          return (err);
3910 3778  }
3911 3779  
3912 3780  int
3913 3781  fop_symlink(
3914 3782          vnode_t *dvp,
3915 3783          char *linkname,
3916 3784          vattr_t *vap,
3917 3785          char *target,
3918 3786          cred_t *cr,
3919 3787          caller_context_t *ct,
3920 3788          int flags)
3921 3789  {
3922 3790          int     err;
3923 3791          xvattr_t xvattr;
3924 3792  
3925 3793          /*
3926 3794           * If this file system doesn't support case-insensitive access
3927 3795           * and said access is requested, fail quickly.
3928 3796           */
3929 3797          if (flags & FIGNORECASE &&
3930 3798              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3931 3799              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3932 3800                  return (EINVAL);
3933 3801  
3934 3802          VOPXID_MAP_CR(dvp, cr);
3935 3803  
3936 3804          /* check for reparse point */
3937 3805          if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
3938 3806              (strncmp(target, FS_REPARSE_TAG_STR,
3939 3807              strlen(FS_REPARSE_TAG_STR)) == 0)) {
3940 3808                  if (!fs_reparse_mark(target, vap, &xvattr))
3941 3809                          vap = (vattr_t *)&xvattr;
3942 3810          }
3943 3811  
3944 3812          err = (*(dvp)->v_op->vop_symlink)
3945 3813              (dvp, linkname, vap, target, cr, ct, flags);
3946 3814          VOPSTATS_UPDATE(dvp, symlink);
3947 3815          return (err);
3948 3816  }
3949 3817  
3950 3818  int
3951 3819  fop_readlink(
3952 3820          vnode_t *vp,
3953 3821          uio_t *uiop,
3954 3822          cred_t *cr,
3955 3823          caller_context_t *ct)
3956 3824  {
3957 3825          int     err;
3958 3826  
3959 3827          VOPXID_MAP_CR(vp, cr);
3960 3828  
3961 3829          err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3962 3830          VOPSTATS_UPDATE(vp, readlink);
3963 3831          return (err);
3964 3832  }
3965 3833  
3966 3834  int
3967 3835  fop_fsync(
3968 3836          vnode_t *vp,
3969 3837          int syncflag,
3970 3838          cred_t *cr,
3971 3839          caller_context_t *ct)
3972 3840  {
3973 3841          int     err;
3974 3842  
3975 3843          VOPXID_MAP_CR(vp, cr);
3976 3844  
3977 3845          err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3978 3846          VOPSTATS_UPDATE(vp, fsync);
3979 3847          return (err);
3980 3848  }
3981 3849  
3982 3850  void
3983 3851  fop_inactive(
3984 3852          vnode_t *vp,
3985 3853          cred_t *cr,
3986 3854          caller_context_t *ct)
3987 3855  {
3988 3856          /* Need to update stats before vop call since we may lose the vnode */
3989 3857          VOPSTATS_UPDATE(vp, inactive);
3990 3858  
3991 3859          VOPXID_MAP_CR(vp, cr);
3992 3860  
3993 3861          (*(vp)->v_op->vop_inactive)(vp, cr, ct);
3994 3862  }
3995 3863  
3996 3864  int
3997 3865  fop_fid(
3998 3866          vnode_t *vp,
3999 3867          fid_t *fidp,
4000 3868          caller_context_t *ct)
4001 3869  {
4002 3870          int     err;
4003 3871  
4004 3872          err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
4005 3873          VOPSTATS_UPDATE(vp, fid);
4006 3874          return (err);
4007 3875  }
4008 3876  
4009 3877  int
4010 3878  fop_rwlock(
4011 3879          vnode_t *vp,
4012 3880          int write_lock,
4013 3881          caller_context_t *ct)
4014 3882  {
4015 3883          int     ret;
4016 3884  
4017 3885          ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
4018 3886          VOPSTATS_UPDATE(vp, rwlock);
4019 3887          return (ret);
4020 3888  }
4021 3889  
4022 3890  void
4023 3891  fop_rwunlock(
4024 3892          vnode_t *vp,
4025 3893          int write_lock,
4026 3894          caller_context_t *ct)
4027 3895  {
4028 3896          (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
4029 3897          VOPSTATS_UPDATE(vp, rwunlock);
4030 3898  }
4031 3899  
4032 3900  int
4033 3901  fop_seek(
4034 3902          vnode_t *vp,
4035 3903          offset_t ooff,
4036 3904          offset_t *noffp,
4037 3905          caller_context_t *ct)
4038 3906  {
4039 3907          int     err;
4040 3908  
4041 3909          err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
4042 3910          VOPSTATS_UPDATE(vp, seek);
4043 3911          return (err);
4044 3912  }
4045 3913  
4046 3914  int
4047 3915  fop_cmp(
4048 3916          vnode_t *vp1,
4049 3917          vnode_t *vp2,
4050 3918          caller_context_t *ct)
4051 3919  {
4052 3920          int     err;
4053 3921  
4054 3922          err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
4055 3923          VOPSTATS_UPDATE(vp1, cmp);
4056 3924          return (err);
4057 3925  }
4058 3926  
4059 3927  int
4060 3928  fop_frlock(
4061 3929          vnode_t *vp,
4062 3930          int cmd,
4063 3931          flock64_t *bfp,
4064 3932          int flag,
4065 3933          offset_t offset,
4066 3934          struct flk_callback *flk_cbp,
4067 3935          cred_t *cr,
4068 3936          caller_context_t *ct)
4069 3937  {
4070 3938          int     err;
4071 3939  
4072 3940          VOPXID_MAP_CR(vp, cr);
4073 3941  
4074 3942          err = (*(vp)->v_op->vop_frlock)
4075 3943              (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
4076 3944          VOPSTATS_UPDATE(vp, frlock);
4077 3945          return (err);
4078 3946  }
4079 3947  
4080 3948  int
4081 3949  fop_space(
4082 3950          vnode_t *vp,
4083 3951          int cmd,
4084 3952          flock64_t *bfp,
4085 3953          int flag,
4086 3954          offset_t offset,
4087 3955          cred_t *cr,
4088 3956          caller_context_t *ct)
4089 3957  {
4090 3958          int     err;
4091 3959  
4092 3960          VOPXID_MAP_CR(vp, cr);
4093 3961  
4094 3962          err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
4095 3963          VOPSTATS_UPDATE(vp, space);
4096 3964          return (err);
4097 3965  }
4098 3966  
4099 3967  int
4100 3968  fop_realvp(
4101 3969          vnode_t *vp,
4102 3970          vnode_t **vpp,
4103 3971          caller_context_t *ct)
4104 3972  {
4105 3973          int     err;
4106 3974  
4107 3975          err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
4108 3976          VOPSTATS_UPDATE(vp, realvp);
4109 3977          return (err);
4110 3978  }
4111 3979  
4112 3980  int
4113 3981  fop_getpage(
4114 3982          vnode_t *vp,
4115 3983          offset_t off,
4116 3984          size_t len,
4117 3985          uint_t *protp,
4118 3986          page_t **plarr,
4119 3987          size_t plsz,
4120 3988          struct seg *seg,
4121 3989          caddr_t addr,
4122 3990          enum seg_rw rw,
4123 3991          cred_t *cr,
4124 3992          caller_context_t *ct)
4125 3993  {
4126 3994          int     err;
4127 3995  
4128 3996          VOPXID_MAP_CR(vp, cr);
4129 3997  
4130 3998          err = (*(vp)->v_op->vop_getpage)
4131 3999              (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
4132 4000          VOPSTATS_UPDATE(vp, getpage);
4133 4001          return (err);
4134 4002  }
4135 4003  
4136 4004  int
4137 4005  fop_putpage(
4138 4006          vnode_t *vp,
4139 4007          offset_t off,
4140 4008          size_t len,
4141 4009          int flags,
4142 4010          cred_t *cr,
4143 4011          caller_context_t *ct)
4144 4012  {
4145 4013          int     err;
4146 4014  
4147 4015          VOPXID_MAP_CR(vp, cr);
4148 4016  
4149 4017          err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
4150 4018          VOPSTATS_UPDATE(vp, putpage);
4151 4019          return (err);
4152 4020  }
4153 4021  
4154 4022  int
4155 4023  fop_map(
4156 4024          vnode_t *vp,
4157 4025          offset_t off,
4158 4026          struct as *as,
4159 4027          caddr_t *addrp,
4160 4028          size_t len,
4161 4029          uchar_t prot,
4162 4030          uchar_t maxprot,
4163 4031          uint_t flags,
4164 4032          cred_t *cr,
4165 4033          caller_context_t *ct)
4166 4034  {
4167 4035          int     err;
4168 4036  
4169 4037          VOPXID_MAP_CR(vp, cr);
4170 4038  
4171 4039          err = (*(vp)->v_op->vop_map)
4172 4040              (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
4173 4041          VOPSTATS_UPDATE(vp, map);
4174 4042          return (err);
4175 4043  }
4176 4044  
4177 4045  int
4178 4046  fop_addmap(
4179 4047          vnode_t *vp,
4180 4048          offset_t off,
4181 4049          struct as *as,
4182 4050          caddr_t addr,
4183 4051          size_t len,
4184 4052          uchar_t prot,
4185 4053          uchar_t maxprot,
4186 4054          uint_t flags,
4187 4055          cred_t *cr,
4188 4056          caller_context_t *ct)
4189 4057  {
4190 4058          int error;
4191 4059          u_longlong_t delta;
4192 4060  
4193 4061          VOPXID_MAP_CR(vp, cr);
4194 4062  
4195 4063          error = (*(vp)->v_op->vop_addmap)
4196 4064              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4197 4065  
4198 4066          if ((!error) && (vp->v_type == VREG)) {
4199 4067                  delta = (u_longlong_t)btopr(len);
4200 4068                  /*
4201 4069                   * If file is declared MAP_PRIVATE, it can't be written back
4202 4070                   * even if open for write. Handle as read.
4203 4071                   */
4204 4072                  if (flags & MAP_PRIVATE) {
4205 4073                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4206 4074                              (int64_t)delta);
4207 4075                  } else {
4208 4076                          /*
4209 4077                           * atomic_add_64 forces the fetch of a 64 bit value to
4210 4078                           * be atomic on 32 bit machines
4211 4079                           */
4212 4080                          if (maxprot & PROT_WRITE)
4213 4081                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4214 4082                                      (int64_t)delta);
4215 4083                          if (maxprot & PROT_READ)
4216 4084                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4217 4085                                      (int64_t)delta);
4218 4086                          if (maxprot & PROT_EXEC)
4219 4087                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4220 4088                                      (int64_t)delta);
4221 4089                  }
4222 4090          }
4223 4091          VOPSTATS_UPDATE(vp, addmap);
4224 4092          return (error);
4225 4093  }
4226 4094  
4227 4095  int
4228 4096  fop_delmap(
4229 4097          vnode_t *vp,
4230 4098          offset_t off,
4231 4099          struct as *as,
4232 4100          caddr_t addr,
4233 4101          size_t len,
4234 4102          uint_t prot,
4235 4103          uint_t maxprot,
4236 4104          uint_t flags,
4237 4105          cred_t *cr,
4238 4106          caller_context_t *ct)
4239 4107  {
4240 4108          int error;
4241 4109          u_longlong_t delta;
4242 4110  
4243 4111          VOPXID_MAP_CR(vp, cr);
4244 4112  
4245 4113          error = (*(vp)->v_op->vop_delmap)
4246 4114              (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);
4247 4115  
4248 4116          /*
4249 4117           * NFS calls into delmap twice, the first time
4250 4118           * it simply establishes a callback mechanism and returns EAGAIN
4251 4119           * while the real work is being done upon the second invocation.
4252 4120           * We have to detect this here and only decrement the counts upon
4253 4121           * the second delmap request.
4254 4122           */
4255 4123          if ((error != EAGAIN) && (vp->v_type == VREG)) {
4256 4124  
4257 4125                  delta = (u_longlong_t)btopr(len);
4258 4126  
4259 4127                  if (flags & MAP_PRIVATE) {
4260 4128                          atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4261 4129                              (int64_t)(-delta));
4262 4130                  } else {
4263 4131                          /*
4264 4132                           * atomic_add_64 forces the fetch of a 64 bit value
4265 4133                           * to be atomic on 32 bit machines
4266 4134                           */
4267 4135                          if (maxprot & PROT_WRITE)
4268 4136                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
4269 4137                                      (int64_t)(-delta));
4270 4138                          if (maxprot & PROT_READ)
4271 4139                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4272 4140                                      (int64_t)(-delta));
4273 4141                          if (maxprot & PROT_EXEC)
4274 4142                                  atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
4275 4143                                      (int64_t)(-delta));
4276 4144                  }
4277 4145          }
4278 4146          VOPSTATS_UPDATE(vp, delmap);
4279 4147          return (error);
4280 4148  }
4281 4149  
4282 4150  
4283 4151  int
4284 4152  fop_poll(
4285 4153          vnode_t *vp,
4286 4154          short events,
4287 4155          int anyyet,
4288 4156          short *reventsp,
4289 4157          struct pollhead **phpp,
4290 4158          caller_context_t *ct)
4291 4159  {
4292 4160          int     err;
4293 4161  
4294 4162          err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4295 4163          VOPSTATS_UPDATE(vp, poll);
4296 4164          return (err);
4297 4165  }
4298 4166  
4299 4167  int
4300 4168  fop_dump(
4301 4169          vnode_t *vp,
4302 4170          caddr_t addr,
4303 4171          offset_t lbdn,
4304 4172          offset_t dblks,
4305 4173          caller_context_t *ct)
4306 4174  {
4307 4175          int     err;
4308 4176  
4309 4177          /* ensure lbdn and dblks can be passed safely to bdev_dump */
4310 4178          if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
4311 4179                  return (EIO);
4312 4180  
4313 4181          err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
4314 4182          VOPSTATS_UPDATE(vp, dump);
4315 4183          return (err);
4316 4184  }
4317 4185  
4318 4186  int
4319 4187  fop_pathconf(
4320 4188          vnode_t *vp,
4321 4189          int cmd,
4322 4190          ulong_t *valp,
4323 4191          cred_t *cr,
4324 4192          caller_context_t *ct)
4325 4193  {
4326 4194          int     err;
4327 4195  
4328 4196          VOPXID_MAP_CR(vp, cr);
4329 4197  
4330 4198          err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4331 4199          VOPSTATS_UPDATE(vp, pathconf);
4332 4200          return (err);
4333 4201  }
4334 4202  
4335 4203  int
4336 4204  fop_pageio(
4337 4205          vnode_t *vp,
4338 4206          struct page *pp,
4339 4207          u_offset_t io_off,
4340 4208          size_t io_len,
4341 4209          int flags,
4342 4210          cred_t *cr,
4343 4211          caller_context_t *ct)
4344 4212  {
4345 4213          int     err;
4346 4214  
4347 4215          VOPXID_MAP_CR(vp, cr);
4348 4216  
4349 4217          err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4350 4218          VOPSTATS_UPDATE(vp, pageio);
4351 4219          return (err);
4352 4220  }
4353 4221  
4354 4222  int
4355 4223  fop_dumpctl(
4356 4224          vnode_t *vp,
4357 4225          int action,
4358 4226          offset_t *blkp,
4359 4227          caller_context_t *ct)
4360 4228  {
4361 4229          int     err;
4362 4230          err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4363 4231          VOPSTATS_UPDATE(vp, dumpctl);
4364 4232          return (err);
4365 4233  }
4366 4234  
4367 4235  void
4368 4236  fop_dispose(
4369 4237          vnode_t *vp,
4370 4238          page_t *pp,
4371 4239          int flag,
4372 4240          int dn,
4373 4241          cred_t *cr,
4374 4242          caller_context_t *ct)
4375 4243  {
4376 4244          /* Must do stats first since it's possible to lose the vnode */
4377 4245          VOPSTATS_UPDATE(vp, dispose);
4378 4246  
4379 4247          VOPXID_MAP_CR(vp, cr);
4380 4248  
4381 4249          (*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
4382 4250  }
4383 4251  
4384 4252  int
4385 4253  fop_setsecattr(
4386 4254          vnode_t *vp,
4387 4255          vsecattr_t *vsap,
4388 4256          int flag,
4389 4257          cred_t *cr,
4390 4258          caller_context_t *ct)
4391 4259  {
4392 4260          int     err;
4393 4261  
4394 4262          VOPXID_MAP_CR(vp, cr);
4395 4263  
4396 4264          /*
4397 4265           * We're only allowed to skip the ACL check iff we used a 32 bit
4398 4266           * ACE mask with VOP_ACCESS() to determine permissions.
4399 4267           */
4400 4268          if ((flag & ATTR_NOACLCHECK) &&
4401 4269              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4402 4270                  return (EINVAL);
4403 4271          }
4404 4272          err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
4405 4273          VOPSTATS_UPDATE(vp, setsecattr);
4406 4274          return (err);
4407 4275  }
4408 4276  
4409 4277  int
4410 4278  fop_getsecattr(
4411 4279          vnode_t *vp,
4412 4280          vsecattr_t *vsap,
4413 4281          int flag,
4414 4282          cred_t *cr,
4415 4283          caller_context_t *ct)
4416 4284  {
4417 4285          int     err;
4418 4286  
4419 4287          /*
4420 4288           * We're only allowed to skip the ACL check iff we used a 32 bit
4421 4289           * ACE mask with VOP_ACCESS() to determine permissions.
4422 4290           */
4423 4291          if ((flag & ATTR_NOACLCHECK) &&
4424 4292              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
4425 4293                  return (EINVAL);
4426 4294          }
4427 4295  
4428 4296          VOPXID_MAP_CR(vp, cr);
4429 4297  
4430 4298          err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
4431 4299          VOPSTATS_UPDATE(vp, getsecattr);
4432 4300          return (err);
4433 4301  }
4434 4302  
4435 4303  int
4436 4304  fop_shrlock(
4437 4305          vnode_t *vp,
4438 4306          int cmd,
4439 4307          struct shrlock *shr,
4440 4308          int flag,
4441 4309          cred_t *cr,
4442 4310          caller_context_t *ct)
4443 4311  {
4444 4312          int     err;
4445 4313  
4446 4314          VOPXID_MAP_CR(vp, cr);
4447 4315  
4448 4316          err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4449 4317          VOPSTATS_UPDATE(vp, shrlock);
4450 4318          return (err);
4451 4319  }
4452 4320  
4453 4321  int
4454 4322  fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4455 4323      caller_context_t *ct)
4456 4324  {
4457 4325          int     err;
4458 4326  
4459 4327          err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4460 4328          VOPSTATS_UPDATE(vp, vnevent);
4461 4329          return (err);
4462 4330  }
4463 4331  
4464 4332  int
4465 4333  fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4466 4334      caller_context_t *ct)
4467 4335  {
4468 4336          int err;
4469 4337  
4470 4338          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4471 4339                  return (ENOTSUP);
4472 4340          err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4473 4341          VOPSTATS_UPDATE(vp, reqzcbuf);
4474 4342          return (err);
4475 4343  }
4476 4344  
4477 4345  int
4478 4346  fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4479 4347  {
4480 4348          int err;
4481 4349  
4482 4350          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4483 4351                  return (ENOTSUP);
4484 4352          err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4485 4353          VOPSTATS_UPDATE(vp, retzcbuf);
4486 4354          return (err);
4487 4355  }
4488 4356  
/*
 * Default VSD destructor: intentionally does nothing.  A non-NULL
 * destructor is required for every allocated key because a NULL entry
 * in vsd_destructor[] is how an unused key slot is recognized.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{
}
4497 4365  
/*
 * Create a key (index into per vnode array)
 *	Locks out vsd_create, vsd_destroy, and vsd_free
 *	May allocate memory with lock held
 *
 * On return *keyp holds the allocated key.  Keys are 1-based: a *keyp of
 * 0 means "not yet allocated", which is why an already-nonzero *keyp is
 * left untouched below.
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int	i;
	uint_t	nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	/* a NULL destructor entry marks an unused slot */
	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 * (doubling each time; starts at 1 when the array is empty)
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 * (+1 converts the array index to the external 1-based key)
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
4555 4423  
/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 *
 * Runs the key's destructor on the stored value of every vnode that has
 * VSD for this key, then releases the key slot for reuse.  *keyp is reset
 * to 0 (the "unallocated" sentinel) before the walk.
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid (0 means it was never allocated)
	 */
	if (key != 0) {
		uint_t k = key - 1;	/* 0-based index for key */
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 * (its vs_value array was never grown this far)
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}
4611 4479  
4612 4480  /*
4613 4481   * Quickly return the per vnode value that was stored with the specified key
4614 4482   * Assumes the caller is protecting key from vsd_create and vsd_destroy
4615 4483   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4616 4484   */
4617 4485  void *
4618 4486  vsd_get(vnode_t *vp, uint_t key)
4619 4487  {
4620 4488          struct vsd_node *vsd;
4621 4489  
4622 4490          ASSERT(vp != NULL);
4623 4491          ASSERT(mutex_owned(&vp->v_vsd_lock));
4624 4492  
4625 4493          vsd = vp->v_vsd;
4626 4494  
4627 4495          if (key && vsd != NULL && key <= vsd->vs_nkeys)
4628 4496                  return (vsd->vs_value[key - 1]);
4629 4497          return (NULL);
4630 4498  }
4631 4499  
/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 *
 * Returns 0 on success, EINVAL if key is 0 (unallocated).  Allocates the
 * per-vnode vsd node on first use and grows its vs_value array on demand
 * up to the requested key.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 * (vs_value is grown only as far as the requested key)
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
4685 4553  
/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 *
 * After return vp->v_vsd is NULL; all per-vnode storage is released.
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	/*
	 * vs_nkeys == 0 means no value was ever set, so the node was never
	 * linked onto vsd_list and can be freed without taking vsd_lock.
	 */
	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}
4732 4600  
4733 4601  /*
4734 4602   * realloc
4735 4603   */
4736 4604  static void *
4737 4605  vsd_realloc(void *old, size_t osize, size_t nsize)
4738 4606  {
4739 4607          void *new;
4740 4608  
4741 4609          new = kmem_zalloc(nsize, KM_SLEEP);
4742 4610          if (old) {
4743 4611                  bcopy(old, new, osize);
4744 4612                  kmem_free(old, osize);
4745 4613          }
4746 4614          return (new);
4747 4615  }
4748 4616  
4749 4617  /*
4750 4618   * Setup the extensible system attribute for creating a reparse point.
4751 4619   * The symlink data 'target' is validated for proper format of a reparse
4752 4620   * string and a check also made to make sure the symlink data does not
4753 4621   * point to an existing file.
4754 4622   *
4755 4623   * return 0 if ok else -1.
4756 4624   */
4757 4625  static int
4758 4626  fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4759 4627  {
4760 4628          xoptattr_t *xoap;
4761 4629  
4762 4630          if ((!target) || (!vap) || (!xvattr))
4763 4631                  return (-1);
4764 4632  
4765 4633          /* validate reparse string */
4766 4634          if (reparse_validate((const char *)target))
4767 4635                  return (-1);
4768 4636  
4769 4637          xva_init(xvattr);
4770 4638          xvattr->xva_vattr = *vap;
4771 4639          xvattr->xva_vattr.va_mask |= AT_XVATTR;
4772 4640          xoap = xva_getxoptattr(xvattr);
4773 4641          ASSERT(xoap);
4774 4642          XVA_SET_REQ(xvattr, XAT_REPARSE);
4775 4643          xoap->xoa_reparse = 1;
4776 4644  
4777 4645          return (0);
4778 4646  }
4779 4647  
4780 4648  /*
4781 4649   * Function to check whether a symlink is a reparse point.
4782 4650   * Return B_TRUE if it is a reparse point, else return B_FALSE
4783 4651   */
4784 4652  boolean_t
4785 4653  vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4786 4654  {
4787 4655          xvattr_t xvattr;
4788 4656          xoptattr_t *xoap;
4789 4657  
4790 4658          if ((vp->v_type != VLNK) ||
4791 4659              !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4792 4660                  return (B_FALSE);
4793 4661  
4794 4662          xva_init(&xvattr);
4795 4663          xoap = xva_getxoptattr(&xvattr);
4796 4664          ASSERT(xoap);
4797 4665          XVA_SET_REQ(&xvattr, XAT_REPARSE);
4798 4666  
4799 4667          if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4800 4668                  return (B_FALSE);
4801 4669  
4802 4670          if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4803 4671              (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4804 4672                  return (B_FALSE);
4805 4673  
4806 4674          return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4807 4675  }
  
    | 
      ↓ open down ↓ | 
    946 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX