Print this page
    
OS-5483 iostat -x shows around 100% utilization for idle zone
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-5148 ftruncate at offset should emit proper events
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-338 Kstat counters to show "slow" VFS operations
OS-3294 add support for inotify
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/vnode.c
          +++ new/usr/src/uts/common/fs/vnode.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       24 + * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  /*
  31   31   * University Copyright- Copyright (c) 1982, 1986, 1988
  32   32   * The Regents of the University of California
  33   33   * All Rights Reserved
  34   34   *
  35   35   * University Acknowledgment- Portions of this document are derived from
  36   36   * software developed by the University of California, Berkeley, and its
  37   37   * contributors.
  38   38   */
  39   39  
  40   40  #include <sys/types.h>
  41   41  #include <sys/param.h>
  42   42  #include <sys/t_lock.h>
  43   43  #include <sys/errno.h>
  44   44  #include <sys/cred.h>
  45   45  #include <sys/user.h>
  46   46  #include <sys/uio.h>
  47   47  #include <sys/file.h>
  48   48  #include <sys/pathname.h>
  49   49  #include <sys/vfs.h>
  50   50  #include <sys/vfs_opreg.h>
  51   51  #include <sys/vnode.h>
  52   52  #include <sys/rwstlock.h>
  53   53  #include <sys/fem.h>
  54   54  #include <sys/stat.h>
  55   55  #include <sys/mode.h>
  56   56  #include <sys/conf.h>
  57   57  #include <sys/sysmacros.h>
  58   58  #include <sys/cmn_err.h>
  59   59  #include <sys/systm.h>
  60   60  #include <sys/kmem.h>
  61   61  #include <sys/debug.h>
  62   62  #include <c2/audit.h>
  63   63  #include <sys/acl.h>
  64   64  #include <sys/nbmlock.h>
  65   65  #include <sys/fcntl.h>
  66   66  #include <fs/fs_subr.h>
  67   67  #include <sys/taskq.h>
  68   68  #include <fs/fs_reparse.h>
  69   69  
  70   70  /* Determine if this vnode is a file that is read-only */
/* Determine if this vnode is a file that is read-only */
#define ISROFILE(vp)    \
        ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
            (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 */
vopstats_t **vopstats_fstype;

/* vopstats initialization template used for fast initialization via bcopy() */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t      vskstat_tree;
kmutex_t        vskstat_tree_lock;

/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;

/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * forward declarations for reparse point functions
 */
static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_vsd_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t         vsd_lock;
static uint_t           vsd_nkeys;       /* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void             (**vsd_destructor)(void *);
 142  142  
 143  143  /*
 144  144   * The following is the common set of actions needed to update the
 145  145   * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 146  146   * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 147  147   * recording of the bytes transferred.  Since the code is similar
 148  148   * but small, it is nearly a duplicate.  Consequently any changes
 149  149   * to one may need to be reflected in the other.
 150  150   * Rundown of the variables:
 151  151   * vp - Pointer to the vnode
 152  152   * counter - Partial name structure member to update in vopstats for counts
 153  153   * bytecounter - Partial name structure member to update in vopstats for bytes
 154  154   * bytesval - Value to update in vopstats for bytes
 155  155   * fstype - Index into vsanchor_fstype[], same as index into vfssw[]
 156  156   * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i])
 157  157   */
 158  158  
/*
 * Bump the per-vfs counter for this op (and fire the fsinfo DTrace probe),
 * then bump the shared per-fstype counter if one is attached to the vfs.
 * NOTE: "vp" is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define VOPSTATS_UPDATE(vp, counter) {                                  \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, 0, stataddr);     \
                (*stataddr)++;                                          \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                }                                                       \
        }                                                               \
}

/*
 * Same as VOPSTATS_UPDATE, but additionally accumulates "bytesval" into
 * the byte counter (for read/write/readdir-style ops).  "bytesval" may
 * also be evaluated more than once.
 */
#define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {        \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp && vfsp->vfs_implp &&                                  \
            (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {     \
                vopstats_t *vsp = &vfsp->vfs_vopstats;                  \
                uint64_t *stataddr = &(vsp->n##counter.value.ui64);     \
                extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
                    size_t, uint64_t *);                                \
                __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
                (*stataddr)++;                                          \
                vsp->bytecounter.value.ui64 += bytesval;                \
                if ((vsp = vfsp->vfs_fstypevsp) != NULL) {              \
                        vsp->n##counter.value.ui64++;                   \
                        vsp->bytecounter.value.ui64 += bytesval;        \
                }                                                       \
        }                                                               \
}
 192  192  
  
    | 
      ↓ open down ↓ | 
    158 lines elided | 
    
      ↑ open up ↑ | 
  
 193  193  /*
 194  194   * If the filesystem does not support XIDs map credential
 195  195   * If the vfsp is NULL, perhaps we should also map?
 196  196   */
#define VOPXID_MAP_CR(vp, cr)   {                                       \
        vfs_t *vfsp = (vp)->v_vfsp;                                     \
        if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)            \
                cr = crgetmapped(cr);                                   \
        }

/*
 * Latency thresholds, in nanoseconds, used to bucket "slow" vnode
 * operations (10ms / 100ms / 1s).
 */
#define VOP_LATENCY_10MS        10000000
#define VOP_LATENCY_100MS       100000000
#define VOP_LATENCY_1S          1000000000
      206 +
 203  207  /*
 204  208   * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 205  209   * numerical order of S_IFMT and vnode types.)
 206  210   */
/* Indexed by (mode & S_IFMT) >> 12: S_IFxxx file type -> vnode type */
enum vtype iftovt_tab[] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

/* Indexed by vtype: vnode type -> S_IFxxx file type bits (0 = no mapping) */
ushort_t vttoif_tab[] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
        S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
 216  220  
 217  221  /*
 218  222   * The system vnode cache.
 219  223   */
 220  224  
 221  225  kmem_cache_t *vn_cache;
 222  226  
 223  227  
 224  228  /*
 225  229   * Vnode operations vector.
 226  230   */
 227  231  
/*
 * Each entry gives: the operation's registration name, the offset of its
 * slot in struct vnodeops, and two fs_* functions.  NOTE(review): per
 * fs_operation_trans_def_t these appear to be the default and error
 * implementations for the op — confirm against sys/vfs_opreg.h.
 * The table is terminated by the NULL entry.
 */
static const fs_operation_trans_def_t vn_ops_table[] = {
        VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
            fs_nosys, fs_nosys,

        VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
            fs_nosys, fs_nosys,

        VOPNAME_READ, offsetof(struct vnodeops, vop_read),
            fs_nosys, fs_nosys,

        VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
            fs_nosys, fs_nosys,

        VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
            fs_nosys, fs_nosys,

        VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
            fs_setfl, fs_nosys,

        VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
            fs_nosys, fs_nosys,

        VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
            fs_nosys, fs_nosys,

        VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
            fs_nosys, fs_nosys,

        VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
            fs_nosys, fs_nosys,

        VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
            fs_nosys, fs_nosys,

        VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
            fs_nosys, fs_nosys,

        VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
            fs_nosys, fs_nosys,

        VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
            fs_nosys, fs_nosys,

        VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
            fs_nosys, fs_nosys,

        VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
            fs_nosys, fs_nosys,

        VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
            fs_nosys, fs_nosys,

        VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
            fs_nosys, fs_nosys,

        VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
            fs_nosys, fs_nosys,

        VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
            fs_nosys, fs_nosys,

        VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
            fs_nosys, fs_nosys,

        VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
            fs_nosys, fs_nosys,

        VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
            fs_rwlock, fs_rwlock,

        VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
            (fs_generic_func_p) fs_rwunlock,
            (fs_generic_func_p) fs_rwunlock,    /* no errors allowed */

        VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
            fs_nosys, fs_nosys,

        VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
            fs_cmp, fs_cmp,             /* no errors allowed */

        VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
            fs_frlock, fs_nosys,

        VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
            fs_nosys, fs_nosys,

        VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
            fs_nosys, fs_nosys,

        VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
            fs_nosys, fs_nosys,

        VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
            fs_nosys, fs_nosys,

        VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
            (fs_generic_func_p) fs_nosys_map,
            (fs_generic_func_p) fs_nosys_map,

        VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
            (fs_generic_func_p) fs_nosys_addmap,
            (fs_generic_func_p) fs_nosys_addmap,

        VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
            fs_nosys, fs_nosys,

        VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
            (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

        VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
            fs_nosys, fs_nosys,

        VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
            fs_pathconf, fs_nosys,

        VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
            fs_nosys, fs_nosys,

        VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
            fs_nosys, fs_nosys,

        VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
            (fs_generic_func_p) fs_dispose,
            (fs_generic_func_p) fs_nodispose,

        VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
            fs_nosys, fs_nosys,

        VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
            fs_fab_acl, fs_nosys,

        VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
            fs_shrlock, fs_nosys,

        VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
            (fs_generic_func_p) fs_vnevent_nosupport,
            (fs_generic_func_p) fs_vnevent_nosupport,

        VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
            fs_nosys, fs_nosys,

        VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
            fs_nosys, fs_nosys,

        NULL, 0, NULL, NULL
};
 374  378  
 375  379  /* Extensible attribute (xva) routines. */
 376  380  
 377  381  /*
 378  382   * Zero out the structure, set the size of the requested/returned bitmaps,
 379  383   * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
 380  384   * to the returned attributes array.
 381  385   */
 382  386  void
 383  387  xva_init(xvattr_t *xvap)
 384  388  {
 385  389          bzero(xvap, sizeof (xvattr_t));
 386  390          xvap->xva_mapsize = XVA_MAPSIZE;
 387  391          xvap->xva_magic = XVA_MAGIC;
 388  392          xvap->xva_vattr.va_mask = AT_XVATTR;
 389  393          xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
 390  394  }
 391  395  
 392  396  /*
 393  397   * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
 394  398   * structure.  Otherwise, returns NULL.
 395  399   */
 396  400  xoptattr_t *
 397  401  xva_getxoptattr(xvattr_t *xvap)
 398  402  {
 399  403          xoptattr_t *xoap = NULL;
 400  404          if (xvap->xva_vattr.va_mask & AT_XVATTR)
 401  405                  xoap = &xvap->xva_xoptattrs;
 402  406          return (xoap);
 403  407  }
 404  408  
 405  409  /*
 406  410   * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
 407  411   * We use the f_fsid reported by VFS_STATVFS() since we use that for the
 408  412   * kstat name.
 409  413   */
 410  414  static int
 411  415  vska_compar(const void *n1, const void *n2)
 412  416  {
 413  417          int ret;
 414  418          ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
 415  419          ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
 416  420  
 417  421          if (p1 < p2) {
 418  422                  ret = -1;
 419  423          } else if (p1 > p2) {
 420  424                  ret = 1;
 421  425          } else {
 422  426                  ret = 0;
 423  427          }
 424  428  
 425  429          return (ret);
 426  430  }
 427  431  
 428  432  /*
 429  433   * Used to create a single template which will be bcopy()ed to a newly
 430  434   * allocated vsanchor_combo_t structure in new_vsanchor(), below.
 431  435   */
 432  436  static vopstats_t *
 433  437  create_vopstats_template()
 434  438  {
 435  439          vopstats_t              *vsp;
 436  440  
 437  441          vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 438  442          bzero(vsp, sizeof (*vsp));      /* Start fresh */
 439  443  
 440  444          /* VOP_OPEN */
 441  445          kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
 442  446          /* VOP_CLOSE */
 443  447          kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
 444  448          /* VOP_READ I/O */
 445  449          kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
 446  450          kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
 447  451          /* VOP_WRITE I/O */
 448  452          kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
 449  453          kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
 450  454          /* VOP_IOCTL */
 451  455          kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
 452  456          /* VOP_SETFL */
 453  457          kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
 454  458          /* VOP_GETATTR */
 455  459          kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
 456  460          /* VOP_SETATTR */
 457  461          kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
 458  462          /* VOP_ACCESS */
 459  463          kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
 460  464          /* VOP_LOOKUP */
 461  465          kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
 462  466          /* VOP_CREATE */
 463  467          kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
 464  468          /* VOP_REMOVE */
 465  469          kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
 466  470          /* VOP_LINK */
 467  471          kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
 468  472          /* VOP_RENAME */
 469  473          kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
 470  474          /* VOP_MKDIR */
 471  475          kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
 472  476          /* VOP_RMDIR */
 473  477          kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
 474  478          /* VOP_READDIR I/O */
 475  479          kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
 476  480          kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
 477  481              KSTAT_DATA_UINT64);
 478  482          /* VOP_SYMLINK */
 479  483          kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
 480  484          /* VOP_READLINK */
 481  485          kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
 482  486          /* VOP_FSYNC */
 483  487          kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
 484  488          /* VOP_INACTIVE */
 485  489          kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
 486  490          /* VOP_FID */
 487  491          kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
 488  492          /* VOP_RWLOCK */
 489  493          kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
 490  494          /* VOP_RWUNLOCK */
 491  495          kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
 492  496          /* VOP_SEEK */
 493  497          kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
 494  498          /* VOP_CMP */
 495  499          kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
 496  500          /* VOP_FRLOCK */
 497  501          kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
 498  502          /* VOP_SPACE */
 499  503          kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
 500  504          /* VOP_REALVP */
 501  505          kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
 502  506          /* VOP_GETPAGE */
 503  507          kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
 504  508          /* VOP_PUTPAGE */
 505  509          kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
 506  510          /* VOP_MAP */
 507  511          kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
 508  512          /* VOP_ADDMAP */
 509  513          kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
 510  514          /* VOP_DELMAP */
 511  515          kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
 512  516          /* VOP_POLL */
 513  517          kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
 514  518          /* VOP_DUMP */
 515  519          kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
 516  520          /* VOP_PATHCONF */
 517  521          kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
 518  522          /* VOP_PAGEIO */
 519  523          kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
 520  524          /* VOP_DUMPCTL */
 521  525          kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
 522  526          /* VOP_DISPOSE */
 523  527          kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
 524  528          /* VOP_SETSECATTR */
 525  529          kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
 526  530          /* VOP_GETSECATTR */
 527  531          kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
 528  532          /* VOP_SHRLOCK */
 529  533          kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
 530  534          /* VOP_VNEVENT */
 531  535          kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
 532  536          /* VOP_REQZCBUF */
 533  537          kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64);
 534  538          /* VOP_RETZCBUF */
 535  539          kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64);
 536  540  
 537  541          return (vsp);
 538  542  }
 539  543  
 540  544  /*
 541  545   * Creates a kstat structure associated with a vopstats structure.
 542  546   */
 543  547  kstat_t *
 544  548  new_vskstat(char *ksname, vopstats_t *vsp)
 545  549  {
 546  550          kstat_t         *ksp;
 547  551  
 548  552          if (!vopstats_enabled) {
 549  553                  return (NULL);
 550  554          }
 551  555  
 552  556          ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
 553  557              sizeof (vopstats_t)/sizeof (kstat_named_t),
 554  558              KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
 555  559          if (ksp) {
 556  560                  ksp->ks_data = vsp;
 557  561                  kstat_install(ksp);
 558  562          }
 559  563  
 560  564          return (ksp);
 561  565  }
 562  566  
 563  567  /*
 564  568   * Called from vfsinit() to initialize the support mechanisms for vopstats
 565  569   */
 566  570  void
 567  571  vopstats_startup()
 568  572  {
 569  573          if (!vopstats_enabled)
 570  574                  return;
 571  575  
 572  576          /*
 573  577           * Creates the AVL tree which holds per-vfs vopstat anchors.  This
 574  578           * is necessary since we need to check if a kstat exists before we
 575  579           * attempt to create it.  Also, initialize its lock.
 576  580           */
 577  581          avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 578  582              offsetof(vsk_anchor_t, vsk_node));
 579  583          mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 580  584  
 581  585          vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 582  586              sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 583  587              NULL, NULL, 0);
 584  588  
 585  589          /*
 586  590           * Set up the array of pointers for the vopstats-by-FS-type.
 587  591           * The entries will be allocated/initialized as each file system
 588  592           * goes through modload/mod_installfs.
 589  593           */
 590  594          vopstats_fstype = (vopstats_t **)kmem_zalloc(
 591  595              (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
 592  596  
 593  597          /* Set up the global vopstats initialization template */
 594  598          vs_templatep = create_vopstats_template();
 595  599  }
 596  600  
 597  601  /*
 598  602   * We need to have the all of the counters zeroed.
 599  603   * The initialization of the vopstats_t includes on the order of
 600  604   * 50 calls to kstat_named_init().  Rather that do that on every call,
 601  605   * we do it once in a template (vs_templatep) then bcopy it over.
 602  606   */
 603  607  void
 604  608  initialize_vopstats(vopstats_t *vsp)
 605  609  {
 606  610          if (vsp == NULL)
 607  611                  return;
 608  612  
 609  613          bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 610  614  }
 611  615  
 612  616  /*
 613  617   * If possible, determine which vopstats by fstype to use and
 614  618   * return a pointer to the caller.
 615  619   */
 616  620  vopstats_t *
 617  621  get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 618  622  {
 619  623          int             fstype = 0;     /* Index into vfssw[] */
 620  624          vopstats_t      *vsp = NULL;
 621  625  
 622  626          if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
 623  627              !vopstats_enabled)
 624  628                  return (NULL);
 625  629          /*
 626  630           * Set up the fstype.  We go to so much trouble because all versions
 627  631           * of NFS use the same fstype in their vfs even though they have
 628  632           * distinct entries in the vfssw[] table.
 629  633           * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 630  634           */
 631  635          if (vswp) {
 632  636                  fstype = vswp - vfssw;  /* Gets us the index */
 633  637          } else {
 634  638                  fstype = vfsp->vfs_fstype;
 635  639          }
 636  640  
 637  641          /*
 638  642           * Point to the per-fstype vopstats. The only valid values are
 639  643           * non-zero positive values less than the number of vfssw[] table
 640  644           * entries.
 641  645           */
 642  646          if (fstype > 0 && fstype < nfstype) {
 643  647                  vsp = vopstats_fstype[fstype];
 644  648          }
 645  649  
 646  650          return (vsp);
 647  651  }
 648  652  
/*
 * Generate a kstat name, create the kstat structure, and allocate a
 * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
 * to the caller.  This must only be called from a mount.
 *
 * Returns NULL if vopstats are disabled for this vfs, if VFS_STATVFS fails,
 * or if an anchor for this fsid already exists in the AVL tree.
 */
vsk_anchor_t *
get_vskstat_anchor(vfs_t *vfsp)
{
	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
	kstat_t		*ksp;			/* Ptr to new kstat */
	avl_index_t	where;			/* Location in the AVL tree */

	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
		return (NULL);

	/* Need to get the fsid to build a kstat name */
	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
		/* Create a name for our kstats based on fsid */
		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
		    VOPSTATS_STR, statvfsbuf.f_fsid);

		/* Allocate and initialize the vsk_anchor_t */
		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
		bzero(vskp, sizeof (*vskp));
		vskp->vsk_fsid = statvfsbuf.f_fsid;

		/*
		 * Insert the anchor keyed by fsid.  The tree lock is dropped
		 * before kstat creation; the anchor's presence in the tree
		 * is what prevents a duplicate from racing in.
		 */
		mutex_enter(&vskstat_tree_lock);
		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
			avl_insert(&vskstat_tree, vskp, where);
			mutex_exit(&vskstat_tree_lock);

			/*
			 * Now that we've got the anchor in the AVL
			 * tree, we can create the kstat.
			 */
			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
			if (ksp) {
				vskp->vsk_ksp = ksp;
			}
		} else {
			/* Oops, found one! Release memory and lock. */
			mutex_exit(&vskstat_tree_lock);
			kmem_cache_free(vsk_anchor_cache, vskp);
			vskp = NULL;
		}
	}
	return (vskp);
}
 700  704  
 701  705  /*
 702  706   * We're in the process of tearing down the vfs and need to cleanup
 703  707   * the data structures associated with the vopstats. Must only be called
 704  708   * from dounmount().
 705  709   */
 706  710  void
 707  711  teardown_vopstats(vfs_t *vfsp)
 708  712  {
 709  713          vsk_anchor_t    *vskap;
 710  714          avl_index_t     where;
 711  715  
 712  716          if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 713  717              (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 714  718                  return;
 715  719  
 716  720          /* This is a safe check since VFS_STATS must be set (see above) */
 717  721          if ((vskap = vfsp->vfs_vskap) == NULL)
 718  722                  return;
 719  723  
 720  724          /* Whack the pointer right away */
 721  725          vfsp->vfs_vskap = NULL;
 722  726  
 723  727          /* Lock the tree, remove the node, and delete the kstat */
 724  728          mutex_enter(&vskstat_tree_lock);
 725  729          if (avl_find(&vskstat_tree, vskap, &where)) {
 726  730                  avl_remove(&vskstat_tree, vskap);
 727  731          }
 728  732  
 729  733          if (vskap->vsk_ksp) {
 730  734                  kstat_delete(vskap->vsk_ksp);
 731  735          }
 732  736          mutex_exit(&vskstat_tree_lock);
 733  737  
 734  738          kmem_cache_free(vsk_anchor_cache, vskap);
 735  739  }
 736  740  
/*
 * Read or write a vnode.  Called from kernel code.
 *
 * Builds a single-iovec uio over [base, base+len) at the given offset and
 * issues VOP_READ or VOP_WRITE under VOP_RWLOCK.  On success, *residp (if
 * non-NULL) receives the residual byte count; if residp is NULL, a nonzero
 * residual is reported as EIO.  Returns EROFS for writes to read-only
 * files/filesystems and EIO for negative lengths.
 */
int
vn_rdwr(
	enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	ssize_t len,
	offset_t offset,
	enum uio_seg seg,
	int ioflag,
	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp)
{
	struct uio uio;
	struct iovec iov;
	int error;
	int in_crit = 0;

	if (rw == UIO_WRITE && ISROFILE(vp))
		return (EROFS);

	if (len < 0)
		return (EIO);

	VOPXID_MAP_CR(vp, cr);

	iov.iov_base = base;
	iov.iov_len = len;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = offset;
	uio.uio_segflg = (short)seg;
	uio.uio_resid = len;
	uio.uio_llimit = ulimit;

	/*
	 * We have to enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		int svmand;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, cr, &svmand);
		if (error != 0)
			goto done;
		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	(void) VOP_RWLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	if (rw == UIO_WRITE) {
		uio.uio_fmode = FWRITE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
	} else {
		uio.uio_fmode = FREAD;
		uio.uio_extflg = UIO_COPY_CACHED;
		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
	}
	VOP_RWUNLOCK(vp,
	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
	/* No residp: treat a short transfer as an I/O error */
	if (residp)
		*residp = uio.uio_resid;
	else if (uio.uio_resid)
		error = EIO;

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
 817  821  
 818  822  /*
 819  823   * Release a vnode.  Call VOP_INACTIVE on last reference or
 820  824   * decrement reference count.
 821  825   *
 822  826   * To avoid race conditions, the v_count is left at 1 for
 823  827   * the call to VOP_INACTIVE. This prevents another thread
 824  828   * from reclaiming and releasing the vnode *before* the
 825  829   * VOP_INACTIVE routine has a chance to destroy the vnode.
 826  830   * We can't have more than 1 thread calling VOP_INACTIVE
 827  831   * on a vnode.
 828  832   */
 829  833  void
 830  834  vn_rele(vnode_t *vp)
 831  835  {
 832  836          VERIFY(vp->v_count > 0);
 833  837          mutex_enter(&vp->v_lock);
 834  838          if (vp->v_count == 1) {
 835  839                  mutex_exit(&vp->v_lock);
 836  840                  VOP_INACTIVE(vp, CRED(), NULL);
 837  841                  return;
 838  842          }
 839  843          vp->v_count--;
 840  844          mutex_exit(&vp->v_lock);
 841  845  }
 842  846  
/*
 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated
 * as a single reference, so v_count is not decremented until the last DNLC hold
 * is released. This makes it possible to distinguish vnodes that are referenced
 * only by the DNLC.
 *
 * As in vn_rele(), v_count is left at 1 for the VOP_INACTIVE call when the
 * last DNLC hold was also the last overall reference.
 */
void
vn_rele_dnlc(vnode_t *vp)
{
	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
	mutex_enter(&vp->v_lock);
	if (--vp->v_count_dnlc == 0) {
		/* Last DNLC hold gone; now drop the single v_count it held */
		if (vp->v_count == 1) {
			mutex_exit(&vp->v_lock);
			VOP_INACTIVE(vp, CRED(), NULL);
			return;
		}
		vp->v_count--;
	}
	mutex_exit(&vp->v_lock);
}
 864  868  
 865  869  /*
 866  870   * Like vn_rele() except that it clears v_stream under v_lock.
 867  871   * This is used by sockfs when it dismantels the association between
 868  872   * the sockfs node and the vnode in the underlaying file system.
 869  873   * v_lock has to be held to prevent a thread coming through the lookupname
 870  874   * path from accessing a stream head that is going away.
 871  875   */
 872  876  void
 873  877  vn_rele_stream(vnode_t *vp)
 874  878  {
 875  879          VERIFY(vp->v_count > 0);
 876  880          mutex_enter(&vp->v_lock);
 877  881          vp->v_stream = NULL;
 878  882          if (vp->v_count == 1) {
 879  883                  mutex_exit(&vp->v_lock);
 880  884                  VOP_INACTIVE(vp, CRED(), NULL);
 881  885                  return;
 882  886          }
 883  887          vp->v_count--;
 884  888          mutex_exit(&vp->v_lock);
 885  889  }
 886  890  
/*
 * Taskq callback used by vn_rele_async(): run VOP_INACTIVE for the vnode
 * in taskq context.
 */
static void
vn_rele_inactive(vnode_t *vp)
{
	VOP_INACTIVE(vp, CRED(), NULL);
}
 892  896  
/*
 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode. Note, file systems
 * already have to handle the race where the vnode is incremented before the
 * inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_count > 0);
	mutex_enter(&vp->v_lock);
	if (vp->v_count == 1) {
		mutex_exit(&vp->v_lock);
		/* TQ_SLEEP dispatch; failure would indicate a taskq bug */
		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
		    vp, TQ_SLEEP) != NULL);
		return;
	}
	vp->v_count--;
	mutex_exit(&vp->v_lock);
}
 917  921  
/*
 * Open/create a vnode by pathname.  Convenience wrapper around vn_openat()
 * with no start vnode (lookup begins at the current directory/root) and no
 * associated file descriptor (-1).
 */
int
vn_open(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask)
{
	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
	    umask, NULL, -1));
}
 931  935  
 932  936  
/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 *
 * pnamep/seg	pathname and its address space (user or kernel)
 * filemode	FREAD/FWRITE/FCREAT/FTRUNC/FEXCL/... open flags
 * createmode	permission bits for a newly created file
 * vpp		on success, receives the held, opened vnode
 * crwhy	creation reason, meaningful only with FCREAT
 * umask	mode bits to mask off (unless default ACLs apply)
 * startvp	directory to start relative lookups from (may be NULL)
 * fd		caller's file descriptor, used as the share-lock owner id
 *
 * On error the vnode is released and any partial state (open count,
 * share reservation, critical region) is unwound before returning.
 */
int
vn_openat(
	char *pnamep,
	enum uio_seg seg,
	int filemode,
	int createmode,
	struct vnode **vpp,
	enum create crwhy,
	mode_t umask,
	struct vnode *startvp,
	int fd)
{
	struct vnode *vp;
	int mode;
	int accessflags;
	int error;
	int in_crit = 0;
	int open_done = 0;
	int shrlock_done = 0;
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	struct shrlock shr;
	struct shr_locowner shr_own;

	mode = 0;
	accessflags = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
		mode |= VEXEC;

	/* symlink interpretation */
	if (filemode & FNOFOLLOW)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;

	if (filemode & FAPPEND)
		accessflags |= V_APPEND;

top:
	if (filemode & FCREAT) {
		enum vcexcl excl;

		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			excl = EXCL;
		else
			excl = NONEXCL;

		if (error =
		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
			return (error);
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupnameat(pnamep, seg, follow,
		    NULLVPP, &vp, startvp)) {
			if ((error == ESTALE) &&
			    fs_need_estale_retry(estale_retry++))
				goto top;
			return (error);
		}

		/*
		 * Get the attributes to check whether file is large.
		 * We do this only if the FOFFMAX flag is not set and
		 * only for regular files.
		 */

		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
				/*
				 * Large File API - regular open fails
				 * if FOFFMAX flag is set in file mode
				 */
				error = EOVERFLOW;
				goto out;
			}
		}
		/*
		 * Can't write directories, active texts, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			/*
			 * Allow writable directory if VDIROPEN flag is set.
			 */
			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
				error = EISDIR;
				goto out;
			}
			if (ISROFILE(vp)) {
				error = EROFS;
				goto out;
			}
			/*
			 * Can't truncate files on which
			 * sysv mandatory locking is in effect.
			 */
			if (filemode & FTRUNC) {
				vnode_t *rvp;

				if (VOP_REALVP(vp, &rvp, NULL) != 0)
					rvp = vp;
				if (rvp->v_filocks != NULL) {
					vattr.va_mask = AT_MODE;
					if ((error = VOP_GETATTR(vp,
					    &vattr, 0, CRED(), NULL)) == 0 &&
					    MANDLOCK(vp, vattr.va_mode))
						error = EAGAIN;
				}
			}
			if (error)
				goto out;
		}
		/*
		 * Check permissions.
		 */
		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
			goto out;
		/*
		 * Require FSEARCH to return a directory.
		 * Require FEXEC to return a regular file.
		 */
		if ((filemode & FSEARCH) && vp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		if ((filemode & FEXEC) && vp->v_type != VREG) {
			error = ENOEXEC;	/* XXX: error code? */
			goto out;
		}
	}

	/*
	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
	 */
	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
		error = ELOOP;
		goto out;
	}
	if (filemode & FNOLINKS) {
		vattr.va_mask = AT_NLINK;
		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
			goto out;
		}
		if (vattr.va_nlink != 1) {
			error = EMLINK;
			goto out;
		}
	}

	/*
	 * Opening a socket corresponding to the AF_UNIX pathname
	 * in the filesystem name space is not supported.
	 * However, VSOCK nodes in namefs are supported in order
	 * to make fattach work for sockets.
	 *
	 * XXX This uses VOP_REALVP to distinguish between
	 * an unopened namefs node (where VOP_REALVP returns a
	 * different VSOCK vnode) and a VSOCK created by vn_create
	 * in some file system (where VOP_REALVP would never return
	 * a different vnode).
	 */
	if (vp->v_type == VSOCK) {
		struct vnode *nvp;

		error = VOP_REALVP(vp, &nvp, NULL);
		if (error != 0 || nvp == NULL || nvp == vp ||
		    nvp->v_type != VSOCK) {
			error = EOPNOTSUPP;
			goto out;
		}
	}

	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
		/* get share reservation */
		shr.s_access = 0;
		if (filemode & FWRITE)
			shr.s_access |= F_WRACC;
		if (filemode & FREAD)
			shr.s_access |= F_RDACC;
		shr.s_deny = 0;
		shr.s_sysid = 0;
		shr.s_pid = ttoproc(curthread)->p_pid;
		shr_own.sl_pid = shr.s_pid;
		shr_own.sl_id = fd;
		shr.s_own_len = sizeof (shr_own);
		shr.s_owner = (caddr_t)&shr_own;
		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
		    NULL);
		if (error)
			goto out;
		shrlock_done = 1;

		/* nbmand conflict check if truncating file */
		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;

			vattr.va_mask = AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
				goto out;
			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
			    NULL)) {
				error = EACCES;
				goto out;
			}
		}
	}

	/*
	 * Do opening protocol.
	 */
	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
	if (error)
		goto out;
	open_done = 1;

	/*
	 * Truncate if required.
	 */
	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
		vattr.va_size = 0;
		vattr.va_mask = AT_SIZE;
		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
			goto out;
	}
out:
	ASSERT(vp->v_count > 0);

	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (error) {
		if (open_done) {
			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
			    NULL);
			open_done = 0;
			/* VOP_CLOSE releases the share reservation too */
			shrlock_done = 0;
		}
		if (shrlock_done) {
			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
			    NULL);
			shrlock_done = 0;
		}

		/*
		 * The following clause was added to handle a problem
		 * with NFS consistency.  It is possible that a lookup
		 * of the file to be opened succeeded, but the file
		 * itself doesn't actually exist on the server.  This
		 * is chiefly due to the DNLC containing an entry for
		 * the file which has been removed on the server.  In
		 * this case, we just start over.  If there was some
		 * other cause for the ESTALE error, then the lookup
		 * of the file will fail and the error will be returned
		 * above instead of looping around from here.
		 */
		VN_RELE(vp);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
	} else
		*vpp = vp;
	return (error);
}
1227 1231  
1228 1232  /*
1229 1233   * The following two accessor functions are for the NFSv4 server.  Since there
1230 1234   * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
1231 1235   * vnode open counts correct when a client "upgrades" an open or does an
1232 1236   * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
1233 1237   * open mode (add or subtract read or write), but also change the share/deny
1234 1238   * modes.  However, share reservations are not integrated with OPEN, yet, so
1235 1239   * we need to handle each separately.  These functions are cleaner than having
1236 1240   * the NFS server manipulate the counts directly, however, nobody else should
1237 1241   * use these functions.
1238 1242   */
1239 1243  void
1240 1244  vn_open_upgrade(
1241 1245          vnode_t *vp,
1242 1246          int filemode)
1243 1247  {
1244 1248          ASSERT(vp->v_type == VREG);
1245 1249  
1246 1250          if (filemode & FREAD)
1247 1251                  atomic_inc_32(&vp->v_rdcnt);
1248 1252          if (filemode & FWRITE)
1249 1253                  atomic_inc_32(&vp->v_wrcnt);
1250 1254  
1251 1255  }
1252 1256  
1253 1257  void
1254 1258  vn_open_downgrade(
1255 1259          vnode_t *vp,
1256 1260          int filemode)
1257 1261  {
1258 1262          ASSERT(vp->v_type == VREG);
1259 1263  
1260 1264          if (filemode & FREAD) {
1261 1265                  ASSERT(vp->v_rdcnt > 0);
1262 1266                  atomic_dec_32(&vp->v_rdcnt);
1263 1267          }
1264 1268          if (filemode & FWRITE) {
1265 1269                  ASSERT(vp->v_wrcnt > 0);
1266 1270                  atomic_dec_32(&vp->v_wrcnt);
1267 1271          }
1268 1272  
1269 1273  }
1270 1274  
/*
 * Create a vnode by pathname.  Convenience wrapper around vn_createat()
 * with no start vnode (lookup begins at the current directory/root).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}
1286 1290  
1287 1291  /*
1288 1292   * Create a vnode (makenode).
1289 1293   */
1290 1294  int
1291 1295  vn_createat(
1292 1296          char *pnamep,
1293 1297          enum uio_seg seg,
1294 1298          struct vattr *vap,
1295 1299          enum vcexcl excl,
1296 1300          int mode,
1297 1301          struct vnode **vpp,
1298 1302          enum create why,
1299 1303          int flag,
1300 1304          mode_t umask,
1301 1305          struct vnode *startvp)
1302 1306  {
1303 1307          struct vnode *dvp;      /* ptr to parent dir vnode */
1304 1308          struct vnode *vp = NULL;
1305 1309          struct pathname pn;
1306 1310          int error;
1307 1311          int in_crit = 0;
1308 1312          struct vattr vattr;
1309 1313          enum symfollow follow;
1310 1314          int estale_retry = 0;
1311 1315          uint32_t auditing = AU_AUDITING();
1312 1316  
1313 1317          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1314 1318  
1315 1319          /* symlink interpretation */
1316 1320          if ((flag & FNOFOLLOW) || excl == EXCL)
1317 1321                  follow = NO_FOLLOW;
1318 1322          else
1319 1323                  follow = FOLLOW;
1320 1324          flag &= ~(FNOFOLLOW|FNOLINKS);
1321 1325  
1322 1326  top:
1323 1327          /*
1324 1328           * Lookup directory.
1325 1329           * If new object is a file, call lower level to create it.
1326 1330           * Note that it is up to the lower level to enforce exclusive
1327 1331           * creation, if the file is already there.
1328 1332           * This allows the lower level to do whatever
1329 1333           * locking or protocol that is needed to prevent races.
1330 1334           * If the new object is directory call lower level to make
1331 1335           * the new directory, with "." and "..".
1332 1336           */
1333 1337          if (error = pn_get(pnamep, seg, &pn))
1334 1338                  return (error);
1335 1339          if (auditing)
1336 1340                  audit_vncreate_start();
1337 1341          dvp = NULL;
1338 1342          *vpp = NULL;
1339 1343          /*
1340 1344           * lookup will find the parent directory for the vnode.
1341 1345           * When it is done the pn holds the name of the entry
1342 1346           * in the directory.
1343 1347           * If this is a non-exclusive create we also find the node itself.
1344 1348           */
1345 1349          error = lookuppnat(&pn, NULL, follow, &dvp,
1346 1350              (excl == EXCL) ? NULLVPP : vpp, startvp);
1347 1351          if (error) {
1348 1352                  pn_free(&pn);
1349 1353                  if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1350 1354                          goto top;
1351 1355                  if (why == CRMKDIR && error == EINVAL)
1352 1356                          error = EEXIST;         /* SVID */
1353 1357                  return (error);
1354 1358          }
1355 1359  
1356 1360          if (why != CRMKNOD)
1357 1361                  vap->va_mode &= ~VSVTX;
1358 1362  
1359 1363          /*
1360 1364           * If default ACLs are defined for the directory don't apply the
1361 1365           * umask if umask is passed.
1362 1366           */
1363 1367  
1364 1368          if (umask) {
1365 1369  
1366 1370                  vsecattr_t vsec;
1367 1371  
1368 1372                  vsec.vsa_aclcnt = 0;
1369 1373                  vsec.vsa_aclentp = NULL;
1370 1374                  vsec.vsa_dfaclcnt = 0;
1371 1375                  vsec.vsa_dfaclentp = NULL;
1372 1376                  vsec.vsa_mask = VSA_DFACLCNT;
1373 1377                  error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
1374 1378                  /*
1375 1379                   * If error is ENOSYS then treat it as no error
1376 1380                   * Don't want to force all file systems to support
1377 1381                   * aclent_t style of ACL's.
1378 1382                   */
1379 1383                  if (error == ENOSYS)
1380 1384                          error = 0;
1381 1385                  if (error) {
1382 1386                          if (*vpp != NULL)
1383 1387                                  VN_RELE(*vpp);
1384 1388                          goto out;
1385 1389                  } else {
1386 1390                          /*
1387 1391                           * Apply the umask if no default ACLs.
1388 1392                           */
1389 1393                          if (vsec.vsa_dfaclcnt == 0)
1390 1394                                  vap->va_mode &= ~umask;
1391 1395  
1392 1396                          /*
1393 1397                           * VOP_GETSECATTR() may have allocated memory for
1394 1398                           * ACLs we didn't request, so double-check and
1395 1399                           * free it if necessary.
1396 1400                           */
1397 1401                          if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
1398 1402                                  kmem_free((caddr_t)vsec.vsa_aclentp,
1399 1403                                      vsec.vsa_aclcnt * sizeof (aclent_t));
1400 1404                          if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
1401 1405                                  kmem_free((caddr_t)vsec.vsa_dfaclentp,
1402 1406                                      vsec.vsa_dfaclcnt * sizeof (aclent_t));
1403 1407                  }
1404 1408          }
1405 1409  
1406 1410          /*
1407 1411           * In general we want to generate EROFS if the file system is
1408 1412           * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
1409 1413           * documents the open system call, and it says that O_CREAT has no
1410 1414           * effect if the file already exists.  Bug 1119649 states
1411 1415           * that open(path, O_CREAT, ...) fails when attempting to open an
1412 1416           * existing file on a read only file system.  Thus, the first part
1413 1417           * of the following if statement has 3 checks:
1414 1418           *      if the file exists &&
1415 1419           *              it is being opened with write access &&
1416 1420           *              the file system is read only
1417 1421           *      then generate EROFS
1418 1422           */
1419 1423          if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
1420 1424              (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
1421 1425                  if (*vpp)
1422 1426                          VN_RELE(*vpp);
1423 1427                  error = EROFS;
1424 1428          } else if (excl == NONEXCL && *vpp != NULL) {
1425 1429                  vnode_t *rvp;
1426 1430  
1427 1431                  /*
1428 1432                   * File already exists.  If a mandatory lock has been
1429 1433                   * applied, return error.
1430 1434                   */
1431 1435                  vp = *vpp;
1432 1436                  if (VOP_REALVP(vp, &rvp, NULL) != 0)
1433 1437                          rvp = vp;
1434 1438                  if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
1435 1439                          nbl_start_crit(vp, RW_READER);
1436 1440                          in_crit = 1;
1437 1441                  }
1438 1442                  if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
1439 1443                          vattr.va_mask = AT_MODE|AT_SIZE;
1440 1444                          if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
1441 1445                                  goto out;
1442 1446                          }
1443 1447                          if (MANDLOCK(vp, vattr.va_mode)) {
1444 1448                                  error = EAGAIN;
1445 1449                                  goto out;
1446 1450                          }
1447 1451                          /*
1448 1452                           * File cannot be truncated if non-blocking mandatory
1449 1453                           * locks are currently on the file.
1450 1454                           */
1451 1455                          if ((vap->va_mask & AT_SIZE) && in_crit) {
1452 1456                                  u_offset_t offset;
1453 1457                                  ssize_t length;
1454 1458  
1455 1459                                  offset = vap->va_size > vattr.va_size ?
1456 1460                                      vattr.va_size : vap->va_size;
1457 1461                                  length = vap->va_size > vattr.va_size ?
1458 1462                                      vap->va_size - vattr.va_size :
1459 1463                                      vattr.va_size - vap->va_size;
1460 1464                                  if (nbl_conflict(vp, NBL_WRITE, offset,
1461 1465                                      length, 0, NULL)) {
1462 1466                                          error = EACCES;
1463 1467                                          goto out;
1464 1468                                  }
1465 1469                          }
1466 1470                  }
1467 1471  
1468 1472                  /*
1469 1473                   * If the file is the root of a VFS, we've crossed a
1470 1474                   * mount point and the "containing" directory that we
1471 1475                   * acquired above (dvp) is irrelevant because it's in
1472 1476                   * a different file system.  We apply VOP_CREATE to the
1473 1477                   * target itself instead of to the containing directory
1474 1478                   * and supply a null path name to indicate (conventionally)
1475 1479                   * the node itself as the "component" of interest.
1476 1480                   *
1477 1481                   * The intercession of the file system is necessary to
1478 1482                   * ensure that the appropriate permission checks are
1479 1483                   * done.
1480 1484                   */
1481 1485                  if (vp->v_flag & VROOT) {
1482 1486                          ASSERT(why != CRMKDIR);
1483 1487                          error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
1484 1488                              CRED(), flag, NULL, NULL);
1485 1489                          /*
1486 1490                           * If the create succeeded, it will have created
1487 1491                           * a new reference to the vnode.  Give up the
1488 1492                           * original reference.  The assertion should not
1489 1493                           * get triggered because NBMAND locks only apply to
1490 1494                           * VREG files.  And if in_crit is non-zero for some
1491 1495                           * reason, detect that here, rather than when we
1492 1496                           * dereference a null vp.
1493 1497                           */
1494 1498                          ASSERT(in_crit == 0);
1495 1499                          VN_RELE(vp);
1496 1500                          vp = NULL;
1497 1501                          goto out;
1498 1502                  }
1499 1503  
1500 1504                  /*
1501 1505                   * Large File API - non-large open (FOFFMAX flag not set)
1502 1506                   * of regular file fails if the file size exceeds MAXOFF32_T.
1503 1507                   */
1504 1508                  if (why != CRMKDIR &&
1505 1509                      !(flag & FOFFMAX) &&
1506 1510                      (vp->v_type == VREG)) {
1507 1511                          vattr.va_mask = AT_SIZE;
1508 1512                          if ((error = VOP_GETATTR(vp, &vattr, 0,
1509 1513                              CRED(), NULL))) {
1510 1514                                  goto out;
1511 1515                          }
1512 1516                          if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
1513 1517                                  error = EOVERFLOW;
1514 1518                                  goto out;
1515 1519                          }
1516 1520                  }
1517 1521          }
1518 1522  
1519 1523          if (error == 0) {
1520 1524                  /*
1521 1525                   * Call mkdir() if specified, otherwise create().
1522 1526                   */
1523 1527                  int must_be_dir = pn_fixslash(&pn);     /* trailing '/'? */
1524 1528  
1525 1529                  if (why == CRMKDIR)
1526 1530                          /*
1527 1531                           * N.B., if vn_createat() ever requests
1528 1532                           * case-insensitive behavior then it will need
1529 1533                           * to be passed to VOP_MKDIR().  VOP_CREATE()
1530 1534                           * will already get it via "flag"
1531 1535                           */
1532 1536                          error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
1533 1537                              NULL, 0, NULL);
1534 1538                  else if (!must_be_dir)
1535 1539                          error = VOP_CREATE(dvp, pn.pn_path, vap,
1536 1540                              excl, mode, vpp, CRED(), flag, NULL, NULL);
1537 1541                  else
1538 1542                          error = ENOTDIR;
1539 1543          }
1540 1544  
1541 1545  out:
1542 1546  
1543 1547          if (auditing)
1544 1548                  audit_vncreate_finish(*vpp, error);
1545 1549          if (in_crit) {
1546 1550                  nbl_end_crit(vp);
1547 1551                  in_crit = 0;
1548 1552          }
1549 1553          if (vp != NULL) {
1550 1554                  VN_RELE(vp);
1551 1555                  vp = NULL;
1552 1556          }
1553 1557          pn_free(&pn);
1554 1558          VN_RELE(dvp);
1555 1559          /*
1556 1560           * The following clause was added to handle a problem
1557 1561           * with NFS consistency.  It is possible that a lookup
1558 1562           * of the file to be created succeeded, but the file
1559 1563           * itself doesn't actually exist on the server.  This
1560 1564           * is chiefly due to the DNLC containing an entry for
1561 1565           * the file which has been removed on the server.  In
1562 1566           * this case, we just start over.  If there was some
1563 1567           * other cause for the ESTALE error, then the lookup
1564 1568           * of the file will fail and the error will be returned
1565 1569           * above instead of looping around from here.
1566 1570           */
1567 1571          if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
1568 1572                  goto top;
1569 1573          return (error);
1570 1574  }
1571 1575  
1572 1576  int
1573 1577  vn_link(char *from, char *to, enum uio_seg seg)
1574 1578  {
1575 1579          return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
1576 1580  }
1577 1581  
/*
 * Create a hard link to an existing file ("from"), giving it the new
 * name "to".  fstartvp/tstartvp are optional starting directories for
 * relative lookups (the *at() system call family); "follow" controls
 * symlink traversal of the source path; "seg" says whether the path
 * strings live in user or kernel space.
 *
 * Returns 0 on success or an errno value: EXDEV when source and target
 * directory are on different filesystems, EROFS when the target
 * filesystem is read-only, or whatever the lookups / VOP_LINK() return.
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;		/* bounds the ESTALE retry loop */
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	/* Resolve the existing source file itself. */
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	/* Resolve only the parent directory of the link to be created. */
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	/* Common cleanup: drop the pathname and any references we hold. */
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	/*
	 * A stale NFS handle (e.g. DNLC entry for a file removed on the
	 * server) is retried from the top a bounded number of times.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1637 1641  
1638 1642  int
1639 1643  vn_rename(char *from, char *to, enum uio_seg seg)
1640 1644  {
1641 1645          return (vn_renameat(NULL, from, NULL, to, seg));
1642 1646  }
1643 1647  
/*
 * Rename the file named by fname to tname.  fdvp/tdvp are optional
 * starting directories for relative lookups (renameat(2) semantics);
 * "seg" says whether the path strings are in user or kernel space.
 *
 * Returns 0 on success or an errno value: ENOENT if the source entry
 * does not exist, EXDEV if source and target directories are on
 * different filesystems, EROFS if the target filesystem is read-only,
 * EACCES if a non-blocking mandatory (nbmand) share reservation
 * conflicts with the rename, or whatever the lookups / VOP_RENAME()
 * return.
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
		char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;	/* nbl critical-region flags */
	vnode_t *fromvp, *fvp;		/* source dir / source entry */
	vnode_t *tovp, *targvp;		/* target dir / existing target */
	int estale_retry = 0;		/* bounds the ESTALE retry loop */
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * If the rename would remove an existing target, check for
	 * conflicting nbmand share reservations on it first.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/* Likewise check the source for a conflicting reservation. */
	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	/*
	 * Exit any nbl critical regions before dropping the vnode
	 * references they protect.
	 */
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	/* Retry a bounded number of times on a stale (NFS) handle. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1772 1776  
1773 1777  /*
1774 1778   * Remove a file or directory.
1775 1779   */
1776 1780  int
1777 1781  vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
1778 1782  {
1779 1783          return (vn_removeat(NULL, fnamep, seg, dirflag));
1780 1784  }
1781 1785  
/*
 * Remove the directory entry named by fnamep, resolved relative to the
 * optional starting vnode startvp (unlinkat(2)/rmdir(2) semantics).
 * dirflag is RMDIRECTORY for rmdir(2)-style removal (target must be a
 * directory) or RMFILE for unlink(2)-style removal.
 *
 * Returns 0 on success or an errno value.  Handles the special case of
 * a namefs mount on top of the target by unmounting it first and then
 * removing the covered vnode.
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;	/* inside an nbl critical region? */
	int estale_retry = 0;	/* bounds the ESTALE retry loop */

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/* Capture the type before vp may be released below. */
	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			/* Hold cwd so VOP_RMDIR can reject removing it. */
			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	/* Exit the critical region (if any) before releasing vp. */
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	/* Retry a bounded number of times on a stale (NFS) handle. */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}
1969 1973  
1970 1974  /*
1971 1975   * Utility function to compare equality of vnodes.
1972 1976   * Compare the underlying real vnodes, if there are underlying vnodes.
1973 1977   * This is a more thorough comparison than the VN_CMP() macro provides.
1974 1978   */
1975 1979  int
1976 1980  vn_compare(vnode_t *vp1, vnode_t *vp2)
1977 1981  {
1978 1982          vnode_t *realvp;
1979 1983  
1980 1984          if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
1981 1985                  vp1 = realvp;
1982 1986          if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
1983 1987                  vp2 = realvp;
1984 1988          return (VN_CMP(vp1, vp2));
1985 1989  }
1986 1990  
1987 1991  /*
1988 1992   * The number of locks to hash into.  This value must be a power
1989 1993   * of 2 minus 1 and should probably also be prime.
1990 1994   */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket: a mutex protecting a singly-linked list of
 * vn_vfslocks_entry_t.  The pad sizes each bucket to 64 bytes so
 * adjacent buckets do not share a cache line.
 */
struct	vn_vfslocks_bucket {
	kmutex_t vb_lock;
	vn_vfslocks_entry_t *vb_list;
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

/*
 * Hash a vfs/vnode pointer into a bucket index: shift off the
 * low-order bits (similar for all such pointers due to allocation
 * alignment), then mask with NUM_BUCKETS (a power of 2 minus 1).
 */
#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
2010 2014  
2011 2015  /*
2012 2016   * vn_vfslocks_getlock() uses an HASH scheme to generate
2013 2017   * rwstlock using vfs/vnode pointer passed to it.
2014 2018   *
2015 2019   * vn_vfslocks_rele() releases a reference in the
2016 2020   * HASH table which allows the entry allocated by
2017 2021   * vn_vfslocks_getlock() to be freed at a later
2018 2022   * stage when the refcount drops to zero.
2019 2023   */
2020 2024  
/*
 * Return the hash-table entry (holding an rwstlock) for the given
 * vfs/vnode pointer, creating it if none exists, and take a reference
 * on it.  The reference is dropped via vn_vfslocks_rele().
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	/* Fast path: the entry already exists in the bucket. */
	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	/*
	 * Drop the bucket lock before the KM_SLEEP allocation, then
	 * re-scan under the lock: another thread may have inserted the
	 * same entry while we slept.
	 */
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	/* We won the race: link our new entry at the head of the list. */
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}
2064 2068  
/*
 * Drop a reference on a hash-table entry obtained from
 * vn_vfslocks_getlock().  When the refcount reaches zero the entry is
 * unlinked from its bucket and freed.  Panics on a negative refcount
 * or if a zero-refcount entry cannot be found in its bucket, since
 * either indicates reference-counting corruption.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;	/* trails vep during list walk */

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		/* Last reference: unlink the entry and destroy it. */
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * pvep is valid here: a non-head
					 * match means at least one prior
					 * iteration set it.
					 */
					/* LINTED */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}
2103 2107  
2104 2108  /*
2105 2109   * vn_vfswlock_wait is used to implement a lock which is logically a writers
2106 2110   * lock protecting the v_vfsmountedhere field.
2107 2111   * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
2108 2112   * except that it blocks to acquire the lock VVFSLOCK.
2109 2113   *
2110 2114   * traverse() and routines re-implementing part of traverse (e.g. autofs)
2111 2115   * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
2112 2116   * need the non-blocking version of the writers lock i.e. vn_vfswlock
2113 2117   */
2114 2118  int
2115 2119  vn_vfswlock_wait(vnode_t *vp)
2116 2120  {
2117 2121          int retval;
2118 2122          vn_vfslocks_entry_t *vpvfsentry;
2119 2123          ASSERT(vp != NULL);
2120 2124  
2121 2125          vpvfsentry = vn_vfslocks_getlock(vp);
2122 2126          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);
2123 2127  
2124 2128          if (retval == EINTR) {
2125 2129                  vn_vfslocks_rele(vpvfsentry);
2126 2130                  return (EINTR);
2127 2131          }
2128 2132          return (retval);
2129 2133  }
2130 2134  
2131 2135  int
2132 2136  vn_vfsrlock_wait(vnode_t *vp)
2133 2137  {
2134 2138          int retval;
2135 2139          vn_vfslocks_entry_t *vpvfsentry;
2136 2140          ASSERT(vp != NULL);
2137 2141  
2138 2142          vpvfsentry = vn_vfslocks_getlock(vp);
2139 2143          retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);
2140 2144  
2141 2145          if (retval == EINTR) {
2142 2146                  vn_vfslocks_rele(vpvfsentry);
2143 2147                  return (EINTR);
2144 2148          }
2145 2149  
2146 2150          return (retval);
2147 2151  }
2148 2152  
2149 2153  
2150 2154  /*
2151 2155   * vn_vfswlock is used to implement a lock which is logically a writers lock
2152 2156   * protecting the v_vfsmountedhere field.
2153 2157   */
2154 2158  int
2155 2159  vn_vfswlock(vnode_t *vp)
2156 2160  {
2157 2161          vn_vfslocks_entry_t *vpvfsentry;
2158 2162  
2159 2163          /*
2160 2164           * If vp is NULL then somebody is trying to lock the covered vnode
2161 2165           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2162 2166           * only happen when unmounting /.  Since that operation will fail
2163 2167           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2164 2168           */
2165 2169          if (vp == NULL)
2166 2170                  return (EBUSY);
2167 2171  
2168 2172          vpvfsentry = vn_vfslocks_getlock(vp);
2169 2173  
2170 2174          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
2171 2175                  return (0);
2172 2176  
2173 2177          vn_vfslocks_rele(vpvfsentry);
2174 2178          return (EBUSY);
2175 2179  }
2176 2180  
2177 2181  int
2178 2182  vn_vfsrlock(vnode_t *vp)
2179 2183  {
2180 2184          vn_vfslocks_entry_t *vpvfsentry;
2181 2185  
2182 2186          /*
2183 2187           * If vp is NULL then somebody is trying to lock the covered vnode
2184 2188           * of /.  (vfs_vnodecovered is NULL for /).  This situation will
2185 2189           * only happen when unmounting /.  Since that operation will fail
2186 2190           * anyway, return EBUSY here instead of in VFS_UNMOUNT.
2187 2191           */
2188 2192          if (vp == NULL)
2189 2193                  return (EBUSY);
2190 2194  
2191 2195          vpvfsentry = vn_vfslocks_getlock(vp);
2192 2196  
2193 2197          if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
2194 2198                  return (0);
2195 2199  
2196 2200          vn_vfslocks_rele(vpvfsentry);
2197 2201          return (EBUSY);
2198 2202  }
2199 2203  
2200 2204  void
2201 2205  vn_vfsunlock(vnode_t *vp)
2202 2206  {
2203 2207          vn_vfslocks_entry_t *vpvfsentry;
2204 2208  
2205 2209          /*
2206 2210           * ve_refcnt needs to be decremented twice.
2207 2211           * 1. To release refernce after a call to vn_vfslocks_getlock()
2208 2212           * 2. To release the reference from the locking routines like
2209 2213           *    vn_vfsrlock/vn_vfswlock etc,.
2210 2214           */
2211 2215          vpvfsentry = vn_vfslocks_getlock(vp);
2212 2216          vn_vfslocks_rele(vpvfsentry);
2213 2217  
2214 2218          rwst_exit(&vpvfsentry->ve_lock);
2215 2219          vn_vfslocks_rele(vpvfsentry);
2216 2220  }
2217 2221  
2218 2222  int
2219 2223  vn_vfswlock_held(vnode_t *vp)
2220 2224  {
2221 2225          int held;
2222 2226          vn_vfslocks_entry_t *vpvfsentry;
2223 2227  
2224 2228          ASSERT(vp != NULL);
2225 2229  
2226 2230          vpvfsentry = vn_vfslocks_getlock(vp);
2227 2231          held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
2228 2232  
2229 2233          vn_vfslocks_rele(vpvfsentry);
2230 2234          return (held);
2231 2235  }
2232 2236  
2233 2237  
2234 2238  int
2235 2239  vn_make_ops(
2236 2240          const char *name,                       /* Name of file system */
2237 2241          const fs_operation_def_t *templ,        /* Operation specification */
2238 2242          vnodeops_t **actual)                    /* Return the vnodeops */
2239 2243  {
2240 2244          int unused_ops;
2241 2245          int error;
2242 2246  
2243 2247          *actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
2244 2248  
2245 2249          (*actual)->vnop_name = name;
2246 2250  
2247 2251          error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
2248 2252          if (error) {
2249 2253                  kmem_free(*actual, sizeof (vnodeops_t));
2250 2254          }
2251 2255  
2252 2256  #if DEBUG
2253 2257          if (unused_ops != 0)
2254 2258                  cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
2255 2259                      "but not used", name, unused_ops);
2256 2260  #endif
2257 2261  
2258 2262          return (error);
2259 2263  }
2260 2264  
2261 2265  /*
2262 2266   * Free the vnodeops created as a result of vn_make_ops()
2263 2267   */
2264 2268  void
2265 2269  vn_freevnodeops(vnodeops_t *vnops)
2266 2270  {
2267 2271          kmem_free(vnops, sizeof (vnodeops_t));
2268 2272  }
2269 2273  
2270 2274  /*
2271 2275   * Vnode cache.
2272 2276   */
2273 2277  
2274 2278  /* ARGSUSED */
2275 2279  static int
2276 2280  vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2277 2281  {
2278 2282          struct vnode *vp;
2279 2283  
2280 2284          vp = buf;
2281 2285  
2282 2286          mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2283 2287          mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2284 2288          cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2285 2289          rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2286 2290          vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2287 2291          vp->v_path = NULL;
2288 2292          vp->v_mpssdata = NULL;
2289 2293          vp->v_vsd = NULL;
2290 2294          vp->v_fopdata = NULL;
2291 2295  
2292 2296          return (0);
2293 2297  }
2294 2298  
2295 2299  /* ARGSUSED */
2296 2300  static void
2297 2301  vn_cache_destructor(void *buf, void *cdrarg)
2298 2302  {
2299 2303          struct vnode *vp;
2300 2304  
2301 2305          vp = buf;
2302 2306  
2303 2307          rw_destroy(&vp->v_nbllock);
2304 2308          cv_destroy(&vp->v_cv);
2305 2309          mutex_destroy(&vp->v_vsd_lock);
2306 2310          mutex_destroy(&vp->v_lock);
2307 2311  }
2308 2312  
/*
 * Create the global kmem cache from which all vnodes are allocated.
 * The alignment sanity check guards against struct vnode growing past
 * the VNODE_ALIGN boundary the cache relies on.
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}
2319 2323  
/*
 * Destroy the global vnode kmem cache created by vn_create_cache().
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}
2325 2329  
/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.  Resets the
 * open/map counts and releases FEM state, the cached path, fop data and
 * vnode-specific data so the vnode can be reused.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(vp->v_pages == NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	/* Release the cached path, if any (NUL-terminated, hence the +1). */
	if (vp->v_path) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}
2367 2371  
/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;	/* the caller's initial hold */
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	vp->v_pages = NULL;

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}
2400 2404  
2401 2405  vnode_t *
2402 2406  vn_alloc(int kmflag)
2403 2407  {
2404 2408          vnode_t *vp;
2405 2409  
2406 2410          vp = kmem_cache_alloc(vn_cache, kmflag);
2407 2411  
2408 2412          if (vp != NULL) {
2409 2413                  vp->v_femhead = NULL;   /* Must be done before vn_reinit() */
2410 2414                  vp->v_fopdata = NULL;
2411 2415                  vn_reinit(vp);
2412 2416          }
2413 2417  
2414 2418          return (vp);
2415 2419  }
2416 2420  
/*
 * Release a vnode back to the cache, tearing down the cached path,
 * FEM state, fop data and vnode-specific data first.  The caller must
 * have already released any file locks and share locks.
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	/* Release the cached path, if any (NUL-terminated, hence the +1). */
	if (vp->v_path != NULL) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}
2451 2455  
2452 2456  /*
2453 2457   * vnode status changes, should define better states than 1, 0.
2454 2458   */
2455 2459  void
2456 2460  vn_reclaim(vnode_t *vp)
2457 2461  {
2458 2462          vfs_t   *vfsp = vp->v_vfsp;
2459 2463  
2460 2464          if (vfsp == NULL ||
2461 2465              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2462 2466                  return;
2463 2467          }
2464 2468          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
2465 2469  }
2466 2470  
2467 2471  void
2468 2472  vn_idle(vnode_t *vp)
2469 2473  {
2470 2474          vfs_t   *vfsp = vp->v_vfsp;
2471 2475  
2472 2476          if (vfsp == NULL ||
2473 2477              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2474 2478                  return;
2475 2479          }
2476 2480          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
2477 2481  }
2478 2482  void
2479 2483  vn_exists(vnode_t *vp)
2480 2484  {
2481 2485          vfs_t   *vfsp = vp->v_vfsp;
2482 2486  
2483 2487          if (vfsp == NULL ||
2484 2488              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2485 2489                  return;
2486 2490          }
2487 2491          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
2488 2492  }
2489 2493  
2490 2494  void
2491 2495  vn_invalid(vnode_t *vp)
2492 2496  {
2493 2497          vfs_t   *vfsp = vp->v_vfsp;
2494 2498  
2495 2499          if (vfsp == NULL ||
2496 2500              vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
2497 2501                  return;
2498 2502          }
2499 2503          (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2500 2504  }
2501 2505  
2502 2506  /* Vnode event notification */
2503 2507  
2504 2508  int
2505 2509  vnevent_support(vnode_t *vp, caller_context_t *ct)
2506 2510  {
2507 2511          if (vp == NULL)
2508 2512                  return (EINVAL);
  
    | 
      ↓ open down ↓ | 
    2296 lines elided | 
    
      ↑ open up ↑ | 
  
2509 2513  
2510 2514          return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2511 2515  }
2512 2516  
2513 2517  void
2514 2518  vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2515 2519  {
2516 2520          if (vp == NULL || vp->v_femhead == NULL) {
2517 2521                  return;
2518 2522          }
     2523 +        (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2519 2524          (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2520 2525  }
2521 2526  
2522 2527  void
2523 2528  vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2524 2529      caller_context_t *ct)
2525 2530  {
2526 2531          if (vp == NULL || vp->v_femhead == NULL) {
2527 2532                  return;
2528 2533          }
2529 2534          (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2530 2535  }
2531 2536  
2532 2537  void
2533      -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
     2538 +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
     2539 +    caller_context_t *ct)
2534 2540  {
2535 2541          if (vp == NULL || vp->v_femhead == NULL) {
2536 2542                  return;
2537 2543          }
2538      -        (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
     2544 +        (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2539 2545  }
2540 2546  
2541 2547  void
2542 2548  vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2543 2549  {
2544 2550          if (vp == NULL || vp->v_femhead == NULL) {
2545 2551                  return;
2546 2552          }
2547 2553          (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2548 2554  }
2549 2555  
2550 2556  void
2551 2557  vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2552 2558  {
2553 2559          if (vp == NULL || vp->v_femhead == NULL) {
2554 2560                  return;
2555 2561          }
2556 2562          (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2557 2563  }
2558 2564  
2559 2565  void
2560 2566  vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
2561 2567      caller_context_t *ct)
2562 2568  {
2563 2569          if (vp == NULL || vp->v_femhead == NULL) {
2564 2570                  return;
2565 2571          }
2566 2572          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
2567 2573  }
2568 2574  
2569 2575  void
2570 2576  vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2571 2577      caller_context_t *ct)
2572 2578  {
2573 2579          if (vp == NULL || vp->v_femhead == NULL) {
2574 2580                  return;
2575 2581          }
2576 2582          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
2577 2583  }
2578 2584  
2579 2585  void
2580 2586  vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2581 2587      caller_context_t *ct)
2582 2588  {
2583 2589          if (vp == NULL || vp->v_femhead == NULL) {
2584 2590                  return;
2585 2591          }
2586 2592          (void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
2587 2593  }
2588 2594  
2589 2595  void
2590 2596  vnevent_create(vnode_t *vp, caller_context_t *ct)
2591 2597  {
2592 2598          if (vp == NULL || vp->v_femhead == NULL) {
2593 2599                  return;
2594 2600          }
2595 2601          (void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
2596 2602  }
2597 2603  
2598 2604  void
2599 2605  vnevent_link(vnode_t *vp, caller_context_t *ct)
2600 2606  {
2601 2607          if (vp == NULL || vp->v_femhead == NULL) {
2602 2608                  return;
2603 2609          }
2604 2610          (void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
2605 2611  }
2606 2612  
2607 2613  void
2608 2614  vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2609 2615  {
2610 2616          if (vp == NULL || vp->v_femhead == NULL) {
2611 2617                  return;
2612 2618          }
2613 2619          (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2614 2620  }
  
    | 
      ↓ open down ↓ | 
    66 lines elided | 
    
      ↑ open up ↑ | 
  
2615 2621  
2616 2622  void
2617 2623  vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2618 2624  {
2619 2625          if (vp == NULL || vp->v_femhead == NULL) {
2620 2626                  return;
2621 2627          }
2622 2628          (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2623 2629  }
2624 2630  
     2631 +void
     2632 +vnevent_resize(vnode_t *vp, caller_context_t *ct)
     2633 +{
     2634 +        if (vp == NULL || vp->v_femhead == NULL) {
     2635 +                return;
     2636 +        }
     2637 +        (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
     2638 +}
     2639 +
2625 2640  /*
2626 2641   * Vnode accessors.
2627 2642   */
2628 2643  
/*
 * Return non-zero if the vnode belongs to a read-only mounted filesystem.
 * Note: this returns the raw VFS_RDONLY flag bit, not a normalized 0/1.
 */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}
2634 2649  
2635 2650  int
2636 2651  vn_has_flocks(vnode_t *vp)
2637 2652  {
2638 2653          return (vp->v_filocks != NULL);
2639 2654  }
2640 2655  
2641 2656  int
2642 2657  vn_has_mandatory_locks(vnode_t *vp, int mode)
2643 2658  {
2644 2659          return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
2645 2660  }
2646 2661  
2647 2662  int
2648 2663  vn_has_cached_data(vnode_t *vp)
2649 2664  {
2650 2665          return (vp->v_pages != NULL);
2651 2666  }
2652 2667  
/*
 * Return 0 if the vnode in question shouldn't be permitted into a zone via
 * zone_enter(2).
 */
int
vn_can_change_zones(vnode_t *vp)
{
	struct vfssw *vswp;
	int allow = 1;
	vnode_t *rvp;

	/* Global override: NFS-client-only systems always allow this. */
	if (nfs_global_client_only != 0)
		return (1);

	/*
	 * We always want to look at the underlying vnode if there is one.
	 */
	if (VOP_REALVP(vp, &rvp, NULL) != 0)
		rvp = vp;
	/*
	 * Some pseudo filesystems (including doorfs) don't actually register
	 * their vfsops_t, so the following may return NULL; we happily let
	 * such vnodes switch zones.
	 */
	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
	if (vswp != NULL) {
		if (vswp->vsw_flag & VSW_NOTZONESAFE)
			allow = 0;
		/* Drop the hold taken by vfs_getvfsswbyvfsops(). */
		vfs_unrefvfssw(vswp);
	}
	return (allow);
}
2685 2700  
2686 2701  /*
2687 2702   * Return nonzero if the vnode is a mount point, zero if not.
2688 2703   */
2689 2704  int
2690 2705  vn_ismntpt(vnode_t *vp)
2691 2706  {
2692 2707          return (vp->v_vfsmountedhere != NULL);
2693 2708  }
2694 2709  
/*
 * Retrieve the vfs (if any) mounted on this vnode; NULL when nothing
 * is mounted here.
 */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}
2701 2716  
2702 2717  /*
2703 2718   * Return nonzero if the vnode is referenced by the dnlc, zero if not.
2704 2719   */
2705 2720  int
2706 2721  vn_in_dnlc(vnode_t *vp)
2707 2722  {
2708 2723          return (vp->v_count_dnlc > 0);
2709 2724  }
2710 2725  
2711 2726  /*
2712 2727   * vn_has_other_opens() checks whether a particular file is opened by more than
2713 2728   * just the caller and whether the open is for read and/or write.
2714 2729   * This routine is for calling after the caller has already called VOP_OPEN()
2715 2730   * and the caller wishes to know if they are the only one with it open for
2716 2731   * the mode(s) specified.
2717 2732   *
2718 2733   * Vnode counts are only kept on regular files (v_type=VREG).
2719 2734   */
2720 2735  int
2721 2736  vn_has_other_opens(
2722 2737          vnode_t *vp,
2723 2738          v_mode_t mode)
2724 2739  {
2725 2740  
2726 2741          ASSERT(vp != NULL);
2727 2742  
2728 2743          switch (mode) {
2729 2744          case V_WRITE:
2730 2745                  if (vp->v_wrcnt > 1)
2731 2746                          return (V_TRUE);
2732 2747                  break;
2733 2748          case V_RDORWR:
2734 2749                  if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
2735 2750                          return (V_TRUE);
2736 2751                  break;
2737 2752          case V_RDANDWR:
2738 2753                  if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
2739 2754                          return (V_TRUE);
2740 2755                  break;
2741 2756          case V_READ:
2742 2757                  if (vp->v_rdcnt > 1)
2743 2758                          return (V_TRUE);
2744 2759                  break;
2745 2760          }
2746 2761  
2747 2762          return (V_FALSE);
2748 2763  }
2749 2764  
2750 2765  /*
2751 2766   * vn_is_opened() checks whether a particular file is opened and
2752 2767   * whether the open is for read and/or write.
2753 2768   *
2754 2769   * Vnode counts are only kept on regular files (v_type=VREG).
2755 2770   */
2756 2771  int
2757 2772  vn_is_opened(
2758 2773          vnode_t *vp,
2759 2774          v_mode_t mode)
2760 2775  {
2761 2776  
2762 2777          ASSERT(vp != NULL);
2763 2778  
2764 2779          switch (mode) {
2765 2780          case V_WRITE:
2766 2781                  if (vp->v_wrcnt)
2767 2782                          return (V_TRUE);
2768 2783                  break;
2769 2784          case V_RDANDWR:
2770 2785                  if (vp->v_rdcnt && vp->v_wrcnt)
2771 2786                          return (V_TRUE);
2772 2787                  break;
2773 2788          case V_RDORWR:
2774 2789                  if (vp->v_rdcnt || vp->v_wrcnt)
2775 2790                          return (V_TRUE);
2776 2791                  break;
2777 2792          case V_READ:
2778 2793                  if (vp->v_rdcnt)
2779 2794                          return (V_TRUE);
2780 2795                  break;
2781 2796          }
2782 2797  
2783 2798          return (V_FALSE);
2784 2799  }
2785 2800  
/*
 * vn_is_mapped() checks whether a particular file is mapped and whether
 * the file is mapped read and/or write.
 */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	/* On 64-bit, a single aligned 64-bit load suffices. */
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}
2849 2864  
/*
 * Set the operations vector for a vnode.
 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed.  This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since before which checked
 * the v_femhead pointer; then our update is ok - we are not racing with
 * FEM.
 */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);
	ASSERT(vnodeops != NULL);

	/* Snapshot v_op, then order the v_femhead read after it. */
	op = vp->v_op;
	membar_consumer();
	/*
	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
	 * in effect on the vnode and we need to have FEM deal with it.
	 */
	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
	    op) {
		fem_setvnops(vp, vnodeops);
	}
}
2879 2894  
/*
 * Retrieve the operations vector for a vnode
 * As with vn_setops(above); make sure we aren't racing with FEM.
 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
 * make sense to the callers of this routine.
 */
vnodeops_t *
vn_getops(vnode_t *vp)
{
	vnodeops_t	*op;

	ASSERT(vp != NULL);

	/* Snapshot v_op, then order the v_femhead read after it. */
	op = vp->v_op;
	membar_consumer();
	/* Unchanged v_op with no FEM head means the snapshot is usable. */
	if (vp->v_femhead == NULL && op == vp->v_op) {
		return (op);
	} else {
		return (fem_getvnops(vp));
	}
}
2901 2916  
2902 2917  /*
2903 2918   * Returns non-zero (1) if the vnodeops matches that of the vnode.
2904 2919   * Returns zero (0) if not.
2905 2920   */
2906 2921  int
2907 2922  vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
2908 2923  {
2909 2924          return (vn_getops(vp) == vnodeops);
2910 2925  }
2911 2926  
2912 2927  /*
2913 2928   * Returns non-zero (1) if the specified operation matches the
2914 2929   * corresponding operation for that the vnode.
2915 2930   * Returns zero (0) if not.
2916 2931   */
2917 2932  
2918 2933  #define MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
2919 2934  
2920 2935  int
2921 2936  vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
2922 2937  {
2923 2938          const fs_operation_trans_def_t *otdp;
2924 2939          fs_generic_func_p *loc = NULL;
2925 2940          vnodeops_t      *vop = vn_getops(vp);
2926 2941  
2927 2942          ASSERT(vopname != NULL);
2928 2943  
2929 2944          for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
2930 2945                  if (MATCHNAME(otdp->name, vopname)) {
2931 2946                          loc = (fs_generic_func_p *)
2932 2947                              ((char *)(vop) + otdp->offset);
2933 2948                          break;
2934 2949                  }
2935 2950          }
2936 2951  
2937 2952          return ((loc != NULL) && (*loc == funcp));
2938 2953  }
2939 2954  
2940 2955  /*
2941 2956   * fs_new_caller_id() needs to return a unique ID on a given local system.
2942 2957   * The IDs do not need to survive across reboots.  These are primarily
2943 2958   * used so that (FEM) monitors can detect particular callers (such as
2944 2959   * the NFS server) to a given vnode/vfs operation.
2945 2960   */
2946 2961  u_longlong_t
2947 2962  fs_new_caller_id()
2948 2963  {
2949 2964          static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2950 2965  
2951 2966          return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2952 2967  }
2953 2968  
/*
 * Given a starting vnode and a path, updates the path in the target vnode in
 * a safe manner.  If the vnode already has path information embedded, then the
 * cached path is left untouched.
 */

/* Upper bound on a cached v_path; longer results are simply not cached. */
size_t max_vnode_path = 4 * MAXPATHLEN;

void
vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
    const char *path, size_t plen)
{
	char	*rpath;
	vnode_t *base;
	size_t	rpathlen, rpathalloc;
	int	doslash = 1;

	/* An absolute path is resolved against rootvp, else against startvp. */
	if (*path == '/') {
		base = rootvp;
		path++;
		plen--;
	} else {
		base = startvp;
	}

	/*
	 * We cannot grab base->v_lock while we hold vp->v_lock because of
	 * the potential for deadlock.
	 */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL) {
		/* No base path to extend; nothing to cache. */
		mutex_exit(&base->v_lock);
		return;
	}

	rpathlen = strlen(base->v_path);
	rpathalloc = rpathlen + plen + 1;
	/* Avoid adding a slash if there's already one there */
	if (base->v_path[rpathlen-1] == '/')
		doslash = 0;
	else
		rpathalloc++;

	/*
	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
	 * so we must do this dance.  If, by chance, something changes the path,
	 * just give up since there is no real harm.
	 */
	mutex_exit(&base->v_lock);

	/* Paths should stay within reason */
	if (rpathalloc > max_vnode_path)
		return;

	rpath = kmem_alloc(rpathalloc, KM_SLEEP);

	/*
	 * Re-validate: if base->v_path vanished or changed length while the
	 * lock was dropped, the sizing above is stale -- abandon the update.
	 */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
		mutex_exit(&base->v_lock);
		kmem_free(rpath, rpathalloc);
		return;
	}
	bcopy(base->v_path, rpath, rpathlen);
	mutex_exit(&base->v_lock);

	/* Append the separator (if needed) and the new component(s). */
	if (doslash)
		rpath[rpathlen++] = '/';
	bcopy(path, rpath + rpathlen, plen);
	rpath[rpathlen + plen] = '\0';

	/* Install only if nobody else cached a path in the meantime. */
	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		mutex_exit(&vp->v_lock);
		kmem_free(rpath, rpathalloc);
	} else {
		vp->v_path = rpath;
		mutex_exit(&vp->v_lock);
	}
}
3033 3048  
3034 3049  /*
3035 3050   * Sets the path to the vnode to be the given string, regardless of current
3036 3051   * context.  The string must be a complete path from rootdir.  This is only used
3037 3052   * by fsop_root() for setting the path based on the mountpoint.
3038 3053   */
3039 3054  void
3040 3055  vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3041 3056  {
3042 3057          char *buf = kmem_alloc(len + 1, KM_SLEEP);
3043 3058  
3044 3059          mutex_enter(&vp->v_lock);
3045 3060          if (vp->v_path != NULL) {
3046 3061                  mutex_exit(&vp->v_lock);
3047 3062                  kmem_free(buf, len + 1);
3048 3063                  return;
3049 3064          }
3050 3065  
3051 3066          vp->v_path = buf;
3052 3067          bcopy(str, vp->v_path, len);
3053 3068          vp->v_path[len] = '\0';
3054 3069  
3055 3070          mutex_exit(&vp->v_lock);
3056 3071  }
3057 3072  
3058 3073  /*
3059 3074   * Called from within filesystem's vop_rename() to handle renames once the
3060 3075   * target vnode is available.
3061 3076   */
3062 3077  void
3063 3078  vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3064 3079  {
3065 3080          char *tmp;
3066 3081  
3067 3082          mutex_enter(&vp->v_lock);
3068 3083          tmp = vp->v_path;
3069 3084          vp->v_path = NULL;
3070 3085          mutex_exit(&vp->v_lock);
3071 3086          vn_setpath(rootdir, dvp, vp, nm, len);
3072 3087          if (tmp != NULL)
3073 3088                  kmem_free(tmp, strlen(tmp) + 1);
3074 3089  }
3075 3090  
/*
 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to the be the same as the source vnode.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	int alloc;

	mutex_enter(&src->v_lock);
	if (src->v_path == NULL) {
		/* Source has no cached path; nothing to copy. */
		mutex_exit(&src->v_lock);
		return;
	}
	alloc = strlen(src->v_path) + 1;

	/* avoid kmem_alloc() with lock held */
	mutex_exit(&src->v_lock);
	buf = kmem_alloc(alloc, KM_SLEEP);
	/*
	 * Re-validate after the lock was dropped: if the source path went
	 * away or changed length, our allocation is mis-sized -- give up.
	 */
	mutex_enter(&src->v_lock);
	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
		mutex_exit(&src->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	bcopy(src->v_path, buf, alloc);
	mutex_exit(&src->v_lock);

	/* Install only if the destination doesn't already have a path. */
	mutex_enter(&dst->v_lock);
	if (dst->v_path != NULL) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	dst->v_path = buf;
	mutex_exit(&dst->v_lock);
}
3114 3129  
3115 3130  /*
3116 3131   * XXX Private interface for segvn routines that handle vnode
3117 3132   * large page segments.
3118 3133   *
3119 3134   * return 1 if vp's file system VOP_PAGEIO() implementation
3120 3135   * can be safely used instead of VOP_GETPAGE() for handling
3121 3136   * pagefaults against regular non swap files. VOP_PAGEIO()
3122 3137   * interface is considered safe here if its implementation
3123 3138   * is very close to VOP_GETPAGE() implementation.
3124 3139   * e.g. It zero's out the part of the page beyond EOF. Doesn't
3125 3140   * panic if there're file holes but instead returns an error.
3126 3141   * Doesn't assume file won't be changed by user writes, etc.
3127 3142   *
3128 3143   * return 0 otherwise.
3129 3144   *
3130 3145   * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3131 3146   */
3132 3147  int
3133 3148  vn_vmpss_usepageio(vnode_t *vp)
3134 3149  {
3135 3150          vfs_t   *vfsp = vp->v_vfsp;
3136 3151          char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
3137 3152          char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
3138 3153          char **fsok = pageio_ok_fss;
3139 3154  
3140 3155          if (fsname == NULL) {
3141 3156                  return (0);
3142 3157          }
3143 3158  
3144 3159          for (; *fsok; fsok++) {
3145 3160                  if (strcmp(*fsok, fsname) == 0) {
3146 3161                          return (1);
3147 3162                  }
3148 3163          }
3149 3164          return (0);
3150 3165  }
3151 3166  
3152 3167  /* VOP_XXX() macros call the corresponding fop_xxx() function */
3153 3168  
/*
 * Wrapper for VOP_OPEN: maintains the per-vnode reader/writer open counts
 * around the file system's open routine.  The counts are bumped *before*
 * the call and rolled back on failure (see the comment below for why).
 */
int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		/* Roll back the counts bumped before the call. */
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp && *vpp != NULL) {
			vn_copypath(vp, *vpp);
			/* Move the open counts from vp to the new vnode. */
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}
3223 3238  
3224 3239  int
3225 3240  fop_close(
3226 3241          vnode_t *vp,
3227 3242          int flag,
3228 3243          int count,
3229 3244          offset_t offset,
3230 3245          cred_t *cr,
3231 3246          caller_context_t *ct)
3232 3247  {
3233 3248          int err;
3234 3249  
3235 3250          VOPXID_MAP_CR(vp, cr);
3236 3251  
3237 3252          err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
3238 3253          VOPSTATS_UPDATE(vp, close);
3239 3254          /*
3240 3255           * Check passed in count to handle possible dups. Vnode counts are only
3241 3256           * kept on regular files
3242 3257           */
3243 3258          if ((vp->v_type == VREG) && (count == 1))  {
3244 3259                  if (flag & FREAD) {
3245 3260                          ASSERT(vp->v_rdcnt > 0);
3246 3261                          atomic_dec_32(&vp->v_rdcnt);
3247 3262                  }
3248 3263                  if (flag & FWRITE) {
3249 3264                          ASSERT(vp->v_wrcnt > 0);
3250 3265                          atomic_dec_32(&vp->v_wrcnt);
3251 3266                  }
3252 3267          }
3253 3268          return (err);
  
    | 
      ↓ open down ↓ | 
    619 lines elided | 
    
      ↑ open up ↑ | 
  
3254 3269  }
3255 3270  
/*
 * Wrapper for VOP_READ: in addition to the per-fstype vopstats, charges
 * the read against the calling zone's VFS kstats when the underlying
 * file system participates (VFS_STATS) and the vnode is a file,
 * directory, or block device.
 */
int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int err;

	/*
	 * Reads are tracked on the zone's VFS "run queue"; writes use the
	 * wait queue (see fop_write()).  start remains 0 when the op is
	 * not accounted, which gates the bookkeeping below.
	 */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_runq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	/* Bytes actually transferred: consumed residual of the uio. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);

	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.reads++;
		zonep->zone_vfs_rwstats.nread += len;
		kstat_runq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * The latency buckets are cumulative: an operation slower
		 * than one second also counts in the 100ms and 10ms buckets.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3274 3328  
/*
 * Wrapper for VOP_WRITE: in addition to the per-fstype vopstats, charges
 * the write against the calling zone's VFS kstats when the underlying
 * file system participates (VFS_STATS) and the vnode is a file,
 * directory, or block device.
 */
int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	ssize_t resid_start = uiop->uio_resid;
	zone_t	*zonep = curzone;
	zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;

	hrtime_t start = 0, lat;
	ssize_t len;
	int	err;

	/*
	 * For the purposes of VFS kstat consumers, the "waitq" calculation is
	 * repurposed as the active queue for VFS write operations.  There's no
	 * actual wait queue for VFS operations.
	 */
	if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
	    vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
		start = gethrtime();

		mutex_enter(&zonep->zone_vfs_lock);
		kstat_waitq_enter(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
	/* Bytes actually transferred: consumed residual of the uio. */
	len = resid_start - uiop->uio_resid;

	VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);

	/* start != 0 iff this op was entered on the zone's queue above. */
	if (start != 0) {
		mutex_enter(&zonep->zone_vfs_lock);
		zonep->zone_vfs_rwstats.writes++;
		zonep->zone_vfs_rwstats.nwritten += len;
		kstat_waitq_exit(&zonep->zone_vfs_rwstats);
		mutex_exit(&zonep->zone_vfs_lock);

		lat = gethrtime() - start;

		/*
		 * The latency buckets are cumulative: an operation slower
		 * than one second also counts in the 100ms and 10ms buckets.
		 */
		if (lat >= VOP_LATENCY_10MS) {
			if (lat < VOP_LATENCY_100MS)
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
			else if (lat < VOP_LATENCY_1S) {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
			} else {
				atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
				atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
			}
		}
	}

	return (err);
}
3293 3391  
3294 3392  int
3295 3393  fop_ioctl(
3296 3394          vnode_t *vp,
3297 3395          int cmd,
3298 3396          intptr_t arg,
3299 3397          int flag,
3300 3398          cred_t *cr,
3301 3399          int *rvalp,
3302 3400          caller_context_t *ct)
3303 3401  {
3304 3402          int     err;
3305 3403  
3306 3404          VOPXID_MAP_CR(vp, cr);
3307 3405  
3308 3406          err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3309 3407          VOPSTATS_UPDATE(vp, ioctl);
3310 3408          return (err);
3311 3409  }
3312 3410  
3313 3411  int
3314 3412  fop_setfl(
3315 3413          vnode_t *vp,
3316 3414          int oflags,
3317 3415          int nflags,
3318 3416          cred_t *cr,
3319 3417          caller_context_t *ct)
3320 3418  {
3321 3419          int     err;
3322 3420  
3323 3421          VOPXID_MAP_CR(vp, cr);
3324 3422  
3325 3423          err = (*(vp)->v_op->vop_setfl)(vp, oflags, nflags, cr, ct);
3326 3424          VOPSTATS_UPDATE(vp, setfl);
3327 3425          return (err);
3328 3426  }
3329 3427  
/*
 * Wrapper for VOP_GETATTR: validates/adjusts the attribute request for
 * file systems lacking xvattr or ACE-mask support before dispatching.
 */
int
fop_getattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}
3362 3460  
/*
 * Wrapper for VOP_SETATTR: validates/adjusts the attribute request for
 * file systems lacking xvattr or ACE-mask support before dispatching.
 */
int
fop_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}
3395 3493  
3396 3494  int
3397 3495  fop_access(
3398 3496          vnode_t *vp,
3399 3497          int mode,
3400 3498          int flags,
3401 3499          cred_t *cr,
3402 3500          caller_context_t *ct)
3403 3501  {
3404 3502          int     err;
3405 3503  
3406 3504          if ((flags & V_ACE_MASK) &&
3407 3505              vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
3408 3506                  return (EINVAL);
3409 3507          }
3410 3508  
3411 3509          VOPXID_MAP_CR(vp, cr);
3412 3510  
3413 3511          err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
3414 3512          VOPSTATS_UPDATE(vp, access);
3415 3513          return (err);
3416 3514  }
3417 3515  
/*
 * Wrapper for VOP_LOOKUP: validates case-insensitivity support, routes
 * extended-attribute lookups through the generic sysattr directory, and
 * populates the v_path cache on a successful lookup.
 */
int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/*
	 * xattr lookups go through the generic handler unless the file
	 * system supplies its own system-attribute directory.
	 */
	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = (*(dvp)->v_op->vop_lookup)
		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
	}
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		/* First sight of this vnode: seed the path cache. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
		}
	}

	return (ret);
}
3461 3559  
/*
 * Wrapper for VOP_CREATE: validates ACL-on-create and case-insensitivity
 * support, dispatches, and populates the v_path cache on success.
 */
int
fop_create(
	vnode_t *dvp,
	char *name,
	vattr_t *vap,
	vcexcl_t excl,
	int mode,
	vnode_t **vpp,
	cred_t *cr,
	int flags,
	caller_context_t *ct,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	/* An initial ACL requires ACL-on-create support. */
	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_create)
	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		/* New vnode: seed the path cache. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
		}
	}

	return (ret);
}
3503 3601  
3504 3602  int
3505 3603  fop_remove(
3506 3604          vnode_t *dvp,
3507 3605          char *nm,
3508 3606          cred_t *cr,
3509 3607          caller_context_t *ct,
3510 3608          int flags)
3511 3609  {
3512 3610          int     err;
3513 3611  
3514 3612          /*
3515 3613           * If this file system doesn't support case-insensitive access
3516 3614           * and said access is requested, fail quickly.
3517 3615           */
3518 3616          if (flags & FIGNORECASE &&
3519 3617              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3520 3618              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3521 3619                  return (EINVAL);
3522 3620  
3523 3621          VOPXID_MAP_CR(dvp, cr);
3524 3622  
3525 3623          err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
3526 3624          VOPSTATS_UPDATE(dvp, remove);
3527 3625          return (err);
3528 3626  }
3529 3627  
3530 3628  int
3531 3629  fop_link(
3532 3630          vnode_t *tdvp,
3533 3631          vnode_t *svp,
3534 3632          char *tnm,
3535 3633          cred_t *cr,
3536 3634          caller_context_t *ct,
3537 3635          int flags)
3538 3636  {
3539 3637          int     err;
3540 3638  
3541 3639          /*
3542 3640           * If the target file system doesn't support case-insensitive access
3543 3641           * and said access is requested, fail quickly.
3544 3642           */
3545 3643          if (flags & FIGNORECASE &&
3546 3644              (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3547 3645              vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3548 3646                  return (EINVAL);
3549 3647  
3550 3648          VOPXID_MAP_CR(tdvp, cr);
3551 3649  
3552 3650          err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
3553 3651          VOPSTATS_UPDATE(tdvp, link);
3554 3652          return (err);
3555 3653  }
3556 3654  
3557 3655  int
3558 3656  fop_rename(
3559 3657          vnode_t *sdvp,
3560 3658          char *snm,
3561 3659          vnode_t *tdvp,
3562 3660          char *tnm,
3563 3661          cred_t *cr,
3564 3662          caller_context_t *ct,
3565 3663          int flags)
3566 3664  {
3567 3665          int     err;
3568 3666  
3569 3667          /*
3570 3668           * If the file system involved does not support
3571 3669           * case-insensitive access and said access is requested, fail
3572 3670           * quickly.
3573 3671           */
3574 3672          if (flags & FIGNORECASE &&
3575 3673              ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3576 3674              vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
3577 3675                  return (EINVAL);
3578 3676  
3579 3677          VOPXID_MAP_CR(tdvp, cr);
3580 3678  
3581 3679          err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
3582 3680          VOPSTATS_UPDATE(sdvp, rename);
3583 3681          return (err);
3584 3682  }
3585 3683  
/*
 * Wrapper for VOP_MKDIR: validates ACL-on-create and case-insensitivity
 * support, dispatches, and populates the v_path cache on success.
 */
int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	/* An initial ACL requires ACL-on-create support. */
	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_mkdir)
	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		/* New directory vnode: seed the path cache. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, dirname,
			    strlen(dirname));
		}
	}

	return (ret);
}
3626 3724  
3627 3725  int
3628 3726  fop_rmdir(
3629 3727          vnode_t *dvp,
3630 3728          char *nm,
3631 3729          vnode_t *cdir,
3632 3730          cred_t *cr,
3633 3731          caller_context_t *ct,
3634 3732          int flags)
3635 3733  {
3636 3734          int     err;
3637 3735  
3638 3736          /*
3639 3737           * If this file system doesn't support case-insensitive access
3640 3738           * and said access is requested, fail quickly.
3641 3739           */
3642 3740          if (flags & FIGNORECASE &&
3643 3741              (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3644 3742              vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3645 3743                  return (EINVAL);
3646 3744  
3647 3745          VOPXID_MAP_CR(dvp, cr);
3648 3746  
3649 3747          err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
3650 3748          VOPSTATS_UPDATE(dvp, rmdir);
3651 3749          return (err);
3652 3750  }
3653 3751  
3654 3752  int
3655 3753  fop_readdir(
3656 3754          vnode_t *vp,
3657 3755          uio_t *uiop,
3658 3756          cred_t *cr,
3659 3757          int *eofp,
3660 3758          caller_context_t *ct,
3661 3759          int flags)
3662 3760  {
3663 3761          int     err;
3664 3762          ssize_t resid_start = uiop->uio_resid;
3665 3763  
3666 3764          /*
3667 3765           * If this file system doesn't support retrieving directory
3668 3766           * entry flags and said access is requested, fail quickly.
3669 3767           */
3670 3768          if (flags & V_RDDIR_ENTFLAGS &&
3671 3769              vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
3672 3770                  return (EINVAL);
3673 3771  
3674 3772          VOPXID_MAP_CR(vp, cr);
3675 3773  
3676 3774          err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
3677 3775          VOPSTATS_UPDATE_IO(vp, readdir,
3678 3776              readdir_bytes, (resid_start - uiop->uio_resid));
3679 3777          return (err);
3680 3778  }
3681 3779  
/*
 * VOP_SYMLINK wrapper: create symlink "linkname" in dvp whose contents
 * are "target".  Unsupported case-insensitive requests fail fast; if the
 * target string looks like reparse-point data on a filesystem that
 * supports reparse points, the vattr is swapped for an xvattr carrying
 * the XAT_REPARSE flag before dispatching.
 */
int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;	/* used only for the reparse-point case */

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		/*
		 * Target starts with the reparse tag: validate it and, on
		 * success, use the xvattr (which requests XAT_REPARSE)
		 * in place of the caller's vattr.
		 */
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}
3719 3817  
3720 3818  int
3721 3819  fop_readlink(
3722 3820          vnode_t *vp,
3723 3821          uio_t *uiop,
3724 3822          cred_t *cr,
3725 3823          caller_context_t *ct)
3726 3824  {
3727 3825          int     err;
3728 3826  
3729 3827          VOPXID_MAP_CR(vp, cr);
3730 3828  
3731 3829          err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
3732 3830          VOPSTATS_UPDATE(vp, readlink);
3733 3831          return (err);
3734 3832  }
3735 3833  
3736 3834  int
3737 3835  fop_fsync(
3738 3836          vnode_t *vp,
3739 3837          int syncflag,
3740 3838          cred_t *cr,
3741 3839          caller_context_t *ct)
3742 3840  {
3743 3841          int     err;
3744 3842  
3745 3843          VOPXID_MAP_CR(vp, cr);
3746 3844  
3747 3845          err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
3748 3846          VOPSTATS_UPDATE(vp, fsync);
3749 3847          return (err);
3750 3848  }
3751 3849  
/*
 * VOP_INACTIVE wrapper: called when the last reference to vp is
 * released.  The filesystem may free the vnode during the call, so the
 * vopstat must be recorded before dispatching.
 */
void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}
3765 3863  
3766 3864  int
3767 3865  fop_fid(
3768 3866          vnode_t *vp,
3769 3867          fid_t *fidp,
3770 3868          caller_context_t *ct)
3771 3869  {
3772 3870          int     err;
3773 3871  
3774 3872          err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
3775 3873          VOPSTATS_UPDATE(vp, fid);
3776 3874          return (err);
3777 3875  }
3778 3876  
3779 3877  int
3780 3878  fop_rwlock(
3781 3879          vnode_t *vp,
3782 3880          int write_lock,
3783 3881          caller_context_t *ct)
3784 3882  {
3785 3883          int     ret;
3786 3884  
3787 3885          ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
3788 3886          VOPSTATS_UPDATE(vp, rwlock);
3789 3887          return (ret);
3790 3888  }
3791 3889  
3792 3890  void
3793 3891  fop_rwunlock(
3794 3892          vnode_t *vp,
3795 3893          int write_lock,
3796 3894          caller_context_t *ct)
3797 3895  {
3798 3896          (*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
3799 3897          VOPSTATS_UPDATE(vp, rwunlock);
3800 3898  }
3801 3899  
3802 3900  int
3803 3901  fop_seek(
3804 3902          vnode_t *vp,
3805 3903          offset_t ooff,
3806 3904          offset_t *noffp,
3807 3905          caller_context_t *ct)
3808 3906  {
3809 3907          int     err;
3810 3908  
3811 3909          err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
3812 3910          VOPSTATS_UPDATE(vp, seek);
3813 3911          return (err);
3814 3912  }
3815 3913  
3816 3914  int
3817 3915  fop_cmp(
3818 3916          vnode_t *vp1,
3819 3917          vnode_t *vp2,
3820 3918          caller_context_t *ct)
3821 3919  {
3822 3920          int     err;
3823 3921  
3824 3922          err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
3825 3923          VOPSTATS_UPDATE(vp1, cmp);
3826 3924          return (err);
3827 3925  }
3828 3926  
3829 3927  int
3830 3928  fop_frlock(
3831 3929          vnode_t *vp,
3832 3930          int cmd,
3833 3931          flock64_t *bfp,
3834 3932          int flag,
3835 3933          offset_t offset,
3836 3934          struct flk_callback *flk_cbp,
3837 3935          cred_t *cr,
3838 3936          caller_context_t *ct)
3839 3937  {
3840 3938          int     err;
3841 3939  
3842 3940          VOPXID_MAP_CR(vp, cr);
3843 3941  
3844 3942          err = (*(vp)->v_op->vop_frlock)
3845 3943              (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
3846 3944          VOPSTATS_UPDATE(vp, frlock);
3847 3945          return (err);
3848 3946  }
3849 3947  
3850 3948  int
3851 3949  fop_space(
3852 3950          vnode_t *vp,
3853 3951          int cmd,
3854 3952          flock64_t *bfp,
3855 3953          int flag,
3856 3954          offset_t offset,
3857 3955          cred_t *cr,
3858 3956          caller_context_t *ct)
3859 3957  {
3860 3958          int     err;
3861 3959  
3862 3960          VOPXID_MAP_CR(vp, cr);
3863 3961  
3864 3962          err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
3865 3963          VOPSTATS_UPDATE(vp, space);
3866 3964          return (err);
3867 3965  }
3868 3966  
3869 3967  int
3870 3968  fop_realvp(
3871 3969          vnode_t *vp,
3872 3970          vnode_t **vpp,
3873 3971          caller_context_t *ct)
3874 3972  {
3875 3973          int     err;
3876 3974  
3877 3975          err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
3878 3976          VOPSTATS_UPDATE(vp, realvp);
3879 3977          return (err);
3880 3978  }
3881 3979  
3882 3980  int
3883 3981  fop_getpage(
3884 3982          vnode_t *vp,
3885 3983          offset_t off,
3886 3984          size_t len,
3887 3985          uint_t *protp,
3888 3986          page_t **plarr,
3889 3987          size_t plsz,
3890 3988          struct seg *seg,
3891 3989          caddr_t addr,
3892 3990          enum seg_rw rw,
3893 3991          cred_t *cr,
3894 3992          caller_context_t *ct)
3895 3993  {
3896 3994          int     err;
3897 3995  
3898 3996          VOPXID_MAP_CR(vp, cr);
3899 3997  
3900 3998          err = (*(vp)->v_op->vop_getpage)
3901 3999              (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
3902 4000          VOPSTATS_UPDATE(vp, getpage);
3903 4001          return (err);
3904 4002  }
3905 4003  
3906 4004  int
3907 4005  fop_putpage(
3908 4006          vnode_t *vp,
3909 4007          offset_t off,
3910 4008          size_t len,
3911 4009          int flags,
3912 4010          cred_t *cr,
3913 4011          caller_context_t *ct)
3914 4012  {
3915 4013          int     err;
3916 4014  
3917 4015          VOPXID_MAP_CR(vp, cr);
3918 4016  
3919 4017          err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
3920 4018          VOPSTATS_UPDATE(vp, putpage);
3921 4019          return (err);
3922 4020  }
3923 4021  
3924 4022  int
3925 4023  fop_map(
3926 4024          vnode_t *vp,
3927 4025          offset_t off,
3928 4026          struct as *as,
3929 4027          caddr_t *addrp,
3930 4028          size_t len,
3931 4029          uchar_t prot,
3932 4030          uchar_t maxprot,
3933 4031          uint_t flags,
3934 4032          cred_t *cr,
3935 4033          caller_context_t *ct)
3936 4034  {
3937 4035          int     err;
3938 4036  
3939 4037          VOPXID_MAP_CR(vp, cr);
3940 4038  
3941 4039          err = (*(vp)->v_op->vop_map)
3942 4040              (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
3943 4041          VOPSTATS_UPDATE(vp, map);
3944 4042          return (err);
3945 4043  }
3946 4044  
/*
 * VOP_ADDMAP wrapper: notify the filesystem that a mapping of vp has
 * been added to address space "as".  On success, for regular files, the
 * number of mapped pages is accounted in vp->v_mmap_read /
 * vp->v_mmap_write so the system can tell how a file is mapped.
 */
int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages in the mapping, rounded up */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			/* PROT_EXEC mappings are also counted as reads */
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}
3996 4094  
/*
 * VOP_DELMAP wrapper: notify the filesystem that a mapping of vp has
 * been removed from address space "as", and (for regular files) undo
 * the page accounting performed by fop_addmap().
 */
int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;	/* pages in the mapping, rounded up */

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		/* MAP_PRIVATE was accounted as read in fop_addmap() */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}
4051 4149  
4052 4150  
4053 4151  int
4054 4152  fop_poll(
4055 4153          vnode_t *vp,
4056 4154          short events,
4057 4155          int anyyet,
4058 4156          short *reventsp,
4059 4157          struct pollhead **phpp,
4060 4158          caller_context_t *ct)
4061 4159  {
4062 4160          int     err;
4063 4161  
4064 4162          err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
4065 4163          VOPSTATS_UPDATE(vp, poll);
4066 4164          return (err);
4067 4165  }
4068 4166  
/*
 * VOP_DUMP wrapper: write crash-dump data from addr to vp starting at
 * logical block lbdn for dblks blocks.  The offset_t arguments are
 * range-checked against the narrower types that bdev_dump ultimately
 * takes before dispatching.
 */
int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int	err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
	VOPSTATS_UPDATE(vp, dump);
	return (err);
}
4087 4185  
4088 4186  int
4089 4187  fop_pathconf(
4090 4188          vnode_t *vp,
4091 4189          int cmd,
4092 4190          ulong_t *valp,
4093 4191          cred_t *cr,
4094 4192          caller_context_t *ct)
4095 4193  {
4096 4194          int     err;
4097 4195  
4098 4196          VOPXID_MAP_CR(vp, cr);
4099 4197  
4100 4198          err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
4101 4199          VOPSTATS_UPDATE(vp, pathconf);
4102 4200          return (err);
4103 4201  }
4104 4202  
4105 4203  int
4106 4204  fop_pageio(
4107 4205          vnode_t *vp,
4108 4206          struct page *pp,
4109 4207          u_offset_t io_off,
4110 4208          size_t io_len,
4111 4209          int flags,
4112 4210          cred_t *cr,
4113 4211          caller_context_t *ct)
4114 4212  {
4115 4213          int     err;
4116 4214  
4117 4215          VOPXID_MAP_CR(vp, cr);
4118 4216  
4119 4217          err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
4120 4218          VOPSTATS_UPDATE(vp, pageio);
4121 4219          return (err);
4122 4220  }
4123 4221  
4124 4222  int
4125 4223  fop_dumpctl(
4126 4224          vnode_t *vp,
4127 4225          int action,
4128 4226          offset_t *blkp,
4129 4227          caller_context_t *ct)
4130 4228  {
4131 4229          int     err;
4132 4230          err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
4133 4231          VOPSTATS_UPDATE(vp, dumpctl);
4134 4232          return (err);
4135 4233  }
4136 4234  
/*
 * VOP_DISPOSE wrapper: free or invalidate page pp belonging to vp.
 * The filesystem may drop the vnode during the call, so the vopstat
 * must be recorded before dispatching.
 */
void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}
4153 4251  
/*
 * VOP_SETSECATTR wrapper: set security attributes (ACLs) on vp.
 * ATTR_NOACLCHECK may only be honored on filesystems that used an ACE
 * mask with VOP_ACCESS(); otherwise the request is rejected.
 */
int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}
4178 4276  
/*
 * VOP_GETSECATTR wrapper: retrieve security attributes (ACLs) from vp.
 * As with fop_setsecattr(), ATTR_NOACLCHECK is only valid on
 * filesystems that use an ACE mask with VOP_ACCESS().
 */
int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}
4204 4302  
4205 4303  int
4206 4304  fop_shrlock(
4207 4305          vnode_t *vp,
4208 4306          int cmd,
4209 4307          struct shrlock *shr,
4210 4308          int flag,
4211 4309          cred_t *cr,
4212 4310          caller_context_t *ct)
4213 4311  {
4214 4312          int     err;
4215 4313  
4216 4314          VOPXID_MAP_CR(vp, cr);
4217 4315  
4218 4316          err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
4219 4317          VOPSTATS_UPDATE(vp, shrlock);
4220 4318          return (err);
4221 4319  }
4222 4320  
4223 4321  int
4224 4322  fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
4225 4323      caller_context_t *ct)
4226 4324  {
4227 4325          int     err;
4228 4326  
4229 4327          err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
4230 4328          VOPSTATS_UPDATE(vp, vnevent);
4231 4329          return (err);
4232 4330  }
4233 4331  
4234 4332  int
4235 4333  fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
4236 4334      caller_context_t *ct)
4237 4335  {
4238 4336          int err;
4239 4337  
4240 4338          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4241 4339                  return (ENOTSUP);
4242 4340          err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
4243 4341          VOPSTATS_UPDATE(vp, reqzcbuf);
4244 4342          return (err);
4245 4343  }
4246 4344  
4247 4345  int
4248 4346  fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
4249 4347  {
4250 4348          int err;
4251 4349  
4252 4350          if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
4253 4351                  return (ENOTSUP);
4254 4352          err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
4255 4353          VOPSTATS_UPDATE(vp, retzcbuf);
4256 4354          return (err);
4257 4355  }
4258 4356  
/*
 * Default destructor for VSD keys.
 *	Needed because a NULL destructor slot means that the key is
 *	unused; keys created without a destructor get this no-op instead.
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}
4267 4365  
/*
 * Create a key (index into per vnode array)
 *	Locks out vsd_create, vsd_destroy, and vsd_free
 *	May allocate memory with lock held
 *
 * On return *keyp holds the 1-based key; a *keyp of 0 means "not yet
 * allocated", which is why keys are biased by one.  Calling with an
 * already-allocated key is a no-op.
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int	i;
	uint_t	nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	/* a NULL destructor slot marks an unused key */
	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 * (doubling each time; KM_SLEEP allocation under vsd_lock is
	 * accepted here)
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;	/* keys are 1-based; 0 means "unallocated" */

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}
4325 4423  
/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 *
 * Runs the key's destructor on the stored value of every vnode that
 * has VSD for this key, then releases the key slot for reuse.
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;	/* mark the caller's key as unallocated */

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;	/* 0-based slot index */
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}
4381 4479  
4382 4480  /*
4383 4481   * Quickly return the per vnode value that was stored with the specified key
4384 4482   * Assumes the caller is protecting key from vsd_create and vsd_destroy
4385 4483   * Assumes the caller is holding v_vsd_lock to protect the vsd.
4386 4484   */
4387 4485  void *
4388 4486  vsd_get(vnode_t *vp, uint_t key)
4389 4487  {
4390 4488          struct vsd_node *vsd;
4391 4489  
4392 4490          ASSERT(vp != NULL);
4393 4491          ASSERT(mutex_owned(&vp->v_vsd_lock));
4394 4492  
4395 4493          vsd = vp->v_vsd;
4396 4494  
4397 4495          if (key && vsd != NULL && key <= vsd->vs_nkeys)
4398 4496                  return (vsd->vs_value[key - 1]);
4399 4497          return (NULL);
4400 4498  }
4401 4499  
/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 *
 * Allocates the per-vnode vsd_node and grows its vs_value array on
 * demand; a vnode is linked onto the global vsd_list the first time it
 * stores any value.  Returns 0 on success, EINVAL for key 0.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 * (grow vs_value just enough to cover the requested key)
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}
4455 4553  
/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 *
 * Frees all per-vnode storage and clears vp->v_vsd.  A vsd_node with
 * no keys was never linked onto vsd_list, so it is freed without
 * taking vsd_lock.
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	/* never stored a value: not on vsd_list, nothing to destruct */
	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}
4502 4600  
4503 4601  /*
4504 4602   * realloc
4505 4603   */
4506 4604  static void *
4507 4605  vsd_realloc(void *old, size_t osize, size_t nsize)
4508 4606  {
4509 4607          void *new;
4510 4608  
4511 4609          new = kmem_zalloc(nsize, KM_SLEEP);
4512 4610          if (old) {
4513 4611                  bcopy(old, new, osize);
4514 4612                  kmem_free(old, osize);
4515 4613          }
4516 4614          return (new);
4517 4615  }
4518 4616  
4519 4617  /*
4520 4618   * Setup the extensible system attribute for creating a reparse point.
4521 4619   * The symlink data 'target' is validated for proper format of a reparse
4522 4620   * string and a check also made to make sure the symlink data does not
4523 4621   * point to an existing file.
4524 4622   *
4525 4623   * return 0 if ok else -1.
4526 4624   */
4527 4625  static int
4528 4626  fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
4529 4627  {
4530 4628          xoptattr_t *xoap;
4531 4629  
4532 4630          if ((!target) || (!vap) || (!xvattr))
4533 4631                  return (-1);
4534 4632  
4535 4633          /* validate reparse string */
4536 4634          if (reparse_validate((const char *)target))
4537 4635                  return (-1);
4538 4636  
4539 4637          xva_init(xvattr);
4540 4638          xvattr->xva_vattr = *vap;
4541 4639          xvattr->xva_vattr.va_mask |= AT_XVATTR;
4542 4640          xoap = xva_getxoptattr(xvattr);
4543 4641          ASSERT(xoap);
4544 4642          XVA_SET_REQ(xvattr, XAT_REPARSE);
4545 4643          xoap->xoa_reparse = 1;
4546 4644  
4547 4645          return (0);
4548 4646  }
4549 4647  
4550 4648  /*
4551 4649   * Function to check whether a symlink is a reparse point.
4552 4650   * Return B_TRUE if it is a reparse point, else return B_FALSE
4553 4651   */
4554 4652  boolean_t
4555 4653  vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4556 4654  {
4557 4655          xvattr_t xvattr;
4558 4656          xoptattr_t *xoap;
4559 4657  
4560 4658          if ((vp->v_type != VLNK) ||
4561 4659              !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
4562 4660                  return (B_FALSE);
4563 4661  
4564 4662          xva_init(&xvattr);
4565 4663          xoap = xva_getxoptattr(&xvattr);
4566 4664          ASSERT(xoap);
4567 4665          XVA_SET_REQ(&xvattr, XAT_REPARSE);
4568 4666  
4569 4667          if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
4570 4668                  return (B_FALSE);
4571 4669  
4572 4670          if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
4573 4671              (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
4574 4672                  return (B_FALSE);
4575 4673  
4576 4674          return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
4577 4675  }
  
    | 
      ↓ open down ↓ | 
    1277 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX