4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016, Joyent, Inc.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 #include <sys/types.h>
  41 #include <sys/param.h>
  42 #include <sys/t_lock.h>
  43 #include <sys/errno.h>
  44 #include <sys/cred.h>
 
 
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/vnode.h>
  52 #include <sys/rwstlock.h>
  53 #include <sys/fem.h>
  54 #include <sys/stat.h>
  55 #include <sys/mode.h>
  56 #include <sys/conf.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/systm.h>
  60 #include <sys/kmem.h>
  61 #include <sys/debug.h>
  62 #include <c2/audit.h>
  63 #include <sys/acl.h>
  64 #include <sys/nbmlock.h>
  65 #include <sys/fcntl.h>
  66 #include <fs/fs_subr.h>
  67 #include <sys/taskq.h>
  68 #include <fs/fs_reparse.h>
  69 #include <sys/time.h>
  70 #include <sys/sdt.h>
  71 
  72 /* Determine if this vnode is a file that is read-only */
  73 #define ISROFILE(vp)    \
  74         ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  75             (vp)->v_type != VFIFO && vn_is_readonly(vp))
  76 
  77 /* Tunable via /etc/system; used only by admin/install */
  78 int nfs_global_client_only;
  79 
  80 /*
  81  * Array of vopstats_t pointers for per-FS-type vopstats.  This array has the
  82  * same number of entries as and parallel to the vfssw table.  (Arguably, it
  83  * could be part of the vfssw table.)  Once it's initialized, it's accessed
  84  * using the same fstype index that is used to index into the vfssw table.
  85  */
  86 vopstats_t **vopstats_fstype;
  87 
  88 /* vopstats initialization template used for fast initialization via bcopy() */
  89 static vopstats_t *vs_templatep;
  90 
  91 /* Kmem cache handle for vsk_anchor_t allocations */
  92 kmem_cache_t *vsk_anchor_cache;
  93 
  94 /*
      * File events cleanup routine; vn_recycle() and vn_free() call it when
      * a vnode's v_fopdata is non-NULL.
      */
  95 extern void free_fopdata(vnode_t *);
  96 
  97 /*
  98  * Root of AVL tree for the kstats associated with vopstats.
  99  * vskstat_tree_lock protects updates to vskstat_tree.
 100  */
 101 avl_tree_t      vskstat_tree;
 102 kmutex_t        vskstat_tree_lock;
 103 
 104 /*
      * Global variable which enables/disables the vopstats collection
      * (initialized to 1).
      */
 105 int vopstats_enabled = 1;
 106 
 107 /*
      * Global used for empty/invalid v_path.  It is a shared sentinel: code in
      * this file compares v_path against this pointer (not its contents) and
      * only frees v_path buffers that are not this one.
      */
 108 char *vn_vpath_empty = "";
 109 
 110 /*
 111  * Forward declarations for internal vnode specific data (vsd)
 112  */
 113 static void *vsd_realloc(void *, size_t, size_t);
 114 
 115 /*
 116  * Forward declarations for reparse point functions
 117  */
 118 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 119 
 120 /*
 121  * VSD -- VNODE SPECIFIC DATA
 122  * The v_data pointer is typically used by a file system to store a
 123  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 124  * However, there are times when additional project private data needs
 125  * to be stored separately from the data (node) pointed to by v_data.
 126  * This additional data could be stored by the file system itself or
 127  * by a completely different kernel entity.  VSD provides a way for
 128  * callers to obtain a key and store a pointer to private data associated
 129  * with a vnode.
 
 
 191                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 192                         vsp->n##counter.value.ui64++;                        \
 193                         vsp->bytecounter.value.ui64 += bytesval;     \
 194                 }                                                       \
 195         }                                                               \
 196 }
 197 
 198 /*
 199  * If the filesystem does not support XIDs (VFS_XID is clear in vfs_flag),
 200  * replace cr with the result of crgetmapped(cr).  A vnode whose v_vfsp is
 201  * NULL is left unmapped; perhaps we should also map in that case?
      *
      * The macro assigns to its cr argument and expands to a braced block that
      * declares a local named vfsp: cr must therefore be a modifiable lvalue,
      * and neither argument may itself refer to a variable called vfsp.
      */
 202 #define VOPXID_MAP_CR(vp, cr)   {                                       \
 203         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 204         if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
 205                 cr = crgetmapped(cr);                                   \
 206         }
 207 
      /*
       * Latency bucket thresholds.  They are compared against differences of
       * gethrtime() values, i.e. they are expressed in nanoseconds.
       */
 208 #define VOP_LATENCY_10MS        10000000
 209 #define VOP_LATENCY_100MS       100000000
 210 #define VOP_LATENCY_1S          1000000000
 211 #define VOP_LATENCY_10S         10000000000
 212 
 213 /*
 214  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 215  * numerical order of S_IFMT and vnode types.)
 216  */
 217 enum vtype iftovt_tab[] = {
 218         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 219         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 220 };
 221 
 222 ushort_t vttoif_tab[] = {
 223         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 224         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 225 };
 226 
 227 /*
 228  * The system vnode cache: the kmem cache that vn_free() returns vnodes to
      * and that vn_destroy_cache() destroys.
 229  */
 230 
 231 kmem_cache_t *vn_cache;
 
2277         kmem_free(vnops, sizeof (vnodeops_t));
2278 }
2279 
2280 /*
2281  * Vnode cache.
2282  */
2283 
2284 /* ARGSUSED */
2285 static int
2286 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2287 {
2288         struct vnode *vp;
2289 
2290         vp = buf;
2291 
2292         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2293         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2294         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2295         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2296         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2297         vp->v_path = vn_vpath_empty;
2298         vp->v_path_stamp = 0;
2299         vp->v_mpssdata = NULL;
2300         vp->v_vsd = NULL;
2301         vp->v_fopdata = NULL;
2302 
2303         return (0);
2304 }
2305 
2306 /* ARGSUSED */
2307 static void
2308 vn_cache_destructor(void *buf, void *cdrarg)
2309 {
2310         struct vnode *vp;
2311 
2312         vp = buf;
2313 
2314         rw_destroy(&vp->v_nbllock);
2315         cv_destroy(&vp->v_cv);
2316         mutex_destroy(&vp->v_vsd_lock);
2317         mutex_destroy(&vp->v_lock);
2318 }
 
2325             P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2326         vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2327             VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2328             NULL, 0);
2329 }
2330 
/*
 * Destroy vn_cache, the kmem cache that vnodes are freed back to by
 * vn_free().
 */
2331 void
2332 vn_destroy_cache(void)
2333 {
2334         kmem_cache_destroy(vn_cache);
2335 }
2336 
2337 /*
2338  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2339  * cached by the file system and vnodes remain associated.
2340  */
2341 void
2342 vn_recycle(vnode_t *vp)
2343 {
2344         ASSERT(vp->v_pages == NULL);
2345         VERIFY(vp->v_path != NULL);
2346 
2347         /*
2348          * XXX - This really belongs in vn_reinit(), but we have some issues
2349          * with the counts.  Best to have it here for clean initialization.
2350          */
2351         vp->v_rdcnt = 0;
2352         vp->v_wrcnt = 0;
2353         vp->v_mmap_read = 0;
2354         vp->v_mmap_write = 0;
2355 
2356         /*
2357          * If FEM was in use, make sure everything gets cleaned up
2358          * NOTE: vp->v_femhead is initialized to NULL in the vnode
2359          * constructor.
2360          */
2361         if (vp->v_femhead) {
2362                 /* XXX - There should be a free_femhead() that does all this */
2363                 ASSERT(vp->v_femhead->femh_list == NULL);
2364                 mutex_destroy(&vp->v_femhead->femh_lock);
2365                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2366                 vp->v_femhead = NULL;
2367         }
2368         if (vp->v_path != vn_vpath_empty) {
2369                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2370                 vp->v_path = vn_vpath_empty;
2371         }
2372         vp->v_path_stamp = 0;
2373 
2374         if (vp->v_fopdata != NULL) {
2375                 free_fopdata(vp);
2376         }
2377         vp->v_mpssdata = NULL;
2378         vsd_free(vp);
2379 }
2380 
2381 /*
2382  * Used to reset the vnode fields including those that are directly accessible
2383  * as well as those which require an accessor function.
2384  *
2385  * Does not initialize:
2386  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2387  *      v_data (since FS-nodes and vnodes point to each other and should
2388  *              be updated simultaneously)
2389  *      v_op (in case someone needs to make a VOP call on this object)
2390  */
2391 void
2392 vn_reinit(vnode_t *vp)
 
2423                 vp->v_fopdata = NULL;
2424                 vn_reinit(vp);
2425         }
2426 
2427         return (vp);
2428 }
2429 
2430 void
2431 vn_free(vnode_t *vp)
2432 {
2433         ASSERT(vp->v_shrlocks == NULL);
2434         ASSERT(vp->v_filocks == NULL);
2435 
2436         /*
2437          * Some file systems call vn_free() with v_count of zero,
2438          * some with v_count of 1.  In any case, the value should
2439          * never be anything else.
2440          */
2441         ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2442         ASSERT(vp->v_count_dnlc == 0);
2443         VERIFY(vp->v_path != NULL);
2444         if (vp->v_path != vn_vpath_empty) {
2445                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2446                 vp->v_path = vn_vpath_empty;
2447         }
2448 
2449         /* If FEM was in use, make sure everything gets cleaned up */
2450         if (vp->v_femhead) {
2451                 /* XXX - There should be a free_femhead() that does all this */
2452                 ASSERT(vp->v_femhead->femh_list == NULL);
2453                 mutex_destroy(&vp->v_femhead->femh_lock);
2454                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2455                 vp->v_femhead = NULL;
2456         }
2457 
2458         if (vp->v_fopdata != NULL) {
2459                 free_fopdata(vp);
2460         }
2461         vp->v_mpssdata = NULL;
2462         vsd_free(vp);
2463         kmem_cache_free(vn_cache, vp);
2464 }
2465 
2466 /*
 
2960         }
2961 
2962         return ((loc != NULL) && (*loc == funcp));
2963 }
2964 
2965 /*
2966  * fs_new_caller_id() needs to return a unique ID on a given local system.
2967  * The IDs do not need to survive across reboots.  These are primarily
2968  * used so that (FEM) monitors can detect particular callers (such as
2969  * the NFS server) to a given vnode/vfs operation.
2970  */
2971 u_longlong_t
2972 fs_new_caller_id()
2973 {
2974         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2975 
2976         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2977 }
2978 
2979 /*
2980  * The value stored in v_path is relative to rootdir, located in the global
2981  * zone.  Zones or chroot environments which reside deeper inside the VFS
2982  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2983  * what lies below their perceived root.  In order to keep v_path usable for
2984  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2985  *
2986  * An upper bound of max_vnode_path is placed upon v_path allocations to
2987  * prevent the system from going too wild at the behest of pathological
2988  * behavior from the operator.
      *
      * The bound is enforced in vn_setpath_common(): when the buffer needed
      * for a new path (including its terminating NUL) would exceed
      * max_vnode_path, an SDT probe fires and the vnode's v_path is left
      * unchanged.
2989  */
2990 size_t max_vnode_path = 4 * MAXPATHLEN;
2991 
2992 
2993 void
2994 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2995 {
2996         char *buf;
2997 
2998         mutex_enter(&vp->v_lock);
2999         /*
3000          * If the snapshot of v_path_stamp passed in via compare_stamp does not
3001          * match the present value on the vnode, it indicates that subsequent
3002          * changes have occurred.  The v_path value is not cleared in this case
3003          * since the new value may be valid.
3004          */
3005         if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3006                 mutex_exit(&vp->v_lock);
3007                 return;
3008         }
3009         buf = vp->v_path;
3010         vp->v_path = vn_vpath_empty;
3011         vp->v_path_stamp = 0;
3012         mutex_exit(&vp->v_lock);
3013         if (buf != vn_vpath_empty) {
3014                 kmem_free(buf, strlen(buf) + 1);
3015         }
3016 }
3017 
3018 static void
3019 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3020     boolean_t is_rename)
3021 {
3022         char *buf, *oldbuf;
3023         hrtime_t pstamp;
3024         size_t baselen, buflen = 0;
3025 
3026         /* Handle the vn_setpath_str case. */
3027         if (pvp == NULL) {
3028                 if (len + 1 > max_vnode_path) {
3029                         DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3030                             vnode_t *, vp, char *, name, size_t, len + 1);
3031                         return;
3032                 }
3033                 buf = kmem_alloc(len + 1, KM_SLEEP);
3034                 bcopy(name, buf, len);
3035                 buf[len] = '\0';
3036 
3037                 mutex_enter(&vp->v_lock);
3038                 oldbuf = vp->v_path;
3039                 vp->v_path = buf;
3040                 vp->v_path_stamp = gethrtime();
3041                 mutex_exit(&vp->v_lock);
3042                 if (oldbuf != vn_vpath_empty) {
3043                         kmem_free(oldbuf, strlen(oldbuf) + 1);
3044                 }
3045                 return;
3046         }
3047 
3048         /* Take snapshot of parent dir */
3049         mutex_enter(&pvp->v_lock);
3050 retrybuf:
3051         if (pvp->v_path == vn_vpath_empty) {
3052                 /*
3053                  * Without v_path from the parent directory, generating a child
3054                  * path from the name is impossible.
3055                  */
3056                 if (len > 0) {
3057                         pstamp = pvp->v_path_stamp;
3058                         mutex_exit(&pvp->v_lock);
3059                         vn_clearpath(vp, pstamp);
3060                         return;
3061                 }
3062 
3063                 /*
3064                  * The only feasible case here is where a NUL lookup is being
3065                  * performed on rootdir prior to its v_path being populated.
3066                  */
3067                 ASSERT(pvp->v_path_stamp = 0);
3068                 baselen = 0;
3069                 pstamp = 0;
3070         } else {
3071                 pstamp = pvp->v_path_stamp;
3072                 baselen = strlen(pvp->v_path);
3073                 /* ignore a trailing slash if present */
3074                 if (pvp->v_path[baselen - 1] == '/') {
3075                         /* This should only the be case for rootdir */
3076                         ASSERT(baselen == 1 && pvp == rootdir);
3077                         baselen--;
3078                 }
3079         }
3080         mutex_exit(&pvp->v_lock);
3081 
3082         if (buflen != 0) {
3083                 /* Free the existing (mis-sized) buffer in case of retry */
3084                 kmem_free(buf, buflen);
3085         }
3086         /* base, '/', name and trailing NUL */
3087         buflen = baselen + len + 2;
3088         if (buflen > max_vnode_path) {
3089                 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3090                     vnode_t *, vp, char *, name, size_t, buflen);
3091                 return;
3092         }
3093         buf = kmem_alloc(buflen, KM_SLEEP);
3094 
3095         mutex_enter(&pvp->v_lock);
3096         if (pvp->v_path_stamp != pstamp) {
3097                 size_t vlen;
3098 
3099                 /*
3100                  * Since v_path_stamp changed on the parent, it is likely that
3101                  * v_path has been altered as well.  If the length does not
3102                  * exactly match what was previously measured, the buffer
3103                  * allocation must be repeated for proper sizing.
3104                  */
3105                 if (pvp->v_path == vn_vpath_empty) {
3106                         /* Give up if parent lack v_path */
3107                         mutex_exit(&pvp->v_lock);
3108                         kmem_free(buf, buflen);
3109                         return;
3110                 }
3111                 vlen = strlen(pvp->v_path);
3112                 if (pvp->v_path[vlen - 1] == '/') {
3113                         vlen--;
3114                 }
3115                 if (vlen != baselen) {
3116                         goto retrybuf;
3117                 }
3118         }
3119         bcopy(pvp->v_path, buf, baselen);
3120         mutex_exit(&pvp->v_lock);
3121 
3122         buf[baselen] = '/';
3123         baselen++;
3124         bcopy(name, &buf[baselen], len + 1);
3125 
3126         mutex_enter(&vp->v_lock);
3127         if (vp->v_path_stamp == 0) {
3128                 /* never-visited vnode can inherit stamp from parent */
3129                 ASSERT(vp->v_path == vn_vpath_empty);
3130                 vp->v_path_stamp = pstamp;
3131                 vp->v_path = buf;
3132                 mutex_exit(&vp->v_lock);
3133         } else if (vp->v_path_stamp < pstamp || is_rename) {
3134                 /*
3135                  * Install the updated path and stamp, ensuring that the v_path
3136                  * pointer is valid at all times for dtrace.
3137                  */
3138                 oldbuf = vp->v_path;
3139                 vp->v_path = buf;
3140                 vp->v_path_stamp = gethrtime();
3141                 mutex_exit(&vp->v_lock);
3142                 kmem_free(oldbuf, strlen(oldbuf) + 1);
3143         } else {
3144                 /*
3145                  * If the timestamp matches or is greater, it means another
3146                  * thread performed the update first while locks were dropped
3147                  * here to make the allocation.  We defer to the newer value.
3148                  */
3149                 mutex_exit(&vp->v_lock);
3150                 kmem_free(buf, buflen);
3151         }
3152         ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3153 }
3154 
3155 void
3156 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3157 {
3158         size_t len;
3159 
3160         /*
3161          * If the parent is older or empty, there's nothing further to do.
3162          */
3163         if (pvp->v_path == vn_vpath_empty ||
3164             pvp->v_path_stamp <= vp->v_path_stamp) {
3165                 return;
3166         }
3167 
3168         /*
3169          * Given the lack of appropriate context, meaningful updates to v_path
3170          * cannot be made for during lookups for the '.' or '..' entries.
3171          */
3172         len = strlen(name);
3173         if (len == 0 || (len == 1 && name[0] == '.') ||
3174             (len == 2 && name[0] == '.' && name[1] == '.')) {
3175                 return;
3176         }
3177 
3178         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3179 }
3180 
3181 /*
3182  * Given a parent vnode and a component name, updates the path in the target
3183  * vnode in a safe manner.  This is a plain (non-rename) call into
3184  * vn_setpath_common(): a target that already carries a path stamp keeps its
      * cached path unless that stamp is older than the parent's.
      *
      * rootvp is not used.
3185  */
3186 /* ARGSUSED */
3187 void
3188 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3189     size_t len)
3190 {
3191         vn_setpath_common(pvp, vp, name, len, B_FALSE);
3192 }
3193 
3194 /*
3195  * Sets the path to the vnode to be the given string, regardless of current
3196  * context.  The string must be a complete path from rootdir.  This is only used
3197  * by fsop_root() for setting the path based on the mountpoint.
      *
      * len is the length of str without a terminating NUL.  Passing a NULL
      * parent selects the whole-string case of vn_setpath_common(), which
      * replaces any existing path unless len + 1 exceeds max_vnode_path, in
      * which case the vnode is left unchanged.
3198  */
3199 void
3200 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3201 {
3202         vn_setpath_common(NULL, vp, str, len, B_FALSE);
3203 }
3204 
3205 /*
3206  * Called from within filesystem's vop_rename() to handle renames once the
3207  * target vnode is available.
      *
      * Passes is_rename = B_TRUE to vn_setpath_common(), so a path that already
      * carries a stamp is replaced even when that stamp is not older than the
      * parent's.
3208  */
3209 void
3210 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3211 {
3212         vn_setpath_common(pvp, vp, name, len, B_TRUE);
3213 }
3214 
3215 /*
3216  * Similar to vn_setpath_str(), this function sets the path of the destination
3217  * vnode to the be the same as the source vnode.
3218  */
3219 void
3220 vn_copypath(struct vnode *src, struct vnode *dst)
3221 {
3222         char *buf;
3223         hrtime_t stamp;
3224         size_t buflen;
3225 
3226         mutex_enter(&src->v_lock);
3227         if (src->v_path == vn_vpath_empty) {
3228                 mutex_exit(&src->v_lock);
3229                 return;
3230         }
3231         buflen = strlen(src->v_path) + 1;
3232         mutex_exit(&src->v_lock);
3233 
3234         buf = kmem_alloc(buflen, KM_SLEEP);
3235 
3236         mutex_enter(&src->v_lock);
3237         if (src->v_path == vn_vpath_empty ||
3238             strlen(src->v_path) + 1 != buflen) {
3239                 mutex_exit(&src->v_lock);
3240                 kmem_free(buf, buflen);
3241                 return;
3242         }
3243         bcopy(src->v_path, buf, buflen);
3244         stamp = src->v_path_stamp;
3245         mutex_exit(&src->v_lock);
3246 
3247         mutex_enter(&dst->v_lock);
3248         if (dst->v_path != vn_vpath_empty) {
3249                 mutex_exit(&dst->v_lock);
3250                 kmem_free(buf, buflen);
3251                 return;
3252         }
3253         dst->v_path = buf;
3254         dst->v_path_stamp = stamp;
3255         mutex_exit(&dst->v_lock);
3256 }
3257 
3258 
3259 /*
3260  * XXX Private interface for segvn routines that handle vnode
3261  * large page segments.
3262  *
3263  * return 1 if vp's file system VOP_PAGEIO() implementation
3264  * can be safely used instead of VOP_GETPAGE() for handling
3265  * pagefaults against regular non swap files. VOP_PAGEIO()
3266  * interface is considered safe here if its implementation
3267  * is very close to VOP_GETPAGE() implementation.
3268  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3269  * panic if there're file holes but instead returns an error.
3270  * Doesn't assume file won't be changed by user writes, etc.
3271  *
3272  * return 0 otherwise.
3273  *
3274  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3275  */
3276 int
3277 vn_vmpss_usepageio(vnode_t *vp)
3278 {
 
3427         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3428         len = resid_start - uiop->uio_resid;
3429 
3430         VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3431 
3432         if (start != 0) {
3433                 mutex_enter(&zonep->zone_vfs_lock);
3434                 zonep->zone_vfs_rwstats.reads++;
3435                 zonep->zone_vfs_rwstats.nread += len;
3436                 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3437                 mutex_exit(&zonep->zone_vfs_lock);
3438 
3439                 lat = gethrtime() - start;
3440 
3441                 if (lat >= VOP_LATENCY_10MS) {
3442                         if (lat < VOP_LATENCY_100MS)
3443                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3444                         else if (lat < VOP_LATENCY_1S) {
3445                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3446                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3447                         } else if (lat < VOP_LATENCY_10S) {
3448                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3449                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3450                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3451                         } else {
3452                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3453                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3454                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3455                                 atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3456                         }
3457                 }
3458         }
3459 
3460         return (err);
3461 }
3462 
3463 int
3464 fop_write(
3465         vnode_t *vp,
3466         uio_t *uiop,
3467         int ioflag,
3468         cred_t *cr,
3469         caller_context_t *ct)
3470 {
3471         ssize_t resid_start = uiop->uio_resid;
3472         zone_t  *zonep = curzone;
3473         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3474 
3475         hrtime_t start = 0, lat;
 
3495         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3496         len = resid_start - uiop->uio_resid;
3497 
3498         VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3499 
3500         if (start != 0) {
3501                 mutex_enter(&zonep->zone_vfs_lock);
3502                 zonep->zone_vfs_rwstats.writes++;
3503                 zonep->zone_vfs_rwstats.nwritten += len;
3504                 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3505                 mutex_exit(&zonep->zone_vfs_lock);
3506 
3507                 lat = gethrtime() - start;
3508 
3509                 if (lat >= VOP_LATENCY_10MS) {
3510                         if (lat < VOP_LATENCY_100MS)
3511                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3512                         else if (lat < VOP_LATENCY_1S) {
3513                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3514                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3515                         } else if (lat < VOP_LATENCY_10S) {
3516                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3517                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3518                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3519                         } else {
3520                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3521                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3522                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3523                                 atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3524                         }
3525                 }
3526         }
3527 
3528         return (err);
3529 }
3530 
3531 int
3532 fop_ioctl(
3533         vnode_t *vp,
3534         int cmd,
3535         intptr_t arg,
3536         int flag,
3537         cred_t *cr,
3538         int *rvalp,
3539         caller_context_t *ct)
3540 {
3541         int     err;
3542 
3543         VOPXID_MAP_CR(vp, cr);
 
3671          * If this file system doesn't support case-insensitive access
3672          * and said access is requested, fail quickly.  It is required
3673          * that if the vfs supports case-insensitive lookup, it also
3674          * supports extended dirent flags.
3675          */
3676         if (flags & FIGNORECASE &&
3677             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3678             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3679                 return (EINVAL);
3680 
3681         VOPXID_MAP_CR(dvp, cr);
3682 
3683         if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3684                 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3685         } else {
3686                 ret = (*(dvp)->v_op->vop_lookup)
3687                     (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3688         }
3689         if (ret == 0 && *vpp) {
3690                 VOPSTATS_UPDATE(*vpp, lookup);
3691                 vn_updatepath(dvp, *vpp, nm);
3692         }
3693 
3694         return (ret);
3695 }
3696 
3697 int
3698 fop_create(
3699         vnode_t *dvp,
3700         char *name,
3701         vattr_t *vap,
3702         vcexcl_t excl,
3703         int mode,
3704         vnode_t **vpp,
3705         cred_t *cr,
3706         int flags,
3707         caller_context_t *ct,
3708         vsecattr_t *vsecp)      /* ACL to set during create */
3709 {
3710         int ret;
3711 
3712         if (vsecp != NULL &&
3713             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3714                 return (EINVAL);
3715         }
3716         /*
3717          * If this file system doesn't support case-insensitive access
3718          * and said access is requested, fail quickly.
3719          */
3720         if (flags & FIGNORECASE &&
3721             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3722             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3723                 return (EINVAL);
3724 
3725         VOPXID_MAP_CR(dvp, cr);
3726 
3727         ret = (*(dvp)->v_op->vop_create)
3728             (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3729         if (ret == 0 && *vpp) {
3730                 VOPSTATS_UPDATE(*vpp, create);
3731                 vn_updatepath(dvp, *vpp, name);
3732         }
3733 
3734         return (ret);
3735 }
3736 
3737 int
3738 fop_remove(
3739         vnode_t *dvp,
3740         char *nm,
3741         cred_t *cr,
3742         caller_context_t *ct,
3743         int flags)
3744 {
3745         int     err;
3746 
3747         /*
3748          * If this file system doesn't support case-insensitive access
3749          * and said access is requested, fail quickly.
3750          */
3751         if (flags & FIGNORECASE &&
3752             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
 
3831 
3832         if (vsecp != NULL &&
3833             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3834                 return (EINVAL);
3835         }
3836         /*
3837          * If this file system doesn't support case-insensitive access
3838          * and said access is requested, fail quickly.
3839          */
3840         if (flags & FIGNORECASE &&
3841             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3842             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3843                 return (EINVAL);
3844 
3845         VOPXID_MAP_CR(dvp, cr);
3846 
3847         ret = (*(dvp)->v_op->vop_mkdir)
3848             (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3849         if (ret == 0 && *vpp) {
3850                 VOPSTATS_UPDATE(*vpp, mkdir);
3851                 vn_updatepath(dvp, *vpp, dirname);
3852         }
3853 
3854         return (ret);
3855 }
3856 
3857 int
3858 fop_rmdir(
3859         vnode_t *dvp,
3860         char *nm,
3861         vnode_t *cdir,
3862         cred_t *cr,
3863         caller_context_t *ct,
3864         int flags)
3865 {
3866         int     err;
3867 
3868         /*
3869          * If this file system doesn't support case-insensitive access
3870          * and said access is requested, fail quickly.
3871          */
3872         if (flags & FIGNORECASE &&
 
 | 
 
 
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 #include <sys/types.h>
  41 #include <sys/param.h>
  42 #include <sys/t_lock.h>
  43 #include <sys/errno.h>
  44 #include <sys/cred.h>
 
 
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/vnode.h>
  52 #include <sys/rwstlock.h>
  53 #include <sys/fem.h>
  54 #include <sys/stat.h>
  55 #include <sys/mode.h>
  56 #include <sys/conf.h>
  57 #include <sys/sysmacros.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/systm.h>
  60 #include <sys/kmem.h>
  61 #include <sys/debug.h>
  62 #include <c2/audit.h>
  63 #include <sys/acl.h>
  64 #include <sys/nbmlock.h>
  65 #include <sys/fcntl.h>
  66 #include <fs/fs_subr.h>
  67 #include <sys/taskq.h>
  68 #include <fs/fs_reparse.h>
  69 
  70 /* Determine if this vnode is a file that is read-only */
  71 #define ISROFILE(vp)    \
  72         ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
  73             (vp)->v_type != VFIFO && vn_is_readonly(vp))
  74 
  75 /* Tunable via /etc/system; used only by admin/install */
  76 int nfs_global_client_only;
  77 
  78 /*
  79  * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  80  * number of entries as and parallel to the vfssw table.  (Arguably, it could
  81  * be part of the vfssw table.)  Once it's initialized, it's accessed using
  82  * the same fstype index that is used to index into the vfssw table.
  83  */
  84 vopstats_t **vopstats_fstype;
  85 
  86 /* vopstats initialization template used for fast initialization via bcopy() */
  87 static vopstats_t *vs_templatep;
  88 
  89 /* Kmem cache handle for vsk_anchor_t allocations */
  90 kmem_cache_t *vsk_anchor_cache;
  91 
  92 /* file events cleanup routine */
  93 extern void free_fopdata(vnode_t *);
  94 
  95 /*
  96  * Root of AVL tree for the kstats associated with vopstats.  Lock protects
  97  * updates to vskstat_tree.
  98  */
  99 avl_tree_t      vskstat_tree;
 100 kmutex_t        vskstat_tree_lock;
 101 
 102 /* Global variable which enables/disables the vopstats collection */
 103 int vopstats_enabled = 1;
 104 
 105 /*
 106  * forward declarations for internal vnode specific data (vsd)
 107  */
 108 static void *vsd_realloc(void *, size_t, size_t);
 109 
 110 /*
 111  * forward declarations for reparse point functions
 112  */
 113 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 114 
 115 /*
 116  * VSD -- VNODE SPECIFIC DATA
 117  * The v_data pointer is typically used by a file system to store a
 118  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 119  * However, there are times when additional project private data needs
 120  * to be stored separately from the data (node) pointed to by v_data.
 121  * This additional data could be stored by the file system itself or
 122  * by a completely different kernel entity.  VSD provides a way for
 123  * callers to obtain a key and store a pointer to private data associated
 124  * with a vnode.
 
 
 186                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 187                         vsp->n##counter.value.ui64++;                        \
 188                         vsp->bytecounter.value.ui64 += bytesval;     \
 189                 }                                                       \
 190         }                                                               \
 191 }
 192 
 193 /*
 194  * If the filesystem does not support XIDs map credential
 195  * If the vfsp is NULL, perhaps we should also map?
 196  */
 197 #define VOPXID_MAP_CR(vp, cr)   {                                       \
 198         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 199         if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
 200                 cr = crgetmapped(cr);                                   \
 201         }
 202 
 203 #define VOP_LATENCY_10MS        10000000
 204 #define VOP_LATENCY_100MS       100000000
 205 #define VOP_LATENCY_1S          1000000000
 206 
 207 /*
 208  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 209  * numerical order of S_IFMT and vnode types.)
 210  */
 211 enum vtype iftovt_tab[] = {
 212         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 213         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 214 };
 215 
 216 ushort_t vttoif_tab[] = {
 217         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 218         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 219 };
 220 
 221 /*
 222  * The system vnode cache.
 223  */
 224 
 225 kmem_cache_t *vn_cache;
 
2271         kmem_free(vnops, sizeof (vnodeops_t));
2272 }
2273 
2274 /*
2275  * Vnode cache.
2276  */
2277 
2278 /* ARGSUSED */
2279 static int
2280 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2281 {
2282         struct vnode *vp;
2283 
2284         vp = buf;
2285 
2286         mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2287         mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2288         cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2289         rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2290         vp->v_femhead = NULL;        /* Must be done before vn_reinit() */
2291         vp->v_path = NULL;
2292         vp->v_mpssdata = NULL;
2293         vp->v_vsd = NULL;
2294         vp->v_fopdata = NULL;
2295 
2296         return (0);
2297 }
2298 
2299 /* ARGSUSED */
2300 static void
2301 vn_cache_destructor(void *buf, void *cdrarg)
2302 {
2303         struct vnode *vp;
2304 
2305         vp = buf;
2306 
2307         rw_destroy(&vp->v_nbllock);
2308         cv_destroy(&vp->v_cv);
2309         mutex_destroy(&vp->v_vsd_lock);
2310         mutex_destroy(&vp->v_lock);
2311 }
 
2318             P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2319         vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2320             VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2321             NULL, 0);
2322 }
2323 
2324 void
2325 vn_destroy_cache(void)
2326 {
2327         kmem_cache_destroy(vn_cache);
2328 }
2329 
2330 /*
2331  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2332  * cached by the file system and vnodes remain associated.
2333  */
2334 void
2335 vn_recycle(vnode_t *vp)
2336 {
2337         ASSERT(vp->v_pages == NULL);
2338 
2339         /*
2340          * XXX - This really belongs in vn_reinit(), but we have some issues
2341          * with the counts.  Best to have it here for clean initialization.
2342          */
2343         vp->v_rdcnt = 0;
2344         vp->v_wrcnt = 0;
2345         vp->v_mmap_read = 0;
2346         vp->v_mmap_write = 0;
2347 
2348         /*
2349          * If FEM was in use, make sure everything gets cleaned up
2350          * NOTE: vp->v_femhead is initialized to NULL in the vnode
2351          * constructor.
2352          */
2353         if (vp->v_femhead) {
2354                 /* XXX - There should be a free_femhead() that does all this */
2355                 ASSERT(vp->v_femhead->femh_list == NULL);
2356                 mutex_destroy(&vp->v_femhead->femh_lock);
2357                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2358                 vp->v_femhead = NULL;
2359         }
2360         if (vp->v_path) {
2361                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2362                 vp->v_path = NULL;
2363         }
2364 
2365         if (vp->v_fopdata != NULL) {
2366                 free_fopdata(vp);
2367         }
2368         vp->v_mpssdata = NULL;
2369         vsd_free(vp);
2370 }
2371 
2372 /*
2373  * Used to reset the vnode fields including those that are directly accessible
2374  * as well as those which require an accessor function.
2375  *
2376  * Does not initialize:
2377  *      synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2378  *      v_data (since FS-nodes and vnodes point to each other and should
2379  *              be updated simultaneously)
2380  *      v_op (in case someone needs to make a VOP call on this object)
2381  */
2382 void
2383 vn_reinit(vnode_t *vp)
 
2414                 vp->v_fopdata = NULL;
2415                 vn_reinit(vp);
2416         }
2417 
2418         return (vp);
2419 }
2420 
2421 void
2422 vn_free(vnode_t *vp)
2423 {
2424         ASSERT(vp->v_shrlocks == NULL);
2425         ASSERT(vp->v_filocks == NULL);
2426 
2427         /*
2428          * Some file systems call vn_free() with v_count of zero,
2429          * some with v_count of 1.  In any case, the value should
2430          * never be anything else.
2431          */
2432         ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2433         ASSERT(vp->v_count_dnlc == 0);
2434         if (vp->v_path != NULL) {
2435                 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2436                 vp->v_path = NULL;
2437         }
2438 
2439         /* If FEM was in use, make sure everything gets cleaned up */
2440         if (vp->v_femhead) {
2441                 /* XXX - There should be a free_femhead() that does all this */
2442                 ASSERT(vp->v_femhead->femh_list == NULL);
2443                 mutex_destroy(&vp->v_femhead->femh_lock);
2444                 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2445                 vp->v_femhead = NULL;
2446         }
2447 
2448         if (vp->v_fopdata != NULL) {
2449                 free_fopdata(vp);
2450         }
2451         vp->v_mpssdata = NULL;
2452         vsd_free(vp);
2453         kmem_cache_free(vn_cache, vp);
2454 }
2455 
2456 /*
 
2950         }
2951 
2952         return ((loc != NULL) && (*loc == funcp));
2953 }
2954 
2955 /*
2956  * fs_new_caller_id() needs to return a unique ID on a given local system.
2957  * The IDs do not need to survive across reboots.  These are primarily
2958  * used so that (FEM) monitors can detect particular callers (such as
2959  * the NFS server) to a given vnode/vfs operation.
2960  */
2961 u_longlong_t
2962 fs_new_caller_id()
2963 {
2964         static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2965 
2966         return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2967 }
2968 
2969 /*
2970  * Given a starting vnode and a path, updates the path in the target vnode in
2971  * a safe manner.  If the vnode already has path information embedded, then the
2972  * cached path is left untouched.
2973  */
2974 
2975 size_t max_vnode_path = 4 * MAXPATHLEN;
2976 
2977 void
2978 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2979     const char *path, size_t plen)
2980 {
2981         char    *rpath;
2982         vnode_t *base;
2983         size_t  rpathlen, rpathalloc;
2984         int     doslash = 1;
2985 
2986         if (*path == '/') {
2987                 base = rootvp;
2988                 path++;
2989                 plen--;
2990         } else {
2991                 base = startvp;
2992         }
2993 
2994         /*
2995          * We cannot grab base->v_lock while we hold vp->v_lock because of
2996          * the potential for deadlock.
2997          */
2998         mutex_enter(&base->v_lock);
2999         if (base->v_path == NULL) {
3000                 mutex_exit(&base->v_lock);
3001                 return;
3002         }
3003 
3004         rpathlen = strlen(base->v_path);
3005         rpathalloc = rpathlen + plen + 1;
3006         /* Avoid adding a slash if there's already one there */
3007         if (base->v_path[rpathlen-1] == '/')
3008                 doslash = 0;
3009         else
3010                 rpathalloc++;
3011 
3012         /*
3013          * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3014          * so we must do this dance.  If, by chance, something changes the path,
3015          * just give up since there is no real harm.
3016          */
3017         mutex_exit(&base->v_lock);
3018 
3019         /* Paths should stay within reason */
3020         if (rpathalloc > max_vnode_path)
3021                 return;
3022 
3023         rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3024 
3025         mutex_enter(&base->v_lock);
3026         if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3027                 mutex_exit(&base->v_lock);
3028                 kmem_free(rpath, rpathalloc);
3029                 return;
3030         }
3031         bcopy(base->v_path, rpath, rpathlen);
3032         mutex_exit(&base->v_lock);
3033 
3034         if (doslash)
3035                 rpath[rpathlen++] = '/';
3036         bcopy(path, rpath + rpathlen, plen);
3037         rpath[rpathlen + plen] = '\0';
3038 
3039         mutex_enter(&vp->v_lock);
3040         if (vp->v_path != NULL) {
3041                 mutex_exit(&vp->v_lock);
3042                 kmem_free(rpath, rpathalloc);
3043         } else {
3044                 vp->v_path = rpath;
3045                 mutex_exit(&vp->v_lock);
3046         }
3047 }
3048 
3049 /*
3050  * Sets the path to the vnode to be the given string, regardless of current
3051  * context.  The string must be a complete path from rootdir.  This is only used
3052  * by fsop_root() for setting the path based on the mountpoint.
3053  */
3054 void
3055 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3056 {
3057         char *buf = kmem_alloc(len + 1, KM_SLEEP);
3058 
3059         mutex_enter(&vp->v_lock);
3060         if (vp->v_path != NULL) {
3061                 mutex_exit(&vp->v_lock);
3062                 kmem_free(buf, len + 1);
3063                 return;
3064         }
3065 
3066         vp->v_path = buf;
3067         bcopy(str, vp->v_path, len);
3068         vp->v_path[len] = '\0';
3069 
3070         mutex_exit(&vp->v_lock);
3071 }
3072 
3073 /*
3074  * Called from within filesystem's vop_rename() to handle renames once the
3075  * target vnode is available.
3076  */
3077 void
3078 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3079 {
3080         char *tmp;
3081 
3082         mutex_enter(&vp->v_lock);
3083         tmp = vp->v_path;
3084         vp->v_path = NULL;
3085         mutex_exit(&vp->v_lock);
3086         vn_setpath(rootdir, dvp, vp, nm, len);
3087         if (tmp != NULL)
3088                 kmem_free(tmp, strlen(tmp) + 1);
3089 }
3090 
3091 /*
3092  * Similar to vn_setpath_str(), this function sets the path of the destination
3093  * vnode to the be the same as the source vnode.
3094  */
3095 void
3096 vn_copypath(struct vnode *src, struct vnode *dst)
3097 {
3098         char *buf;
3099         int alloc;
3100 
3101         mutex_enter(&src->v_lock);
3102         if (src->v_path == NULL) {
3103                 mutex_exit(&src->v_lock);
3104                 return;
3105         }
3106         alloc = strlen(src->v_path) + 1;
3107 
3108         /* avoid kmem_alloc() with lock held */
3109         mutex_exit(&src->v_lock);
3110         buf = kmem_alloc(alloc, KM_SLEEP);
3111         mutex_enter(&src->v_lock);
3112         if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3113                 mutex_exit(&src->v_lock);
3114                 kmem_free(buf, alloc);
3115                 return;
3116         }
3117         bcopy(src->v_path, buf, alloc);
3118         mutex_exit(&src->v_lock);
3119 
3120         mutex_enter(&dst->v_lock);
3121         if (dst->v_path != NULL) {
3122                 mutex_exit(&dst->v_lock);
3123                 kmem_free(buf, alloc);
3124                 return;
3125         }
3126         dst->v_path = buf;
3127         mutex_exit(&dst->v_lock);
3128 }
3129 
3130 /*
3131  * XXX Private interface for segvn routines that handle vnode
3132  * large page segments.
3133  *
3134  * return 1 if vp's file system VOP_PAGEIO() implementation
3135  * can be safely used instead of VOP_GETPAGE() for handling
3136  * pagefaults against regular non swap files. VOP_PAGEIO()
3137  * interface is considered safe here if its implementation
3138  * is very close to VOP_GETPAGE() implementation.
3139  * e.g. It zero's out the part of the page beyond EOF. Doesn't
3140  * panic if there're file holes but instead returns an error.
3141  * Doesn't assume file won't be changed by user writes, etc.
3142  *
3143  * return 0 otherwise.
3144  *
3145  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3146  */
3147 int
3148 vn_vmpss_usepageio(vnode_t *vp)
3149 {
 
3298         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3299         len = resid_start - uiop->uio_resid;
3300 
3301         VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3302 
3303         if (start != 0) {
3304                 mutex_enter(&zonep->zone_vfs_lock);
3305                 zonep->zone_vfs_rwstats.reads++;
3306                 zonep->zone_vfs_rwstats.nread += len;
3307                 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3308                 mutex_exit(&zonep->zone_vfs_lock);
3309 
3310                 lat = gethrtime() - start;
3311 
3312                 if (lat >= VOP_LATENCY_10MS) {
3313                         if (lat < VOP_LATENCY_100MS)
3314                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3315                         else if (lat < VOP_LATENCY_1S) {
3316                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3317                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3318                         } else {
3319                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3320                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3321                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3322                         }
3323                 }
3324         }
3325 
3326         return (err);
3327 }
3328 
3329 int
3330 fop_write(
3331         vnode_t *vp,
3332         uio_t *uiop,
3333         int ioflag,
3334         cred_t *cr,
3335         caller_context_t *ct)
3336 {
3337         ssize_t resid_start = uiop->uio_resid;
3338         zone_t  *zonep = curzone;
3339         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3340 
3341         hrtime_t start = 0, lat;
 
3361         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3362         len = resid_start - uiop->uio_resid;
3363 
3364         VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3365 
3366         if (start != 0) {
3367                 mutex_enter(&zonep->zone_vfs_lock);
3368                 zonep->zone_vfs_rwstats.writes++;
3369                 zonep->zone_vfs_rwstats.nwritten += len;
3370                 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3371                 mutex_exit(&zonep->zone_vfs_lock);
3372 
3373                 lat = gethrtime() - start;
3374 
3375                 if (lat >= VOP_LATENCY_10MS) {
3376                         if (lat < VOP_LATENCY_100MS)
3377                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3378                         else if (lat < VOP_LATENCY_1S) {
3379                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3380                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3381                         } else {
3382                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3383                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3384                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3385                         }
3386                 }
3387         }
3388 
3389         return (err);
3390 }
3391 
3392 int
3393 fop_ioctl(
3394         vnode_t *vp,
3395         int cmd,
3396         intptr_t arg,
3397         int flag,
3398         cred_t *cr,
3399         int *rvalp,
3400         caller_context_t *ct)
3401 {
3402         int     err;
3403 
3404         VOPXID_MAP_CR(vp, cr);
 
3532          * If this file system doesn't support case-insensitive access
3533          * and said access is requested, fail quickly.  It is required
3534          * that if the vfs supports case-insensitive lookup, it also
3535          * supports extended dirent flags.
3536          */
3537         if (flags & FIGNORECASE &&
3538             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3539             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3540                 return (EINVAL);
3541 
3542         VOPXID_MAP_CR(dvp, cr);
3543 
3544         if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3545                 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3546         } else {
3547                 ret = (*(dvp)->v_op->vop_lookup)
3548                     (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3549         }
3550         if (ret == 0 && *vpp) {
3551                 VOPSTATS_UPDATE(*vpp, lookup);
3552                 if ((*vpp)->v_path == NULL) {
3553                         vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3554                 }
3555         }
3556 
3557         return (ret);
3558 }
3559 
3560 int
3561 fop_create(
3562         vnode_t *dvp,
3563         char *name,
3564         vattr_t *vap,
3565         vcexcl_t excl,
3566         int mode,
3567         vnode_t **vpp,
3568         cred_t *cr,
3569         int flags,
3570         caller_context_t *ct,
3571         vsecattr_t *vsecp)      /* ACL to set during create */
3572 {
3573         int ret;
3574 
3575         if (vsecp != NULL &&
3576             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3577                 return (EINVAL);
3578         }
3579         /*
3580          * If this file system doesn't support case-insensitive access
3581          * and said access is requested, fail quickly.
3582          */
3583         if (flags & FIGNORECASE &&
3584             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3585             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3586                 return (EINVAL);
3587 
3588         VOPXID_MAP_CR(dvp, cr);
3589 
3590         ret = (*(dvp)->v_op->vop_create)
3591             (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3592         if (ret == 0 && *vpp) {
3593                 VOPSTATS_UPDATE(*vpp, create);
3594                 if ((*vpp)->v_path == NULL) {
3595                         vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3596                 }
3597         }
3598 
3599         return (ret);
3600 }
3601 
3602 int
3603 fop_remove(
3604         vnode_t *dvp,
3605         char *nm,
3606         cred_t *cr,
3607         caller_context_t *ct,
3608         int flags)
3609 {
3610         int     err;
3611 
3612         /*
3613          * If this file system doesn't support case-insensitive access
3614          * and said access is requested, fail quickly.
3615          */
3616         if (flags & FIGNORECASE &&
3617             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
 
3696 
3697         if (vsecp != NULL &&
3698             vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3699                 return (EINVAL);
3700         }
3701         /*
3702          * If this file system doesn't support case-insensitive access
3703          * and said access is requested, fail quickly.
3704          */
3705         if (flags & FIGNORECASE &&
3706             (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3707             vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3708                 return (EINVAL);
3709 
3710         VOPXID_MAP_CR(dvp, cr);
3711 
3712         ret = (*(dvp)->v_op->vop_mkdir)
3713             (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3714         if (ret == 0 && *vpp) {
3715                 VOPSTATS_UPDATE(*vpp, mkdir);
3716                 if ((*vpp)->v_path == NULL) {
3717                         vn_setpath(rootdir, dvp, *vpp, dirname,
3718                             strlen(dirname));
3719                 }
3720         }
3721 
3722         return (ret);
3723 }
3724 
3725 int
3726 fop_rmdir(
3727         vnode_t *dvp,
3728         char *nm,
3729         vnode_t *cdir,
3730         cred_t *cr,
3731         caller_context_t *ct,
3732         int flags)
3733 {
3734         int     err;
3735 
3736         /*
3737          * If this file system doesn't support case-insensitive access
3738          * and said access is requested, fail quickly.
3739          */
3740         if (flags & FIGNORECASE &&
 
 |