4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2016, Joyent, Inc.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/t_lock.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
49 #include <sys/vfs.h>
50 #include <sys/vfs_opreg.h>
51 #include <sys/vnode.h>
52 #include <sys/rwstlock.h>
53 #include <sys/fem.h>
54 #include <sys/stat.h>
55 #include <sys/mode.h>
56 #include <sys/conf.h>
57 #include <sys/sysmacros.h>
58 #include <sys/cmn_err.h>
59 #include <sys/systm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <c2/audit.h>
63 #include <sys/acl.h>
64 #include <sys/nbmlock.h>
65 #include <sys/fcntl.h>
66 #include <fs/fs_subr.h>
67 #include <sys/taskq.h>
68 #include <fs/fs_reparse.h>
69 #include <sys/time.h>
70 #include <sys/sdt.h>
71
/*
 * Evaluates to non-zero when vp is not a character device, block device or
 * FIFO and vn_is_readonly() reports it as read-only.
 */
#define	ISROFILE(vp)	\
	(!((vp)->v_type == VCHR || (vp)->v_type == VBLK ||	\
	(vp)->v_type == VFIFO) && vn_is_readonly(vp))
76
77 /* Tunable via /etc/system; used only by admin/install */
78 int nfs_global_client_only;
79
80 /*
81 * Array of vopstats_t for per-FS-type vopstats. This array has the same
82 * number of entries as and parallel to the vfssw table. (Arguably, it could
83 * be part of the vfssw table.) Once it's initialized, it's accessed using
84 * the same fstype index that is used to index into the vfssw table.
85 */
86 vopstats_t **vopstats_fstype;
87
88 /* vopstats initialization template used for fast initialization via bcopy() */
89 static vopstats_t *vs_templatep;
90
91 /* Kmem cache handle for vsk_anchor_t allocations */
92 kmem_cache_t *vsk_anchor_cache;
93
94 /* file events cleanup routine */
95 extern void free_fopdata(vnode_t *);
96
97 /*
98 * Root of AVL tree for the kstats associated with vopstats. Lock protects
99 * updates to vskstat_tree.
100 */
101 avl_tree_t vskstat_tree;
102 kmutex_t vskstat_tree_lock;
103
104 /* Global variable which enables/disables the vopstats collection */
105 int vopstats_enabled = 1;
106
107 /* Global used for empty/invalid v_path */
108 char *vn_vpath_empty = "";
109
110 /*
111 * forward declarations for internal vnode specific data (vsd)
112 */
113 static void *vsd_realloc(void *, size_t, size_t);
114
115 /*
116 * forward declarations for reparse point functions
117 */
118 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
119
120 /*
121 * VSD -- VNODE SPECIFIC DATA
122 * The v_data pointer is typically used by a file system to store a
123 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
124 * However, there are times when additional project private data needs
125 * to be stored separately from the data (node) pointed to by v_data.
126 * This additional data could be stored by the file system itself or
127 * by a completely different kernel entity. VSD provides a way for
128 * callers to obtain a key and store a pointer to private data associated
129 * with a vnode.
191 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
192 vsp->n##counter.value.ui64++; \
193 vsp->bytecounter.value.ui64 += bytesval; \
194 } \
195 } \
196 }
197
/*
 * If the filesystem does not support XIDs, map the credential: cr is
 * replaced with crgetmapped(cr) when vp's v_vfsp is non-NULL and does not
 * have VFS_XID set in vfs_flag.  Otherwise cr is left unchanged.
 * If the vfsp is NULL, perhaps we should also map?
 *
 * cr must be a modifiable lvalue.  The body is wrapped in do/while (0) so
 * that "VOPXID_MAP_CR(vp, cr);" is a single statement and remains correct
 * inside an unbraced if/else.  The temporary has a macro-private name so it
 * cannot shadow a caller variable that appears in the vp argument.
 */
#define	VOPXID_MAP_CR(vp, cr)	do {					\
	vfs_t *_vopxid_vfsp = (vp)->v_vfsp;				\
	if (_vopxid_vfsp != NULL &&					\
	    (_vopxid_vfsp->vfs_flag & VFS_XID) == 0)			\
		(cr) = crgetmapped(cr);					\
} while (0)
207
208 #define VOP_LATENCY_10MS 10000000
209 #define VOP_LATENCY_100MS 100000000
210 #define VOP_LATENCY_1S 1000000000
211 #define VOP_LATENCY_10S 10000000000
212
213 /*
214 * Convert stat(2) formats to vnode types and vice versa. (Knows about
215 * numerical order of S_IFMT and vnode types.)
216 */
217 enum vtype iftovt_tab[] = {
218 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
219 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
220 };
221
222 ushort_t vttoif_tab[] = {
223 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
224 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
225 };
226
227 /*
228 * The system vnode cache.
229 */
230
231 kmem_cache_t *vn_cache;
2277 kmem_free(vnops, sizeof (vnodeops_t));
2278 }
2279
2280 /*
2281 * Vnode cache.
2282 */
2283
2284 /* ARGSUSED */
2285 static int
2286 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2287 {
2288 struct vnode *vp;
2289
2290 vp = buf;
2291
2292 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2293 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2294 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2295 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2296 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2297 vp->v_path = vn_vpath_empty;
2298 vp->v_path_stamp = 0;
2299 vp->v_mpssdata = NULL;
2300 vp->v_vsd = NULL;
2301 vp->v_fopdata = NULL;
2302
2303 return (0);
2304 }
2305
2306 /* ARGSUSED */
2307 static void
2308 vn_cache_destructor(void *buf, void *cdrarg)
2309 {
2310 struct vnode *vp;
2311
2312 vp = buf;
2313
2314 rw_destroy(&vp->v_nbllock);
2315 cv_destroy(&vp->v_cv);
2316 mutex_destroy(&vp->v_vsd_lock);
2317 mutex_destroy(&vp->v_lock);
2318 }
2325 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2326 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2327 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2328 NULL, 0);
2329 }
2330
2331 void
2332 vn_destroy_cache(void)
2333 {
2334 kmem_cache_destroy(vn_cache);
2335 }
2336
2337 /*
2338 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2339 * cached by the file system and vnodes remain associated.
2340 */
2341 void
2342 vn_recycle(vnode_t *vp)
2343 {
2344 ASSERT(vp->v_pages == NULL);
2345 VERIFY(vp->v_path != NULL);
2346
2347 /*
2348 * XXX - This really belongs in vn_reinit(), but we have some issues
2349 * with the counts. Best to have it here for clean initialization.
2350 */
2351 vp->v_rdcnt = 0;
2352 vp->v_wrcnt = 0;
2353 vp->v_mmap_read = 0;
2354 vp->v_mmap_write = 0;
2355
2356 /*
2357 * If FEM was in use, make sure everything gets cleaned up
2358 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2359 * constructor.
2360 */
2361 if (vp->v_femhead) {
2362 /* XXX - There should be a free_femhead() that does all this */
2363 ASSERT(vp->v_femhead->femh_list == NULL);
2364 mutex_destroy(&vp->v_femhead->femh_lock);
2365 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2366 vp->v_femhead = NULL;
2367 }
2368 if (vp->v_path != vn_vpath_empty) {
2369 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2370 vp->v_path = vn_vpath_empty;
2371 }
2372 vp->v_path_stamp = 0;
2373
2374 if (vp->v_fopdata != NULL) {
2375 free_fopdata(vp);
2376 }
2377 vp->v_mpssdata = NULL;
2378 vsd_free(vp);
2379 }
2380
2381 /*
2382 * Used to reset the vnode fields including those that are directly accessible
2383 * as well as those which require an accessor function.
2384 *
2385 * Does not initialize:
2386 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2387 * v_data (since FS-nodes and vnodes point to each other and should
2388 * be updated simultaneously)
2389 * v_op (in case someone needs to make a VOP call on this object)
2390 */
2391 void
2392 vn_reinit(vnode_t *vp)
2423 vp->v_fopdata = NULL;
2424 vn_reinit(vp);
2425 }
2426
2427 return (vp);
2428 }
2429
2430 void
2431 vn_free(vnode_t *vp)
2432 {
2433 ASSERT(vp->v_shrlocks == NULL);
2434 ASSERT(vp->v_filocks == NULL);
2435
2436 /*
2437 * Some file systems call vn_free() with v_count of zero,
2438 * some with v_count of 1. In any case, the value should
2439 * never be anything else.
2440 */
2441 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2442 ASSERT(vp->v_count_dnlc == 0);
2443 VERIFY(vp->v_path != NULL);
2444 if (vp->v_path != vn_vpath_empty) {
2445 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2446 vp->v_path = vn_vpath_empty;
2447 }
2448
2449 /* If FEM was in use, make sure everything gets cleaned up */
2450 if (vp->v_femhead) {
2451 /* XXX - There should be a free_femhead() that does all this */
2452 ASSERT(vp->v_femhead->femh_list == NULL);
2453 mutex_destroy(&vp->v_femhead->femh_lock);
2454 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2455 vp->v_femhead = NULL;
2456 }
2457
2458 if (vp->v_fopdata != NULL) {
2459 free_fopdata(vp);
2460 }
2461 vp->v_mpssdata = NULL;
2462 vsd_free(vp);
2463 kmem_cache_free(vn_cache, vp);
2464 }
2465
2466 /*
2960 }
2961
2962 return ((loc != NULL) && (*loc == funcp));
2963 }
2964
2965 /*
2966 * fs_new_caller_id() needs to return a unique ID on a given local system.
2967 * The IDs do not need to survive across reboots. These are primarily
2968 * used so that (FEM) monitors can detect particular callers (such as
2969 * the NFS server) to a given vnode/vfs operation.
2970 */
2971 u_longlong_t
2972 fs_new_caller_id()
2973 {
2974 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2975
2976 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2977 }
2978
2979 /*
2980 * The value stored in v_path is relative to rootdir, located in the global
2981 * zone. Zones or chroot environments which reside deeper inside the VFS
2982 * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
2983 * what lies below their perceived root. In order to keep v_path usable for
2984 * these child environments, its allocations are allowed to exceed MAXPATHLEN.
2985 *
2986 * An upper bound of max_vnode_path is placed upon v_path allocations to
2987 * prevent the system from going too wild at the behest of pathological
2988 * behavior from the operator.
2989 */
2990 size_t max_vnode_path = 4 * MAXPATHLEN;
2991
2992
2993 void
2994 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
2995 {
2996 char *buf;
2997
2998 mutex_enter(&vp->v_lock);
2999 /*
3000 * If the snapshot of v_path_stamp passed in via compare_stamp does not
3001 * match the present value on the vnode, it indicates that subsequent
3002 * changes have occurred. The v_path value is not cleared in this case
3003 * since the new value may be valid.
3004 */
3005 if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
3006 mutex_exit(&vp->v_lock);
3007 return;
3008 }
3009 buf = vp->v_path;
3010 vp->v_path = vn_vpath_empty;
3011 vp->v_path_stamp = 0;
3012 mutex_exit(&vp->v_lock);
3013 if (buf != vn_vpath_empty) {
3014 kmem_free(buf, strlen(buf) + 1);
3015 }
3016 }
3017
3018 static void
3019 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
3020 boolean_t is_rename)
3021 {
3022 char *buf, *oldbuf;
3023 hrtime_t pstamp;
3024 size_t baselen, buflen = 0;
3025
3026 /* Handle the vn_setpath_str case. */
3027 if (pvp == NULL) {
3028 if (len + 1 > max_vnode_path) {
3029 DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
3030 vnode_t *, vp, char *, name, size_t, len + 1);
3031 return;
3032 }
3033 buf = kmem_alloc(len + 1, KM_SLEEP);
3034 bcopy(name, buf, len);
3035 buf[len] = '\0';
3036
3037 mutex_enter(&vp->v_lock);
3038 oldbuf = vp->v_path;
3039 vp->v_path = buf;
3040 vp->v_path_stamp = gethrtime();
3041 mutex_exit(&vp->v_lock);
3042 if (oldbuf != vn_vpath_empty) {
3043 kmem_free(oldbuf, strlen(oldbuf) + 1);
3044 }
3045 return;
3046 }
3047
3048 /* Take snapshot of parent dir */
3049 mutex_enter(&pvp->v_lock);
3050 retrybuf:
3051 if (pvp->v_path == vn_vpath_empty) {
3052 /*
3053 * Without v_path from the parent directory, generating a child
3054 * path from the name is impossible.
3055 */
3056 if (len > 0) {
3057 pstamp = pvp->v_path_stamp;
3058 mutex_exit(&pvp->v_lock);
3059 vn_clearpath(vp, pstamp);
3060 return;
3061 }
3062
3063 /*
3064 * The only feasible case here is where a NUL lookup is being
3065 * performed on rootdir prior to its v_path being populated.
3066 */
3067 ASSERT(pvp->v_path_stamp = 0);
3068 baselen = 0;
3069 pstamp = 0;
3070 } else {
3071 pstamp = pvp->v_path_stamp;
3072 baselen = strlen(pvp->v_path);
3073 /* ignore a trailing slash if present */
3074 if (pvp->v_path[baselen - 1] == '/') {
3075 /* This should only the be case for rootdir */
3076 ASSERT(baselen == 1 && pvp == rootdir);
3077 baselen--;
3078 }
3079 }
3080 mutex_exit(&pvp->v_lock);
3081
3082 if (buflen != 0) {
3083 /* Free the existing (mis-sized) buffer in case of retry */
3084 kmem_free(buf, buflen);
3085 }
3086 /* base, '/', name and trailing NUL */
3087 buflen = baselen + len + 2;
3088 if (buflen > max_vnode_path) {
3089 DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
3090 vnode_t *, vp, char *, name, size_t, buflen);
3091 return;
3092 }
3093 buf = kmem_alloc(buflen, KM_SLEEP);
3094
3095 mutex_enter(&pvp->v_lock);
3096 if (pvp->v_path_stamp != pstamp) {
3097 size_t vlen;
3098
3099 /*
3100 * Since v_path_stamp changed on the parent, it is likely that
3101 * v_path has been altered as well. If the length does not
3102 * exactly match what was previously measured, the buffer
3103 * allocation must be repeated for proper sizing.
3104 */
3105 if (pvp->v_path == vn_vpath_empty) {
3106 /* Give up if parent lack v_path */
3107 mutex_exit(&pvp->v_lock);
3108 kmem_free(buf, buflen);
3109 return;
3110 }
3111 vlen = strlen(pvp->v_path);
3112 if (pvp->v_path[vlen - 1] == '/') {
3113 vlen--;
3114 }
3115 if (vlen != baselen) {
3116 goto retrybuf;
3117 }
3118 }
3119 bcopy(pvp->v_path, buf, baselen);
3120 mutex_exit(&pvp->v_lock);
3121
3122 buf[baselen] = '/';
3123 baselen++;
3124 bcopy(name, &buf[baselen], len + 1);
3125
3126 mutex_enter(&vp->v_lock);
3127 if (vp->v_path_stamp == 0) {
3128 /* never-visited vnode can inherit stamp from parent */
3129 ASSERT(vp->v_path == vn_vpath_empty);
3130 vp->v_path_stamp = pstamp;
3131 vp->v_path = buf;
3132 mutex_exit(&vp->v_lock);
3133 } else if (vp->v_path_stamp < pstamp || is_rename) {
3134 /*
3135 * Install the updated path and stamp, ensuring that the v_path
3136 * pointer is valid at all times for dtrace.
3137 */
3138 oldbuf = vp->v_path;
3139 vp->v_path = buf;
3140 vp->v_path_stamp = gethrtime();
3141 mutex_exit(&vp->v_lock);
3142 kmem_free(oldbuf, strlen(oldbuf) + 1);
3143 } else {
3144 /*
3145 * If the timestamp matches or is greater, it means another
3146 * thread performed the update first while locks were dropped
3147 * here to make the allocation. We defer to the newer value.
3148 */
3149 mutex_exit(&vp->v_lock);
3150 kmem_free(buf, buflen);
3151 }
3152 ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
3153 }
3154
3155 void
3156 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
3157 {
3158 size_t len;
3159
3160 /*
3161 * If the parent is older or empty, there's nothing further to do.
3162 */
3163 if (pvp->v_path == vn_vpath_empty ||
3164 pvp->v_path_stamp <= vp->v_path_stamp) {
3165 return;
3166 }
3167
3168 /*
3169 * Given the lack of appropriate context, meaningful updates to v_path
3170 * cannot be made for during lookups for the '.' or '..' entries.
3171 */
3172 len = strlen(name);
3173 if (len == 0 || (len == 1 && name[0] == '.') ||
3174 (len == 2 && name[0] == '.' && name[1] == '.')) {
3175 return;
3176 }
3177
3178 vn_setpath_common(pvp, vp, name, len, B_FALSE);
3179 }
3180
3181 /*
3182 * Given a starting vnode and a path, updates the path in the target vnode in
3183 * a safe manner. If the vnode already has path information embedded, then the
3184 * cached path is left untouched.
3185 */
3186 /* ARGSUSED */
3187 void
3188 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
3189 size_t len)
3190 {
3191 vn_setpath_common(pvp, vp, name, len, B_FALSE);
3192 }
3193
3194 /*
3195 * Sets the path to the vnode to be the given string, regardless of current
3196 * context. The string must be a complete path from rootdir. This is only used
3197 * by fsop_root() for setting the path based on the mountpoint.
3198 */
3199 void
3200 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
3201 {
3202 vn_setpath_common(NULL, vp, str, len, B_FALSE);
3203 }
3204
3205 /*
3206 * Called from within filesystem's vop_rename() to handle renames once the
3207 * target vnode is available.
3208 */
3209 void
3210 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
3211 {
3212 vn_setpath_common(pvp, vp, name, len, B_TRUE);
3213 }
3214
3215 /*
3216 * Similar to vn_setpath_str(), this function sets the path of the destination
3217 * vnode to the be the same as the source vnode.
3218 */
3219 void
3220 vn_copypath(struct vnode *src, struct vnode *dst)
3221 {
3222 char *buf;
3223 hrtime_t stamp;
3224 size_t buflen;
3225
3226 mutex_enter(&src->v_lock);
3227 if (src->v_path == vn_vpath_empty) {
3228 mutex_exit(&src->v_lock);
3229 return;
3230 }
3231 buflen = strlen(src->v_path) + 1;
3232 mutex_exit(&src->v_lock);
3233
3234 buf = kmem_alloc(buflen, KM_SLEEP);
3235
3236 mutex_enter(&src->v_lock);
3237 if (src->v_path == vn_vpath_empty ||
3238 strlen(src->v_path) + 1 != buflen) {
3239 mutex_exit(&src->v_lock);
3240 kmem_free(buf, buflen);
3241 return;
3242 }
3243 bcopy(src->v_path, buf, buflen);
3244 stamp = src->v_path_stamp;
3245 mutex_exit(&src->v_lock);
3246
3247 mutex_enter(&dst->v_lock);
3248 if (dst->v_path != vn_vpath_empty) {
3249 mutex_exit(&dst->v_lock);
3250 kmem_free(buf, buflen);
3251 return;
3252 }
3253 dst->v_path = buf;
3254 dst->v_path_stamp = stamp;
3255 mutex_exit(&dst->v_lock);
3256 }
3257
3258
3259 /*
3260 * XXX Private interface for segvn routines that handle vnode
3261 * large page segments.
3262 *
3263 * return 1 if vp's file system VOP_PAGEIO() implementation
3264 * can be safely used instead of VOP_GETPAGE() for handling
3265 * pagefaults against regular non swap files. VOP_PAGEIO()
3266 * interface is considered safe here if its implementation
3267 * is very close to VOP_GETPAGE() implementation.
3268 * e.g. It zero's out the part of the page beyond EOF. Doesn't
3269 * panic if there're file holes but instead returns an error.
3270 * Doesn't assume file won't be changed by user writes, etc.
3271 *
3272 * return 0 otherwise.
3273 *
3274 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3275 */
3276 int
3277 vn_vmpss_usepageio(vnode_t *vp)
3278 {
3427 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3428 len = resid_start - uiop->uio_resid;
3429
3430 VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3431
3432 if (start != 0) {
3433 mutex_enter(&zonep->zone_vfs_lock);
3434 zonep->zone_vfs_rwstats.reads++;
3435 zonep->zone_vfs_rwstats.nread += len;
3436 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3437 mutex_exit(&zonep->zone_vfs_lock);
3438
3439 lat = gethrtime() - start;
3440
3441 if (lat >= VOP_LATENCY_10MS) {
3442 if (lat < VOP_LATENCY_100MS)
3443 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3444 else if (lat < VOP_LATENCY_1S) {
3445 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3446 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3447 } else if (lat < VOP_LATENCY_10S) {
3448 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3449 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3450 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3451 } else {
3452 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3453 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3454 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3455 atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3456 }
3457 }
3458 }
3459
3460 return (err);
3461 }
3462
3463 int
3464 fop_write(
3465 vnode_t *vp,
3466 uio_t *uiop,
3467 int ioflag,
3468 cred_t *cr,
3469 caller_context_t *ct)
3470 {
3471 ssize_t resid_start = uiop->uio_resid;
3472 zone_t *zonep = curzone;
3473 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3474
3475 hrtime_t start = 0, lat;
3495 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3496 len = resid_start - uiop->uio_resid;
3497
3498 VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3499
3500 if (start != 0) {
3501 mutex_enter(&zonep->zone_vfs_lock);
3502 zonep->zone_vfs_rwstats.writes++;
3503 zonep->zone_vfs_rwstats.nwritten += len;
3504 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3505 mutex_exit(&zonep->zone_vfs_lock);
3506
3507 lat = gethrtime() - start;
3508
3509 if (lat >= VOP_LATENCY_10MS) {
3510 if (lat < VOP_LATENCY_100MS)
3511 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3512 else if (lat < VOP_LATENCY_1S) {
3513 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3514 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3515 } else if (lat < VOP_LATENCY_10S) {
3516 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3517 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3518 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3519 } else {
3520 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3521 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3522 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3523 atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
3524 }
3525 }
3526 }
3527
3528 return (err);
3529 }
3530
3531 int
3532 fop_ioctl(
3533 vnode_t *vp,
3534 int cmd,
3535 intptr_t arg,
3536 int flag,
3537 cred_t *cr,
3538 int *rvalp,
3539 caller_context_t *ct)
3540 {
3541 int err;
3542
3543 VOPXID_MAP_CR(vp, cr);
3671 * If this file system doesn't support case-insensitive access
3672 * and said access is requested, fail quickly. It is required
3673 * that if the vfs supports case-insensitive lookup, it also
3674 * supports extended dirent flags.
3675 */
3676 if (flags & FIGNORECASE &&
3677 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3678 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3679 return (EINVAL);
3680
3681 VOPXID_MAP_CR(dvp, cr);
3682
3683 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3684 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3685 } else {
3686 ret = (*(dvp)->v_op->vop_lookup)
3687 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3688 }
3689 if (ret == 0 && *vpp) {
3690 VOPSTATS_UPDATE(*vpp, lookup);
3691 vn_updatepath(dvp, *vpp, nm);
3692 }
3693
3694 return (ret);
3695 }
3696
3697 int
3698 fop_create(
3699 vnode_t *dvp,
3700 char *name,
3701 vattr_t *vap,
3702 vcexcl_t excl,
3703 int mode,
3704 vnode_t **vpp,
3705 cred_t *cr,
3706 int flags,
3707 caller_context_t *ct,
3708 vsecattr_t *vsecp) /* ACL to set during create */
3709 {
3710 int ret;
3711
3712 if (vsecp != NULL &&
3713 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3714 return (EINVAL);
3715 }
3716 /*
3717 * If this file system doesn't support case-insensitive access
3718 * and said access is requested, fail quickly.
3719 */
3720 if (flags & FIGNORECASE &&
3721 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3722 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3723 return (EINVAL);
3724
3725 VOPXID_MAP_CR(dvp, cr);
3726
3727 ret = (*(dvp)->v_op->vop_create)
3728 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3729 if (ret == 0 && *vpp) {
3730 VOPSTATS_UPDATE(*vpp, create);
3731 vn_updatepath(dvp, *vpp, name);
3732 }
3733
3734 return (ret);
3735 }
3736
3737 int
3738 fop_remove(
3739 vnode_t *dvp,
3740 char *nm,
3741 cred_t *cr,
3742 caller_context_t *ct,
3743 int flags)
3744 {
3745 int err;
3746
3747 /*
3748 * If this file system doesn't support case-insensitive access
3749 * and said access is requested, fail quickly.
3750 */
3751 if (flags & FIGNORECASE &&
3752 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3831
3832 if (vsecp != NULL &&
3833 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3834 return (EINVAL);
3835 }
3836 /*
3837 * If this file system doesn't support case-insensitive access
3838 * and said access is requested, fail quickly.
3839 */
3840 if (flags & FIGNORECASE &&
3841 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3842 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3843 return (EINVAL);
3844
3845 VOPXID_MAP_CR(dvp, cr);
3846
3847 ret = (*(dvp)->v_op->vop_mkdir)
3848 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3849 if (ret == 0 && *vpp) {
3850 VOPSTATS_UPDATE(*vpp, mkdir);
3851 vn_updatepath(dvp, *vpp, dirname);
3852 }
3853
3854 return (ret);
3855 }
3856
3857 int
3858 fop_rmdir(
3859 vnode_t *dvp,
3860 char *nm,
3861 vnode_t *cdir,
3862 cred_t *cr,
3863 caller_context_t *ct,
3864 int flags)
3865 {
3866 int err;
3867
3868 /*
3869 * If this file system doesn't support case-insensitive access
3870 * and said access is requested, fail quickly.
3871 */
3872 if (flags & FIGNORECASE &&
|
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
34 *
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
38 */
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/t_lock.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
49 #include <sys/vfs.h>
50 #include <sys/vfs_opreg.h>
51 #include <sys/vnode.h>
52 #include <sys/rwstlock.h>
53 #include <sys/fem.h>
54 #include <sys/stat.h>
55 #include <sys/mode.h>
56 #include <sys/conf.h>
57 #include <sys/sysmacros.h>
58 #include <sys/cmn_err.h>
59 #include <sys/systm.h>
60 #include <sys/kmem.h>
61 #include <sys/debug.h>
62 #include <c2/audit.h>
63 #include <sys/acl.h>
64 #include <sys/nbmlock.h>
65 #include <sys/fcntl.h>
66 #include <fs/fs_subr.h>
67 #include <sys/taskq.h>
68 #include <fs/fs_reparse.h>
69
/*
 * Evaluates to non-zero when vp is not a character device, block device or
 * FIFO and vn_is_readonly() reports it as read-only.
 */
#define	ISROFILE(vp)	\
	(!((vp)->v_type == VCHR || (vp)->v_type == VBLK ||	\
	(vp)->v_type == VFIFO) && vn_is_readonly(vp))
74
75 /* Tunable via /etc/system; used only by admin/install */
76 int nfs_global_client_only;
77
78 /*
79 * Array of vopstats_t for per-FS-type vopstats. This array has the same
80 * number of entries as and parallel to the vfssw table. (Arguably, it could
81 * be part of the vfssw table.) Once it's initialized, it's accessed using
82 * the same fstype index that is used to index into the vfssw table.
83 */
84 vopstats_t **vopstats_fstype;
85
86 /* vopstats initialization template used for fast initialization via bcopy() */
87 static vopstats_t *vs_templatep;
88
89 /* Kmem cache handle for vsk_anchor_t allocations */
90 kmem_cache_t *vsk_anchor_cache;
91
92 /* file events cleanup routine */
93 extern void free_fopdata(vnode_t *);
94
95 /*
96 * Root of AVL tree for the kstats associated with vopstats. Lock protects
97 * updates to vskstat_tree.
98 */
99 avl_tree_t vskstat_tree;
100 kmutex_t vskstat_tree_lock;
101
102 /* Global variable which enables/disables the vopstats collection */
103 int vopstats_enabled = 1;
104
105 /*
106 * forward declarations for internal vnode specific data (vsd)
107 */
108 static void *vsd_realloc(void *, size_t, size_t);
109
110 /*
111 * forward declarations for reparse point functions
112 */
113 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
114
115 /*
116 * VSD -- VNODE SPECIFIC DATA
117 * The v_data pointer is typically used by a file system to store a
118 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
119 * However, there are times when additional project private data needs
120 * to be stored separately from the data (node) pointed to by v_data.
121 * This additional data could be stored by the file system itself or
122 * by a completely different kernel entity. VSD provides a way for
123 * callers to obtain a key and store a pointer to private data associated
124 * with a vnode.
186 if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \
187 vsp->n##counter.value.ui64++; \
188 vsp->bytecounter.value.ui64 += bytesval; \
189 } \
190 } \
191 }
192
/*
 * If the filesystem does not support XIDs, map the credential: cr is
 * replaced with crgetmapped(cr) when vp's v_vfsp is non-NULL and does not
 * have VFS_XID set in vfs_flag.  Otherwise cr is left unchanged.
 * If the vfsp is NULL, perhaps we should also map?
 *
 * cr must be a modifiable lvalue.  The body is wrapped in do/while (0) so
 * that "VOPXID_MAP_CR(vp, cr);" is a single statement and remains correct
 * inside an unbraced if/else.  The temporary has a macro-private name so it
 * cannot shadow a caller variable that appears in the vp argument.
 */
#define	VOPXID_MAP_CR(vp, cr)	do {					\
	vfs_t *_vopxid_vfsp = (vp)->v_vfsp;				\
	if (_vopxid_vfsp != NULL &&					\
	    (_vopxid_vfsp->vfs_flag & VFS_XID) == 0)			\
		(cr) = crgetmapped(cr);					\
} while (0)
202
203 #define VOP_LATENCY_10MS 10000000
204 #define VOP_LATENCY_100MS 100000000
205 #define VOP_LATENCY_1S 1000000000
206
207 /*
208 * Convert stat(2) formats to vnode types and vice versa. (Knows about
209 * numerical order of S_IFMT and vnode types.)
210 */
211 enum vtype iftovt_tab[] = {
212 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
213 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
214 };
215
216 ushort_t vttoif_tab[] = {
217 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
218 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
219 };
220
221 /*
222 * The system vnode cache.
223 */
224
225 kmem_cache_t *vn_cache;
2271 kmem_free(vnops, sizeof (vnodeops_t));
2272 }
2273
2274 /*
2275 * Vnode cache.
2276 */
2277
2278 /* ARGSUSED */
2279 static int
2280 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
2281 {
2282 struct vnode *vp;
2283
2284 vp = buf;
2285
2286 mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
2287 mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
2288 cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
2289 rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
2290 vp->v_femhead = NULL; /* Must be done before vn_reinit() */
2291 vp->v_path = NULL;
2292 vp->v_mpssdata = NULL;
2293 vp->v_vsd = NULL;
2294 vp->v_fopdata = NULL;
2295
2296 return (0);
2297 }
2298
2299 /* ARGSUSED */
2300 static void
2301 vn_cache_destructor(void *buf, void *cdrarg)
2302 {
2303 struct vnode *vp;
2304
2305 vp = buf;
2306
2307 rw_destroy(&vp->v_nbllock);
2308 cv_destroy(&vp->v_cv);
2309 mutex_destroy(&vp->v_vsd_lock);
2310 mutex_destroy(&vp->v_lock);
2311 }
2318 P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
2319 vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
2320 VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
2321 NULL, 0);
2322 }
2323
2324 void
2325 vn_destroy_cache(void)
2326 {
2327 kmem_cache_destroy(vn_cache);
2328 }
2329
2330 /*
2331 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
2332 * cached by the file system and vnodes remain associated.
2333 */
2334 void
2335 vn_recycle(vnode_t *vp)
2336 {
2337 ASSERT(vp->v_pages == NULL);
2338
2339 /*
2340 * XXX - This really belongs in vn_reinit(), but we have some issues
2341 * with the counts. Best to have it here for clean initialization.
2342 */
2343 vp->v_rdcnt = 0;
2344 vp->v_wrcnt = 0;
2345 vp->v_mmap_read = 0;
2346 vp->v_mmap_write = 0;
2347
2348 /*
2349 * If FEM was in use, make sure everything gets cleaned up
2350 * NOTE: vp->v_femhead is initialized to NULL in the vnode
2351 * constructor.
2352 */
2353 if (vp->v_femhead) {
2354 /* XXX - There should be a free_femhead() that does all this */
2355 ASSERT(vp->v_femhead->femh_list == NULL);
2356 mutex_destroy(&vp->v_femhead->femh_lock);
2357 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2358 vp->v_femhead = NULL;
2359 }
2360 if (vp->v_path) {
2361 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2362 vp->v_path = NULL;
2363 }
2364
2365 if (vp->v_fopdata != NULL) {
2366 free_fopdata(vp);
2367 }
2368 vp->v_mpssdata = NULL;
2369 vsd_free(vp);
2370 }
2371
2372 /*
2373 * Used to reset the vnode fields including those that are directly accessible
2374 * as well as those which require an accessor function.
2375 *
2376 * Does not initialize:
2377 * synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
2378 * v_data (since FS-nodes and vnodes point to each other and should
2379 * be updated simultaneously)
2380 * v_op (in case someone needs to make a VOP call on this object)
2381 */
2382 void
2383 vn_reinit(vnode_t *vp)
2414 vp->v_fopdata = NULL;
2415 vn_reinit(vp);
2416 }
2417
2418 return (vp);
2419 }
2420
2421 void
2422 vn_free(vnode_t *vp)
2423 {
2424 ASSERT(vp->v_shrlocks == NULL);
2425 ASSERT(vp->v_filocks == NULL);
2426
2427 /*
2428 * Some file systems call vn_free() with v_count of zero,
2429 * some with v_count of 1. In any case, the value should
2430 * never be anything else.
2431 */
2432 ASSERT((vp->v_count == 0) || (vp->v_count == 1));
2433 ASSERT(vp->v_count_dnlc == 0);
2434 if (vp->v_path != NULL) {
2435 kmem_free(vp->v_path, strlen(vp->v_path) + 1);
2436 vp->v_path = NULL;
2437 }
2438
2439 /* If FEM was in use, make sure everything gets cleaned up */
2440 if (vp->v_femhead) {
2441 /* XXX - There should be a free_femhead() that does all this */
2442 ASSERT(vp->v_femhead->femh_list == NULL);
2443 mutex_destroy(&vp->v_femhead->femh_lock);
2444 kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
2445 vp->v_femhead = NULL;
2446 }
2447
2448 if (vp->v_fopdata != NULL) {
2449 free_fopdata(vp);
2450 }
2451 vp->v_mpssdata = NULL;
2452 vsd_free(vp);
2453 kmem_cache_free(vn_cache, vp);
2454 }
2455
2456 /*
2950 }
2951
2952 return ((loc != NULL) && (*loc == funcp));
2953 }
2954
2955 /*
2956 * fs_new_caller_id() needs to return a unique ID on a given local system.
2957 * The IDs do not need to survive across reboots. These are primarily
2958 * used so that (FEM) monitors can detect particular callers (such as
2959 * the NFS server) to a given vnode/vfs operation.
2960 */
2961 u_longlong_t
2962 fs_new_caller_id()
2963 {
2964 static uint64_t next_caller_id = 0LL; /* First call returns 1 */
2965
2966 return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
2967 }
2968
2969 /*
2970 * Given a starting vnode and a path, updates the path in the target vnode in
2971 * a safe manner. If the vnode already has path information embedded, then the
2972 * cached path is left untouched.
2973 */
2974
2975 size_t max_vnode_path = 4 * MAXPATHLEN;
2976
2977 void
2978 vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
2979 const char *path, size_t plen)
2980 {
2981 char *rpath;
2982 vnode_t *base;
2983 size_t rpathlen, rpathalloc;
2984 int doslash = 1;
2985
2986 if (*path == '/') {
2987 base = rootvp;
2988 path++;
2989 plen--;
2990 } else {
2991 base = startvp;
2992 }
2993
2994 /*
2995 * We cannot grab base->v_lock while we hold vp->v_lock because of
2996 * the potential for deadlock.
2997 */
2998 mutex_enter(&base->v_lock);
2999 if (base->v_path == NULL) {
3000 mutex_exit(&base->v_lock);
3001 return;
3002 }
3003
3004 rpathlen = strlen(base->v_path);
3005 rpathalloc = rpathlen + plen + 1;
3006 /* Avoid adding a slash if there's already one there */
3007 if (base->v_path[rpathlen-1] == '/')
3008 doslash = 0;
3009 else
3010 rpathalloc++;
3011
3012 /*
3013 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
3014 * so we must do this dance. If, by chance, something changes the path,
3015 * just give up since there is no real harm.
3016 */
3017 mutex_exit(&base->v_lock);
3018
3019 /* Paths should stay within reason */
3020 if (rpathalloc > max_vnode_path)
3021 return;
3022
3023 rpath = kmem_alloc(rpathalloc, KM_SLEEP);
3024
3025 mutex_enter(&base->v_lock);
3026 if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
3027 mutex_exit(&base->v_lock);
3028 kmem_free(rpath, rpathalloc);
3029 return;
3030 }
3031 bcopy(base->v_path, rpath, rpathlen);
3032 mutex_exit(&base->v_lock);
3033
3034 if (doslash)
3035 rpath[rpathlen++] = '/';
3036 bcopy(path, rpath + rpathlen, plen);
3037 rpath[rpathlen + plen] = '\0';
3038
3039 mutex_enter(&vp->v_lock);
3040 if (vp->v_path != NULL) {
3041 mutex_exit(&vp->v_lock);
3042 kmem_free(rpath, rpathalloc);
3043 } else {
3044 vp->v_path = rpath;
3045 mutex_exit(&vp->v_lock);
3046 }
3047 }
3048
3049 /*
3050 * Sets the path to the vnode to be the given string, regardless of current
3051 * context. The string must be a complete path from rootdir. This is only used
3052 * by fsop_root() for setting the path based on the mountpoint.
3053 */
3054 void
3055 vn_setpath_str(struct vnode *vp, const char *str, size_t len)
3056 {
3057 char *buf = kmem_alloc(len + 1, KM_SLEEP);
3058
3059 mutex_enter(&vp->v_lock);
3060 if (vp->v_path != NULL) {
3061 mutex_exit(&vp->v_lock);
3062 kmem_free(buf, len + 1);
3063 return;
3064 }
3065
3066 vp->v_path = buf;
3067 bcopy(str, vp->v_path, len);
3068 vp->v_path[len] = '\0';
3069
3070 mutex_exit(&vp->v_lock);
3071 }
3072
3073 /*
3074 * Called from within filesystem's vop_rename() to handle renames once the
3075 * target vnode is available.
3076 */
3077 void
3078 vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
3079 {
3080 char *tmp;
3081
3082 mutex_enter(&vp->v_lock);
3083 tmp = vp->v_path;
3084 vp->v_path = NULL;
3085 mutex_exit(&vp->v_lock);
3086 vn_setpath(rootdir, dvp, vp, nm, len);
3087 if (tmp != NULL)
3088 kmem_free(tmp, strlen(tmp) + 1);
3089 }
3090
3091 /*
3092 * Similar to vn_setpath_str(), this function sets the path of the destination
3093 * vnode to the be the same as the source vnode.
3094 */
3095 void
3096 vn_copypath(struct vnode *src, struct vnode *dst)
3097 {
3098 char *buf;
3099 int alloc;
3100
3101 mutex_enter(&src->v_lock);
3102 if (src->v_path == NULL) {
3103 mutex_exit(&src->v_lock);
3104 return;
3105 }
3106 alloc = strlen(src->v_path) + 1;
3107
3108 /* avoid kmem_alloc() with lock held */
3109 mutex_exit(&src->v_lock);
3110 buf = kmem_alloc(alloc, KM_SLEEP);
3111 mutex_enter(&src->v_lock);
3112 if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
3113 mutex_exit(&src->v_lock);
3114 kmem_free(buf, alloc);
3115 return;
3116 }
3117 bcopy(src->v_path, buf, alloc);
3118 mutex_exit(&src->v_lock);
3119
3120 mutex_enter(&dst->v_lock);
3121 if (dst->v_path != NULL) {
3122 mutex_exit(&dst->v_lock);
3123 kmem_free(buf, alloc);
3124 return;
3125 }
3126 dst->v_path = buf;
3127 mutex_exit(&dst->v_lock);
3128 }
3129
3130 /*
3131 * XXX Private interface for segvn routines that handle vnode
3132 * large page segments.
3133 *
3134 * return 1 if vp's file system VOP_PAGEIO() implementation
3135 * can be safely used instead of VOP_GETPAGE() for handling
3136 * pagefaults against regular non swap files. VOP_PAGEIO()
3137 * interface is considered safe here if its implementation
3138 * is very close to VOP_GETPAGE() implementation.
3139 * e.g. It zeroes out the part of the page beyond EOF. Doesn't
3140 * panic if there are file holes but instead returns an error.
3141 * Doesn't assume file won't be changed by user writes, etc.
3142 *
3143 * return 0 otherwise.
3144 *
3145 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
3146 */
3147 int
3148 vn_vmpss_usepageio(vnode_t *vp)
3149 {
3298 err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3299 len = resid_start - uiop->uio_resid;
3300
3301 VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3302
3303 if (start != 0) {
3304 mutex_enter(&zonep->zone_vfs_lock);
3305 zonep->zone_vfs_rwstats.reads++;
3306 zonep->zone_vfs_rwstats.nread += len;
3307 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3308 mutex_exit(&zonep->zone_vfs_lock);
3309
3310 lat = gethrtime() - start;
3311
3312 if (lat >= VOP_LATENCY_10MS) {
3313 if (lat < VOP_LATENCY_100MS)
3314 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3315 else if (lat < VOP_LATENCY_1S) {
3316 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3317 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3318 } else {
3319 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3320 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3321 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3322 }
3323 }
3324 }
3325
3326 return (err);
3327 }
3328
3329 int
3330 fop_write(
3331 vnode_t *vp,
3332 uio_t *uiop,
3333 int ioflag,
3334 cred_t *cr,
3335 caller_context_t *ct)
3336 {
3337 ssize_t resid_start = uiop->uio_resid;
3338 zone_t *zonep = curzone;
3339 zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3340
3341 hrtime_t start = 0, lat;
3361 err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3362 len = resid_start - uiop->uio_resid;
3363
3364 VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3365
3366 if (start != 0) {
3367 mutex_enter(&zonep->zone_vfs_lock);
3368 zonep->zone_vfs_rwstats.writes++;
3369 zonep->zone_vfs_rwstats.nwritten += len;
3370 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3371 mutex_exit(&zonep->zone_vfs_lock);
3372
3373 lat = gethrtime() - start;
3374
3375 if (lat >= VOP_LATENCY_10MS) {
3376 if (lat < VOP_LATENCY_100MS)
3377 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3378 else if (lat < VOP_LATENCY_1S) {
3379 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3380 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3381 } else {
3382 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3383 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3384 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3385 }
3386 }
3387 }
3388
3389 return (err);
3390 }
3391
3392 int
3393 fop_ioctl(
3394 vnode_t *vp,
3395 int cmd,
3396 intptr_t arg,
3397 int flag,
3398 cred_t *cr,
3399 int *rvalp,
3400 caller_context_t *ct)
3401 {
3402 int err;
3403
3404 VOPXID_MAP_CR(vp, cr);
3532 * If this file system doesn't support case-insensitive access
3533 * and said access is requested, fail quickly. It is required
3534 * that if the vfs supports case-insensitive lookup, it also
3535 * supports extended dirent flags.
3536 */
3537 if (flags & FIGNORECASE &&
3538 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3539 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3540 return (EINVAL);
3541
3542 VOPXID_MAP_CR(dvp, cr);
3543
3544 if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
3545 ret = xattr_dir_lookup(dvp, vpp, flags, cr);
3546 } else {
3547 ret = (*(dvp)->v_op->vop_lookup)
3548 (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
3549 }
3550 if (ret == 0 && *vpp) {
3551 VOPSTATS_UPDATE(*vpp, lookup);
3552 if ((*vpp)->v_path == NULL) {
3553 vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
3554 }
3555 }
3556
3557 return (ret);
3558 }
3559
3560 int
3561 fop_create(
3562 vnode_t *dvp,
3563 char *name,
3564 vattr_t *vap,
3565 vcexcl_t excl,
3566 int mode,
3567 vnode_t **vpp,
3568 cred_t *cr,
3569 int flags,
3570 caller_context_t *ct,
3571 vsecattr_t *vsecp) /* ACL to set during create */
3572 {
3573 int ret;
3574
3575 if (vsecp != NULL &&
3576 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3577 return (EINVAL);
3578 }
3579 /*
3580 * If this file system doesn't support case-insensitive access
3581 * and said access is requested, fail quickly.
3582 */
3583 if (flags & FIGNORECASE &&
3584 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3585 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3586 return (EINVAL);
3587
3588 VOPXID_MAP_CR(dvp, cr);
3589
3590 ret = (*(dvp)->v_op->vop_create)
3591 (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
3592 if (ret == 0 && *vpp) {
3593 VOPSTATS_UPDATE(*vpp, create);
3594 if ((*vpp)->v_path == NULL) {
3595 vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
3596 }
3597 }
3598
3599 return (ret);
3600 }
3601
3602 int
3603 fop_remove(
3604 vnode_t *dvp,
3605 char *nm,
3606 cred_t *cr,
3607 caller_context_t *ct,
3608 int flags)
3609 {
3610 int err;
3611
3612 /*
3613 * If this file system doesn't support case-insensitive access
3614 * and said access is requested, fail quickly.
3615 */
3616 if (flags & FIGNORECASE &&
3617 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3696
3697 if (vsecp != NULL &&
3698 vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
3699 return (EINVAL);
3700 }
3701 /*
3702 * If this file system doesn't support case-insensitive access
3703 * and said access is requested, fail quickly.
3704 */
3705 if (flags & FIGNORECASE &&
3706 (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
3707 vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
3708 return (EINVAL);
3709
3710 VOPXID_MAP_CR(dvp, cr);
3711
3712 ret = (*(dvp)->v_op->vop_mkdir)
3713 (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
3714 if (ret == 0 && *vpp) {
3715 VOPSTATS_UPDATE(*vpp, mkdir);
3716 if ((*vpp)->v_path == NULL) {
3717 vn_setpath(rootdir, dvp, *vpp, dirname,
3718 strlen(dirname));
3719 }
3720 }
3721
3722 return (ret);
3723 }
3724
3725 int
3726 fop_rmdir(
3727 vnode_t *dvp,
3728 char *nm,
3729 vnode_t *cdir,
3730 cred_t *cr,
3731 caller_context_t *ct,
3732 int flags)
3733 {
3734 int err;
3735
3736 /*
3737 * If this file system doesn't support case-insensitive access
3738 * and said access is requested, fail quickly.
3739 */
3740 if (flags & FIGNORECASE &&
|