8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()
Reviewed by: Arne Jansen <arne@die-jansens.de>
Reviewed by: Vitaliy Gusev <gusev.vitaliy@icloud.com>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
--- old/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2016 STRATO AG. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 28 */
29 29
30 30 /*
31 31 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
32 32 * Use is subject to license terms.
33 33 */
34 34
35 35 /*
36 36 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37 37 * All Rights Reserved
38 38 */
39 39
40 40 /*
41 41 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
42 42 */
43 43
44 44 #include <sys/param.h>
45 45 #include <sys/types.h>
46 46 #include <sys/systm.h>
47 47 #include <sys/cred.h>
48 48 #include <sys/time.h>
49 49 #include <sys/vnode.h>
50 50 #include <sys/vfs.h>
51 51 #include <sys/vfs_opreg.h>
52 52 #include <sys/file.h>
53 53 #include <sys/filio.h>
54 54 #include <sys/uio.h>
55 55 #include <sys/buf.h>
56 56 #include <sys/mman.h>
57 57 #include <sys/pathname.h>
58 58 #include <sys/dirent.h>
59 59 #include <sys/debug.h>
60 60 #include <sys/vmsystm.h>
61 61 #include <sys/fcntl.h>
62 62 #include <sys/flock.h>
63 63 #include <sys/swap.h>
64 64 #include <sys/errno.h>
65 65 #include <sys/strsubr.h>
66 66 #include <sys/sysmacros.h>
67 67 #include <sys/kmem.h>
68 68 #include <sys/cmn_err.h>
69 69 #include <sys/pathconf.h>
70 70 #include <sys/utsname.h>
71 71 #include <sys/dnlc.h>
72 72 #include <sys/acl.h>
73 73 #include <sys/systeminfo.h>
74 74 #include <sys/policy.h>
75 75 #include <sys/sdt.h>
76 76 #include <sys/list.h>
77 77 #include <sys/stat.h>
78 78 #include <sys/zone.h>
79 79
80 80 #include <rpc/types.h>
81 81 #include <rpc/auth.h>
82 82 #include <rpc/clnt.h>
83 83
84 84 #include <nfs/nfs.h>
85 85 #include <nfs/nfs_clnt.h>
86 86 #include <nfs/nfs_acl.h>
87 87 #include <nfs/lm.h>
88 88 #include <nfs/nfs4.h>
89 89 #include <nfs/nfs4_kprot.h>
90 90 #include <nfs/rnode4.h>
91 91 #include <nfs/nfs4_clnt.h>
92 92
93 93 #include <vm/hat.h>
94 94 #include <vm/as.h>
95 95 #include <vm/page.h>
96 96 #include <vm/pvn.h>
97 97 #include <vm/seg.h>
98 98 #include <vm/seg_map.h>
99 99 #include <vm/seg_kpm.h>
100 100 #include <vm/seg_vn.h>
101 101
102 102 #include <fs/fs_subr.h>
103 103
104 104 #include <sys/ddi.h>
105 105 #include <sys/int_fmtio.h>
106 106 #include <sys/fs/autofs.h>
107 107
108 108 typedef struct {
109 109 nfs4_ga_res_t *di_garp;
110 110 cred_t *di_cred;
111 111 hrtime_t di_time_call;
112 112 } dirattr_info_t;
113 113
114 114 typedef enum nfs4_acl_op {
115 115 NFS4_ACL_GET,
116 116 NFS4_ACL_SET
117 117 } nfs4_acl_op_t;
118 118
119 119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120 120
121 121 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122 122 char *, dirattr_info_t *);
123 123
124 124 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125 125 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126 126 nfs4_error_t *, int *);
127 127 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128 128 cred_t *);
129 129 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130 130 stable_how4 *);
131 131 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132 132 cred_t *, bool_t, struct uio *);
133 133 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134 134 vsecattr_t *);
135 135 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136 136 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137 137 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138 138 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139 139 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140 140 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141 141 int, vnode_t **, cred_t *);
142 142 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143 143 cred_t *, int, int, enum createmode4, int);
144 144 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145 145 caller_context_t *);
146 146 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147 147 vnode_t *, char *, cred_t *, nfsstat4 *);
148 148 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149 149 vnode_t *, char *, cred_t *, nfsstat4 *);
150 150 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151 151 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152 152 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153 153 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154 154 page_t *[], size_t, struct seg *, caddr_t,
155 155 enum seg_rw, cred_t *);
156 156 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157 157 cred_t *);
158 158 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159 159 int, cred_t *);
160 160 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161 161 int, cred_t *);
162 162 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163 163 static void nfs4_set_mod(vnode_t *);
164 164 static void nfs4_get_commit(vnode_t *);
165 165 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166 166 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167 167 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168 168 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169 169 cred_t *);
170 170 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171 171 cred_t *);
172 172 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173 173 hrtime_t, vnode_t *, cred_t *);
174 174 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175 175 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176 176 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177 177 u_offset_t);
178 178 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179 179 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
180 180 static cred_t *state_to_cred(nfs4_open_stream_t *);
181 181 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182 182 static pid_t lo_to_pid(lock_owner4 *);
183 183 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184 184 cred_t *, nfs4_lock_owner_t *);
185 185 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186 186 nfs4_lock_owner_t *);
187 187 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188 188 static void nfs4_delmap_callback(struct as *, void *, uint_t);
189 189 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
190 190 static nfs4_delmapcall_t *nfs4_init_delmapcall();
191 191 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192 192 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193 193 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194 194 uid_t, gid_t, int);
195 195
196 196 /*
197 197 * Routines that implement the setting of v4 args for the misc. ops
198 198 */
199 199 static void nfs4args_lock_free(nfs_argop4 *);
200 200 static void nfs4args_lockt_free(nfs_argop4 *);
201 201 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202 202 int, rnode4_t *, cred_t *, bitmap4, int *,
203 203 nfs4_stateid_types_t *);
204 204 static void nfs4args_setattr_free(nfs_argop4 *);
205 205 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206 206 bitmap4);
207 207 static void nfs4args_verify_free(nfs_argop4 *);
208 208 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209 209 WRITE4args **, nfs4_stateid_types_t *);
210 210
211 211 /*
212 212 * These are the vnode ops functions that implement the vnode interface to
213 213 * the networked file system. See more comments below at nfs4_vnodeops.
214 214 */
215 215 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216 216 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217 217 caller_context_t *);
218 218 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219 219 caller_context_t *);
220 220 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221 221 caller_context_t *);
222 222 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223 223 caller_context_t *);
224 224 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225 225 caller_context_t *);
226 226 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227 227 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228 228 caller_context_t *);
229 229 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230 230 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231 231 int, vnode_t **, cred_t *, int, caller_context_t *,
232 232 vsecattr_t *);
233 233 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234 234 int);
235 235 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236 236 caller_context_t *, int);
237 237 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238 238 caller_context_t *, int);
239 239 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240 240 cred_t *, caller_context_t *, int, vsecattr_t *);
241 241 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242 242 caller_context_t *, int);
243 243 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244 244 cred_t *, caller_context_t *, int);
245 245 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246 246 caller_context_t *, int);
247 247 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248 248 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249 249 page_t *[], size_t, struct seg *, caddr_t,
250 250 enum seg_rw, cred_t *, caller_context_t *);
251 251 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252 252 caller_context_t *);
253 253 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254 254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255 255 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256 256 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257 257 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258 258 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259 259 struct flk_callback *, cred_t *, caller_context_t *);
260 260 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261 261 cred_t *, caller_context_t *);
262 262 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263 263 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264 264 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265 265 cred_t *, caller_context_t *);
266 266 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267 267 caller_context_t *);
268 268 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269 269 caller_context_t *);
270 270 /*
271 271 * These vnode ops are required to be called from outside this source file,
272 272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273 273 * as static.
274 274 */
275 275 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276 276 caller_context_t *);
277 277 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278 278 int nfs4_lookup(vnode_t *, char *, vnode_t **,
279 279 struct pathname *, int, vnode_t *, cred_t *,
280 280 caller_context_t *, int *, pathname_t *);
281 281 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282 282 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
283 283 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284 284 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285 285 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286 286 caller_context_t *);
287 287 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288 288 caller_context_t *);
289 289 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290 290 caller_context_t *);
291 291
292 292 /*
293 293 * Used for nfs4_commit_vp() to indicate if we should
294 294 * wait on pending writes.
295 295 */
296 296 #define NFS4_WRITE_NOWAIT 0
297 297 #define NFS4_WRITE_WAIT 1
298 298
299 299 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */
300 300
301 301 /*
302 302 * Error flags used to pass information about certain special errors
303 303	 * that require special handling.
304 304 */
305 305 #define NFS_EOF -98
306 306 #define NFS_VERF_MISMATCH -97
307 307
308 308 /*
309 309	 * Flags used to differentiate which operation drove the
310 310 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
311 311 */
312 312 #define NFS4_CLOSE_OP 0x1
313 313 #define NFS4_DELMAP_OP 0x2
314 314 #define NFS4_INACTIVE_OP 0x3
315 315
316 316 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
317 317
318 318	/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bits */
319 319 #define ALIGN64(x, ptr, sz) \
320 320 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \
321 321 if (x) { \
322 322 x = sizeof (uint64_t) - (x); \
323 323 sz -= (x); \
324 324 ptr += (x); \
325 325 }
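/*
 * A minimal usage sketch (hypothetical caller, not from this file): align
 * a buffer pointer before storing 64-bit quantities through it.
 *
 *	int skew;
 *	ALIGN64(skew, ptr, sz);
 *
 * On return, ptr has been advanced to the next 64-bit boundary, sz has
 * been reduced by the bytes skipped, and skew holds that adjustment
 * (zero if ptr was already aligned).
 */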
326 326
327 327 #ifdef DEBUG
328 328 int nfs4_client_attr_debug = 0;
329 329 int nfs4_client_state_debug = 0;
330 330 int nfs4_client_shadow_debug = 0;
331 331 int nfs4_client_lock_debug = 0;
332 332 int nfs4_seqid_sync = 0;
333 333 int nfs4_client_map_debug = 0;
334 334 static int nfs4_pageio_debug = 0;
335 335 int nfs4_client_inactive_debug = 0;
336 336 int nfs4_client_recov_debug = 0;
337 337 int nfs4_client_failover_debug = 0;
338 338 int nfs4_client_call_debug = 0;
339 339 int nfs4_client_lookup_debug = 0;
340 340 int nfs4_client_zone_debug = 0;
341 341 int nfs4_lost_rqst_debug = 0;
342 342 int nfs4_rdattrerr_debug = 0;
343 343 int nfs4_open_stream_debug = 0;
344 344
345 345 int nfs4read_error_inject;
346 346
347 347 static int nfs4_create_misses = 0;
348 348
349 349 static int nfs4_readdir_cache_shorts = 0;
350 350 static int nfs4_readdir_readahead = 0;
351 351
352 352 static int nfs4_bio_do_stop = 0;
353 353
354 354 static int nfs4_lostpage = 0; /* number of times we lost original page */
355 355
356 356 int nfs4_mmap_debug = 0;
357 357
358 358 static int nfs4_pathconf_cache_hits = 0;
359 359 static int nfs4_pathconf_cache_misses = 0;
360 360
361 361 int nfs4close_all_cnt;
362 362 int nfs4close_one_debug = 0;
363 363 int nfs4close_notw_debug = 0;
364 364
365 365 int denied_to_flk_debug = 0;
366 366 void *lockt_denied_debug;
367 367
368 368 #endif
369 369
370 370 /*
371 371 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
372 372 * or NFS4ERR_RESOURCE.
373 373 */
374 374 static int confirm_retry_sec = 30;
375 375
376 376 static int nfs4_lookup_neg_cache = 1;
377 377
378 378 /*
379 379	 * Number of pages to read ahead,
380 380	 * optimized for 100Base-T.
381 381 */
382 382 static int nfs4_nra = 4;
383 383
384 384 static int nfs4_do_symlink_cache = 1;
385 385
386 386 static int nfs4_pathconf_disable_cache = 0;
387 387
388 388 /*
389 389 * These are the vnode ops routines which implement the vnode interface to
390 390 * the networked file system. These routines just take their parameters,
391 391 * make them look networkish by putting the right info into interface structs,
392 392 * and then calling the appropriate remote routine(s) to do the work.
393 393 *
394 394	 * Note on directory name lookup caching: If we detect a stale fhandle,
395 395 * we purge the directory cache relative to that vnode. This way, the
396 396 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
397 397 * more details on rnode locking.
398 398 */
399 399
400 400 struct vnodeops *nfs4_vnodeops;
401 401
402 402 const fs_operation_def_t nfs4_vnodeops_template[] = {
403 403 VOPNAME_OPEN, { .vop_open = nfs4_open },
404 404 VOPNAME_CLOSE, { .vop_close = nfs4_close },
405 405 VOPNAME_READ, { .vop_read = nfs4_read },
406 406 VOPNAME_WRITE, { .vop_write = nfs4_write },
407 407 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl },
408 408 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr },
409 409 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr },
410 410 VOPNAME_ACCESS, { .vop_access = nfs4_access },
411 411 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup },
412 412 VOPNAME_CREATE, { .vop_create = nfs4_create },
413 413 VOPNAME_REMOVE, { .vop_remove = nfs4_remove },
414 414 VOPNAME_LINK, { .vop_link = nfs4_link },
415 415 VOPNAME_RENAME, { .vop_rename = nfs4_rename },
416 416 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir },
417 417 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir },
418 418 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir },
419 419 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink },
420 420 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink },
421 421 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync },
422 422 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive },
423 423 VOPNAME_FID, { .vop_fid = nfs4_fid },
424 424 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock },
425 425 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock },
426 426 VOPNAME_SEEK, { .vop_seek = nfs4_seek },
427 427 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock },
428 428 VOPNAME_SPACE, { .vop_space = nfs4_space },
429 429 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp },
430 430 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage },
431 431 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage },
432 432 VOPNAME_MAP, { .vop_map = nfs4_map },
433 433 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap },
434 434 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap },
435 435 /* no separate nfs4_dump */
436 436 VOPNAME_DUMP, { .vop_dump = nfs_dump },
437 437 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf },
438 438 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio },
439 439 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose },
440 440 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr },
441 441 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr },
442 442 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock },
443 443 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
444 444 NULL, NULL
445 445 };
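/*
 * Not visible in this file: the template above is consumed by the VFS
 * framework to populate nfs4_vnodeops. A sketch of the registration,
 * assuming the stock illumos vn_make_ops() interface (the actual call
 * lives in the nfs4 module init code):
 *
 *	(void) vn_make_ops("nfs4", nfs4_vnodeops_template, &nfs4_vnodeops);
 */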
446 446
447 447 /*
448 448 * The following are subroutines and definitions to set args or get res
449 449 * for the different nfsv4 ops
450 450 */
451 451
452 452 void
453 453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454 454 {
455 455 int i;
456 456
457 457 for (i = 0; i < arglen; i++) {
458 458 if (argop[i].argop == OP_LOOKUP) {
459 459 kmem_free(
460 460 argop[i].nfs_argop4_u.oplookup.
461 461 objname.utf8string_val,
462 462 argop[i].nfs_argop4_u.oplookup.
463 463 objname.utf8string_len);
464 464 }
465 465 }
466 466 }
467 467
468 468 static void
469 469 nfs4args_lock_free(nfs_argop4 *argop)
470 470 {
471 471 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472 472
473 473 if (locker->new_lock_owner == TRUE) {
474 474 open_to_lock_owner4 *open_owner;
475 475
476 476 open_owner = &locker->locker4_u.open_owner;
477 477 if (open_owner->lock_owner.owner_val != NULL) {
478 478 kmem_free(open_owner->lock_owner.owner_val,
479 479 open_owner->lock_owner.owner_len);
480 480 }
481 481 }
482 482 }
483 483
484 484 static void
485 485 nfs4args_lockt_free(nfs_argop4 *argop)
486 486 {
487 487 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488 488
489 489 if (lowner->owner_val != NULL) {
490 490 kmem_free(lowner->owner_val, lowner->owner_len);
491 491 }
492 492 }
493 493
494 494 static void
495 495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496 496 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497 497 nfs4_stateid_types_t *sid_types)
498 498 {
499 499 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500 500 mntinfo4_t *mi;
501 501
502 502 argop->argop = OP_SETATTR;
503 503 /*
504 504 * The stateid is set to 0 if client is not modifying the size
505 505 * and otherwise to whatever nfs4_get_stateid() returns.
506 506 *
507 507 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508 508 * state struct could be found for the process/file pair. We may
509 509 * want to change this in the future (by OPENing the file). See
510 510 * bug # 4474852.
511 511 */
512 512 if (vap->va_mask & AT_SIZE) {
513 513
514 514 ASSERT(rp != NULL);
515 515 mi = VTOMI4(RTOV4(rp));
516 516
517 517 argop->nfs_argop4_u.opsetattr.stateid =
518 518 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519 519 OP_SETATTR, sid_types, FALSE);
520 520 } else {
521 521 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522 522 sizeof (stateid4));
523 523 }
524 524
525 525 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526 526 if (*error)
527 527 bzero(attr, sizeof (*attr));
528 528 }
529 529
530 530 static void
531 531 nfs4args_setattr_free(nfs_argop4 *argop)
532 532 {
533 533 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
534 534 }
535 535
536 536 static int
537 537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538 538 bitmap4 supp)
539 539 {
540 540 fattr4 *attr;
541 541 int error = 0;
542 542
543 543 argop->argop = op;
544 544 switch (op) {
545 545 case OP_VERIFY:
546 546 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547 547 break;
548 548 case OP_NVERIFY:
549 549 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550 550 break;
551 551 default:
552 552 return (EINVAL);
553 553 }
554 554 if (!error)
555 555 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556 556 if (error)
557 557 bzero(attr, sizeof (*attr));
558 558 return (error);
559 559 }
560 560
561 561 static void
562 562 nfs4args_verify_free(nfs_argop4 *argop)
563 563 {
564 564 switch (argop->argop) {
565 565 case OP_VERIFY:
566 566 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567 567 break;
568 568 case OP_NVERIFY:
569 569 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570 570 break;
571 571 default:
572 572 break;
573 573 }
574 574 }
575 575
576 576 static void
577 577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578 578 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579 579 {
580 580 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581 581 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582 582
583 583 argop->argop = OP_WRITE;
584 584 wargs->stable = stable;
585 585 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586 586 mi, OP_WRITE, sid_tp);
587 587 wargs->mblk = NULL;
588 588 *wargs_pp = wargs;
589 589 }
590 590
591 591 void
592 592 nfs4args_copen_free(OPEN4cargs *open_args)
593 593 {
594 594 if (open_args->owner.owner_val) {
595 595 kmem_free(open_args->owner.owner_val,
596 596 open_args->owner.owner_len);
597 597 }
598 598 if ((open_args->opentype == OPEN4_CREATE) &&
599 599 (open_args->mode != EXCLUSIVE4)) {
600 600 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601 601 }
602 602 }
603 603
604 604 /*
605 605 * XXX: This is referenced in modstubs.s
606 606 */
607 607 struct vnodeops *
608 608 nfs4_getvnodeops(void)
609 609 {
610 610 return (nfs4_vnodeops);
611 611 }
612 612
613 613 /*
614 614 * The OPEN operation opens a regular file.
615 615 */
616 616 /*ARGSUSED3*/
617 617 static int
618 618 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
619 619 {
620 620 vnode_t *dvp = NULL;
621 621 rnode4_t *rp, *drp;
622 622 int error;
623 623 int just_been_created;
624 624 char fn[MAXNAMELEN];
625 625
626 626 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
627 627 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
628 628 return (EIO);
629 629 rp = VTOR4(*vpp);
630 630
631 631 /*
632 632 * Check to see if opening something besides a regular file;
633 633 * if so skip the OTW call
634 634 */
635 635 if ((*vpp)->v_type != VREG) {
636 636 error = nfs4_open_non_reg_file(vpp, flag, cr);
637 637 return (error);
638 638 }
639 639
640 640 /*
641 641 * XXX - would like a check right here to know if the file is
642 642 * executable or not, so as to skip OTW
643 643 */
644 644
645 645 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
646 646 return (error);
647 647
648 648 drp = VTOR4(dvp);
649 649 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
650 650 return (EINTR);
651 651
652 652 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
653 653 nfs_rw_exit(&drp->r_rwlock);
654 654 return (error);
655 655 }
656 656
657 657 /*
658 658 * See if this file has just been CREATEd.
659 659 * If so, clear the flag and update the dnlc, which was previously
660 660 * skipped in nfs4_create.
661 661	 * XXX need better serialization on this.
662 662	 * XXX move this into the nfs4open_otw call, after we have
663 663 * XXX acquired the open owner seqid sync.
664 664 */
665 665 mutex_enter(&rp->r_statev4_lock);
666 666 if (rp->created_v4) {
667 667 rp->created_v4 = 0;
668 668 mutex_exit(&rp->r_statev4_lock);
669 669
670 670 dnlc_update(dvp, fn, *vpp);
671 671 /* This is needed so we don't bump the open ref count */
672 672 just_been_created = 1;
673 673 } else {
674 674 mutex_exit(&rp->r_statev4_lock);
675 675 just_been_created = 0;
676 676 }
677 677
678 678 /*
679 679 * If caller specified O_TRUNC/FTRUNC, then be sure to set
680 680 * FWRITE (to drive successful setattr(size=0) after open)
681 681 */
682 682 if (flag & FTRUNC)
683 683 flag |= FWRITE;
684 684
685 685 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
686 686 just_been_created);
687 687
688 688 if (!error && !((*vpp)->v_flag & VROOT))
689 689 dnlc_update(dvp, fn, *vpp);
690 690
691 691 nfs_rw_exit(&drp->r_rwlock);
692 692
693 693 /* release the hold from vtodv */
694 694 VN_RELE(dvp);
695 695
696 696 /* exchange the shadow for the master vnode, if needed */
697 697
698 698 if (error == 0 && IS_SHADOW(*vpp, rp))
699 699 sv_exchange(vpp);
700 700
701 701 return (error);
702 702 }
703 703
704 704 /*
705 705 * See if there's a "lost open" request to be saved and recovered.
706 706 */
707 707 static void
708 708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709 709 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710 710 vnode_t *dvp, OPEN4cargs *open_args)
711 711 {
712 712 vfs_t *vfsp;
713 713 char *srccfp;
714 714
715 715 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716 716
717 717 if (error != ETIMEDOUT && error != EINTR &&
718 718 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719 719 lost_rqstp->lr_op = 0;
720 720 return;
721 721 }
722 722
723 723 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724 724 "nfs4open_save_lost_rqst: error %d", error));
725 725
726 726 lost_rqstp->lr_op = OP_OPEN;
727 727
728 728 /*
729 729 * The vp (if it is not NULL) and dvp are held and rele'd via
730 730 * the recovery code. See nfs4_save_lost_rqst.
731 731 */
732 732 lost_rqstp->lr_vp = vp;
733 733 lost_rqstp->lr_dvp = dvp;
734 734 lost_rqstp->lr_oop = oop;
735 735 lost_rqstp->lr_osp = NULL;
736 736 lost_rqstp->lr_lop = NULL;
737 737 lost_rqstp->lr_cr = cr;
738 738 lost_rqstp->lr_flk = NULL;
739 739 lost_rqstp->lr_oacc = open_args->share_access;
740 740 lost_rqstp->lr_odeny = open_args->share_deny;
741 741 lost_rqstp->lr_oclaim = open_args->claim;
742 742 if (open_args->claim == CLAIM_DELEGATE_CUR) {
743 743 lost_rqstp->lr_ostateid =
744 744 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745 745 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746 746 } else {
747 747 srccfp = open_args->open_claim4_u.cfile;
748 748 }
749 749 lost_rqstp->lr_ofile.utf8string_len = 0;
750 750 lost_rqstp->lr_ofile.utf8string_val = NULL;
751 751 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752 752 lost_rqstp->lr_putfirst = FALSE;
753 753 }
754 754
755 755 struct nfs4_excl_time {
756 756 uint32 seconds;
757 757 uint32 nseconds;
758 758 };
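/*
 * The two 32-bit fields above are copied bit-for-bit into the 64-bit
 * exclusive-create verifier when nfs4open_otw() builds the OPEN args:
 *
 *	open_args->createhow4_u.createverf = *(uint64_t *)&verf;
 */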
759 759
760 760 /*
761 761 * The OPEN operation creates and/or opens a regular file
762 762 *
763 763 * ARGSUSED
764 764 */
765 765 static int
766 766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767 767 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768 768 enum createmode4 createmode, int file_just_been_created)
769 769 {
770 770 rnode4_t *rp;
771 771 rnode4_t *drp = VTOR4(dvp);
772 772 vnode_t *vp = NULL;
773 773 vnode_t *vpi = *vpp;
774 774 bool_t needrecov = FALSE;
775 775
776 776 int doqueue = 1;
777 777
778 778 COMPOUND4args_clnt args;
779 779 COMPOUND4res_clnt res;
780 780 nfs_argop4 *argop;
781 781 nfs_resop4 *resop;
782 782 int argoplist_size;
783 783 int idx_open, idx_fattr;
784 784
785 785 GETFH4res *gf_res = NULL;
786 786 OPEN4res *op_res = NULL;
787 787 nfs4_ga_res_t *garp;
788 788 fattr4 *attr = NULL;
789 789 struct nfs4_excl_time verf;
790 790 bool_t did_excl_setup = FALSE;
791 791 int created_osp;
792 792
793 793 OPEN4cargs *open_args;
794 794 nfs4_open_owner_t *oop = NULL;
795 795 nfs4_open_stream_t *osp = NULL;
796 796 seqid4 seqid = 0;
797 797 bool_t retry_open = FALSE;
798 798 nfs4_recov_state_t recov_state;
799 799 nfs4_lost_rqst_t lost_rqst;
800 800 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801 801 hrtime_t t;
802 802 int acc = 0;
803 803 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
804 804 cred_t *ncr = NULL;
805 805
806 806 nfs4_sharedfh_t *otw_sfh;
807 807 nfs4_sharedfh_t *orig_sfh;
808 808 int fh_differs = 0;
809 809 int numops, setgid_flag;
810 810 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811 811
812 812 /*
813 813 * Make sure we properly deal with setting the right gid on
814 814 * a newly created file to reflect the parent's setgid bit
815 815 */
816 816 setgid_flag = 0;
817 817 if (create_flag && in_va) {
818 818
819 819 /*
820 820 * If there is grpid mount flag used or
821 821 * the parent's directory has the setgid bit set
822 822 * _and_ the client was able to get a valid mapping
823 823 * for the parent dir's owner_group, we want to
824 824 * append NVERIFY(owner_group == dva.va_gid) and
825 825 * SETATTR to the CREATE compound.
826 826 */
827 827 mutex_enter(&drp->r_statelock);
828 828 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829 829 drp->r_attr.va_mode & VSGID) &&
830 830 drp->r_attr.va_gid != GID_NOBODY) {
831 831 in_va->va_mask |= AT_GID;
832 832 in_va->va_gid = drp->r_attr.va_gid;
833 833 setgid_flag = 1;
834 834 }
835 835 mutex_exit(&drp->r_statelock);
836 836 }
837 837
838 838 /*
839 839 * Normal/non-create compound:
840 840 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841 841 *
842 842 * Open(create) compound no setgid:
843 843 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844 844 * RESTOREFH + GETATTR
845 845 *
846 846 * Open(create) setgid:
847 847 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848 848 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849 849 * NVERIFY(grp) + SETATTR
850 850 */
851 851 if (setgid_flag) {
852 852 numops = 10;
853 853 idx_open = 1;
854 854 idx_fattr = 3;
855 855 } else if (create_flag) {
856 856 numops = 7;
857 857 idx_open = 2;
858 858 idx_fattr = 4;
859 859 } else {
860 860 numops = 4;
861 861 idx_open = 1;
862 862 idx_fattr = 3;
863 863 }
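/*
 * For illustration, the argop[] layouts these indices produce (derived
 * from the op setup below):
 *
 *	non-create:	0:PUTFH 1:OPEN 2:GETFH 3:GETATTR
 *	create:		0:PUTFH 1:SAVEFH 2:OPEN 3:GETFH 4:GETATTR
 *			5:RESTOREFH 6:GETATTR
 *	create+setgid:	0:PUTFH 1:OPEN 2:GETFH 3:GETATTR 4:SAVEFH
 *			5:PUTFH 6:GETATTR 7:RESTOREFH 8:NVERIFY 9:SETATTR
 */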
864 864
865 865 args.array_len = numops;
866 866 argoplist_size = numops * sizeof (nfs_argop4);
867 867 argop = kmem_alloc(argoplist_size, KM_SLEEP);
868 868
869 869 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870 870 "open %s open flag 0x%x cred %p", file_name, open_flag,
871 871 (void *)cr));
872 872
873 873 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874 874 if (create_flag) {
875 875 /*
876 876 * We are to create a file. Initialize the passed in vnode
877 877 * pointer.
878 878 */
879 879 vpi = NULL;
880 880 } else {
881 881 /*
882 882 * Check to see if the client owns a read delegation and is
883 883 * trying to open for write. If so, then return the delegation
884 884 * to avoid the server doing a cb_recall and returning DELAY.
885 885 * NB - we don't use the statev4_lock here because we'd have
886 886 * to drop the lock anyway and the result would be stale.
887 887 */
888 888 if ((open_flag & FWRITE) &&
889 889 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890 890 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891 891
892 892 /*
893 893 * If the file has a delegation, then do an access check up
894 894	 * front. This avoids having to do an access check later after
895 895 * we've already done start_op, which could deadlock.
896 896 */
897 897 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898 898 if (open_flag & FREAD &&
899 899 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900 900 acc |= VREAD;
901 901 if (open_flag & FWRITE &&
902 902 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903 903 acc |= VWRITE;
904 904 }
905 905 }
906 906
907 907 drp = VTOR4(dvp);
908 908
909 909 recov_state.rs_flags = 0;
910 910 recov_state.rs_num_retry_despite_err = 0;
911 911 cred_otw = cr;
912 912
913 913 recov_retry:
914 914 fh_differs = 0;
915 915 nfs4_error_zinit(&e);
916 916
917 917 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918 918 if (e.error) {
919 919 if (ncr != NULL)
920 920 crfree(ncr);
921 921 kmem_free(argop, argoplist_size);
922 922 return (e.error);
923 923 }
924 924
925 925 args.ctag = TAG_OPEN;
926 926 args.array_len = numops;
927 927 args.array = argop;
928 928
929 929 /* putfh directory fh */
930 930 argop[0].argop = OP_CPUTFH;
931 931 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932 932
933 933 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934 934 argop[idx_open].argop = OP_COPEN;
935 935 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936 936 open_args->claim = CLAIM_NULL;
937 937
938 938 /* name of file */
939 939 open_args->open_claim4_u.cfile = file_name;
940 940 open_args->owner.owner_len = 0;
941 941 open_args->owner.owner_val = NULL;
942 942
943 943 if (create_flag) {
944 944 /* CREATE a file */
945 945 open_args->opentype = OPEN4_CREATE;
946 946 open_args->mode = createmode;
947 947 if (createmode == EXCLUSIVE4) {
948 948 if (did_excl_setup == FALSE) {
949 949 verf.seconds = zone_get_hostid(NULL);
950 950 if (verf.seconds != 0)
951 951 verf.nseconds = newnum();
952 952 else {
953 953 timestruc_t now;
954 954
955 955 gethrestime(&now);
956 956 verf.seconds = now.tv_sec;
957 957 verf.nseconds = now.tv_nsec;
958 958 }
959 959 /*
960 960 * Since the server will use this value for the
961 961 * mtime, make sure that it can't overflow. Zero
962 962 * out the MSB. The actual value does not matter
963 963	 * here, only its uniqueness.
964 964 */
965 965 verf.seconds &= INT32_MAX;
966 966 did_excl_setup = TRUE;
967 967 }
968 968
969 969 /* Now copy over verifier to OPEN4args. */
970 970 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971 971 } else {
972 972 int v_error;
973 973 bitmap4 supp_attrs;
974 974 servinfo4_t *svp;
975 975
976 976 attr = &open_args->createhow4_u.createattrs;
977 977
978 978 svp = drp->r_server;
979 979 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980 980 supp_attrs = svp->sv_supp_attrs;
981 981 nfs_rw_exit(&svp->sv_lock);
982 982
983 983 /* GUARDED4 or UNCHECKED4 */
984 984 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985 985 supp_attrs);
986 986 if (v_error) {
987 987 bzero(attr, sizeof (*attr));
988 988 nfs4args_copen_free(open_args);
989 989 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990 990 &recov_state, FALSE);
991 991 if (ncr != NULL)
992 992 crfree(ncr);
993 993 kmem_free(argop, argoplist_size);
994 994 return (v_error);
995 995 }
996 996 }
997 997 } else {
998 998 /* NO CREATE */
999 999 open_args->opentype = OPEN4_NOCREATE;
1000 1000 }
1001 1001
1002 1002 if (recov_state.rs_sp != NULL) {
1003 1003 mutex_enter(&recov_state.rs_sp->s_lock);
1004 1004 open_args->owner.clientid = recov_state.rs_sp->clientid;
1005 1005 mutex_exit(&recov_state.rs_sp->s_lock);
1006 1006 } else {
1007 1007 /* XXX should we just fail here? */
1008 1008 open_args->owner.clientid = 0;
1009 1009 }
1010 1010
1011 1011 /*
1012 1012 * This increments oop's ref count or creates a temporary 'just_created'
1013 1013 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014 1014 * completes.
1015 1015 */
1016 1016 mutex_enter(&VTOMI4(dvp)->mi_lock);
1017 1017
1018 1018 /* See if a permanent or just created open owner exists */
1019 1019 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020 1020 if (!oop) {
1021 1021 /*
1022 1022 * This open owner does not exist so create a temporary
1023 1023 * just created one.
1024 1024 */
1025 1025 oop = create_open_owner(cr, VTOMI4(dvp));
1026 1026 ASSERT(oop != NULL);
1027 1027 }
1028 1028 mutex_exit(&VTOMI4(dvp)->mi_lock);
1029 1029
1030 1030 /* this length never changes, do alloc before seqid sync */
1031 1031 open_args->owner.owner_len = sizeof (oop->oo_name);
1032 1032 open_args->owner.owner_val =
1033 1033 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034 1034
1035 1035 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036 1036 if (e.error == EAGAIN) {
1037 1037 open_owner_rele(oop);
1038 1038 nfs4args_copen_free(open_args);
1039 1039 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040 1040 if (ncr != NULL) {
1041 1041 crfree(ncr);
1042 1042 ncr = NULL;
1043 1043 }
1044 1044 goto recov_retry;
1045 1045 }
1046 1046
1047 1047 /* Check to see if we need to do the OTW call */
1048 1048 if (!create_flag) {
1049 1049 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050 1050 file_just_been_created, &e.error, acc, &recov_state)) {
1051 1051
1052 1052 /*
1053 1053 * The OTW open is not necessary. Either
1054 1054 * the open can succeed without it (eg.
1055 1055 * delegation, error == 0) or the open
1056 1056 * must fail due to an access failure
1057 1057 * (error != 0). In either case, tidy
1058 1058 * up and return.
1059 1059 */
1060 1060
1061 1061 nfs4_end_open_seqid_sync(oop);
1062 1062 open_owner_rele(oop);
1063 1063 nfs4args_copen_free(open_args);
1064 1064 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065 1065 if (ncr != NULL)
1066 1066 crfree(ncr);
1067 1067 kmem_free(argop, argoplist_size);
1068 1068 return (e.error);
1069 1069 }
1070 1070 }
1071 1071
1072 1072 bcopy(&oop->oo_name, open_args->owner.owner_val,
1073 1073 open_args->owner.owner_len);
1074 1074
1075 1075 seqid = nfs4_get_open_seqid(oop) + 1;
1076 1076 open_args->seqid = seqid;
1077 1077 open_args->share_access = 0;
1078 1078 if (open_flag & FREAD)
1079 1079 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080 1080 if (open_flag & FWRITE)
1081 1081 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082 1082 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083 1083
1084 1084
1085 1085
1086 1086 /*
1087 1087 * getfh w/sanity check for idx_open/idx_fattr
1088 1088 */
1089 1089 ASSERT((idx_open + 1) == (idx_fattr - 1));
1090 1090 argop[idx_open + 1].argop = OP_GETFH;
1091 1091
1092 1092 /* getattr */
1093 1093 argop[idx_fattr].argop = OP_GETATTR;
1094 1094 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095 1095 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096 1096
1097 1097 if (setgid_flag) {
1098 1098 vattr_t _v;
1099 1099 servinfo4_t *svp;
1100 1100 bitmap4 supp_attrs;
1101 1101
1102 1102 svp = drp->r_server;
1103 1103 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104 1104 supp_attrs = svp->sv_supp_attrs;
1105 1105 nfs_rw_exit(&svp->sv_lock);
1106 1106
1107 1107 /*
1108 1108 * For setgid case, we need to:
1109 1109 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110 1110 */
1111 1111 argop[4].argop = OP_SAVEFH;
1112 1112
1113 1113 argop[5].argop = OP_CPUTFH;
1114 1114 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115 1115
1116 1116 argop[6].argop = OP_GETATTR;
1117 1117 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118 1118 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119 1119
1120 1120 argop[7].argop = OP_RESTOREFH;
1121 1121
1122 1122 /*
1123 1123 * nverify
1124 1124 */
1125 1125 _v.va_mask = AT_GID;
1126 1126 _v.va_gid = in_va->va_gid;
1127 1127 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128 1128 supp_attrs))) {
1129 1129
1130 1130 /*
1131 1131 * setattr
1132 1132 *
1133 1133 * We _know_ we're not messing with AT_SIZE or
1134 1134 * AT_XTIME, so no need for stateid or flags.
1135 1135 * Also we specify NULL rp since we're only
1136 1136 * interested in setting owner_group attributes.
1137 1137 */
1138 1138 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139 1139 supp_attrs, &e.error, 0);
1140 1140 if (e.error)
1141 1141 nfs4args_verify_free(&argop[8]);
1142 1142 }
1143 1143
1144 1144 if (e.error) {
1145 1145 /*
1146 1146 * XXX - Revisit the last argument to nfs4_end_op()
1147 1147 * once 5020486 is fixed.
1148 1148 */
1149 1149 nfs4_end_open_seqid_sync(oop);
1150 1150 open_owner_rele(oop);
1151 1151 nfs4args_copen_free(open_args);
1152 1152 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153 1153 if (ncr != NULL)
1154 1154 crfree(ncr);
1155 1155 kmem_free(argop, argoplist_size);
1156 1156 return (e.error);
1157 1157 }
1158 1158 } else if (create_flag) {
1159 1159 argop[1].argop = OP_SAVEFH;
1160 1160
1161 1161 argop[5].argop = OP_RESTOREFH;
1162 1162
1163 1163 argop[6].argop = OP_GETATTR;
1164 1164 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165 1165 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166 1166 }
1167 1167
1168 1168 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169 1169 "nfs4open_otw: %s call, nm %s, rp %s",
1170 1170 needrecov ? "recov" : "first", file_name,
1171 1171 rnode4info(VTOR4(dvp))));
1172 1172
1173 1173 t = gethrtime();
1174 1174
1175 1175 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176 1176
1177 1177 if (!e.error && nfs4_need_to_bump_seqid(&res))
1178 1178 nfs4_set_open_seqid(seqid, oop, args.ctag);
1179 1179
1180 1180 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181 1181
1182 1182 if (e.error || needrecov) {
1183 1183 bool_t abort = FALSE;
1184 1184
1185 1185 if (needrecov) {
1186 1186 nfs4_bseqid_entry_t *bsep = NULL;
1187 1187
1188 1188 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189 1189 cred_otw, vpi, dvp, open_args);
1190 1190
1191 1191 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192 1192 bsep = nfs4_create_bseqid_entry(oop, NULL,
1193 1193 vpi, 0, args.ctag, open_args->seqid);
1194 1194 num_bseqid_retry--;
1195 1195 }
1196 1196
1197 1197 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198 1198 NULL, lost_rqst.lr_op == OP_OPEN ?
1199 1199 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200 1200
1201 1201 if (bsep)
1202 1202 kmem_free(bsep, sizeof (*bsep));
1203 1203 /* give up if we keep getting BAD_SEQID */
1204 1204 if (num_bseqid_retry == 0)
1205 1205 abort = TRUE;
1206 1206 if (abort == TRUE && e.error == 0)
1207 1207 e.error = geterrno4(res.status);
1208 1208 }
1209 1209 nfs4_end_open_seqid_sync(oop);
1210 1210 open_owner_rele(oop);
1211 1211 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212 1212 nfs4args_copen_free(open_args);
1213 1213 if (setgid_flag) {
1214 1214 nfs4args_verify_free(&argop[8]);
1215 1215 nfs4args_setattr_free(&argop[9]);
1216 1216 }
1217 1217 if (!e.error)
1218 1218 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219 1219 if (ncr != NULL) {
1220 1220 crfree(ncr);
1221 1221 ncr = NULL;
1222 1222 }
1223 1223 if (!needrecov || abort == TRUE || e.error == EINTR ||
1224 1224 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225 1225 kmem_free(argop, argoplist_size);
1226 1226 return (e.error);
1227 1227 }
1228 1228 goto recov_retry;
1229 1229 }
1230 1230
1231 1231 /*
1232 1232 * Will check and update lease after checking the rflag for
1233 1233 * OPEN_CONFIRM in the successful OPEN call.
1234 1234 */
1235 1235 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236 1236
1237 1237 /*
1238 1238 * XXX what if we're crossing mount points from server1:/drp
1239 1239 * to server2:/drp/rp.
1240 1240 */
1241 1241
1242 1242 /* Signal our end of use of the open seqid */
1243 1243 nfs4_end_open_seqid_sync(oop);
1244 1244
1245 1245 /*
1246 1246 * This will destroy the open owner if it was just created,
1247 1247 * and no one else has put a reference on it.
1248 1248 */
1249 1249 open_owner_rele(oop);
1250 1250 if (create_flag && (createmode != EXCLUSIVE4) &&
1251 1251 res.status == NFS4ERR_BADOWNER)
1252 1252 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253 1253
1254 1254 e.error = geterrno4(res.status);
1255 1255 nfs4args_copen_free(open_args);
1256 1256 if (setgid_flag) {
1257 1257 nfs4args_verify_free(&argop[8]);
1258 1258 nfs4args_setattr_free(&argop[9]);
1259 1259 }
1260 1260 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261 1261 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262 1262 /*
1263 1263 * If the reply is NFS4ERR_ACCESS, it may be because
1264 1264 * we are root (no root net access). If the real uid
1265 1265 * is not root, then retry with the real uid instead.
1266 1266 */
1267 1267 if (ncr != NULL) {
1268 1268 crfree(ncr);
1269 1269 ncr = NULL;
1270 1270 }
1271 1271 if (res.status == NFS4ERR_ACCESS &&
1272 1272 (ncr = crnetadjust(cred_otw)) != NULL) {
1273 1273 cred_otw = ncr;
1274 1274 goto recov_retry;
1275 1275 }
1276 1276 kmem_free(argop, argoplist_size);
1277 1277 return (e.error);
1278 1278 }
1279 1279
1280 1280 resop = &res.array[idx_open]; /* open res */
1281 1281 op_res = &resop->nfs_resop4_u.opopen;
1282 1282
1283 1283 #ifdef DEBUG
1284 1284 /*
1285 1285 * verify attrset bitmap
1286 1286 */
1287 1287 if (create_flag &&
1288 1288 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289 1289 /* make sure attrset returned is what we asked for */
1290 1290 /* XXX Ignore this 'error' for now */
1291 1291 if (attr->attrmask != op_res->attrset)
1292 1292 /* EMPTY */;
1293 1293 }
1294 1294 #endif
1295 1295
1296 1296 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297 1297 mutex_enter(&VTOMI4(dvp)->mi_lock);
1298 1298 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299 1299 mutex_exit(&VTOMI4(dvp)->mi_lock);
1300 1300 }
1301 1301
1302 1302 resop = &res.array[idx_open + 1]; /* getfh res */
1303 1303 gf_res = &resop->nfs_resop4_u.opgetfh;
1304 1304
1305 1305 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306 1306
1307 1307 /*
1308 1308 * The open stateid has been updated on the server but not
1309 1309 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1310 1310 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311 1311 * WRITE call. That, however, will use the old stateid, so go ahead
1312 1312	 * and update the open stateid now, before any call to makenfs4node.
1313 1313 */
1314 1314 if (vpi) {
1315 1315 nfs4_open_stream_t *tmp_osp;
1316 1316 rnode4_t *tmp_rp = VTOR4(vpi);
1317 1317
1318 1318 tmp_osp = find_open_stream(oop, tmp_rp);
1319 1319 if (tmp_osp) {
1320 1320 tmp_osp->open_stateid = op_res->stateid;
1321 1321 mutex_exit(&tmp_osp->os_sync_lock);
1322 1322 open_stream_rele(tmp_osp, tmp_rp);
1323 1323 }
1324 1324
1325 1325 /*
1326 1326 * We must determine if the file handle given by the otw open
1327 1327 * is the same as the file handle which was passed in with
1328 1328 * *vpp. This case can be reached if the file we are trying
1329 1329 * to open has been removed and another file has been created
1330 1330 * having the same file name. The passed in vnode is released
1331 1331 * later.
1332 1332 */
1333 1333 orig_sfh = VTOR4(vpi)->r_fh;
1334 1334 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335 1335 }
1336 1336
1337 1337 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338 1338
1339 1339 if (create_flag || fh_differs) {
1340 1340 int rnode_err = 0;
1341 1341
1342 1342 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343 1343 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344 1344
1345 1345 if (e.error)
1346 1346 PURGE_ATTRCACHE4(vp);
1347 1347 /*
1348 1348 * For the newly created vp case, make sure the rnode
1349 1349 * isn't bad before using it.
1350 1350 */
1351 1351 mutex_enter(&(VTOR4(vp))->r_statelock);
1352 1352 if (VTOR4(vp)->r_flags & R4RECOVERR)
1353 1353 rnode_err = EIO;
1354 1354 mutex_exit(&(VTOR4(vp))->r_statelock);
1355 1355
1356 1356 if (rnode_err) {
1357 1357 nfs4_end_open_seqid_sync(oop);
1358 1358 nfs4args_copen_free(open_args);
1359 1359 if (setgid_flag) {
1360 1360 nfs4args_verify_free(&argop[8]);
1361 1361 nfs4args_setattr_free(&argop[9]);
1362 1362 }
1363 1363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364 1364 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365 1365 needrecov);
1366 1366 open_owner_rele(oop);
1367 1367 VN_RELE(vp);
1368 1368 if (ncr != NULL)
1369 1369 crfree(ncr);
1370 1370 sfh4_rele(&otw_sfh);
1371 1371 kmem_free(argop, argoplist_size);
1372 1372 return (EIO);
1373 1373 }
1374 1374 } else {
1375 1375 vp = vpi;
1376 1376 }
1377 1377 sfh4_rele(&otw_sfh);
1378 1378
1379 1379 /*
1380 1380 * It seems odd to get a full set of attrs and then not update
1381 1381 * the object's attrcache in the non-create case. Create case uses
1382 1382 * the attrs since makenfs4node checks to see if the attrs need to
1383 1383 * be updated (and then updates them). The non-create case should
1384 1384 * update attrs also.
1385 1385 */
1386 1386 if (! create_flag && ! fh_differs && !e.error) {
1387 1387 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388 1388 }
1389 1389
1390 1390 nfs4_error_zinit(&e);
1391 1391 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392 1392 /* This does not do recovery for vp explicitly. */
1393 1393 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394 1394 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395 1395
1396 1396 if (e.error || e.stat) {
1397 1397 nfs4_end_open_seqid_sync(oop);
1398 1398 nfs4args_copen_free(open_args);
1399 1399 if (setgid_flag) {
1400 1400 nfs4args_verify_free(&argop[8]);
1401 1401 nfs4args_setattr_free(&argop[9]);
1402 1402 }
1403 1403 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404 1404 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405 1405 needrecov);
1406 1406 open_owner_rele(oop);
1407 1407 if (create_flag || fh_differs) {
1408 1408 /* rele the makenfs4node */
1409 1409 VN_RELE(vp);
1410 1410 }
1411 1411 if (ncr != NULL) {
1412 1412 crfree(ncr);
1413 1413 ncr = NULL;
1414 1414 }
1415 1415 if (retry_open == TRUE) {
1416 1416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417 1417 "nfs4open_otw: retry the open since OPEN "
1418 1418 "CONFIRM failed with error %d stat %d",
1419 1419 e.error, e.stat));
1420 1420 if (create_flag && createmode == GUARDED4) {
1421 1421 NFS4_DEBUG(nfs4_client_recov_debug,
1422 1422 (CE_NOTE, "nfs4open_otw: switch "
1423 1423 "createmode from GUARDED4 to "
1424 1424 "UNCHECKED4"));
1425 1425 createmode = UNCHECKED4;
1426 1426 }
1427 1427 goto recov_retry;
1428 1428 }
1429 1429 if (!e.error) {
1430 1430 if (create_flag && (createmode != EXCLUSIVE4) &&
1431 1431 e.stat == NFS4ERR_BADOWNER)
1432 1432 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433 1433
1434 1434 e.error = geterrno4(e.stat);
1435 1435 }
1436 1436 kmem_free(argop, argoplist_size);
1437 1437 return (e.error);
1438 1438 }
1439 1439 }
1440 1440
1441 1441 rp = VTOR4(vp);
1442 1442
1443 1443 mutex_enter(&rp->r_statev4_lock);
1444 1444 if (create_flag)
1445 1445 rp->created_v4 = 1;
1446 1446 mutex_exit(&rp->r_statev4_lock);
1447 1447
1448 1448 mutex_enter(&oop->oo_lock);
1449 1449	/* Doesn't matter if 'oo_just_created' was already set; make it permanent */
1450 1450 oop->oo_just_created = NFS4_PERM_CREATED;
1451 1451 if (oop->oo_cred_otw)
1452 1452 crfree(oop->oo_cred_otw);
1453 1453 oop->oo_cred_otw = cred_otw;
1454 1454 crhold(oop->oo_cred_otw);
1455 1455 mutex_exit(&oop->oo_lock);
1456 1456
1457 1457 /* returns with 'os_sync_lock' held */
1458 1458 osp = find_or_create_open_stream(oop, rp, &created_osp);
1459 1459 if (!osp) {
1460 1460 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461 1461 "nfs4open_otw: failed to create an open stream"));
1462 1462 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463 1463 "signal our end of use of the open seqid"));
1464 1464
1465 1465 nfs4_end_open_seqid_sync(oop);
1466 1466 open_owner_rele(oop);
1467 1467 nfs4args_copen_free(open_args);
1468 1468 if (setgid_flag) {
1469 1469 nfs4args_verify_free(&argop[8]);
1470 1470 nfs4args_setattr_free(&argop[9]);
1471 1471 }
1472 1472 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473 1473 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474 1474 if (create_flag || fh_differs)
1475 1475 VN_RELE(vp);
1476 1476 if (ncr != NULL)
1477 1477 crfree(ncr);
1478 1478
1479 1479 kmem_free(argop, argoplist_size);
1480 1480 return (EINVAL);
1481 1481
1482 1482 }
1483 1483
1484 1484 osp->open_stateid = op_res->stateid;
1485 1485
1486 1486 if (open_flag & FREAD)
1487 1487 osp->os_share_acc_read++;
1488 1488 if (open_flag & FWRITE)
1489 1489 osp->os_share_acc_write++;
1490 1490 osp->os_share_deny_none++;
1491 1491
1492 1492 /*
1493 1493 * Need to reset this bitfield for the possible case where we were
1494 1494 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495 1495 * we could retry the CLOSE, OPENed the file again.
1496 1496 */
1497 1497 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498 1498 osp->os_final_close = 0;
1499 1499 osp->os_force_close = 0;
1500 1500 #ifdef DEBUG
1501 1501 if (osp->os_failed_reopen)
1502 1502 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503 1503 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504 1504 (void *)osp, (void *)cr, rnode4info(rp)));
1505 1505 #endif
1506 1506 osp->os_failed_reopen = 0;
1507 1507
1508 1508 mutex_exit(&osp->os_sync_lock);
1509 1509
1510 1510 nfs4_end_open_seqid_sync(oop);
1511 1511
1512 1512 if (created_osp && recov_state.rs_sp != NULL) {
1513 1513 mutex_enter(&recov_state.rs_sp->s_lock);
1514 1514 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515 1515 mutex_exit(&recov_state.rs_sp->s_lock);
1516 1516 }
1517 1517
1518 1518 /* get rid of our reference to find oop */
1519 1519 open_owner_rele(oop);
1520 1520
1521 1521 open_stream_rele(osp, rp);
1522 1522
1523 1523 /* accept delegation, if any */
1524 1524 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525 1525
1526 1526 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527 1527
1528 1528 if (createmode == EXCLUSIVE4 &&
1529 1529 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530 1530 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531 1531 " EXCLUSIVE4: sending a SETATTR"));
1532 1532 /*
1533 1533 * If doing an exclusive create, then generate
1534 1534 * a SETATTR to set the initial attributes.
1535 1535 * Try to set the mtime and the atime to the
1536 1536 * server's current time. It is somewhat
1537 1537 * expected that these fields will be used to
1538 1538 * store the exclusive create cookie. If not,
1539 1539 * server implementors will need to know that
1540 1540 * a SETATTR will follow an exclusive create
1541 1541 * and the cookie should be destroyed if
1542 1542 * appropriate.
1543 1543 *
1544 1544 * The AT_GID and AT_SIZE bits are turned off
1545 1545 * so that the SETATTR request will not attempt
1546 1546 * to process these. The gid will be set
1547 1547 * separately if appropriate. The size is turned
1548 1548 * off because it is assumed that a new file will
1549 1549 * be created empty and if the file wasn't empty,
1550 1550 * then the exclusive create will have failed
1551 1551 * because the file must have existed already.
1552 1552 * Therefore, no truncate operation is needed.
1553 1553 */
1554 1554 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555 1555 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556 1556
1557 1557 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558 1558 if (e.error) {
1559 1559 nfs4_error_t err;
1560 1560
1561 1561 /*
1562 1562 * Couldn't correct the attributes of
1563 1563 * the newly created file and the
1564 1564 * attributes are wrong. Remove the
1565 1565 * file and return an error to the
1566 1566 * application.
1567 1567 */
1568 1568 /* XXX will this take care of client state ? */
1569 1569 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570 1570 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571 1571 " remove file", e.error));
1572 1572
1573 1573 /*
1574 1574 * The file is currently open so try to close it first.
1575 1575 *
1576 1576 * If we do not close the file explicitly here then the
1577 1577 * VN_RELE() would do an (implicit and asynchronous)
1578 1578 * close for us. But such async close could race with
1579 1579 * the nfs4_remove() below. If the async close is
1580 1580 * slower than nfs4_remove() then nfs4_remove()
1581 1581 * wouldn't remove the file but rename it to .nfsXXXX
1582 1582 * instead.
1583 1583 */
1584 1584 nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585 1585 CLOSE_NORM, 0, 0, 0);
1586 1586 VN_RELE(vp);
1587 1587 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588 1588
1589 1589 /*
1590 1590 * Since we've reled the vnode and removed
1591 1591 * the file we now need to return the error.
1592 1592 * At this point we don't want to update the
1593 1593 * dircaches, call nfs4_waitfor_purge_complete
1594 1594 * or set vpp to vp so we need to skip these
1595 1595 * as well.
1596 1596 */
1597 1597 goto skip_update_dircaches;
1598 1598 }
1599 1599 }
1600 1600
1601 1601 /*
1602 1602 * If we created or found the correct vnode, due to create_flag or
1603 1603 * fh_differs being set, then update directory cache attribute, readdir
1604 1604 * and dnlc caches.
1605 1605 */
1606 1606 if (create_flag || fh_differs) {
1607 1607 dirattr_info_t dinfo, *dinfop;
1608 1608
1609 1609 /*
1610 1610 * Make sure getattr succeeded before using results.
1611 1611 * note: op 7 is getattr(dir) for both flavors of
1612 1612 * open(create).
1613 1613 */
1614 1614 if (create_flag && res.status == NFS4_OK) {
1615 1615 dinfo.di_time_call = t;
1616 1616 dinfo.di_cred = cr;
1617 1617 dinfo.di_garp =
1618 1618 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619 1619 dinfop = &dinfo;
1620 1620 } else {
1621 1621 dinfop = NULL;
1622 1622 }
1623 1623
1624 1624 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625 1625 dinfop);
1626 1626 }
1627 1627
1628 1628 /*
1629 1629 * If the page cache for this file was flushed from actions
1630 1630 * above, it was done asynchronously and if that is true,
1631 1631 * there is a need to wait here for it to complete. This must
1632 1632 * be done outside of start_fop/end_fop.
1633 1633 */
1634 1634 (void) nfs4_waitfor_purge_complete(vp);
1635 1635
1636 1636 /*
1637 1637 * It is implicit that we are in the open case (create_flag == 0) since
1638 1638 * fh_differs can only be set to a non-zero value in the open case.
1639 1639 */
1640 1640 if (fh_differs != 0 && vpi != NULL)
1641 1641 VN_RELE(vpi);
1642 1642
1643 1643 /*
1644 1644 * Be sure to set *vpp to the correct value before returning.
1645 1645 */
1646 1646 *vpp = vp;
1647 1647
1648 1648 skip_update_dircaches:
1649 1649
1650 1650 nfs4args_copen_free(open_args);
1651 1651 if (setgid_flag) {
1652 1652 nfs4args_verify_free(&argop[8]);
1653 1653 nfs4args_setattr_free(&argop[9]);
1654 1654 }
1655 1655 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656 1656
1657 1657 if (ncr)
1658 1658 crfree(ncr);
1659 1659 kmem_free(argop, argoplist_size);
1660 1660 return (e.error);
1661 1661 }
1662 1662
1663 1663 /*
1664 1664 * Reopen an open instance. cf. nfs4open_otw().
1665 1665 *
1666 1666 * Errors are returned by the nfs4_error_t parameter.
1667 1667 * - ep->error contains an errno value or zero.
1668 1668 * - if it is zero, ep->stat is set to an NFS status code, if any.
1669 1669 * If the file could not be reopened, but the caller should continue, the
1670 1670 * file is marked dead and no error values are returned. If the caller
1671 1671 * should stop recovering open files and start over, either the ep->error
1672 1672 * value or ep->stat will indicate an error (either something that requires
1673 1673 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1674 1674 * filehandles) may be handled silently by this routine.
1675 1675 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676 1676 * will be started, so the caller should not do it.
1677 1677 *
1678 1678 * Gotos:
1679 1679  *   - kill_file : reopen failed in such a fashion as to warrant marking the
1680 1680  *	file dead and setting the open stream's 'os_failed_reopen' to 1.  This
1681 1681  *	is for cases where recovery is not possible.
1682 1682 * - failed_reopen : same as above, except that the file has already been
1683 1683 * marked dead, so no need to do it again.
1684 1684 * - bailout : reopen failed but we are able to recover and retry the reopen -
1685 1685 * either within this function immediately or via the calling function.
1686 1686 */
1687 1687
1688 1688 void
1689 1689 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690 1690 open_claim_type4 claim, bool_t frc_use_claim_previous,
1691 1691 bool_t is_recov)
1692 1692 {
1693 1693 COMPOUND4args_clnt args;
1694 1694 COMPOUND4res_clnt res;
1695 1695 nfs_argop4 argop[4];
1696 1696 nfs_resop4 *resop;
1697 1697 OPEN4res *op_res = NULL;
1698 1698 OPEN4cargs *open_args;
1699 1699 GETFH4res *gf_res;
1700 1700 rnode4_t *rp = VTOR4(vp);
1701 1701 int doqueue = 1;
1702 1702 cred_t *cr = NULL, *cred_otw = NULL;
1703 1703 nfs4_open_owner_t *oop = NULL;
1704 1704 seqid4 seqid;
1705 1705 nfs4_ga_res_t *garp;
1706 1706 char fn[MAXNAMELEN];
1707 1707 nfs4_recov_state_t recov = {NULL, 0};
1708 1708 nfs4_lost_rqst_t lost_rqst;
1709 1709 mntinfo4_t *mi = VTOMI4(vp);
1710 1710 bool_t abort;
1711 1711 char *failed_msg = "";
1712 1712 int fh_different;
1713 1713 hrtime_t t;
1714 1714 nfs4_bseqid_entry_t *bsep = NULL;
1715 1715
1716 1716 ASSERT(nfs4_consistent_type(vp));
1717 1717 ASSERT(nfs_zone() == mi->mi_zone);
1718 1718
1719 1719 nfs4_error_zinit(ep);
1720 1720
1721 1721 /* this is the cred used to find the open owner */
1722 1722 cr = state_to_cred(osp);
1723 1723 if (cr == NULL) {
1724 1724 failed_msg = "Couldn't reopen: no cred";
1725 1725 goto kill_file;
1726 1726 }
1727 1727 /* use this cred for OTW operations */
1728 1728 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729 1729
1730 1730 top:
1731 1731 nfs4_error_zinit(ep);
1732 1732
1733 1733 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734 1734 /* File system has been unmounted, quit */
1735 1735 ep->error = EIO;
1736 1736 failed_msg = "Couldn't reopen: file system has been unmounted";
1737 1737 goto kill_file;
1738 1738 }
1739 1739
1740 1740 oop = osp->os_open_owner;
1741 1741
1742 1742 ASSERT(oop != NULL);
1743 1743 if (oop == NULL) { /* be defensive in non-DEBUG */
1744 1744 failed_msg = "can't reopen: no open owner";
1745 1745 goto kill_file;
1746 1746 }
1747 1747 open_owner_hold(oop);
1748 1748
1749 1749 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750 1750 if (ep->error) {
1751 1751 open_owner_rele(oop);
1752 1752 oop = NULL;
1753 1753 goto bailout;
1754 1754 }
1755 1755
1756 1756 /*
1757 1757 * If the rnode has a delegation and the delegation has been
1758 1758 * recovered and the server didn't request a recall and the caller
1759 1759 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760 1760 * recovery) and the rnode hasn't been marked dead, then install
1761 1761 * the delegation stateid in the open stream. Otherwise, proceed
1762 1762 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763 1763 */
1764 1764 mutex_enter(&rp->r_statev4_lock);
1765 1765 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766 1766 !rp->r_deleg_return_pending &&
1767 1767 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768 1768 !rp->r_deleg_needs_recall &&
1769 1769 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770 1770 !(rp->r_flags & R4RECOVERR)) {
1771 1771 mutex_enter(&osp->os_sync_lock);
1772 1772 osp->os_delegation = 1;
1773 1773 osp->open_stateid = rp->r_deleg_stateid;
1774 1774 mutex_exit(&osp->os_sync_lock);
1775 1775 mutex_exit(&rp->r_statev4_lock);
1776 1776 goto bailout;
1777 1777 }
1778 1778 mutex_exit(&rp->r_statev4_lock);
1779 1779
1780 1780 /*
1781 1781 * If the file failed recovery, just quit. This failure need not
1782 1782 * affect other reopens, so don't return an error.
1783 1783 */
1784 1784 mutex_enter(&rp->r_statelock);
1785 1785 if (rp->r_flags & R4RECOVERR) {
1786 1786 mutex_exit(&rp->r_statelock);
1787 1787 ep->error = 0;
1788 1788 goto failed_reopen;
1789 1789 }
1790 1790 mutex_exit(&rp->r_statelock);
1791 1791
1792 1792 /*
1793 1793 * argop is empty here
1794 1794 *
1795 1795 	 * PUTFH, OPEN, GETFH, GETATTR
1796 1796 */
1797 1797 args.ctag = TAG_REOPEN;
1798 1798 args.array_len = 4;
1799 1799 args.array = argop;
1800 1800
1801 1801 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802 1802 "nfs4_reopen: file is type %d, id %s",
1803 1803 vp->v_type, rnode4info(VTOR4(vp))));
1804 1804
1805 1805 argop[0].argop = OP_CPUTFH;
1806 1806
1807 1807 if (claim != CLAIM_PREVIOUS) {
1808 1808 /*
1809 1809 * if this is a file mount then
1810 1810 * use the mntinfo parentfh
1811 1811 */
1812 1812 argop[0].nfs_argop4_u.opcputfh.sfh =
1813 1813 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814 1814 VTOSV(vp)->sv_dfh;
1815 1815 } else {
1816 1816 /* putfh fh to reopen */
1817 1817 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818 1818 }
1819 1819
1820 1820 argop[1].argop = OP_COPEN;
1821 1821 open_args = &argop[1].nfs_argop4_u.opcopen;
1822 1822 open_args->claim = claim;
1823 1823
1824 1824 if (claim == CLAIM_NULL) {
1825 1825
1826 1826 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827 1827 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828 1828 "failed for vp 0x%p for CLAIM_NULL with %m",
1829 1829 (void *)vp);
1830 1830 failed_msg = "Couldn't reopen: vtoname failed for "
1831 1831 "CLAIM_NULL";
1832 1832 /* nothing allocated yet */
1833 1833 goto kill_file;
1834 1834 }
1835 1835
1836 1836 open_args->open_claim4_u.cfile = fn;
1837 1837 } else if (claim == CLAIM_PREVIOUS) {
1838 1838
1839 1839 /*
1840 1840 * We have two cases to deal with here:
1841 1841 * 1) We're being called to reopen files in order to satisfy
1842 1842 * a lock operation request which requires us to explicitly
1843 1843 * reopen files which were opened under a delegation. If
1844 1844 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1845 1845 * that case, frc_use_claim_previous is TRUE and we must
1846 1846 * use the rnode's current delegation type (r_deleg_type).
1847 1847 * 2) We're reopening files during some form of recovery.
1848 1848 * In this case, frc_use_claim_previous is FALSE and we
1849 1849 * use the delegation type appropriate for recovery
1850 1850 * (r_deleg_needs_recovery).
1851 1851 */
1852 1852 mutex_enter(&rp->r_statev4_lock);
1853 1853 open_args->open_claim4_u.delegate_type =
1854 1854 frc_use_claim_previous ?
1855 1855 rp->r_deleg_type :
1856 1856 rp->r_deleg_needs_recovery;
1857 1857 mutex_exit(&rp->r_statev4_lock);
1858 1858
1859 1859 } else if (claim == CLAIM_DELEGATE_CUR) {
1860 1860
1861 1861 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862 1862 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863 1863 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864 1864 "with %m", (void *)vp);
1865 1865 failed_msg = "Couldn't reopen: vtoname failed for "
1866 1866 "CLAIM_DELEGATE_CUR";
1867 1867 /* nothing allocated yet */
1868 1868 goto kill_file;
1869 1869 }
1870 1870
1871 1871 mutex_enter(&rp->r_statev4_lock);
1872 1872 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873 1873 rp->r_deleg_stateid;
1874 1874 mutex_exit(&rp->r_statev4_lock);
1875 1875
1876 1876 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877 1877 }
1878 1878 open_args->opentype = OPEN4_NOCREATE;
1879 1879 open_args->owner.clientid = mi2clientid(mi);
1880 1880 open_args->owner.owner_len = sizeof (oop->oo_name);
1881 1881 open_args->owner.owner_val =
1882 1882 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883 1883 bcopy(&oop->oo_name, open_args->owner.owner_val,
1884 1884 open_args->owner.owner_len);
1885 1885 open_args->share_access = 0;
1886 1886 open_args->share_deny = 0;
1887 1887
1888 1888 mutex_enter(&osp->os_sync_lock);
1889 1889 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890 1890 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891 1891 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892 1892 (void *)osp, (void *)rp, osp->os_share_acc_read,
1893 1893 osp->os_share_acc_write, osp->os_open_ref_count,
1894 1894 osp->os_mmap_read, osp->os_mmap_write, claim));
1895 1895
1896 1896 if (osp->os_share_acc_read || osp->os_mmap_read)
1897 1897 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898 1898 if (osp->os_share_acc_write || osp->os_mmap_write)
1899 1899 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900 1900 if (osp->os_share_deny_read)
1901 1901 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902 1902 if (osp->os_share_deny_write)
1903 1903 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904 1904 mutex_exit(&osp->os_sync_lock);
1905 1905
1906 1906 seqid = nfs4_get_open_seqid(oop) + 1;
1907 1907 open_args->seqid = seqid;
1908 1908
1909 1909 /* Construct the getfh part of the compound */
1910 1910 argop[2].argop = OP_GETFH;
1911 1911
1912 1912 /* Construct the getattr part of the compound */
1913 1913 argop[3].argop = OP_GETATTR;
1914 1914 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915 1915 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916 1916
1917 1917 t = gethrtime();
1918 1918
1919 1919 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920 1920
1921 1921 if (ep->error) {
1922 1922 if (!is_recov && !frc_use_claim_previous &&
1923 1923 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924 1924 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925 1925 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926 1926 cred_otw, vp, NULL, open_args);
1927 1927 abort = nfs4_start_recovery(ep,
1928 1928 VTOMI4(vp), vp, NULL, NULL,
1929 1929 lost_rqst.lr_op == OP_OPEN ?
1930 1930 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931 1931 nfs4args_copen_free(open_args);
1932 1932 goto bailout;
1933 1933 }
1934 1934
1935 1935 nfs4args_copen_free(open_args);
1936 1936
1937 1937 if (ep->error == EACCES && cred_otw != cr) {
1938 1938 crfree(cred_otw);
1939 1939 cred_otw = cr;
1940 1940 crhold(cred_otw);
1941 1941 nfs4_end_open_seqid_sync(oop);
1942 1942 open_owner_rele(oop);
1943 1943 oop = NULL;
1944 1944 goto top;
1945 1945 }
1946 1946 if (ep->error == ETIMEDOUT)
1947 1947 goto bailout;
1948 1948 failed_msg = "Couldn't reopen: rpc error";
1949 1949 goto kill_file;
1950 1950 }
1951 1951
1952 1952 if (nfs4_need_to_bump_seqid(&res))
1953 1953 nfs4_set_open_seqid(seqid, oop, args.ctag);
1954 1954
1955 1955 switch (res.status) {
1956 1956 case NFS4_OK:
1957 1957 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958 1958 mutex_enter(&rp->r_statelock);
1959 1959 rp->r_delay_interval = 0;
1960 1960 mutex_exit(&rp->r_statelock);
1961 1961 }
1962 1962 break;
1963 1963 case NFS4ERR_BAD_SEQID:
1964 1964 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965 1965 args.ctag, open_args->seqid);
1966 1966
1967 1967 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968 1968 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969 1969 NULL, OP_OPEN, bsep, NULL, NULL);
1970 1970
1971 1971 nfs4args_copen_free(open_args);
1972 1972 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973 1973 nfs4_end_open_seqid_sync(oop);
1974 1974 open_owner_rele(oop);
1975 1975 oop = NULL;
1976 1976 kmem_free(bsep, sizeof (*bsep));
1977 1977
1978 1978 goto kill_file;
1979 1979 case NFS4ERR_NO_GRACE:
1980 1980 nfs4args_copen_free(open_args);
1981 1981 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982 1982 nfs4_end_open_seqid_sync(oop);
1983 1983 open_owner_rele(oop);
1984 1984 oop = NULL;
1985 1985 if (claim == CLAIM_PREVIOUS) {
1986 1986 /*
1987 1987 * Retry as a plain open. We don't need to worry about
1988 1988 * checking the changeinfo: it is acceptable for a
1989 1989 * client to re-open a file and continue processing
1990 1990 * (in the absence of locks).
1991 1991 */
1992 1992 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993 1993 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994 1994 "will retry as CLAIM_NULL"));
1995 1995 claim = CLAIM_NULL;
1996 1996 nfs4_mi_kstat_inc_no_grace(mi);
1997 1997 goto top;
1998 1998 }
1999 1999 failed_msg =
2000 2000 "Couldn't reopen: tried reclaim outside grace period. ";
2001 2001 goto kill_file;
2002 2002 case NFS4ERR_GRACE:
2003 2003 nfs4_set_grace_wait(mi);
2004 2004 nfs4args_copen_free(open_args);
2005 2005 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 2006 nfs4_end_open_seqid_sync(oop);
2007 2007 open_owner_rele(oop);
2008 2008 oop = NULL;
2009 2009 ep->error = nfs4_wait_for_grace(mi, &recov);
2010 2010 if (ep->error != 0)
2011 2011 goto bailout;
2012 2012 goto top;
2013 2013 case NFS4ERR_DELAY:
2014 2014 nfs4_set_delay_wait(vp);
2015 2015 nfs4args_copen_free(open_args);
2016 2016 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 2017 nfs4_end_open_seqid_sync(oop);
2018 2018 open_owner_rele(oop);
2019 2019 oop = NULL;
2020 2020 ep->error = nfs4_wait_for_delay(vp, &recov);
2021 2021 nfs4_mi_kstat_inc_delay(mi);
2022 2022 if (ep->error != 0)
2023 2023 goto bailout;
2024 2024 goto top;
2025 2025 case NFS4ERR_FHEXPIRED:
2026 2026 /* recover filehandle and retry */
2027 2027 abort = nfs4_start_recovery(ep,
2028 2028 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029 2029 nfs4args_copen_free(open_args);
2030 2030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031 2031 nfs4_end_open_seqid_sync(oop);
2032 2032 open_owner_rele(oop);
2033 2033 oop = NULL;
2034 2034 if (abort == FALSE)
2035 2035 goto top;
2036 2036 failed_msg = "Couldn't reopen: recovery aborted";
2037 2037 goto kill_file;
2038 2038 case NFS4ERR_RESOURCE:
2039 2039 case NFS4ERR_STALE_CLIENTID:
2040 2040 case NFS4ERR_WRONGSEC:
2041 2041 case NFS4ERR_EXPIRED:
2042 2042 /*
2043 2043 * Do not mark the file dead and let the calling
2044 2044 * function initiate recovery.
2045 2045 */
2046 2046 nfs4args_copen_free(open_args);
2047 2047 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048 2048 nfs4_end_open_seqid_sync(oop);
2049 2049 open_owner_rele(oop);
2050 2050 oop = NULL;
2051 2051 goto bailout;
2052 2052 case NFS4ERR_ACCESS:
2053 2053 if (cred_otw != cr) {
2054 2054 crfree(cred_otw);
2055 2055 cred_otw = cr;
2056 2056 crhold(cred_otw);
2057 2057 nfs4args_copen_free(open_args);
2058 2058 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059 2059 nfs4_end_open_seqid_sync(oop);
2060 2060 open_owner_rele(oop);
2061 2061 oop = NULL;
2062 2062 goto top;
2063 2063 }
2064 2064 /* fall through */
2065 2065 default:
2066 2066 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067 2067 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068 2068 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069 2069 rnode4info(VTOR4(vp))));
2070 2070 failed_msg = "Couldn't reopen: NFSv4 error";
2071 2071 nfs4args_copen_free(open_args);
2072 2072 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073 2073 goto kill_file;
2074 2074 }
2075 2075
2076 2076 resop = &res.array[1]; /* open res */
2077 2077 op_res = &resop->nfs_resop4_u.opopen;
2078 2078
2079 2079 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080 2080
2081 2081 /*
2082 2082 * Check if the path we reopened really is the same
2083 2083 * file. We could end up in a situation where the file
2084 2084 * was removed and a new file created with the same name.
2085 2085 */
2086 2086 resop = &res.array[2];
2087 2087 gf_res = &resop->nfs_resop4_u.opgetfh;
2088 2088 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089 2089 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090 2090 if (fh_different) {
2091 2091 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092 2092 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093 2093 /* Oops, we don't have the same file */
2094 2094 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095 2095 failed_msg = "Couldn't reopen: Persistent "
2096 2096 "file handle changed";
2097 2097 else
2098 2098 failed_msg = "Couldn't reopen: Volatile "
2099 2099 "(no expire on open) file handle changed";
2100 2100
2101 2101 nfs4args_copen_free(open_args);
2102 2102 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103 2103 nfs_rw_exit(&mi->mi_fh_lock);
2104 2104 goto kill_file;
2105 2105
2106 2106 } else {
2107 2107 /*
2108 2108 * We have volatile file handles that don't compare.
2109 2109 * If the fids are the same then we assume that the
2110 2110 * file handle expired but the rnode still refers to
2111 2111 * the same file object.
2112 2112 *
2113 2113 			 * First check whether we have fids at all.
2114 2114 			 * If we don't, we have a dumb server, so we will
2115 2115 			 * just assume everything is OK for now.
2116 2116 */
2117 2117 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118 2118 rp->r_attr.va_mask & AT_NODEID &&
2119 2119 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120 2120 /*
2121 2121 * We have fids, but they don't
2122 2122 * compare. So kill the file.
2123 2123 */
2124 2124 failed_msg =
2125 2125 "Couldn't reopen: file handle changed"
2126 2126 " due to mismatched fids";
2127 2127 nfs4args_copen_free(open_args);
2128 2128 xdr_free(xdr_COMPOUND4res_clnt,
2129 2129 (caddr_t)&res);
2130 2130 nfs_rw_exit(&mi->mi_fh_lock);
2131 2131 goto kill_file;
2132 2132 } else {
2133 2133 				 * We have volatile file handles that refer
2134 2134 * We have volatile file handles that refers
2135 2135 * to the same file (at least they have the
2136 2136 * same fid) or we don't have fids so we
2137 2137 * can't tell. :(. We'll be a kind and accepting
2138 2138 * client so we'll update the rnode's file
2139 2139 * handle with the otw handle.
2140 2140 *
2141 2141 * We need to drop mi->mi_fh_lock since
2142 2142 				 * sfh4_update acquires it. Since there is
2143 2143 * only one recovery thread there is no
2144 2144 * race.
2145 2145 */
2146 2146 nfs_rw_exit(&mi->mi_fh_lock);
2147 2147 sfh4_update(rp->r_fh, &gf_res->object);
2148 2148 }
2149 2149 }
2150 2150 } else {
2151 2151 nfs_rw_exit(&mi->mi_fh_lock);
2152 2152 }
2153 2153
2154 2154 ASSERT(nfs4_consistent_type(vp));
2155 2155
2156 2156 /*
2157 2157 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158 2158 * over. Presumably if there is a persistent error it will show up
2159 2159 * when we resend the OPEN.
2160 2160 */
2161 2161 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162 2162 bool_t retry_open = FALSE;
2163 2163
2164 2164 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165 2165 cred_otw, is_recov, &retry_open,
2166 2166 oop, FALSE, ep, NULL);
2167 2167 if (ep->error || ep->stat) {
2168 2168 nfs4args_copen_free(open_args);
2169 2169 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170 2170 nfs4_end_open_seqid_sync(oop);
2171 2171 open_owner_rele(oop);
2172 2172 oop = NULL;
2173 2173 goto top;
2174 2174 }
2175 2175 }
2176 2176
2177 2177 mutex_enter(&osp->os_sync_lock);
2178 2178 osp->open_stateid = op_res->stateid;
2179 2179 osp->os_delegation = 0;
2180 2180 /*
2181 2181 * Need to reset this bitfield for the possible case where we were
2182 2182 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183 2183 * we could retry the CLOSE, OPENed the file again.
2184 2184 */
2185 2185 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186 2186 osp->os_final_close = 0;
2187 2187 osp->os_force_close = 0;
2188 2188 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189 2189 osp->os_dc_openacc = open_args->share_access;
2190 2190 mutex_exit(&osp->os_sync_lock);
2191 2191
2192 2192 nfs4_end_open_seqid_sync(oop);
2193 2193
2194 2194 /* accept delegation, if any */
2195 2195 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196 2196
2197 2197 nfs4args_copen_free(open_args);
2198 2198
2199 2199 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200 2200
2201 2201 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202 2202
2203 2203 ASSERT(nfs4_consistent_type(vp));
2204 2204
2205 2205 open_owner_rele(oop);
2206 2206 crfree(cr);
2207 2207 crfree(cred_otw);
2208 2208 return;
2209 2209
2210 2210 kill_file:
2211 2211 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212 2212 failed_reopen:
2213 2213 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214 2214 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215 2215 (void *)osp, (void *)cr, rnode4info(rp)));
2216 2216 mutex_enter(&osp->os_sync_lock);
2217 2217 osp->os_failed_reopen = 1;
2218 2218 mutex_exit(&osp->os_sync_lock);
2219 2219 bailout:
2220 2220 if (oop != NULL) {
2221 2221 nfs4_end_open_seqid_sync(oop);
2222 2222 open_owner_rele(oop);
2223 2223 }
2224 2224 if (cr != NULL)
2225 2225 crfree(cr);
2226 2226 if (cred_otw != NULL)
2227 2227 crfree(cred_otw);
2228 2228 }
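
A note on consuming the contract described in the block comment above nfs4_reopen(): ep->error == 0 with ep->stat == NFS4_OK can mean either that the reopen succeeded or that the file was marked dead and the caller should simply move on. A minimal, hypothetical caller-side sketch (the helper name and the FALSE/TRUE arguments are illustrative assumptions, not taken from this file):

	/*
	 * Hypothetical illustration only -- not part of nfs4_vnops.c.
	 * Shows how a recovery loop might interpret nfs4_reopen() results.
	 */
	static int
	example_reopen_one(vnode_t *vp, nfs4_open_stream_t *osp)
	{
		nfs4_error_t e;

		nfs4_error_zinit(&e);
		nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, TRUE);

		if (e.error == EINTR || e.error == ETIMEDOUT)
			return (e.error);	/* lost-state recovery already started */
		if (e.error != 0 || e.stat != NFS4_OK)
			return (EAGAIN);	/* stop and restart recovery */
		/*
		 * Zero error and stat: either the reopen worked or the file
		 * was marked dead; either way, continue with the next file.
		 */
		return (0);
	}
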
2229 2229
2230 2230 /* for . and .. OPENs */
2231 2231 /* ARGSUSED */
2232 2232 static int
2233 2233 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234 2234 {
2235 2235 rnode4_t *rp;
2236 2236 nfs4_ga_res_t gar;
2237 2237
2238 2238 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239 2239
2240 2240 /*
2241 2241 * If close-to-open consistency checking is turned off or
2242 2242 * if there is no cached data, we can avoid
2243 2243 * the over the wire getattr. Otherwise, force a
2244 2244 * call to the server to get fresh attributes and to
2245 2245 * check caches. This is required for close-to-open
2246 2246 * consistency.
2247 2247 */
2248 2248 rp = VTOR4(*vpp);
2249 2249 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250 2250 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251 2251 return (0);
2252 2252
2253 2253 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254 2254 }
2255 2255
2256 2256 /*
2257 2257 * CLOSE a file
2258 2258 */
2259 2259 /* ARGSUSED */
2260 2260 static int
2261 2261 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262 2262 caller_context_t *ct)
2263 2263 {
2264 2264 rnode4_t *rp;
2265 2265 int error = 0;
2266 2266 int r_error = 0;
2267 2267 int n4error = 0;
2268 2268 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2269 2269
2270 2270 /*
2271 2271 * Remove client state for this (lockowner, file) pair.
2272 2272 * Issue otw v4 call to have the server do the same.
2273 2273 */
2274 2274
2275 2275 rp = VTOR4(vp);
2276 2276
2277 2277 /*
2278 2278 * zone_enter(2) prevents processes from changing zones with NFS files
2279 2279 * open; if we happen to get here from the wrong zone we can't do
2280 2280 * anything over the wire.
2281 2281 */
2282 2282 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283 2283 /*
2284 2284 * We could attempt to clean up locks, except we're sure
2285 2285 * that the current process didn't acquire any locks on
2286 2286 		 * the file: any attempt to lock a file belonging to another zone
2287 2287 * will fail, and one can't lock an NFS file and then change
2288 2288 * zones, as that fails too.
2289 2289 *
2290 2290 * Returning an error here is the sane thing to do. A
2291 2291 * subsequent call to VN_RELE() which translates to a
2292 2292 * nfs4_inactive() will clean up state: if the zone of the
2293 2293 * vnode's origin is still alive and kicking, the inactive
2294 2294 * thread will handle the request (from the correct zone), and
2295 2295 * everything (minus the OTW close call) should be OK. If the
2296 2296 * zone is going away nfs4_async_inactive() will throw away
2297 2297 * delegations, open streams and cached pages inline.
2298 2298 */
2299 2299 return (EIO);
2300 2300 }
2301 2301
2302 2302 /*
2303 2303 * If we are using local locking for this filesystem, then
2304 2304 * release all of the SYSV style record locks. Otherwise,
2305 2305 * we are doing network locking and we need to release all
2306 2306 * of the network locks. All of the locks held by this
2307 2307 * process on this file are released no matter what the
2308 2308 * incoming reference count is.
2309 2309 */
2310 2310 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311 2311 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312 2312 cleanshares(vp, ttoproc(curthread)->p_pid);
2313 2313 } else
2314 2314 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315 2315
2316 2316 if (e.error) {
2317 2317 struct lm_sysid *lmsid;
2318 2318 lmsid = nfs4_find_sysid(VTOMI4(vp));
2319 2319 if (lmsid == NULL) {
2320 2320 DTRACE_PROBE2(unknown__sysid, int, e.error,
2321 2321 vnode_t *, vp);
2322 2322 } else {
2323 2323 cleanlocks(vp, ttoproc(curthread)->p_pid,
2324 2324 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325 2325
2326 2326 lm_rel_sysid(lmsid);
2327 2327 }
2328 2328 return (e.error);
2329 2329 }
2330 2330
2331 2331 if (count > 1)
2332 2332 return (0);
2333 2333
2334 2334 /*
2335 2335 * If the file has been `unlinked', then purge the
2336 2336 	 * DNLC so that this vnode will get recycled quicker
2337 2337 * and the .nfs* file on the server will get removed.
2338 2338 */
2339 2339 if (rp->r_unldvp != NULL)
2340 2340 dnlc_purge_vp(vp);
2341 2341
2342 2342 /*
2343 2343 * If the file was open for write and there are pages,
2344 2344 * do a synchronous flush and commit of all of the
2345 2345 * dirty and uncommitted pages.
2346 2346 */
2347 2347 ASSERT(!e.error);
2348 2348 if ((flag & FWRITE) && nfs4_has_pages(vp))
2349 2349 error = nfs4_putpage_commit(vp, 0, 0, cr);
2350 2350
2351 2351 mutex_enter(&rp->r_statelock);
2352 2352 r_error = rp->r_error;
2353 2353 rp->r_error = 0;
2354 2354 mutex_exit(&rp->r_statelock);
2355 2355
2356 2356 /*
2357 2357 * If this file type is one for which no explicit 'open' was
2358 2358 * done, then bail now (ie. no need for protocol 'close'). If
2359 2359 * there was an error w/the vm subsystem, return _that_ error,
2360 2360 * otherwise, return any errors that may've been reported via
2361 2361 * the rnode.
2362 2362 */
2363 2363 if (vp->v_type != VREG)
2364 2364 return (error ? error : r_error);
2365 2365
2366 2366 /*
2367 2367 * The sync putpage commit may have failed above, but since
2368 2368 * we're working w/a regular file, we need to do the protocol
2369 2369 * 'close' (nfs4close_one will figure out if an otw close is
2370 2370 * needed or not). Report any errors _after_ doing the protocol
2371 2371 * 'close'.
2372 2372 */
2373 2373 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374 2374 n4error = e.error ? e.error : geterrno4(e.stat);
2375 2375
2376 2376 /*
2377 2377 * Error reporting prio (Hi -> Lo)
2378 2378 *
2379 2379 * i) nfs4_putpage_commit (error)
2380 2380 * ii) rnode's (r_error)
2381 2381 * iii) nfs4close_one (n4error)
2382 2382 */
2383 2383 return (error ? error : (r_error ? r_error : n4error));
2384 2384 }
2385 2385
2386 2386 /*
2387 2387 * Initialize *lost_rqstp.
2388 2388 */
2389 2389
2390 2390 static void
2391 2391 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392 2392 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393 2393 vnode_t *vp)
2394 2394 {
2395 2395 if (error != ETIMEDOUT && error != EINTR &&
2396 2396 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397 2397 lost_rqstp->lr_op = 0;
2398 2398 return;
2399 2399 }
2400 2400
2401 2401 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402 2402 "nfs4close_save_lost_rqst: error %d", error));
2403 2403
2404 2404 lost_rqstp->lr_op = OP_CLOSE;
2405 2405 /*
2406 2406 * The vp is held and rele'd via the recovery code.
2407 2407 * See nfs4_save_lost_rqst.
2408 2408 */
2409 2409 lost_rqstp->lr_vp = vp;
2410 2410 lost_rqstp->lr_dvp = NULL;
2411 2411 lost_rqstp->lr_oop = oop;
2412 2412 lost_rqstp->lr_osp = osp;
2413 2413 ASSERT(osp != NULL);
2414 2414 ASSERT(mutex_owned(&osp->os_sync_lock));
2415 2415 osp->os_pending_close = 1;
2416 2416 lost_rqstp->lr_lop = NULL;
2417 2417 lost_rqstp->lr_cr = cr;
2418 2418 lost_rqstp->lr_flk = NULL;
2419 2419 lost_rqstp->lr_putfirst = FALSE;
2420 2420 }
2421 2421
2422 2422 /*
2423 2423 * Assumes you already have the open seqid sync grabbed as well as the
2424 2424 * 'os_sync_lock'. Note: this will release the open seqid sync and
2425 2425 * 'os_sync_lock' if client recovery starts. Calling functions have to
2426 2426 * be prepared to handle this.
2427 2427 *
2428 2428 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2429 2429 * was needed and was started, and that the calling function should retry
2430 2430 * this function; otherwise it is returned as 0.
2431 2431 *
2432 2432 * Errors are returned via the nfs4_error_t parameter.
2433 2433 */
2434 2434 static void
2435 2435 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2436 2436 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2437 2437 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2438 2438 {
2439 2439 COMPOUND4args_clnt args;
2440 2440 COMPOUND4res_clnt res;
2441 2441 CLOSE4args *close_args;
2442 2442 nfs_resop4 *resop;
2443 2443 nfs_argop4 argop[3];
2444 2444 int doqueue = 1;
2445 2445 mntinfo4_t *mi;
2446 2446 seqid4 seqid;
2447 2447 vnode_t *vp;
2448 2448 bool_t needrecov = FALSE;
2449 2449 nfs4_lost_rqst_t lost_rqst;
2450 2450 hrtime_t t;
2451 2451
2452 2452 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2453 2453
2454 2454 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2455 2455
2456 2456 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2457 2457
2458 2458 /* Only set this to 1 if recovery is started */
2459 2459 *recov = 0;
2460 2460
2461 2461 /* do the OTW call to close the file */
2462 2462
2463 2463 if (close_type == CLOSE_RESEND)
2464 2464 args.ctag = TAG_CLOSE_LOST;
2465 2465 else if (close_type == CLOSE_AFTER_RESEND)
2466 2466 args.ctag = TAG_CLOSE_UNDO;
2467 2467 else
2468 2468 args.ctag = TAG_CLOSE;
2469 2469
2470 2470 args.array_len = 3;
2471 2471 args.array = argop;
2472 2472
2473 2473 vp = RTOV4(rp);
2474 2474
2475 2475 mi = VTOMI4(vp);
2476 2476
2477 2477 /* putfh target fh */
2478 2478 argop[0].argop = OP_CPUTFH;
2479 2479 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2480 2480
2481 2481 argop[1].argop = OP_GETATTR;
2482 2482 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2483 2483 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2484 2484
2485 2485 argop[2].argop = OP_CLOSE;
2486 2486 close_args = &argop[2].nfs_argop4_u.opclose;
2487 2487
2488 2488 seqid = nfs4_get_open_seqid(oop) + 1;
2489 2489
2490 2490 close_args->seqid = seqid;
2491 2491 close_args->open_stateid = osp->open_stateid;
2492 2492
2493 2493 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2494 2494 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2495 2495 rnode4info(rp)));
2496 2496
2497 2497 t = gethrtime();
2498 2498
2499 2499 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2500 2500
2501 2501 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2502 2502 nfs4_set_open_seqid(seqid, oop, args.ctag);
2503 2503 }
2504 2504
2505 2505 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2506 2506 if (ep->error && !needrecov) {
2507 2507 /*
2508 2508 * if there was an error and no recovery is to be done
2509 2509 		 * then set up the file to flush its cache if
2510 2510 * needed for the next caller.
2511 2511 */
2512 2512 mutex_enter(&rp->r_statelock);
2513 2513 PURGE_ATTRCACHE4_LOCKED(rp);
2514 2514 rp->r_flags &= ~R4WRITEMODIFIED;
2515 2515 mutex_exit(&rp->r_statelock);
2516 2516 return;
2517 2517 }
2518 2518
2519 2519 if (needrecov) {
2520 2520 bool_t abort;
2521 2521 nfs4_bseqid_entry_t *bsep = NULL;
2522 2522
2523 2523 if (close_type != CLOSE_RESEND)
2524 2524 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2525 2525 osp, cred_otw, vp);
2526 2526
2527 2527 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2528 2528 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2529 2529 0, args.ctag, close_args->seqid);
2530 2530
2531 2531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2532 2532 "nfs4close_otw: initiating recovery. error %d "
2533 2533 "res.status %d", ep->error, res.status));
2534 2534
2535 2535 /*
2536 2536 * Drop the 'os_sync_lock' here so we don't hit
2537 2537 * a potential recursive mutex_enter via an
2538 2538 * 'open_stream_hold()'.
2539 2539 */
2540 2540 mutex_exit(&osp->os_sync_lock);
2541 2541 *have_sync_lockp = 0;
2542 2542 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2543 2543 (close_type != CLOSE_RESEND &&
2544 2544 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2545 2545 OP_CLOSE, bsep, NULL, NULL);
2546 2546
2547 2547 /* drop open seq sync, and let the calling function regrab it */
2548 2548 nfs4_end_open_seqid_sync(oop);
2549 2549 *did_start_seqid_syncp = 0;
2550 2550
2551 2551 if (bsep)
2552 2552 kmem_free(bsep, sizeof (*bsep));
2553 2553 /*
2554 2554 * For signals, the caller wants to quit, so don't say to
2555 2555 * retry. For forced unmount, if it's a user thread, it
2556 2556 * wants to quit. If it's a recovery thread, the retry
2557 2557 * will happen higher-up on the call stack. Either way,
2558 2558 * don't say to retry.
2559 2559 */
2560 2560 if (abort == FALSE && ep->error != EINTR &&
2561 2561 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2562 2562 close_type != CLOSE_RESEND &&
2563 2563 close_type != CLOSE_AFTER_RESEND)
2564 2564 *recov = 1;
2565 2565 else
2566 2566 *recov = 0;
2567 2567
2568 2568 if (!ep->error)
2569 2569 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2570 2570 return;
2571 2571 }
2572 2572
2573 2573 if (res.status) {
2574 2574 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2575 2575 return;
2576 2576 }
2577 2577
2578 2578 mutex_enter(&rp->r_statev4_lock);
2579 2579 rp->created_v4 = 0;
2580 2580 mutex_exit(&rp->r_statev4_lock);
2581 2581
2582 2582 resop = &res.array[2];
2583 2583 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2584 2584 osp->os_valid = 0;
2585 2585
2586 2586 /*
2587 2587 * This removes the reference obtained at OPEN; ie, when the
2588 2588 * open stream structure was created.
2589 2589 *
2590 2590 * We don't have to worry about calling 'open_stream_rele'
2591 2591 	 * since we are currently holding a reference to the open
2592 2592 * stream which means the count cannot go to 0 with this
2593 2593 * decrement.
2594 2594 */
2595 2595 ASSERT(osp->os_ref_count >= 2);
2596 2596 osp->os_ref_count--;
2597 2597
2598 2598 if (ep->error == 0) {
2599 - /*
2600 - * Avoid a deadlock with the r_serial thread waiting for
2601 - * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
2602 - * held by us. We will wait in nfs4_attr_cache() for the
2603 - * completion of the r_serial thread.
2604 - */
2605 2599 mutex_exit(&osp->os_sync_lock);
2606 2600 *have_sync_lockp = 0;
2607 2601
2608 2602 nfs4_attr_cache(vp,
2609 2603 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2610 2604 t, cred_otw, TRUE, NULL);
2611 2605 }
2612 2606
2613 2607 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2614 2608 " returning %d", ep->error));
2615 2609
2616 2610 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2617 2611 }
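
The locking contract in the block comment above nfs4close_otw() (open seqid sync and 'os_sync_lock' held on entry, possibly dropped on recovery) is easy to get wrong. A hedged sketch of the retry loop a caller such as nfs4close_one() might run; the setup of vp, rp, oop, osp, and cred_otw is assumed, not shown:

	/*
	 * Hypothetical caller retry loop for nfs4close_otw();
	 * illustrative only.
	 */
	for (;;) {
		int recov = 0, did_seqid_sync = 0, have_sync_lock = 0;
		nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

		if (nfs4_start_open_seqid_sync(oop, VTOMI4(vp)) != 0)
			break;			/* interrupted; give up */
		did_seqid_sync = 1;

		mutex_enter(&osp->os_sync_lock);
		have_sync_lock = 1;

		nfs4close_otw(rp, cred_otw, oop, osp, &recov,
		    &did_seqid_sync, CLOSE_NORM, &e, &have_sync_lock);

		/* nfs4close_otw() may have dropped either lock for us */
		if (have_sync_lock)
			mutex_exit(&osp->os_sync_lock);
		if (did_seqid_sync)
			nfs4_end_open_seqid_sync(oop);

		if (!recov)
			break;	/* done, or a non-retryable failure */
		/* recovery was started; retry the CLOSE from the top */
	}
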
2618 2612
2619 2613 /* ARGSUSED */
2620 2614 static int
2621 2615 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2622 2616 caller_context_t *ct)
2623 2617 {
2624 2618 rnode4_t *rp;
2625 2619 u_offset_t off;
2626 2620 offset_t diff;
2627 2621 uint_t on;
2628 2622 uint_t n;
2629 2623 caddr_t base;
2630 2624 uint_t flags;
2631 2625 int error;
2632 2626 mntinfo4_t *mi;
2633 2627
2634 2628 rp = VTOR4(vp);
2635 2629
2636 2630 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2637 2631
2638 2632 if (IS_SHADOW(vp, rp))
2639 2633 vp = RTOV4(rp);
2640 2634
2641 2635 if (vp->v_type != VREG)
2642 2636 return (EISDIR);
2643 2637
2644 2638 mi = VTOMI4(vp);
2645 2639
2646 2640 if (nfs_zone() != mi->mi_zone)
2647 2641 return (EIO);
2648 2642
2649 2643 if (uiop->uio_resid == 0)
2650 2644 return (0);
2651 2645
2652 2646 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2653 2647 return (EINVAL);
2654 2648
2655 2649 mutex_enter(&rp->r_statelock);
2656 2650 if (rp->r_flags & R4RECOVERRP)
2657 2651 error = (rp->r_error ? rp->r_error : EIO);
2658 2652 else
2659 2653 error = 0;
2660 2654 mutex_exit(&rp->r_statelock);
2661 2655 if (error)
2662 2656 return (error);
2663 2657
2664 2658 /*
2665 2659 * Bypass VM if caching has been disabled (e.g., locking) or if
2666 2660 * using client-side direct I/O and the file is not mmap'd and
2667 2661 * there are no cached pages.
2668 2662 */
2669 2663 if ((vp->v_flag & VNOCACHE) ||
2670 2664 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2671 2665 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2672 2666 size_t resid = 0;
2673 2667
2674 2668 return (nfs4read(vp, NULL, uiop->uio_loffset,
2675 2669 uiop->uio_resid, &resid, cr, FALSE, uiop));
2676 2670 }
2677 2671
2678 2672 error = 0;
2679 2673
2680 2674 do {
2681 2675 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2682 2676 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2683 2677 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2684 2678
2685 2679 if (error = nfs4_validate_caches(vp, cr))
2686 2680 break;
2687 2681
2688 2682 mutex_enter(&rp->r_statelock);
2689 2683 while (rp->r_flags & R4INCACHEPURGE) {
2690 2684 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2691 2685 mutex_exit(&rp->r_statelock);
2692 2686 return (EINTR);
2693 2687 }
2694 2688 }
2695 2689 diff = rp->r_size - uiop->uio_loffset;
2696 2690 mutex_exit(&rp->r_statelock);
2697 2691 if (diff <= 0)
2698 2692 break;
2699 2693 if (diff < n)
2700 2694 n = (uint_t)diff;
2701 2695
2702 2696 if (vpm_enable) {
2703 2697 /*
2704 2698 * Copy data.
2705 2699 */
2706 2700 error = vpm_data_copy(vp, off + on, n, uiop,
2707 2701 1, NULL, 0, S_READ);
2708 2702 } else {
2709 2703 base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2710 2704 S_READ);
2711 2705
2712 2706 error = uiomove(base + on, n, UIO_READ, uiop);
2713 2707 }
2714 2708
2715 2709 if (!error) {
2716 2710 /*
2717 2711 * If read a whole block or read to eof,
2718 2712 * won't need this buffer again soon.
2719 2713 */
2720 2714 mutex_enter(&rp->r_statelock);
2721 2715 if (n + on == MAXBSIZE ||
2722 2716 uiop->uio_loffset == rp->r_size)
2723 2717 flags = SM_DONTNEED;
2724 2718 else
2725 2719 flags = 0;
2726 2720 mutex_exit(&rp->r_statelock);
2727 2721 if (vpm_enable) {
2728 2722 error = vpm_sync_pages(vp, off, n, flags);
2729 2723 } else {
2730 2724 error = segmap_release(segkmap, base, flags);
2731 2725 }
2732 2726 } else {
2733 2727 if (vpm_enable) {
2734 2728 (void) vpm_sync_pages(vp, off, n, 0);
2735 2729 } else {
2736 2730 (void) segmap_release(segkmap, base, 0);
2737 2731 }
2738 2732 }
2739 2733 } while (!error && uiop->uio_resid > 0);
2740 2734
2741 2735 return (error);
2742 2736 }
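
The off/on/n arithmetic in the read loop above maps each pass onto one MAXBSIZE-aligned segmap window. A worked example, assuming MAXBSIZE is 8192 (so MAXBMASK == ~8191 and MAXBOFFSET == 8191):

	/*
	 * uio_loffset = 12000, uio_resid = 10000
	 * off = 12000 & MAXBMASK   = 8192	(window base)
	 * on  = 12000 & MAXBOFFSET = 3808	(offset within the window)
	 * n   = MIN(8192 - 3808, 10000) = 4384	(bytes moved this pass)
	 *
	 * The next pass starts at uio_loffset 16384, i.e., window
	 * aligned, so at most the first and last passes are partial.
	 */

The same computation appears in the nfs4_write() loop below.
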
2743 2737
2744 2738 /* ARGSUSED */
2745 2739 static int
2746 2740 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2747 2741 caller_context_t *ct)
2748 2742 {
2749 2743 rlim64_t limit = uiop->uio_llimit;
2750 2744 rnode4_t *rp;
2751 2745 u_offset_t off;
2752 2746 caddr_t base;
2753 2747 uint_t flags;
2754 2748 int remainder;
2755 2749 size_t n;
2756 2750 int on;
2757 2751 int error;
2758 2752 int resid;
2759 2753 u_offset_t offset;
2760 2754 mntinfo4_t *mi;
2761 2755 uint_t bsize;
2762 2756
2763 2757 rp = VTOR4(vp);
2764 2758
2765 2759 if (IS_SHADOW(vp, rp))
2766 2760 vp = RTOV4(rp);
2767 2761
2768 2762 if (vp->v_type != VREG)
2769 2763 return (EISDIR);
2770 2764
2771 2765 mi = VTOMI4(vp);
2772 2766
2773 2767 if (nfs_zone() != mi->mi_zone)
2774 2768 return (EIO);
2775 2769
2776 2770 if (uiop->uio_resid == 0)
2777 2771 return (0);
2778 2772
2779 2773 mutex_enter(&rp->r_statelock);
2780 2774 if (rp->r_flags & R4RECOVERRP)
2781 2775 error = (rp->r_error ? rp->r_error : EIO);
2782 2776 else
2783 2777 error = 0;
2784 2778 mutex_exit(&rp->r_statelock);
2785 2779 if (error)
2786 2780 return (error);
2787 2781
2788 2782 if (ioflag & FAPPEND) {
2789 2783 struct vattr va;
2790 2784
2791 2785 /*
2792 2786 * Must serialize if appending.
2793 2787 */
2794 2788 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2795 2789 nfs_rw_exit(&rp->r_rwlock);
2796 2790 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2797 2791 INTR4(vp)))
2798 2792 return (EINTR);
2799 2793 }
2800 2794
2801 2795 va.va_mask = AT_SIZE;
2802 2796 error = nfs4getattr(vp, &va, cr);
2803 2797 if (error)
2804 2798 return (error);
2805 2799 uiop->uio_loffset = va.va_size;
2806 2800 }
2807 2801
2808 2802 offset = uiop->uio_loffset + uiop->uio_resid;
2809 2803
2810 2804 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2811 2805 return (EINVAL);
2812 2806
2813 2807 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2814 2808 limit = MAXOFFSET_T;
2815 2809
2816 2810 /*
2817 2811 * Check to make sure that the process will not exceed
2818 2812 * its limit on file size. It is okay to write up to
2819 2813 * the limit, but not beyond. Thus, the write which
2820 2814 * reaches the limit will be short and the next write
2821 2815 * will return an error.
2822 2816 */
2823 2817 remainder = 0;
2824 2818 if (offset > uiop->uio_llimit) {
2825 2819 remainder = offset - uiop->uio_llimit;
2826 2820 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2827 2821 if (uiop->uio_resid <= 0) {
2828 2822 proc_t *p = ttoproc(curthread);
2829 2823
2830 2824 uiop->uio_resid += remainder;
2831 2825 mutex_enter(&p->p_lock);
2832 2826 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2833 2827 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2834 2828 mutex_exit(&p->p_lock);
2835 2829 return (EFBIG);
2836 2830 }
2837 2831 }
2838 2832
2839 2833 /* update the change attribute, if we have a write delegation */
2840 2834
2841 2835 mutex_enter(&rp->r_statev4_lock);
2842 2836 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2843 2837 rp->r_deleg_change++;
2844 2838
2845 2839 mutex_exit(&rp->r_statev4_lock);
2846 2840
2847 2841 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2848 2842 return (EINTR);
2849 2843
2850 2844 /*
2851 2845 * Bypass VM if caching has been disabled (e.g., locking) or if
2852 2846 * using client-side direct I/O and the file is not mmap'd and
2853 2847 * there are no cached pages.
2854 2848 */
2855 2849 if ((vp->v_flag & VNOCACHE) ||
2856 2850 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2857 2851 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2858 2852 size_t bufsize;
2859 2853 int count;
2860 2854 u_offset_t org_offset;
2861 2855 stable_how4 stab_comm;
2862 2856 nfs4_fwrite:
2863 2857 if (rp->r_flags & R4STALE) {
2864 2858 resid = uiop->uio_resid;
2865 2859 offset = uiop->uio_loffset;
2866 2860 error = rp->r_error;
2867 2861 /*
2868 2862 * A close may have cleared r_error, if so,
2869 2863 * propagate ESTALE error return properly
2870 2864 */
2871 2865 if (error == 0)
2872 2866 error = ESTALE;
2873 2867 goto bottom;
2874 2868 }
2875 2869
2876 2870 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2877 2871 base = kmem_alloc(bufsize, KM_SLEEP);
2878 2872 do {
2879 2873 if (ioflag & FDSYNC)
2880 2874 stab_comm = DATA_SYNC4;
2881 2875 else
2882 2876 stab_comm = FILE_SYNC4;
2883 2877 resid = uiop->uio_resid;
2884 2878 offset = uiop->uio_loffset;
2885 2879 count = MIN(uiop->uio_resid, bufsize);
2886 2880 org_offset = uiop->uio_loffset;
2887 2881 error = uiomove(base, count, UIO_WRITE, uiop);
2888 2882 if (!error) {
2889 2883 error = nfs4write(vp, base, org_offset,
2890 2884 count, cr, &stab_comm);
2891 2885 if (!error) {
2892 2886 mutex_enter(&rp->r_statelock);
2893 2887 if (rp->r_size < uiop->uio_loffset)
2894 2888 rp->r_size = uiop->uio_loffset;
2895 2889 mutex_exit(&rp->r_statelock);
2896 2890 }
2897 2891 }
2898 2892 } while (!error && uiop->uio_resid > 0);
2899 2893 kmem_free(base, bufsize);
2900 2894 goto bottom;
2901 2895 }
2902 2896
2903 2897 bsize = vp->v_vfsp->vfs_bsize;
2904 2898
2905 2899 do {
2906 2900 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2907 2901 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2908 2902 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2909 2903
2910 2904 resid = uiop->uio_resid;
2911 2905 offset = uiop->uio_loffset;
2912 2906
2913 2907 if (rp->r_flags & R4STALE) {
2914 2908 error = rp->r_error;
2915 2909 /*
2916 2910 * A close may have cleared r_error, if so,
2917 2911 * propagate ESTALE error return properly
2918 2912 */
2919 2913 if (error == 0)
2920 2914 error = ESTALE;
2921 2915 break;
2922 2916 }
2923 2917
2924 2918 /*
2925 2919 * Don't create dirty pages faster than they
2926 2920 * can be cleaned so that the system doesn't
2927 2921 * get imbalanced. If the async queue is
2928 2922 * maxed out, then wait for it to drain before
2929 2923 * creating more dirty pages. Also, wait for
2930 2924 * any threads doing pagewalks in the vop_getattr
2931 2925 * entry points so that they don't block for
2932 2926 * long periods.
2933 2927 */
2934 2928 mutex_enter(&rp->r_statelock);
2935 2929 while ((mi->mi_max_threads != 0 &&
2936 2930 rp->r_awcount > 2 * mi->mi_max_threads) ||
2937 2931 rp->r_gcount > 0) {
2938 2932 if (INTR4(vp)) {
2939 2933 klwp_t *lwp = ttolwp(curthread);
2940 2934
2941 2935 if (lwp != NULL)
2942 2936 lwp->lwp_nostop++;
2943 2937 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2944 2938 mutex_exit(&rp->r_statelock);
2945 2939 if (lwp != NULL)
2946 2940 lwp->lwp_nostop--;
2947 2941 error = EINTR;
2948 2942 goto bottom;
2949 2943 }
2950 2944 if (lwp != NULL)
2951 2945 lwp->lwp_nostop--;
2952 2946 } else
2953 2947 cv_wait(&rp->r_cv, &rp->r_statelock);
2954 2948 }
2955 2949 mutex_exit(&rp->r_statelock);
2956 2950
2957 2951 /*
2958 2952 * Touch the page and fault it in if it is not in core
2959 2953 * before segmap_getmapflt or vpm_data_copy can lock it.
2960 2954 * This is to avoid the deadlock if the buffer is mapped
2961 2955 * to the same file through mmap which we want to write.
2962 2956 */
2963 2957 uio_prefaultpages((long)n, uiop);
2964 2958
2965 2959 if (vpm_enable) {
2966 2960 /*
2967 2961 * It will use kpm mappings, so no need to
2968 2962 * pass an address.
2969 2963 */
2970 2964 error = writerp4(rp, NULL, n, uiop, 0);
2971 2965 } else {
2972 2966 if (segmap_kpm) {
2973 2967 int pon = uiop->uio_loffset & PAGEOFFSET;
2974 2968 size_t pn = MIN(PAGESIZE - pon,
2975 2969 uiop->uio_resid);
2976 2970 int pagecreate;
2977 2971
2978 2972 mutex_enter(&rp->r_statelock);
2979 2973 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2980 2974 uiop->uio_loffset + pn >= rp->r_size);
2981 2975 mutex_exit(&rp->r_statelock);
2982 2976
2983 2977 base = segmap_getmapflt(segkmap, vp, off + on,
2984 2978 pn, !pagecreate, S_WRITE);
2985 2979
2986 2980 error = writerp4(rp, base + pon, n, uiop,
2987 2981 pagecreate);
2988 2982
2989 2983 } else {
2990 2984 base = segmap_getmapflt(segkmap, vp, off + on,
2991 2985 n, 0, S_READ);
2992 2986 error = writerp4(rp, base + on, n, uiop, 0);
2993 2987 }
2994 2988 }
2995 2989
2996 2990 if (!error) {
2997 2991 if (mi->mi_flags & MI4_NOAC)
2998 2992 flags = SM_WRITE;
2999 2993 else if ((uiop->uio_loffset % bsize) == 0 ||
3000 2994 IS_SWAPVP(vp)) {
3001 2995 /*
3002 2996 * Have written a whole block.
3003 2997 * Start an asynchronous write
3004 2998 * and mark the buffer to
3005 2999 * indicate that it won't be
3006 3000 * needed again soon.
3007 3001 */
3008 3002 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3009 3003 } else
3010 3004 flags = 0;
3011 3005 if ((ioflag & (FSYNC|FDSYNC)) ||
3012 3006 (rp->r_flags & R4OUTOFSPACE)) {
3013 3007 flags &= ~SM_ASYNC;
3014 3008 flags |= SM_WRITE;
3015 3009 }
3016 3010 if (vpm_enable) {
3017 3011 error = vpm_sync_pages(vp, off, n, flags);
3018 3012 } else {
3019 3013 error = segmap_release(segkmap, base, flags);
3020 3014 }
3021 3015 } else {
3022 3016 if (vpm_enable) {
3023 3017 (void) vpm_sync_pages(vp, off, n, 0);
3024 3018 } else {
3025 3019 (void) segmap_release(segkmap, base, 0);
3026 3020 }
3027 3021 /*
3028 3022 * In the event that we got an access error while
3029 3023 * faulting in a page for a write-only file just
3030 3024 * force a write.
3031 3025 */
3032 3026 if (error == EACCES)
3033 3027 goto nfs4_fwrite;
3034 3028 }
3035 3029 } while (!error && uiop->uio_resid > 0);
3036 3030
3037 3031 bottom:
3038 3032 if (error) {
3039 3033 uiop->uio_resid = resid + remainder;
3040 3034 uiop->uio_loffset = offset;
3041 3035 } else {
3042 3036 uiop->uio_resid += remainder;
3043 3037
3044 3038 mutex_enter(&rp->r_statev4_lock);
3045 3039 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3046 3040 gethrestime(&rp->r_attr.va_mtime);
3047 3041 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3048 3042 }
3049 3043 mutex_exit(&rp->r_statev4_lock);
3050 3044 }
3051 3045
3052 3046 nfs_rw_exit(&rp->r_lkserlock);
3053 3047
3054 3048 return (error);
3055 3049 }
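
The RLIMIT_FSIZE clamp near the top of nfs4_write() deliberately allows one short write up to the limit before failing. A worked example with hypothetical numbers (uio_llimit = 1000):

	/*
	 * First write:  uio_loffset = 900, uio_resid = 300
	 *   offset = 1200 > 1000, remainder = 200, uio_resid clamped
	 *   to 100 -> 100 bytes are written; at "bottom:" uio_resid
	 *   is restored to 200 so the caller sees a short write.
	 * Second write: uio_loffset = 1000, uio_resid = 200
	 *   offset = 1200 > 1000, clamped uio_resid = 0 -> the
	 *   RLIMIT_FSIZE rctl fires (SIGXFSZ) and EFBIG is returned.
	 */
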
3056 3050
3057 3051 /*
3058 3052 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3059 3053 */
3060 3054 static int
3061 3055 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3062 3056 int flags, cred_t *cr)
3063 3057 {
3064 3058 struct buf *bp;
3065 3059 int error;
3066 3060 page_t *savepp;
3067 3061 uchar_t fsdata;
3068 3062 stable_how4 stab_comm;
3069 3063
3070 3064 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3071 3065 bp = pageio_setup(pp, len, vp, flags);
3072 3066 ASSERT(bp != NULL);
3073 3067
3074 3068 /*
3075 3069 * pageio_setup should have set b_addr to 0. This
3076 3070 * is correct since we want to do I/O on a page
3077 3071 * boundary. bp_mapin will use this addr to calculate
3078 3072 * an offset, and then set b_addr to the kernel virtual
3079 3073 * address it allocated for us.
3080 3074 */
3081 3075 ASSERT(bp->b_un.b_addr == 0);
3082 3076
3083 3077 bp->b_edev = 0;
3084 3078 bp->b_dev = 0;
3085 3079 bp->b_lblkno = lbtodb(off);
3086 3080 bp->b_file = vp;
3087 3081 bp->b_offset = (offset_t)off;
3088 3082 bp_mapin(bp);
3089 3083
3090 3084 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3091 3085 freemem > desfree)
3092 3086 stab_comm = UNSTABLE4;
3093 3087 else
3094 3088 stab_comm = FILE_SYNC4;
3095 3089
3096 3090 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3097 3091
3098 3092 bp_mapout(bp);
3099 3093 pageio_done(bp);
3100 3094
3101 3095 if (stab_comm == UNSTABLE4)
3102 3096 fsdata = C_DELAYCOMMIT;
3103 3097 else
3104 3098 fsdata = C_NOCOMMIT;
3105 3099
3106 3100 savepp = pp;
3107 3101 do {
3108 3102 pp->p_fsdata = fsdata;
3109 3103 } while ((pp = pp->p_next) != savepp);
3110 3104
3111 3105 return (error);
3112 3106 }
3113 3107
3114 3108 /* If this open stream was created under a delegation, reopen it OTW. */
3116 3110 static int
3117 3111 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3118 3112 {
3119 3113 nfs4_open_owner_t *oop;
3120 3114 nfs4_open_stream_t *osp;
3121 3115 rnode4_t *rp = VTOR4(vp);
3122 3116 mntinfo4_t *mi = VTOMI4(vp);
3123 3117 int reopen_needed;
3124 3118
3125 3119 ASSERT(nfs_zone() == mi->mi_zone);
3126 3120
3127 3121
3128 3122 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3129 3123 if (!oop)
3130 3124 return (EIO);
3131 3125
3132 3126 /* returns with 'os_sync_lock' held */
3133 3127 osp = find_open_stream(oop, rp);
3134 3128 if (!osp) {
3135 3129 open_owner_rele(oop);
3136 3130 return (EIO);
3137 3131 }
3138 3132
3139 3133 if (osp->os_failed_reopen) {
3140 3134 mutex_exit(&osp->os_sync_lock);
3141 3135 open_stream_rele(osp, rp);
3142 3136 open_owner_rele(oop);
3143 3137 return (EIO);
3144 3138 }
3145 3139
3146 3140 /*
3147 3141 * Determine whether a reopen is needed. If this
3148 3142 * is a delegation open stream, then the os_delegation bit
3149 3143 * should be set.
3150 3144 */
3151 3145
3152 3146 reopen_needed = osp->os_delegation;
3153 3147
3154 3148 mutex_exit(&osp->os_sync_lock);
3155 3149 open_owner_rele(oop);
3156 3150
3157 3151 if (reopen_needed) {
3158 3152 nfs4_error_zinit(ep);
3159 3153 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3160 3154 mutex_enter(&osp->os_sync_lock);
3161 3155 if (ep->error || ep->stat || osp->os_failed_reopen) {
3162 3156 mutex_exit(&osp->os_sync_lock);
3163 3157 open_stream_rele(osp, rp);
3164 3158 return (EIO);
3165 3159 }
3166 3160 mutex_exit(&osp->os_sync_lock);
3167 3161 }
3168 3162 open_stream_rele(osp, rp);
3169 3163
3170 3164 return (0);
3171 3165 }
3172 3166
3173 3167 /*
3174 3168 * Write to file. Writes to remote server in largest size
3175 3169 * chunks that the server can handle. Write is synchronous.
3176 3170 */
3177 3171 static int
3178 3172 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3179 3173 stable_how4 *stab_comm)
3180 3174 {
3181 3175 mntinfo4_t *mi;
3182 3176 COMPOUND4args_clnt args;
3183 3177 COMPOUND4res_clnt res;
3184 3178 WRITE4args *wargs;
3185 3179 WRITE4res *wres;
3186 3180 nfs_argop4 argop[2];
3187 3181 nfs_resop4 *resop;
3188 3182 int tsize;
3189 3183 stable_how4 stable;
3190 3184 rnode4_t *rp;
3191 3185 int doqueue = 1;
3192 3186 bool_t needrecov;
3193 3187 nfs4_recov_state_t recov_state;
3194 3188 nfs4_stateid_types_t sid_types;
3195 3189 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3196 3190 int recov;
3197 3191
3198 3192 rp = VTOR4(vp);
3199 3193 mi = VTOMI4(vp);
3200 3194
3201 3195 ASSERT(nfs_zone() == mi->mi_zone);
3202 3196
3203 3197 stable = *stab_comm;
3204 3198 *stab_comm = FILE_SYNC4;
3205 3199
3206 3200 needrecov = FALSE;
3207 3201 recov_state.rs_flags = 0;
3208 3202 recov_state.rs_num_retry_despite_err = 0;
3209 3203 nfs4_init_stateid_types(&sid_types);
3210 3204
3211 3205 /* Is curthread the recovery thread? */
3212 3206 mutex_enter(&mi->mi_lock);
3213 3207 recov = (mi->mi_recovthread == curthread);
3214 3208 mutex_exit(&mi->mi_lock);
3215 3209
3216 3210 recov_retry:
3217 3211 args.ctag = TAG_WRITE;
3218 3212 args.array_len = 2;
3219 3213 args.array = argop;
3220 3214
3221 3215 if (!recov) {
3222 3216 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3223 3217 &recov_state, NULL);
3224 3218 if (e.error)
3225 3219 return (e.error);
3226 3220 }
3227 3221
3228 3222 /* 0. putfh target fh */
3229 3223 argop[0].argop = OP_CPUTFH;
3230 3224 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3231 3225
3232 3226 /* 1. write */
3233 3227 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3234 3228
3235 3229 do {
3236 3230
3237 3231 wargs->offset = (offset4)offset;
3238 3232 wargs->data_val = base;
3239 3233
3240 3234 if (mi->mi_io_kstats) {
3241 3235 mutex_enter(&mi->mi_lock);
3242 3236 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3243 3237 mutex_exit(&mi->mi_lock);
3244 3238 }
3245 3239
3246 3240 if ((vp->v_flag & VNOCACHE) ||
3247 3241 (rp->r_flags & R4DIRECTIO) ||
3248 3242 (mi->mi_flags & MI4_DIRECTIO))
3249 3243 tsize = MIN(mi->mi_stsize, count);
3250 3244 else
3251 3245 tsize = MIN(mi->mi_curwrite, count);
3252 3246 wargs->data_len = (uint_t)tsize;
3253 3247 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3254 3248
3255 3249 if (mi->mi_io_kstats) {
3256 3250 mutex_enter(&mi->mi_lock);
3257 3251 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3258 3252 mutex_exit(&mi->mi_lock);
3259 3253 }
3260 3254
3261 3255 if (!recov) {
3262 3256 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3263 3257 if (e.error && !needrecov) {
3264 3258 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3265 3259 &recov_state, needrecov);
3266 3260 return (e.error);
3267 3261 }
3268 3262 } else {
3269 3263 if (e.error)
3270 3264 return (e.error);
3271 3265 }
3272 3266
3273 3267 /*
3274 3268 * Do handling of OLD_STATEID outside
3275 3269 * of the normal recovery framework.
3276 3270 *
3277 3271 * If write receives a BAD stateid error while using a
3278 3272 * delegation stateid, retry using the open stateid (if it
3279 3273 * exists). If it doesn't have an open stateid, reopen the
3280 3274 * file first, then retry.
3281 3275 */
3282 3276 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3283 3277 sid_types.cur_sid_type != SPEC_SID) {
3284 3278 nfs4_save_stateid(&wargs->stateid, &sid_types);
3285 3279 if (!recov)
3286 3280 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3287 3281 &recov_state, needrecov);
3288 3282 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3289 3283 goto recov_retry;
3290 3284 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3291 3285 sid_types.cur_sid_type == DEL_SID) {
3292 3286 nfs4_save_stateid(&wargs->stateid, &sid_types);
3293 3287 mutex_enter(&rp->r_statev4_lock);
3294 3288 rp->r_deleg_return_pending = TRUE;
3295 3289 mutex_exit(&rp->r_statev4_lock);
3296 3290 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3297 3291 if (!recov)
3298 3292 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3299 3293 &recov_state, needrecov);
3300 3294 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3301 3295 return (EIO);
3302 3296 }
3303 3297 if (!recov)
3304 3298 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3305 3299 &recov_state, needrecov);
3306 3300 /* hold needed for nfs4delegreturn_thread */
3307 3301 VN_HOLD(vp);
3308 3302 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3309 3303 NFS4_DR_DISCARD), FALSE);
3310 3304 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3311 3305 goto recov_retry;
3312 3306 }
3313 3307
3314 3308 if (needrecov) {
3315 3309 bool_t abort;
3316 3310
3317 3311 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3318 3312 "nfs4write: client got error %d, res.status %d"
3319 3313 ", so start recovery", e.error, res.status));
3320 3314
3321 3315 abort = nfs4_start_recovery(&e,
3322 3316 VTOMI4(vp), vp, NULL, &wargs->stateid,
3323 3317 NULL, OP_WRITE, NULL, NULL, NULL);
3324 3318 if (!e.error) {
3325 3319 e.error = geterrno4(res.status);
3326 3320 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3327 3321 }
3328 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3329 3323 &recov_state, needrecov);
3330 3324 if (abort == FALSE)
3331 3325 goto recov_retry;
3332 3326 return (e.error);
3333 3327 }
3334 3328
3335 3329 if (res.status) {
3336 3330 e.error = geterrno4(res.status);
3337 3331 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3338 3332 if (!recov)
3339 3333 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3340 3334 &recov_state, needrecov);
3341 3335 return (e.error);
3342 3336 }
3343 3337
3344 3338 resop = &res.array[1]; /* write res */
3345 3339 wres = &resop->nfs_resop4_u.opwrite;
3346 3340
3347 3341 if ((int)wres->count > tsize) {
3348 3342 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3349 3343
3350 3344 zcmn_err(getzoneid(), CE_WARN,
3351 3345 "nfs4write: server wrote %u, requested was %u",
3352 3346 (int)wres->count, tsize);
3353 3347 if (!recov)
3354 3348 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3355 3349 &recov_state, needrecov);
3356 3350 return (EIO);
3357 3351 }
3358 3352 if (wres->committed == UNSTABLE4) {
3359 3353 *stab_comm = UNSTABLE4;
3360 3354 if (wargs->stable == DATA_SYNC4 ||
3361 3355 wargs->stable == FILE_SYNC4) {
3362 3356 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3363 3357 zcmn_err(getzoneid(), CE_WARN,
3364 3358 "nfs4write: server %s did not commit "
3365 3359 "to stable storage",
3366 3360 rp->r_server->sv_hostname);
3367 3361 if (!recov)
3368 3362 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3369 3363 OH_WRITE, &recov_state, needrecov);
3370 3364 return (EIO);
3371 3365 }
3372 3366 }
3373 3367
3374 3368 tsize = (int)wres->count;
3375 3369 count -= tsize;
3376 3370 base += tsize;
3377 3371 offset += tsize;
3378 3372 if (mi->mi_io_kstats) {
3379 3373 mutex_enter(&mi->mi_lock);
3380 3374 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3381 3375 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3382 3376 tsize;
3383 3377 mutex_exit(&mi->mi_lock);
3384 3378 }
3385 3379 lwp_stat_update(LWP_STAT_OUBLK, 1);
3386 3380 mutex_enter(&rp->r_statelock);
3387 3381 if (rp->r_flags & R4HAVEVERF) {
3388 3382 if (rp->r_writeverf != wres->writeverf) {
3389 3383 nfs4_set_mod(vp);
3390 3384 rp->r_writeverf = wres->writeverf;
3391 3385 }
3392 3386 } else {
3393 3387 rp->r_writeverf = wres->writeverf;
3394 3388 rp->r_flags |= R4HAVEVERF;
3395 3389 }
3396 3390 PURGE_ATTRCACHE4_LOCKED(rp);
3397 3391 rp->r_flags |= R4WRITEMODIFIED;
3398 3392 gethrestime(&rp->r_attr.va_mtime);
3399 3393 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3400 3394 mutex_exit(&rp->r_statelock);
3401 3395 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3402 3396 } while (count);
3403 3397
3404 3398 if (!recov)
3405 3399 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3406 3400 needrecov);
3407 3401
3408 3402 return (e.error);
3409 3403 }
3410 3404
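/*
 * Sketch only (hypothetical helper name, guarded so it is not built):
 * the per-iteration transfer-size choice nfs4write() makes above.
 * Direct I/O uses the server's maximum transfer size (mi_stsize);
 * cached I/O uses the dynamically tuned current size (mi_curwrite).
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_pick_wsize(vnode_t *vp, int count)
{
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	/* Any of these flags means the request bypasses the page cache. */
	if ((vp->v_flag & VNOCACHE) ||
	    (rp->r_flags & R4DIRECTIO) ||
	    (mi->mi_flags & MI4_DIRECTIO))
		return (MIN(mi->mi_stsize, count));
	return (MIN(mi->mi_curwrite, count));
}
#endif	/* NFS4_EXAMPLE_SKETCH */
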
3411 3405 /*
3412 3406 * Read from a file. Reads data in largest chunks our interface can handle.
3413 3407 */
3414 3408 static int
3415 3409 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3416 3410 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3417 3411 {
3418 3412 mntinfo4_t *mi;
3419 3413 COMPOUND4args_clnt args;
3420 3414 COMPOUND4res_clnt res;
3421 3415 READ4args *rargs;
3422 3416 nfs_argop4 argop[2];
3423 3417 int tsize;
3424 3418 int doqueue;
3425 3419 rnode4_t *rp;
3426 3420 int data_len;
3427 3421 bool_t is_eof;
3428 3422 bool_t needrecov = FALSE;
3429 3423 nfs4_recov_state_t recov_state;
3430 3424 nfs4_stateid_types_t sid_types;
3431 3425 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 3426
3433 3427 rp = VTOR4(vp);
3434 3428 mi = VTOMI4(vp);
3435 3429 doqueue = 1;
3436 3430
3437 3431 ASSERT(nfs_zone() == mi->mi_zone);
3438 3432
3439 3433 args.ctag = async ? TAG_READAHEAD : TAG_READ;
3440 3434
3441 3435 args.array_len = 2;
3442 3436 args.array = argop;
3443 3437
3444 3438 nfs4_init_stateid_types(&sid_types);
3445 3439
3446 3440 recov_state.rs_flags = 0;
3447 3441 recov_state.rs_num_retry_despite_err = 0;
3448 3442
3449 3443 recov_retry:
3450 3444 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3451 3445 &recov_state, NULL);
3452 3446 if (e.error)
3453 3447 return (e.error);
3454 3448
3455 3449 /* putfh target fh */
3456 3450 argop[0].argop = OP_CPUTFH;
3457 3451 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3458 3452
3459 3453 /* read */
3460 3454 argop[1].argop = OP_READ;
3461 3455 rargs = &argop[1].nfs_argop4_u.opread;
3462 3456 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3463 3457 OP_READ, &sid_types, async);
3464 3458
3465 3459 do {
3466 3460 if (mi->mi_io_kstats) {
3467 3461 mutex_enter(&mi->mi_lock);
3468 3462 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3469 3463 mutex_exit(&mi->mi_lock);
3470 3464 }
3471 3465
3472 3466 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3473 3467 "nfs4read: %s call, rp %s",
3474 3468 needrecov ? "recov" : "first",
3475 3469 rnode4info(rp)));
3476 3470
3477 3471 if ((vp->v_flag & VNOCACHE) ||
3478 3472 (rp->r_flags & R4DIRECTIO) ||
3479 3473 (mi->mi_flags & MI4_DIRECTIO))
3480 3474 tsize = MIN(mi->mi_tsize, count);
3481 3475 else
3482 3476 tsize = MIN(mi->mi_curread, count);
3483 3477
3484 3478 rargs->offset = (offset4)offset;
3485 3479 rargs->count = (count4)tsize;
3486 3480 rargs->res_data_val_alt = NULL;
3487 3481 rargs->res_mblk = NULL;
3488 3482 rargs->res_uiop = NULL;
3489 3483 rargs->res_maxsize = 0;
3490 3484 rargs->wlist = NULL;
3491 3485
3492 3486 if (uiop)
3493 3487 rargs->res_uiop = uiop;
3494 3488 else
3495 3489 rargs->res_data_val_alt = base;
3496 3490 rargs->res_maxsize = tsize;
3497 3491
3498 3492 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3499 3493 #ifdef DEBUG
3500 3494 if (nfs4read_error_inject) {
3501 3495 res.status = nfs4read_error_inject;
3502 3496 nfs4read_error_inject = 0;
3503 3497 }
3504 3498 #endif
3505 3499
3506 3500 if (mi->mi_io_kstats) {
3507 3501 mutex_enter(&mi->mi_lock);
3508 3502 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3509 3503 mutex_exit(&mi->mi_lock);
3510 3504 }
3511 3505
3512 3506 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3513 3507 if (e.error != 0 && !needrecov) {
3514 3508 nfs4_end_fop(mi, vp, NULL, OH_READ,
3515 3509 &recov_state, needrecov);
3516 3510 return (e.error);
3517 3511 }
3518 3512
3519 3513 /*
3520 3514 * Do proper retry for OLD and BAD stateid errors outside
3521 3515 * of the normal recovery framework. There are two differences
3522 3516 * between async and sync reads. The first is that we allow
3523 3517 * retry on BAD_STATEID for async reads, but not sync reads.
3524 3518 * The second is that we mark the file dead for a failed
3525 3519 * attempt with a special stateid for sync reads, but just
3526 3520 * return EIO for async reads.
3527 3521 *
3528 3522 * If a sync read receives a BAD stateid error while using a
3529 3523 * delegation stateid, retry using the open stateid (if it
3530 3524 * exists). If it doesn't have an open stateid, reopen the
3531 3525 * file first, then retry.
3532 3526 */
3533 3527 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3534 3528 res.status == NFS4ERR_BAD_STATEID) && async) {
3535 3529 nfs4_end_fop(mi, vp, NULL, OH_READ,
3536 3530 &recov_state, needrecov);
3537 3531 if (sid_types.cur_sid_type == SPEC_SID) {
3538 3532 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3539 3533 return (EIO);
3540 3534 }
3541 3535 nfs4_save_stateid(&rargs->stateid, &sid_types);
3542 3536 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3543 3537 goto recov_retry;
3544 3538 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3545 3539 !async && sid_types.cur_sid_type != SPEC_SID) {
3546 3540 nfs4_save_stateid(&rargs->stateid, &sid_types);
3547 3541 nfs4_end_fop(mi, vp, NULL, OH_READ,
3548 3542 &recov_state, needrecov);
3549 3543 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3550 3544 goto recov_retry;
3551 3545 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3552 3546 sid_types.cur_sid_type == DEL_SID) {
3553 3547 nfs4_save_stateid(&rargs->stateid, &sid_types);
3554 3548 mutex_enter(&rp->r_statev4_lock);
3555 3549 rp->r_deleg_return_pending = TRUE;
3556 3550 mutex_exit(&rp->r_statev4_lock);
3557 3551 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3558 3552 nfs4_end_fop(mi, vp, NULL, OH_READ,
3559 3553 &recov_state, needrecov);
3560 3554 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3561 3555 return (EIO);
3562 3556 }
3563 3557 nfs4_end_fop(mi, vp, NULL, OH_READ,
3564 3558 &recov_state, needrecov);
3565 3559 /* hold needed for nfs4delegreturn_thread */
3566 3560 VN_HOLD(vp);
3567 3561 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3568 3562 NFS4_DR_DISCARD), FALSE);
3569 3563 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3570 3564 goto recov_retry;
3571 3565 }
3572 3566 if (needrecov) {
3573 3567 bool_t abort;
3574 3568
3575 3569 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3576 3570 "nfs4read: initiating recovery\n"));
3577 3571 abort = nfs4_start_recovery(&e,
3578 3572 mi, vp, NULL, &rargs->stateid,
3579 3573 NULL, OP_READ, NULL, NULL, NULL);
3580 3574 nfs4_end_fop(mi, vp, NULL, OH_READ,
3581 3575 &recov_state, needrecov);
3582 3576 /*
3583 3577 * Do not retry if we got OLD_STATEID using a special
3584 3578 * stateid. This avoids looping with a broken server.
3585 3579 */
3586 3580 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3587 3581 sid_types.cur_sid_type == SPEC_SID)
3588 3582 abort = TRUE;
3589 3583
3590 3584 if (abort == FALSE) {
3591 3585 /*
3592 3586 * Need to retry all possible stateids in
3593 3587 * case the recovery error wasn't stateid
3594 3588 * related or the stateids have become
3595 3589 * stale (server reboot).
3596 3590 */
3597 3591 nfs4_init_stateid_types(&sid_types);
3598 3592 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3599 3593 goto recov_retry;
3600 3594 }
3601 3595
3602 3596 if (!e.error) {
3603 3597 e.error = geterrno4(res.status);
3604 3598 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3605 3599 }
3606 3600 return (e.error);
3607 3601 }
3608 3602
3609 3603 if (res.status) {
3610 3604 e.error = geterrno4(res.status);
3611 3605 nfs4_end_fop(mi, vp, NULL, OH_READ,
3612 3606 &recov_state, needrecov);
3613 3607 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3614 3608 return (e.error);
3615 3609 }
3616 3610
3617 3611 data_len = res.array[1].nfs_resop4_u.opread.data_len;
3618 3612 count -= data_len;
3619 3613 if (base)
3620 3614 base += data_len;
3621 3615 offset += data_len;
3622 3616 if (mi->mi_io_kstats) {
3623 3617 mutex_enter(&mi->mi_lock);
3624 3618 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3625 3619 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3626 3620 mutex_exit(&mi->mi_lock);
3627 3621 }
3628 3622 lwp_stat_update(LWP_STAT_INBLK, 1);
3629 3623 is_eof = res.array[1].nfs_resop4_u.opread.eof;
3630 3624 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3631 3625
3632 3626 } while (count && !is_eof);
3633 3627
3634 3628 *residp = count;
3635 3629
3636 3630 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3637 3631
3638 3632 return (e.error);
3639 3633 }
3640 3634
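/*
 * Caller-side sketch (hypothetical helper, compiled out): nfs4read()
 * reports the bytes it could not transfer through residp, so a short
 * read at end-of-file is detected by the caller like this.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_read_fully(vnode_t *vp, caddr_t buf, offset_t off, int len, cred_t *cr)
{
	size_t resid;
	int error;

	error = nfs4read(vp, buf, off, len, &resid, cr, FALSE, NULL);
	if (error == 0 && resid != 0) {
		/* EOF was reached before len bytes were transferred. */
	}
	return (error);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
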
3641 3635 /* ARGSUSED */
3642 3636 static int
3643 3637 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3644 3638 caller_context_t *ct)
3645 3639 {
3646 3640 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3647 3641 return (EIO);
3648 3642 switch (cmd) {
3649 3643 case _FIODIRECTIO:
3650 3644 return (nfs4_directio(vp, (int)arg, cr));
3651 3645 default:
3652 3646 return (ENOTTY);
3653 3647 }
3654 3648 }
3655 3649
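/*
 * For reference, the only ioctl handled above is _FIODIRECTIO, which
 * is what directio(3C) issues from user land. A hypothetical
 * application-side sketch (not part of this file):
 *
 *	#include <sys/types.h>
 *	#include <sys/fcntl.h>
 *
 *	if (directio(fd, DIRECTIO_ON) != 0)
 *		perror("directio");
 */
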
3656 3650 /* ARGSUSED */
3657 3651 int
3658 3652 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3659 3653 caller_context_t *ct)
3660 3654 {
3661 3655 int error;
3662 3656 rnode4_t *rp = VTOR4(vp);
3663 3657
3664 3658 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3665 3659 return (EIO);
3666 3660 /*
3667 3661 * If it has been specified that the return value will
3668 3662 * just be used as a hint, and we are only being asked
3669 3663 * for size, fsid or rdevid, then return the client's
3670 3664 * notion of these values without checking to make sure
3671 3665 * that the attribute cache is up to date.
3672 3666 * The whole point is to avoid an over the wire GETATTR
3673 3667 * call.
3674 3668 */
3675 3669 if (flags & ATTR_HINT) {
3676 3670 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3677 3671 mutex_enter(&rp->r_statelock);
3678 3672 if (vap->va_mask & AT_SIZE)
3679 3673 vap->va_size = rp->r_size;
3680 3674 if (vap->va_mask & AT_FSID)
3681 3675 vap->va_fsid = rp->r_attr.va_fsid;
3682 3676 if (vap->va_mask & AT_RDEV)
3683 3677 vap->va_rdev = rp->r_attr.va_rdev;
3684 3678 mutex_exit(&rp->r_statelock);
3685 3679 return (0);
3686 3680 }
3687 3681 }
3688 3682
3689 3683 /*
3690 3684 * Only need to flush pages if asking for the mtime
3691 3685 * and if there are any dirty pages or any outstanding
3692 3686 * asynchronous (write) requests for this file.
3693 3687 */
3694 3688 if (vap->va_mask & AT_MTIME) {
3695 3689 rp = VTOR4(vp);
3696 3690 if (nfs4_has_pages(vp)) {
3697 3691 mutex_enter(&rp->r_statev4_lock);
3698 3692 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3699 3693 mutex_exit(&rp->r_statev4_lock);
3700 3694 if (rp->r_flags & R4DIRTY ||
3701 3695 rp->r_awcount > 0) {
3702 3696 mutex_enter(&rp->r_statelock);
3703 3697 rp->r_gcount++;
3704 3698 mutex_exit(&rp->r_statelock);
3705 3699 error =
3706 3700 nfs4_putpage(vp, (u_offset_t)0,
3707 3701 0, 0, cr, NULL);
3708 3702 mutex_enter(&rp->r_statelock);
3709 3703 if (error && (error == ENOSPC ||
3710 3704 error == EDQUOT)) {
3711 3705 if (!rp->r_error)
3712 3706 rp->r_error = error;
3713 3707 }
3714 3708 if (--rp->r_gcount == 0)
3715 3709 cv_broadcast(&rp->r_cv);
3716 3710 mutex_exit(&rp->r_statelock);
3717 3711 }
3718 3712 } else {
3719 3713 mutex_exit(&rp->r_statev4_lock);
3720 3714 }
3721 3715 }
3722 3716 }
3723 3717 return (nfs4getattr(vp, vap, cr));
3724 3718 }
3725 3719
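/*
 * Usage sketch (hypothetical helper, compiled out): with ATTR_HINT and
 * a mask limited to AT_SIZE/AT_FSID/AT_RDEV, nfs4_getattr() above
 * answers from the client's cached attributes and never goes over the
 * wire.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_cached_size(vnode_t *vp, u_offset_t *sizep, cred_t *cr)
{
	vattr_t va;
	int error;

	va.va_mask = AT_SIZE;
	error = nfs4_getattr(vp, &va, ATTR_HINT, cr, NULL);
	if (error == 0)
		*sizep = va.va_size;
	return (error);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
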
3726 3720 int
3727 3721 nfs4_compare_modes(mode_t from_server, mode_t on_client)
3728 3722 {
3729 3723 /*
3730 3724 * If the setuid and setgid bits are the only two
3731 3725 * bits cleared on the server then return 0 (OK),
3732 3726 * else return 1 (BAD).
3733 3727 */
3734 3728 on_client &= ~(S_ISUID|S_ISGID);
3735 3729 if (on_client == from_server)
3736 3730 return (0);
3737 3731 else
3738 3732 return (1);
3739 3733 }
3740 3734
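/*
 * Worked example for nfs4_compare_modes() (octal, purely
 * illustrative): nfs4_compare_modes(0755, 04755) returns 0, since the
 * server merely cleared S_ISUID, while nfs4_compare_modes(0700, 0755)
 * returns 1, a genuine mode change the caller must repair.
 */
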
3741 3735 /*ARGSUSED4*/
3742 3736 static int
3743 3737 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3744 3738 caller_context_t *ct)
3745 3739 {
3746 3740 int error;
3747 3741
3748 3742 if (vap->va_mask & AT_NOSET)
3749 3743 return (EINVAL);
3750 3744
3751 3745 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3752 3746 return (EIO);
3753 3747
3754 3748 /*
3755 3749 * Don't call secpolicy_vnode_setattr, the client cannot
3756 3750 * use its cached attributes to make security decisions
3757 3751 * as the server may be faking mode bits or mapping uid/gid.
3758 3752 * Always just let the server do the checking.
3759 3753 * If we provide the ability to remove basic privileges
3760 3754 * to setattr (e.g. basic without chmod) then we will
3761 3755 * need to add a check here before calling the server.
3762 3756 */
3763 3757 error = nfs4setattr(vp, vap, flags, cr, NULL);
3764 3758
3765 3759 if (error == 0 && (vap->va_mask & AT_SIZE)) {
3766 3760 if (vap->va_size == 0) {
3767 3761 vnevent_truncate(vp, ct);
3768 3762 } else {
3769 3763 vnevent_resize(vp, ct);
3770 3764 }
3771 3765 }
3772 3766
3773 3767 return (error);
3774 3768 }
3775 3769
3776 3770 /*
3777 3771 * To replace the "guarded" version 3 setattr, we use two types of compound
3778 3772 * setattr requests:
3779 3773 * 1. The "normal" setattr, used when the size of the file isn't being
3780 3774 * changed - { Putfh <fh>; Setattr; Getattr }.
3781 3775 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3782 3776 * with only ctime as the argument. If the server ctime differs from
3783 3777 * what is cached on the client, the verify will fail, but we would
3784 3778 * already have the ctime from the preceding getattr, so just set it
3785 3779 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3786 3780 * Setattr; Getattr }.
3787 3781 *
3788 3782 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3789 3783 * this setattr and NULL if they are not.
3790 3784 */
3791 3785 static int
3792 3786 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3793 3787 vsecattr_t *vsap)
3794 3788 {
3795 3789 COMPOUND4args_clnt args;
3796 3790 COMPOUND4res_clnt res, *resp = NULL;
3797 3791 nfs4_ga_res_t *garp = NULL;
3798 3792 int numops = 3; /* { Putfh; Setattr; Getattr } */
3799 3793 nfs_argop4 argop[5];
3800 3794 int verify_argop = -1;
3801 3795 int setattr_argop = 1;
3802 3796 nfs_resop4 *resop;
3803 3797 vattr_t va;
3804 3798 rnode4_t *rp;
3805 3799 int doqueue = 1;
3806 3800 uint_t mask = vap->va_mask;
3807 3801 mode_t omode;
3808 3802 vsecattr_t *vsp;
3809 3803 timestruc_t ctime;
3810 3804 bool_t needrecov = FALSE;
3811 3805 nfs4_recov_state_t recov_state;
3812 3806 nfs4_stateid_types_t sid_types;
3813 3807 stateid4 stateid;
3814 3808 hrtime_t t;
3815 3809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3816 3810 servinfo4_t *svp;
3817 3811 bitmap4 supp_attrs;
3818 3812
3819 3813 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3820 3814 rp = VTOR4(vp);
3821 3815 nfs4_init_stateid_types(&sid_types);
3822 3816
3823 3817 /*
3824 3818 * Only need to flush pages if there are any pages and
3825 3819 * if the file is marked as dirty in some fashion. The
3826 3820 * file must be flushed so that we can accurately
3827 3821 * determine the size of the file and the cached data
3828 3822 * after the SETATTR returns. A file is considered to
3829 3823 * be dirty if it is either marked with R4DIRTY, has
3830 3824 * outstanding i/o's active, or is mmap'd. In this
3831 3825 * last case, we can't tell whether there are dirty
3832 3826 * pages, so we flush just to be sure.
3833 3827 */
3834 3828 if (nfs4_has_pages(vp) &&
3835 3829 ((rp->r_flags & R4DIRTY) ||
3836 3830 rp->r_count > 0 ||
3837 3831 rp->r_mapcnt > 0)) {
3838 3832 ASSERT(vp->v_type != VCHR);
3839 3833 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3840 3834 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3841 3835 mutex_enter(&rp->r_statelock);
3842 3836 if (!rp->r_error)
3843 3837 rp->r_error = e.error;
3844 3838 mutex_exit(&rp->r_statelock);
3845 3839 }
3846 3840 }
3847 3841
3848 3842 if (mask & AT_SIZE) {
3849 3843 /*
3850 3844 * Verification setattr compound for non-deleg AT_SIZE:
3851 3845 * { Putfh; Getattr; Verify; Setattr; Getattr }
3852 3846 * Set ctime local here (outside the do_again label)
3853 3847 * so that subsequent retries (after failed VERIFY)
3854 3848 * will use ctime from GETATTR results (from failed
3855 3849 * verify compound) as VERIFY arg.
3856 3850 * If file has delegation, then VERIFY(time_metadata)
3857 3851 * is of little added value, so don't bother.
3858 3852 */
3859 3853 mutex_enter(&rp->r_statev4_lock);
3860 3854 if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3861 3855 rp->r_deleg_return_pending) {
3862 3856 numops = 5;
3863 3857 ctime = rp->r_attr.va_ctime;
3864 3858 }
3865 3859 mutex_exit(&rp->r_statev4_lock);
3866 3860 }
3867 3861
3868 3862 recov_state.rs_flags = 0;
3869 3863 recov_state.rs_num_retry_despite_err = 0;
3870 3864
3871 3865 args.ctag = TAG_SETATTR;
3872 3866 do_again:
3873 3867 recov_retry:
3874 3868 setattr_argop = numops - 2;
3875 3869
3876 3870 args.array = argop;
3877 3871 args.array_len = numops;
3878 3872
3879 3873 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3880 3874 if (e.error)
3881 3875 return (e.error);
3882 3876
3884 3878 /* putfh target fh */
3885 3879 argop[0].argop = OP_CPUTFH;
3886 3880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3887 3881
3888 3882 if (numops == 5) {
3889 3883 /*
3890 3884 * We only care about the ctime, but need to get mtime
3891 3885 * and size for proper cache update.
3892 3886 */
3893 3887 /* getattr */
3894 3888 argop[1].argop = OP_GETATTR;
3895 3889 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3896 3890 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3897 3891
3898 3892 /* verify - set later in loop */
3899 3893 verify_argop = 2;
3900 3894 }
3901 3895
3902 3896 /* setattr */
3903 3897 svp = rp->r_server;
3904 3898 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3905 3899 supp_attrs = svp->sv_supp_attrs;
3906 3900 nfs_rw_exit(&svp->sv_lock);
3907 3901
3908 3902 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3909 3903 supp_attrs, &e.error, &sid_types);
3910 3904 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3911 3905 if (e.error) {
3912 3906 /* req time field(s) overflow - return immediately */
3913 3907 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3914 3908 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3915 3909 opsetattr.obj_attributes);
3916 3910 return (e.error);
3917 3911 }
3918 3912 omode = rp->r_attr.va_mode;
3919 3913
3920 3914 /* getattr */
3921 3915 argop[numops-1].argop = OP_GETATTR;
3922 3916 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3923 3917 /*
3924 3918 * If we are setting the ACL (indicated only by vsap != NULL), request
3925 3919 * the ACL in this getattr. The ACL returned from this getattr will be
3926 3920 * used in updating the ACL cache.
3927 3921 */
3928 3922 if (vsap != NULL)
3929 3923 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3930 3924 FATTR4_ACL_MASK;
3931 3925 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3932 3926
3933 3927 /*
3934 3928 * setattr iterates if the object size is set and the cached ctime
3935 3929 * does not match the file ctime. In that case, verify the ctime first.
3936 3930 */
3937 3931
3938 3932 do {
3939 3933 if (verify_argop != -1) {
3940 3934 /*
3941 3935 * Verify that the ctime match before doing setattr.
3942 3936 */
3943 3937 va.va_mask = AT_CTIME;
3944 3938 va.va_ctime = ctime;
3945 3939 svp = rp->r_server;
3946 3940 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3947 3941 supp_attrs = svp->sv_supp_attrs;
3948 3942 nfs_rw_exit(&svp->sv_lock);
3949 3943 e.error = nfs4args_verify(&argop[verify_argop], &va,
3950 3944 OP_VERIFY, supp_attrs);
3951 3945 if (e.error) {
3952 3946 /* req time field(s) overflow - return */
3953 3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3954 3948 needrecov);
3955 3949 break;
3956 3950 }
3957 3951 }
3958 3952
3959 3953 doqueue = 1;
3960 3954
3961 3955 t = gethrtime();
3962 3956
3963 3957 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3964 3958
3965 3959 /*
3966 3960 * Purge the access cache and ACL cache if changing either the
3967 3961 * owner of the file, the group owner, or the mode. These may
3968 3962 * change the access permissions of the file, so purge old
3969 3963 * information and start over again.
3970 3964 */
3971 3965 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3972 3966 (void) nfs4_access_purge_rp(rp);
3973 3967 if (rp->r_secattr != NULL) {
3974 3968 mutex_enter(&rp->r_statelock);
3975 3969 vsp = rp->r_secattr;
3976 3970 rp->r_secattr = NULL;
3977 3971 mutex_exit(&rp->r_statelock);
3978 3972 if (vsp != NULL)
3979 3973 nfs4_acl_free_cache(vsp);
3980 3974 }
3981 3975 }
3982 3976
3983 3977 /*
3984 3978 * If res.array_len == numops, then everything succeeded,
3985 3979 * except for possibly the final getattr. If only the
3986 3980 * last getattr failed, give up, and don't try recovery.
3987 3981 */
3988 3982 if (res.array_len == numops) {
3989 3983 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3990 3984 needrecov);
3991 3985 if (! e.error)
3992 3986 resp = &res;
3993 3987 break;
3994 3988 }
3995 3989
3996 3990 /*
3997 3991 * if either rpc call failed or completely succeeded - done
3998 3992 */
3999 3993 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4000 3994 if (e.error) {
4001 3995 PURGE_ATTRCACHE4(vp);
4002 3996 if (!needrecov) {
4003 3997 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4004 3998 needrecov);
4005 3999 break;
4006 4000 }
4007 4001 }
4008 4002
4009 4003 /*
4010 4004 * Do proper retry for OLD_STATEID outside of the normal
4011 4005 * recovery framework.
4012 4006 */
4013 4007 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4014 4008 sid_types.cur_sid_type != SPEC_SID &&
4015 4009 sid_types.cur_sid_type != NO_SID) {
4016 4010 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4017 4011 needrecov);
4018 4012 nfs4_save_stateid(&stateid, &sid_types);
4019 4013 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4020 4014 opsetattr.obj_attributes);
4021 4015 if (verify_argop != -1) {
4022 4016 nfs4args_verify_free(&argop[verify_argop]);
4023 4017 verify_argop = -1;
4024 4018 }
4025 4019 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4026 4020 goto recov_retry;
4027 4021 }
4028 4022
4029 4023 if (needrecov) {
4030 4024 bool_t abort;
4031 4025
4032 4026 abort = nfs4_start_recovery(&e,
4033 4027 VTOMI4(vp), vp, NULL, NULL, NULL,
4034 4028 OP_SETATTR, NULL, NULL, NULL);
4035 4029 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4036 4030 needrecov);
4037 4031 /*
4038 4032 * Do not retry if we failed with OLD_STATEID using
4039 4033 * a special stateid. This is done to avoid looping
4040 4034 * with a broken server.
4041 4035 */
4042 4036 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4043 4037 (sid_types.cur_sid_type == SPEC_SID ||
4044 4038 sid_types.cur_sid_type == NO_SID))
4045 4039 abort = TRUE;
4046 4040 if (!e.error) {
4047 4041 if (res.status == NFS4ERR_BADOWNER)
4048 4042 nfs4_log_badowner(VTOMI4(vp),
4049 4043 OP_SETATTR);
4050 4044
4051 4045 e.error = geterrno4(res.status);
4052 4046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4053 4047 }
4054 4048 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4055 4049 opsetattr.obj_attributes);
4056 4050 if (verify_argop != -1) {
4057 4051 nfs4args_verify_free(&argop[verify_argop]);
4058 4052 verify_argop = -1;
4059 4053 }
4060 4054 if (abort == FALSE) {
4061 4055 /*
4062 4056 * Need to retry all possible stateids in
4063 4057 * case the recovery error wasn't stateid
4064 4058 * related or the stateids have become
4065 4059 * stale (server reboot).
4066 4060 */
4067 4061 nfs4_init_stateid_types(&sid_types);
4068 4062 goto recov_retry;
4069 4063 }
4070 4064 return (e.error);
4071 4065 }
4072 4066
4073 4067 /*
4074 4068 * Need to call nfs4_end_op before nfs4getattr to
4075 4069 * avoid potential nfs4_start_op deadlock. See RFE
4076 4070 * 4777612. Calls to nfs4_invalidate_pages() and
4077 4071 * nfs4_purge_stale_fh() might also generate over the
4078 4072 * wire calls which may cause nfs4_start_op() deadlock.
4079 4073 */
4080 4074 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4081 4075
4082 4076 /*
4083 4077 * Check to update lease.
4084 4078 */
4085 4079 resp = &res;
4086 4080 if (res.status == NFS4_OK) {
4087 4081 break;
4088 4082 }
4089 4083
4090 4084 /*
4091 4085 * Check whether the verify failed, to decide if we should try again
4092 4086 */
4093 4087 if ((verify_argop == -1) || (res.array_len != 3)) {
4094 4088 /*
4095 4089 * can't continue...
4096 4090 */
4097 4091 if (res.status == NFS4ERR_BADOWNER)
4098 4092 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4099 4093
4100 4094 e.error = geterrno4(res.status);
4101 4095 } else {
4102 4096 /*
4103 4097 * When the verify request fails, the client ctime is
4104 4098 * not in sync with the server. This is the same as
4105 4099 * the version 3 "not synchronized" error, and we
4106 4100 * handle it in a similar manner (XXX do we need to???).
4107 4101 * Use the ctime returned in the first getattr for
4108 4102 * the input to the next verify.
4109 4103 * If we couldn't get the attributes, then we give up
4110 4104 * because we can't complete the operation as required.
4111 4105 */
4112 4106 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4113 4107 }
4114 4108 if (e.error) {
4115 4109 PURGE_ATTRCACHE4(vp);
4116 4110 nfs4_purge_stale_fh(e.error, vp, cr);
4117 4111 } else {
4118 4112 /*
4119 4113 * retry with a new verify value
4120 4114 */
4121 4115 ctime = garp->n4g_va.va_ctime;
4122 4116 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4123 4117 resp = NULL;
4124 4118 }
4125 4119 if (!e.error) {
4126 4120 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4127 4121 opsetattr.obj_attributes);
4128 4122 if (verify_argop != -1) {
4129 4123 nfs4args_verify_free(&argop[verify_argop]);
4130 4124 verify_argop = -1;
4131 4125 }
4132 4126 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4133 4127 goto do_again;
4134 4128 }
4135 4129 } while (!e.error);
4136 4130
4137 4131 if (e.error) {
4138 4132 /*
4139 4133 * If we are here, rfs4call has an irrecoverable error - return
4140 4134 */
4141 4135 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4142 4136 opsetattr.obj_attributes);
4143 4137 if (verify_argop != -1) {
4144 4138 nfs4args_verify_free(&argop[verify_argop]);
4145 4139 verify_argop = -1;
4146 4140 }
4147 4141 if (resp)
4148 4142 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4149 4143 return (e.error);
4150 4144 }
4151 4145
4154 4148 /*
4155 4149 * If changing the size of the file, invalidate
4156 4150 * any local cached data which is no longer part
4157 4151 * of the file. We also possibly invalidate the
4158 4152 * last page in the file. We could use
4159 4153 * pvn_vpzero(), but this would mark the page as
4160 4154 * modified and require it to be written back to
4161 4155 * the server for no particularly good reason.
4162 4156 * This way, if we access it, then we bring it
4163 4157 * back in. A read should be cheaper than a
4164 4158 * write.
4165 4159 */
4166 4160 if (mask & AT_SIZE) {
4167 4161 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4168 4162 }
4169 4163
4170 4164 /* either no error or one of the postop getattrs failed */
4171 4165
4172 4166 /*
4173 4167 * XXX Perform a simplified version of wcc checking. Instead of
4174 4168 * having another getattr to get pre-op, just purge cache if
4175 4169 * any of the ops prior to and including the getattr failed.
4176 4170 * If the getattr succeeded then update the attrcache accordingly.
4177 4171 */
4178 4172
4179 4173 garp = NULL;
4180 4174 if (res.status == NFS4_OK) {
4181 4175 /*
4182 4176 * Last getattr
4183 4177 */
4184 4178 resop = &res.array[numops - 1];
4185 4179 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4186 4180 }
4187 4181 /*
4188 4182 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4189 4183 * rather than filling it. See the function itself for details.
4190 4184 */
4191 4185 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4192 4186 if (garp != NULL) {
4193 4187 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4194 4188 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4195 4189 vs_ace4_destroy(&garp->n4g_vsa);
4196 4190 } else {
4197 4191 if (vsap != NULL) {
4198 4192 /*
4199 4193 * The ACL was supposed to be set and to be
4200 4194 * returned in the last getattr of this
4201 4195 * compound, but for some reason the getattr
4202 4196 * result doesn't contain the ACL. In this
4203 4197 * case, purge the ACL cache.
4204 4198 */
4205 4199 if (rp->r_secattr != NULL) {
4206 4200 mutex_enter(&rp->r_statelock);
4207 4201 vsp = rp->r_secattr;
4208 4202 rp->r_secattr = NULL;
4209 4203 mutex_exit(&rp->r_statelock);
4210 4204 if (vsp != NULL)
4211 4205 nfs4_acl_free_cache(vsp);
4212 4206 }
4213 4207 }
4214 4208 }
4215 4209 }
4216 4210
4217 4211 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4218 4212 /*
4219 4213 * Set the size, rather than relying on getting it updated
4220 4214 * via a GETATTR. With delegations the client tries to
4221 4215 * suppress GETATTR calls.
4222 4216 */
4223 4217 mutex_enter(&rp->r_statelock);
4224 4218 rp->r_size = vap->va_size;
4225 4219 mutex_exit(&rp->r_statelock);
4226 4220 }
4227 4221
4228 4222 /*
4229 4223 * Can free up request args and res
4230 4224 */
4231 4225 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4232 4226 opsetattr.obj_attributes);
4233 4227 if (verify_argop != -1) {
4234 4228 nfs4args_verify_free(&argop[verify_argop]);
4235 4229 verify_argop = -1;
4236 4230 }
4237 4231 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4238 4232
4239 4233 /*
4240 4234 * Some servers will change the mode to clear the setuid
4241 4235 * and setgid bits when changing the uid or gid. The
4242 4236 * client needs to compensate appropriately.
4243 4237 */
4244 4238 if (mask & (AT_UID | AT_GID)) {
4245 4239 int terror, do_setattr;
4246 4240
4247 4241 do_setattr = 0;
4248 4242 va.va_mask = AT_MODE;
4249 4243 terror = nfs4getattr(vp, &va, cr);
4250 4244 if (!terror &&
4251 4245 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4252 4246 (!(mask & AT_MODE) && va.va_mode != omode))) {
4253 4247 va.va_mask = AT_MODE;
4254 4248 if (mask & AT_MODE) {
4255 4249 /*
4256 4250 * We asked the mode to be changed and what
4257 4251 * we just got from the server in getattr is
4258 4252 * not what we wanted it to be, so set it now.
4259 4253 */
4260 4254 va.va_mode = vap->va_mode;
4261 4255 do_setattr = 1;
4262 4256 } else {
4263 4257 /*
4264 4258 * We did not ask for the mode to be changed.
4265 4259 * Check to see that the server just cleared
4266 4260 * S_ISUID and S_ISGID from it. If not, then
4267 4261 * set mode to omode with setuid/setgid cleared.
4268 4262 */
4269 4263 if (nfs4_compare_modes(va.va_mode, omode)) {
4270 4264 omode &= ~(S_ISUID|S_ISGID);
4271 4265 va.va_mode = omode;
4272 4266 do_setattr = 1;
4273 4267 }
4274 4268 }
4275 4269
4276 4270 if (do_setattr)
4277 4271 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4278 4272 }
4279 4273 }
4280 4274
4281 4275 return (e.error);
4282 4276 }
4283 4277
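/*
 * Usage sketch (hypothetical helper, compiled out): a size-only
 * setattr such as a truncate, which takes the five-op
 * { Putfh; Getattr; Verify; Setattr; Getattr } path above whenever
 * the file carries no write delegation.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_truncate_sketch(vnode_t *vp, u_offset_t len, cred_t *cr)
{
	vattr_t va;

	va.va_mask = AT_SIZE;
	va.va_size = len;
	return (nfs4setattr(vp, &va, 0, cr, NULL));
}
#endif	/* NFS4_EXAMPLE_SKETCH */
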
4284 4278 /* ARGSUSED */
4285 4279 static int
4286 4280 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
4287 4281 {
4288 4282 COMPOUND4args_clnt args;
4289 4283 COMPOUND4res_clnt res;
4290 4284 int doqueue;
4291 4285 uint32_t acc, resacc, argacc;
4292 4286 rnode4_t *rp;
4293 4287 cred_t *cred, *ncr, *ncrfree = NULL;
4294 4288 nfs4_access_type_t cacc;
4295 4289 int num_ops;
4296 4290 nfs_argop4 argop[3];
4297 4291 nfs_resop4 *resop;
4298 4292 bool_t needrecov = FALSE, do_getattr;
4299 4293 nfs4_recov_state_t recov_state;
4300 4294 int rpc_error;
4301 4295 hrtime_t t;
4302 4296 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4303 4297 mntinfo4_t *mi = VTOMI4(vp);
4304 4298
4305 4299 if (nfs_zone() != mi->mi_zone)
4306 4300 return (EIO);
4307 4301
4308 4302 acc = 0;
4309 4303 if (mode & VREAD)
4310 4304 acc |= ACCESS4_READ;
4311 4305 if (mode & VWRITE) {
4312 4306 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
4313 4307 return (EROFS);
4314 4308 if (vp->v_type == VDIR)
4315 4309 acc |= ACCESS4_DELETE;
4316 4310 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
4317 4311 }
4318 4312 if (mode & VEXEC) {
4319 4313 if (vp->v_type == VDIR)
4320 4314 acc |= ACCESS4_LOOKUP;
4321 4315 else
4322 4316 acc |= ACCESS4_EXECUTE;
4323 4317 }
4324 4318
4325 4319 if (VTOR4(vp)->r_acache != NULL) {
4326 4320 e.error = nfs4_validate_caches(vp, cr);
4327 4321 if (e.error)
4328 4322 return (e.error);
4329 4323 }
4330 4324
4331 4325 rp = VTOR4(vp);
4332 4326 if (vp->v_type == VDIR)
4333 4327 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
4334 4328 ACCESS4_EXTEND | ACCESS4_LOOKUP;
4335 4329 else
4336 4330 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
4337 4331 ACCESS4_EXECUTE;
4338 4332 recov_state.rs_flags = 0;
4339 4333 recov_state.rs_num_retry_despite_err = 0;
4340 4334
4341 4335 cred = cr;
4342 4336 /*
4343 4337 * ncr and ncrfree both initially
4344 4338 * point to the memory area returned
4345 4339 * by crnetadjust();
4346 4340 * if ncrfree is still not NULL when we exit,
4347 4341 * it must be released
4348 4342 */
4349 4343 ncr = crnetadjust(cred);
4350 4344 ncrfree = ncr;
4351 4345
4352 4346 tryagain:
4353 4347 cacc = nfs4_access_check(rp, acc, cred);
4354 4348 if (cacc == NFS4_ACCESS_ALLOWED) {
4355 4349 if (ncrfree != NULL)
4356 4350 crfree(ncrfree);
4357 4351 return (0);
4358 4352 }
4359 4353 if (cacc == NFS4_ACCESS_DENIED) {
4360 4354 /*
4361 4355 * If the cred can be adjusted, try again
4362 4356 * with the new cred.
4363 4357 */
4364 4358 if (ncr != NULL) {
4365 4359 cred = ncr;
4366 4360 ncr = NULL;
4367 4361 goto tryagain;
4368 4362 }
4369 4363 if (ncrfree != NULL)
4370 4364 crfree(ncrfree);
4371 4365 return (EACCES);
4372 4366 }
4373 4367
4374 4368 recov_retry:
4375 4369 /*
4376 4370 * Don't bother taking r_statev4_lock here. r_deleg_type could
4377 4371 * change as soon as lock is released. Since it is an int,
4378 4372 * there is no atomicity issue.
4379 4373 */
4380 4374 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
4381 4375 num_ops = do_getattr ? 3 : 2;
4382 4376
4383 4377 args.ctag = TAG_ACCESS;
4384 4378
4385 4379 args.array_len = num_ops;
4386 4380 args.array = argop;
4387 4381
4388 4382 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
4389 4383 &recov_state, NULL)) {
4390 4384 if (ncrfree != NULL)
4391 4385 crfree(ncrfree);
4392 4386 return (e.error);
4393 4387 }
4394 4388
4395 4389 /* putfh target fh */
4396 4390 argop[0].argop = OP_CPUTFH;
4397 4391 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4398 4392
4399 4393 /* access */
4400 4394 argop[1].argop = OP_ACCESS;
4401 4395 argop[1].nfs_argop4_u.opaccess.access = argacc;
4402 4396
4403 4397 /* getattr */
4404 4398 if (do_getattr) {
4405 4399 argop[2].argop = OP_GETATTR;
4406 4400 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4407 4401 argop[2].nfs_argop4_u.opgetattr.mi = mi;
4408 4402 }
4409 4403
4410 4404 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4411 4405 "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
4412 4406 rnode4info(VTOR4(vp))));
4413 4407
4414 4408 doqueue = 1;
4415 4409 t = gethrtime();
4416 4410 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
4417 4411 rpc_error = e.error;
4418 4412
4419 4413 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4420 4414 if (needrecov) {
4421 4415 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4422 4416 "nfs4_access: initiating recovery\n"));
4423 4417
4424 4418 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4425 4419 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
4426 4420 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
4427 4421 &recov_state, needrecov);
4428 4422 if (!e.error)
4429 4423 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4430 4424 goto recov_retry;
4431 4425 }
4432 4426 }
4433 4427 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);
4434 4428
4435 4429 if (e.error)
4436 4430 goto out;
4437 4431
4438 4432 if (res.status) {
4439 4433 e.error = geterrno4(res.status);
4440 4434 /*
4441 4435 * This might generate over the wire calls through
4442 4436 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4443 4437 * here to avoid a deadlock.
4444 4438 */
4445 4439 nfs4_purge_stale_fh(e.error, vp, cr);
4446 4440 goto out;
4447 4441 }
4448 4442 resop = &res.array[1]; /* access res */
4449 4443
4450 4444 resacc = resop->nfs_resop4_u.opaccess.access;
4451 4445
4452 4446 if (do_getattr) {
4453 4447 resop++; /* getattr res */
4454 4448 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
4455 4449 t, cr, FALSE, NULL);
4456 4450 }
4457 4451
4458 4452 if (!e.error) {
4459 4453 nfs4_access_cache(rp, argacc, resacc, cred);
4460 4454 /*
4461 4455 * we just cached results with cred; if cred is the
4462 4456 * adjusted credentials from crnetadjust, we do not want
4463 4457 * to release them before exiting: hence setting ncrfree
4464 4458 * to NULL
4465 4459 */
4466 4460 if (cred != cr)
4467 4461 ncrfree = NULL;
4468 4462 /* XXX check the supported bits too? */
4469 4463 if ((acc & resacc) != acc) {
4470 4464 /*
4471 4465 * The following code implements the semantic
4472 4466 * that a setuid root program has *at least* the
4473 4467 * permissions of the user that is running the
4474 4468 * program. See rfs3call() for more portions
4475 4469 * of the implementation of this functionality.
4476 4470 */
4477 4471 /* XXX-LP */
4478 4472 if (ncr != NULL) {
4479 4473 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4480 4474 cred = ncr;
4481 4475 ncr = NULL;
4482 4476 goto tryagain;
4483 4477 }
4484 4478 e.error = EACCES;
4485 4479 }
4486 4480 }
4487 4481
4488 4482 out:
4489 4483 if (!rpc_error)
4490 4484 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4491 4485
4492 4486 if (ncrfree != NULL)
4493 4487 crfree(ncrfree);
4494 4488
4495 4489 return (e.error);
4496 4490 }
4497 4491
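/*
 * Sketch only (hypothetical helper, compiled out): the VREAD/VWRITE/
 * VEXEC to ACCESS4_* mapping performed at the top of nfs4_access()
 * above, minus the read-only file system check done there.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static uint32_t
nfs4_mode_to_access4(vnode_t *vp, int mode)
{
	uint32_t acc = 0;

	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		/* removing directory entries requires DELETE */
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC)
		acc |= (vp->v_type == VDIR) ?
		    ACCESS4_LOOKUP : ACCESS4_EXECUTE;
	return (acc);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
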
4498 4492 /* ARGSUSED */
4499 4493 static int
4500 4494 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4501 4495 {
4502 4496 COMPOUND4args_clnt args;
4503 4497 COMPOUND4res_clnt res;
4504 4498 int doqueue;
4505 4499 rnode4_t *rp;
4506 4500 nfs_argop4 argop[3];
4507 4501 nfs_resop4 *resop;
4508 4502 READLINK4res *lr_res;
4509 4503 nfs4_ga_res_t *garp;
4510 4504 uint_t len;
4511 4505 char *linkdata;
4512 4506 bool_t needrecov = FALSE;
4513 4507 nfs4_recov_state_t recov_state;
4514 4508 hrtime_t t;
4515 4509 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4516 4510
4517 4511 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4518 4512 return (EIO);
4519 4513 /*
4520 4514 * Can't readlink anything other than a symbolic link.
4521 4515 */
4522 4516 if (vp->v_type != VLNK)
4523 4517 return (EINVAL);
4524 4518
4525 4519 rp = VTOR4(vp);
4526 4520 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4527 4521 e.error = nfs4_validate_caches(vp, cr);
4528 4522 if (e.error)
4529 4523 return (e.error);
4530 4524 mutex_enter(&rp->r_statelock);
4531 4525 if (rp->r_symlink.contents != NULL) {
4532 4526 e.error = uiomove(rp->r_symlink.contents,
4533 4527 rp->r_symlink.len, UIO_READ, uiop);
4534 4528 mutex_exit(&rp->r_statelock);
4535 4529 return (e.error);
4536 4530 }
4537 4531 mutex_exit(&rp->r_statelock);
4538 4532 }
4539 4533 recov_state.rs_flags = 0;
4540 4534 recov_state.rs_num_retry_despite_err = 0;
4541 4535
4542 4536 recov_retry:
4543 4537 args.array_len = 3;
4544 4538 args.array = argop;
4545 4539 args.ctag = TAG_READLINK;
4546 4540
4547 4541 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4548 4542 if (e.error) {
4549 4543 return (e.error);
4550 4544 }
4551 4545
4552 4546 /* 0. putfh symlink fh */
4553 4547 argop[0].argop = OP_CPUTFH;
4554 4548 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4555 4549
4556 4550 /* 1. readlink */
4557 4551 argop[1].argop = OP_READLINK;
4558 4552
4559 4553 /* 2. getattr */
4560 4554 argop[2].argop = OP_GETATTR;
4561 4555 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4562 4556 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4563 4557
4564 4558 doqueue = 1;
4565 4559
4566 4560 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4567 4561 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4568 4562 rnode4info(VTOR4(vp))));
4569 4563
4570 4564 t = gethrtime();
4571 4565
4572 4566 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4573 4567
4574 4568 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4575 4569 if (needrecov) {
4576 4570 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4577 4571 "nfs4_readlink: initiating recovery\n"));
4578 4572
4579 4573 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4580 4574 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4581 4575 if (!e.error)
4582 4576 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4583 4577
4584 4578 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4585 4579 needrecov);
4586 4580 goto recov_retry;
4587 4581 }
4588 4582 }
4589 4583
4590 4584 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4591 4585
4592 4586 if (e.error)
4593 4587 return (e.error);
4594 4588
4595 4589 /*
4596 4590 * There is a path in the code below which calls
4597 4591 * nfs4_purge_stale_fh(), which may generate otw calls through
4598 4592 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4599 4593 * here to avoid nfs4_start_op() deadlock.
4600 4594 */
4601 4595
4602 4596 if (res.status && (res.array_len < args.array_len)) {
4603 4597 /*
4604 4598 * either Putfh or Readlink failed
4605 4599 */
4606 4600 e.error = geterrno4(res.status);
4607 4601 nfs4_purge_stale_fh(e.error, vp, cr);
4608 4602 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4609 4603 return (e.error);
4610 4604 }
4611 4605
4612 4606 resop = &res.array[1]; /* readlink res */
4613 4607 lr_res = &resop->nfs_resop4_u.opreadlink;
4614 4608
4615 4609 /*
4616 4610 * treat symlink names as data
4617 4611 */
4618 4612 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
4619 4613 if (linkdata != NULL) {
4620 4614 int uio_len = len - 1;
4621 4615 /* len includes null byte, which we won't uiomove */
4622 4616 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4623 4617 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4624 4618 mutex_enter(&rp->r_statelock);
4625 4619 if (rp->r_symlink.contents == NULL) {
4626 4620 rp->r_symlink.contents = linkdata;
4627 4621 rp->r_symlink.len = uio_len;
4628 4622 rp->r_symlink.size = len;
4629 4623 mutex_exit(&rp->r_statelock);
4630 4624 } else {
4631 4625 mutex_exit(&rp->r_statelock);
4632 4626 kmem_free(linkdata, len);
4633 4627 }
4634 4628 } else {
4635 4629 kmem_free(linkdata, len);
4636 4630 }
4637 4631 }
4638 4632 if (res.status == NFS4_OK) {
4639 4633 resop++; /* getattr res */
4640 4634 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4641 4635 }
4642 4636 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4643 4637
4644 4638 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4645 4639
4646 4640 /*
4647 4641 * The over the wire error for attempting to readlink something
4648 4642 * other than a symbolic link is ENXIO. However, we need to
4649 4643 * return EINVAL instead of ENXIO, so we map it here.
4650 4644 */
4651 4645 return (e.error == ENXIO ? EINVAL : e.error);
4652 4646 }
4653 4647
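/*
 * Note on the length bookkeeping in nfs4_readlink() above: the len
 * returned by utf8_to_str() counts the terminating NUL, so only
 * uio_len = len - 1 bytes are copied out to the caller, while the
 * cached entry remembers both lengths (r_symlink.len = uio_len,
 * r_symlink.size = len) so the full allocation size is available when
 * the buffer is eventually kmem_free()d.
 */
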
4654 4648 /*
4655 4649 * Flush local dirty pages to stable storage on the server.
4656 4650 *
4657 4651 * If FNODSYNC is specified, then there is nothing to do because
4658 4652 * metadata changes are not cached on the client before being
4659 4653 * sent to the server.
4660 4654 */
4661 4655 /* ARGSUSED */
4662 4656 static int
4663 4657 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4664 4658 {
4665 4659 int error;
4666 4660
4667 4661 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4668 4662 return (0);
4669 4663 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4670 4664 return (EIO);
4671 4665 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4672 4666 if (!error)
4673 4667 error = VTOR4(vp)->r_error;
4674 4668 return (error);
4675 4669 }
4676 4670
4677 4671 /*
4678 4672 * Weirdness: if the file was removed or the target of a rename
4679 4673 * operation while it was open, it got renamed instead. Here we
4680 4674 * remove the renamed file.
4681 4675 */
4682 4676 /* ARGSUSED */
4683 4677 void
4684 4678 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4685 4679 {
4686 4680 rnode4_t *rp;
4687 4681
4688 4682 ASSERT(vp != DNLC_NO_VNODE);
4689 4683
4690 4684 rp = VTOR4(vp);
4691 4685
4692 4686 if (IS_SHADOW(vp, rp)) {
4693 4687 sv_inactive(vp);
4694 4688 return;
4695 4689 }
4696 4690
4697 4691 /*
4698 4692 * If this is coming from the wrong zone, we let someone in the right
4699 4693 * zone take care of it asynchronously. We can get here due to
4700 4694 * VN_RELE() being called from pageout() or fsflush(). This call may
4701 4695 * potentially turn into an expensive no-op if, for instance, v_count
4702 4696 * gets incremented in the meantime, but it's still correct.
4703 4697 */
4704 4698 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4705 4699 nfs4_async_inactive(vp, cr);
4706 4700 return;
4707 4701 }
4708 4702
4709 4703 /*
4710 4704 * Some of the cleanup steps might require over-the-wire
4711 4705 * operations. Since VOP_INACTIVE can get called as a result of
4712 4706 * other over-the-wire operations (e.g., an attribute cache update
4713 4707 * can lead to a DNLC purge), doing those steps now would lead to a
4714 4708 * nested call to the recovery framework, which can deadlock. So
4715 4709 * do any over-the-wire cleanups asynchronously, in a separate
4716 4710 * thread.
4717 4711 */
4718 4712
4719 4713 mutex_enter(&rp->r_os_lock);
4720 4714 mutex_enter(&rp->r_statelock);
4721 4715 mutex_enter(&rp->r_statev4_lock);
4722 4716
4723 4717 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4724 4718 mutex_exit(&rp->r_statev4_lock);
4725 4719 mutex_exit(&rp->r_statelock);
4726 4720 mutex_exit(&rp->r_os_lock);
4727 4721 nfs4_async_inactive(vp, cr);
4728 4722 return;
4729 4723 }
4730 4724
4731 4725 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4732 4726 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4733 4727 mutex_exit(&rp->r_statev4_lock);
4734 4728 mutex_exit(&rp->r_statelock);
4735 4729 mutex_exit(&rp->r_os_lock);
4736 4730 nfs4_async_inactive(vp, cr);
4737 4731 return;
4738 4732 }
4739 4733
4740 4734 if (rp->r_unldvp != NULL) {
4741 4735 mutex_exit(&rp->r_statev4_lock);
4742 4736 mutex_exit(&rp->r_statelock);
4743 4737 mutex_exit(&rp->r_os_lock);
4744 4738 nfs4_async_inactive(vp, cr);
4745 4739 return;
4746 4740 }
4747 4741 mutex_exit(&rp->r_statev4_lock);
4748 4742 mutex_exit(&rp->r_statelock);
4749 4743 mutex_exit(&rp->r_os_lock);
4750 4744
4751 4745 rp4_addfree(rp, cr);
4752 4746 }
4753 4747
4754 4748 /*
4755 4749 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4756 4750 * various bits of state. The caller must not refer to vp after this call.
4757 4751 */
4758 4752
4759 4753 void
4760 4754 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4761 4755 {
4762 4756 rnode4_t *rp = VTOR4(vp);
4763 4757 nfs4_recov_state_t recov_state;
4764 4758 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4765 4759 vnode_t *unldvp;
4766 4760 char *unlname;
4767 4761 cred_t *unlcred;
4768 4762 COMPOUND4args_clnt args;
4769 4763 COMPOUND4res_clnt res, *resp;
4770 4764 nfs_argop4 argop[2];
4771 4765 int doqueue;
4772 4766 #ifdef DEBUG
4773 4767 char *name;
4774 4768 #endif
4775 4769
4776 4770 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4777 4771 ASSERT(!IS_SHADOW(vp, rp));
4778 4772
4779 4773 #ifdef DEBUG
4780 4774 name = fn_name(VTOSV(vp)->sv_name);
4781 4775 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4782 4776 "release vnode %s", name));
4783 4777 kmem_free(name, MAXNAMELEN);
4784 4778 #endif
4785 4779
4786 4780 if (vp->v_type == VREG) {
4787 4781 bool_t recov_failed = FALSE;
4788 4782
4789 4783 e.error = nfs4close_all(vp, cr);
4790 4784 if (e.error) {
4791 4785 /* Check to see if recovery failed */
4792 4786 mutex_enter(&(VTOMI4(vp)->mi_lock));
4793 4787 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4794 4788 recov_failed = TRUE;
4795 4789 mutex_exit(&(VTOMI4(vp)->mi_lock));
4796 4790 if (!recov_failed) {
4797 4791 mutex_enter(&rp->r_statelock);
4798 4792 if (rp->r_flags & R4RECOVERR)
4799 4793 recov_failed = TRUE;
4800 4794 mutex_exit(&rp->r_statelock);
4801 4795 }
4802 4796 if (recov_failed) {
4803 4797 NFS4_DEBUG(nfs4_client_recov_debug,
4804 4798 (CE_NOTE, "nfs4_inactive_otw: "
4805 4799 "close failed (recovery failure)"));
4806 4800 }
4807 4801 }
4808 4802 }
4809 4803
4810 4804 redo:
4811 4805 if (rp->r_unldvp == NULL) {
4812 4806 rp4_addfree(rp, cr);
4813 4807 return;
4814 4808 }
4815 4809
4816 4810 /*
4817 4811 * Save the vnode pointer for the directory where the
4818 4812 * unlinked-open file got renamed, then set it to NULL
4819 4813 * to prevent another thread from getting here before
4820 4814 * we're done with the remove. While we have the
4821 4815 * statelock, make local copies of the pertinent rnode
4822 4816 * fields. If we weren't to do this in an atomic way,
4823 4817 * the unl* fields could become inconsistent with respect
4824 4818 * to each other due to a race condition between this
4825 4819 * code and nfs_remove(). See bug report 1034328.
4826 4820 */
4827 4821 mutex_enter(&rp->r_statelock);
4828 4822 if (rp->r_unldvp == NULL) {
4829 4823 mutex_exit(&rp->r_statelock);
4830 4824 rp4_addfree(rp, cr);
4831 4825 return;
4832 4826 }
4833 4827
4834 4828 unldvp = rp->r_unldvp;
4835 4829 rp->r_unldvp = NULL;
4836 4830 unlname = rp->r_unlname;
4837 4831 rp->r_unlname = NULL;
4838 4832 unlcred = rp->r_unlcred;
4839 4833 rp->r_unlcred = NULL;
4840 4834 mutex_exit(&rp->r_statelock);
4841 4835
4842 4836 /*
4843 4837 * If there are any dirty pages left, then flush
4844 4838 * them. This is unfortunate because they just
4845 4839 * may get thrown away during the remove operation,
4846 4840 * but we have to do this for correctness.
4847 4841 */
4848 4842 if (nfs4_has_pages(vp) &&
4849 4843 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4850 4844 ASSERT(vp->v_type != VCHR);
4851 4845 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4852 4846 if (e.error) {
4853 4847 mutex_enter(&rp->r_statelock);
4854 4848 if (!rp->r_error)
4855 4849 rp->r_error = e.error;
4856 4850 mutex_exit(&rp->r_statelock);
4857 4851 }
4858 4852 }
4859 4853
4860 4854 recov_state.rs_flags = 0;
4861 4855 recov_state.rs_num_retry_despite_err = 0;
4862 4856 recov_retry_remove:
4863 4857 /*
4864 4858 * Do the remove operation on the renamed file
4865 4859 */
4866 4860 args.ctag = TAG_INACTIVE;
4867 4861
4868 4862 /*
4869 4863 * Remove ops: putfh dir; remove
4870 4864 */
4871 4865 args.array_len = 2;
4872 4866 args.array = argop;
4873 4867
4874 4868 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4875 4869 if (e.error) {
4876 4870 kmem_free(unlname, MAXNAMELEN);
4877 4871 crfree(unlcred);
4878 4872 VN_RELE(unldvp);
4879 4873 /*
4880 4874 * Try again; this time around r_unldvp will be NULL, so we'll
4881 4875 * just call rp4_addfree() and return.
4882 4876 */
4883 4877 goto redo;
4884 4878 }
4885 4879
4886 4880 /* putfh directory */
4887 4881 argop[0].argop = OP_CPUTFH;
4888 4882 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4889 4883
4890 4884 /* remove */
4891 4885 argop[1].argop = OP_CREMOVE;
4892 4886 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4893 4887
4894 4888 doqueue = 1;
4895 4889 resp = &res;
4896 4890
4897 4891 #if 0 /* notyet */
4898 4892 /*
4899 4893 * Can't do this yet. We may be being called from
4900 4894 * dnlc_purge_XXX while that routine is holding a
4901 4895 * mutex lock to the nc_rele list. The calls to
4902 4896 * nfs3_cache_wcc_data may result in calls to
4903 4897 * dnlc_purge_XXX. This will result in a deadlock.
4904 4898 */
4905 4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4906 4900 if (e.error) {
4907 4901 PURGE_ATTRCACHE4(unldvp);
4908 4902 resp = NULL;
4909 4903 } else if (res.status) {
4910 4904 e.error = geterrno4(res.status);
4911 4905 PURGE_ATTRCACHE4(unldvp);
4912 4906 /*
4913 4907 * This code is inactive right now
4914 4908 * but if made active there should
4915 4909 		 * be an nfs4_end_op() call before
4916 4910 * nfs4_purge_stale_fh to avoid start_op()
4917 4911 * deadlock. See BugId: 4948726
4918 4912 */
4919 4913 		nfs4_purge_stale_fh(e.error, unldvp, cr);
4920 4914 } else {
4921 4915 nfs_resop4 *resop;
4922 4916 REMOVE4res *rm_res;
4923 4917
4924 4918 resop = &res.array[1];
4925 4919 rm_res = &resop->nfs_resop4_u.opremove;
4926 4920 /*
4927 4921 * Update directory cache attribute,
4928 4922 * readdir and dnlc caches.
4929 4923 */
4930 4924 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4931 4925 }
4932 4926 #else
4933 4927 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4934 4928
4935 4929 PURGE_ATTRCACHE4(unldvp);
4936 4930 #endif
4937 4931
4938 4932 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4939 4933 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4940 4934 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4941 4935 if (!e.error)
4942 4936 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4943 4937 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4944 4938 &recov_state, TRUE);
4945 4939 goto recov_retry_remove;
4946 4940 }
4947 4941 }
4948 4942 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4949 4943
4950 4944 /*
4951 4945 * Release stuff held for the remove
4952 4946 */
4953 4947 VN_RELE(unldvp);
4954 4948 if (!e.error && resp)
4955 4949 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4956 4950
4957 4951 kmem_free(unlname, MAXNAMELEN);
4958 4952 crfree(unlcred);
4959 4953 goto redo;
4960 4954 }
4961 4955
4962 4956 /*
4963 4957 * Remote file system operations having to do with directory manipulation.
4964 4958 */
4965 4959 /* ARGSUSED3 */
4966 4960 int
4967 4961 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4968 4962 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4969 4963 int *direntflags, pathname_t *realpnp)
4970 4964 {
4971 4965 int error;
4972 4966 vnode_t *vp, *avp = NULL;
4973 4967 rnode4_t *drp;
4974 4968
4975 4969 *vpp = NULL;
4976 4970 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4977 4971 return (EPERM);
4978 4972 /*
4979 4973 * if LOOKUP_XATTR, must replace dvp (object) with
4980 4974 * object's attrdir before continuing with lookup
4981 4975 */
4982 4976 if (flags & LOOKUP_XATTR) {
4983 4977 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4984 4978 if (error)
4985 4979 return (error);
4986 4980
4987 4981 dvp = avp;
4988 4982
4989 4983 /*
4990 4984 * If lookup is for "", just return dvp now. The attrdir
4991 4985 * has already been activated (from nfs4lookup_xattr), and
4992 4986 * the caller will RELE the original dvp -- not
4993 4987 * the attrdir. So, set vpp and return.
4994 4988 * Currently, when the LOOKUP_XATTR flag is
4995 4989 * passed to VOP_LOOKUP, the name is always empty, and
4996 4990 * shortcircuiting here avoids 3 unneeded lock/unlock
4997 4991 * pairs.
4998 4992 *
4999 4993 * If a non-empty name was provided, then it is the
5000 4994 * attribute name, and it will be looked up below.
5001 4995 */
5002 4996 if (*nm == '\0') {
5003 4997 *vpp = dvp;
5004 4998 return (0);
5005 4999 }
5006 5000
5007 5001 /*
5008 5002 * The vfs layer never sends a name when asking for the
5009 5003 * attrdir, so we should never get here (unless of course
5010 5004 	 * a name is passed at some time in the future -- at which time
5011 5005 * we'll blow up here).
5012 5006 */
5013 5007 ASSERT(0);
5014 5008 }
5015 5009
5016 5010 drp = VTOR4(dvp);
5017 5011 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5018 5012 return (EINTR);
5019 5013
5020 5014 error = nfs4lookup(dvp, nm, vpp, cr, 0);
5021 5015 nfs_rw_exit(&drp->r_rwlock);
5022 5016
5023 5017 /*
5024 5018 * If vnode is a device, create special vnode.
5025 5019 */
5026 5020 if (!error && ISVDEV((*vpp)->v_type)) {
5027 5021 vp = *vpp;
5028 5022 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5029 5023 VN_RELE(vp);
5030 5024 }
5031 5025
5032 5026 return (error);
5033 5027 }
5034 5028
5035 5029 /* ARGSUSED */
5036 5030 static int
5037 5031 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5038 5032 {
5039 5033 int error;
5040 5034 rnode4_t *drp;
5041 5035 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5042 5036 mntinfo4_t *mi;
5043 5037
5044 5038 mi = VTOMI4(dvp);
5045 5039 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5046 5040 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5047 5041 return (EINVAL);
5048 5042
5049 5043 drp = VTOR4(dvp);
5050 5044 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5051 5045 return (EINTR);
5052 5046
5053 5047 mutex_enter(&drp->r_statelock);
5054 5048 /*
5055 5049 * If the server doesn't support xattrs just return EINVAL
5056 5050 */
5057 5051 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5058 5052 mutex_exit(&drp->r_statelock);
5059 5053 nfs_rw_exit(&drp->r_rwlock);
5060 5054 return (EINVAL);
5061 5055 }
5062 5056
5063 5057 /*
5064 5058 * If there is a cached xattr directory entry,
5065 5059 * use it as long as the attributes are valid. If the
5066 5060 * attributes are not valid, take the simple approach and
5067 5061 * free the cached value and re-fetch a new value.
5068 5062 *
5069 5063 	 * We don't cache negative entries for now; if we did, we
5070 5064 	 * would need to check if the file has changed on every
5071 5065 	 * lookup. But xattrs don't exist very often, and failing
5072 5066 	 * an openattr is not much more expensive than an NVERIFY or GETATTR,
5073 5067 	 * so do an openattr over the wire for now.
5074 5068 */
5075 5069 if (drp->r_xattr_dir != NULL) {
5076 5070 if (ATTRCACHE4_VALID(dvp)) {
5077 5071 VN_HOLD(drp->r_xattr_dir);
5078 5072 *vpp = drp->r_xattr_dir;
5079 5073 mutex_exit(&drp->r_statelock);
5080 5074 nfs_rw_exit(&drp->r_rwlock);
5081 5075 return (0);
5082 5076 }
5083 5077 VN_RELE(drp->r_xattr_dir);
5084 5078 drp->r_xattr_dir = NULL;
5085 5079 }
5086 5080 mutex_exit(&drp->r_statelock);
5087 5081
5088 5082 error = nfs4openattr(dvp, vpp, cflag, cr);
5089 5083
5090 5084 nfs_rw_exit(&drp->r_rwlock);
5091 5085
5092 5086 return (error);
5093 5087 }
5094 5088
5095 5089 static int
5096 5090 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
5097 5091 {
5098 5092 int error;
5099 5093 rnode4_t *drp;
5100 5094
5101 5095 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5102 5096
5103 5097 /*
5104 5098 * If lookup is for "", just return dvp. Don't need
5105 5099 * to send it over the wire, look it up in the dnlc,
5106 5100 * or perform any access checks.
5107 5101 */
5108 5102 if (*nm == '\0') {
5109 5103 VN_HOLD(dvp);
5110 5104 *vpp = dvp;
5111 5105 return (0);
5112 5106 }
5113 5107
5114 5108 /*
5115 5109 * Can't do lookups in non-directories.
5116 5110 */
5117 5111 if (dvp->v_type != VDIR)
5118 5112 return (ENOTDIR);
5119 5113
5120 5114 /*
5121 5115 * If lookup is for ".", just return dvp. Don't need
5122 5116 * to send it over the wire or look it up in the dnlc,
5123 5117 * just need to check access.
5124 5118 */
5125 5119 if (nm[0] == '.' && nm[1] == '\0') {
5126 5120 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5127 5121 if (error)
5128 5122 return (error);
5129 5123 VN_HOLD(dvp);
5130 5124 *vpp = dvp;
5131 5125 return (0);
5132 5126 }
5133 5127
5134 5128 drp = VTOR4(dvp);
5135 5129 if (!(drp->r_flags & R4LOOKUP)) {
5136 5130 mutex_enter(&drp->r_statelock);
5137 5131 drp->r_flags |= R4LOOKUP;
5138 5132 mutex_exit(&drp->r_statelock);
5139 5133 }
5140 5134
5141 5135 *vpp = NULL;
5142 5136 /*
5143 5137 	 * Lookup this name in the DNLC.  If there is no
5144 5138 	 * entry, look it up over the wire.
5145 5139 */
5146 5140 if (!skipdnlc)
5147 5141 *vpp = dnlc_lookup(dvp, nm);
5148 5142 if (*vpp == NULL) {
5149 5143 /*
5150 5144 * We need to go over the wire to lookup the name.
5151 5145 */
5152 5146 return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5153 5147 }
5154 5148
5155 5149 /*
5156 5150 * We hit on the dnlc
5157 5151 */
5158 5152 if (*vpp != DNLC_NO_VNODE ||
5159 5153 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5160 5154 /*
5161 5155 * But our attrs may not be valid.
5162 5156 */
5163 5157 if (ATTRCACHE4_VALID(dvp)) {
5164 5158 error = nfs4_waitfor_purge_complete(dvp);
5165 5159 if (error) {
5166 5160 VN_RELE(*vpp);
5167 5161 *vpp = NULL;
5168 5162 return (error);
5169 5163 }
5170 5164
5171 5165 /*
5172 5166 			 * After the purge completes, check to make sure
5173 5167 * our attrs are still valid.
5174 5168 */
5175 5169 if (ATTRCACHE4_VALID(dvp)) {
5176 5170 /*
5177 5171 * If we waited for a purge we may have
5178 5172 * lost our vnode so look it up again.
5179 5173 */
5180 5174 VN_RELE(*vpp);
5181 5175 *vpp = dnlc_lookup(dvp, nm);
5182 5176 if (*vpp == NULL)
5183 5177 return (nfs4lookupnew_otw(dvp,
5184 5178 nm, vpp, cr));
5185 5179
5186 5180 /*
5187 5181 * The access cache should almost always hit
5188 5182 */
5189 5183 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5190 5184
5191 5185 if (error) {
5192 5186 VN_RELE(*vpp);
5193 5187 *vpp = NULL;
5194 5188 return (error);
5195 5189 }
5196 5190 if (*vpp == DNLC_NO_VNODE) {
5197 5191 VN_RELE(*vpp);
5198 5192 *vpp = NULL;
5199 5193 return (ENOENT);
5200 5194 }
5201 5195 return (0);
5202 5196 }
5203 5197 }
5204 5198 }
5205 5199
5206 5200 ASSERT(*vpp != NULL);
5207 5201
5208 5202 /*
5209 5203 	 * We may have gotten here because we have one of the following cases:
5210 5204 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5211 5205 * need to validate them.
5212 5206 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
5213 5207 * must validate.
5214 5208 *
5215 5209 	 * Go to the server and check if the directory has changed; if
5216 5210 	 * it hasn't, we are done and can use the dnlc entry.
5217 5211 */
5218 5212 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5219 5213 }
5220 5214
5221 5215 /*
5222 5216  * Go to the server and check if the directory has changed; if
5223 5217  * it hasn't, we are done and can use the dnlc entry. If it
5224 5218  * has changed, we get a new copy of its attributes and check
5225 5219 * the access for VEXEC, then relookup the filename and
5226 5220 * get its filehandle and attributes.
5227 5221 *
5228 5222 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5229 5223 * if the NVERIFY failed we must
5230 5224 * purge the caches
5231 5225 * cache new attributes (will set r_time_attr_inval)
5232 5226 * cache new access
5233 5227 * recheck VEXEC access
5234 5228 * add name to dnlc, possibly negative
5235 5229 * if LOOKUP succeeded
5236 5230 * cache new attributes
5237 5231 * else
5238 5232 * set a new r_time_attr_inval for dvp
5239 5233 * check to make sure we have access
5240 5234 *
5241 5235 * The vpp returned is the vnode passed in if the directory is valid,
5242 5236 * a new vnode if successful lookup, or NULL on error.
5243 5237 */
5244 5238 static int
5245 5239 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5246 5240 {
5247 5241 COMPOUND4args_clnt args;
5248 5242 COMPOUND4res_clnt res;
5249 5243 fattr4 *ver_fattr;
5250 5244 fattr4_change dchange;
5251 5245 int32_t *ptr;
5252 5246 int argoplist_size = 7 * sizeof (nfs_argop4);
5253 5247 nfs_argop4 *argop;
5254 5248 int doqueue;
5255 5249 mntinfo4_t *mi;
5256 5250 nfs4_recov_state_t recov_state;
5257 5251 hrtime_t t;
5258 5252 int isdotdot;
5259 5253 vnode_t *nvp;
5260 5254 nfs_fh4 *fhp;
5261 5255 nfs4_sharedfh_t *sfhp;
5262 5256 nfs4_access_type_t cacc;
5263 5257 rnode4_t *nrp;
5264 5258 rnode4_t *drp = VTOR4(dvp);
5265 5259 nfs4_ga_res_t *garp = NULL;
5266 5260 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5267 5261
5268 5262 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5269 5263 ASSERT(nm != NULL);
5270 5264 ASSERT(nm[0] != '\0');
5271 5265 ASSERT(dvp->v_type == VDIR);
5272 5266 ASSERT(nm[0] != '.' || nm[1] != '\0');
5273 5267 ASSERT(*vpp != NULL);
5274 5268
5275 5269 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5276 5270 isdotdot = 1;
5277 5271 args.ctag = TAG_LOOKUP_VPARENT;
5278 5272 } else {
5279 5273 /*
5280 5274 * If dvp were a stub, it should have triggered and caused
5281 5275 * a mount for us to get this far.
5282 5276 */
5283 5277 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5284 5278
5285 5279 isdotdot = 0;
5286 5280 args.ctag = TAG_LOOKUP_VALID;
5287 5281 }
5288 5282
5289 5283 mi = VTOMI4(dvp);
5290 5284 recov_state.rs_flags = 0;
5291 5285 recov_state.rs_num_retry_despite_err = 0;
5292 5286
5293 5287 nvp = NULL;
5294 5288
5295 5289 /* Save the original mount point security information */
5296 5290 (void) save_mnt_secinfo(mi->mi_curr_serv);
5297 5291
5298 5292 recov_retry:
5299 5293 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5300 5294 &recov_state, NULL);
5301 5295 if (e.error) {
5302 5296 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5303 5297 VN_RELE(*vpp);
5304 5298 *vpp = NULL;
5305 5299 return (e.error);
5306 5300 }
5307 5301
5308 5302 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5309 5303
5310 5304 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5311 5305 args.array_len = 7;
5312 5306 args.array = argop;
5313 5307
5314 5308 /* 0. putfh file */
5315 5309 argop[0].argop = OP_CPUTFH;
5316 5310 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5317 5311
5318 5312 /* 1. nverify the change info */
5319 5313 argop[1].argop = OP_NVERIFY;
5320 5314 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5321 5315 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5322 5316 ver_fattr->attrlist4 = (char *)&dchange;
5323 5317 ptr = (int32_t *)&dchange;
5324 5318 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5325 5319 ver_fattr->attrlist4_len = sizeof (fattr4_change);
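	/*
	 * The NVERIFY op asks the server to compare this cached change
	 * value with the directory's current change attribute.  If they
	 * still match, the compound stops with NFS4ERR_SAME and the
	 * cached dnlc entry can be trusted; any other outcome drives the
	 * full revalidation path below.
	 */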
5326 5320
5327 5321 /* 2. getattr directory */
5328 5322 argop[2].argop = OP_GETATTR;
5329 5323 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5330 5324 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5331 5325
5332 5326 /* 3. access directory */
5333 5327 argop[3].argop = OP_ACCESS;
5334 5328 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5335 5329 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5336 5330
5337 5331 /* 4. lookup name */
5338 5332 if (isdotdot) {
5339 5333 argop[4].argop = OP_LOOKUPP;
5340 5334 } else {
5341 5335 argop[4].argop = OP_CLOOKUP;
5342 5336 argop[4].nfs_argop4_u.opclookup.cname = nm;
5343 5337 }
5344 5338
5345 5339 /* 5. resulting file handle */
5346 5340 argop[5].argop = OP_GETFH;
5347 5341
5348 5342 /* 6. resulting file attributes */
5349 5343 argop[6].argop = OP_GETATTR;
5350 5344 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5351 5345 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5352 5346
5353 5347 doqueue = 1;
5354 5348 t = gethrtime();
5355 5349
5356 5350 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5357 5351
5358 5352 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5359 5353 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5360 5354 if (e.error != 0 && *vpp != NULL)
5361 5355 VN_RELE(*vpp);
5362 5356 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5363 5357 &recov_state, FALSE);
5364 5358 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5365 5359 kmem_free(argop, argoplist_size);
5366 5360 return (e.error);
5367 5361 }
5368 5362
5369 5363 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5370 5364 /*
5371 5365 * For WRONGSEC of a non-dotdot case, send secinfo directly
5372 5366 * from this thread, do not go thru the recovery thread since
5373 5367 * we need the nm information.
5374 5368 *
5375 5369 * Not doing dotdot case because there is no specification
5376 5370 * for (PUTFH, SECINFO "..") yet.
5377 5371 */
5378 5372 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5379 5373 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5380 5374 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5381 5375 &recov_state, FALSE);
5382 5376 else
5383 5377 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5384 5378 &recov_state, TRUE);
5385 5379 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5386 5380 kmem_free(argop, argoplist_size);
5387 5381 if (!e.error)
5388 5382 goto recov_retry;
5389 5383 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5390 5384 VN_RELE(*vpp);
5391 5385 *vpp = NULL;
5392 5386 return (e.error);
5393 5387 }
5394 5388
5395 5389 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5396 5390 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5397 5391 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5398 5392 &recov_state, TRUE);
5399 5393
5400 5394 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5401 5395 kmem_free(argop, argoplist_size);
5402 5396 goto recov_retry;
5403 5397 }
5404 5398 }
5405 5399
5406 5400 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5407 5401
5408 5402 if (e.error || res.array_len == 0) {
5409 5403 /*
5410 5404 * If e.error isn't set, then reply has no ops (or we couldn't
5411 5405 * be here). The only legal way to reply without an op array
5412 5406 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5413 5407 * be in the reply for all other status values.
5414 5408 *
5415 5409 * For valid replies without an ops array, return ENOTSUP
5416 5410 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5417 5411 * return EIO -- don't trust status.
5418 5412 */
5419 5413 if (e.error == 0)
5420 5414 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5421 5415 ENOTSUP : EIO;
5422 5416 VN_RELE(*vpp);
5423 5417 *vpp = NULL;
5424 5418 kmem_free(argop, argoplist_size);
5425 5419 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5426 5420 return (e.error);
5427 5421 }
5428 5422
5429 5423 if (res.status != NFS4ERR_SAME) {
5430 5424 e.error = geterrno4(res.status);
5431 5425
5432 5426 /*
5433 5427 		 * The NVERIFY "failed" so the directory has changed.
5434 5428 * First make sure PUTFH succeeded and NVERIFY "failed"
5435 5429 * cleanly.
5436 5430 */
5437 5431 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5438 5432 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5439 5433 nfs4_purge_stale_fh(e.error, dvp, cr);
5440 5434 VN_RELE(*vpp);
5441 5435 *vpp = NULL;
5442 5436 goto exit;
5443 5437 }
5444 5438
5445 5439 /*
5446 5440 * We know the NVERIFY "failed" so we must:
5447 5441 * purge the caches (access and indirectly dnlc if needed)
5448 5442 */
5449 5443 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5450 5444
5451 5445 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5452 5446 nfs4_purge_stale_fh(e.error, dvp, cr);
5453 5447 VN_RELE(*vpp);
5454 5448 *vpp = NULL;
5455 5449 goto exit;
5456 5450 }
5457 5451
5458 5452 /*
5459 5453 * Install new cached attributes for the directory
5460 5454 */
5461 5455 nfs4_attr_cache(dvp,
5462 5456 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5463 5457 t, cr, FALSE, NULL);
5464 5458
5465 5459 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5466 5460 nfs4_purge_stale_fh(e.error, dvp, cr);
5467 5461 VN_RELE(*vpp);
5468 5462 *vpp = NULL;
5469 5463 e.error = geterrno4(res.status);
5470 5464 goto exit;
5471 5465 }
5472 5466
5473 5467 /*
5474 5468 * Now we know the directory is valid,
5475 5469 * cache new directory access
5476 5470 */
5477 5471 nfs4_access_cache(drp,
5478 5472 args.array[3].nfs_argop4_u.opaccess.access,
5479 5473 res.array[3].nfs_resop4_u.opaccess.access, cr);
5480 5474
5481 5475 /*
5482 5476 * recheck VEXEC access
5483 5477 */
5484 5478 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5485 5479 if (cacc != NFS4_ACCESS_ALLOWED) {
5486 5480 /*
5487 5481 * Directory permissions might have been revoked
5488 5482 */
5489 5483 if (cacc == NFS4_ACCESS_DENIED) {
5490 5484 e.error = EACCES;
5491 5485 VN_RELE(*vpp);
5492 5486 *vpp = NULL;
5493 5487 goto exit;
5494 5488 }
5495 5489
5496 5490 /*
5497 5491 * Somehow we must not have asked for enough
5498 5492 			 * so try a singleton ACCESS; this should never happen.
5499 5493 */
5500 5494 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5501 5495 if (e.error) {
5502 5496 VN_RELE(*vpp);
5503 5497 *vpp = NULL;
5504 5498 goto exit;
5505 5499 }
5506 5500 }
5507 5501
5508 5502 e.error = geterrno4(res.status);
5509 5503 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5510 5504 /*
5511 5505 * The lookup failed, probably no entry
5512 5506 */
5513 5507 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5514 5508 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5515 5509 } else {
5516 5510 /*
5517 5511 * Might be some other error, so remove
5518 5512 * the dnlc entry to make sure we start all
5519 5513 * over again, next time.
5520 5514 */
5521 5515 dnlc_remove(dvp, nm);
5522 5516 }
5523 5517 VN_RELE(*vpp);
5524 5518 *vpp = NULL;
5525 5519 goto exit;
5526 5520 }
5527 5521
5528 5522 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5529 5523 /*
5530 5524 * The file exists but we can't get its fh for
5531 5525 * some unknown reason. Remove it from the dnlc
5532 5526 * and error out to be safe.
5533 5527 */
5534 5528 dnlc_remove(dvp, nm);
5535 5529 VN_RELE(*vpp);
5536 5530 *vpp = NULL;
5537 5531 goto exit;
5538 5532 }
5539 5533 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5540 5534 if (fhp->nfs_fh4_len == 0) {
5541 5535 /*
5542 5536 		 * The file exists but has a bogus fh for
5543 5537 * some unknown reason. Remove it from the dnlc
5544 5538 * and error out to be safe.
5545 5539 */
5546 5540 e.error = ENOENT;
5547 5541 dnlc_remove(dvp, nm);
5548 5542 VN_RELE(*vpp);
5549 5543 *vpp = NULL;
5550 5544 goto exit;
5551 5545 }
5552 5546 sfhp = sfh4_get(fhp, mi);
5553 5547
5554 5548 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5555 5549 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5556 5550
5557 5551 /*
5558 5552 * Make the new rnode
5559 5553 */
5560 5554 if (isdotdot) {
5561 5555 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5562 5556 if (e.error) {
5563 5557 sfh4_rele(&sfhp);
5564 5558 VN_RELE(*vpp);
5565 5559 *vpp = NULL;
5566 5560 goto exit;
5567 5561 }
5568 5562 /*
5569 5563 * XXX if nfs4_make_dotdot uses an existing rnode
5570 5564 * XXX it doesn't update the attributes.
5571 5565 * XXX for now just save them again to save an OTW
5572 5566 */
5573 5567 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5574 5568 } else {
5575 5569 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5576 5570 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5577 5571 /*
5578 5572 * If v_type == VNON, then garp was NULL because
5579 5573 * the last op in the compound failed and makenfs4node
5580 5574 * could not find the vnode for sfhp. It created
5581 5575 * a new vnode, so we have nothing to purge here.
5582 5576 */
5583 5577 if (nvp->v_type == VNON) {
5584 5578 vattr_t vattr;
5585 5579
5586 5580 vattr.va_mask = AT_TYPE;
5587 5581 /*
5588 5582 * N.B. We've already called nfs4_end_fop above.
5589 5583 */
5590 5584 e.error = nfs4getattr(nvp, &vattr, cr);
5591 5585 if (e.error) {
5592 5586 sfh4_rele(&sfhp);
5593 5587 VN_RELE(*vpp);
5594 5588 *vpp = NULL;
5595 5589 VN_RELE(nvp);
5596 5590 goto exit;
5597 5591 }
5598 5592 nvp->v_type = vattr.va_type;
5599 5593 }
5600 5594 }
5601 5595 sfh4_rele(&sfhp);
5602 5596
5603 5597 nrp = VTOR4(nvp);
5604 5598 mutex_enter(&nrp->r_statev4_lock);
5605 5599 if (!nrp->created_v4) {
5606 5600 mutex_exit(&nrp->r_statev4_lock);
5607 5601 dnlc_update(dvp, nm, nvp);
5608 5602 } else
5609 5603 mutex_exit(&nrp->r_statev4_lock);
5610 5604
5611 5605 VN_RELE(*vpp);
5612 5606 *vpp = nvp;
5613 5607 } else {
5614 5608 hrtime_t now;
5615 5609 hrtime_t delta = 0;
5616 5610
5617 5611 e.error = 0;
5618 5612
5619 5613 /*
5620 5614 * Because the NVERIFY "succeeded" we know that the
5621 5615 * directory attributes are still valid
5622 5616 * so update r_time_attr_inval
5623 5617 */
5624 5618 now = gethrtime();
5625 5619 mutex_enter(&drp->r_statelock);
5626 5620 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5627 5621 delta = now - drp->r_time_attr_saved;
5628 5622 if (delta < mi->mi_acdirmin)
5629 5623 delta = mi->mi_acdirmin;
5630 5624 else if (delta > mi->mi_acdirmax)
5631 5625 delta = mi->mi_acdirmax;
5632 5626 }
5633 5627 drp->r_time_attr_inval = now + delta;
5634 5628 mutex_exit(&drp->r_statelock);
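		/*
		 * The clamping above implements an adaptive attribute
		 * cache timeout: the longer the attributes have been
		 * stable (now - r_time_attr_saved), the further out
		 * r_time_attr_inval is pushed, bounded by acdirmin and
		 * acdirmax.  For example (illustrative values), with
		 * acdirmin = 30s and acdirmax = 60s, attributes saved
		 * 5s ago stay valid for another 30s, while attributes
		 * stable for several minutes stay valid for 60s.
		 */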
5635 5629 dnlc_update(dvp, nm, *vpp);
5636 5630
5637 5631 /*
5638 5632 * Even though we have a valid directory attr cache
5639 5633 * and dnlc entry, we may not have access.
5640 5634 * This should almost always hit the cache.
5641 5635 */
5642 5636 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5643 5637 if (e.error) {
5644 5638 VN_RELE(*vpp);
5645 5639 *vpp = NULL;
5646 5640 }
5647 5641
5648 5642 if (*vpp == DNLC_NO_VNODE) {
5649 5643 VN_RELE(*vpp);
5650 5644 *vpp = NULL;
5651 5645 e.error = ENOENT;
5652 5646 }
5653 5647 }
5654 5648
5655 5649 exit:
5656 5650 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5657 5651 kmem_free(argop, argoplist_size);
5658 5652 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5659 5653 return (e.error);
5660 5654 }
5661 5655
5662 5656 /*
5663 5657  * We need to go over the wire to look up the name, but
5664 5658  * while we are there verify that the directory has not
5665 5659  * changed; if it has, get new attributes and check access.
5666 5660 *
5667 5661 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5668 5662 * NVERIFY GETATTR ACCESS
5669 5663 *
5670 5664 * With the results:
5671 5665 * if the NVERIFY failed we must purge the caches, add new attributes,
5672 5666 * and cache new access.
5673 5667 * set a new r_time_attr_inval
5674 5668 * add name to dnlc, possibly negative
5675 5669 * if LOOKUP succeeded
5676 5670 * cache new attributes
5677 5671 */
5678 5672 static int
5679 5673 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5680 5674 {
5681 5675 COMPOUND4args_clnt args;
5682 5676 COMPOUND4res_clnt res;
5683 5677 fattr4 *ver_fattr;
5684 5678 fattr4_change dchange;
5685 5679 int32_t *ptr;
5686 5680 nfs4_ga_res_t *garp = NULL;
5687 5681 int argoplist_size = 9 * sizeof (nfs_argop4);
5688 5682 nfs_argop4 *argop;
5689 5683 int doqueue;
5690 5684 mntinfo4_t *mi;
5691 5685 nfs4_recov_state_t recov_state;
5692 5686 hrtime_t t;
5693 5687 int isdotdot;
5694 5688 vnode_t *nvp;
5695 5689 nfs_fh4 *fhp;
5696 5690 nfs4_sharedfh_t *sfhp;
5697 5691 nfs4_access_type_t cacc;
5698 5692 rnode4_t *nrp;
5699 5693 rnode4_t *drp = VTOR4(dvp);
5700 5694 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5701 5695
5702 5696 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5703 5697 ASSERT(nm != NULL);
5704 5698 ASSERT(nm[0] != '\0');
5705 5699 ASSERT(dvp->v_type == VDIR);
5706 5700 ASSERT(nm[0] != '.' || nm[1] != '\0');
5707 5701 ASSERT(*vpp == NULL);
5708 5702
5709 5703 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5710 5704 isdotdot = 1;
5711 5705 args.ctag = TAG_LOOKUP_PARENT;
5712 5706 } else {
5713 5707 /*
5714 5708 * If dvp were a stub, it should have triggered and caused
5715 5709 * a mount for us to get this far.
5716 5710 */
5717 5711 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5718 5712
5719 5713 isdotdot = 0;
5720 5714 args.ctag = TAG_LOOKUP;
5721 5715 }
5722 5716
5723 5717 mi = VTOMI4(dvp);
5724 5718 recov_state.rs_flags = 0;
5725 5719 recov_state.rs_num_retry_despite_err = 0;
5726 5720
5727 5721 nvp = NULL;
5728 5722
5729 5723 /* Save the original mount point security information */
5730 5724 (void) save_mnt_secinfo(mi->mi_curr_serv);
5731 5725
5732 5726 recov_retry:
5733 5727 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5734 5728 &recov_state, NULL);
5735 5729 if (e.error) {
5736 5730 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5737 5731 return (e.error);
5738 5732 }
5739 5733
5740 5734 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5741 5735
5742 5736 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
5743 5737 args.array_len = 9;
5744 5738 args.array = argop;
5745 5739
5746 5740 /* 0. putfh file */
5747 5741 argop[0].argop = OP_CPUTFH;
5748 5742 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5749 5743
5750 5744 /* 1. savefh for the nverify */
5751 5745 argop[1].argop = OP_SAVEFH;
5752 5746
5753 5747 /* 2. lookup name */
5754 5748 if (isdotdot) {
5755 5749 argop[2].argop = OP_LOOKUPP;
5756 5750 } else {
5757 5751 argop[2].argop = OP_CLOOKUP;
5758 5752 argop[2].nfs_argop4_u.opclookup.cname = nm;
5759 5753 }
5760 5754
5761 5755 /* 3. resulting file handle */
5762 5756 argop[3].argop = OP_GETFH;
5763 5757
5764 5758 /* 4. resulting file attributes */
5765 5759 argop[4].argop = OP_GETATTR;
5766 5760 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5767 5761 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5768 5762
5769 5763 /* 5. restorefh back the directory for the nverify */
5770 5764 argop[5].argop = OP_RESTOREFH;
5771 5765
5772 5766 /* 6. nverify the change info */
5773 5767 argop[6].argop = OP_NVERIFY;
5774 5768 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
5775 5769 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5776 5770 ver_fattr->attrlist4 = (char *)&dchange;
5777 5771 ptr = (int32_t *)&dchange;
5778 5772 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5779 5773 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5780 5774
5781 5775 /* 7. getattr directory */
5782 5776 argop[7].argop = OP_GETATTR;
5783 5777 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5784 5778 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5785 5779
5786 5780 /* 8. access directory */
5787 5781 argop[8].argop = OP_ACCESS;
5788 5782 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5789 5783 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5790 5784
5791 5785 doqueue = 1;
5792 5786 t = gethrtime();
5793 5787
5794 5788 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5795 5789
5796 5790 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5797 5791 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5798 5792 if (e.error != 0 && *vpp != NULL)
5799 5793 VN_RELE(*vpp);
5800 5794 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5801 5795 &recov_state, FALSE);
5802 5796 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5803 5797 kmem_free(argop, argoplist_size);
5804 5798 return (e.error);
5805 5799 }
5806 5800
5807 5801 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5808 5802 /*
5809 5803 * For WRONGSEC of a non-dotdot case, send secinfo directly
5810 5804 * from this thread, do not go thru the recovery thread since
5811 5805 * we need the nm information.
5812 5806 *
5813 5807 * Not doing dotdot case because there is no specification
5814 5808 * for (PUTFH, SECINFO "..") yet.
5815 5809 */
5816 5810 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5817 5811 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5818 5812 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5819 5813 &recov_state, FALSE);
5820 5814 else
5821 5815 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5822 5816 &recov_state, TRUE);
5823 5817 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5824 5818 kmem_free(argop, argoplist_size);
5825 5819 if (!e.error)
5826 5820 goto recov_retry;
5827 5821 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5828 5822 return (e.error);
5829 5823 }
5830 5824
5831 5825 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5832 5826 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5833 5827 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5834 5828 &recov_state, TRUE);
5835 5829
5836 5830 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5837 5831 kmem_free(argop, argoplist_size);
5838 5832 goto recov_retry;
5839 5833 }
5840 5834 }
5841 5835
5842 5836 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5843 5837
5844 5838 if (e.error || res.array_len == 0) {
5845 5839 /*
5846 5840 * If e.error isn't set, then reply has no ops (or we couldn't
5847 5841 * be here). The only legal way to reply without an op array
5848 5842 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5849 5843 * be in the reply for all other status values.
5850 5844 *
5851 5845 * For valid replies without an ops array, return ENOTSUP
5852 5846 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5853 5847 * return EIO -- don't trust status.
5854 5848 */
5855 5849 if (e.error == 0)
5856 5850 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5857 5851 ENOTSUP : EIO;
5858 5852
5859 5853 kmem_free(argop, argoplist_size);
5860 5854 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5861 5855 return (e.error);
5862 5856 }
5863 5857
5864 5858 e.error = geterrno4(res.status);
5865 5859
5866 5860 /*
5867 5861 * The PUTFH and SAVEFH may have failed.
5868 5862 */
5869 5863 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5870 5864 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
5871 5865 nfs4_purge_stale_fh(e.error, dvp, cr);
5872 5866 goto exit;
5873 5867 }
5874 5868
5875 5869 	 * Check if the file exists; if it does, delay entering it
5876 5870 	 * into the dnlc until after we update the directory
5877 5871 	 * attributes, so we don't cause it to get purged immediately.
5878 5872 * attributes so we don't cause it to get purged immediately.
5879 5873 */
5880 5874 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
5881 5875 /*
5882 5876 * The lookup failed, probably no entry
5883 5877 */
5884 5878 if (e.error == ENOENT && nfs4_lookup_neg_cache)
5885 5879 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5886 5880 goto exit;
5887 5881 }
5888 5882
5889 5883 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5890 5884 /*
5891 5885 * The file exists but we can't get its fh for
5892 5886 * some unknown reason. Error out to be safe.
5893 5887 */
5894 5888 goto exit;
5895 5889 }
5896 5890
5897 5891 fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5898 5892 if (fhp->nfs_fh4_len == 0) {
5899 5893 /*
5900 5894 		 * The file exists but has a bogus fh for
5901 5895 * some unknown reason. Error out to be safe.
5902 5896 */
5903 5897 e.error = EIO;
5904 5898 goto exit;
5905 5899 }
5906 5900 sfhp = sfh4_get(fhp, mi);
5907 5901
5908 5902 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5909 5903 sfh4_rele(&sfhp);
5910 5904 goto exit;
5911 5905 }
5912 5906 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5913 5907
5914 5908 /*
5915 5909 * The RESTOREFH may have failed
5916 5910 */
5917 5911 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5918 5912 sfh4_rele(&sfhp);
5919 5913 e.error = EIO;
5920 5914 goto exit;
5921 5915 }
5922 5916
5923 5917 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5924 5918 /*
5925 5919 		 * First make sure the NVERIFY failed as we expected;
5926 5920 		 * if it didn't, then be conservative and error out,
5927 5921 		 * as we can't trust the directory.
5928 5922 */
5929 5923 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5930 5924 sfh4_rele(&sfhp);
5931 5925 e.error = EIO;
5932 5926 goto exit;
5933 5927 }
5934 5928
5935 5929 /*
5936 5930 		 * We know the NVERIFY "failed", so the directory has changed
5937 5931 		 * and we must:
5938 5932 * purge the caches (access and indirectly dnlc if needed)
5939 5933 */
5940 5934 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5941 5935
5942 5936 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5943 5937 sfh4_rele(&sfhp);
5944 5938 goto exit;
5945 5939 }
5946 5940 nfs4_attr_cache(dvp,
5947 5941 &res.array[7].nfs_resop4_u.opgetattr.ga_res,
5948 5942 t, cr, FALSE, NULL);
5949 5943
5950 5944 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5951 5945 nfs4_purge_stale_fh(e.error, dvp, cr);
5952 5946 sfh4_rele(&sfhp);
5953 5947 e.error = geterrno4(res.status);
5954 5948 goto exit;
5955 5949 }
5956 5950
5957 5951 /*
5958 5952 * Now we know the directory is valid,
5959 5953 * cache new directory access
5960 5954 */
5961 5955 nfs4_access_cache(drp,
5962 5956 args.array[8].nfs_argop4_u.opaccess.access,
5963 5957 res.array[8].nfs_resop4_u.opaccess.access, cr);
5964 5958
5965 5959 /*
5966 5960 * recheck VEXEC access
5967 5961 */
5968 5962 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5969 5963 if (cacc != NFS4_ACCESS_ALLOWED) {
5970 5964 /*
5971 5965 * Directory permissions might have been revoked
5972 5966 */
5973 5967 if (cacc == NFS4_ACCESS_DENIED) {
5974 5968 sfh4_rele(&sfhp);
5975 5969 e.error = EACCES;
5976 5970 goto exit;
5977 5971 }
5978 5972
5979 5973 /*
5980 5974 * Somehow we must not have asked for enough
5981 5975 			 * so try a singleton ACCESS; this should never happen.
5982 5976 */
5983 5977 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5984 5978 if (e.error) {
5985 5979 sfh4_rele(&sfhp);
5986 5980 goto exit;
5987 5981 }
5988 5982 }
5989 5983
5990 5984 e.error = geterrno4(res.status);
5991 5985 } else {
5992 5986 hrtime_t now;
5993 5987 hrtime_t delta = 0;
5994 5988
5995 5989 e.error = 0;
5996 5990
5997 5991 /*
5998 5992 * Because the NVERIFY "succeeded" we know that the
5999 5993 * directory attributes are still valid
6000 5994 * so update r_time_attr_inval
6001 5995 */
6002 5996 now = gethrtime();
6003 5997 mutex_enter(&drp->r_statelock);
6004 5998 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
6005 5999 delta = now - drp->r_time_attr_saved;
6006 6000 if (delta < mi->mi_acdirmin)
6007 6001 delta = mi->mi_acdirmin;
6008 6002 else if (delta > mi->mi_acdirmax)
6009 6003 delta = mi->mi_acdirmax;
6010 6004 }
6011 6005 drp->r_time_attr_inval = now + delta;
6012 6006 mutex_exit(&drp->r_statelock);
6013 6007
6014 6008 /*
6015 6009 * Even though we have a valid directory attr cache,
6016 6010 * we may not have access.
6017 6011 * This should almost always hit the cache.
6018 6012 */
6019 6013 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
6020 6014 if (e.error) {
6021 6015 sfh4_rele(&sfhp);
6022 6016 goto exit;
6023 6017 }
6024 6018 }
6025 6019
6026 6020 /*
6027 6021 	 * Now we have successfully completed the lookup; if the
6028 6022 	 * directory has changed, we now have valid attributes.
6029 6023 * We also know we have directory access.
6030 6024 * Create the new rnode and insert it in the dnlc.
6031 6025 */
6032 6026 if (isdotdot) {
6033 6027 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
6034 6028 if (e.error) {
6035 6029 sfh4_rele(&sfhp);
6036 6030 goto exit;
6037 6031 }
6038 6032 /*
6039 6033 * XXX if nfs4_make_dotdot uses an existing rnode
6040 6034 * XXX it doesn't update the attributes.
6041 6035 * XXX for now just save them again to save an OTW
6042 6036 */
6043 6037 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
6044 6038 } else {
6045 6039 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
6046 6040 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
6047 6041 }
6048 6042 sfh4_rele(&sfhp);
6049 6043
6050 6044 nrp = VTOR4(nvp);
6051 6045 mutex_enter(&nrp->r_statev4_lock);
6052 6046 if (!nrp->created_v4) {
6053 6047 mutex_exit(&nrp->r_statev4_lock);
6054 6048 dnlc_update(dvp, nm, nvp);
6055 6049 } else
6056 6050 mutex_exit(&nrp->r_statev4_lock);
6057 6051
6058 6052 *vpp = nvp;
6059 6053
6060 6054 exit:
6061 6055 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6062 6056 kmem_free(argop, argoplist_size);
6063 6057 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
6064 6058 return (e.error);
6065 6059 }
6066 6060
6067 6061 #ifdef DEBUG
6068 6062 void
6069 6063 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
6070 6064 {
6071 6065 uint_t i, len;
6072 6066 zoneid_t zoneid = getzoneid();
6073 6067 char *s;
6074 6068
6075 6069 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
6076 6070 for (i = 0; i < argcnt; i++) {
6077 6071 nfs_argop4 *op = &argbase[i];
6078 6072 switch (op->argop) {
6079 6073 case OP_CPUTFH:
6080 6074 case OP_PUTFH:
6081 6075 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
6082 6076 break;
6083 6077 case OP_PUTROOTFH:
6084 6078 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
6085 6079 break;
6086 6080 case OP_CLOOKUP:
6087 6081 s = op->nfs_argop4_u.opclookup.cname;
6088 6082 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6089 6083 break;
6090 6084 case OP_LOOKUP:
6091 6085 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
6092 6086 &len, NULL);
6093 6087 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6094 6088 kmem_free(s, len);
6095 6089 break;
6096 6090 case OP_LOOKUPP:
6097 6091 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
6098 6092 break;
6099 6093 case OP_GETFH:
6100 6094 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
6101 6095 break;
6102 6096 case OP_GETATTR:
6103 6097 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
6104 6098 break;
6105 6099 case OP_OPENATTR:
6106 6100 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
6107 6101 break;
6108 6102 default:
6109 6103 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
6110 6104 op->argop);
6111 6105 break;
6112 6106 }
6113 6107 }
6114 6108 }
6115 6109 #endif
6116 6110
6117 6111 /*
6118 6112 * nfs4lookup_setup - constructs a multi-lookup compound request.
6119 6113 *
6120 6114 * Given the path "nm1/nm2/.../nmn", the following compound requests
6121 6115 * may be created:
6122 6116 *
6123 6117  * Note: Getfh is not needed because the filehandle attr is mandatory, but it
6124 6118 * is faster, for now.
6125 6119 *
6126 6120 * l4_getattrs indicates the type of compound requested.
6127 6121 *
6128 6122  * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
6129 6123 *
6130 6124 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6131 6125 *
6132 6126 * total number of ops is n + 1.
6133 6127 *
6134 6128 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6135 6129 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6136 6130 * before the last component, and only get attributes
6137 6131 * for the last component. Note that the second-to-last
6138 6132 * pathname component is XATTR_RPATH, which does NOT go
6139 6133 * over-the-wire as a lookup.
6140 6134 *
6141 6135 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6142 6136 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6143 6137 *
6144 6138 * and total number of ops is n + 5.
6145 6139 *
6146 6140 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6147 6141 * attribute directory: create lookups plus an OPENATTR
6148 6142 * replacing the last lookup. Note that the last pathname
6149 6143 * component is XATTR_RPATH, which does NOT go over-the-wire
6150 6144 * as a lookup.
6151 6145 *
6152 6146 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6153 6147 * Openattr; Getfh; Getattr }
6154 6148 *
6155 6149 * and total number of ops is n + 5.
6156 6150 *
6157 6151 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6158 6152 * nodes too.
6159 6153 *
6160 6154 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6161 6155 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6162 6156 *
6163 6157 * and total number of ops is 3*n + 1.
6164 6158 *
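 * As a worked example (the component names are illustrative): for the
 * path "a/b/c" with LKP4_ALL_ATTRIBUTES, n = 3 and the compound is
 *
 *	compound { Put*fh; Lookup {a}; Getfh; Getattr;
 *		Lookup {b}; Getfh; Getattr;
 *		Lookup {c}; Getfh; Getattr }
 *
 * for a total of 3*3 + 1 = 10 ops.
 *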
6165 6159 * All cases: returns the index in the arg array of the final LOOKUP op, or
6166 6160 * -1 if no LOOKUPs were used.
6167 6161 */
6168 6162 int
6169 6163 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6170 6164 {
6171 6165 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6172 6166 nfs_argop4 *argbase, *argop;
6173 6167 int arglen, argcnt;
6174 6168 int n = 1; /* number of components */
6175 6169 int nga = 1; /* number of Getattr's in request */
6176 6170 char c = '\0', *s, *p;
6177 6171 int lookup_idx = -1;
6178 6172 int argoplist_size;
6179 6173
6180 6174 /* set lookuparg response result to 0 */
6181 6175 lookupargp->resp->status = NFS4_OK;
6182 6176
6183 6177 	/* skip leading "/" or "." components, e.g. ".//./", if present */
6184 6178 for (; ; nm++) {
6185 6179 if (*nm != '/' && *nm != '.')
6186 6180 break;
6187 6181
6188 6182 /* ".." is counted as 1 component */
6189 6183 if (*nm == '.' && *(nm + 1) != '/')
6190 6184 break;
6191 6185 }
6192 6186
6193 6187 /*
6194 6188 * Find n = number of components - nm must be null terminated
6195 6189 * Skip "." components.
6196 6190 */
6197 6191 if (*nm != '\0')
6198 6192 for (n = 1, s = nm; *s != '\0'; s++) {
6199 6193 if ((*s == '/') && (*(s + 1) != '/') &&
6200 6194 (*(s + 1) != '\0') &&
6201 6195 !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6202 6196 *(s + 2) == '\0')))
6203 6197 n++;
6204 6198 }
6205 6199 else
6206 6200 n = 0;
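	/*
	 * For example (illustrative), nm = "a/./b//c" yields n = 3: the
	 * "." component and the doubled slash are both rejected by the
	 * tests above, so only the separators before "b" and "c" count
	 * beyond the initial component.
	 */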
6207 6201
6208 6202 /*
6209 6203 * nga is number of components that need Getfh+Getattr
6210 6204 */
6211 6205 switch (l4_getattrs) {
6212 6206 case LKP4_NO_ATTRIBUTES:
6213 6207 nga = 0;
6214 6208 break;
6215 6209 case LKP4_ALL_ATTRIBUTES:
6216 6210 nga = n;
6217 6211 /*
6218 6212 * Always have at least 1 getfh, getattr pair
6219 6213 */
6220 6214 if (nga == 0)
6221 6215 nga++;
6222 6216 break;
6223 6217 case LKP4_LAST_ATTRDIR:
6224 6218 case LKP4_LAST_NAMED_ATTR:
6225 6219 nga = n+1;
6226 6220 break;
6227 6221 }
6228 6222
6229 6223 /*
6230 6224 	 * Each attr fetch is a Getfh+Getattr pair, hence the doubling; if we
6231 6225 	 * change to use the filehandle attr instead of getfh, this can be deleted.
6232 6226 */
6233 6227 nga *= 2;
6234 6228
6235 6229 /*
6236 6230 * calculate number of ops in request as
6237 6231 * header + trailer + lookups + getattrs
6238 6232 */
6239 6233 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6240 6234
6241 6235 argoplist_size = arglen * sizeof (nfs_argop4);
6242 6236 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6243 6237 lookupargp->argsp->array = argop;
6244 6238
6245 6239 argcnt = lookupargp->header_len;
6246 6240 argop += argcnt;
6247 6241
6248 6242 /*
6249 6243 * loop and create a lookup op and possibly getattr/getfh for
6250 6244 * each component. Skip "." components.
6251 6245 */
6252 6246 for (s = nm; *s != '\0'; s = p) {
6253 6247 /*
6254 6248 		 * Isolate the next pathname component
6255 6249 */
6256 6250 while (*s == '/')
6257 6251 s++;
6258 6252 if (*s == '\0')
6259 6253 break;
6260 6254
6261 6255 for (p = s; (*p != '/') && (*p != '\0'); p++)
6262 6256 ;
6263 6257 c = *p;
6264 6258 *p = '\0';
6265 6259
6266 6260 if (s[0] == '.' && s[1] == '\0') {
6267 6261 *p = c;
6268 6262 continue;
6269 6263 }
6270 6264 if (l4_getattrs == LKP4_LAST_ATTRDIR &&
6271 6265 strcmp(s, XATTR_RPATH) == 0) {
6272 6266 /* getfh XXX may not be needed in future */
6273 6267 argop->argop = OP_GETFH;
6274 6268 argop++;
6275 6269 argcnt++;
6276 6270
6277 6271 /* getattr */
6278 6272 argop->argop = OP_GETATTR;
6279 6273 argop->nfs_argop4_u.opgetattr.attr_request =
6280 6274 lookupargp->ga_bits;
6281 6275 argop->nfs_argop4_u.opgetattr.mi =
6282 6276 lookupargp->mi;
6283 6277 argop++;
6284 6278 argcnt++;
6285 6279
6286 6280 /* openattr */
6287 6281 argop->argop = OP_OPENATTR;
6288 6282 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
6289 6283 strcmp(s, XATTR_RPATH) == 0) {
6290 6284 /* openattr */
6291 6285 argop->argop = OP_OPENATTR;
6292 6286 argop++;
6293 6287 argcnt++;
6294 6288
6295 6289 /* getfh XXX may not be needed in future */
6296 6290 argop->argop = OP_GETFH;
6297 6291 argop++;
6298 6292 argcnt++;
6299 6293
6300 6294 /* getattr */
6301 6295 argop->argop = OP_GETATTR;
6302 6296 argop->nfs_argop4_u.opgetattr.attr_request =
6303 6297 lookupargp->ga_bits;
6304 6298 argop->nfs_argop4_u.opgetattr.mi =
6305 6299 lookupargp->mi;
6306 6300 argop++;
6307 6301 argcnt++;
6308 6302 *p = c;
6309 6303 continue;
6310 6304 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
6311 6305 /* lookupp */
6312 6306 argop->argop = OP_LOOKUPP;
6313 6307 } else {
6314 6308 /* lookup */
6315 6309 argop->argop = OP_LOOKUP;
6316 6310 (void) str_to_utf8(s,
6317 6311 &argop->nfs_argop4_u.oplookup.objname);
6318 6312 }
6319 6313 lookup_idx = argcnt;
6320 6314 argop++;
6321 6315 argcnt++;
6322 6316
6323 6317 *p = c;
6324 6318
6325 6319 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
6326 6320 /* getfh XXX may not be needed in future */
6327 6321 argop->argop = OP_GETFH;
6328 6322 argop++;
6329 6323 argcnt++;
6330 6324
6331 6325 /* getattr */
6332 6326 argop->argop = OP_GETATTR;
6333 6327 argop->nfs_argop4_u.opgetattr.attr_request =
6334 6328 lookupargp->ga_bits;
6335 6329 argop->nfs_argop4_u.opgetattr.mi =
6336 6330 lookupargp->mi;
6337 6331 argop++;
6338 6332 argcnt++;
6339 6333 }
6340 6334 }
6341 6335
6342 6336 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
6343 6337 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
6344 6338 if (needgetfh) {
6345 6339 /* stick in a post-lookup getfh */
6346 6340 argop->argop = OP_GETFH;
6347 6341 argcnt++;
6348 6342 argop++;
6349 6343 }
6350 6344 /* post-lookup getattr */
6351 6345 argop->argop = OP_GETATTR;
6352 6346 argop->nfs_argop4_u.opgetattr.attr_request =
6353 6347 lookupargp->ga_bits;
6354 6348 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
6355 6349 argcnt++;
6356 6350 }
6357 6351 argcnt += lookupargp->trailer_len; /* actual op count */
6358 6352 lookupargp->argsp->array_len = argcnt;
6359 6353 lookupargp->arglen = arglen;
6360 6354
6361 6355 #ifdef DEBUG
6362 6356 if (nfs4_client_lookup_debug)
6363 6357 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
6364 6358 #endif
6365 6359
6366 6360 return (lookup_idx);
6367 6361 }
6368 6362
6369 6363 static int
6370 6364 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6371 6365 {
6372 6366 COMPOUND4args_clnt args;
6373 6367 COMPOUND4res_clnt res;
6374 6368 GETFH4res *gf_res = NULL;
6375 6369 nfs_argop4 argop[4];
6376 6370 nfs_resop4 *resop = NULL;
6377 6371 nfs4_sharedfh_t *sfhp;
6378 6372 hrtime_t t;
6379 6373 nfs4_error_t e;
6380 6374
6381 6375 rnode4_t *drp;
6382 6376 int doqueue = 1;
6383 6377 vnode_t *vp;
6384 6378 int needrecov = 0;
6385 6379 nfs4_recov_state_t recov_state;
6386 6380
6387 6381 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6388 6382
6389 6383 *avp = NULL;
6390 6384 recov_state.rs_flags = 0;
6391 6385 recov_state.rs_num_retry_despite_err = 0;
6392 6386
6393 6387 recov_retry:
6394 6388 /* COMPOUND: putfh, openattr, getfh, getattr */
6395 6389 args.array_len = 4;
6396 6390 args.array = argop;
6397 6391 args.ctag = TAG_OPENATTR;
6398 6392
6399 6393 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6400 6394 if (e.error)
6401 6395 return (e.error);
6402 6396
6403 6397 drp = VTOR4(dvp);
6404 6398
6405 6399 /* putfh */
6406 6400 argop[0].argop = OP_CPUTFH;
6407 6401 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6408 6402
6409 6403 /* openattr */
6410 6404 argop[1].argop = OP_OPENATTR;
6411 6405 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6412 6406
6413 6407 /* getfh */
6414 6408 argop[2].argop = OP_GETFH;
6415 6409
6416 6410 /* getattr */
6417 6411 argop[3].argop = OP_GETATTR;
6418 6412 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6419 6413 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6420 6414
6421 6415 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6422 6416 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6423 6417 rnode4info(drp)));
6424 6418
6425 6419 t = gethrtime();
6426 6420
6427 6421 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6428 6422
6429 6423 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6430 6424 if (needrecov) {
6431 6425 bool_t abort;
6432 6426
6433 6427 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6434 6428 "nfs4openattr: initiating recovery\n"));
6435 6429
6436 6430 abort = nfs4_start_recovery(&e,
6437 6431 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6438 6432 OP_OPENATTR, NULL, NULL, NULL);
6439 6433 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6440 6434 if (!e.error) {
6441 6435 e.error = geterrno4(res.status);
6442 6436 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6443 6437 }
6444 6438 if (abort == FALSE)
6445 6439 goto recov_retry;
6446 6440 return (e.error);
6447 6441 }
6448 6442
6449 6443 if (e.error) {
6450 6444 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6451 6445 return (e.error);
6452 6446 }
6453 6447
6454 6448 if (res.status) {
6455 6449 /*
6456 6450 		 * If the OTW error is NOTSUPP, then it should be
6457 6451 * translated to EINVAL. All Solaris file system
6458 6452 * implementations return EINVAL to the syscall layer
6459 6453 * when the attrdir cannot be created due to an
6460 6454 * implementation restriction or noxattr mount option.
6461 6455 */
6462 6456 if (res.status == NFS4ERR_NOTSUPP) {
6463 6457 mutex_enter(&drp->r_statelock);
6464 6458 if (drp->r_xattr_dir)
6465 6459 VN_RELE(drp->r_xattr_dir);
6466 6460 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6467 6461 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6468 6462 mutex_exit(&drp->r_statelock);
6469 6463
6470 6464 e.error = EINVAL;
6471 6465 } else {
6472 6466 e.error = geterrno4(res.status);
6473 6467 }
6474 6468
6475 6469 if (e.error) {
6476 6470 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6477 6471 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6478 6472 needrecov);
6479 6473 return (e.error);
6480 6474 }
6481 6475 }
6482 6476
6483 6477 resop = &res.array[0]; /* putfh res */
6484 6478 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6485 6479
6486 6480 resop = &res.array[1]; /* openattr res */
6487 6481 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6488 6482
6489 6483 resop = &res.array[2]; /* getfh res */
6490 6484 gf_res = &resop->nfs_resop4_u.opgetfh;
6491 6485 if (gf_res->object.nfs_fh4_len == 0) {
6492 6486 *avp = NULL;
6493 6487 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6494 6488 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6495 6489 return (ENOENT);
6496 6490 }
6497 6491
6498 6492 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6499 6493 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6500 6494 dvp->v_vfsp, t, cr, dvp,
6501 6495 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6502 6496 sfh4_rele(&sfhp);
6503 6497
6504 6498 if (e.error)
6505 6499 PURGE_ATTRCACHE4(vp);
6506 6500
6507 6501 mutex_enter(&vp->v_lock);
6508 6502 vp->v_flag |= V_XATTRDIR;
6509 6503 mutex_exit(&vp->v_lock);
6510 6504
6511 6505 *avp = vp;
6512 6506
6513 6507 mutex_enter(&drp->r_statelock);
6514 6508 if (drp->r_xattr_dir)
6515 6509 VN_RELE(drp->r_xattr_dir);
6516 6510 VN_HOLD(vp);
6517 6511 drp->r_xattr_dir = vp;
6518 6512
6519 6513 /*
6520 6514 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6521 6515 * NULL. xattrs could be created at any time, and we have no
6522 6516 * way to update pc4_xattr_exists in the base object if/when
6523 6517 * it happens.
6524 6518 */
6525 6519 drp->r_pathconf.pc4_xattr_valid = 0;
6526 6520
6527 6521 mutex_exit(&drp->r_statelock);
6528 6522
6529 6523 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6530 6524
6531 6525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6532 6526
6533 6527 return (0);
6534 6528 }
6535 6529
6536 6530 /* ARGSUSED */
6537 6531 static int
6538 6532 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6539 6533 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6540 6534 vsecattr_t *vsecp)
6541 6535 {
6542 6536 int error;
6543 6537 vnode_t *vp = NULL;
6544 6538 rnode4_t *rp;
6545 6539 struct vattr vattr;
6546 6540 rnode4_t *drp;
6547 6541 vnode_t *tempvp;
6548 6542 enum createmode4 createmode;
6549 6543 bool_t must_trunc = FALSE;
6550 6544 int truncating = 0;
6551 6545
6552 6546 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6553 6547 return (EPERM);
6554 6548 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6555 6549 return (EINVAL);
6556 6550 }
6557 6551
6558 6552 /* . and .. have special meaning in the protocol, reject them. */
6559 6553
6560 6554 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6561 6555 return (EISDIR);
6562 6556
6563 6557 drp = VTOR4(dvp);
6564 6558
6565 6559 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6566 6560 return (EINTR);
6567 6561
6568 6562 top:
6569 6563 /*
6570 6564 * We make a copy of the attributes because the caller does not
6571 6565 * expect us to change what va points to.
6572 6566 */
6573 6567 vattr = *va;
6574 6568
6575 6569 /*
6576 6570 * If the pathname is "", then dvp is the root vnode of
6577 6571 * a remote file mounted over a local directory.
6578 6572 * All that needs to be done is access
6579 6573 * checking and truncation. Note that we avoid doing
6580 6574 * open w/ create because the parent directory might
6581 6575 * be in pseudo-fs and the open would fail.
6582 6576 */
6583 6577 if (*nm == '\0') {
6584 6578 error = 0;
6585 6579 VN_HOLD(dvp);
6586 6580 vp = dvp;
6587 6581 must_trunc = TRUE;
6588 6582 } else {
6589 6583 /*
6590 6584 * We need to go over the wire, just to be sure whether the
6591 6585 * file exists or not. Using the DNLC can be dangerous in
6592 6586 * this case when making a decision regarding existence.
6593 6587 */
6594 6588 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6595 6589 }
6596 6590
6597 6591 if (exclusive)
6598 6592 createmode = EXCLUSIVE4;
6599 6593 else
6600 6594 createmode = GUARDED4;
6601 6595
6602 6596 /*
6603 6597 * error would be set if the file does not exist on the
6604 6598	 * server, so let's go create it.
6605 6599 */
6606 6600 if (error) {
6607 6601 goto create_otw;
6608 6602 }
6609 6603
6610 6604 /*
6611 6605 * File does exist on the server
6612 6606 */
6613 6607 if (exclusive == EXCL)
6614 6608 error = EEXIST;
6615 6609 else if (vp->v_type == VDIR && (mode & VWRITE))
6616 6610 error = EISDIR;
6617 6611 else {
6618 6612 /*
6619 6613 * If vnode is a device, create special vnode.
6620 6614 */
6621 6615 if (ISVDEV(vp->v_type)) {
6622 6616 tempvp = vp;
6623 6617 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6624 6618 VN_RELE(tempvp);
6625 6619 }
6626 6620 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6627 6621 if ((vattr.va_mask & AT_SIZE) &&
6628 6622 vp->v_type == VREG) {
6629 6623 rp = VTOR4(vp);
6630 6624 /*
6631 6625 * Check here for large file handled
6632 6626 * by LF-unaware process (as
6633 6627 * ufs_create() does)
6634 6628 */
6635 6629 if (!(flags & FOFFMAX)) {
6636 6630 mutex_enter(&rp->r_statelock);
6637 6631 if (rp->r_size > MAXOFF32_T)
6638 6632 error = EOVERFLOW;
6639 6633 mutex_exit(&rp->r_statelock);
6640 6634 }
6641 6635
6642 6636 /* if error is set then we need to return */
6643 6637 if (error) {
6644 6638 nfs_rw_exit(&drp->r_rwlock);
6645 6639 VN_RELE(vp);
6646 6640 return (error);
6647 6641 }
6648 6642
6649 6643 if (must_trunc) {
6650 6644 vattr.va_mask = AT_SIZE;
6651 6645 error = nfs4setattr(vp, &vattr, 0, cr,
6652 6646 NULL);
6653 6647 } else {
6654 6648 /*
6655 6649 * we know we have a regular file that already
6656 6650 * exists and we may end up truncating the file
6657 6651 * as a result of the open_otw, so flush out
6658 6652 * any dirty pages for this file first.
6659 6653 */
6660 6654 if (nfs4_has_pages(vp) &&
6661 6655 ((rp->r_flags & R4DIRTY) ||
6662 6656 rp->r_count > 0 ||
6663 6657 rp->r_mapcnt > 0)) {
6664 6658 error = nfs4_putpage(vp,
6665 6659 (offset_t)0, 0, 0, cr, ct);
6666 6660 if (error && (error == ENOSPC ||
6667 6661 error == EDQUOT)) {
6668 6662 mutex_enter(
6669 6663 &rp->r_statelock);
6670 6664 if (!rp->r_error)
6671 6665 rp->r_error =
6672 6666 error;
6673 6667 mutex_exit(
6674 6668 &rp->r_statelock);
6675 6669 }
6676 6670 }
6677 6671 vattr.va_mask = (AT_SIZE |
6678 6672 AT_TYPE | AT_MODE);
6679 6673 vattr.va_type = VREG;
6680 6674 createmode = UNCHECKED4;
6681 6675 truncating = 1;
6682 6676 goto create_otw;
6683 6677 }
6684 6678 }
6685 6679 }
6686 6680 }
6687 6681 nfs_rw_exit(&drp->r_rwlock);
6688 6682 if (error) {
6689 6683 VN_RELE(vp);
6690 6684 } else {
6691 6685 vnode_t *tvp;
6692 6686 rnode4_t *trp;
6693 6687 tvp = vp;
6694 6688 if (vp->v_type == VREG) {
6695 6689 trp = VTOR4(vp);
6696 6690 if (IS_SHADOW(vp, trp))
6697 6691 tvp = RTOV4(trp);
6698 6692 }
6699 6693
6700 6694 if (must_trunc) {
6701 6695 /*
6702 6696 * existing file got truncated, notify.
6703 6697 */
6704 6698 vnevent_create(tvp, ct);
6705 6699 }
6706 6700
6707 6701 *vpp = vp;
6708 6702 }
6709 6703 return (error);
6710 6704
6711 6705 create_otw:
6712 6706 dnlc_remove(dvp, nm);
6713 6707
6714 6708 ASSERT(vattr.va_mask & AT_TYPE);
6715 6709
6716 6710 /*
6717 6711 * If not a regular file let nfs4mknod() handle it.
6718 6712 */
6719 6713 if (vattr.va_type != VREG) {
6720 6714 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6721 6715 nfs_rw_exit(&drp->r_rwlock);
6722 6716 return (error);
6723 6717 }
6724 6718
6725 6719 /*
6726 6720 * It _is_ a regular file.
6727 6721 */
6728 6722 ASSERT(vattr.va_mask & AT_MODE);
6729 6723 if (MANDMODE(vattr.va_mode)) {
6730 6724 nfs_rw_exit(&drp->r_rwlock);
6731 6725 return (EACCES);
6732 6726 }
6733 6727
6734 6728 /*
6735 6729 * If this happens to be a mknod of a regular file, then flags will
6736 6730	 * have neither FREAD nor FWRITE. However, we must set at least one
6737 6731 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6738 6732 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6739 6733 * set (based on openmode specified by app).
6740 6734 */
6741 6735 if ((flags & (FREAD|FWRITE)) == 0)
6742 6736 flags |= (FREAD|FWRITE);
6743 6737
6744 6738 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6745 6739
6746 6740 if (vp != NULL) {
6747 6741 /* if create was successful, throw away the file's pages */
6748 6742 if (!error && (vattr.va_mask & AT_SIZE))
6749 6743 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6750 6744 cr);
6751 6745 /* release the lookup hold */
6752 6746 VN_RELE(vp);
6753 6747 vp = NULL;
6754 6748 }
6755 6749
6756 6750 /*
6757 6751 * validate that we opened a regular file. This handles a misbehaving
6758 6752 * server that returns an incorrect FH.
6759 6753 */
6760 6754 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6761 6755 error = EISDIR;
6762 6756 VN_RELE(*vpp);
6763 6757 }
6764 6758
6765 6759 /*
6766 6760 * If this is not an exclusive create, then the CREATE
6767 6761 * request will be made with the GUARDED mode set. This
6768 6762 * means that the server will return EEXIST if the file
6769 6763 * exists. The file could exist because of a retransmitted
6770 6764 * request. In this case, we recover by starting over and
6771 6765 * checking to see whether the file exists. This second
6772 6766 * time through it should and a CREATE request will not be
6773 6767 * sent.
6774 6768 *
6775 6769 * This handles the problem of a dangling CREATE request
6776 6770 * which contains attributes which indicate that the file
6777 6771 * should be truncated. This retransmitted request could
6778 6772 * possibly truncate valid data in the file if not caught
6779 6773 * by the duplicate request mechanism on the server or if
6780 6774 * not caught by other means. The scenario is:
6781 6775 *
6782 6776 * Client transmits CREATE request with size = 0
6783 6777 * Client times out, retransmits request.
6784 6778 * Response to the first request arrives from the server
6785 6779 * and the client proceeds on.
6786 6780 * Client writes data to the file.
6787 6781 * The server now processes retransmitted CREATE request
6788 6782 * and truncates file.
6789 6783 *
6790 6784 * The use of the GUARDED CREATE request prevents this from
6791 6785 * happening because the retransmitted CREATE would fail
6792 6786 * with EEXIST and would not truncate the file.
6793 6787 */
6794 6788 if (error == EEXIST && exclusive == NONEXCL) {
6795 6789 #ifdef DEBUG
6796 6790 nfs4_create_misses++;
6797 6791 #endif
6798 6792 goto top;
6799 6793 }
6800 6794 nfs_rw_exit(&drp->r_rwlock);
6801 6795 if (truncating && !error && *vpp) {
6802 6796 vnode_t *tvp;
6803 6797 rnode4_t *trp;
6804 6798 /*
6805 6799 * existing file got truncated, notify.
6806 6800 */
6807 6801 tvp = *vpp;
6808 6802 trp = VTOR4(tvp);
6809 6803 if (IS_SHADOW(tvp, trp))
6810 6804 tvp = RTOV4(trp);
6811 6805 vnevent_create(tvp, ct);
6812 6806 }
6813 6807 return (error);
6814 6808 }
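
nfs4_create() above picks the NFSv4 create mode from three inputs: the caller's exclusivity flag, whether the lookup found the file, and whether an existing regular file is being reopened for truncation. A condensed sketch of that decision (the enum mirrors the RFC 7530 names; the helper itself is hypothetical):

	/* Mirrors the createmode choice made in nfs4_create() above. */
	enum createmode { UNCHECKED4, GUARDED4, EXCLUSIVE4 };

	static enum createmode
	pick_createmode(int exclusive, int file_exists, int truncating)
	{
		if (exclusive)
			return (EXCLUSIVE4);	/* verifier-based create */
		if (file_exists && truncating)
			return (UNCHECKED4);	/* reopen + truncate */
		return (GUARDED4);	/* retransmits fail with EEXIST */
	}

GUARDED4 is what makes the dangling-CREATE scenario in the comment above harmless: the retransmitted request hits EEXIST instead of truncating live data.
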
6815 6809
6816 6810 /*
6817 6811 * Create compound (for mkdir, mknod, symlink):
6818 6812 * { Putfh <dfh>; Create; Getfh; Getattr }
6819 6813 * It's okay if setattr failed to set gid - this is not considered
6820 6814 * an error, but purge attrs in that case.
6821 6815 */
6822 6816 static int
6823 6817 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6824 6818 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6825 6819 {
6826 6820 int need_end_op = FALSE;
6827 6821 COMPOUND4args_clnt args;
6828 6822 COMPOUND4res_clnt res, *resp = NULL;
6829 6823 nfs_argop4 *argop;
6830 6824 nfs_resop4 *resop;
6831 6825 int doqueue;
6832 6826 mntinfo4_t *mi;
6833 6827 rnode4_t *drp = VTOR4(dvp);
6834 6828 change_info4 *cinfo;
6835 6829 GETFH4res *gf_res;
6836 6830 struct vattr vattr;
6837 6831 vnode_t *vp;
6838 6832 fattr4 *crattr;
6839 6833 bool_t needrecov = FALSE;
6840 6834 nfs4_recov_state_t recov_state;
6841 6835 nfs4_sharedfh_t *sfhp = NULL;
6842 6836 hrtime_t t;
6843 6837 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6844 6838 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6845 6839 dirattr_info_t dinfo, *dinfop;
6846 6840 servinfo4_t *svp;
6847 6841 bitmap4 supp_attrs;
6848 6842
6849 6843 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6850 6844 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6851 6845
6852 6846 mi = VTOMI4(dvp);
6853 6847
6854 6848 /*
6855 6849 * Make sure we properly deal with setting the right gid
6856 6850 * on a new directory to reflect the parent's setgid bit
6857 6851 */
6858 6852 setgid_flag = 0;
6859 6853 if (type == NF4DIR) {
6860 6854 struct vattr dva;
6861 6855
6862 6856 va->va_mode &= ~VSGID;
6863 6857 dva.va_mask = AT_MODE | AT_GID;
6864 6858 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6865 6859
6866 6860 /*
6867 6861			 * If the parent directory has the setgid bit set
6868 6862			 * _and_ the client was able to get a valid mapping
6869 6863			 * for the parent dir's owner_group, we want to
6870 6864			 * append NVERIFY(owner_group == dva.va_gid) and
6871 6865			 * SETATTR to the CREATE compound.
6872 6866 */
6873 6867 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6874 6868 setgid_flag = 1;
6875 6869 va->va_mode |= VSGID;
6876 6870 if (dva.va_gid != GID_NOBODY) {
6877 6871 va->va_mask |= AT_GID;
6878 6872 va->va_gid = dva.va_gid;
6879 6873 }
6880 6874 }
6881 6875 }
6882 6876 }
6883 6877
6884 6878 /*
6885 6879 * Create ops:
6886 6880 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6887 6881 * 5:restorefh(dir) 6:getattr(dir)
6888 6882 *
6889 6883 * if (setgid)
6890 6884 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6891 6885 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6892 6886 * 8:nverify 9:setattr
6893 6887 */
6894 6888 if (setgid_flag) {
6895 6889 numops = 10;
6896 6890 idx_create = 1;
6897 6891 idx_fattr = 3;
6898 6892 } else {
6899 6893 numops = 7;
6900 6894 idx_create = 2;
6901 6895 idx_fattr = 4;
6902 6896 }
6903 6897
6904 6898 ASSERT(nfs_zone() == mi->mi_zone);
6905 6899 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6906 6900 return (EINTR);
6907 6901 }
6908 6902 recov_state.rs_flags = 0;
6909 6903 recov_state.rs_num_retry_despite_err = 0;
6910 6904
6911 6905 argoplist_size = numops * sizeof (nfs_argop4);
6912 6906 argop = kmem_alloc(argoplist_size, KM_SLEEP);
6913 6907
6914 6908 recov_retry:
6915 6909 if (type == NF4LNK)
6916 6910 args.ctag = TAG_SYMLINK;
6917 6911 else if (type == NF4DIR)
6918 6912 args.ctag = TAG_MKDIR;
6919 6913 else
6920 6914 args.ctag = TAG_MKNOD;
6921 6915
6922 6916 args.array_len = numops;
6923 6917 args.array = argop;
6924 6918
6925 6919 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6926 6920 nfs_rw_exit(&drp->r_rwlock);
6927 6921 kmem_free(argop, argoplist_size);
6928 6922 return (e.error);
6929 6923 }
6930 6924 need_end_op = TRUE;
6931 6925
6932 6926
6933 6927 /* 0: putfh directory */
6934 6928 argop[0].argop = OP_CPUTFH;
6935 6929 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6936 6930
6937 6931 /* 1/2: Create object */
6938 6932 argop[idx_create].argop = OP_CCREATE;
6939 6933 argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6940 6934 argop[idx_create].nfs_argop4_u.opccreate.type = type;
6941 6935 if (type == NF4LNK) {
6942 6936 /*
6943 6937 * symlink, treat name as data
6944 6938 */
6945 6939 ASSERT(data != NULL);
6946 6940 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6947 6941 (char *)data;
6948 6942 }
6949 6943 if (type == NF4BLK || type == NF4CHR) {
6950 6944 ASSERT(data != NULL);
6951 6945 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6952 6946 *((specdata4 *)data);
6953 6947 }
6954 6948
6955 6949 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6956 6950
6957 6951 svp = drp->r_server;
6958 6952 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6959 6953 supp_attrs = svp->sv_supp_attrs;
6960 6954 nfs_rw_exit(&svp->sv_lock);
6961 6955
6962 6956 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6963 6957 nfs_rw_exit(&drp->r_rwlock);
6964 6958 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6965 6959 e.error = EINVAL;
6966 6960 kmem_free(argop, argoplist_size);
6967 6961 return (e.error);
6968 6962 }
6969 6963
6970 6964 /* 2/3: getfh fh of created object */
6971 6965 ASSERT(idx_create + 1 == idx_fattr - 1);
6972 6966 argop[idx_create + 1].argop = OP_GETFH;
6973 6967
6974 6968 /* 3/4: getattr of new object */
6975 6969 argop[idx_fattr].argop = OP_GETATTR;
6976 6970 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6977 6971 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6978 6972
6979 6973 if (setgid_flag) {
6980 6974 vattr_t _v;
6981 6975
6982 6976 argop[4].argop = OP_SAVEFH;
6983 6977
6984 6978 argop[5].argop = OP_CPUTFH;
6985 6979 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6986 6980
6987 6981 argop[6].argop = OP_GETATTR;
6988 6982 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6989 6983 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6990 6984
6991 6985 argop[7].argop = OP_RESTOREFH;
6992 6986
6993 6987 /*
6994 6988 * nverify
6995 6989 *
6996 6990 * XXX - Revisit the last argument to nfs4_end_op()
6997 6991 * once 5020486 is fixed.
6998 6992 */
6999 6993 _v.va_mask = AT_GID;
7000 6994 _v.va_gid = va->va_gid;
7001 6995 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
7002 6996 supp_attrs)) {
7003 6997 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7004 6998 nfs_rw_exit(&drp->r_rwlock);
7005 6999 nfs4_fattr4_free(crattr);
7006 7000 kmem_free(argop, argoplist_size);
7007 7001 return (e.error);
7008 7002 }
7009 7003
7010 7004 /*
7011 7005 * setattr
7012 7006 *
7013 7007 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
7014 7008 * so no need for stateid or flags. Also we specify NULL
7015 7009 * rp since we're only interested in setting owner_group
7016 7010 * attributes.
7017 7011 */
7018 7012 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
7019 7013 &e.error, 0);
7020 7014
7021 7015 if (e.error) {
7022 7016 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7023 7017 nfs_rw_exit(&drp->r_rwlock);
7024 7018 nfs4_fattr4_free(crattr);
7025 7019 nfs4args_verify_free(&argop[8]);
7026 7020 kmem_free(argop, argoplist_size);
7027 7021 return (e.error);
7028 7022 }
7029 7023 } else {
7030 7024 argop[1].argop = OP_SAVEFH;
7031 7025
7032 7026 argop[5].argop = OP_RESTOREFH;
7033 7027
7034 7028 argop[6].argop = OP_GETATTR;
7035 7029 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7036 7030 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7037 7031 }
7038 7032
7039 7033 dnlc_remove(dvp, nm);
7040 7034
7041 7035 doqueue = 1;
7042 7036 t = gethrtime();
7043 7037 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7044 7038
7045 7039 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7046 7040 if (e.error) {
7047 7041 PURGE_ATTRCACHE4(dvp);
7048 7042 if (!needrecov)
7049 7043 goto out;
7050 7044 }
7051 7045
7052 7046 if (needrecov) {
7053 7047 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7054 7048 OP_CREATE, NULL, NULL, NULL) == FALSE) {
7055 7049 nfs4_end_op(mi, dvp, NULL, &recov_state,
7056 7050 needrecov);
7057 7051 need_end_op = FALSE;
7058 7052 nfs4_fattr4_free(crattr);
7059 7053 if (setgid_flag) {
7060 7054 nfs4args_verify_free(&argop[8]);
7061 7055 nfs4args_setattr_free(&argop[9]);
7062 7056 }
7063 7057 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7064 7058 goto recov_retry;
7065 7059 }
7066 7060 }
7067 7061
7068 7062 resp = &res;
7069 7063
7070 7064 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7071 7065
7072 7066 if (res.status == NFS4ERR_BADOWNER)
7073 7067 nfs4_log_badowner(mi, OP_CREATE);
7074 7068
7075 7069 e.error = geterrno4(res.status);
7076 7070
7077 7071 /*
7078 7072 * This check is left over from when create was implemented
7079 7073 * using a setattr op (instead of createattrs). If the
7080 7074 * putfh/create/getfh failed, the error was returned. If
7081 7075 * setattr/getattr failed, we keep going.
7082 7076 *
7083 7077 * It might be better to get rid of the GETFH also, and just
7084 7078 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7085 7079 * Then if any of the operations failed, we could return the
7086 7080 * error now, and remove much of the error code below.
7087 7081 */
7088 7082 if (res.array_len <= idx_fattr) {
7089 7083 /*
7090 7084 * Either Putfh, Create or Getfh failed.
7091 7085 */
7092 7086 PURGE_ATTRCACHE4(dvp);
7093 7087 /*
7094 7088 * nfs4_purge_stale_fh() may generate otw calls through
7095 7089 * nfs4_invalidate_pages. Hence the need to call
7096 7090 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7097 7091 */
7098 7092 nfs4_end_op(mi, dvp, NULL, &recov_state,
7099 7093 needrecov);
7100 7094 need_end_op = FALSE;
7101 7095 nfs4_purge_stale_fh(e.error, dvp, cr);
7102 7096 goto out;
7103 7097 }
7104 7098 }
7105 7099
7106 7100 resop = &res.array[idx_create]; /* create res */
7107 7101 cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7108 7102
7109 7103 resop = &res.array[idx_create + 1]; /* getfh res */
7110 7104 gf_res = &resop->nfs_resop4_u.opgetfh;
7111 7105
7112 7106 sfhp = sfh4_get(&gf_res->object, mi);
7113 7107 if (e.error) {
7114 7108 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7115 7109 fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7116 7110 if (vp->v_type == VNON) {
7117 7111 vattr.va_mask = AT_TYPE;
7118 7112 /*
7119 7113 * Need to call nfs4_end_op before nfs4getattr to avoid
7120 7114 * potential nfs4_start_op deadlock. See RFE 4777612.
7121 7115 */
7122 7116 nfs4_end_op(mi, dvp, NULL, &recov_state,
7123 7117 needrecov);
7124 7118 need_end_op = FALSE;
7125 7119 e.error = nfs4getattr(vp, &vattr, cr);
7126 7120 if (e.error) {
7127 7121 VN_RELE(vp);
7128 7122 *vpp = NULL;
7129 7123 goto out;
7130 7124 }
7131 7125 vp->v_type = vattr.va_type;
7132 7126 }
7133 7127 e.error = 0;
7134 7128 } else {
7135 7129 *vpp = vp = makenfs4node(sfhp,
7136 7130 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7137 7131 dvp->v_vfsp, t, cr,
7138 7132 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7139 7133 }
7140 7134
7141 7135 /*
7142 7136 * If compound succeeded, then update dir attrs
7143 7137 */
7144 7138 if (res.status == NFS4_OK) {
7145 7139 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7146 7140 dinfo.di_cred = cr;
7147 7141 dinfo.di_time_call = t;
7148 7142 dinfop = &dinfo;
7149 7143 } else
7150 7144 dinfop = NULL;
7151 7145
7152 7146 /* Update directory cache attribute, readdir and dnlc caches */
7153 7147 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7154 7148
7155 7149 out:
7156 7150 if (sfhp != NULL)
7157 7151 sfh4_rele(&sfhp);
7158 7152 nfs_rw_exit(&drp->r_rwlock);
7159 7153 nfs4_fattr4_free(crattr);
7160 7154 if (setgid_flag) {
7161 7155 nfs4args_verify_free(&argop[8]);
7162 7156 nfs4args_setattr_free(&argop[9]);
7163 7157 }
7164 7158 if (resp)
7165 7159 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7166 7160 if (need_end_op)
7167 7161 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7168 7162
7169 7163 kmem_free(argop, argoplist_size);
7170 7164 return (e.error);
7171 7165 }
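
As a quick reference, the non-setgid op layout that call_nfs4_create_req() builds above (descriptive strings only, not protocol data; the setgid variant grows to ten ops with NVERIFY and SETATTR appended):

	static const char *create_compound_ops[] = {
		"PUTFH(dir)",		/* 0 */
		"SAVEFH",		/* 1 */
		"CREATE",		/* 2: idx_create */
		"GETFH(new)",		/* 3 */
		"GETATTR(new)",		/* 4: idx_fattr */
		"RESTOREFH",		/* 5 */
		"GETATTR(dir)",		/* 6 */
	};
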
7172 7166
7173 7167 /* ARGSUSED */
7174 7168 static int
7175 7169 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7176 7170 int mode, vnode_t **vpp, cred_t *cr)
7177 7171 {
7178 7172 int error;
7179 7173 vnode_t *vp;
7180 7174 nfs_ftype4 type;
7181 7175 specdata4 spec, *specp = NULL;
7182 7176
7183 7177 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7184 7178
7185 7179 switch (va->va_type) {
7186 7180 case VCHR:
7187 7181 case VBLK:
7188 7182 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7189 7183 spec.specdata1 = getmajor(va->va_rdev);
7190 7184 spec.specdata2 = getminor(va->va_rdev);
7191 7185 specp = &spec;
7192 7186 break;
7193 7187
7194 7188 case VFIFO:
7195 7189 type = NF4FIFO;
7196 7190 break;
7197 7191 case VSOCK:
7198 7192 type = NF4SOCK;
7199 7193 break;
7200 7194
7201 7195 default:
7202 7196 return (EINVAL);
7203 7197 }
7204 7198
7205 7199 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7206 7200 if (error) {
7207 7201 return (error);
7208 7202 }
7209 7203
7210 7204 /*
7211 7205 * This might not be needed any more; special case to deal
7212 7206 * with problematic v2/v3 servers. Since create was unable
7213 7207 * to set group correctly, not sure what hope setattr has.
7214 7208 */
7215 7209 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7216 7210 va->va_mask = AT_GID;
7217 7211 (void) nfs4setattr(vp, va, 0, cr, NULL);
7218 7212 }
7219 7213
7220 7214 /*
7221 7215 * If vnode is a device create special vnode
7222 7216 */
7223 7217 if (ISVDEV(vp->v_type)) {
7224 7218 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7225 7219 VN_RELE(vp);
7226 7220 } else {
7227 7221 *vpp = vp;
7228 7222 }
7229 7223 return (error);
7230 7224 }
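
For VCHR/VBLK nodes, nfs4mknod() above splits va_rdev into the two 32-bit words of a specdata4 using getmajor()/getminor(). A user-space equivalent, assuming an illumos-style <sys/mkdev.h> (Linux keeps major()/minor() in <sys/sysmacros.h>):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/mkdev.h>	/* illumos; Linux: <sys/sysmacros.h> */

	int
	main(void)
	{
		dev_t rdev = makedev(27, 5);	/* arbitrary example device */

		/* NFSv4 CREATE carries the device as two 32-bit words. */
		printf("specdata1=%u specdata2=%u\n",
		    (unsigned int)major(rdev), (unsigned int)minor(rdev));
		return (0);
	}
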
7231 7225
7232 7226 /*
7233 7227 * Remove requires that the current fh be the target directory.
7234 7228 * After the operation, the current fh is unchanged.
7235 7229 * The compound op structure is:
7236 7230 * PUTFH(targetdir), REMOVE
7237 7231 *
7238 7232 * Weirdness: if the vnode to be removed is open
7239 7233 * we rename it instead of removing it and nfs_inactive
7240 7234 * will remove the new name.
7241 7235 */
7242 7236 /* ARGSUSED */
7243 7237 static int
7244 7238 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7245 7239 {
7246 7240 COMPOUND4args_clnt args;
7247 7241 COMPOUND4res_clnt res, *resp = NULL;
7248 7242 REMOVE4res *rm_res;
7249 7243 nfs_argop4 argop[3];
7250 7244 nfs_resop4 *resop;
7251 7245 vnode_t *vp;
7252 7246 char *tmpname;
7253 7247 int doqueue;
7254 7248 mntinfo4_t *mi;
7255 7249 rnode4_t *rp;
7256 7250 rnode4_t *drp;
7257 7251 int needrecov = 0;
7258 7252 nfs4_recov_state_t recov_state;
7259 7253 int isopen;
7260 7254 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7261 7255 dirattr_info_t dinfo;
7262 7256
7263 7257 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7264 7258 return (EPERM);
7265 7259 drp = VTOR4(dvp);
7266 7260 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7267 7261 return (EINTR);
7268 7262
7269 7263 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7270 7264 if (e.error) {
7271 7265 nfs_rw_exit(&drp->r_rwlock);
7272 7266 return (e.error);
7273 7267 }
7274 7268
7275 7269 if (vp->v_type == VDIR) {
7276 7270 VN_RELE(vp);
7277 7271 nfs_rw_exit(&drp->r_rwlock);
7278 7272 return (EISDIR);
7279 7273 }
7280 7274
7281 7275 /*
7282 7276 * First just remove the entry from the name cache, as it
7283 7277 * is most likely the only entry for this vp.
7284 7278 */
7285 7279 dnlc_remove(dvp, nm);
7286 7280
7287 7281 rp = VTOR4(vp);
7288 7282
7289 7283 /*
7290 7284 * For regular file types, check to see if the file is open by looking
7291 7285 * at the open streams.
7292 7286 * For all other types, check the reference count on the vnode. Since
7293 7287 * they are not opened OTW they never have an open stream.
7294 7288 *
7295 7289 * If the file is open, rename it to .nfsXXXX.
7296 7290 */
7297 7291 if (vp->v_type != VREG) {
7298 7292 /*
7299 7293 * If the file has a v_count > 1 then there may be more than one
7300 7294		 * entry in the name cache due to multiple links or an open file,
7301 7295 * but we don't have the real reference count so flush all
7302 7296 * possible entries.
7303 7297 */
7304 7298 if (vp->v_count > 1)
7305 7299 dnlc_purge_vp(vp);
7306 7300
7307 7301 /*
7308 7302 * Now we have the real reference count.
7309 7303 */
7310 7304 isopen = vp->v_count > 1;
7311 7305 } else {
7312 7306 mutex_enter(&rp->r_os_lock);
7313 7307 isopen = list_head(&rp->r_open_streams) != NULL;
7314 7308 mutex_exit(&rp->r_os_lock);
7315 7309 }
7316 7310
7317 7311 mutex_enter(&rp->r_statelock);
7318 7312 if (isopen &&
7319 7313 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7320 7314 mutex_exit(&rp->r_statelock);
7321 7315 tmpname = newname();
7322 7316 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7323 7317 if (e.error)
7324 7318 kmem_free(tmpname, MAXNAMELEN);
7325 7319 else {
7326 7320 mutex_enter(&rp->r_statelock);
7327 7321 if (rp->r_unldvp == NULL) {
7328 7322 VN_HOLD(dvp);
7329 7323 rp->r_unldvp = dvp;
7330 7324 if (rp->r_unlcred != NULL)
7331 7325 crfree(rp->r_unlcred);
7332 7326 crhold(cr);
7333 7327 rp->r_unlcred = cr;
7334 7328 rp->r_unlname = tmpname;
7335 7329 } else {
7336 7330 kmem_free(rp->r_unlname, MAXNAMELEN);
7337 7331 rp->r_unlname = tmpname;
7338 7332 }
7339 7333 mutex_exit(&rp->r_statelock);
7340 7334 }
7341 7335 VN_RELE(vp);
7342 7336 nfs_rw_exit(&drp->r_rwlock);
7343 7337 return (e.error);
7344 7338 }
7345 7339 /*
7346 7340 * Actually remove the file/dir
7347 7341 */
7348 7342 mutex_exit(&rp->r_statelock);
7349 7343
7350 7344 /*
7351 7345 * We need to flush any dirty pages which happen to
7352 7346 * be hanging around before removing the file.
7353 7347 * This shouldn't happen very often since in NFSv4
7354 7348	 * we should be close-to-open consistent.
7355 7349 */
7356 7350 if (nfs4_has_pages(vp) &&
7357 7351 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7358 7352 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7359 7353 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7360 7354 mutex_enter(&rp->r_statelock);
7361 7355 if (!rp->r_error)
7362 7356 rp->r_error = e.error;
7363 7357 mutex_exit(&rp->r_statelock);
7364 7358 }
7365 7359 }
7366 7360
7367 7361 mi = VTOMI4(dvp);
7368 7362
7369 7363 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7370 7364 recov_state.rs_flags = 0;
7371 7365 recov_state.rs_num_retry_despite_err = 0;
7372 7366
7373 7367 recov_retry:
7374 7368 /*
7375 7369	 * Remove ops: putfh dir; remove; getattr dir
7376 7370 */
7377 7371 args.ctag = TAG_REMOVE;
7378 7372 args.array_len = 3;
7379 7373 args.array = argop;
7380 7374
7381 7375 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7382 7376 if (e.error) {
7383 7377 nfs_rw_exit(&drp->r_rwlock);
7384 7378 VN_RELE(vp);
7385 7379 return (e.error);
7386 7380 }
7387 7381
7388 7382 /* putfh directory */
7389 7383 argop[0].argop = OP_CPUTFH;
7390 7384 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7391 7385
7392 7386 /* remove */
7393 7387 argop[1].argop = OP_CREMOVE;
7394 7388 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7395 7389
7396 7390 /* getattr dir */
7397 7391 argop[2].argop = OP_GETATTR;
7398 7392 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7399 7393 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7400 7394
7401 7395 doqueue = 1;
7402 7396 dinfo.di_time_call = gethrtime();
7403 7397 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7404 7398
7405 7399 PURGE_ATTRCACHE4(vp);
7406 7400
7407 7401 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7408 7402 if (e.error)
7409 7403 PURGE_ATTRCACHE4(dvp);
7410 7404
7411 7405 if (needrecov) {
7412 7406 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7413 7407 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7414 7408 if (!e.error)
7415 7409 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7416 7410 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7417 7411 needrecov);
7418 7412 goto recov_retry;
7419 7413 }
7420 7414 }
7421 7415
7422 7416 /*
7423 7417 * Matching nfs4_end_op() for start_op() above.
7424 7418 * There is a path in the code below which calls
7425 7419 * nfs4_purge_stale_fh(), which may generate otw calls through
7426 7420 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7427 7421 * here to avoid nfs4_start_op() deadlock.
7428 7422 */
7429 7423 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7430 7424
7431 7425 if (!e.error) {
7432 7426 resp = &res;
7433 7427
7434 7428 if (res.status) {
7435 7429 e.error = geterrno4(res.status);
7436 7430 PURGE_ATTRCACHE4(dvp);
7437 7431 nfs4_purge_stale_fh(e.error, dvp, cr);
7438 7432 } else {
7439 7433 resop = &res.array[1]; /* remove res */
7440 7434 rm_res = &resop->nfs_resop4_u.opremove;
7441 7435
7442 7436 dinfo.di_garp =
7443 7437 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7444 7438 dinfo.di_cred = cr;
7445 7439
7446 7440 /* Update directory attr, readdir and dnlc caches */
7447 7441 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7448 7442 &dinfo);
7449 7443 }
7450 7444 }
7451 7445 nfs_rw_exit(&drp->r_rwlock);
7452 7446 if (resp)
7453 7447 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7454 7448
7455 7449 if (e.error == 0) {
7456 7450 vnode_t *tvp;
7457 7451 rnode4_t *trp;
7458 7452 trp = VTOR4(vp);
7459 7453 tvp = vp;
7460 7454 if (IS_SHADOW(vp, trp))
7461 7455 tvp = RTOV4(trp);
7462 7456 vnevent_remove(tvp, dvp, nm, ct);
7463 7457 }
7464 7458 VN_RELE(vp);
7465 7459 return (e.error);
7466 7460 }
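
The open-file test in nfs4_remove() above differs by vnode type: regular files consult the NFSv4 open-stream list, everything else falls back to the vnode hold count, and an open file is silly-renamed to .nfsXXXX rather than removed. A hedged sketch of just that predicate (struct and field names are invented for illustration):

	#include <stdbool.h>

	/* Invented stand-in for the few rnode/vnode fields tested. */
	struct open_check {
		bool	is_reg;		/* VREG? */
		int	open_streams;	/* open-stream count (VREG only) */
		int	v_count;	/* vnode hold count (other types) */
	};

	static bool
	must_silly_rename(const struct open_check *oc)
	{
		if (oc->is_reg)
			return (oc->open_streams > 0);
		return (oc->v_count > 1);	/* extra holds: in use */
	}
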
7467 7461
7468 7462 /*
7469 7463 * Link requires that the current fh be the target directory and the
7470 7464 * saved fh be the source fh. After the operation, the current fh is unchanged.
7471 7465 * Thus the compound op structure is:
7472 7466	 *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(dir),
7473 7467	 *	RESTOREFH, GETATTR(file)
7474 7468 */
7475 7469 /* ARGSUSED */
7476 7470 static int
7477 7471 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7478 7472 caller_context_t *ct, int flags)
7479 7473 {
7480 7474 COMPOUND4args_clnt args;
7481 7475 COMPOUND4res_clnt res, *resp = NULL;
7482 7476 LINK4res *ln_res;
7483 7477 int argoplist_size = 7 * sizeof (nfs_argop4);
7484 7478 nfs_argop4 *argop;
7485 7479 nfs_resop4 *resop;
7486 7480 vnode_t *realvp, *nvp;
7487 7481 int doqueue;
7488 7482 mntinfo4_t *mi;
7489 7483 rnode4_t *tdrp;
7490 7484 bool_t needrecov = FALSE;
7491 7485 nfs4_recov_state_t recov_state;
7492 7486 hrtime_t t;
7493 7487 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7494 7488 dirattr_info_t dinfo;
7495 7489
7496 7490 ASSERT(*tnm != '\0');
7497 7491 ASSERT(tdvp->v_type == VDIR);
7498 7492 ASSERT(nfs4_consistent_type(tdvp));
7499 7493 ASSERT(nfs4_consistent_type(svp));
7500 7494
7501 7495 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7502 7496 return (EPERM);
7503 7497 if (VOP_REALVP(svp, &realvp, ct) == 0) {
7504 7498 svp = realvp;
7505 7499 ASSERT(nfs4_consistent_type(svp));
7506 7500 }
7507 7501
7508 7502 tdrp = VTOR4(tdvp);
7509 7503 mi = VTOMI4(svp);
7510 7504
7511 7505 if (!(mi->mi_flags & MI4_LINK)) {
7512 7506 return (EOPNOTSUPP);
7513 7507 }
7514 7508 recov_state.rs_flags = 0;
7515 7509 recov_state.rs_num_retry_despite_err = 0;
7516 7510
7517 7511 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7518 7512 return (EINTR);
7519 7513
7520 7514 recov_retry:
7521 7515 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7522 7516
7523 7517 args.ctag = TAG_LINK;
7524 7518
7525 7519 /*
7526 7520 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7527 7521 * restorefh; getattr(fl)
7528 7522 */
7529 7523 args.array_len = 7;
7530 7524 args.array = argop;
7531 7525
7532 7526 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7533 7527 if (e.error) {
7534 7528 kmem_free(argop, argoplist_size);
7535 7529 nfs_rw_exit(&tdrp->r_rwlock);
7536 7530 return (e.error);
7537 7531 }
7538 7532
7539 7533 /* 0. putfh file */
7540 7534 argop[0].argop = OP_CPUTFH;
7541 7535 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7542 7536
7543 7537 /* 1. save current fh to free up the space for the dir */
7544 7538 argop[1].argop = OP_SAVEFH;
7545 7539
7546 7540 /* 2. putfh targetdir */
7547 7541 argop[2].argop = OP_CPUTFH;
7548 7542 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7549 7543
7550 7544 /* 3. link: current_fh is targetdir, saved_fh is source */
7551 7545 argop[3].argop = OP_CLINK;
7552 7546 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7553 7547
7554 7548 /* 4. Get attributes of dir */
7555 7549 argop[4].argop = OP_GETATTR;
7556 7550 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7557 7551 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7558 7552
7559 7553 /* 5. If link was successful, restore current vp to file */
7560 7554 argop[5].argop = OP_RESTOREFH;
7561 7555
7562 7556 /* 6. Get attributes of linked object */
7563 7557 argop[6].argop = OP_GETATTR;
7564 7558 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7565 7559 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7566 7560
7567 7561 dnlc_remove(tdvp, tnm);
7568 7562
7569 7563 doqueue = 1;
7570 7564 t = gethrtime();
7571 7565
7572 7566 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7573 7567
7574 7568 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7575 7569 if (e.error != 0 && !needrecov) {
7576 7570 PURGE_ATTRCACHE4(tdvp);
7577 7571 PURGE_ATTRCACHE4(svp);
7578 7572 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7579 7573 goto out;
7580 7574 }
7581 7575
7582 7576 if (needrecov) {
7583 7577 bool_t abort;
7584 7578
7585 7579 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7586 7580 NULL, NULL, OP_LINK, NULL, NULL, NULL);
7587 7581 if (abort == FALSE) {
7588 7582 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7589 7583 needrecov);
7590 7584 kmem_free(argop, argoplist_size);
7591 7585 if (!e.error)
7592 7586 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7593 7587 goto recov_retry;
7594 7588 } else {
7595 7589 if (e.error != 0) {
7596 7590 PURGE_ATTRCACHE4(tdvp);
7597 7591 PURGE_ATTRCACHE4(svp);
7598 7592 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7599 7593 &recov_state, needrecov);
7600 7594 goto out;
7601 7595 }
7602 7596 /* fall through for res.status case */
7603 7597 }
7604 7598 }
7605 7599
7606 7600 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7607 7601
7608 7602 resp = &res;
7609 7603 if (res.status) {
7610 7604 /* If link succeeded, then don't return error */
7611 7605 e.error = geterrno4(res.status);
7612 7606 if (res.array_len <= 4) {
7613 7607 /*
7614 7608 * Either Putfh, Savefh, Putfh dir, or Link failed
7615 7609 */
7616 7610 PURGE_ATTRCACHE4(svp);
7617 7611 PURGE_ATTRCACHE4(tdvp);
7618 7612 if (e.error == EOPNOTSUPP) {
7619 7613 mutex_enter(&mi->mi_lock);
7620 7614 mi->mi_flags &= ~MI4_LINK;
7621 7615 mutex_exit(&mi->mi_lock);
7622 7616 }
7623 7617 /* Remap EISDIR to EPERM for non-root user for SVVS */
7624 7618 /* XXX-LP */
7625 7619 if (e.error == EISDIR && crgetuid(cr) != 0)
7626 7620 e.error = EPERM;
7627 7621 goto out;
7628 7622 }
7629 7623 }
7630 7624
7631 7625 /* either no error or one of the postop getattr failed */
7632 7626
7633 7627 /*
7634 7628 * XXX - if LINK succeeded, but no attrs were returned for link
7635 7629 * file, purge its cache.
7636 7630 *
7637 7631 * XXX Perform a simplified version of wcc checking. Instead of
7638 7632	 * having another getattr to get pre-op, just purge the cache if
7639 7633 * any of the ops prior to and including the getattr failed.
7640 7634 * If the getattr succeeded then update the attrcache accordingly.
7641 7635 */
7642 7636
7643 7637 /*
7644 7638 * update cache with link file postattrs.
7645 7639 * Note: at this point resop points to link res.
7646 7640 */
7647 7641 resop = &res.array[3]; /* link res */
7648 7642 ln_res = &resop->nfs_resop4_u.oplink;
7649 7643 if (res.status == NFS4_OK)
7650 7644 e.error = nfs4_update_attrcache(res.status,
7651 7645 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7652 7646 t, svp, cr);
7653 7647
7654 7648 /*
7655 7649 * Call makenfs4node to create the new shadow vp for tnm.
7656 7650 * We pass NULL attrs because we just cached attrs for
7657 7651 * the src object. All we're trying to accomplish is to
7658 7652	 * create the new shadow vnode.
7659 7653 */
7660 7654 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7661 7655 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7662 7656
7663 7657 /* Update target cache attribute, readdir and dnlc caches */
7664 7658 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7665 7659 dinfo.di_time_call = t;
7666 7660 dinfo.di_cred = cr;
7667 7661
7668 7662 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7669 7663 ASSERT(nfs4_consistent_type(tdvp));
7670 7664 ASSERT(nfs4_consistent_type(svp));
7671 7665 ASSERT(nfs4_consistent_type(nvp));
7672 7666 VN_RELE(nvp);
7673 7667
7674 7668 if (!e.error) {
7675 7669 vnode_t *tvp;
7676 7670 rnode4_t *trp;
7677 7671 /*
7678 7672 * Notify the source file of this link operation.
7679 7673 */
7680 7674 trp = VTOR4(svp);
7681 7675 tvp = svp;
7682 7676 if (IS_SHADOW(svp, trp))
7683 7677 tvp = RTOV4(trp);
7684 7678 vnevent_link(tvp, ct);
7685 7679 }
7686 7680 out:
7687 7681 kmem_free(argop, argoplist_size);
7688 7682 if (resp)
7689 7683 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7690 7684
7691 7685 nfs_rw_exit(&tdrp->r_rwlock);
7692 7686
7693 7687 return (e.error);
7694 7688 }
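
For reference, the seven-operation LINK compound assembled above, in order (a descriptive table only; the strings are not protocol data):

	static const char *link_compound_ops[] = {
		"PUTFH(file)",		/* 0: current fh = source file */
		"SAVEFH",		/* 1: stash the file fh */
		"PUTFH(targetdir)",	/* 2: current fh = target dir */
		"LINK(tnm)",		/* 3: current = dir, saved = file */
		"GETATTR(dir)",		/* 4: post-op dir attrs */
		"RESTOREFH",		/* 5: current fh back to the file */
		"GETATTR(file)",	/* 6: post-op file attrs */
	};
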
7695 7689
7696 7690 /* ARGSUSED */
7697 7691 static int
7698 7692 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7699 7693 caller_context_t *ct, int flags)
7700 7694 {
7701 7695 vnode_t *realvp;
7702 7696
7703 7697 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7704 7698 return (EPERM);
7705 7699 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7706 7700 ndvp = realvp;
7707 7701
7708 7702 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7709 7703 }
7710 7704
7711 7705 /*
7712 7706 * nfs4rename does the real work of renaming in NFS Version 4.
7713 7707 *
7714 7708 * A file handle is considered volatile for renaming purposes if either
7715 7709	 * of the volatile bits is turned on. However, the compound may differ
7716 7710	 * based on the likelihood of the filehandle changing during rename.
7717 7711 */
7718 7712 static int
7719 7713 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7720 7714 caller_context_t *ct)
7721 7715 {
7722 7716 int error;
7723 7717 mntinfo4_t *mi;
7724 7718 vnode_t *nvp = NULL;
7725 7719 vnode_t *ovp = NULL;
7726 7720 char *tmpname = NULL;
7727 7721 rnode4_t *rp;
7728 7722 rnode4_t *odrp;
7729 7723 rnode4_t *ndrp;
7730 7724 int did_link = 0;
7731 7725 int do_link = 1;
7732 7726 nfsstat4 stat = NFS4_OK;
7733 7727
7734 7728 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7735 7729 ASSERT(nfs4_consistent_type(odvp));
7736 7730 ASSERT(nfs4_consistent_type(ndvp));
7737 7731
7738 7732 if (onm[0] == '.' && (onm[1] == '\0' ||
7739 7733 (onm[1] == '.' && onm[2] == '\0')))
7740 7734 return (EINVAL);
7741 7735
7742 7736 if (nnm[0] == '.' && (nnm[1] == '\0' ||
7743 7737 (nnm[1] == '.' && nnm[2] == '\0')))
7744 7738 return (EINVAL);
7745 7739
7746 7740 odrp = VTOR4(odvp);
7747 7741 ndrp = VTOR4(ndvp);
7748 7742 if ((intptr_t)odrp < (intptr_t)ndrp) {
7749 7743 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7750 7744 return (EINTR);
7751 7745 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7752 7746 nfs_rw_exit(&odrp->r_rwlock);
7753 7747 return (EINTR);
7754 7748 }
7755 7749 } else {
7756 7750 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7757 7751 return (EINTR);
7758 7752 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7759 7753 nfs_rw_exit(&ndrp->r_rwlock);
7760 7754 return (EINTR);
7761 7755 }
7762 7756 }
7763 7757
7764 7758 /*
7765 7759 * Lookup the target file. If it exists, it needs to be
7766 7760 * checked to see whether it is a mount point and whether
7767 7761 * it is active (open).
7768 7762 */
7769 7763 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7770 7764 if (!error) {
7771 7765 int isactive;
7772 7766
7773 7767 ASSERT(nfs4_consistent_type(nvp));
7774 7768 /*
7775 7769 * If this file has been mounted on, then just
7776 7770 * return busy because renaming to it would remove
7777 7771 * the mounted file system from the name space.
7778 7772 */
7779 7773 if (vn_ismntpt(nvp)) {
7780 7774 VN_RELE(nvp);
7781 7775 nfs_rw_exit(&odrp->r_rwlock);
7782 7776 nfs_rw_exit(&ndrp->r_rwlock);
7783 7777 return (EBUSY);
7784 7778 }
7785 7779
7786 7780 /*
7787 7781 * First just remove the entry from the name cache, as it
7788 7782 * is most likely the only entry for this vp.
7789 7783 */
7790 7784 dnlc_remove(ndvp, nnm);
7791 7785
7792 7786 rp = VTOR4(nvp);
7793 7787
7794 7788 if (nvp->v_type != VREG) {
7795 7789 /*
7796 7790 * Purge the name cache of all references to this vnode
7797 7791 * so that we can check the reference count to infer
7798 7792 * whether it is active or not.
7799 7793 */
7800 7794 if (nvp->v_count > 1)
7801 7795 dnlc_purge_vp(nvp);
7802 7796
7803 7797 isactive = nvp->v_count > 1;
7804 7798 } else {
7805 7799 mutex_enter(&rp->r_os_lock);
7806 7800 isactive = list_head(&rp->r_open_streams) != NULL;
7807 7801 mutex_exit(&rp->r_os_lock);
7808 7802 }
7809 7803
7810 7804 /*
7811 7805 * If the vnode is active and is not a directory,
7812 7806 * arrange to rename it to a
7813 7807 * temporary file so that it will continue to be
7814 7808 * accessible. This implements the "unlink-open-file"
7815 7809 * semantics for the target of a rename operation.
7816 7810 * Before doing this though, make sure that the
7817 7811 * source and target files are not already the same.
7818 7812 */
7819 7813 if (isactive && nvp->v_type != VDIR) {
7820 7814 /*
7821 7815 * Lookup the source name.
7822 7816 */
7823 7817 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7824 7818
7825 7819 /*
7826 7820 * The source name *should* already exist.
7827 7821 */
7828 7822 if (error) {
7829 7823 VN_RELE(nvp);
7830 7824 nfs_rw_exit(&odrp->r_rwlock);
7831 7825 nfs_rw_exit(&ndrp->r_rwlock);
7832 7826 return (error);
7833 7827 }
7834 7828
7835 7829 ASSERT(nfs4_consistent_type(ovp));
7836 7830
7837 7831 /*
7838 7832 * Compare the two vnodes. If they are the same,
7839 7833 * just release all held vnodes and return success.
7840 7834 */
7841 7835 if (VN_CMP(ovp, nvp)) {
7842 7836 VN_RELE(ovp);
7843 7837 VN_RELE(nvp);
7844 7838 nfs_rw_exit(&odrp->r_rwlock);
7845 7839 nfs_rw_exit(&ndrp->r_rwlock);
7846 7840 return (0);
7847 7841 }
7848 7842
7849 7843 /*
7850 7844 * Can't mix and match directories and non-
7851 7845 * directories in rename operations. We already
7852 7846 * know that the target is not a directory. If
7853 7847 * the source is a directory, return an error.
7854 7848 */
7855 7849 if (ovp->v_type == VDIR) {
7856 7850 VN_RELE(ovp);
7857 7851 VN_RELE(nvp);
7858 7852 nfs_rw_exit(&odrp->r_rwlock);
7859 7853 nfs_rw_exit(&ndrp->r_rwlock);
7860 7854 return (ENOTDIR);
7861 7855 }
7862 7856 link_call:
7863 7857 /*
7864 7858 * The target file exists, is not the same as
7865 7859 * the source file, and is active. We first
7866 7860 * try to Link it to a temporary filename to
7867 7861 * avoid having the server removing the file
7868 7862			 * completely (which could look like data loss
7869 7863			 * from the user's POV in the event the Rename fails
7870 7864 * -- see bug 1165874).
7871 7865 */
7872 7866 /*
7873 7867 * The do_link and did_link booleans are
7874 7868 * introduced in the event we get NFS4ERR_FILE_OPEN
7875 7869			 * returned for the Rename. Some servers cannot
7876 7870			 * Rename over an Open file, so they return
7877 7871 * this error. The client needs to Remove the
7878 7872 * newly created Link and do two Renames, just
7879 7873 * as if the server didn't support LINK.
7880 7874 */
7881 7875 tmpname = newname();
7882 7876 error = 0;
7883 7877
7884 7878 if (do_link) {
7885 7879 error = nfs4_link(ndvp, nvp, tmpname, cr,
7886 7880 NULL, 0);
7887 7881 }
7888 7882 if (error == EOPNOTSUPP || !do_link) {
7889 7883 error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7890 7884 cr, NULL, 0);
7891 7885 did_link = 0;
7892 7886 } else {
7893 7887 did_link = 1;
7894 7888 }
7895 7889 if (error) {
7896 7890 kmem_free(tmpname, MAXNAMELEN);
7897 7891 VN_RELE(ovp);
7898 7892 VN_RELE(nvp);
7899 7893 nfs_rw_exit(&odrp->r_rwlock);
7900 7894 nfs_rw_exit(&ndrp->r_rwlock);
7901 7895 return (error);
7902 7896 }
7903 7897
7904 7898 mutex_enter(&rp->r_statelock);
7905 7899 if (rp->r_unldvp == NULL) {
7906 7900 VN_HOLD(ndvp);
7907 7901 rp->r_unldvp = ndvp;
7908 7902 if (rp->r_unlcred != NULL)
7909 7903 crfree(rp->r_unlcred);
7910 7904 crhold(cr);
7911 7905 rp->r_unlcred = cr;
7912 7906 rp->r_unlname = tmpname;
7913 7907 } else {
7914 7908 if (rp->r_unlname)
7915 7909 kmem_free(rp->r_unlname, MAXNAMELEN);
7916 7910 rp->r_unlname = tmpname;
7917 7911 }
7918 7912 mutex_exit(&rp->r_statelock);
7919 7913 }
7920 7914
7921 7915 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7922 7916
7923 7917 ASSERT(nfs4_consistent_type(nvp));
7924 7918 }
7925 7919
7926 7920 if (ovp == NULL) {
7927 7921 /*
7928 7922 * When renaming directories to be a subdirectory of a
7929 7923 * different parent, the dnlc entry for ".." will no
7930 7924 * longer be valid, so it must be removed.
7931 7925 *
7932 7926 * We do a lookup here to determine whether we are renaming
7933 7927 * a directory and we need to check if we are renaming
7934 7928 * an unlinked file. This might have already been done
7935 7929 * in previous code, so we check ovp == NULL to avoid
7936 7930 * doing it twice.
7937 7931 */
7938 7932 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7939 7933 /*
7940 7934 * The source name *should* already exist.
7941 7935 */
7942 7936 if (error) {
7943 7937 nfs_rw_exit(&odrp->r_rwlock);
7944 7938 nfs_rw_exit(&ndrp->r_rwlock);
7945 7939 if (nvp) {
7946 7940 VN_RELE(nvp);
7947 7941 }
7948 7942 return (error);
7949 7943 }
7950 7944 ASSERT(ovp != NULL);
7951 7945 ASSERT(nfs4_consistent_type(ovp));
7952 7946 }
7953 7947
7954 7948 /*
7955 7949 * Is the object being renamed a dir, and if so, is
7956 7950 * it being renamed to a child of itself? The underlying
7957 7951 * fs should ultimately return EINVAL for this case;
7958 7952 * however, buggy beta non-Solaris NFSv4 servers at
7959 7953 * interop testing events have allowed this behavior,
7960 7954 * and it caused our client to panic due to a recursive
7961 7955 * mutex_enter in fn_move.
7962 7956 *
7963 7957 * The tedious locking in fn_move could be changed to
7964 7958 * deal with this case, and the client could avoid the
7965 7959 * panic; however, the client would just confuse itself
7966 7960 * later and misbehave. A better way to handle the broken
7967 7961 * server is to detect this condition and return EINVAL
7968 7962	 * without ever sending the bogus rename to the server.
7969 7963 * We know the rename is invalid -- just fail it now.
7970 7964 */
7971 7965 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7972 7966 VN_RELE(ovp);
7973 7967 nfs_rw_exit(&odrp->r_rwlock);
7974 7968 nfs_rw_exit(&ndrp->r_rwlock);
7975 7969 if (nvp) {
7976 7970 VN_RELE(nvp);
7977 7971 }
7978 7972 return (EINVAL);
7979 7973 }
7980 7974
7981 7975 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7982 7976
7983 7977 /*
7984 7978 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7985 7979 * possible for the filehandle to change due to the rename.
7986 7980 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7987 7981 * the fh will not change because of the rename, but we still need
7988 7982 * to update its rnode entry with the new name for
7989 7983 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7990 7984 * has no effect on these for now, but for future improvements,
7991 7985 * we might want to use it too to simplify handling of files
7992 7986 * that are open with that flag on. (XXX)
7993 7987 */
7994 7988 mi = VTOMI4(odvp);
7995 7989 if (NFS4_VOLATILE_FH(mi))
7996 7990 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7997 7991 &stat);
7998 7992 else
7999 7993 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
8000 7994 &stat);
8001 7995
8002 7996 ASSERT(nfs4_consistent_type(odvp));
8003 7997 ASSERT(nfs4_consistent_type(ndvp));
8004 7998 ASSERT(nfs4_consistent_type(ovp));
8005 7999
8006 8000 if (stat == NFS4ERR_FILE_OPEN && did_link) {
8007 8001 do_link = 0;
8008 8002 /*
8009 8003 * Before the 'link_call' code, we did a nfs4_lookup
8010 8004 * that puts a VN_HOLD on nvp. After the nfs4_link
8011 8005 * call we call VN_RELE to match that hold. We need
8012 8006 * to place an additional VN_HOLD here since we will
8013 8007 * be hitting that VN_RELE again.
8014 8008 */
8015 8009 VN_HOLD(nvp);
8016 8010
8017 8011 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);
8018 8012
8019 8013 /* Undo the unlinked file naming stuff we just did */
8020 8014 mutex_enter(&rp->r_statelock);
8021 8015 if (rp->r_unldvp) {
8022 8016 VN_RELE(ndvp);
8023 8017 rp->r_unldvp = NULL;
8024 8018 if (rp->r_unlcred != NULL)
8025 8019 crfree(rp->r_unlcred);
8026 8020 rp->r_unlcred = NULL;
8027 8021			/* rp->r_unlname points to tmpname */
8028 8022 if (rp->r_unlname)
8029 8023 kmem_free(rp->r_unlname, MAXNAMELEN);
8030 8024 rp->r_unlname = NULL;
8031 8025 }
8032 8026 mutex_exit(&rp->r_statelock);
8033 8027
8034 8028 if (nvp) {
8035 8029 VN_RELE(nvp);
8036 8030 }
8037 8031 goto link_call;
8038 8032 }
8039 8033
8040 8034 if (error) {
8041 8035 VN_RELE(ovp);
8042 8036 nfs_rw_exit(&odrp->r_rwlock);
8043 8037 nfs_rw_exit(&ndrp->r_rwlock);
8044 8038 if (nvp) {
8045 8039 VN_RELE(nvp);
8046 8040 }
8047 8041 return (error);
8048 8042 }
8049 8043
8050 8044 /*
8051 8045 * when renaming directories to be a subdirectory of a
8052 8046 * different parent, the dnlc entry for ".." will no
8053 8047 * longer be valid, so it must be removed
8054 8048 */
8055 8049 rp = VTOR4(ovp);
8056 8050 if (ndvp != odvp) {
8057 8051 if (ovp->v_type == VDIR) {
8058 8052 dnlc_remove(ovp, "..");
8059 8053 if (rp->r_dir != NULL)
8060 8054 nfs4_purge_rddir_cache(ovp);
8061 8055 }
8062 8056 }
8063 8057
8064 8058 /*
8065 8059 * If we are renaming the unlinked file, update the
8066 8060 * r_unldvp and r_unlname as needed.
8067 8061 */
8068 8062 mutex_enter(&rp->r_statelock);
8069 8063 if (rp->r_unldvp != NULL) {
8070 8064 if (strcmp(rp->r_unlname, onm) == 0) {
8071 8065 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
8072 8066 rp->r_unlname[MAXNAMELEN - 1] = '\0';
8073 8067 if (ndvp != rp->r_unldvp) {
8074 8068 VN_RELE(rp->r_unldvp);
8075 8069 rp->r_unldvp = ndvp;
8076 8070 VN_HOLD(ndvp);
8077 8071 }
8078 8072 }
8079 8073 }
8080 8074 mutex_exit(&rp->r_statelock);
8081 8075
8082 8076 /*
8083 8077 * Notify the rename vnevents to source vnode, and to the target
8084 8078 * vnode if it already existed.
8085 8079 */
8086 8080 if (error == 0) {
8087 8081 vnode_t *tvp, *tovp;
8088 8082 rnode4_t *trp;
8089 8083
8090 8084 /*
8091 8085		 * Notify the vnode. Each link is represented by
8092 8086		 * a different vnode in NFSv4.
8093 8087 */
8094 8088 if (nvp) {
8095 8089 trp = VTOR4(nvp);
8096 8090 tvp = nvp;
8097 8091 if (IS_SHADOW(nvp, trp))
8098 8092 tvp = RTOV4(trp);
8099 8093 vnevent_rename_dest(tvp, ndvp, nnm, ct);
8100 8094 }
8101 8095
8102 8096 trp = VTOR4(ovp);
8103 8097 tovp = ovp;
8104 8098 if (IS_SHADOW(ovp, trp))
8105 8099 tovp = RTOV4(trp);
8106 8100
8107 8101 vnevent_rename_src(tovp, odvp, onm, ct);
8108 8102
8109 8103 trp = VTOR4(ndvp);
8110 8104 tvp = ndvp;
8111 8105
8112 8106 if (IS_SHADOW(ndvp, trp))
8113 8107 tvp = RTOV4(trp);
8114 8108
8115 8109 vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
8116 8110 }
8117 8111
8118 8112 if (nvp) {
8119 8113 VN_RELE(nvp);
8120 8114 }
8121 8115 VN_RELE(ovp);
8122 8116
8123 8117 nfs_rw_exit(&odrp->r_rwlock);
8124 8118 nfs_rw_exit(&ndrp->r_rwlock);
8125 8119
8126 8120 return (error);
8127 8121 }
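
nfs4rename() above avoids an AB/BA deadlock on the two directory rwlocks by always acquiring the lower-addressed rnode's lock first. The same idiom in user space (pthreads stand in for nfs_rw_enter_sig(); a shared lock is taken only once):

	#include <pthread.h>
	#include <stdint.h>

	static void
	lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a == b) {		/* same directory: single lock */
			(void) pthread_mutex_lock(a);
			return;
		}
		if ((uintptr_t)a < (uintptr_t)b) {
			(void) pthread_mutex_lock(a);
			(void) pthread_mutex_lock(b);
		} else {
			(void) pthread_mutex_lock(b);
			(void) pthread_mutex_lock(a);
		}
	}

Because every thread orders the pair the same way, two concurrent renames between the same two directories cannot each hold one lock while waiting on the other.
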
8128 8122
8129 8123 /*
8130 8124 * When the parent directory has changed, sv_dfh must be updated
8131 8125 */
8132 8126 static void
8133 8127 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8134 8128 {
8135 8129 svnode_t *sv = VTOSV(vp);
8136 8130 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8137 8131 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8138 8132
8139 8133 sfh4_hold(new_dfh);
8140 8134 sv->sv_dfh = new_dfh;
8141 8135 sfh4_rele(&old_dfh);
8142 8136 }
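
Note the ordering in update_parentdir_sfh() above: the new filehandle is held before the old one is released, so the swap stays balanced even when old_dfh and new_dfh are the same object. The generic refcount-swap shape, with a made-up struct ref for illustration:

	/* Hypothetical refcounted object; hold/rele are illustrative. */
	struct ref { int cnt; };
	static void hold(struct ref *r) { r->cnt++; }
	static void rele(struct ref *r) { r->cnt--; }

	static void
	swap_ref(struct ref **slot, struct ref *new_ref)
	{
		struct ref *old_ref = *slot;

		hold(new_ref);		/* take the new hold first ... */
		*slot = new_ref;
		rele(old_ref);		/* ... so old == new stays safe */
	}
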
8143 8137
8144 8138 /*
8145 8139 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8146 8140 * when it is known that the filehandle is persistent through rename.
8147 8141 *
8148 8142 * Rename requires that the current fh be the target directory and the
8149 8143 * saved fh be the source directory. After the operation, the current fh
8150 8144 * is unchanged.
8151 8145 * The compound op structure for persistent fh rename is:
8152 8146	 *	PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME, GETATTR(tgtdir)
8153 8147	 * plus an appended PUTFH(sourcedir), GETATTR when the two directories
8154 8148	 * differ; the post-op getattrs are used to update the directory caches.
8155 8149 */
8156 8150 static int
8157 8151 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8158 8152 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8159 8153 {
8160 8154 COMPOUND4args_clnt args;
8161 8155 COMPOUND4res_clnt res, *resp = NULL;
8162 8156 nfs_argop4 *argop;
8163 8157 nfs_resop4 *resop;
8164 8158 int doqueue, argoplist_size;
8165 8159 mntinfo4_t *mi;
8166 8160 rnode4_t *odrp = VTOR4(odvp);
8167 8161 rnode4_t *ndrp = VTOR4(ndvp);
8168 8162 RENAME4res *rn_res;
8169 8163 bool_t needrecov;
8170 8164 nfs4_recov_state_t recov_state;
8171 8165 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8172 8166 dirattr_info_t dinfo, *dinfop;
8173 8167
8174 8168 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8175 8169
8176 8170 recov_state.rs_flags = 0;
8177 8171 recov_state.rs_num_retry_despite_err = 0;
8178 8172
8179 8173 /*
8180 8174 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8181 8175 *
8182 8176 * If source/target are different dirs, then append putfh(src); getattr
8183 8177 */
8184 8178 args.array_len = (odvp == ndvp) ? 5 : 7;
8185 8179 argoplist_size = args.array_len * sizeof (nfs_argop4);
8186 8180 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8187 8181
8188 8182 recov_retry:
8189 8183 *statp = NFS4_OK;
8190 8184
8191 8185 /* No need to Lookup the file, persistent fh */
8192 8186 args.ctag = TAG_RENAME;
8193 8187
8194 8188 mi = VTOMI4(odvp);
8195 8189 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8196 8190 if (e.error) {
8197 8191 kmem_free(argop, argoplist_size);
8198 8192 return (e.error);
8199 8193 }
8200 8194
8201 8195 /* 0: putfh source directory */
8202 8196 argop[0].argop = OP_CPUTFH;
8203 8197 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8204 8198
8205 8199 /* 1: Save source fh to free up current for target */
8206 8200 argop[1].argop = OP_SAVEFH;
8207 8201
8208 8202 /* 2: putfh targetdir */
8209 8203 argop[2].argop = OP_CPUTFH;
8210 8204 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8211 8205
8212 8206 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8213 8207 argop[3].argop = OP_CRENAME;
8214 8208 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8215 8209 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8216 8210
8217 8211 /* 4: getattr (targetdir) */
8218 8212 argop[4].argop = OP_GETATTR;
8219 8213 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8220 8214 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8221 8215
8222 8216 if (ndvp != odvp) {
8223 8217
8224 8218 /* 5: putfh (sourcedir) */
8225 8219 argop[5].argop = OP_CPUTFH;
8226 8220 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8227 8221
8228 8222 /* 6: getattr (sourcedir) */
8229 8223 argop[6].argop = OP_GETATTR;
8230 8224 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8231 8225 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8232 8226 }
8233 8227
8234 8228 dnlc_remove(odvp, onm);
8235 8229 dnlc_remove(ndvp, nnm);
8236 8230
8237 8231 doqueue = 1;
8238 8232 dinfo.di_time_call = gethrtime();
8239 8233 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8240 8234
8241 8235 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8242 8236 if (e.error) {
8243 8237 PURGE_ATTRCACHE4(odvp);
8244 8238 PURGE_ATTRCACHE4(ndvp);
8245 8239 } else {
8246 8240 *statp = res.status;
8247 8241 }
8248 8242
8249 8243 if (needrecov) {
8250 8244 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8251 8245 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8252 8246 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8253 8247 if (!e.error)
8254 8248 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8255 8249 goto recov_retry;
8256 8250 }
8257 8251 }
8258 8252
8259 8253 if (!e.error) {
8260 8254 resp = &res;
8261 8255 /*
8262 8256 * Return an error if OP_RENAME (index 3) or any earlier op failed.
8263 8257 */
8264 8258 if (res.status != NFS4_OK && res.array_len <= 4) {
8265 8259 e.error = geterrno4(res.status);
8266 8260 PURGE_ATTRCACHE4(odvp);
8267 8261 PURGE_ATTRCACHE4(ndvp);
8268 8262 /*
8269 8263 * System V defines rename to return EEXIST, not
8270 8264 * ENOTEMPTY, if the target directory is not empty.
8271 8265 * Over the wire, the error is NFSERR_ENOTEMPTY
8272 8266 * which geterrno4 maps to ENOTEMPTY.
8273 8267 */
8274 8268 if (e.error == ENOTEMPTY)
8275 8269 e.error = EEXIST;
8276 8270 } else {
8277 8271
8278 8272 resop = &res.array[3]; /* rename res */
8279 8273 rn_res = &resop->nfs_resop4_u.oprename;
8280 8274
8281 8275 if (res.status == NFS4_OK) {
8282 8276 /*
8283 8277 * Update target attribute, readdir and dnlc
8284 8278 * caches.
8285 8279 */
8286 8280 dinfo.di_garp =
8287 8281 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8288 8282 dinfo.di_cred = cr;
8289 8283 dinfop = &dinfo;
8290 8284 } else
8291 8285 dinfop = NULL;
8292 8286
8293 8287 nfs4_update_dircaches(&rn_res->target_cinfo,
8294 8288 ndvp, NULL, NULL, dinfop);
8295 8289
8296 8290 /*
8297 8291 * Update source attribute, readdir and dnlc caches
8298 8292 *
8299 8293 */
8300 8294 if (ndvp != odvp) {
8301 8295 update_parentdir_sfh(renvp, ndvp);
8302 8296
8303 8297 if (dinfop)
8304 8298 dinfo.di_garp =
8305 8299 &(res.array[6].nfs_resop4_u.
8306 8300 opgetattr.ga_res);
8307 8301
8308 8302 nfs4_update_dircaches(&rn_res->source_cinfo,
8309 8303 odvp, NULL, NULL, dinfop);
8310 8304 }
8311 8305
8312 8306 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8313 8307 nnm);
8314 8308 }
8315 8309 }
8316 8310
8317 8311 if (resp)
8318 8312 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8319 8313 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8320 8314 kmem_free(argop, argoplist_size);
8321 8315
8322 8316 return (e.error);
8323 8317 }
8324 8318
8325 8319 /*
8326 8320 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8327 8321 * it is possible for the filehandle to change due to the rename.
8328 8322 *
8329 8323 * The compound req in this case includes a post-rename lookup and getattr
8330 8324 * to ensure that we have the correct fh and attributes for the object.
8331 8325 *
8332 8326 * Rename requires that the current fh be the target directory and the
8333 8327 * saved fh be the source directory. After the operation, the current fh
8334 8328 * is unchanged.
8335 8329 *
8336 8330 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8337 8331 * update the filehandle for the renamed object. We also get the old
8338 8332 * filehandle for historical reasons; this should be taken out sometime.
8339 8333 * This results in a rather cumbersome compound...
8340 8334 *
8341 8335 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8342 8336 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8343 8337 *
8344 8338 */
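/*
 * For reference, the argop[] array built below is laid out as:
 *	0: PUTFH(sourcedir)   1: SAVEFH            2: LOOKUP(src)
 *	3: GETFH(old)         4: PUTFH(targetdir)  5: RENAME
 *	6: GETATTR(targetdir) 7: LOOKUP(trgt)      8: GETFH(new)
 *	9: GETATTR(new)
 * and, only when the source and target directories differ:
 *	10: PUTFH(sourcedir)  11: GETATTR(sourcedir)
 */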
8345 8339 static int
8346 8340 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8347 8341 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8348 8342 {
8349 8343 COMPOUND4args_clnt args;
8350 8344 COMPOUND4res_clnt res, *resp = NULL;
8351 8345 int argoplist_size;
8352 8346 nfs_argop4 *argop;
8353 8347 nfs_resop4 *resop;
8354 8348 int doqueue;
8355 8349 mntinfo4_t *mi;
8356 8350 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8357 8351 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8358 8352 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8359 8353 RENAME4res *rn_res;
8360 8354 GETFH4res *ngf_res;
8361 8355 bool_t needrecov;
8362 8356 nfs4_recov_state_t recov_state;
8363 8357 hrtime_t t;
8364 8358 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8365 8359 dirattr_info_t dinfo, *dinfop = &dinfo;
8366 8360
8367 8361 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8368 8362
8369 8363 recov_state.rs_flags = 0;
8370 8364 recov_state.rs_num_retry_despite_err = 0;
8371 8365
8372 8366 recov_retry:
8373 8367 *statp = NFS4_OK;
8374 8368
8375 8369 /*
8376 8370 * There is a window between the RPC and updating the path and
8377 8371 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8378 8372 * code, so that it doesn't try to use the old path during that
8379 8373 * window.
8380 8374 */
8381 8375 mutex_enter(&orp->r_statelock);
8382 8376 while (orp->r_flags & R4RECEXPFH) {
8383 8377 klwp_t *lwp = ttolwp(curthread);
8384 8378
8385 8379 if (lwp != NULL)
8386 8380 lwp->lwp_nostop++;
8387 8381 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8388 8382 mutex_exit(&orp->r_statelock);
8389 8383 if (lwp != NULL)
8390 8384 lwp->lwp_nostop--;
8391 8385 return (EINTR);
8392 8386 }
8393 8387 if (lwp != NULL)
8394 8388 lwp->lwp_nostop--;
8395 8389 }
8396 8390 orp->r_flags |= R4RECEXPFH;
8397 8391 mutex_exit(&orp->r_statelock);
8398 8392
8399 8393 mi = VTOMI4(odvp);
8400 8394
8401 8395 args.ctag = TAG_RENAME_VFH;
8402 8396 args.array_len = (odvp == ndvp) ? 10 : 12;
8403 8397 argoplist_size = args.array_len * sizeof (nfs_argop4);
8404 8398 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8405 8399
8406 8400 /*
8407 8401 * Rename ops:
8408 8402 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8409 8403 * PUTFH(targetdir), RENAME, GETATTR(targetdir),
8410 8404 * LOOKUP(trgt), GETFH(new), GETATTR
8411 8405 *
8412 8406 * if (odvp != ndvp)
8413 8407 *	add PUTFH(sourcedir), GETATTR(sourcedir)
8414 8408 */
8415 8409 args.array = argop;
8416 8410
8417 8411 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8418 8412 &recov_state, NULL);
8419 8413 if (e.error) {
8420 8414 kmem_free(argop, argoplist_size);
8421 8415 mutex_enter(&orp->r_statelock);
8422 8416 orp->r_flags &= ~R4RECEXPFH;
8423 8417 cv_broadcast(&orp->r_cv);
8424 8418 mutex_exit(&orp->r_statelock);
8425 8419 return (e.error);
8426 8420 }
8427 8421
8428 8422 /* 0: putfh source directory */
8429 8423 argop[0].argop = OP_CPUTFH;
8430 8424 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8431 8425
8432 8426 /* 1: Save source fh to free up current for target */
8433 8427 argop[1].argop = OP_SAVEFH;
8434 8428
8435 8429 /* 2: Lookup pre-rename fh of renamed object */
8436 8430 argop[2].argop = OP_CLOOKUP;
8437 8431 argop[2].nfs_argop4_u.opclookup.cname = onm;
8438 8432
8439 8433 /* 3: getfh fh of renamed object (before rename) */
8440 8434 argop[3].argop = OP_GETFH;
8441 8435
8442 8436 /* 4: putfh targetdir */
8443 8437 argop[4].argop = OP_CPUTFH;
8444 8438 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8445 8439
8446 8440 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8447 8441 argop[5].argop = OP_CRENAME;
8448 8442 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8449 8443 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8450 8444
8451 8445 /* 6: getattr of target dir (post op attrs) */
8452 8446 argop[6].argop = OP_GETATTR;
8453 8447 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8454 8448 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8455 8449
8456 8450 /* 7: Lookup post-rename fh of renamed object */
8457 8451 argop[7].argop = OP_CLOOKUP;
8458 8452 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8459 8453
8460 8454 /* 8: getfh fh of renamed object (after rename) */
8461 8455 argop[8].argop = OP_GETFH;
8462 8456
8463 8457 /* 9: getattr of renamed object */
8464 8458 argop[9].argop = OP_GETATTR;
8465 8459 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8466 8460 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8467 8461
8468 8462 /*
8469 8463 * If source/target dirs are different, then get new post-op
8470 8464 * attrs for source dir also.
8471 8465 */
8472 8466 if (ndvp != odvp) {
8473 8467 /* 10: putfh (sourcedir) */
8474 8468 argop[10].argop = OP_CPUTFH;
8475 8469 argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8476 8470
8477 8471 /* 11: getattr (sourcedir) */
8478 8472 argop[11].argop = OP_GETATTR;
8479 8473 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8480 8474 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8481 8475 }
8482 8476
8483 8477 dnlc_remove(odvp, onm);
8484 8478 dnlc_remove(ndvp, nnm);
8485 8479
8486 8480 doqueue = 1;
8487 8481 t = gethrtime();
8488 8482 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8489 8483
8490 8484 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8491 8485 if (e.error) {
8492 8486 PURGE_ATTRCACHE4(odvp);
8493 8487 PURGE_ATTRCACHE4(ndvp);
8494 8488 if (!needrecov) {
8495 8489 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8496 8490 &recov_state, needrecov);
8497 8491 goto out;
8498 8492 }
8499 8493 } else {
8500 8494 *statp = res.status;
8501 8495 }
8502 8496
8503 8497 if (needrecov) {
8504 8498 bool_t abort;
8505 8499
8506 8500 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8507 8501 OP_RENAME, NULL, NULL, NULL);
8508 8502 if (abort == FALSE) {
8509 8503 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8510 8504 &recov_state, needrecov);
8511 8505 kmem_free(argop, argoplist_size);
8512 8506 if (!e.error)
8513 8507 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8514 8508 mutex_enter(&orp->r_statelock);
8515 8509 orp->r_flags &= ~R4RECEXPFH;
8516 8510 cv_broadcast(&orp->r_cv);
8517 8511 mutex_exit(&orp->r_statelock);
8518 8512 goto recov_retry;
8519 8513 } else {
8520 8514 if (e.error != 0) {
8521 8515 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8522 8516 &recov_state, needrecov);
8523 8517 goto out;
8524 8518 }
8525 8519 /* fall through for res.status case */
8526 8520 }
8527 8521 }
8528 8522
8529 8523 resp = &res;
8530 8524 /*
8531 8525 * If OP_RENAME (or any previous op) failed, then return an error;
8532 8526 * OP_RENAME is index 5, so array_len <= 6 indicates such a failure.
8533 8527 */
8534 8528 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8535 8529 /*
8536 8530 * Error in an op other than last Getattr
8537 8531 */
8538 8532 e.error = geterrno4(res.status);
8539 8533 PURGE_ATTRCACHE4(odvp);
8540 8534 PURGE_ATTRCACHE4(ndvp);
8541 8535 /*
8542 8536 * System V defines rename to return EEXIST, not
8543 8537 * ENOTEMPTY, if the target directory is not empty.
8544 8538 * Over the wire, the error is NFSERR_ENOTEMPTY
8545 8539 * which geterrno4 maps to ENOTEMPTY.
8546 8540 */
8547 8541 if (e.error == ENOTEMPTY)
8548 8542 e.error = EEXIST;
8549 8543 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8550 8544 needrecov);
8551 8545 goto out;
8552 8546 }
8553 8547
8554 8548 /* rename results */
8555 8549 rn_res = &res.array[5].nfs_resop4_u.oprename;
8556 8550
8557 8551 if (res.status == NFS4_OK) {
8558 8552 /* Update target attribute, readdir and dnlc caches */
8559 8553 dinfo.di_garp =
8560 8554 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8561 8555 dinfo.di_cred = cr;
8562 8556 dinfo.di_time_call = t;
8563 8557 } else
8564 8558 dinfop = NULL;
8565 8559
8566 8560 /* Update target attribute, readdir and dnlc caches */
8567 8561 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8568 8562
8569 8563 /* Update source cache attribute, readdir and dnlc caches */
8570 8564 if (ndvp != odvp) {
8571 8565 update_parentdir_sfh(ovp, ndvp);
8572 8566
8573 8567 /*
8574 8568 * If dinfop is non-NULL, then the compound succeeded, so
8575 8569 * set di_garp to the attrs for the source dir. dinfop is
8576 8570 * only set to NULL when the compound fails.
8577 8571 */
8578 8572 if (dinfop)
8579 8573 dinfo.di_garp =
8580 8574 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8581 8575 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8582 8576 dinfop);
8583 8577 }
8584 8578
8585 8579 /*
8586 8580 * Update the rnode with the new component name and args,
8587 8581 * and if the file handle changed, also update it with the new fh.
8588 8582 * This is only necessary if the target object already has an
8589 8583 * rnode entry; there is no need to create one for it.
8590 8584 */
8591 8585 resop = &res.array[8]; /* getfh new res */
8592 8586 ngf_res = &resop->nfs_resop4_u.opgetfh;
8593 8587
8594 8588 /*
8595 8589 * Update the path and filehandle for the renamed object.
8596 8590 */
8597 8591 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8598 8592
8599 8593 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8600 8594
8601 8595 if (res.status == NFS4_OK) {
8602 8596 resop++; /* getattr res */
8603 8597 e.error = nfs4_update_attrcache(res.status,
8604 8598 &resop->nfs_resop4_u.opgetattr.ga_res,
8605 8599 t, ovp, cr);
8606 8600 }
8607 8601
8608 8602 out:
8609 8603 kmem_free(argop, argoplist_size);
8610 8604 if (resp)
8611 8605 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8612 8606 mutex_enter(&orp->r_statelock);
8613 8607 orp->r_flags &= ~R4RECEXPFH;
8614 8608 cv_broadcast(&orp->r_cv);
8615 8609 mutex_exit(&orp->r_statelock);
8616 8610
8617 8611 return (e.error);
8618 8612 }
8619 8613
8620 8614 /* ARGSUSED */
8621 8615 static int
8622 8616 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8623 8617 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8624 8618 {
8625 8619 int error;
8626 8620 vnode_t *vp;
8627 8621
8628 8622 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8629 8623 return (EPERM);
8630 8624 /*
8631 8625 * As ".." has special meaning and rather than send a mkdir
8632 8626 * over the wire to just let the server freak out, we just
8633 8627 * short circuit it here and return EEXIST
8634 8628 */
8635 8629 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8636 8630 return (EEXIST);
8637 8631
8638 8632 /*
8639 8633 * Decision to get the right gid and setgid bit of the
8640 8634 * new directory is now made in call_nfs4_create_req.
8641 8635 */
8642 8636 va->va_mask |= AT_MODE;
8643 8637 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8644 8638 if (error)
8645 8639 return (error);
8646 8640
8647 8641 *vpp = vp;
8648 8642 return (0);
8649 8643 }
8650 8644
8651 8645
8652 8646 /*
8653 8647 * rmdir is using the same remove v4 op as does remove.
8654 8648 * Remove requires that the current fh be the target directory.
8655 8649 * After the operation, the current fh is unchanged.
8656 8650 * The compound op structure is:
8657 8651 * PUTFH(targetdir), REMOVE, GETATTR(targetdir)
8658 8652 */
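/*
 * As built below, that is:
 *	0: PUTFH(targetdir)   1: REMOVE   2: GETATTR(targetdir)
 */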
8659 8653 /*ARGSUSED4*/
8660 8654 static int
8661 8655 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
8662 8656 caller_context_t *ct, int flags)
8663 8657 {
8664 8658 int need_end_op = FALSE;
8665 8659 COMPOUND4args_clnt args;
8666 8660 COMPOUND4res_clnt res, *resp = NULL;
8667 8661 REMOVE4res *rm_res;
8668 8662 nfs_argop4 argop[3];
8669 8663 nfs_resop4 *resop;
8670 8664 vnode_t *vp;
8671 8665 int doqueue;
8672 8666 mntinfo4_t *mi;
8673 8667 rnode4_t *drp;
8674 8668 bool_t needrecov = FALSE;
8675 8669 nfs4_recov_state_t recov_state;
8676 8670 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8677 8671 dirattr_info_t dinfo, *dinfop;
8678 8672
8679 8673 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8680 8674 return (EPERM);
8681 8675 /*
8682 8676 * As ".." has special meaning and rather than send a rmdir
8683 8677 * over the wire to just let the server freak out, we just
8684 8678 * short circuit it here and return EEXIST
8685 8679 */
8686 8680 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8687 8681 return (EEXIST);
8688 8682
8689 8683 drp = VTOR4(dvp);
8690 8684 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8691 8685 return (EINTR);
8692 8686
8693 8687 /*
8694 8688 * Attempt to prevent a rmdir(".") from succeeding.
8695 8689 */
8696 8690 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8697 8691 if (e.error) {
8698 8692 nfs_rw_exit(&drp->r_rwlock);
8699 8693 return (e.error);
8700 8694 }
8701 8695 if (vp == cdir) {
8702 8696 VN_RELE(vp);
8703 8697 nfs_rw_exit(&drp->r_rwlock);
8704 8698 return (EINVAL);
8705 8699 }
8706 8700
8707 8701 /*
8708 8702 * Since nfsv4 remove op works on both files and directories,
8709 8703 * check that the removed object is indeed a directory.
8710 8704 */
8711 8705 if (vp->v_type != VDIR) {
8712 8706 VN_RELE(vp);
8713 8707 nfs_rw_exit(&drp->r_rwlock);
8714 8708 return (ENOTDIR);
8715 8709 }
8716 8710
8717 8711 /*
8718 8712 * First just remove the entry from the name cache, as it
8719 8713 * is most likely an entry for this vp.
8720 8714 */
8721 8715 dnlc_remove(dvp, nm);
8722 8716
8723 8717 /*
8724 8718 * If the vnode reference count is greater than one, then
8725 8719 * there may be additional references in the DNLC which will
8726 8720 * need to be purged. First, try removing the entry for
8727 8721 * the parent directory and see if that removes the additional
8728 8722 * reference(s). If that doesn't do it, then use dnlc_purge_vp
8729 8723 * to completely remove any references to the directory which
8730 8724 * might still exist in the DNLC.
8731 8725 */
8732 8726 if (vp->v_count > 1) {
8733 8727 dnlc_remove(vp, "..");
8734 8728 if (vp->v_count > 1)
8735 8729 dnlc_purge_vp(vp);
8736 8730 }
8737 8731
8738 8732 mi = VTOMI4(dvp);
8739 8733 recov_state.rs_flags = 0;
8740 8734 recov_state.rs_num_retry_despite_err = 0;
8741 8735
8742 8736 recov_retry:
8743 8737 args.ctag = TAG_RMDIR;
8744 8738
8745 8739 /*
8746 8740 * Rmdir ops: putfh dir; remove; getattr dir
8747 8741 */
8748 8742 args.array_len = 3;
8749 8743 args.array = argop;
8750 8744
8751 8745 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8752 8746 if (e.error) {
8753 8747 nfs_rw_exit(&drp->r_rwlock);
8754 8748 return (e.error);
8755 8749 }
8756 8750 need_end_op = TRUE;
8757 8751
8758 8752 /* putfh directory */
8759 8753 argop[0].argop = OP_CPUTFH;
8760 8754 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8761 8755
8762 8756 /* remove */
8763 8757 argop[1].argop = OP_CREMOVE;
8764 8758 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8765 8759
8766 8760 /* getattr (postop attrs for dir that contained removed dir) */
8767 8761 argop[2].argop = OP_GETATTR;
8768 8762 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8769 8763 argop[2].nfs_argop4_u.opgetattr.mi = mi;
8770 8764
8771 8765 dinfo.di_time_call = gethrtime();
8772 8766 doqueue = 1;
8773 8767 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8774 8768
8775 8769 PURGE_ATTRCACHE4(vp);
8776 8770
8777 8771 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8778 8772 if (e.error) {
8779 8773 PURGE_ATTRCACHE4(dvp);
8780 8774 }
8781 8775
8782 8776 if (needrecov) {
8783 8777 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8784 8778 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
8785 8779 if (!e.error)
8786 8780 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8787 8781
8788 8782 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8789 8783 needrecov);
8790 8784 need_end_op = FALSE;
8791 8785 goto recov_retry;
8792 8786 }
8793 8787 }
8794 8788
8795 8789 if (!e.error) {
8796 8790 resp = &res;
8797 8791
8798 8792 /*
8799 8793 * Only return error if first 2 ops (OP_REMOVE or earlier)
8800 8794 * failed.
8801 8795 */
8802 8796 if (res.status != NFS4_OK && res.array_len <= 2) {
8803 8797 e.error = geterrno4(res.status);
8804 8798 PURGE_ATTRCACHE4(dvp);
8805 8799 nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8806 8800 &recov_state, needrecov);
8807 8801 need_end_op = FALSE;
8808 8802 nfs4_purge_stale_fh(e.error, dvp, cr);
8809 8803 /*
8810 8804 * System V defines rmdir to return EEXIST, not
8811 8805 * ENOTEMPTY, if the directory is not empty. Over
8812 8806 * the wire, the error is NFSERR_ENOTEMPTY which
8813 8807 * geterrno4 maps to ENOTEMPTY.
8814 8808 */
8815 8809 if (e.error == ENOTEMPTY)
8816 8810 e.error = EEXIST;
8817 8811 } else {
8818 8812 resop = &res.array[1]; /* remove res */
8819 8813 rm_res = &resop->nfs_resop4_u.opremove;
8820 8814
8821 8815 if (res.status == NFS4_OK) {
8822 8816 resop = &res.array[2]; /* dir attrs */
8823 8817 dinfo.di_garp =
8824 8818 &resop->nfs_resop4_u.opgetattr.ga_res;
8825 8819 dinfo.di_cred = cr;
8826 8820 dinfop = &dinfo;
8827 8821 } else
8828 8822 dinfop = NULL;
8829 8823
8830 8824 /* Update dir attribute, readdir and dnlc caches */
8831 8825 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8832 8826 dinfop);
8833 8827
8834 8828 /* destroy rddir cache for dir that was removed */
8835 8829 if (VTOR4(vp)->r_dir != NULL)
8836 8830 nfs4_purge_rddir_cache(vp);
8837 8831 }
8838 8832 }
8839 8833
8840 8834 if (need_end_op)
8841 8835 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8842 8836
8843 8837 nfs_rw_exit(&drp->r_rwlock);
8844 8838
8845 8839 if (resp)
8846 8840 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8847 8841
8848 8842 if (e.error == 0) {
8849 8843 vnode_t *tvp;
8850 8844 rnode4_t *trp;
8851 8845 trp = VTOR4(vp);
8852 8846 tvp = vp;
8853 8847 if (IS_SHADOW(vp, trp))
8854 8848 tvp = RTOV4(trp);
8855 8849 vnevent_rmdir(tvp, dvp, nm, ct);
8856 8850 }
8857 8851
8858 8852 VN_RELE(vp);
8859 8853
8860 8854 return (e.error);
8861 8855 }
8862 8856
8863 8857 /* ARGSUSED */
8864 8858 static int
8865 8859 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8866 8860 caller_context_t *ct, int flags)
8867 8861 {
8868 8862 int error;
8869 8863 vnode_t *vp;
8870 8864 rnode4_t *rp;
8871 8865 char *contents;
8872 8866 mntinfo4_t *mi = VTOMI4(dvp);
8873 8867
8874 8868 if (nfs_zone() != mi->mi_zone)
8875 8869 return (EPERM);
8876 8870 if (!(mi->mi_flags & MI4_SYMLINK))
8877 8871 return (EOPNOTSUPP);
8878 8872
8879 8873 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8880 8874 if (error)
8881 8875 return (error);
8882 8876
8883 8877 ASSERT(nfs4_consistent_type(vp));
8884 8878 rp = VTOR4(vp);
8885 8879 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8886 8880
8887 8881 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8888 8882
8889 8883 if (contents != NULL) {
8890 8884 mutex_enter(&rp->r_statelock);
8891 8885 if (rp->r_symlink.contents == NULL) {
8892 8886 rp->r_symlink.len = strlen(tnm);
8893 8887 bcopy(tnm, contents, rp->r_symlink.len);
8894 8888 rp->r_symlink.contents = contents;
8895 8889 rp->r_symlink.size = MAXPATHLEN;
8896 8890 mutex_exit(&rp->r_statelock);
8897 8891 } else {
8898 8892 mutex_exit(&rp->r_statelock);
8899 8893 kmem_free((void *)contents, MAXPATHLEN);
8900 8894 }
8901 8895 }
8902 8896 }
8903 8897 VN_RELE(vp);
8904 8898
8905 8899 return (error);
8906 8900 }
8907 8901
8908 8902
8909 8903 /*
8910 8904 * Read directory entries.
8911 8905 * There are some weird things to look out for here. The uio_loffset
8912 8906 * field is either 0 or it is the offset returned from a previous
8913 8907 * readdir. It is an opaque value used by the server to find the
8914 8908 * correct directory block to read. The count field is the number
8915 8909 * of blocks to read on the server. This is advisory only; the server
8916 8910 * may return only one block's worth of entries. Entries may be compressed
8917 8911 * on the server.
8918 8912 */
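/*
 * A rough sketch of the resulting cookie flow (offsets 0, 1 and 2
 * are reserved by this client for the synthesized "." and ".."
 * entries; see nfs4readdir() below):
 *
 *	getdents() #1: uio_loffset == 0  -> OTW READDIR cookie 0
 *	getdents() #2: uio_loffset == d_off of the last dirent from
 *	               #1, an opaque server cookie passed back OTW
 *	... until a reply sets eof; r_direof then lets the final
 *	zero-byte getdents() be answered from the cache.
 */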
8919 8913 /* ARGSUSED */
8920 8914 static int
8921 8915 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
8922 8916 caller_context_t *ct, int flags)
8923 8917 {
8924 8918 int error;
8925 8919 uint_t count;
8926 8920 rnode4_t *rp;
8927 8921 rddir4_cache *rdc;
8928 8922 rddir4_cache *rrdc;
8929 8923
8930 8924 if (nfs_zone() != VTOMI4(vp)->mi_zone)
8931 8925 return (EIO);
8932 8926 rp = VTOR4(vp);
8933 8927
8934 8928 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8935 8929
8936 8930 /*
8937 8931 * Make sure that the directory cache is valid.
8938 8932 */
8939 8933 if (rp->r_dir != NULL) {
8940 8934 if (nfs_disable_rddir_cache != 0) {
8941 8935 /*
8942 8936 * Setting nfs_disable_rddir_cache in /etc/system
8943 8937 * allows interoperability with servers that do not
8944 8938 * properly update the attributes of directories.
8945 8939 * Any cached information gets purged before an
8946 8940 * access is made to it.
8947 8941 */
8948 8942 nfs4_purge_rddir_cache(vp);
8949 8943 }
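/*
 * For illustration, the tunable would be set from /etc/system
 * with a line such as the following (assuming the variable
 * lives in the common "nfs" client module):
 *
 *	set nfs:nfs_disable_rddir_cache = 1
 */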
8950 8944
8951 8945 error = nfs4_validate_caches(vp, cr);
8952 8946 if (error)
8953 8947 return (error);
8954 8948 }
8955 8949
8956 8950 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8957 8951
8958 8952 /*
8959 8953 * Short circuit last readdir which always returns 0 bytes.
8960 8954 * This can be done after the directory has been read through
8961 8955 * completely at least once. This will set r_direof which
8962 8956 * can be used to find the value of the last cookie.
8963 8957 */
8964 8958 mutex_enter(&rp->r_statelock);
8965 8959 if (rp->r_direof != NULL &&
8966 8960 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8967 8961 mutex_exit(&rp->r_statelock);
8968 8962 #ifdef DEBUG
8969 8963 nfs4_readdir_cache_shorts++;
8970 8964 #endif
8971 8965 if (eofp)
8972 8966 *eofp = 1;
8973 8967 return (0);
8974 8968 }
8975 8969
8976 8970 /*
8977 8971 * Look for a cache entry. Cache entries are identified
8978 8972 * by the NFS cookie value and the byte count requested.
8979 8973 */
8980 8974 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8981 8975
8982 8976 /*
8983 8977 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8984 8978 */
8985 8979 if (rdc == NULL) {
8986 8980 mutex_exit(&rp->r_statelock);
8987 8981 return (EINTR);
8988 8982 }
8989 8983
8990 8984 /*
8991 8985 * Check to see if we need to fill this entry in.
8992 8986 */
8993 8987 if (rdc->flags & RDDIRREQ) {
8994 8988 rdc->flags &= ~RDDIRREQ;
8995 8989 rdc->flags |= RDDIR;
8996 8990 mutex_exit(&rp->r_statelock);
8997 8991
8998 8992 /*
8999 8993 * Do the readdir.
9000 8994 */
9001 8995 nfs4readdir(vp, rdc, cr);
9002 8996
9003 8997 /*
9004 8998 * Reacquire the lock, so that we can continue
9005 8999 */
9006 9000 mutex_enter(&rp->r_statelock);
9007 9001 /*
9008 9002 * The entry is now complete
9009 9003 */
9010 9004 rdc->flags &= ~RDDIR;
9011 9005 }
9012 9006
9013 9007 ASSERT(!(rdc->flags & RDDIR));
9014 9008
9015 9009 /*
9016 9010 * If an error occurred while attempting
9017 9011 * to fill the cache entry, mark the entry invalid and
9018 9012 * just return the error.
9019 9013 */
9020 9014 if (rdc->error) {
9021 9015 error = rdc->error;
9022 9016 rdc->flags |= RDDIRREQ;
9023 9017 rddir4_cache_rele(rp, rdc);
9024 9018 mutex_exit(&rp->r_statelock);
9025 9019 return (error);
9026 9020 }
9027 9021
9028 9022 /*
9029 9023 * The cache entry is complete and good,
9030 9024 * copyout the dirent structs to the calling
9031 9025 * thread.
9032 9026 */
9033 9027 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
9034 9028
9035 9029 /*
9036 9030 * If no error occurred during the copyout,
9037 9031 * update the offset in the uio struct to
9038 9032 * contain the value of the next NFS 4 cookie
9039 9033 * and set the eof value appropriately.
9040 9034 */
9041 9035 if (!error) {
9042 9036 uiop->uio_loffset = rdc->nfs4_ncookie;
9043 9037 if (eofp)
9044 9038 *eofp = rdc->eof;
9045 9039 }
9046 9040
9047 9041 /*
9048 9042 * Decide whether to do readahead. Don't if we
9049 9043 * have already read to the end of directory.
9050 9044 */
9051 9045 if (rdc->eof) {
9052 9046 /*
9053 9047 * Make the entry the direof only if it is cached
9054 9048 */
9055 9049 if (rdc->flags & RDDIRCACHED)
9056 9050 rp->r_direof = rdc;
9057 9051 rddir4_cache_rele(rp, rdc);
9058 9052 mutex_exit(&rp->r_statelock);
9059 9053 return (error);
9060 9054 }
9061 9055
9062 9056 /* Determine if a readdir readahead should be done */
9063 9057 if (!(rp->r_flags & R4LOOKUP)) {
9064 9058 rddir4_cache_rele(rp, rdc);
9065 9059 mutex_exit(&rp->r_statelock);
9066 9060 return (error);
9067 9061 }
9068 9062
9069 9063 /*
9070 9064 * Now look for a readahead entry.
9071 9065 *
9072 9066 * Check to see whether we found an entry for the readahead.
9073 9067 * If so, we don't need to do anything further, so free the new
9074 9068 * entry if one was allocated. Otherwise, allocate a new entry, add
9075 9069 * it to the cache, and then initiate an asynchronous readdir
9076 9070 * operation to fill it.
9077 9071 */
9078 9072 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
9079 9073
9080 9074 /*
9081 9075 * A readdir cache entry could not be obtained for the readahead. In
9082 9076 * this case we skip the readahead and return.
9083 9077 */
9084 9078 if (rrdc == NULL) {
9085 9079 rddir4_cache_rele(rp, rdc);
9086 9080 mutex_exit(&rp->r_statelock);
9087 9081 return (error);
9088 9082 }
9089 9083
9090 9084 /*
9091 9085 * Check to see if we need to fill this entry in.
9092 9086 */
9093 9087 if (rrdc->flags & RDDIRREQ) {
9094 9088 rrdc->flags &= ~RDDIRREQ;
9095 9089 rrdc->flags |= RDDIR;
9096 9090 rddir4_cache_rele(rp, rdc);
9097 9091 mutex_exit(&rp->r_statelock);
9098 9092 #ifdef DEBUG
9099 9093 nfs4_readdir_readahead++;
9100 9094 #endif
9101 9095 /*
9102 9096 * Do the readdir.
9103 9097 */
9104 9098 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
9105 9099 return (error);
9106 9100 }
9107 9101
9108 9102 rddir4_cache_rele(rp, rrdc);
9109 9103 rddir4_cache_rele(rp, rdc);
9110 9104 mutex_exit(&rp->r_statelock);
9111 9105 return (error);
9112 9106 }
9113 9107
9114 9108 static int
9115 9109 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9116 9110 {
9117 9111 int error;
9118 9112 rnode4_t *rp;
9119 9113
9120 9114 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9121 9115
9122 9116 rp = VTOR4(vp);
9123 9117
9124 9118 /*
9125 9119 * Obtain the readdir results for the caller.
9126 9120 */
9127 9121 nfs4readdir(vp, rdc, cr);
9128 9122
9129 9123 mutex_enter(&rp->r_statelock);
9130 9124 /*
9131 9125 * The entry is now complete
9132 9126 */
9133 9127 rdc->flags &= ~RDDIR;
9134 9128
9135 9129 error = rdc->error;
9136 9130 if (error)
9137 9131 rdc->flags |= RDDIRREQ;
9138 9132 rddir4_cache_rele(rp, rdc);
9139 9133 mutex_exit(&rp->r_statelock);
9140 9134
9141 9135 return (error);
9142 9136 }
9143 9137
9144 9138 /*
9145 9139 * Read directory entries.
9146 9140 * There are some weird things to look out for here. The uio_loffset
9147 9141 * field is either 0 or it is the offset returned from a previous
9148 9142 * readdir. It is an opaque value used by the server to find the
9149 9143 * correct directory block to read. The count field is the number
9150 9144 * of blocks to read on the server. This is advisory only; the server
9151 9145 * may return only one block's worth of entries. Entries may be compressed
9152 9146 * on the server.
9153 9147 *
9154 9148 * Generates the following compound request:
9155 9149 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9156 9150 * must include a Lookupp as well. In this case, send:
9157 9151 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9158 9152 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9159 9153 *
9160 9154 * Get complete attributes and filehandles for entries if this is the
9161 9155 * first read of the directory. Otherwise, just get fileid's.
9162 9156 */
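/*
 * In terms of the argop[] array built below, case 2 uses entries
 * 0-1 (num_ops == 2) and case 1 uses entries 0-4 (num_ops == 5):
 *	0: PUTFH(dir)   1: READDIR
 *	2: LOOKUPP      3: GETFH      4: GETATTR   (case 1 only)
 */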
9163 9157 static void
9164 9158 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9165 9159 {
9166 9160 COMPOUND4args_clnt args;
9167 9161 COMPOUND4res_clnt res;
9168 9162 READDIR4args *rargs;
9169 9163 READDIR4res_clnt *rd_res;
9170 9164 bitmap4 rd_bitsval;
9171 9165 nfs_argop4 argop[5];
9172 9166 nfs_resop4 *resop;
9173 9167 rnode4_t *rp = VTOR4(vp);
9174 9168 mntinfo4_t *mi = VTOMI4(vp);
9175 9169 int doqueue;
9176 9170 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9177 9171 vnode_t *dvp;
9178 9172 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9179 9173 int num_ops, res_opcnt;
9180 9174 bool_t needrecov = FALSE;
9181 9175 nfs4_recov_state_t recov_state;
9182 9176 hrtime_t t;
9183 9177 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9184 9178
9185 9179 ASSERT(nfs_zone() == mi->mi_zone);
9186 9180 ASSERT(rdc->flags & RDDIR);
9187 9181 ASSERT(rdc->entries == NULL);
9188 9182
9189 9183 /*
9190 9184 * If rp were a stub, it should have triggered and caused
9191 9185 * a mount for us to get this far.
9192 9186 */
9193 9187 ASSERT(!RP_ISSTUB(rp));
9194 9188
9195 9189 num_ops = 2;
9196 9190 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9197 9191 /*
9198 9192 * Since nfsv4 readdir may not return entries for "." and "..",
9199 9193 * the client must recreate them:
9200 9194 * To find the correct nodeid, do the following:
9201 9195 * For current node, get nodeid from dnlc.
9202 9196 * - if current node is rootvp, set pnodeid to nodeid.
9203 9197 * - else if parent is in the dnlc, get its nodeid from there.
9204 9198 * - else add LOOKUPP+GETATTR to compound.
9205 9199 */
9206 9200 nodeid = rp->r_attr.va_nodeid;
9207 9201 if (vp->v_flag & VROOT) {
9208 9202 pnodeid = nodeid; /* root of mount point */
9209 9203 } else {
9210 9204 dvp = dnlc_lookup(vp, "..");
9211 9205 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9212 9206 /* parent in dnlc cache - no need for otw */
9213 9207 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9214 9208 } else {
9215 9209 /*
9216 9210 * parent not in dnlc cache,
9217 9211 * do lookupp to get its id
9218 9212 */
9219 9213 num_ops = 5;
9220 9214 pnodeid = 0; /* set later by getattr parent */
9221 9215 }
9222 9216 if (dvp)
9223 9217 VN_RELE(dvp);
9224 9218 }
9225 9219 }
9226 9220 recov_state.rs_flags = 0;
9227 9221 recov_state.rs_num_retry_despite_err = 0;
9228 9222
9229 9223 /* Save the original mount point security flavor */
9230 9224 (void) save_mnt_secinfo(mi->mi_curr_serv);
9231 9225
9232 9226 recov_retry:
9233 9227 args.ctag = TAG_READDIR;
9234 9228
9235 9229 args.array = argop;
9236 9230 args.array_len = num_ops;
9237 9231
9238 9232 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9239 9233 &recov_state, NULL)) {
9240 9234 /*
9241 9235 * If readdir a node that is a stub for a crossed mount point,
9242 9236 * keep the original secinfo flavor for the current file
9243 9237 * system, not the crossed one.
9244 9238 */
9245 9239 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9246 9240 rdc->error = e.error;
9247 9241 return;
9248 9242 }
9249 9243
9250 9244 /*
9251 9245 * Determine which attrs to request for dirents. This code
9252 9246 * must be protected by nfs4_start/end_fop because of r_server
9253 9247 * (which will change during failover recovery).
9254 9248 *
9255 9249 */
9256 9250 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9257 9251 /*
9258 9252 * Get all vattr attrs plus filehandle and rdattr_error
9259 9253 */
9260 9254 rd_bitsval = NFS4_VATTR_MASK |
9261 9255 FATTR4_RDATTR_ERROR_MASK |
9262 9256 FATTR4_FILEHANDLE_MASK;
9263 9257
9264 9258 if (rp->r_flags & R4READDIRWATTR) {
9265 9259 mutex_enter(&rp->r_statelock);
9266 9260 rp->r_flags &= ~R4READDIRWATTR;
9267 9261 mutex_exit(&rp->r_statelock);
9268 9262 }
9269 9263 } else {
9270 9264 servinfo4_t *svp = rp->r_server;
9271 9265
9272 9266 /*
9273 9267 * Already read directory. Use readdir with
9274 9268 * no attrs (except for mounted_on_fileid) for updates.
9275 9269 */
9276 9270 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9277 9271
9278 9272 /*
9279 9273 * request mounted on fileid if supported, else request
9280 9274 * fileid. maybe we should verify that fileid is supported
9281 9275 * and request something else if not.
9282 9276 */
9283 9277 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9284 9278 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9285 9279 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9286 9280 nfs_rw_exit(&svp->sv_lock);
9287 9281 }
9288 9282
9289 9283 /* putfh directory fh */
9290 9284 argop[0].argop = OP_CPUTFH;
9291 9285 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9292 9286
9293 9287 argop[1].argop = OP_READDIR;
9294 9288 rargs = &argop[1].nfs_argop4_u.opreaddir;
9295 9289 /*
9296 9290 * Cookies 1 and 2 are reserved for the client's "." and ".."
9297 9291 * entry offsets; cookie 0 is used over the wire to start reading
9298 9292 * at the beginning of the directory, excluding "." and "..".
9299 9293 */
9300 9294 if (rdc->nfs4_cookie == 0 ||
9301 9295 rdc->nfs4_cookie == 1 ||
9302 9296 rdc->nfs4_cookie == 2) {
9303 9297 rargs->cookie = (nfs_cookie4)0;
9304 9298 rargs->cookieverf = 0;
9305 9299 } else {
9306 9300 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9307 9301 mutex_enter(&rp->r_statelock);
9308 9302 rargs->cookieverf = rp->r_cookieverf4;
9309 9303 mutex_exit(&rp->r_statelock);
9310 9304 }
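/*
 * To summarize the mapping above: client cookies 0, 1 and 2
 * (reserved for "." and "..") all go over the wire as cookie 0
 * with a zero verifier; any other cookie is passed through
 * unchanged along with the verifier cached from the previous
 * READDIR reply.
 */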
9311 9305 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9312 9306 rargs->maxcount = mi->mi_tsize;
9313 9307 rargs->attr_request = rd_bitsval;
9314 9308 rargs->rdc = rdc;
9315 9309 rargs->dvp = vp;
9316 9310 rargs->mi = mi;
9317 9311 rargs->cr = cr;
9318 9312
9319 9313
9320 9314 /*
9321 9315 * If the count is less than the minimum required, we return
9322 9316 * no entries and fail with EINVAL.
9323 9317 */
9324 9318 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9325 9319 rdc->error = EINVAL;
9326 9320 goto out;
9327 9321 }
9328 9322
9329 9323 if (args.array_len == 5) {
9330 9324 /*
9331 9325 * Add lookupp and getattr for parent nodeid.
9332 9326 */
9333 9327 argop[2].argop = OP_LOOKUPP;
9334 9328
9335 9329 argop[3].argop = OP_GETFH;
9336 9330
9337 9331 /* getattr parent */
9338 9332 argop[4].argop = OP_GETATTR;
9339 9333 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9340 9334 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9341 9335 }
9342 9336
9343 9337 doqueue = 1;
9344 9338
9345 9339 if (mi->mi_io_kstats) {
9346 9340 mutex_enter(&mi->mi_lock);
9347 9341 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9348 9342 mutex_exit(&mi->mi_lock);
9349 9343 }
9350 9344
9351 9345 /* capture the time of this call */
9352 9346 rargs->t = t = gethrtime();
9353 9347
9354 9348 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9355 9349
9356 9350 if (mi->mi_io_kstats) {
9357 9351 mutex_enter(&mi->mi_lock);
9358 9352 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9359 9353 mutex_exit(&mi->mi_lock);
9360 9354 }
9361 9355
9362 9356 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9363 9357
9364 9358 /*
9365 9359 * If RPC error occurred and it isn't an error that
9366 9360 * triggers recovery, then go ahead and fail now.
9367 9361 */
9368 9362 if (e.error != 0 && !needrecov) {
9369 9363 rdc->error = e.error;
9370 9364 goto out;
9371 9365 }
9372 9366
9373 9367 if (needrecov) {
9374 9368 bool_t abort;
9375 9369
9376 9370 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9377 9371 "nfs4readdir: initiating recovery.\n"));
9378 9372
9379 9373 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9380 9374 NULL, OP_READDIR, NULL, NULL, NULL);
9381 9375 if (abort == FALSE) {
9382 9376 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9383 9377 &recov_state, needrecov);
9384 9378 if (!e.error)
9385 9379 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9386 9380 if (rdc->entries != NULL) {
9387 9381 kmem_free(rdc->entries, rdc->entlen);
9388 9382 rdc->entries = NULL;
9389 9383 }
9390 9384 goto recov_retry;
9391 9385 }
9392 9386
9393 9387 if (e.error != 0) {
9394 9388 rdc->error = e.error;
9395 9389 goto out;
9396 9390 }
9397 9391
9398 9392 /* fall through for res.status case */
9399 9393 }
9400 9394
9401 9395 res_opcnt = res.array_len;
9402 9396
9403 9397 /*
9404 9398 * If compound failed first 2 ops (PUTFH+READDIR), then return
9405 9399 * failure here. Subsequent ops are for filling out dot-dot
9406 9400 * dirent, and if they fail, we still want to give the caller
9407 9401 * the dirents returned by (the successful) READDIR op, so we need
9408 9402 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9409 9403 *
9410 9404 * One example where PUTFH+READDIR ops would succeed but
9411 9405 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9412 9406 * but lacks x. In this case, a POSIX server's VOP_READDIR
9413 9407 * would succeed; however, VOP_LOOKUP(..) would fail since no
9414 9408 * x perm. We need to come up with a non-vendor-specific way
9415 9409 * for a POSIX server to return d_ino from dotdot's dirent if
9416 9410 * client only requests mounted_on_fileid, and just say the
9417 9411 * LOOKUPP succeeded and fill out the GETATTR. However, if
9418 9412 * client requested any mandatory attrs, server would be required
9419 9413 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9420 9414 * for dotdot.
9421 9415 */
9422 9416
9423 9417 if (res.status) {
9424 9418 if (res_opcnt <= 2) {
9425 9419 e.error = geterrno4(res.status);
9426 9420 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9427 9421 &recov_state, needrecov);
9428 9422 nfs4_purge_stale_fh(e.error, vp, cr);
9429 9423 rdc->error = e.error;
9430 9424 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9431 9425 if (rdc->entries != NULL) {
9432 9426 kmem_free(rdc->entries, rdc->entlen);
9433 9427 rdc->entries = NULL;
9434 9428 }
9435 9429 /*
9436 9430 * If readdir a node that is a stub for a
9437 9431 * crossed mount point, keep the original
9438 9432 * secinfo flavor for the current file system,
9439 9433 * not the crossed one.
9440 9434 */
9441 9435 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9442 9436 return;
9443 9437 }
9444 9438 }
9445 9439
9446 9440 resop = &res.array[1]; /* readdir res */
9447 9441 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9448 9442
9449 9443 mutex_enter(&rp->r_statelock);
9450 9444 rp->r_cookieverf4 = rd_res->cookieverf;
9451 9445 mutex_exit(&rp->r_statelock);
9452 9446
9453 9447 /*
9454 9448 * For "." and ".." entries
9455 9449 * e.g.
9456 9450 * seek(cookie=0) -> "." entry with d_off = 1
9457 9451 * seek(cookie=1) -> ".." entry with d_off = 2
9458 9452 */
9459 9453 if (cookie == (nfs_cookie4) 0) {
9460 9454 if (rd_res->dotp)
9461 9455 rd_res->dotp->d_ino = nodeid;
9462 9456 if (rd_res->dotdotp)
9463 9457 rd_res->dotdotp->d_ino = pnodeid;
9464 9458 }
9465 9459 if (cookie == (nfs_cookie4) 1) {
9466 9460 if (rd_res->dotdotp)
9467 9461 rd_res->dotdotp->d_ino = pnodeid;
9468 9462 }
9469 9463
9470 9464
9471 9465 /* LOOKUPP+GETATTR attempted */
9472 9466 if (args.array_len == 5 && rd_res->dotdotp) {
9473 9467 if (res.status == NFS4_OK && res_opcnt == 5) {
9474 9468 nfs_fh4 *fhp;
9475 9469 nfs4_sharedfh_t *sfhp;
9476 9470 vnode_t *pvp;
9477 9471 nfs4_ga_res_t *garp;
9478 9472
9479 9473 resop++; /* lookupp */
9480 9474 resop++; /* getfh */
9481 9475 fhp = &resop->nfs_resop4_u.opgetfh.object;
9482 9476
9483 9477 resop++; /* getattr of parent */
9484 9478
9485 9479 /*
9486 9480 * First, take care of finishing the
9487 9481 * readdir results.
9488 9482 */
9489 9483 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9490 9484 /*
9491 9485 * The d_ino of .. must be the inode number
9492 9486 * of the mounted filesystem.
9493 9487 */
9494 9488 if (garp->n4g_va.va_mask & AT_NODEID)
9495 9489 rd_res->dotdotp->d_ino =
9496 9490 garp->n4g_va.va_nodeid;
9497 9491
9498 9492
9499 9493 /*
9500 9494 * Next, create the ".." dnlc entry
9501 9495 */
9502 9496 sfhp = sfh4_get(fhp, mi);
9503 9497 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9504 9498 dnlc_update(vp, "..", pvp);
9505 9499 VN_RELE(pvp);
9506 9500 }
9507 9501 sfh4_rele(&sfhp);
9508 9502 }
9509 9503 }
9510 9504
9511 9505 if (mi->mi_io_kstats) {
9512 9506 mutex_enter(&mi->mi_lock);
9513 9507 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9514 9508 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9515 9509 mutex_exit(&mi->mi_lock);
9516 9510 }
9517 9511
9518 9512 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9519 9513
9520 9514 out:
9521 9515 /*
9522 9516 * If readdir a node that is a stub for a crossed mount point,
9523 9517 * keep the original secinfo flavor for the current file system,
9524 9518 * not the crossed one.
9525 9519 */
9526 9520 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9527 9521
9528 9522 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9529 9523 }
9530 9524
9531 9525
9532 9526 static int
9533 9527 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9534 9528 {
9535 9529 rnode4_t *rp = VTOR4(bp->b_vp);
9536 9530 int count;
9537 9531 int error;
9538 9532 cred_t *cred_otw = NULL;
9539 9533 offset_t offset;
9540 9534 nfs4_open_stream_t *osp = NULL;
9541 9535 bool_t first_time = TRUE; /* first time getting otw cred */
9542 9536 bool_t last_time = FALSE; /* last time getting otw cred */
9543 9537
9544 9538 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9545 9539
9546 9540 DTRACE_IO1(start, struct buf *, bp);
9547 9541 offset = ldbtob(bp->b_lblkno);
9548 9542
9549 9543 if (bp->b_flags & B_READ) {
9550 9544 read_again:
9551 9545 /*
9552 9546 * Releases the osp, if it is provided.
9553 9547 * Puts a hold on the cred_otw and the new osp (if found).
9554 9548 */
9555 9549 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9556 9550 &first_time, &last_time);
9557 9551 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9558 9552 offset, bp->b_bcount, &bp->b_resid, cred_otw,
9559 9553 readahead, NULL);
9560 9554 crfree(cred_otw);
9561 9555 if (!error) {
9562 9556 if (bp->b_resid) {
9563 9557 /*
9564 9558 * Didn't get it all because we hit EOF,
9565 9559 * zero all the memory beyond the EOF.
9566 9560 */
9568 9562 bzero(bp->b_un.b_addr +
9569 9563 bp->b_bcount - bp->b_resid, bp->b_resid);
9570 9564 }
9571 9565 mutex_enter(&rp->r_statelock);
9572 9566 if (bp->b_resid == bp->b_bcount &&
9573 9567 offset >= rp->r_size) {
9574 9568 /*
9575 9569 * We didn't read anything at all as we are
9576 9570 * past EOF. Return an error indicator back
9577 9571 * but don't destroy the pages (yet).
9578 9572 */
9579 9573 error = NFS_EOF;
9580 9574 }
9581 9575 mutex_exit(&rp->r_statelock);
9582 9576 } else if (error == EACCES && last_time == FALSE) {
9583 9577 goto read_again;
9584 9578 }
9585 9579 } else {
9586 9580 if (!(rp->r_flags & R4STALE)) {
9587 9581 write_again:
9588 9582 /*
9589 9583 * Releases the osp, if it is provided.
9590 9584 * Puts a hold on the cred_otw and the new
9591 9585 * osp (if found).
9592 9586 */
9593 9587 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9594 9588 &first_time, &last_time);
9595 9589 mutex_enter(&rp->r_statelock);
9596 9590 count = MIN(bp->b_bcount, rp->r_size - offset);
9597 9591 mutex_exit(&rp->r_statelock);
9598 9592 if (count < 0)
9599 9593 cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9600 9594 #ifdef DEBUG
9601 9595 if (count == 0) {
9602 9596 zoneid_t zoneid = getzoneid();
9603 9597
9604 9598 zcmn_err(zoneid, CE_WARN,
9605 9599 "nfs4_bio: zero length write at %lld",
9606 9600 offset);
9607 9601 zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9608 9602 "b_bcount=%ld, file size=%lld",
9609 9603 rp->r_flags, (long)bp->b_bcount,
9610 9604 rp->r_size);
9611 9605 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9612 9606 if (nfs4_bio_do_stop)
9613 9607 debug_enter("nfs4_bio");
9614 9608 }
9615 9609 #endif
9616 9610 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9617 9611 count, cred_otw, stab_comm);
9618 9612 if (error == EACCES && last_time == FALSE) {
9619 9613 crfree(cred_otw);
9620 9614 goto write_again;
9621 9615 }
9622 9616 bp->b_error = error;
9623 9617 if (error && error != EINTR &&
9624 9618 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9625 9619 /*
9626 9620 * Don't print EDQUOT errors on the console.
9627 9621 * Don't print asynchronous EACCES errors.
9628 9622 * Don't print EFBIG errors.
9629 9623 * Print all other write errors.
9630 9624 */
9631 9625 if (error != EDQUOT && error != EFBIG &&
9632 9626 (error != EACCES ||
9633 9627 !(bp->b_flags & B_ASYNC)))
9634 9628 nfs4_write_error(bp->b_vp,
9635 9629 error, cred_otw);
9636 9630 /*
9637 9631 * Update r_error and r_flags as appropriate.
9638 9632 * If the error was ESTALE, then mark the
9639 9633 * rnode as not being writeable and save
9640 9634 * the error status. Otherwise, save any
9641 9635 * errors which occur from asynchronous
9642 9636 * page invalidations. Any errors occurring
9643 9637 * from other operations should be saved
9644 9638 * by the caller.
9645 9639 */
9646 9640 mutex_enter(&rp->r_statelock);
9647 9641 if (error == ESTALE) {
9648 9642 rp->r_flags |= R4STALE;
9649 9643 if (!rp->r_error)
9650 9644 rp->r_error = error;
9651 9645 } else if (!rp->r_error &&
9652 9646 (bp->b_flags &
9653 9647 (B_INVAL|B_FORCE|B_ASYNC)) ==
9654 9648 (B_INVAL|B_FORCE|B_ASYNC)) {
9655 9649 rp->r_error = error;
9656 9650 }
9657 9651 mutex_exit(&rp->r_statelock);
9658 9652 }
9659 9653 crfree(cred_otw);
9660 9654 } else {
9661 9655 error = rp->r_error;
9662 9656 /*
9663 9657 * A close may have cleared r_error; if so,
9664 9658 * propagate the ESTALE error return properly.
9665 9659 */
9666 9660 if (error == 0)
9667 9661 error = ESTALE;
9668 9662 }
9669 9663 }
9670 9664
9671 9665 if (error != 0 && error != NFS_EOF)
9672 9666 bp->b_flags |= B_ERROR;
9673 9667
9674 9668 if (osp)
9675 9669 open_stream_rele(osp, rp);
9676 9670
9677 9671 DTRACE_IO1(done, struct buf *, bp);
9678 9672
9679 9673 return (error);
9680 9674 }
9681 9675
9682 9676 /* ARGSUSED */
9683 9677 int
9684 9678 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9685 9679 {
9686 9680 return (EREMOTE);
9687 9681 }
9688 9682
9689 9683 /* ARGSUSED2 */
9690 9684 int
9691 9685 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9692 9686 {
9693 9687 rnode4_t *rp = VTOR4(vp);
9694 9688
9695 9689 if (!write_lock) {
9696 9690 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9697 9691 return (V_WRITELOCK_FALSE);
9698 9692 }
9699 9693
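/*
 * In the direct I/O case a writer may be able to get by with
 * the shared (reader) lock: when nothing is mmap'd and no
 * pages are cached, writes bypass the page cache and need not
 * be serialized against one another. The check is made under
 * the reader lock; if it fails, the reader lock is dropped
 * and the writer lock is taken below.
 */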
9700 9694 if ((rp->r_flags & R4DIRECTIO) ||
9701 9695 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9702 9696 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9703 9697 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9704 9698 return (V_WRITELOCK_FALSE);
9705 9699 nfs_rw_exit(&rp->r_rwlock);
9706 9700 }
9707 9701
9708 9702 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9709 9703 return (V_WRITELOCK_TRUE);
9710 9704 }
9711 9705
9712 9706 /* ARGSUSED */
9713 9707 void
9714 9708 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9715 9709 {
9716 9710 rnode4_t *rp = VTOR4(vp);
9717 9711
9718 9712 nfs_rw_exit(&rp->r_rwlock);
9719 9713 }
9720 9714
9721 9715 /* ARGSUSED */
9722 9716 static int
9723 9717 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9724 9718 {
9725 9719 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9726 9720 return (EIO);
9727 9721
9728 9722 /*
9729 9723 * Because we stuff the readdir cookie into the offset field,
9730 9724 * someone may attempt to do an lseek with the cookie, which
9731 9725 * we want to succeed.
9732 9726 */
9733 9727 if (vp->v_type == VDIR)
9734 9728 return (0);
9735 9729 if (*noffp < 0)
9736 9730 return (EINVAL);
9737 9731 return (0);
9738 9732 }
9739 9733
9740 9734
9741 9735 /*
9742 9736 * Return all the pages from [off..off+len) in file
9743 9737 */
9744 9738 /* ARGSUSED */
9745 9739 static int
9746 9740 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9747 9741 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9748 9742 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9749 9743 {
9750 9744 rnode4_t *rp;
9751 9745 int error;
9752 9746 mntinfo4_t *mi;
9753 9747
9754 9748 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9755 9749 return (EIO);
9756 9750 rp = VTOR4(vp);
9757 9751 if (IS_SHADOW(vp, rp))
9758 9752 vp = RTOV4(rp);
9759 9753
9760 9754 if (vp->v_flag & VNOMAP)
9761 9755 return (ENOSYS);
9762 9756
9763 9757 if (protp != NULL)
9764 9758 *protp = PROT_ALL;
9765 9759
9766 9760 /*
9767 9761 * Now validate that the caches are up to date.
9768 9762 */
9769 9763 if (error = nfs4_validate_caches(vp, cr))
9770 9764 return (error);
9771 9765
9772 9766 mi = VTOMI4(vp);
9773 9767 retry:
9774 9768 mutex_enter(&rp->r_statelock);
9775 9769
9776 9770 /*
9777 9771 * Don't create dirty pages faster than they
9778 9772 * can be cleaned so that the system doesn't
9779 9773 * get imbalanced. If the async queue is
9780 9774 * maxed out, then wait for it to drain before
9781 9775 * creating more dirty pages. Also, wait for
9782 9776 * any threads doing pagewalks in the vop_getattr
9783 9777 * entry points so that they don't block for
9784 9778 * long periods.
9785 9779 */
9786 9780 if (rw == S_CREATE) {
9787 9781 while ((mi->mi_max_threads != 0 &&
9788 9782 rp->r_awcount > 2 * mi->mi_max_threads) ||
9789 9783 rp->r_gcount > 0)
9790 9784 cv_wait(&rp->r_cv, &rp->r_statelock);
9791 9785 }
9792 9786
9793 9787 /*
9794 9788 * If we are getting called as a side effect of an nfs_write()
9795 9789 * operation the local file size might not be extended yet.
9796 9790 * In this case we want to be able to return pages of zeroes.
9797 9791 */
9798 9792 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9799 9793 NFS4_DEBUG(nfs4_pageio_debug,
9800 9794 (CE_NOTE, "getpage beyond EOF: off=%lld, "
9801 9795 "len=%llu, size=%llu, attrsize =%llu", off,
9802 9796 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9803 9797 mutex_exit(&rp->r_statelock);
9804 9798 return (EFAULT); /* beyond EOF */
9805 9799 }
9806 9800
9807 9801 mutex_exit(&rp->r_statelock);
9808 9802
9809 9803 error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9810 9804 pl, plsz, seg, addr, rw, cr);
9811 9805 NFS4_DEBUG(nfs4_pageio_debug && error,
9812 9806 (CE_NOTE, "getpages error %d; off=%lld, len=%lld",
9813 9807 error, off, (u_longlong_t)len));
9814 9808
9815 9809 switch (error) {
9816 9810 case NFS_EOF:
9817 9811 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9818 9812 goto retry;
9819 9813 case ESTALE:
9820 9814 nfs4_purge_stale_fh(error, vp, cr);
9821 9815 }
9822 9816
9823 9817 return (error);
9824 9818 }
9825 9819
9826 9820 /*
9827 9821 * Called from pvn_getpages to get a particular page.
9828 9822 */
9829 9823 /* ARGSUSED */
9830 9824 static int
9831 9825 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9832 9826 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9833 9827 enum seg_rw rw, cred_t *cr)
9834 9828 {
9835 9829 rnode4_t *rp;
9836 9830 uint_t bsize;
9837 9831 struct buf *bp;
9838 9832 page_t *pp;
9839 9833 u_offset_t lbn;
9840 9834 u_offset_t io_off;
9841 9835 u_offset_t blkoff;
9842 9836 u_offset_t rablkoff;
9843 9837 size_t io_len;
9844 9838 uint_t blksize;
9845 9839 int error;
9846 9840 int readahead;
9847 9841 int readahead_issued = 0;
9848 9842 int ra_window; /* readahead window */
9849 9843 page_t *pagefound;
9850 9844 page_t *savepp;
9851 9845
9852 9846 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9853 9847 return (EIO);
9854 9848
9855 9849 rp = VTOR4(vp);
9856 9850 ASSERT(!IS_SHADOW(vp, rp));
9857 9851 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9858 9852
9859 9853 reread:
9860 9854 bp = NULL;
9861 9855 pp = NULL;
9862 9856 pagefound = NULL;
9863 9857
9864 9858 if (pl != NULL)
9865 9859 pl[0] = NULL;
9866 9860
9867 9861 error = 0;
9868 9862 lbn = off / bsize;
9869 9863 blkoff = lbn * bsize;
9870 9864
9871 9865 /*
9872 9866 * Queueing up the readahead before doing the synchronous read
9873 9867 * results in a significant increase in read throughput because
9874 9868 * of the increased parallelism between the async threads and
9875 9869 * the process context.
9876 9870 */
9877 9871 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9878 9872 rw != S_CREATE &&
9879 9873 !(vp->v_flag & VNOCACHE)) {
9880 9874 mutex_enter(&rp->r_statelock);
9881 9875
9882 9876 /*
9883 9877 * Calculate the number of readaheads to do.
9884 9878 * a) No readaheads at offset = 0.
9885 9879 * b) Do maximum(nfs4_nra) readaheads when the readahead
9886 9880 * window is closed.
9887 9881 * c) Do between 1 and (nfs4_nra - 1) readaheads, depending
9888 9882 * upon how far open or closed the readahead window is.
9889 9883 * d) No readaheads if rp->r_nextr is not within the scope
9890 9884 * of the readahead window (random i/o).
9891 9885 */
9892 9886
9893 9887 if (off == 0)
9894 9888 readahead = 0;
9895 9889 else if (blkoff == rp->r_nextr)
9896 9890 readahead = nfs4_nra;
9897 9891 else if (rp->r_nextr > blkoff &&
9898 9892 ((ra_window = (rp->r_nextr - blkoff) / bsize)
9899 9893 <= (nfs4_nra - 1)))
9900 9894 readahead = nfs4_nra - ra_window;
9901 9895 else
9902 9896 readahead = 0;
9903 9897
9904 9898 rablkoff = rp->r_nextr;
9905 9899 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9906 9900 mutex_exit(&rp->r_statelock);
9907 9901 if (nfs4_async_readahead(vp, rablkoff + bsize,
9908 9902 addr + (rablkoff + bsize - off),
9909 9903 seg, cr, nfs4_readahead) < 0) {
9910 9904 mutex_enter(&rp->r_statelock);
9911 9905 break;
9912 9906 }
9913 9907 readahead--;
9914 9908 rablkoff += bsize;
9915 9909 /*
9916 9910 * Indicate that we did a readahead so
9917 9911 * readahead offset is not updated
9918 9912 * by the synchronous read below.
9919 9913 */
9920 9914 readahead_issued = 1;
9921 9915 mutex_enter(&rp->r_statelock);
9922 9916 /*
9923 9917 * set readahead offset to
9924 9918 * offset of last async readahead
9925 9919 * request.
9926 9920 */
9927 9921 rp->r_nextr = rablkoff;
9928 9922 }
9929 9923 mutex_exit(&rp->r_statelock);
9930 9924 }
9931 9925
9932 9926 again:
9933 9927 if ((pagefound = page_exists(vp, off)) == NULL) {
9934 9928 if (pl == NULL) {
9935 9929 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9936 9930 nfs4_readahead);
9937 9931 } else if (rw == S_CREATE) {
9938 9932 /*
9939 9933 * Block for this page is not allocated, or the offset
9940 9934 * is beyond the current allocation size, or we're
9941 9935 * allocating a swap slot and the page was not found,
9942 9936 * so allocate it and return a zero page.
9943 9937 */
9944 9938 if ((pp = page_create_va(vp, off,
9945 9939 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9946 9940 cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9947 9941 io_len = PAGESIZE;
9948 9942 mutex_enter(&rp->r_statelock);
9949 9943 rp->r_nextr = off + PAGESIZE;
9950 9944 mutex_exit(&rp->r_statelock);
9951 9945 } else {
9952 9946 /*
9953 9947 * Need to go to server to get a block
9954 9948 */
9955 9949 mutex_enter(&rp->r_statelock);
9956 9950 if (blkoff < rp->r_size &&
9957 9951 blkoff + bsize > rp->r_size) {
9958 9952 /*
9959 9953 	 * If less than a block is left in
9960 9954 	 * the file, read less than a block.
9961 9955 */
9962 9956 if (rp->r_size <= off) {
9963 9957 /*
9964 9958 * Trying to access beyond EOF,
9965 9959 * set up to get at least one page.
9966 9960 */
9967 9961 blksize = off + PAGESIZE - blkoff;
9968 9962 } else
9969 9963 blksize = rp->r_size - blkoff;
9970 9964 } else if ((off == 0) ||
9971 9965 (off != rp->r_nextr && !readahead_issued)) {
9972 9966 blksize = PAGESIZE;
9973 9967 blkoff = off; /* block = page here */
9974 9968 } else
9975 9969 blksize = bsize;
9976 9970 mutex_exit(&rp->r_statelock);
9977 9971
9978 9972 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9979 9973 &io_len, blkoff, blksize, 0);
9980 9974
9981 9975 /*
9982 9976 * Some other thread has entered the page,
9983 9977 * so just use it.
9984 9978 */
9985 9979 if (pp == NULL)
9986 9980 goto again;
9987 9981
9988 9982 /*
9989 9983 * Now round the request size up to page boundaries.
9990 9984 * This ensures that the entire page will be
9991 9985 * initialized to zeroes if EOF is encountered.
9992 9986 */
9993 9987 io_len = ptob(btopr(io_len));
9994 9988
9995 9989 bp = pageio_setup(pp, io_len, vp, B_READ);
9996 9990 ASSERT(bp != NULL);
9997 9991
9998 9992 /*
9999 9993 * pageio_setup should have set b_addr to 0. This
10000 9994 * is correct since we want to do I/O on a page
10001 9995 * boundary. bp_mapin will use this addr to calculate
10002 9996 * an offset, and then set b_addr to the kernel virtual
10003 9997 * address it allocated for us.
10004 9998 */
10005 9999 ASSERT(bp->b_un.b_addr == 0);
10006 10000
10007 10001 bp->b_edev = 0;
10008 10002 bp->b_dev = 0;
10009 10003 bp->b_lblkno = lbtodb(io_off);
10010 10004 bp->b_file = vp;
10011 10005 bp->b_offset = (offset_t)off;
10012 10006 bp_mapin(bp);
10013 10007
10014 10008 /*
10015 10009 * If doing a write beyond what we believe is EOF,
10016 10010 * don't bother trying to read the pages from the
10017 10011 	 * server; we'll just zero the pages here. We
10018 10012 * don't check that the rw flag is S_WRITE here
10019 10013 * because some implementations may attempt a
10020 10014 * read access to the buffer before copying data.
10021 10015 */
10022 10016 mutex_enter(&rp->r_statelock);
10023 10017 if (io_off >= rp->r_size && seg == segkmap) {
10024 10018 mutex_exit(&rp->r_statelock);
10025 10019 bzero(bp->b_un.b_addr, io_len);
10026 10020 } else {
10027 10021 mutex_exit(&rp->r_statelock);
10028 10022 error = nfs4_bio(bp, NULL, cr, FALSE);
10029 10023 }
10030 10024
10031 10025 /*
10032 10026 * Unmap the buffer before freeing it.
10033 10027 */
10034 10028 bp_mapout(bp);
10035 10029 pageio_done(bp);
10036 10030
10037 10031 savepp = pp;
10038 10032 do {
10039 10033 pp->p_fsdata = C_NOCOMMIT;
10040 10034 } while ((pp = pp->p_next) != savepp);
10041 10035
10042 10036 if (error == NFS_EOF) {
10043 10037 /*
10044 10038 	 * If doing a write system call, just return
10045 10039 	 * zeroed pages; otherwise the user tried to get
10046 10040 	 * pages beyond EOF, so return an error. We don't check
10047 10041 * that the rw flag is S_WRITE here because
10048 10042 * some implementations may attempt a read
10049 10043 * access to the buffer before copying data.
10050 10044 */
10051 10045 if (seg == segkmap)
10052 10046 error = 0;
10053 10047 else
10054 10048 error = EFAULT;
10055 10049 }
10056 10050
10057 10051 if (!readahead_issued && !error) {
10058 10052 mutex_enter(&rp->r_statelock);
10059 10053 rp->r_nextr = io_off + io_len;
10060 10054 mutex_exit(&rp->r_statelock);
10061 10055 }
10062 10056 }
10063 10057 }
10064 10058
10065 10059 out:
10066 10060 if (pl == NULL)
10067 10061 return (error);
10068 10062
10069 10063 if (error) {
10070 10064 if (pp != NULL)
10071 10065 pvn_read_done(pp, B_ERROR);
10072 10066 return (error);
10073 10067 }
10074 10068
10075 10069 if (pagefound) {
10076 10070 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10077 10071
10078 10072 /*
10079 10073 	 * Page exists in the cache; acquire the appropriate lock.
10080 10074 * If this fails, start all over again.
10081 10075 */
10082 10076 if ((pp = page_lookup(vp, off, se)) == NULL) {
10083 10077 #ifdef DEBUG
10084 10078 nfs4_lostpage++;
10085 10079 #endif
10086 10080 goto reread;
10087 10081 }
10088 10082 pl[0] = pp;
10089 10083 pl[1] = NULL;
10090 10084 return (0);
10091 10085 }
10092 10086
10093 10087 if (pp != NULL)
10094 10088 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10095 10089
10096 10090 return (error);
10097 10091 }
10098 10092
10099 10093 static void
10100 10094 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10101 10095 cred_t *cr)
10102 10096 {
10103 10097 int error;
10104 10098 page_t *pp;
10105 10099 u_offset_t io_off;
10106 10100 size_t io_len;
10107 10101 struct buf *bp;
10108 10102 uint_t bsize, blksize;
10109 10103 rnode4_t *rp = VTOR4(vp);
10110 10104 page_t *savepp;
10111 10105
10112 10106 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10113 10107
10114 10108 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10115 10109
10116 10110 mutex_enter(&rp->r_statelock);
10117 10111 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10118 10112 /*
10119 10113 	 * If less than a block is left in the file, read
10120 10114 	 * less than a block.
10121 10115 */
10122 10116 blksize = rp->r_size - blkoff;
10123 10117 } else
10124 10118 blksize = bsize;
10125 10119 mutex_exit(&rp->r_statelock);
10126 10120
10127 10121 pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10128 10122 &io_off, &io_len, blkoff, blksize, 1);
10129 10123 /*
10130 10124 	 * The isra flag passed to the kluster function is 1, so we may have
10131 10125 	 * gotten a return value of NULL for a variety of reasons (# of free
10132 10126 	 * pages < minfree, someone entered the page on the vnode, etc.). In all
10133 10127 * cases, we want to punt on the readahead.
10134 10128 */
10135 10129 if (pp == NULL)
10136 10130 return;
10137 10131
10138 10132 /*
10139 10133 * Now round the request size up to page boundaries.
10140 10134 * This ensures that the entire page will be
10141 10135 * initialized to zeroes if EOF is encountered.
10142 10136 */
10143 10137 io_len = ptob(btopr(io_len));
10144 10138
10145 10139 bp = pageio_setup(pp, io_len, vp, B_READ);
10146 10140 ASSERT(bp != NULL);
10147 10141
10148 10142 /*
10149 10143 * pageio_setup should have set b_addr to 0. This is correct since
10150 10144 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10151 10145 * to calculate an offset, and then set b_addr to the kernel virtual
10152 10146 * address it allocated for us.
10153 10147 */
10154 10148 ASSERT(bp->b_un.b_addr == 0);
10155 10149
10156 10150 bp->b_edev = 0;
10157 10151 bp->b_dev = 0;
10158 10152 bp->b_lblkno = lbtodb(io_off);
10159 10153 bp->b_file = vp;
10160 10154 bp->b_offset = (offset_t)blkoff;
10161 10155 bp_mapin(bp);
10162 10156
10163 10157 /*
10164 10158 * If doing a write beyond what we believe is EOF, don't bother trying
10165 10159 	 * to read the pages from the server; we'll just zero the pages here.
10166 10160 * We don't check that the rw flag is S_WRITE here because some
10167 10161 * implementations may attempt a read access to the buffer before
10168 10162 * copying data.
10169 10163 */
10170 10164 mutex_enter(&rp->r_statelock);
10171 10165 if (io_off >= rp->r_size && seg == segkmap) {
10172 10166 mutex_exit(&rp->r_statelock);
10173 10167 bzero(bp->b_un.b_addr, io_len);
10174 10168 error = 0;
10175 10169 } else {
10176 10170 mutex_exit(&rp->r_statelock);
10177 10171 error = nfs4_bio(bp, NULL, cr, TRUE);
10178 10172 if (error == NFS_EOF)
10179 10173 error = 0;
10180 10174 }
10181 10175
10182 10176 /*
10183 10177 * Unmap the buffer before freeing it.
10184 10178 */
10185 10179 bp_mapout(bp);
10186 10180 pageio_done(bp);
10187 10181
10188 10182 savepp = pp;
10189 10183 do {
10190 10184 pp->p_fsdata = C_NOCOMMIT;
10191 10185 } while ((pp = pp->p_next) != savepp);
10192 10186
10193 10187 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10194 10188
10195 10189 /*
10196 10190 	 * In case of error, set the readahead offset to the
10197 10191 	 * lowest offset, since pvn_read_done() calls VN_DISPOSE
10198 10192 	 * to destroy the pages.
10199 10193 */
10200 10194 if (error && rp->r_nextr > io_off) {
10201 10195 mutex_enter(&rp->r_statelock);
10202 10196 if (rp->r_nextr > io_off)
10203 10197 rp->r_nextr = io_off;
10204 10198 mutex_exit(&rp->r_statelock);
10205 10199 }
10206 10200 }
10207 10201
10208 10202 /*
10209 10203 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10210 10204 * If len == 0, do from off to EOF.
10211 10205 *
10212 10206 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10213 10207 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10214 10208 * (from pageout).
10215 10209 */
10216 10210 /* ARGSUSED */
10217 10211 static int
10218 10212 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10219 10213 caller_context_t *ct)
10220 10214 {
10221 10215 int error;
10222 10216 rnode4_t *rp;
10223 10217
10224 10218 ASSERT(cr != NULL);
10225 10219
10226 10220 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10227 10221 return (EIO);
10228 10222
10229 10223 rp = VTOR4(vp);
10230 10224 if (IS_SHADOW(vp, rp))
10231 10225 vp = RTOV4(rp);
10232 10226
10233 10227 /*
10234 10228 * XXX - Why should this check be made here?
10235 10229 */
10236 10230 if (vp->v_flag & VNOMAP)
10237 10231 return (ENOSYS);
10238 10232
10239 10233 if (len == 0 && !(flags & B_INVAL) &&
10240 10234 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10241 10235 return (0);
10242 10236
10243 10237 mutex_enter(&rp->r_statelock);
10244 10238 rp->r_count++;
10245 10239 mutex_exit(&rp->r_statelock);
10246 10240 error = nfs4_putpages(vp, off, len, flags, cr);
10247 10241 mutex_enter(&rp->r_statelock);
10248 10242 rp->r_count--;
10249 10243 cv_broadcast(&rp->r_cv);
10250 10244 mutex_exit(&rp->r_statelock);
10251 10245
10252 10246 return (error);
10253 10247 }
10254 10248
10255 10249 /*
10256 10250 * Write out a single page, possibly klustering adjacent dirty pages.
10257 10251 */
10258 10252 int
10259 10253 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10260 10254 int flags, cred_t *cr)
10261 10255 {
10262 10256 u_offset_t io_off;
10263 10257 u_offset_t lbn_off;
10264 10258 u_offset_t lbn;
10265 10259 size_t io_len;
10266 10260 uint_t bsize;
10267 10261 int error;
10268 10262 rnode4_t *rp;
10269 10263
10270 10264 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10271 10265 ASSERT(pp != NULL);
10272 10266 ASSERT(cr != NULL);
10273 10267 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10274 10268
10275 10269 rp = VTOR4(vp);
10276 10270 ASSERT(rp->r_count > 0);
10277 10271 ASSERT(!IS_SHADOW(vp, rp));
10278 10272
10279 10273 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10280 10274 lbn = pp->p_offset / bsize;
10281 10275 lbn_off = lbn * bsize;
10282 10276
10283 10277 /*
10284 10278 * Find a kluster that fits in one block, or in
10285 10279 * one page if pages are bigger than blocks. If
10286 10280 * there is less file space allocated than a whole
10287 10281 * page, we'll shorten the i/o request below.
10288 10282 */
10289 10283 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10290 10284 roundup(bsize, PAGESIZE), flags);
10291 10285
10292 10286 /*
10293 10287 * pvn_write_kluster shouldn't have returned a page with offset
10294 10288 * behind the original page we were given. Verify that.
10295 10289 */
10296 10290 ASSERT((pp->p_offset / bsize) >= lbn);
10297 10291
10298 10292 /*
10299 10293 * Now pp will have the list of kept dirty pages marked for
10300 10294 * write back. It will also handle invalidation and freeing
10301 10295 * of pages that are not dirty. Check for page length rounding
10302 10296 * problems.
10303 10297 */
10304 10298 if (io_off + io_len > lbn_off + bsize) {
10305 10299 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10306 10300 io_len = lbn_off + bsize - io_off;
10307 10301 }
10308 10302 /*
10309 10303 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10310 10304 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10311 10305 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10312 10306 * progress and the r_size has not been made consistent with the
10313 10307 * new size of the file. When the uiomove() completes the r_size is
10314 10308 * updated and the R4MODINPROGRESS flag is cleared.
10315 10309 *
10316 10310 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10317 10311 * consistent value of r_size. Without this handshaking, it is
10318 10312 * possible that nfs4_bio() picks up the old value of r_size
10319 10313 * before the uiomove() in writerp4() completes. This will result
10320 10314 * in the write through nfs4_bio() being dropped.
10321 10315 *
10322 10316 * More precisely, there is a window between the time the uiomove()
10323 10317 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10324 10318 * operation intervenes in this window, the page will be picked up,
10325 10319 * because it is dirty (it will be unlocked, unless it was
10326 10320 * pagecreate'd). When the page is picked up as dirty, the dirty
10327 10321 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10328 10322 * checked. This will still be the old size. Therefore the page will
10329 10323 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10330 10324 * the page will be found to be clean and the write will be dropped.
10331 10325 */
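	/*
	 * Editorial sketch of the race described above; the timeline is
	 * illustrative and condensed from the comment, not taken from
	 * the original source:
	 *
	 *	writerp4() thread		putpage thread
	 *	-----------------		--------------
	 *	set R4MODINPROGRESS
	 *	uiomove() dirties pages
	 *					pvn_getdirty() clears the
	 *					dirty bit; nfs4_bio() sees
	 *					the stale r_size and drops
	 *					the write
	 *	update r_size
	 *	clear R4MODINPROGRESS
	 *
	 * The check below closes this window by re-dirtying the pages
	 * and marking the rnode R4DIRTY so the write restarts later.
	 */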
10332 10326 if (rp->r_flags & R4MODINPROGRESS) {
10333 10327 mutex_enter(&rp->r_statelock);
10334 10328 if ((rp->r_flags & R4MODINPROGRESS) &&
10335 10329 rp->r_modaddr + MAXBSIZE > io_off &&
10336 10330 rp->r_modaddr < io_off + io_len) {
10337 10331 page_t *plist;
10338 10332 /*
10339 10333 * A write is in progress for this region of the file.
10340 10334 * If we did not detect R4MODINPROGRESS here then this
10341 10335 	 * path through nfs4_putapage() would eventually go to
10342 10336 * nfs4_bio() and may not write out all of the data
10343 10337 * in the pages. We end up losing data. So we decide
10344 10338 * to set the modified bit on each page in the page
10345 10339 * list and mark the rnode with R4DIRTY. This write
10346 10340 * will be restarted at some later time.
10347 10341 */
10348 10342 plist = pp;
10349 10343 while (plist != NULL) {
10350 10344 pp = plist;
10351 10345 page_sub(&plist, pp);
10352 10346 hat_setmod(pp);
10353 10347 page_io_unlock(pp);
10354 10348 page_unlock(pp);
10355 10349 }
10356 10350 rp->r_flags |= R4DIRTY;
10357 10351 mutex_exit(&rp->r_statelock);
10358 10352 if (offp)
10359 10353 *offp = io_off;
10360 10354 if (lenp)
10361 10355 *lenp = io_len;
10362 10356 return (0);
10363 10357 }
10364 10358 mutex_exit(&rp->r_statelock);
10365 10359 }
10366 10360
10367 10361 if (flags & B_ASYNC) {
10368 10362 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10369 10363 nfs4_sync_putapage);
10370 10364 } else
10371 10365 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10372 10366
10373 10367 if (offp)
10374 10368 *offp = io_off;
10375 10369 if (lenp)
10376 10370 *lenp = io_len;
10377 10371 return (error);
10378 10372 }
10379 10373
10380 10374 static int
10381 10375 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
10382 10376 int flags, cred_t *cr)
10383 10377 {
10384 10378 int error;
10385 10379 rnode4_t *rp;
10386 10380
10387 10381 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10388 10382
10389 10383 flags |= B_WRITE;
10390 10384
10391 10385 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
10392 10386
10393 10387 rp = VTOR4(vp);
10394 10388
10395 10389 if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
10396 10390 error == EACCES) &&
10397 10391 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
10398 10392 if (!(rp->r_flags & R4OUTOFSPACE)) {
10399 10393 mutex_enter(&rp->r_statelock);
10400 10394 rp->r_flags |= R4OUTOFSPACE;
10401 10395 mutex_exit(&rp->r_statelock);
10402 10396 }
10403 10397 flags |= B_ERROR;
10404 10398 pvn_write_done(pp, flags);
10405 10399 /*
10406 10400 * If this was not an async thread, then try again to
10407 10401 * write out the pages, but this time, also destroy
10408 10402 * them whether or not the write is successful. This
10409 10403 * will prevent memory from filling up with these
10410 10404 * pages and destroying them is the only alternative
10411 10405 * if they can't be written out.
10412 10406 *
10413 10407 * Don't do this if this is an async thread because
10414 10408 * when the pages are unlocked in pvn_write_done,
10415 10409 * some other thread could have come along, locked
10416 10410 * them, and queued for an async thread. It would be
10417 10411 * possible for all of the async threads to be tied
10418 10412 * up waiting to lock the pages again and they would
10419 10413 * all already be locked and waiting for an async
10420 10414 * thread to handle them. Deadlock.
10421 10415 */
10422 10416 if (!(flags & B_ASYNC)) {
10423 10417 error = nfs4_putpage(vp, io_off, io_len,
10424 10418 B_INVAL | B_FORCE, cr, NULL);
10425 10419 }
10426 10420 } else {
10427 10421 if (error)
10428 10422 flags |= B_ERROR;
10429 10423 else if (rp->r_flags & R4OUTOFSPACE) {
10430 10424 mutex_enter(&rp->r_statelock);
10431 10425 rp->r_flags &= ~R4OUTOFSPACE;
10432 10426 mutex_exit(&rp->r_statelock);
10433 10427 }
10434 10428 pvn_write_done(pp, flags);
10435 10429 if (freemem < desfree)
10436 10430 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
10437 10431 NFS4_WRITE_NOWAIT);
10438 10432 }
10439 10433
10440 10434 return (error);
10441 10435 }
10442 10436
10443 10437 #ifdef DEBUG
10444 10438 int nfs4_force_open_before_mmap = 0;
10445 10439 #endif
10446 10440
10447 10441 /* ARGSUSED */
10448 10442 static int
10449 10443 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10450 10444 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10451 10445 caller_context_t *ct)
10452 10446 {
10453 10447 struct segvn_crargs vn_a;
10454 10448 int error = 0;
10455 10449 rnode4_t *rp = VTOR4(vp);
10456 10450 mntinfo4_t *mi = VTOMI4(vp);
10457 10451
10458 10452 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10459 10453 return (EIO);
10460 10454
10461 10455 if (vp->v_flag & VNOMAP)
10462 10456 return (ENOSYS);
10463 10457
10464 10458 if (off < 0 || (off + len) < 0)
10465 10459 return (ENXIO);
10466 10460
10467 10461 if (vp->v_type != VREG)
10468 10462 return (ENODEV);
10469 10463
10470 10464 /*
10471 10465 	 * If the file is delegated to the client, don't do anything.
10472 10466 * If the file is not delegated, then validate the data cache.
10473 10467 */
10474 10468 mutex_enter(&rp->r_statev4_lock);
10475 10469 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10476 10470 mutex_exit(&rp->r_statev4_lock);
10477 10471 error = nfs4_validate_caches(vp, cr);
10478 10472 if (error)
10479 10473 return (error);
10480 10474 } else {
10481 10475 mutex_exit(&rp->r_statev4_lock);
10482 10476 }
10483 10477
10484 10478 /*
10485 10479 * Check to see if the vnode is currently marked as not cachable.
10486 10480 * This means portions of the file are locked (through VOP_FRLOCK).
10487 10481 * In this case the map request must be refused. We use
10488 10482 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10489 10483 *
10490 10484 * Atomically increment r_inmap after acquiring r_rwlock. The
10491 10485 * idea here is to acquire r_rwlock to block read/write and
10492 10486 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10493 10487 	 * that we are in nfs4_map(). With r_rwlock now acquired in the
10494 10488 	 * proper order, we prevent the deadlock that would otherwise
10495 10489 	 * have occurred had nfs4_addmap() acquired it out of order.
10496 10490 *
10497 10491 * Since we are not protecting r_inmap by any lock, we do not
10498 10492 * hold any lock when we decrement it. We atomically decrement
10499 10493 * r_inmap after we release r_lkserlock.
10500 10494 */
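	/*
	 * Editorial sketch of the ordering just described, condensed
	 * from the code below (no new locking is being suggested):
	 *
	 *	nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, ...)
	 *	atomic_inc_uint(&rp->r_inmap)	-- read/write backs off
	 *	nfs_rw_exit(&rp->r_rwlock)
	 *	nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, ...)
	 *	... mapping work, during which nfs4_addmap() runs ...
	 *	nfs_rw_exit(&rp->r_lkserlock)
	 *	atomic_dec_uint(&rp->r_inmap)
	 */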
10501 10495
10502 10496 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10503 10497 return (EINTR);
10504 10498 atomic_inc_uint(&rp->r_inmap);
10505 10499 nfs_rw_exit(&rp->r_rwlock);
10506 10500
10507 10501 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10508 10502 atomic_dec_uint(&rp->r_inmap);
10509 10503 return (EINTR);
10510 10504 }
10511 10505
10512 10506 if (vp->v_flag & VNOCACHE) {
10513 10507 error = EAGAIN;
10514 10508 goto done;
10515 10509 }
10516 10510
10517 10511 /*
10518 10512 * Don't allow concurrent locks and mapping if mandatory locking is
10519 10513 * enabled.
10520 10514 */
10521 10515 if (flk_has_remote_locks(vp)) {
10522 10516 struct vattr va;
10523 10517 va.va_mask = AT_MODE;
10524 10518 error = nfs4getattr(vp, &va, cr);
10525 10519 if (error != 0)
10526 10520 goto done;
10527 10521 if (MANDLOCK(vp, va.va_mode)) {
10528 10522 error = EAGAIN;
10529 10523 goto done;
10530 10524 }
10531 10525 }
10532 10526
10533 10527 /*
10534 10528 * It is possible that the rnode has a lost lock request that we
10535 10529 * are still trying to recover, and that the request conflicts with
10536 10530 * this map request.
10537 10531 *
10538 10532 * An alternative approach would be for nfs4_safemap() to consider
10539 10533 * queued lock requests when deciding whether to set or clear
10540 10534 * VNOCACHE. This would require the frlock code path to call
10541 10535 	 * nfs4_safemap() after enqueuing a lost request.
10542 10536 */
10543 10537 if (nfs4_map_lost_lock_conflict(vp)) {
10544 10538 error = EAGAIN;
10545 10539 goto done;
10546 10540 }
10547 10541
10548 10542 as_rangelock(as);
10549 10543 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10550 10544 if (error != 0) {
10551 10545 as_rangeunlock(as);
10552 10546 goto done;
10553 10547 }
10554 10548
10555 10549 if (vp->v_type == VREG) {
10556 10550 /*
10557 10551 * We need to retrieve the open stream
10558 10552 */
10559 10553 nfs4_open_stream_t *osp = NULL;
10560 10554 nfs4_open_owner_t *oop = NULL;
10561 10555
10562 10556 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10563 10557 if (oop != NULL) {
10564 10558 /* returns with 'os_sync_lock' held */
10565 10559 osp = find_open_stream(oop, rp);
10566 10560 open_owner_rele(oop);
10567 10561 }
10568 10562 if (osp == NULL) {
10569 10563 #ifdef DEBUG
10570 10564 if (nfs4_force_open_before_mmap) {
10571 10565 error = EIO;
10572 10566 goto done;
10573 10567 }
10574 10568 #endif
10575 10569 /* returns with 'os_sync_lock' held */
10576 10570 error = open_and_get_osp(vp, cr, &osp);
10577 10571 if (osp == NULL) {
10578 10572 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10579 10573 "nfs4_map: we tried to OPEN the file "
10580 10574 "but again no osp, so fail with EIO"));
10581 10575 goto done;
10582 10576 }
10583 10577 }
10584 10578
10585 10579 if (osp->os_failed_reopen) {
10586 10580 mutex_exit(&osp->os_sync_lock);
10587 10581 open_stream_rele(osp, rp);
10588 10582 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10589 10583 "nfs4_map: os_failed_reopen set on "
10590 10584 "osp %p, cr %p, rp %s", (void *)osp,
10591 10585 (void *)cr, rnode4info(rp)));
10592 10586 error = EIO;
10593 10587 goto done;
10594 10588 }
10595 10589 mutex_exit(&osp->os_sync_lock);
10596 10590 open_stream_rele(osp, rp);
10597 10591 }
10598 10592
10599 10593 vn_a.vp = vp;
10600 10594 vn_a.offset = off;
10601 10595 vn_a.type = (flags & MAP_TYPE);
10602 10596 vn_a.prot = (uchar_t)prot;
10603 10597 vn_a.maxprot = (uchar_t)maxprot;
10604 10598 vn_a.flags = (flags & ~MAP_TYPE);
10605 10599 vn_a.cred = cr;
10606 10600 vn_a.amp = NULL;
10607 10601 vn_a.szc = 0;
10608 10602 vn_a.lgrp_mem_policy_flags = 0;
10609 10603
10610 10604 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10611 10605 as_rangeunlock(as);
10612 10606
10613 10607 done:
10614 10608 nfs_rw_exit(&rp->r_lkserlock);
10615 10609 atomic_dec_uint(&rp->r_inmap);
10616 10610 return (error);
10617 10611 }
10618 10612
10619 10613 /*
10620 10614 * We're most likely dealing with a kernel module that likes to READ
10621 10615 	 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's
10622 10616 * officially OPEN the file to create the necessary client state
10623 10617 * for bookkeeping of os_mmap_read/write counts.
10624 10618 *
10625 10619 * Since VOP_MAP only passes in a pointer to the vnode rather than
10626 10620 * a double pointer, we can't handle the case where nfs4open_otw()
10627 10621 * returns a different vnode than the one passed into VOP_MAP (since
10628 10622 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10629 10623 * we return NULL and let nfs4_map() fail. Note: the only case where
10630 10624 * this should happen is if the file got removed and replaced with the
10631 10625 * same name on the server (in addition to the fact that we're trying
10632 10626 	 * to VOP_MAP without VOP_OPENing the file in the first place).
10633 10627 */
10634 10628 static int
10635 10629 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10636 10630 {
10637 10631 rnode4_t *rp, *drp;
10638 10632 vnode_t *dvp, *open_vp;
10639 10633 char file_name[MAXNAMELEN];
10640 10634 int just_created;
10641 10635 nfs4_open_stream_t *osp;
10642 10636 nfs4_open_owner_t *oop;
10643 10637 int error;
10644 10638
10645 10639 *ospp = NULL;
10646 10640 open_vp = map_vp;
10647 10641
10648 10642 rp = VTOR4(open_vp);
10649 10643 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10650 10644 return (error);
10651 10645 drp = VTOR4(dvp);
10652 10646
10653 10647 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10654 10648 VN_RELE(dvp);
10655 10649 return (EINTR);
10656 10650 }
10657 10651
10658 10652 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10659 10653 nfs_rw_exit(&drp->r_rwlock);
10660 10654 VN_RELE(dvp);
10661 10655 return (error);
10662 10656 }
10663 10657
10664 10658 mutex_enter(&rp->r_statev4_lock);
10665 10659 if (rp->created_v4) {
10666 10660 rp->created_v4 = 0;
10667 10661 mutex_exit(&rp->r_statev4_lock);
10668 10662
10669 10663 dnlc_update(dvp, file_name, open_vp);
10670 10664 /* This is needed so we don't bump the open ref count */
10671 10665 just_created = 1;
10672 10666 } else {
10673 10667 mutex_exit(&rp->r_statev4_lock);
10674 10668 just_created = 0;
10675 10669 }
10676 10670
10677 10671 VN_HOLD(map_vp);
10678 10672
10679 10673 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10680 10674 just_created);
10681 10675 if (error) {
10682 10676 nfs_rw_exit(&drp->r_rwlock);
10683 10677 VN_RELE(dvp);
10684 10678 VN_RELE(map_vp);
10685 10679 return (error);
10686 10680 }
10687 10681
10688 10682 nfs_rw_exit(&drp->r_rwlock);
10689 10683 VN_RELE(dvp);
10690 10684
10691 10685 /*
10692 10686 * If nfs4open_otw() returned a different vnode then "undo"
10693 10687 * the open and return failure to the caller.
10694 10688 */
10695 10689 if (!VN_CMP(open_vp, map_vp)) {
10696 10690 nfs4_error_t e;
10697 10691
10698 10692 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10699 10693 "open returned a different vnode"));
10700 10694 /*
10701 10695 * If there's an error, ignore it,
10702 10696 * and let VOP_INACTIVE handle it.
10703 10697 */
10704 10698 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10705 10699 CLOSE_NORM, 0, 0, 0);
10706 10700 VN_RELE(map_vp);
10707 10701 return (EIO);
10708 10702 }
10709 10703
10710 10704 VN_RELE(map_vp);
10711 10705
10712 10706 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10713 10707 if (!oop) {
10714 10708 nfs4_error_t e;
10715 10709
10716 10710 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10717 10711 "no open owner"));
10718 10712 /*
10719 10713 * If there's an error, ignore it,
10720 10714 * and let VOP_INACTIVE handle it.
10721 10715 */
10722 10716 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10723 10717 CLOSE_NORM, 0, 0, 0);
10724 10718 return (EIO);
10725 10719 }
10726 10720 osp = find_open_stream(oop, rp);
10727 10721 open_owner_rele(oop);
10728 10722 *ospp = osp;
10729 10723 return (0);
10730 10724 }
10731 10725
10732 10726 /*
10733 10727 	 * Please be aware that when this function is called, the address space's
10734 10728 	 * a_lock is held for writing. Do not put over-the-wire calls here.
10735 10729 */
10736 10730 /* ARGSUSED */
10737 10731 static int
10738 10732 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10739 10733 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10740 10734 caller_context_t *ct)
10741 10735 {
10742 10736 rnode4_t *rp;
10743 10737 int error = 0;
10744 10738 mntinfo4_t *mi;
10745 10739
10746 10740 mi = VTOMI4(vp);
10747 10741 rp = VTOR4(vp);
10748 10742
10749 10743 if (nfs_zone() != mi->mi_zone)
10750 10744 return (EIO);
10751 10745 if (vp->v_flag & VNOMAP)
10752 10746 return (ENOSYS);
10753 10747
10754 10748 /*
10755 10749 * Don't need to update the open stream first, since this
10756 10750 * mmap can't add any additional share access that isn't
10757 10751 * already contained in the open stream (for the case where we
10758 10752 	 * open, mmap, and only update rp->r_mapcnt; then the server reboots
10759 10753 	 * and the reopen doesn't take the os_mmap_read[write] counts into account).
10760 10754 */
10761 10755 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10762 10756
10763 10757 if (vp->v_type == VREG) {
10764 10758 /*
10765 10759 * We need to retrieve the open stream and update the counts.
10766 10760 * If there is no open stream here, something is wrong.
10767 10761 */
10768 10762 nfs4_open_stream_t *osp = NULL;
10769 10763 nfs4_open_owner_t *oop = NULL;
10770 10764
10771 10765 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10772 10766 if (oop != NULL) {
10773 10767 /* returns with 'os_sync_lock' held */
10774 10768 osp = find_open_stream(oop, rp);
10775 10769 open_owner_rele(oop);
10776 10770 }
10777 10771 if (osp == NULL) {
10778 10772 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10779 10773 	    "nfs4_addmap: we should have an osp "
10780 10774 "but we don't, so fail with EIO"));
10781 10775 error = EIO;
10782 10776 goto out;
10783 10777 }
10784 10778
10785 10779 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10786 10780 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10787 10781
10788 10782 /*
10789 10783 * Update the map count in the open stream.
10790 10784 * This is necessary in the case where we
10791 10785 	 * open/mmap/close, then the server reboots, and we
10792 10786 * attempt to reopen. If the mmap doesn't add share
10793 10787 * access then we send an invalid reopen with
10794 10788 * access = NONE.
10795 10789 *
10796 10790 	 * We need to specifically check each PROT_* so that a mmap
10797 10791 	 * call of (PROT_WRITE | PROT_EXEC) ensures us both read and
10798 10792 	 * write access. A simple comparison of prot to ~PROT_WRITE
10799 10793 	 * to determine read access is insufficient since prot can
10800 10794 	 * have PROT_USER, etc. OR'ed into it.
10801 10795 */
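		/*
		 * Editorial example, derived from the checks below: a
		 * MAP_SHARED mapping with maxprot == (PROT_WRITE |
		 * PROT_EXEC) bumps os_mmap_write via the PROT_WRITE
		 * check and os_mmap_read via the PROT_EXEC check, so a
		 * later reopen requests both read and write share
		 * access.
		 */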
10802 10796
10803 10797 /*
10804 10798 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10805 10799 */
10806 10800 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10807 10801 osp->os_mmap_write += btopr(len);
10808 10802 if (maxprot & PROT_READ)
10809 10803 osp->os_mmap_read += btopr(len);
10810 10804 if (maxprot & PROT_EXEC)
10811 10805 osp->os_mmap_read += btopr(len);
10812 10806 /*
10813 10807 * Ensure that os_mmap_read gets incremented, even if
10814 10808 * maxprot were to look like PROT_NONE.
10815 10809 */
10816 10810 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10817 10811 !(maxprot & PROT_EXEC))
10818 10812 osp->os_mmap_read += btopr(len);
10819 10813 osp->os_mapcnt += btopr(len);
10820 10814 mutex_exit(&osp->os_sync_lock);
10821 10815 open_stream_rele(osp, rp);
10822 10816 }
10823 10817
10824 10818 out:
10825 10819 /*
10826 10820 * If we got an error, then undo our
10827 10821 * incrementing of 'r_mapcnt'.
10828 10822 */
10829 10823
10830 10824 if (error) {
10831 10825 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10832 10826 ASSERT(rp->r_mapcnt >= 0);
10833 10827 }
10834 10828 return (error);
10835 10829 }
10836 10830
10837 10831 /* ARGSUSED */
10838 10832 static int
10839 10833 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10840 10834 {
10841 10835
10842 10836 return (VTOR4(vp1) == VTOR4(vp2));
10843 10837 }
10844 10838
10845 10839 /* ARGSUSED */
10846 10840 static int
10847 10841 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10848 10842 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
10849 10843 caller_context_t *ct)
10850 10844 {
10851 10845 int rc;
10852 10846 u_offset_t start, end;
10853 10847 rnode4_t *rp;
10854 10848 int error = 0, intr = INTR4(vp);
10855 10849 nfs4_error_t e;
10856 10850
10857 10851 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10858 10852 return (EIO);
10859 10853
10860 10854 /* check for valid cmd parameter */
10861 10855 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
10862 10856 return (EINVAL);
10863 10857
10864 10858 /* Verify l_type. */
10865 10859 switch (bfp->l_type) {
10866 10860 case F_RDLCK:
10867 10861 if (cmd != F_GETLK && !(flag & FREAD))
10868 10862 return (EBADF);
10869 10863 break;
10870 10864 case F_WRLCK:
10871 10865 if (cmd != F_GETLK && !(flag & FWRITE))
10872 10866 return (EBADF);
10873 10867 break;
10874 10868 case F_UNLCK:
10875 10869 intr = 0;
10876 10870 break;
10877 10871
10878 10872 default:
10879 10873 return (EINVAL);
10880 10874 }
10881 10875
10882 10876 /* check the validity of the lock range */
10883 10877 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
10884 10878 return (rc);
10885 10879 if (rc = flk_check_lock_data(start, end, MAXEND))
10886 10880 return (rc);
10887 10881
10888 10882 /*
10889 10883 * If the filesystem is mounted using local locking, pass the
10890 10884 * request off to the local locking code.
10891 10885 */
10892 10886 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
10893 10887 if (cmd == F_SETLK || cmd == F_SETLKW) {
10894 10888 /*
10895 10889 * For complete safety, we should be holding
10896 10890 * r_lkserlock. However, we can't call
10897 10891 * nfs4_safelock and then fs_frlock while
10898 10892 * holding r_lkserlock, so just invoke
10899 10893 * nfs4_safelock and expect that this will
10900 10894 * catch enough of the cases.
10901 10895 */
10902 10896 if (!nfs4_safelock(vp, bfp, cr))
10903 10897 return (EAGAIN);
10904 10898 }
10905 10899 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
10906 10900 }
10907 10901
10908 10902 rp = VTOR4(vp);
10909 10903
10910 10904 /*
10911 10905 * Check whether the given lock request can proceed, given the
10912 10906 * current file mappings.
10913 10907 */
10914 10908 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
10915 10909 return (EINTR);
10916 10910 if (cmd == F_SETLK || cmd == F_SETLKW) {
10917 10911 if (!nfs4_safelock(vp, bfp, cr)) {
10918 10912 rc = EAGAIN;
10919 10913 goto done;
10920 10914 }
10921 10915 }
10922 10916
10923 10917 /*
10924 10918 * Flush the cache after waiting for async I/O to finish. For new
10925 10919 * locks, this is so that the process gets the latest bits from the
10926 10920 * server. For unlocks, this is so that other clients see the
10927 10921 * latest bits once the file has been unlocked. If currently dirty
10928 10922 * pages can't be flushed, then don't allow a lock to be set. But
10929 10923 * allow unlocks to succeed, to avoid having orphan locks on the
10930 10924 * server.
10931 10925 */
10932 10926 if (cmd != F_GETLK) {
10933 10927 mutex_enter(&rp->r_statelock);
10934 10928 while (rp->r_count > 0) {
10935 10929 if (intr) {
10936 10930 klwp_t *lwp = ttolwp(curthread);
10937 10931
10938 10932 if (lwp != NULL)
10939 10933 lwp->lwp_nostop++;
10940 10934 if (cv_wait_sig(&rp->r_cv,
10941 10935 &rp->r_statelock) == 0) {
10942 10936 if (lwp != NULL)
10943 10937 lwp->lwp_nostop--;
10944 10938 rc = EINTR;
10945 10939 break;
10946 10940 }
10947 10941 if (lwp != NULL)
10948 10942 lwp->lwp_nostop--;
10949 10943 } else {
10950 10944 cv_wait(&rp->r_cv, &rp->r_statelock);
10951 10945 }
10952 10946 }
10953 10947 mutex_exit(&rp->r_statelock);
10954 10948 if (rc != 0)
10955 10949 goto done;
10956 10950 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
10957 10951 if (error) {
10958 10952 if (error == ENOSPC || error == EDQUOT) {
10959 10953 mutex_enter(&rp->r_statelock);
10960 10954 if (!rp->r_error)
10961 10955 rp->r_error = error;
10962 10956 mutex_exit(&rp->r_statelock);
10963 10957 }
10964 10958 if (bfp->l_type != F_UNLCK) {
10965 10959 rc = ENOLCK;
10966 10960 goto done;
10967 10961 }
10968 10962 }
10969 10963 }
10970 10964
10971 10965 /*
10972 10966 * Call the lock manager to do the real work of contacting
10973 10967 * the server and obtaining the lock.
10974 10968 */
10975 10969 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
10976 10970 cr, &e, NULL, NULL);
10977 10971 rc = e.error;
10978 10972
10979 10973 if (rc == 0)
10980 10974 nfs4_lockcompletion(vp, cmd);
10981 10975
10982 10976 done:
10983 10977 nfs_rw_exit(&rp->r_lkserlock);
10984 10978
10985 10979 return (rc);
10986 10980 }
10987 10981
10988 10982 /*
10989 10983 * Free storage space associated with the specified vnode. The portion
10990 10984 * to be freed is specified by bfp->l_start and bfp->l_len (already
10991 10985 * normalized to a "whence" of 0).
10992 10986 *
10993 10987 * This is an experimental facility whose continued existence is not
10994 10988 * guaranteed. Currently, we only support the special case
10995 10989 * of l_len == 0, meaning free to end of file.
10996 10990 */
10997 10991 /* ARGSUSED */
10998 10992 static int
10999 10993 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
11000 10994 offset_t offset, cred_t *cr, caller_context_t *ct)
11001 10995 {
11002 10996 int error;
11003 10997
11004 10998 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11005 10999 return (EIO);
11006 11000 ASSERT(vp->v_type == VREG);
11007 11001 if (cmd != F_FREESP)
11008 11002 return (EINVAL);
11009 11003
11010 11004 error = convoff(vp, bfp, 0, offset);
11011 11005 if (!error) {
11012 11006 ASSERT(bfp->l_start >= 0);
11013 11007 if (bfp->l_len == 0) {
11014 11008 struct vattr va;
11015 11009
11016 11010 va.va_mask = AT_SIZE;
11017 11011 va.va_size = bfp->l_start;
11018 11012 error = nfs4setattr(vp, &va, 0, cr, NULL);
11019 11013
11020 11014 if (error == 0) {
11021 11015 if (bfp->l_start == 0) {
11022 11016 vnevent_truncate(vp, ct);
11023 11017 } else {
11024 11018 vnevent_resize(vp, ct);
11025 11019 }
11026 11020 }
11027 11021 } else
11028 11022 error = EINVAL;
11029 11023 }
11030 11024
11031 11025 return (error);
11032 11026 }
11033 11027
11034 11028 /* ARGSUSED */
11035 11029 int
11036 11030 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11037 11031 {
11038 11032 rnode4_t *rp;
11039 11033 rp = VTOR4(vp);
11040 11034
11041 11035 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11042 11036 vp = RTOV4(rp);
11043 11037 }
11044 11038 *vpp = vp;
11045 11039 return (0);
11046 11040 }
11047 11041
11048 11042 /*
11049 11043 * Setup and add an address space callback to do the work of the delmap call.
11050 11044 * The callback will (and must be) deleted in the actual callback function.
11051 11045 *
11052 11046 * This is done in order to take care of the problem that we have with holding
11053 11047 * the address space's a_lock for a long period of time (e.g. if the NFS server
11054 11048 * is down). Callbacks will be executed in the address space code while the
11055 11049 * a_lock is not held. Holding the address space's a_lock causes things such
11056 11050 * as ps and fork to hang because they are trying to acquire this lock as well.
11057 11051 */
11058 11052 /* ARGSUSED */
11059 11053 static int
11060 11054 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
11061 11055 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
11062 11056 caller_context_t *ct)
11063 11057 {
11064 11058 int caller_found;
11065 11059 int error;
11066 11060 rnode4_t *rp;
11067 11061 nfs4_delmap_args_t *dmapp;
11068 11062 nfs4_delmapcall_t *delmap_call;
11069 11063
11070 11064 if (vp->v_flag & VNOMAP)
11071 11065 return (ENOSYS);
11072 11066
11073 11067 /*
11074 11068 * A process may not change zones if it has NFS pages mmap'ed
11075 11069 * in, so we can't legitimately get here from the wrong zone.
11076 11070 */
11077 11071 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11078 11072
11079 11073 rp = VTOR4(vp);
11080 11074
11081 11075 /*
11082 11076 * The way that the address space of this process deletes its mapping
11083 11077 * of this file is via the following call chains:
11084 11078 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11085 11079 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11086 11080 *
11087 11081 * With the use of address space callbacks we are allowed to drop the
11088 11082 * address space lock, a_lock, while executing the NFS operations that
11089 11083 * need to go over the wire. Returning EAGAIN to the caller of this
11090 11084 * function is what drives the execution of the callback that we add
11091 11085 * below. The callback will be executed by the address space code
11092 11086 * after dropping the a_lock. When the callback is finished, since
11093 11087 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
11094 11088 * is called again on the same segment to finish the rest of the work
11095 11089 * that needs to happen during unmapping.
11096 11090 *
11097 11091 * This action of calling back into the segment driver causes
11098 11092 * nfs4_delmap() to get called again, but since the callback was
11099 11093 * already executed at this point, it already did the work and there
11100 11094 * is nothing left for us to do.
11101 11095 *
11102 11096 * To Summarize:
11103 11097 * - The first time nfs4_delmap is called by the current thread is when
11104 11098 * we add the caller associated with this delmap to the delmap caller
11105 11099 * list, add the callback, and return EAGAIN.
11106 11100 * - The second time in this call chain when nfs4_delmap is called we
11107 11101 * will find this caller in the delmap caller list and realize there
11108 11102 * is no more work to do thus removing this caller from the list and
11109 11103 * returning the error that was set in the callback execution.
11110 11104 */
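	/*
	 * Editorial sketch of the two-pass flow summarized above,
	 * condensed from this function and nfs4_delmap_callback():
	 *
	 *	segvn_unmap() -> nfs4_delmap()		(pass 1)
	 *		caller not in r_indelmap: register
	 *		nfs4_delmap_callback(), return EAGAIN
	 *	the as layer drops a_lock and runs the callback, which
	 *		does the over-the-wire work and records the
	 *		result in delmap_call->error
	 *	segvn_unmap() -> nfs4_delmap()		(pass 2)
	 *		caller found in r_indelmap: return the recorded
	 *		error (EAGAIN from the callback maps to 0)
	 */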
11111 11105 caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
11112 11106 if (caller_found) {
11113 11107 /*
11114 11108 * 'error' is from the actual delmap operations. To avoid
11115 11109 * hangs, we need to handle the return of EAGAIN differently
11116 11110 * since this is what drives the callback execution.
11117 11111 * In this case, we don't want to return EAGAIN and do the
11118 11112 * callback execution because there are none to execute.
11119 11113 */
11120 11114 if (error == EAGAIN)
11121 11115 return (0);
11122 11116 else
11123 11117 return (error);
11124 11118 }
11125 11119
11126 11120 /* current caller was not in the list */
11127 11121 delmap_call = nfs4_init_delmapcall();
11128 11122
11129 11123 mutex_enter(&rp->r_statelock);
11130 11124 list_insert_tail(&rp->r_indelmap, delmap_call);
11131 11125 mutex_exit(&rp->r_statelock);
11132 11126
11133 11127 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
11134 11128
11135 11129 dmapp->vp = vp;
11136 11130 dmapp->off = off;
11137 11131 dmapp->addr = addr;
11138 11132 dmapp->len = len;
11139 11133 dmapp->prot = prot;
11140 11134 dmapp->maxprot = maxprot;
11141 11135 dmapp->flags = flags;
11142 11136 dmapp->cr = cr;
11143 11137 dmapp->caller = delmap_call;
11144 11138
11145 11139 error = as_add_callback(as, nfs4_delmap_callback, dmapp,
11146 11140 AS_UNMAP_EVENT, addr, len, KM_SLEEP);
11147 11141
11148 11142 return (error ? error : EAGAIN);
11149 11143 }
11150 11144
11151 11145 static nfs4_delmapcall_t *
11152 11146 nfs4_init_delmapcall()
11153 11147 {
11154 11148 nfs4_delmapcall_t *delmap_call;
11155 11149
11156 11150 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11157 11151 delmap_call->call_id = curthread;
11158 11152 delmap_call->error = 0;
11159 11153
11160 11154 return (delmap_call);
11161 11155 }
11162 11156
11163 11157 static void
11164 11158 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
11165 11159 {
11166 11160 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
11167 11161 }
11168 11162
11169 11163 /*
11170 11164 * Searches for the current delmap caller (based on curthread) in the list of
11171 11165 * callers. If it is found, we remove it and free the delmap caller.
11172 11166 * Returns:
11173 11167 * 0 if the caller wasn't found
11174 11168 	 * 1 if the caller was found, removed, and freed. *errp will be set
11175 11169 	 * to the result of the delmap.
11176 11170 */
11177 11171 static int
11178 11172 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11179 11173 {
11180 11174 nfs4_delmapcall_t *delmap_call;
11181 11175
11182 11176 /*
11183 11177 * If the list doesn't exist yet, we create it and return
11184 11178 * that the caller wasn't found. No list = no callers.
11185 11179 */
11186 11180 mutex_enter(&rp->r_statelock);
11187 11181 if (!(rp->r_flags & R4DELMAPLIST)) {
11188 11182 /* The list does not exist */
11189 11183 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11190 11184 offsetof(nfs4_delmapcall_t, call_node));
11191 11185 rp->r_flags |= R4DELMAPLIST;
11192 11186 mutex_exit(&rp->r_statelock);
11193 11187 return (0);
11194 11188 } else {
11195 11189 /* The list exists so search it */
11196 11190 for (delmap_call = list_head(&rp->r_indelmap);
11197 11191 delmap_call != NULL;
11198 11192 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11199 11193 if (delmap_call->call_id == curthread) {
11200 11194 /* current caller is in the list */
11201 11195 *errp = delmap_call->error;
11202 11196 list_remove(&rp->r_indelmap, delmap_call);
11203 11197 mutex_exit(&rp->r_statelock);
11204 11198 nfs4_free_delmapcall(delmap_call);
11205 11199 return (1);
11206 11200 }
11207 11201 }
11208 11202 }
11209 11203 mutex_exit(&rp->r_statelock);
11210 11204 return (0);
11211 11205 }
11212 11206
11213 11207 /*
11214 11208 * Remove some pages from an mmap'd vnode. Just update the
11215 11209 * count of pages. If doing close-to-open, then flush and
11216 11210 * commit all of the pages associated with this file.
11217 11211 * Otherwise, start an asynchronous page flush to write out
11218 11212 * any dirty pages. This will also associate a credential
11219 11213 * with the rnode which can be used to write the pages.
11220 11214 */
11221 11215 /* ARGSUSED */
11222 11216 static void
11223 11217 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11224 11218 {
11225 11219 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11226 11220 rnode4_t *rp;
11227 11221 mntinfo4_t *mi;
11228 11222 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;
11229 11223
11230 11224 rp = VTOR4(dmapp->vp);
11231 11225 mi = VTOMI4(dmapp->vp);
11232 11226
11233 11227 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11234 11228 ASSERT(rp->r_mapcnt >= 0);
11235 11229
11236 11230 /*
11237 11231 * Initiate a page flush and potential commit if there are
11238 11232 * pages, the file system was not mounted readonly, the segment
11239 11233 * was mapped shared, and the pages themselves were writeable.
11240 11234 */
11241 11235 if (nfs4_has_pages(dmapp->vp) &&
11242 11236 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11243 11237 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11244 11238 mutex_enter(&rp->r_statelock);
11245 11239 rp->r_flags |= R4DIRTY;
11246 11240 mutex_exit(&rp->r_statelock);
11247 11241 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11248 11242 dmapp->len, dmapp->cr);
11249 11243 if (!e.error) {
11250 11244 mutex_enter(&rp->r_statelock);
11251 11245 e.error = rp->r_error;
11252 11246 rp->r_error = 0;
11253 11247 mutex_exit(&rp->r_statelock);
11254 11248 }
11255 11249 } else
11256 11250 e.error = 0;
11257 11251
11258 11252 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11259 11253 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11260 11254 B_INVAL, dmapp->cr, NULL);
11261 11255
11262 11256 if (e.error) {
11263 11257 e.stat = puterrno4(e.error);
11264 11258 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11265 11259 OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
11266 11260 dmapp->caller->error = e.error;
11267 11261 }
11268 11262
11269 11263 /* Check to see if we need to close the file */
11270 11264
11271 11265 if (dmapp->vp->v_type == VREG) {
11272 11266 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11273 11267 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11274 11268
11275 11269 if (e.error != 0 || e.stat != NFS4_OK) {
11276 11270 /*
11277 11271 * Since it is possible that e.error == 0 and
11278 11272 * e.stat != NFS4_OK (and vice versa),
11279 11273 * we do the proper checking in order to get both
11280 11274 * e.error and e.stat reporting the correct info.
11281 11275 */
11282 11276 if (e.stat == NFS4_OK)
11283 11277 e.stat = puterrno4(e.error);
11284 11278 if (e.error == 0)
11285 11279 e.error = geterrno4(e.stat);
11286 11280
11287 11281 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11288 11282 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11289 11283 dmapp->caller->error = e.error;
11290 11284 }
11291 11285 }
11292 11286
11293 11287 (void) as_delete_callback(as, arg);
11294 11288 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11295 11289 }
11296 11290
11297 11291
11298 11292 static uint_t
11299 11293 fattr4_maxfilesize_to_bits(uint64_t ll)
11300 11294 {
11301 11295 uint_t l = 1;
11302 11296
11303 11297 if (ll == 0) {
11304 11298 return (0);
11305 11299 }
11306 11300
11307 11301 if (ll & 0xffffffff00000000) {
11308 11302 l += 32; ll >>= 32;
11309 11303 }
11310 11304 if (ll & 0xffff0000) {
11311 11305 l += 16; ll >>= 16;
11312 11306 }
11313 11307 if (ll & 0xff00) {
11314 11308 l += 8; ll >>= 8;
11315 11309 }
11316 11310 if (ll & 0xf0) {
11317 11311 l += 4; ll >>= 4;
11318 11312 }
11319 11313 if (ll & 0xc) {
11320 11314 l += 2; ll >>= 2;
11321 11315 }
11322 11316 if (ll & 0x2) {
11323 11317 l += 1;
11324 11318 }
11325 11319 return (l);
11326 11320 }
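/*
 * Editorial worked example for fattr4_maxfilesize_to_bits() above: for
 * a server advertising maxfilesize = 0x7fffffffffffffff (2^63 - 1),
 * the 32-, 16-, 8-, 4- and 2-bit steps all fire, giving
 * l = 1 + 32 + 16 + 8 + 4 + 2 = 63, so _PC_FILESIZEBITS reports 63.
 * In general the function returns the 1-based index of the most
 * significant set bit, i.e. floor(log2(ll)) + 1.
 */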
11327 11321
11328 11322 static int
11329 11323 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11330 11324 {
11331 11325 vnode_t *avp = NULL;
11332 11326 int error;
11333 11327
11334 11328 if ((error = nfs4lookup_xattr(vp, "", &avp,
11335 11329 LOOKUP_XATTR, cr)) == 0)
11336 11330 error = do_xattr_exists_check(avp, valp, cr);
11337 11331 if (avp)
11338 11332 VN_RELE(avp);
11339 11333
11340 11334 return (error);
11341 11335 }
11342 11336
11343 11337 /* ARGSUSED */
11344 11338 int
11345 11339 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11346 11340 caller_context_t *ct)
11347 11341 {
11348 11342 int error;
11349 11343 hrtime_t t;
11350 11344 rnode4_t *rp;
11351 11345 nfs4_ga_res_t gar;
11352 11346 nfs4_ga_ext_res_t ger;
11353 11347
11354 11348 gar.n4g_ext_res = &ger;
11355 11349
11356 11350 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11357 11351 return (EIO);
11358 11352 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11359 11353 *valp = MAXPATHLEN;
11360 11354 return (0);
11361 11355 }
11362 11356 if (cmd == _PC_ACL_ENABLED) {
11363 11357 *valp = _ACL_ACE_ENABLED;
11364 11358 return (0);
11365 11359 }
11366 11360
11367 11361 rp = VTOR4(vp);
11368 11362 if (cmd == _PC_XATTR_EXISTS) {
11369 11363 /*
11370 11364 * The existence of the xattr directory is not sufficient
11371 11365 	 * for determining whether generic user attributes exist.
11372 11366 * The attribute directory could only be a transient directory
11373 11367 	 * used for Solaris sysattr support. Do a small readdir
11374 11368 	 * to verify whether the only entries are sysattrs.
11375 11369 *
11376 11370 	 * pc4_xattr_valid can only be trusted when r_xattr_dir
11377 11371 * is NULL. Once the xadir vp exists, we can create xattrs,
11378 11372 * and we don't have any way to update the "base" object's
11379 11373 * pc4_xattr_exists from the xattr or xadir. Maybe FEM
11380 11374 * could help out.
11381 11375 */
11382 11376 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11383 11377 rp->r_xattr_dir == NULL) {
11384 11378 return (nfs4_have_xattrs(vp, valp, cr));
11385 11379 }
11386 11380 } else { /* OLD CODE */
11387 11381 if (ATTRCACHE4_VALID(vp)) {
11388 11382 mutex_enter(&rp->r_statelock);
11389 11383 if (rp->r_pathconf.pc4_cache_valid) {
11390 11384 error = 0;
11391 11385 switch (cmd) {
11392 11386 case _PC_FILESIZEBITS:
11393 11387 *valp =
11394 11388 rp->r_pathconf.pc4_filesizebits;
11395 11389 break;
11396 11390 case _PC_LINK_MAX:
11397 11391 *valp =
11398 11392 rp->r_pathconf.pc4_link_max;
11399 11393 break;
11400 11394 case _PC_NAME_MAX:
11401 11395 *valp =
11402 11396 rp->r_pathconf.pc4_name_max;
11403 11397 break;
11404 11398 case _PC_CHOWN_RESTRICTED:
11405 11399 *valp =
11406 11400 rp->r_pathconf.pc4_chown_restricted;
11407 11401 break;
11408 11402 case _PC_NO_TRUNC:
11409 11403 *valp =
11410 11404 rp->r_pathconf.pc4_no_trunc;
11411 11405 break;
11412 11406 default:
11413 11407 error = EINVAL;
11414 11408 break;
11415 11409 }
11416 11410 mutex_exit(&rp->r_statelock);
11417 11411 #ifdef DEBUG
11418 11412 nfs4_pathconf_cache_hits++;
11419 11413 #endif
11420 11414 return (error);
11421 11415 }
11422 11416 mutex_exit(&rp->r_statelock);
11423 11417 }
11424 11418 }
11425 11419 #ifdef DEBUG
11426 11420 nfs4_pathconf_cache_misses++;
11427 11421 #endif
11428 11422
11429 11423 t = gethrtime();
11430 11424
11431 11425 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11432 11426
11433 11427 if (error) {
11434 11428 mutex_enter(&rp->r_statelock);
11435 11429 rp->r_pathconf.pc4_cache_valid = FALSE;
11436 11430 rp->r_pathconf.pc4_xattr_valid = FALSE;
11437 11431 mutex_exit(&rp->r_statelock);
11438 11432 return (error);
11439 11433 }
11440 11434
11441 11435 /* interpret the max filesize */
11442 11436 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11443 11437 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11444 11438
11445 11439 /* Store the attributes we just received */
11446 11440 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11447 11441
11448 11442 switch (cmd) {
11449 11443 case _PC_FILESIZEBITS:
11450 11444 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11451 11445 break;
11452 11446 case _PC_LINK_MAX:
11453 11447 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11454 11448 break;
11455 11449 case _PC_NAME_MAX:
11456 11450 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11457 11451 break;
11458 11452 case _PC_CHOWN_RESTRICTED:
11459 11453 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11460 11454 break;
11461 11455 case _PC_NO_TRUNC:
11462 11456 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11463 11457 break;
11464 11458 case _PC_XATTR_EXISTS:
11465 11459 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
11466 11460 if (error = nfs4_have_xattrs(vp, valp, cr))
11467 11461 return (error);
11468 11462 }
11469 11463 break;
11470 11464 default:
11471 11465 return (EINVAL);
11472 11466 }
11473 11467
11474 11468 return (0);
11475 11469 }
11476 11470
11477 11471 /*
11478 11472 	 * Called by an async thread to do synchronous pageio. Do the i/o, wait
11479 11473 	 * for it to complete, and clean up the page list when done.
11480 11474 */
11481 11475 static int
11482 11476 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11483 11477 int flags, cred_t *cr)
11484 11478 {
11485 11479 int error;
11486 11480
11487 11481 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11488 11482
11489 11483 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11490 11484 if (flags & B_READ)
11491 11485 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11492 11486 else
11493 11487 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11494 11488 return (error);
11495 11489 }
11496 11490
11497 11491 /* ARGSUSED */
11498 11492 static int
11499 11493 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11500 11494 int flags, cred_t *cr, caller_context_t *ct)
11501 11495 {
11502 11496 int error;
11503 11497 rnode4_t *rp;
11504 11498
11505 11499 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11506 11500 return (EIO);
11507 11501
11508 11502 if (pp == NULL)
11509 11503 return (EINVAL);
11510 11504
11511 11505 rp = VTOR4(vp);
11512 11506 mutex_enter(&rp->r_statelock);
11513 11507 rp->r_count++;
11514 11508 mutex_exit(&rp->r_statelock);
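	/*
	 * A note on r_count (an assumption drawn from the code below):
	 * it is held across the i/o so that threads which wait for
	 * outstanding operations to drain (e.g. nfs4_commit_vp()
	 * waiting on r_cv) can tell this pageio is still in progress.
	 */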
11515 11509
11516 11510 if (flags & B_ASYNC) {
11517 11511 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11518 11512 nfs4_sync_pageio);
11519 11513 } else
11520 11514 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11521 11515 mutex_enter(&rp->r_statelock);
11522 11516 rp->r_count--;
11523 11517 cv_broadcast(&rp->r_cv);
11524 11518 mutex_exit(&rp->r_statelock);
11525 11519 return (error);
11526 11520 }
11527 11521
11528 11522 /* ARGSUSED */
11529 11523 static void
11530 11524 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
11531 11525 caller_context_t *ct)
11532 11526 {
11533 11527 int error;
11534 11528 rnode4_t *rp;
11535 11529 page_t *plist;
11536 11530 page_t *pptr;
11537 11531 offset3 offset;
11538 11532 count3 len;
11539 11533 k_sigset_t smask;
11540 11534
11541 11535 /*
11542 11536 * We should get called with fl equal to either B_FREE or
11543 11537 * B_INVAL. Any other value is illegal.
11544 11538 *
11545 11539 * The page that we are either supposed to free or destroy
11546 11540 * should be exclusive locked and its io lock should not
11547 11541 * be held.
11548 11542 */
11549 11543 ASSERT(fl == B_FREE || fl == B_INVAL);
11550 11544 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11551 11545
11552 11546 rp = VTOR4(vp);
11553 11547
11554 11548 /*
11555 11549 * If the page doesn't need to be committed or we shouldn't
11556 11550 * even bother attempting to commit it, then just make sure
11557 11551 * that the p_fsdata byte is clear and then either free or
11558 11552 * destroy the page as appropriate.
11559 11553 */
11560 11554 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11561 11555 pp->p_fsdata = C_NOCOMMIT;
11562 11556 if (fl == B_FREE)
11563 11557 page_free(pp, dn);
11564 11558 else
11565 11559 page_destroy(pp, dn);
11566 11560 return;
11567 11561 }
11568 11562
11569 11563 /*
11570 11564 * If there is a page invalidation operation going on, then
11571 11565 * if this is one of the pages being destroyed, then just
11572 11566 * clear the p_fsdata byte and then either free or destroy
11573 11567 * the page as appropriate.
11574 11568 */
11575 11569 mutex_enter(&rp->r_statelock);
11576 11570 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11577 11571 mutex_exit(&rp->r_statelock);
11578 11572 pp->p_fsdata = C_NOCOMMIT;
11579 11573 if (fl == B_FREE)
11580 11574 page_free(pp, dn);
11581 11575 else
11582 11576 page_destroy(pp, dn);
11583 11577 return;
11584 11578 }
11585 11579
11586 11580 /*
11587 11581 * If we are freeing this page and someone else is already
11588 11582 * waiting to do a commit, then just unlock the page and
11589 11583  * return. That other thread will take care of committing
11590 11584 * this page. The page can be freed sometime after the
11591 11585 * commit has finished. Otherwise, if the page is marked
11592 11586 * as delay commit, then we may be getting called from
11593 11587 * pvn_write_done, one page at a time. This could result
11594 11588 * in one commit per page, so we end up doing lots of small
11595 11589 * commits instead of fewer larger commits. This is bad,
11596 11590  * we want to do as few commits as possible.
11597 11591 */
11598 11592 if (fl == B_FREE) {
11599 11593 if (rp->r_flags & R4COMMITWAIT) {
11600 11594 page_unlock(pp);
11601 11595 mutex_exit(&rp->r_statelock);
11602 11596 return;
11603 11597 }
11604 11598 if (pp->p_fsdata == C_DELAYCOMMIT) {
11605 11599 pp->p_fsdata = C_COMMIT;
11606 11600 page_unlock(pp);
11607 11601 mutex_exit(&rp->r_statelock);
11608 11602 return;
11609 11603 }
11610 11604 }
11611 11605
11612 11606 /*
11613 11607 * Check to see if there is a signal which would prevent an
11614 11608 * attempt to commit the pages from being successful. If so,
11615 11609 * then don't bother with all of the work to gather pages and
11616 11610 * generate the unsuccessful RPC. Just return from here and
11617 11611 * let the page be committed at some later time.
11618 11612 */
11619 11613 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11620 11614 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11621 11615 sigunintr(&smask);
11622 11616 page_unlock(pp);
11623 11617 mutex_exit(&rp->r_statelock);
11624 11618 return;
11625 11619 }
11626 11620 sigunintr(&smask);
11627 11621
11628 11622 /*
11629 11623 * We are starting to need to commit pages, so let's try
11630 11624 * to commit as many as possible at once to reduce the
11631 11625 * overhead.
11632 11626 *
11633 11627 * Set the `commit inprogress' state bit. We must
11634 11628 * first wait until any current one finishes. Then
11635 11629 * we initialize the c_pages list with this page.
11636 11630 */
11637 11631 while (rp->r_flags & R4COMMIT) {
11638 11632 rp->r_flags |= R4COMMITWAIT;
11639 11633 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11640 11634 rp->r_flags &= ~R4COMMITWAIT;
11641 11635 }
11642 11636 rp->r_flags |= R4COMMIT;
11643 11637 mutex_exit(&rp->r_statelock);
11644 11638 ASSERT(rp->r_commit.c_pages == NULL);
11645 11639 rp->r_commit.c_pages = pp;
11646 11640 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11647 11641 rp->r_commit.c_commlen = PAGESIZE;
11648 11642
11649 11643 /*
11650 11644 * Gather together all other pages which can be committed.
11651 11645 * They will all be chained off r_commit.c_pages.
11652 11646 */
11653 11647 nfs4_get_commit(vp);
11654 11648
11655 11649 /*
11656 11650 * Clear the `commit inprogress' status and disconnect
11657 11651 * the list of pages to be committed from the rnode.
11658 11652 * At this same time, we also save the starting offset
11659 11653 * and length of data to be committed on the server.
11660 11654 */
11661 11655 plist = rp->r_commit.c_pages;
11662 11656 rp->r_commit.c_pages = NULL;
11663 11657 offset = rp->r_commit.c_commbase;
11664 11658 len = rp->r_commit.c_commlen;
11665 11659 mutex_enter(&rp->r_statelock);
11666 11660 rp->r_flags &= ~R4COMMIT;
11667 11661 cv_broadcast(&rp->r_commit.c_cv);
11668 11662 mutex_exit(&rp->r_statelock);
11669 11663
11670 11664 if (curproc == proc_pageout || curproc == proc_fsflush ||
11671 11665 nfs_zone() != VTOMI4(vp)->mi_zone) {
11672 11666 nfs4_async_commit(vp, plist, offset, len,
11673 11667 cr, do_nfs4_async_commit);
11674 11668 return;
11675 11669 }
11676 11670
11677 11671 /*
11678 11672 * Actually generate the COMMIT op over the wire operation.
11679 11673 */
11680 11674 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11681 11675
11682 11676 /*
11683 11677 * If we got an error during the commit, just unlock all
11684 11678 * of the pages. The pages will get retransmitted to the
11685 11679 * server during a putpage operation.
11686 11680 */
11687 11681 if (error) {
11688 11682 while (plist != NULL) {
11689 11683 pptr = plist;
11690 11684 page_sub(&plist, pptr);
11691 11685 page_unlock(pptr);
11692 11686 }
11693 11687 return;
11694 11688 }
11695 11689
11696 11690 /*
11697 11691 * We've tried as hard as we can to commit the data to stable
11698 11692 * storage on the server. We just unlock the rest of the pages
11699 11693 * and clear the commit required state. They will be put
11700 11694  * onto the tail of the cachelist if they are no longer
11701 11695 * mapped.
11702 11696 */
11703 11697 while (plist != pp) {
11704 11698 pptr = plist;
11705 11699 page_sub(&plist, pptr);
11706 11700 pptr->p_fsdata = C_NOCOMMIT;
11707 11701 page_unlock(pptr);
11708 11702 }
11709 11703
11710 11704 /*
11711 11705 * It is possible that nfs4_commit didn't return error but
11712 11706 * some other thread has modified the page we are going
11713 11707 * to free/destroy.
11714 11708 * In this case we need to rewrite the page. Do an explicit check
11715 11709 * before attempting to free/destroy the page. If modified, needs to
11716 11710 * be rewritten so unlock the page and return.
11717 11711 */
11718 11712 if (hat_ismod(pp)) {
11719 11713 pp->p_fsdata = C_NOCOMMIT;
11720 11714 page_unlock(pp);
11721 11715 return;
11722 11716 }
11723 11717
11724 11718 /*
11725 11719 * Now, as appropriate, either free or destroy the page
11726 11720 * that we were called with.
11727 11721 */
11728 11722 pp->p_fsdata = C_NOCOMMIT;
11729 11723 if (fl == B_FREE)
11730 11724 page_free(pp, dn);
11731 11725 else
11732 11726 page_destroy(pp, dn);
11733 11727 }
11734 11728
11735 11729 /*
11736 11730 * Commit requires that the current fh be the file written to.
11737 11731 * The compound op structure is:
11738 11732 * PUTFH(file), COMMIT
11739 11733 */
11740 11734 static int
11741 11735 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11742 11736 {
11743 11737 COMPOUND4args_clnt args;
11744 11738 COMPOUND4res_clnt res;
11745 11739 COMMIT4res *cm_res;
11746 11740 nfs_argop4 argop[2];
11747 11741 nfs_resop4 *resop;
11748 11742 int doqueue;
11749 11743 mntinfo4_t *mi;
11750 11744 rnode4_t *rp;
11751 11745 cred_t *cred_otw = NULL;
11752 11746 bool_t needrecov = FALSE;
11753 11747 nfs4_recov_state_t recov_state;
11754 11748 nfs4_open_stream_t *osp = NULL;
11755 11749 bool_t first_time = TRUE; /* first time getting OTW cred */
11756 11750 bool_t last_time = FALSE; /* last time getting OTW cred */
11757 11751 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11758 11752
11759 11753 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11760 11754
11761 11755 rp = VTOR4(vp);
11762 11756
11763 11757 mi = VTOMI4(vp);
11764 11758 recov_state.rs_flags = 0;
11765 11759 recov_state.rs_num_retry_despite_err = 0;
11766 11760 get_commit_cred:
11767 11761 /*
11768 11762 * Releases the osp, if a valid open stream is provided.
11769 11763 * Puts a hold on the cred_otw and the new osp (if found).
11770 11764 */
11771 11765 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11772 11766 &first_time, &last_time);
11773 11767 args.ctag = TAG_COMMIT;
11774 11768 recov_retry:
11775 11769 /*
11776 11770 * Commit ops: putfh file; commit
11777 11771 */
11778 11772 args.array_len = 2;
11779 11773 args.array = argop;
11780 11774
11781 11775 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11782 11776 &recov_state, NULL);
11783 11777 if (e.error) {
11784 11778 crfree(cred_otw);
11785 11779 if (osp != NULL)
11786 11780 open_stream_rele(osp, rp);
11787 11781 return (e.error);
11788 11782 }
11789 11783
11790 11784 	/* putfh file */
11791 11785 argop[0].argop = OP_CPUTFH;
11792 11786 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11793 11787
11794 11788 /* commit */
11795 11789 argop[1].argop = OP_COMMIT;
11796 11790 argop[1].nfs_argop4_u.opcommit.offset = offset;
11797 11791 argop[1].nfs_argop4_u.opcommit.count = count;
11798 11792
11799 11793 doqueue = 1;
11800 11794 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11801 11795
11802 11796 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11803 11797 if (!needrecov && e.error) {
11804 11798 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11805 11799 needrecov);
11806 11800 crfree(cred_otw);
11807 11801 if (e.error == EACCES && last_time == FALSE)
11808 11802 goto get_commit_cred;
11809 11803 if (osp != NULL)
11810 11804 open_stream_rele(osp, rp);
11811 11805 return (e.error);
11812 11806 }
11813 11807
11814 11808 if (needrecov) {
11815 11809 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11816 11810 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11817 11811 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11818 11812 &recov_state, needrecov);
11819 11813 if (!e.error)
11820 11814 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11821 11815 goto recov_retry;
11822 11816 }
11823 11817 if (e.error) {
11824 11818 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11825 11819 &recov_state, needrecov);
11826 11820 crfree(cred_otw);
11827 11821 if (osp != NULL)
11828 11822 open_stream_rele(osp, rp);
11829 11823 return (e.error);
11830 11824 }
11831 11825 /* fall through for res.status case */
11832 11826 }
11833 11827
11834 11828 if (res.status) {
11835 11829 e.error = geterrno4(res.status);
11836 11830 if (e.error == EACCES && last_time == FALSE) {
11837 11831 crfree(cred_otw);
11838 11832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11839 11833 &recov_state, needrecov);
11840 11834 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11841 11835 goto get_commit_cred;
11842 11836 }
11843 11837 /*
11844 11838 * Can't do a nfs4_purge_stale_fh here because this
11845 11839 * can cause a deadlock. nfs4_commit can
11846 11840 * be called from nfs4_dispose which can be called
11847 11841 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh
11848 11842 * can call back to pvn_vplist_dirty.
11849 11843 */
11850 11844 if (e.error == ESTALE) {
11851 11845 mutex_enter(&rp->r_statelock);
11852 11846 rp->r_flags |= R4STALE;
11853 11847 if (!rp->r_error)
11854 11848 rp->r_error = e.error;
11855 11849 mutex_exit(&rp->r_statelock);
11856 11850 PURGE_ATTRCACHE4(vp);
11857 11851 } else {
11858 11852 mutex_enter(&rp->r_statelock);
11859 11853 if (!rp->r_error)
11860 11854 rp->r_error = e.error;
11861 11855 mutex_exit(&rp->r_statelock);
11862 11856 }
11863 11857 } else {
11864 11858 ASSERT(rp->r_flags & R4HAVEVERF);
11865 11859 resop = &res.array[1]; /* commit res */
11866 11860 cm_res = &resop->nfs_resop4_u.opcommit;
11867 11861 mutex_enter(&rp->r_statelock);
11868 11862 if (cm_res->writeverf == rp->r_writeverf) {
11869 11863 mutex_exit(&rp->r_statelock);
11870 11864 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11871 11865 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11872 11866 &recov_state, needrecov);
11873 11867 crfree(cred_otw);
11874 11868 if (osp != NULL)
11875 11869 open_stream_rele(osp, rp);
11876 11870 return (0);
11877 11871 }
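		/*
		 * The verifier changed, so the server may have lost the
		 * unstable data (e.g. across a reboot).  Redirty the
		 * pages with nfs4_set_mod(), remember the new verifier,
		 * and return NFS_VERF_MISMATCH so the caller rewrites
		 * and re-commits.
		 */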
11878 11872 nfs4_set_mod(vp);
11879 11873 rp->r_writeverf = cm_res->writeverf;
11880 11874 mutex_exit(&rp->r_statelock);
11881 11875 e.error = NFS_VERF_MISMATCH;
11882 11876 }
11883 11877
11884 11878 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11885 11879 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11886 11880 crfree(cred_otw);
11887 11881 if (osp != NULL)
11888 11882 open_stream_rele(osp, rp);
11889 11883
11890 11884 return (e.error);
11891 11885 }
11892 11886
11893 11887 static void
11894 11888 nfs4_set_mod(vnode_t *vp)
11895 11889 {
11896 11890 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11897 11891
11898 11892 /* make sure we're looking at the master vnode, not a shadow */
11899 11893 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11900 11894 }
11901 11895
11902 11896 /*
11903 11897 * This function is used to gather a page list of the pages which
11904 11898 * can be committed on the server.
11905 11899 *
11906 11900 * The calling thread must have set R4COMMIT. This bit is used to
11907 11901 * serialize access to the commit structure in the rnode. As long
11908 11902 * as the thread has set R4COMMIT, then it can manipulate the commit
11909 11903 * structure without requiring any other locks.
11910 11904 *
11911 11905 * When this function is called from nfs4_dispose() the page passed
11912 11906 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11913 11907 * will skip it. This is not a problem since we initially add the
11914 11908 * page to the r_commit page list.
11915 11909 *
11916 11910 */
11917 11911 static void
11918 11912 nfs4_get_commit(vnode_t *vp)
11919 11913 {
11920 11914 rnode4_t *rp;
11921 11915 page_t *pp;
11922 11916 kmutex_t *vphm;
11923 11917
11924 11918 rp = VTOR4(vp);
11925 11919
11926 11920 ASSERT(rp->r_flags & R4COMMIT);
11927 11921
11928 11922 /* make sure we're looking at the master vnode, not a shadow */
11929 11923
11930 11924 if (IS_SHADOW(vp, rp))
11931 11925 vp = RTOV4(rp);
11932 11926
11933 11927 vphm = page_vnode_mutex(vp);
11934 11928 mutex_enter(vphm);
11935 11929
11936 11930 /*
11937 11931 * If there are no pages associated with this vnode, then
11938 11932 * just return.
11939 11933 */
11940 11934 if ((pp = vp->v_pages) == NULL) {
11941 11935 mutex_exit(vphm);
11942 11936 return;
11943 11937 }
11944 11938
11945 11939 /*
11946 11940 * Step through all of the pages associated with this vnode
11947 11941 * looking for pages which need to be committed.
11948 11942 */
11949 11943 do {
11950 11944 /* Skip marker pages. */
11951 11945 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
11952 11946 continue;
11953 11947
11954 11948 /*
11955 11949 * First short-cut everything (without the page_lock)
11956 11950 * and see if this page does not need to be committed
11957 11951  * or is modified; if so, we just skip it.
11958 11952 */
11959 11953 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11960 11954 continue;
11961 11955
11962 11956 /*
11963 11957 * Attempt to lock the page. If we can't, then
11964 11958 * someone else is messing with it or we have been
11965 11959 * called from nfs4_dispose and this is the page that
11966 11960  * nfs4_dispose was called with. Either way, just skip it.
11967 11961 */
11968 11962 if (!page_trylock(pp, SE_EXCL))
11969 11963 continue;
11970 11964
11971 11965 /*
11972 11966  * Let's check again now that we have the page lock.
11973 11967 */
11974 11968 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11975 11969 page_unlock(pp);
11976 11970 continue;
11977 11971 }
11978 11972
11979 11973 /* this had better not be a free page */
11980 11974 ASSERT(PP_ISFREE(pp) == 0);
11981 11975
11982 11976 /*
11983 11977 * The page needs to be committed and we locked it.
11984 11978 * Update the base and length parameters and add it
11985 11979 * to r_pages.
11986 11980 */
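		/*
		 * The resulting range is the union of the current range
		 * and this page: a page below c_commbase extends the
		 * range downward, a page at or past the current end
		 * extends it upward, and a page already inside the
		 * range leaves it unchanged.
		 */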
11987 11981 if (rp->r_commit.c_pages == NULL) {
11988 11982 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11989 11983 rp->r_commit.c_commlen = PAGESIZE;
11990 11984 } else if (pp->p_offset < rp->r_commit.c_commbase) {
11991 11985 rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11992 11986 (offset3)pp->p_offset + rp->r_commit.c_commlen;
11993 11987 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11994 11988 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11995 11989 <= pp->p_offset) {
11996 11990 rp->r_commit.c_commlen = (offset3)pp->p_offset -
11997 11991 rp->r_commit.c_commbase + PAGESIZE;
11998 11992 }
11999 11993 page_add(&rp->r_commit.c_pages, pp);
12000 11994 } while ((pp = pp->p_vpnext) != vp->v_pages);
12001 11995
12002 11996 mutex_exit(vphm);
12003 11997 }
12004 11998
12005 11999 /*
12006 12000 * This routine is used to gather together a page list of the pages
12007 12001 * which are to be committed on the server. This routine must not
12008 12002 * be called if the calling thread holds any locked pages.
12009 12003 *
12010 12004 * The calling thread must have set R4COMMIT. This bit is used to
12011 12005 * serialize access to the commit structure in the rnode. As long
12012 12006 * as the thread has set R4COMMIT, then it can manipulate the commit
12013 12007 * structure without requiring any other locks.
12014 12008 */
12015 12009 static void
12016 12010 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
12017 12011 {
12018 12012
12019 12013 rnode4_t *rp;
12020 12014 page_t *pp;
12021 12015 u_offset_t end;
12022 12016 u_offset_t off;
12023 12017 ASSERT(len != 0);
12024 12018 rp = VTOR4(vp);
12025 12019 ASSERT(rp->r_flags & R4COMMIT);
12026 12020
12027 12021 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12028 12022
12029 12023 /* make sure we're looking at the master vnode, not a shadow */
12030 12024
12031 12025 if (IS_SHADOW(vp, rp))
12032 12026 vp = RTOV4(rp);
12033 12027
12034 12028 /*
12035 12029 * If there are no pages associated with this vnode, then
12036 12030 * just return.
12037 12031 */
12038 12032 if ((pp = vp->v_pages) == NULL)
12039 12033 return;
12040 12034 /*
12041 12035 * Calculate the ending offset.
12042 12036 */
12043 12037 end = soff + len;
12044 12038 for (off = soff; off < end; off += PAGESIZE) {
12045 12039 /*
12046 12040 * Lookup each page by vp, offset.
12047 12041 */
12048 12042 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
12049 12043 continue;
12050 12044 /*
12051 12045 * If this page does not need to be committed or is
12052 12046 * modified, then just skip it.
12053 12047 */
12054 12048 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
12055 12049 page_unlock(pp);
12056 12050 continue;
12057 12051 }
12058 12052
12059 12053 ASSERT(PP_ISFREE(pp) == 0);
12060 12054 /*
12061 12055 * The page needs to be committed and we locked it.
12062 12056 * Update the base and length parameters and add it
12063 12057 * to r_pages.
12064 12058 */
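		/*
		 * The loop visits offsets in increasing order, so the
		 * first page found sets c_commbase and every later page
		 * only needs to extend the range upward.
		 */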
12065 12059 if (rp->r_commit.c_pages == NULL) {
12066 12060 rp->r_commit.c_commbase = (offset3)pp->p_offset;
12067 12061 rp->r_commit.c_commlen = PAGESIZE;
12068 12062 } else {
12069 12063 rp->r_commit.c_commlen = (offset3)pp->p_offset -
12070 12064 rp->r_commit.c_commbase + PAGESIZE;
12071 12065 }
12072 12066 page_add(&rp->r_commit.c_pages, pp);
12073 12067 }
12074 12068 }
12075 12069
12076 12070 /*
12077 12071 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12078 12072 * Flushes and commits data to the server.
12079 12073 */
12080 12074 static int
12081 12075 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
12082 12076 {
12083 12077 int error;
12084 12078 verifier4 write_verf;
12085 12079 rnode4_t *rp = VTOR4(vp);
12086 12080
12087 12081 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12088 12082
12089 12083 /*
12090 12084 * Flush the data portion of the file and then commit any
12091 12085 * portions which need to be committed. This may need to
12092 12086 * be done twice if the server has changed state since
12093 12087 * data was last written. The data will need to be
12094 12088 * rewritten to the server and then a new commit done.
12095 12089 *
12096 12090 * In fact, this may need to be done several times if the
12097 12091 * server is having problems and crashing while we are
12098 12092 * attempting to do this.
12099 12093 */
12100 12094
12101 12095 top:
12102 12096 /*
12103 12097 * Do a flush based on the poff and plen arguments. This
12104 12098 * will synchronously write out any modified pages in the
12105 12099 * range specified by (poff, plen). This starts all of the
12106 12100 * i/o operations which will be waited for in the next
12107 12101 * call to nfs4_putpage
12108 12102 */
12109 12103
12110 12104 mutex_enter(&rp->r_statelock);
12111 12105 write_verf = rp->r_writeverf;
12112 12106 mutex_exit(&rp->r_statelock);
12113 12107
12114 12108 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
12115 12109 if (error == EAGAIN)
12116 12110 error = 0;
12117 12111
12118 12112 /*
12119 12113 * Do a flush based on the poff and plen arguments. This
12120 12114 * will synchronously write out any modified pages in the
12121 12115 * range specified by (poff, plen) and wait until all of
12122 12116 * the asynchronous i/o's in that range are done as well.
12123 12117 */
12124 12118 if (!error)
12125 12119 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);
12126 12120
12127 12121 if (error)
12128 12122 return (error);
12129 12123
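	/*
	 * If the server's write verifier changed while the flush was in
	 * progress, the server may have discarded the unstable data
	 * (e.g. it rebooted), so the pages must be written again: start
	 * over from the top.
	 */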
12130 12124 mutex_enter(&rp->r_statelock);
12131 12125 if (rp->r_writeverf != write_verf) {
12132 12126 mutex_exit(&rp->r_statelock);
12133 12127 goto top;
12134 12128 }
12135 12129 mutex_exit(&rp->r_statelock);
12136 12130
12137 12131 /*
12138 12132 * Now commit any pages which might need to be committed.
12139 12133 * If the error, NFS_VERF_MISMATCH, is returned, then
12140 12134 * start over with the flush operation.
12141 12135 */
12142 12136 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
12143 12137
12144 12138 if (error == NFS_VERF_MISMATCH)
12145 12139 goto top;
12146 12140
12147 12141 return (error);
12148 12142 }
12149 12143
12150 12144 /*
12151 12145 * nfs4_commit_vp() will wait for other pending commits and
12152 12146  * will either commit the whole file or a range; plen dictates
12153 12147  * which: a value of zero indicates the whole file. Called from
12154 12148  * nfs4_putpage_commit() or nfs4_sync_putapage().
12155 12149 */
12156 12150 static int
12157 12151 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
12158 12152 cred_t *cr, int wait_on_writes)
12159 12153 {
12160 12154 rnode4_t *rp;
12161 12155 page_t *plist;
12162 12156 offset3 offset;
12163 12157 count3 len;
12164 12158
12165 12159 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12166 12160
12167 12161 rp = VTOR4(vp);
12168 12162
12169 12163 /*
12170 12164  * Before we gather committable pages, make
12171 12165  * sure there are no outstanding async writes.
12172 12166 */
12173 12167 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
12174 12168 mutex_enter(&rp->r_statelock);
12175 12169 while (rp->r_count > 0) {
12176 12170 cv_wait(&rp->r_cv, &rp->r_statelock);
12177 12171 }
12178 12172 mutex_exit(&rp->r_statelock);
12179 12173 }
12180 12174
12181 12175 /*
12182 12176 * Set the `commit inprogress' state bit. We must
12183 12177 * first wait until any current one finishes.
12184 12178 */
12185 12179 mutex_enter(&rp->r_statelock);
12186 12180 while (rp->r_flags & R4COMMIT) {
12187 12181 rp->r_flags |= R4COMMITWAIT;
12188 12182 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
12189 12183 rp->r_flags &= ~R4COMMITWAIT;
12190 12184 }
12191 12185 rp->r_flags |= R4COMMIT;
12192 12186 mutex_exit(&rp->r_statelock);
12193 12187
12194 12188 /*
12195 12189 * Gather all of the pages which need to be
12196 12190 * committed.
12197 12191 */
12198 12192 if (plen == 0)
12199 12193 nfs4_get_commit(vp);
12200 12194 else
12201 12195 nfs4_get_commit_range(vp, poff, plen);
12202 12196
12203 12197 /*
12204 12198 * Clear the `commit inprogress' bit and disconnect the
12205 12199 * page list which was gathered by nfs4_get_commit.
12206 12200 */
12207 12201 plist = rp->r_commit.c_pages;
12208 12202 rp->r_commit.c_pages = NULL;
12209 12203 offset = rp->r_commit.c_commbase;
12210 12204 len = rp->r_commit.c_commlen;
12211 12205 mutex_enter(&rp->r_statelock);
12212 12206 rp->r_flags &= ~R4COMMIT;
12213 12207 cv_broadcast(&rp->r_commit.c_cv);
12214 12208 mutex_exit(&rp->r_statelock);
12215 12209
12216 12210 /*
12217 12211 * If any pages need to be committed, commit them and
12218 12212 * then unlock them so that they can be freed some
12219 12213 * time later.
12220 12214 */
12221 12215 if (plist == NULL)
12222 12216 return (0);
12223 12217
12224 12218 /*
12225 12219 * No error occurred during the flush portion
12226 12220 * of this operation, so now attempt to commit
12227 12221 * the data to stable storage on the server.
12228 12222 *
12229 12223 * This will unlock all of the pages on the list.
12230 12224 */
12231 12225 return (nfs4_sync_commit(vp, plist, offset, len, cr));
12232 12226 }
12233 12227
12234 12228 static int
12235 12229 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12236 12230 cred_t *cr)
12237 12231 {
12238 12232 int error;
12239 12233 page_t *pp;
12240 12234
12241 12235 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12242 12236
12243 12237 error = nfs4_commit(vp, (offset4)offset, (count4)count, cr);
12244 12238
12245 12239 /*
12246 12240 * If we got an error, then just unlock all of the pages
12247 12241 * on the list.
12248 12242 */
12249 12243 if (error) {
12250 12244 while (plist != NULL) {
12251 12245 pp = plist;
12252 12246 page_sub(&plist, pp);
12253 12247 page_unlock(pp);
12254 12248 }
12255 12249 return (error);
12256 12250 }
12257 12251 /*
12258 12252 * We've tried as hard as we can to commit the data to stable
12259 12253 * storage on the server. We just unlock the pages and clear
12260 12254 * the commit required state. They will get freed later.
12261 12255 */
12262 12256 while (plist != NULL) {
12263 12257 pp = plist;
12264 12258 page_sub(&plist, pp);
12265 12259 pp->p_fsdata = C_NOCOMMIT;
12266 12260 page_unlock(pp);
12267 12261 }
12268 12262
12269 12263 return (error);
12270 12264 }
12271 12265
12272 12266 static void
12273 12267 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12274 12268 cred_t *cr)
12275 12269 {
12276 12270
12277 12271 (void) nfs4_sync_commit(vp, plist, offset, count, cr);
12278 12272 }
12279 12273
12280 12274 /*ARGSUSED*/
12281 12275 static int
12282 12276 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12283 12277 caller_context_t *ct)
12284 12278 {
12285 12279 int error = 0;
12286 12280 mntinfo4_t *mi;
12287 12281 vattr_t va;
12288 12282 vsecattr_t nfsace4_vsap;
12289 12283
12290 12284 mi = VTOMI4(vp);
12291 12285 if (nfs_zone() != mi->mi_zone)
12292 12286 return (EIO);
12293 12287 if (mi->mi_flags & MI4_ACL) {
12294 12288 /* if we have a delegation, return it */
12295 12289 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12296 12290 (void) nfs4delegreturn(VTOR4(vp),
12297 12291 NFS4_DR_REOPEN|NFS4_DR_PUSH);
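		/*
		 * The delegation is returned before the ACL is set;
		 * NFS4_DR_PUSH pushes any dirty pages to the server
		 * first and NFS4_DR_REOPEN reestablishes the open
		 * streams afterwards.
		 */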
12298 12292
12299 12293 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12300 12294 NFS4_ACL_SET);
12301 12295 if (error) /* EINVAL */
12302 12296 return (error);
12303 12297
12304 12298 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12305 12299 /*
12306 12300 * These are aclent_t type entries.
12307 12301 */
12308 12302 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12309 12303 vp->v_type == VDIR, FALSE);
12310 12304 if (error)
12311 12305 return (error);
12312 12306 } else {
12313 12307 /*
12314 12308 * These are ace_t type entries.
12315 12309 */
12316 12310 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12317 12311 FALSE);
12318 12312 if (error)
12319 12313 return (error);
12320 12314 }
12321 12315 bzero(&va, sizeof (va));
12322 12316 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12323 12317 vs_ace4_destroy(&nfsace4_vsap);
12324 12318 return (error);
12325 12319 }
12326 12320 return (ENOSYS);
12327 12321 }
12328 12322
12329 12323 /* ARGSUSED */
12330 12324 int
12331 12325 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12332 12326 caller_context_t *ct)
12333 12327 {
12334 12328 int error;
12335 12329 mntinfo4_t *mi;
12336 12330 nfs4_ga_res_t gar;
12337 12331 rnode4_t *rp = VTOR4(vp);
12338 12332
12339 12333 mi = VTOMI4(vp);
12340 12334 if (nfs_zone() != mi->mi_zone)
12341 12335 return (EIO);
12342 12336
12343 12337 bzero(&gar, sizeof (gar));
12344 12338 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12345 12339
12346 12340 /*
12347 12341 * vsecattr->vsa_mask holds the original acl request mask.
12348 12342 * This is needed when determining what to return.
12349 12343 * (See: nfs4_create_getsecattr_return())
12350 12344 */
12351 12345 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12352 12346 if (error) /* EINVAL */
12353 12347 return (error);
12354 12348
12355 12349 /*
12356 12350 * If this is a referral stub, don't try to go OTW for an ACL
12357 12351 */
12358 12352 if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
12359 12353 return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
12360 12354
12361 12355 if (mi->mi_flags & MI4_ACL) {
12362 12356 /*
12363 12357 * Check if the data is cached and the cache is valid. If it
12364 12358 * is we don't go over the wire.
12365 12359 */
12366 12360 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12367 12361 mutex_enter(&rp->r_statelock);
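			/*
			 * r_secattr was sampled above without the lock
			 * as a cheap first check; re-verify it now that
			 * r_statelock is held.
			 */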
12368 12362 if (rp->r_secattr != NULL) {
12369 12363 error = nfs4_create_getsecattr_return(
12370 12364 rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12371 12365 rp->r_attr.va_gid,
12372 12366 vp->v_type == VDIR);
12373 12367 if (!error) { /* error == 0 - Success! */
12374 12368 mutex_exit(&rp->r_statelock);
12375 12369 return (error);
12376 12370 }
12377 12371 }
12378 12372 mutex_exit(&rp->r_statelock);
12379 12373 }
12380 12374
12381 12375 /*
12382 12376 * The getattr otw call will always get both the acl, in
12383 12377 * the form of a list of nfsace4's, and the number of acl
12384 12378  * entries, independent of the value of gar.n4g_va.va_mask.
12385 12379 */
12386 12380 error = nfs4_getattr_otw(vp, &gar, cr, 1);
12387 12381 if (error) {
12388 12382 vs_ace4_destroy(&gar.n4g_vsa);
12389 12383 if (error == ENOTSUP || error == EOPNOTSUPP)
12390 12384 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12391 12385 return (error);
12392 12386 }
12393 12387
12394 12388 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
12395 12389 /*
12396 12390 * No error was returned, but according to the response
12397 12391 * bitmap, neither was an acl.
12398 12392 */
12399 12393 vs_ace4_destroy(&gar.n4g_vsa);
12400 12394 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12401 12395 return (error);
12402 12396 }
12403 12397
12404 12398 /*
12405 12399 * Update the cache with the ACL.
12406 12400 */
12407 12401 nfs4_acl_fill_cache(rp, &gar.n4g_vsa);
12408 12402
12409 12403 error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
12410 12404 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
12411 12405 vp->v_type == VDIR);
12412 12406 vs_ace4_destroy(&gar.n4g_vsa);
12413 12407 if ((error) && (vsecattr->vsa_mask &
12414 12408 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
12415 12409 (error != EACCES)) {
12416 12410 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12417 12411 }
12418 12412 return (error);
12419 12413 }
12420 12414 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12421 12415 return (error);
12422 12416 }
12423 12417
12424 12418 /*
12425 12419 * The function returns:
12426 12420 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12427 12421 * - EINVAL if the passed in "acl_mask" is an invalid request.
12428 12422 *
12429 12423 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12430 12424 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12431 12425 *
12432 12426 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12433 12427 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12434 12428 * - We have a count field set without the corresponding acl field set. (e.g. -
12435 12429 * VSA_ACECNT is set, but VSA_ACE is not)
12436 12430 */
12437 12431 static int
12438 12432 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12439 12433 {
12440 12434 /* Shortcut the masks that are always valid. */
12441 12435 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12442 12436 return (0);
12443 12437 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12444 12438 return (0);
12445 12439
12446 12440 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12447 12441 /*
12448 12442 * We can't have any VSA_ACL type stuff in the mask now.
12449 12443 */
12450 12444 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12451 12445 VSA_DFACLCNT))
12452 12446 return (EINVAL);
12453 12447
12454 12448 if (op == NFS4_ACL_SET) {
12455 12449 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12456 12450 return (EINVAL);
12457 12451 }
12458 12452 }
12459 12453
12460 12454 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12461 12455 /*
12462 12456 * We can't have any VSA_ACE type stuff in the mask now.
12463 12457 */
12464 12458 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12465 12459 return (EINVAL);
12466 12460
12467 12461 if (op == NFS4_ACL_SET) {
12468 12462 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12469 12463 return (EINVAL);
12470 12464
12471 12465 if ((acl_mask & VSA_DFACLCNT) &&
12472 12466 !(acl_mask & VSA_DFACL))
12473 12467 return (EINVAL);
12474 12468 }
12475 12469 }
12476 12470 return (0);
12477 12471 }
12478 12472
12479 12473 /*
12480 12474 * The theory behind creating the correct getsecattr return is simply this:
12481 12475 * "Don't return anything that the caller is not expecting to have to free."
12482 12476 */
12483 12477 static int
12484 12478 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12485 12479 uid_t uid, gid_t gid, int isdir)
12486 12480 {
12487 12481 int error = 0;
12488 12482 /* Save the mask since the translators modify it. */
12489 12483 uint_t orig_mask = vsap->vsa_mask;
12490 12484
12491 12485 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12492 12486 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12493 12487
12494 12488 if (error)
12495 12489 return (error);
12496 12490
12497 12491 /*
12498 12492 * If the caller only asked for the ace count (VSA_ACECNT)
12499 12493 * don't give them the full acl (VSA_ACE), free it.
12500 12494 */
12501 12495 if (!(orig_mask & VSA_ACE)) {
12502 12496 if (vsap->vsa_aclentp != NULL) {
12503 12497 kmem_free(vsap->vsa_aclentp,
12504 12498 vsap->vsa_aclcnt * sizeof (ace_t));
12505 12499 vsap->vsa_aclentp = NULL;
12506 12500 }
12507 12501 }
12508 12502 vsap->vsa_mask = orig_mask;
12509 12503
12510 12504 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12511 12505 VSA_DFACLCNT)) {
12512 12506 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12513 12507 isdir, FALSE);
12514 12508
12515 12509 if (error)
12516 12510 return (error);
12517 12511
12518 12512 /*
12519 12513 * If the caller only asked for the acl count (VSA_ACLCNT)
12520 12514 * and/or the default acl count (VSA_DFACLCNT) don't give them
12521 12515 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12522 12516 */
12523 12517 if (!(orig_mask & VSA_ACL)) {
12524 12518 if (vsap->vsa_aclentp != NULL) {
12525 12519 kmem_free(vsap->vsa_aclentp,
12526 12520 vsap->vsa_aclcnt * sizeof (aclent_t));
12527 12521 vsap->vsa_aclentp = NULL;
12528 12522 }
12529 12523 }
12530 12524
12531 12525 if (!(orig_mask & VSA_DFACL)) {
12532 12526 if (vsap->vsa_dfaclentp != NULL) {
12533 12527 kmem_free(vsap->vsa_dfaclentp,
12534 12528 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12535 12529 vsap->vsa_dfaclentp = NULL;
12536 12530 }
12537 12531 }
12538 12532 vsap->vsa_mask = orig_mask;
12539 12533 }
12540 12534 return (0);
12541 12535 }
12542 12536
12543 12537 /* ARGSUSED */
12544 12538 int
12545 12539 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
12546 12540 caller_context_t *ct)
12547 12541 {
12548 12542 int error;
12549 12543
12550 12544 if (nfs_zone() != VTOMI4(vp)->mi_zone)
12551 12545 return (EIO);
12552 12546 /*
12553 12547 * check for valid cmd parameter
12554 12548 */
12555 12549 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12556 12550 return (EINVAL);
12557 12551
12558 12552 /*
12559 12553 * Check access permissions
12560 12554 */
12561 12555 if ((cmd & F_SHARE) &&
12562 12556 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12563 12557 (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12564 12558 return (EBADF);
12565 12559
12566 12560 /*
12567 12561 * If the filesystem is mounted using local locking, pass the
12568 12562 * request off to the local share code.
12569 12563 */
12570 12564 if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12571 12565 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
12572 12566
12573 12567 switch (cmd) {
12574 12568 case F_SHARE:
12575 12569 case F_UNSHARE:
12576 12570 /*
12577 12571 * This will be properly implemented later,
12578 12572  * see RFE: 4823948.
12579 12573 */
12580 12574 error = EAGAIN;
12581 12575 break;
12582 12576
12583 12577 case F_HASREMOTELOCKS:
12584 12578 /*
12585 12579 * NFS client can't store remote locks itself
12586 12580 */
12587 12581 shr->s_access = 0;
12588 12582 error = 0;
12589 12583 break;
12590 12584
12591 12585 default:
12592 12586 error = EINVAL;
12593 12587 break;
12594 12588 }
12595 12589
12596 12590 return (error);
12597 12591 }
12598 12592
12599 12593 /*
12600 12594 * Common code called by directory ops to update the attrcache
12601 12595 */
12602 12596 static int
12603 12597 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12604 12598 hrtime_t t, vnode_t *vp, cred_t *cr)
12605 12599 {
12606 12600 int error = 0;
12607 12601
12608 12602 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12609 12603
12610 12604 if (status != NFS4_OK) {
12611 12605 /* getattr not done or failed */
12612 12606 PURGE_ATTRCACHE4(vp);
12613 12607 return (error);
12614 12608 }
12615 12609
12616 12610 if (garp) {
12617 12611 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12618 12612 } else {
12619 12613 PURGE_ATTRCACHE4(vp);
12620 12614 }
12621 12615 return (error);
12622 12616 }
12623 12617
12624 12618 /*
12625 12619 * Update directory caches for directory modification ops (link, rename, etc.)
12626 12620 * When dinfo is NULL, manage dircaches in the old way.
12627 12621 */
12628 12622 static void
12629 12623 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12630 12624 dirattr_info_t *dinfo)
12631 12625 {
12632 12626 rnode4_t *drp = VTOR4(dvp);
12633 12627
12634 12628 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12635 12629
12636 12630 /* Purge rddir cache for dir since it changed */
12637 12631 if (drp->r_dir != NULL)
12638 12632 nfs4_purge_rddir_cache(dvp);
12639 12633
12640 12634 /*
12641 12635 * If caller provided dinfo, then use it to manage dir caches.
12642 12636 */
12643 12637 if (dinfo != NULL) {
12644 12638 if (vp != NULL) {
12645 12639 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12646 12640 if (!VTOR4(vp)->created_v4) {
12647 12641 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12648 12642 dnlc_update(dvp, nm, vp);
12649 12643 } else {
12650 12644 /*
12651 12645 * XXX don't update if the created_v4 flag is
12652 12646 * set
12653 12647 */
12654 12648 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12655 12649 NFS4_DEBUG(nfs4_client_state_debug,
12656 12650 (CE_NOTE, "nfs4_update_dircaches: "
12657 12651 "don't update dnlc: created_v4 flag"));
12658 12652 }
12659 12653 }
12660 12654
12661 12655 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12662 12656 dinfo->di_cred, FALSE, cinfo);
12663 12657
12664 12658 return;
12665 12659 }
12666 12660
12667 12661 /*
12668 12662  * Caller didn't provide dinfo, so check change_info4 to update DNLC.
12669 12663 * Since caller modified dir but didn't receive post-dirmod-op dir
12670 12664 * attrs, the dir's attrs must be purged.
12671 12665 *
12672 12666 * XXX this check and dnlc update/purge should really be atomic,
12673 12667 * XXX but can't use rnode statelock because it'll deadlock in
12674 12668 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12675 12669 * XXX does occur.
12676 12670 *
12677 12671 * XXX We also may want to check that atomic is true in the
12678 12672 * XXX change_info struct. If it is not, the change_info may
12679 12673  * XXX reflect changes by more than one client, which means that
12680 12674 * XXX our cache may not be valid.
12681 12675 */
12682 12676 PURGE_ATTRCACHE4(dvp);
12683 12677 if (drp->r_change == cinfo->before) {
12684 12678 /* no changes took place in the directory prior to our link */
12685 12679 if (vp != NULL) {
12686 12680 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12687 12681 if (!VTOR4(vp)->created_v4) {
12688 12682 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12689 12683 dnlc_update(dvp, nm, vp);
12690 12684 } else {
12691 12685 /*
12692 12686  * XXX don't update if the created_v4 flag
12693 12687 * is set
12694 12688 */
12695 12689 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12696 12690 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12697 12691 "nfs4_update_dircaches: don't"
12698 12692 " update dnlc: created_v4 flag"));
12699 12693 }
12700 12694 }
12701 12695 } else {
12702 12696 /* Another client modified directory - purge its dnlc cache */
12703 12697 dnlc_purge_vp(dvp);
12704 12698 }
12705 12699 }
12706 12700
12707 12701 /*
12708 12702 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12709 12703 * file.
12710 12704 *
12711 12705 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12712 12706 * file (ie: client recovery) and otherwise set to FALSE.
12713 12707 *
12714 12708 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12715 12709 * initiated) calling functions.
12716 12710 *
12717 12711  * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result
12718 12712 * of resending a 'lost' open request.
12719 12713 *
12720 12714 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12721 12715 * server that hands out BAD_SEQID on open confirm.
12722 12716 *
12723 12717 * Errors are returned via the nfs4_error_t parameter.
12724 12718 */
12725 12719 void
12726 12720 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
12727 12721 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
12728 12722 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
12729 12723 {
12730 12724 COMPOUND4args_clnt args;
12731 12725 COMPOUND4res_clnt res;
12732 12726 nfs_argop4 argop[2];
12733 12727 nfs_resop4 *resop;
12734 12728 int doqueue = 1;
12735 12729 mntinfo4_t *mi;
12736 12730 OPEN_CONFIRM4args *open_confirm_args;
12737 12731 int needrecov;
12738 12732
12739 12733 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12740 12734 #if DEBUG
12741 12735 mutex_enter(&oop->oo_lock);
12742 12736 ASSERT(oop->oo_seqid_inuse);
12743 12737 mutex_exit(&oop->oo_lock);
12744 12738 #endif
12745 12739
12746 12740 recov_retry_confirm:
12747 12741 nfs4_error_zinit(ep);
12748 12742 *retry_open = FALSE;
12749 12743
12750 12744 if (resend)
12751 12745 args.ctag = TAG_OPEN_CONFIRM_LOST;
12752 12746 else
12753 12747 args.ctag = TAG_OPEN_CONFIRM;
12754 12748
12755 12749 args.array_len = 2;
12756 12750 args.array = argop;
12757 12751
12758 12752 /* putfh target fh */
12759 12753 argop[0].argop = OP_CPUTFH;
12760 12754 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
12761 12755
12762 12756 argop[1].argop = OP_OPEN_CONFIRM;
12763 12757 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;
12764 12758
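	/*
	 * OPEN_CONFIRM consumes an open-owner sequence id, so advance
	 * the caller's seqid before building the request.
	 */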
12765 12759 (*seqid) += 1;
12766 12760 open_confirm_args->seqid = *seqid;
12767 12761 open_confirm_args->open_stateid = *stateid;
12768 12762
12769 12763 mi = VTOMI4(vp);
12770 12764
12771 12765 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
12772 12766
12773 12767 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
12774 12768 nfs4_set_open_seqid((*seqid), oop, args.ctag);
12775 12769 }
12776 12770
12777 12771 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
12778 12772 if (!needrecov && ep->error)
12779 12773 return;
12780 12774
12781 12775 if (needrecov) {
12782 12776 bool_t abort = FALSE;
12783 12777
12784 12778 if (reopening_file == FALSE) {
12785 12779 nfs4_bseqid_entry_t *bsep = NULL;
12786 12780
12787 12781 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
12788 12782 bsep = nfs4_create_bseqid_entry(oop, NULL,
12789 12783 vp, 0, args.ctag,
12790 12784 open_confirm_args->seqid);
12791 12785
12792 12786 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
12793 12787 NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
12794 12788 if (bsep) {
12795 12789 kmem_free(bsep, sizeof (*bsep));
12796 12790 if (num_bseqid_retryp &&
12797 12791 --(*num_bseqid_retryp) == 0)
12798 12792 abort = TRUE;
12799 12793 }
12800 12794 }
12801 12795 if ((ep->error == ETIMEDOUT ||
12802 12796 res.status == NFS4ERR_RESOURCE) &&
12803 12797 abort == FALSE && resend == FALSE) {
12804 12798 if (!ep->error)
12805 12799 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12806 12800
12807 12801 delay(SEC_TO_TICK(confirm_retry_sec));
12808 12802 goto recov_retry_confirm;
12809 12803 }
12810 12804 /* State may have changed so retry the entire OPEN op */
12811 12805 if (abort == FALSE)
12812 12806 *retry_open = TRUE;
12813 12807 else
12814 12808 *retry_open = FALSE;
12815 12809 if (!ep->error)
12816 12810 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12817 12811 return;
12818 12812 }
12819 12813
12820 12814 if (res.status) {
12821 12815 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12822 12816 return;
12823 12817 }
12824 12818
12825 12819 resop = &res.array[1]; /* open confirm res */
12826 12820 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
12827 12821 stateid, sizeof (*stateid));
12828 12822
12829 12823 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12830 12824 }
12831 12825
12832 12826 /*
12833 12827 * Return the credentials associated with a client state object. The
12834 12828 * caller is responsible for freeing the credentials.
12835 12829 */
12836 12830
12837 12831 static cred_t *
12838 12832 state_to_cred(nfs4_open_stream_t *osp)
12839 12833 {
12840 12834 cred_t *cr;
12841 12835
12842 12836 /*
12843 12837 * It's ok to not lock the open stream and open owner to get
12844 12838 * the oo_cred since this is only written once (upon creation)
12845 12839 * and will not change.
12846 12840 */
12847 12841 cr = osp->os_open_owner->oo_cred;
12848 12842 crhold(cr);
12849 12843
12850 12844 return (cr);
12851 12845 }
12852 12846
12853 12847 /*
12854 12848 * nfs4_find_sysid
12855 12849 *
12856 12850 * Find the sysid for the knetconfig associated with the given mi.
12857 12851 */
12858 12852 static struct lm_sysid *
12859 12853 nfs4_find_sysid(mntinfo4_t *mi)
12860 12854 {
12861 12855 ASSERT(nfs_zone() == mi->mi_zone);
12862 12856
12863 12857 /*
12864 12858 * Switch from RDMA knconf to original mount knconf
12865 12859 */
12866 12860 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12867 12861 mi->mi_curr_serv->sv_hostname, NULL));
12868 12862 }
12869 12863
12870 12864 #ifdef DEBUG
12871 12865 /*
12872 12866 * Return a string version of the call type for easy reading.
12873 12867 */
12874 12868 static char *
12875 12869 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
12876 12870 {
12877 12871 switch (ctype) {
12878 12872 case NFS4_LCK_CTYPE_NORM:
12879 12873 return ("NORMAL");
12880 12874 case NFS4_LCK_CTYPE_RECLAIM:
12881 12875 return ("RECLAIM");
12882 12876 case NFS4_LCK_CTYPE_RESEND:
12883 12877 return ("RESEND");
12884 12878 case NFS4_LCK_CTYPE_REINSTATE:
12885 12879 return ("REINSTATE");
12886 12880 default:
12887 12881 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
12888 12882 "type %d", ctype);
12889 12883 return ("");
12890 12884 }
12891 12885 }
12892 12886 #endif
12893 12887
12894 12888 /*
12895 12889 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12896 12890 * Unlock requests don't have an over-the-wire locktype, so we just return
12897 12891 * something non-threatening.
12898 12892 */
12899 12893
12900 12894 static nfs_lock_type4
12901 12895 flk_to_locktype(int cmd, int l_type)
12902 12896 {
12903 12897 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12904 12898
12905 12899 switch (l_type) {
12906 12900 case F_UNLCK:
12907 12901 return (READ_LT);
12908 12902 case F_RDLCK:
12909 12903 if (cmd == F_SETLK)
12910 12904 return (READ_LT);
12911 12905 else
12912 12906 return (READW_LT);
12913 12907 case F_WRLCK:
12914 12908 if (cmd == F_SETLK)
12915 12909 return (WRITE_LT);
12916 12910 else
12917 12911 return (WRITEW_LT);
12918 12912 }
12919 12913 panic("flk_to_locktype");
12920 12914 /*NOTREACHED*/
12921 12915 }
12922 12916
12923 12917 /*
12924 12918 * Do some preliminary checks for nfs4frlock.
12925 12919 */
12926 12920 static int
12927 12921 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12928 12922 u_offset_t offset)
12929 12923 {
12930 12924 int error = 0;
12931 12925
12932 12926 /*
12933 12927 * If we are setting a lock, check that the file is opened
12934 12928 * with the correct mode.
12935 12929 */
12936 12930 if (cmd == F_SETLK || cmd == F_SETLKW) {
12937 12931 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12938 12932 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12939 12933 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 12934 "nfs4frlock_validate_args: file was opened with "
12941 12935 "incorrect mode"));
12942 12936 return (EBADF);
12943 12937 }
12944 12938 }
12945 12939
12946 12940 /* Convert the offset. It may need to be restored before returning. */
12947 12941 if (error = convoff(vp, flk, 0, offset)) {
12948 12942 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12949 12943 "nfs4frlock_validate_args: convoff => error= %d\n",
12950 12944 error));
12951 12945 return (error);
12952 12946 }
12953 12947
12954 12948 return (error);
12955 12949 }
12956 12950
12957 12951 /*
12958 12952 * Set the flock64's lm_sysid for nfs4frlock.
12959 12953 */
12960 12954 static int
12961 12955 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12962 12956 {
12963 12957 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12964 12958
12965 12959 /* Find the lm_sysid */
12966 12960 *lspp = nfs4_find_sysid(VTOMI4(vp));
12967 12961
12968 12962 if (*lspp == NULL) {
12969 12963 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12970 12964 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12971 12965 return (ENOLCK);
12972 12966 }
12973 12967
12974 12968 flk->l_sysid = lm_sysidt(*lspp);
12975 12969
12976 12970 return (0);
12977 12971 }
12978 12972
12979 12973 /*
12980 12974 * Do the remaining preliminary setup for nfs4frlock.
12981 12975 */
12982 12976 static void
12983 12977 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12984 12978 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12985 12979 cred_t **cred_otw)
12986 12980 {
12987 12981 /*
12988 12982 * set tick_delay to the base delay time.
12989 12983 * (NFS4_BASE_WAIT_TIME is in secs)
12990 12984 */
12991 12985
12992 12986 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12993 12987
12994 12988 /*
12995 12989 * If lock is relative to EOF, we need the newest length of the
12996 12990 * file. Therefore invalidate the ATTR_CACHE.
12997 12991 */
12998 12992
12999 12993 *whencep = flk->l_whence;
13000 12994
13001 12995 if (*whencep == 2) /* SEEK_END */
13002 12996 PURGE_ATTRCACHE4(vp);
13003 12997
13004 12998 recov_statep->rs_flags = 0;
13005 12999 recov_statep->rs_num_retry_despite_err = 0;
13006 13000 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
13007 13001 }
13008 13002
13009 13003 /*
13010 13004 * Initialize and allocate the data structures necessary for
13011 13005 * the nfs4frlock call.
13012 13006 * Allocates argsp's op array.
13013 13007 */
13014 13008 static void
13015 13009 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
13016 13010 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
13017 13011 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
13018 13012 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
13019 13013 {
13020 13014 int argoplist_size;
13021 13015 int num_ops = 2;
13022 13016
13023 13017 *retry = FALSE;
13024 13018 *did_start_fop = FALSE;
13025 13019 *skip_get_err = FALSE;
13026 13020 lost_rqstp->lr_op = 0;
13027 13021 argoplist_size = num_ops * sizeof (nfs_argop4);
13028 13022 /* fill array with zero */
13029 13023 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13030 13024
13031 13025 *argspp = argsp;
13032 13026 *respp = NULL;
13033 13027
13034 13028 argsp->array_len = num_ops;
13035 13029 argsp->array = *argopp;
13036 13030
13037 13031 /* initialize in case of error; will get real value down below */
13038 13032 argsp->ctag = TAG_NONE;
13039 13033
13040 13034 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13041 13035 *op_hintp = OH_LOCKU;
13042 13036 else
13043 13037 *op_hintp = OH_OTHER;
13044 13038 }
13045 13039
13046 13040 /*
13047 13041  * Call nfs4_start_fop() for nfs4frlock, if necessary. Assign
13048 13042 * the proper nfs4_server_t for this instance of nfs4frlock.
13049 13043 * Returns 0 (success) or an errno value.
13050 13044 */
13051 13045 static int
13052 13046 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13053 13047 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13054 13048 bool_t *did_start_fop, bool_t *startrecovp)
13055 13049 {
13056 13050 int error = 0;
13057 13051 rnode4_t *rp;
13058 13052
13059 13053 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13060 13054
13061 13055 if (ctype == NFS4_LCK_CTYPE_NORM) {
13062 13056 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13063 13057 recov_statep, startrecovp);
13064 13058 if (error)
13065 13059 return (error);
13066 13060 *did_start_fop = TRUE;
13067 13061 } else {
13068 13062 *did_start_fop = FALSE;
13069 13063 *startrecovp = FALSE;
13070 13064 }
13071 13065
13072 13066 if (!error) {
13073 13067 rp = VTOR4(vp);
13074 13068
13075 13069 /* If the file failed recovery, just quit. */
13076 13070 mutex_enter(&rp->r_statelock);
13077 13071 if (rp->r_flags & R4RECOVERR) {
13078 13072 error = EIO;
13079 13073 }
13080 13074 mutex_exit(&rp->r_statelock);
13081 13075 }
13082 13076
13083 13077 return (error);
13084 13078 }
13085 13079
13086 13080 /*
13087 13081 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
13088 13082 * resend nfs4frlock call is initiated by the recovery framework.
13089 13083 * Acquires the lop and oop seqid synchronization.
13090 13084 */
13091 13085 static void
13092 13086 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
13093 13087 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
13094 13088 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13095 13089 LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
13096 13090 {
13097 13091 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
13098 13092 int error;
13099 13093
13100 13094 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
13101 13095 (CE_NOTE,
13102 13096 "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
13103 13097 ASSERT(resend_rqstp != NULL);
13104 13098 ASSERT(resend_rqstp->lr_op == OP_LOCK ||
13105 13099 resend_rqstp->lr_op == OP_LOCKU);
13106 13100
13107 13101 *oopp = resend_rqstp->lr_oop;
13108 13102 if (resend_rqstp->lr_oop) {
13109 13103 open_owner_hold(resend_rqstp->lr_oop);
13110 13104 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
13111 13105 ASSERT(error == 0); /* recov thread always succeeds */
13112 13106 }
13113 13107
13114 13108 /* Must resend this lost lock/locku request. */
13115 13109 ASSERT(resend_rqstp->lr_lop != NULL);
13116 13110 *lopp = resend_rqstp->lr_lop;
13117 13111 lock_owner_hold(resend_rqstp->lr_lop);
13118 13112 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
13119 13113 ASSERT(error == 0); /* recov thread always succeeds */
13120 13114
13121 13115 *ospp = resend_rqstp->lr_osp;
13122 13116 if (*ospp)
13123 13117 open_stream_hold(resend_rqstp->lr_osp);
13124 13118
13125 13119 if (resend_rqstp->lr_op == OP_LOCK) {
13126 13120 LOCK4args *lock_args;
13127 13121
13128 13122 argop->argop = OP_LOCK;
13129 13123 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
13130 13124 lock_args->locktype = resend_rqstp->lr_locktype;
13131 13125 lock_args->reclaim =
13132 13126 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
13133 13127 lock_args->offset = resend_rqstp->lr_flk->l_start;
13134 13128 lock_args->length = resend_rqstp->lr_flk->l_len;
13135 13129 if (lock_args->length == 0)
13136 13130 lock_args->length = ~lock_args->length;
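		/*
		 * A POSIX l_len of 0 means "lock to EOF"; NFSv4 encodes
		 * that as a length of all ones (RFC 7530), hence the
		 * complement of zero above.
		 */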
13137 13131 nfs4_setup_lock_args(*lopp, *oopp, *ospp,
13138 13132 mi2clientid(mi), &lock_args->locker);
13139 13133
13140 13134 switch (resend_rqstp->lr_ctype) {
13141 13135 case NFS4_LCK_CTYPE_RESEND:
13142 13136 argsp->ctag = TAG_LOCK_RESEND;
13143 13137 break;
13144 13138 case NFS4_LCK_CTYPE_REINSTATE:
13145 13139 argsp->ctag = TAG_LOCK_REINSTATE;
13146 13140 break;
13147 13141 case NFS4_LCK_CTYPE_RECLAIM:
13148 13142 argsp->ctag = TAG_LOCK_RECLAIM;
13149 13143 break;
13150 13144 default:
13151 13145 argsp->ctag = TAG_LOCK_UNKNOWN;
13152 13146 break;
13153 13147 }
13154 13148 } else {
13155 13149 LOCKU4args *locku_args;
13156 13150 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
13157 13151
13158 13152 argop->argop = OP_LOCKU;
13159 13153 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
13160 13154 locku_args->locktype = READ_LT;
13161 13155 locku_args->seqid = lop->lock_seqid + 1;
13162 13156 mutex_enter(&lop->lo_lock);
13163 13157 locku_args->lock_stateid = lop->lock_stateid;
13164 13158 mutex_exit(&lop->lo_lock);
13165 13159 locku_args->offset = resend_rqstp->lr_flk->l_start;
13166 13160 locku_args->length = resend_rqstp->lr_flk->l_len;
13167 13161 if (locku_args->length == 0)
13168 13162 locku_args->length = ~locku_args->length;
13169 13163
13170 13164 switch (resend_rqstp->lr_ctype) {
13171 13165 case NFS4_LCK_CTYPE_RESEND:
13172 13166 argsp->ctag = TAG_LOCKU_RESEND;
13173 13167 break;
13174 13168 case NFS4_LCK_CTYPE_REINSTATE:
13175 13169 argsp->ctag = TAG_LOCKU_REINSTATE;
13176 13170 break;
13177 13171 default:
13178 13172 argsp->ctag = TAG_LOCK_UNKNOWN;
13179 13173 break;
13180 13174 }
13181 13175 }
13182 13176 }
13183 13177
13184 13178 /*
13185 13179 * Setup the LOCKT4 arguments.
13186 13180 */
13187 13181 static void
13188 13182 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13189 13183 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13190 13184 rnode4_t *rp)
13191 13185 {
13192 13186 LOCKT4args *lockt_args;
13193 13187
13194 13188 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13195 13189 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13196 13190 argop->argop = OP_LOCKT;
13197 13191 argsp->ctag = TAG_LOCKT;
13198 13192 lockt_args = &argop->nfs_argop4_u.oplockt;
13199 13193
13200 13194 /*
13201 13195 * The locktype will be READ_LT unless it's
13202 13196 * a write lock. We do this because the Solaris
13203 13197 * system call allows the combination of
13204 13198		 * F_UNLCK and F_GETLK*, in which case the
13205 13199 * unlock is mapped to a read.
13206 13200 */
13207 13201 if (flk->l_type == F_WRLCK)
13208 13202 lockt_args->locktype = WRITE_LT;
13209 13203 else
13210 13204 lockt_args->locktype = READ_LT;
13211 13205
13212 13206 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13213 13207 /* set the lock owner4 args */
13214 13208 nfs4_setlockowner_args(&lockt_args->owner, rp,
13215 13209 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13216 13210 flk->l_pid);
13217 13211 lockt_args->offset = flk->l_start;
13218 13212 lockt_args->length = flk->l_len;
13219 13213 if (flk->l_len == 0)
13220 13214 lockt_args->length = ~lockt_args->length;
13221 13215
13222 13216 *lockt_argsp = lockt_args;
13223 13217 }
13224 13218
13225 13219 /*
13226 13220 * If the client is holding a delegation, and the open stream to be used
13227 13221 * with this lock request is a delegation open stream, then re-open the stream.
13228 13222 * Sets the nfs4_error_t to all zeros unless the open stream has already
13229 13223 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
13230 13224 * means the caller should retry (like a recovery retry).
13231 13225 */
13232 13226 static void
13233 13227 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
13234 13228 {
13235 13229 open_delegation_type4 dt;
13236 13230 bool_t reopen_needed, force;
13237 13231 nfs4_open_stream_t *osp;
13238 13232 open_claim_type4 oclaim;
13239 13233 rnode4_t *rp = VTOR4(vp);
13240 13234 mntinfo4_t *mi = VTOMI4(vp);
13241 13235
13242 13236 ASSERT(nfs_zone() == mi->mi_zone);
13243 13237
13244 13238 nfs4_error_zinit(ep);
13245 13239
13246 13240 mutex_enter(&rp->r_statev4_lock);
13247 13241 dt = rp->r_deleg_type;
13248 13242 mutex_exit(&rp->r_statev4_lock);
13249 13243
13250 13244 if (dt != OPEN_DELEGATE_NONE) {
13251 13245 nfs4_open_owner_t *oop;
13252 13246
13253 13247 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
13254 13248 if (!oop) {
13255 13249 ep->stat = NFS4ERR_IO;
13256 13250 return;
13257 13251 }
13258 13252 /* returns with 'os_sync_lock' held */
13259 13253 osp = find_open_stream(oop, rp);
13260 13254 if (!osp) {
13261 13255 open_owner_rele(oop);
13262 13256 ep->stat = NFS4ERR_IO;
13263 13257 return;
13264 13258 }
13265 13259
13266 13260 if (osp->os_failed_reopen) {
13267 13261 NFS4_DEBUG((nfs4_open_stream_debug ||
13268 13262 nfs4_client_lock_debug), (CE_NOTE,
13269 13263 "nfs4frlock_check_deleg: os_failed_reopen set "
13270 13264 "for osp %p, cr %p, rp %s", (void *)osp,
13271 13265 (void *)cr, rnode4info(rp)));
13272 13266 mutex_exit(&osp->os_sync_lock);
13273 13267 open_stream_rele(osp, rp);
13274 13268 open_owner_rele(oop);
13275 13269 ep->stat = NFS4ERR_IO;
13276 13270 return;
13277 13271 }
13278 13272
13279 13273 /*
13280 13274 * Determine whether a reopen is needed. If this
13281 13275 * is a delegation open stream, then send the open
13282 13276 * to the server to give visibility to the open owner.
13283 13277 * Even if it isn't a delegation open stream, we need
13284 13278 * to check if the previous open CLAIM_DELEGATE_CUR
13285 13279 * was sufficient.
13286 13280 */
13287 13281
13288 13282 reopen_needed = osp->os_delegation ||
13289 13283 ((lt == F_RDLCK &&
13290 13284 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
13291 13285 (lt == F_WRLCK &&
13292 13286 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
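		/*
		 * That is: reopen if this is a delegation open stream, or
		 * if the earlier CLAIM_DELEGATE_CUR open didn't grant the
		 * access mode (read vs. write) this lock type requires.
		 */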
13293 13287
13294 13288 mutex_exit(&osp->os_sync_lock);
13295 13289 open_owner_rele(oop);
13296 13290
13297 13291 if (reopen_needed) {
13298 13292 /*
13299 13293 * Always use CLAIM_PREVIOUS after server reboot.
13300 13294 * The server will reject CLAIM_DELEGATE_CUR if
13301 13295 * it is used during the grace period.
13302 13296 */
13303 13297 mutex_enter(&mi->mi_lock);
13304 13298 if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
13305 13299 oclaim = CLAIM_PREVIOUS;
13306 13300 force = TRUE;
13307 13301 } else {
13308 13302 oclaim = CLAIM_DELEGATE_CUR;
13309 13303 force = FALSE;
13310 13304 }
13311 13305 mutex_exit(&mi->mi_lock);
13312 13306
13313 13307 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
13314 13308 if (ep->error == EAGAIN) {
13315 13309 nfs4_error_zinit(ep);
13316 13310 ep->stat = NFS4ERR_DELAY;
13317 13311 }
13318 13312 }
13319 13313 open_stream_rele(osp, rp);
13320 13314 osp = NULL;
13321 13315 }
13322 13316 }
13323 13317
13324 13318 /*
13325 13319 * Setup the LOCKU4 arguments.
13326 13320 * Returns errors via the nfs4_error_t.
13327 13321 * NFS4_OK no problems. *go_otwp is TRUE if call should go
13328 13322 * over-the-wire. The caller must release the
13329 13323 * reference on *lopp.
13330 13324 * NFS4ERR_DELAY caller should retry (like recovery retry)
13331 13325 * (other) unrecoverable error.
13332 13326 */
13333 13327 static void
13334 13328 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13335 13329 LOCKU4args **locku_argsp, flock64_t *flk,
13336 13330 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13337 13331 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13338 13332 bool_t *skip_get_err, bool_t *go_otwp)
13339 13333 {
13340 13334 nfs4_lock_owner_t *lop = NULL;
13341 13335 LOCKU4args *locku_args;
13342 13336 pid_t pid;
13343 13337 bool_t is_spec = FALSE;
13344 13338 rnode4_t *rp = VTOR4(vp);
13345 13339
13346 13340 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13347 13341 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13348 13342
13349 13343 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13350 13344 if (ep->error || ep->stat)
13351 13345 return;
13352 13346
13353 13347 argop->argop = OP_LOCKU;
13354 13348 if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13355 13349 argsp->ctag = TAG_LOCKU_REINSTATE;
13356 13350 else
13357 13351 argsp->ctag = TAG_LOCKU;
13358 13352 locku_args = &argop->nfs_argop4_u.oplocku;
13359 13353 *locku_argsp = locku_args;
13360 13354
13361 13355		/* locktype must be set to some legal value, though LOCKU ignores it */
13362 13356 locku_args->locktype = READ_LT;
13363 13357
13364 13358 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13365 13359 flk->l_pid;
13366 13360
13367 13361 /*
13368 13362 * Get the lock owner stateid. If no lock owner
13369 13363 * exists, return success.
13370 13364 */
13371 13365 lop = find_lock_owner(rp, pid, LOWN_ANY);
13372 13366 *lopp = lop;
13373 13367 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13374 13368 is_spec = TRUE;
13375 13369 if (!lop || is_spec) {
13376 13370 /*
13377 13371 * No lock owner so no locks to unlock.
13378 13372 * Return success. If there was a failed
13379 13373 * reclaim earlier, the lock might still be
13380 13374 * registered with the local locking code,
13381 13375 * so notify it of the unlock.
13382 13376 *
13383 13377 * If the lockowner is using a special stateid,
13384 13378 * then the original lock request (that created
13385 13379 * this lockowner) was never successful, so we
13386 13380 * have no lock to undo OTW.
13387 13381 */
13388 13382 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13389 13383 "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13390 13384 "(%ld) so return success", (long)pid));
13391 13385
13392 13386 if (ctype == NFS4_LCK_CTYPE_NORM)
13393 13387 flk->l_pid = curproc->p_pid;
13394 13388 nfs4_register_lock_locally(vp, flk, flag, offset);
13395 13389 /*
13396 13390 * Release our hold and NULL out so final_cleanup
13397 13391 * doesn't try to end a lock seqid sync we
13398 13392 * never started.
13399 13393 */
13400 13394 if (is_spec) {
13401 13395 lock_owner_rele(lop);
13402 13396 *lopp = NULL;
13403 13397 }
13404 13398 *skip_get_err = TRUE;
13405 13399 *go_otwp = FALSE;
13406 13400 return;
13407 13401 }
13408 13402
13409 13403 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13410 13404 if (ep->error == EAGAIN) {
13411 13405 lock_owner_rele(lop);
13412 13406 *lopp = NULL;
13413 13407 return;
13414 13408 }
13415 13409
13416 13410 mutex_enter(&lop->lo_lock);
13417 13411 locku_args->lock_stateid = lop->lock_stateid;
13418 13412 mutex_exit(&lop->lo_lock);
13419 13413 locku_args->seqid = lop->lock_seqid + 1;
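	/*
	 * Lock-owner seqids advance by exactly one per accepted request,
	 * so we send lock_seqid + 1 here; nfs4frlock_bump_seqid()
	 * commits the new value once the server has consumed this seqid.
	 */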
13420 13414
13421 13415 /* leave the ref count on lop, rele after RPC call */
13422 13416
13423 13417 locku_args->offset = flk->l_start;
13424 13418 locku_args->length = flk->l_len;
13425 13419 if (flk->l_len == 0)
13426 13420 locku_args->length = ~locku_args->length;
13427 13421
13428 13422 *go_otwp = TRUE;
13429 13423 }
13430 13424
13431 13425 /*
13432 13426 * Setup the LOCK4 arguments.
13433 13427 *
13434 13428 * Returns errors via the nfs4_error_t.
13435 13429 * NFS4_OK no problems
13436 13430 * NFS4ERR_DELAY caller should retry (like recovery retry)
13437 13431 * (other) unrecoverable error
13438 13432 */
13439 13433 static void
13440 13434 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13441 13435 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13442 13436 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13443 13437 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13444 13438 {
13445 13439 LOCK4args *lock_args;
13446 13440 nfs4_open_owner_t *oop = NULL;
13447 13441 nfs4_open_stream_t *osp = NULL;
13448 13442 nfs4_lock_owner_t *lop = NULL;
13449 13443 pid_t pid;
13450 13444 rnode4_t *rp = VTOR4(vp);
13451 13445
13452 13446 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13453 13447
13454 13448 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13455 13449 if (ep->error || ep->stat != NFS4_OK)
13456 13450 return;
13457 13451
13458 13452 argop->argop = OP_LOCK;
13459 13453 if (ctype == NFS4_LCK_CTYPE_NORM)
13460 13454 argsp->ctag = TAG_LOCK;
13461 13455 else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13462 13456 argsp->ctag = TAG_RELOCK;
13463 13457 else
13464 13458 argsp->ctag = TAG_LOCK_REINSTATE;
13465 13459 lock_args = &argop->nfs_argop4_u.oplock;
13466 13460 lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13467 13461 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
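	/*
	 * reclaim == TRUE marks this LOCK as re-establishing a lock held
	 * before a server reboot, which is only valid during the
	 * server's grace period.
	 */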
13468 13462 /*
13469 13463 * Get the lock owner. If no lock owner exists,
13470 13464 * create a 'temporary' one and grab the open seqid
13471 13465 * synchronization (which puts a hold on the open
13472 13466 * owner and open stream).
13473 13467 * This also grabs the lock seqid synchronization.
13474 13468 */
13475 13469 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13476 13470 ep->stat =
13477 13471 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13478 13472
13479 13473 if (ep->stat != NFS4_OK)
13480 13474 goto out;
13481 13475
13482 13476 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13483 13477 &lock_args->locker);
13484 13478
13485 13479 lock_args->offset = flk->l_start;
13486 13480 lock_args->length = flk->l_len;
13487 13481 if (flk->l_len == 0)
13488 13482 lock_args->length = ~lock_args->length;
13489 13483 *lock_argsp = lock_args;
13490 13484 out:
13491 13485 *oopp = oop;
13492 13486 *ospp = osp;
13493 13487 *lopp = lop;
13494 13488 }
13495 13489
13496 13490 /*
13497 13491 * After we get the reply from the server, record the proper information
13498 13492 * for possible resend lock requests.
13499 13493 */
13500 13494 static void
13501 13495 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13502 13496 nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13503 13497 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13504 13498 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13505 13499 {
13506 13500 bool_t unlock = (flk->l_type == F_UNLCK);
13507 13501
13508 13502 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13509 13503 ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13510 13504 ctype == NFS4_LCK_CTYPE_REINSTATE);
13511 13505
13512 13506 if (error != 0 && !unlock) {
13513 13507 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13514 13508 nfs4_client_lock_debug), (CE_NOTE,
13515 13509 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13516 13510		    "for lop %p", (void *)lop));
13517 13511 ASSERT(lop != NULL);
13518 13512 mutex_enter(&lop->lo_lock);
13519 13513 lop->lo_pending_rqsts = 1;
13520 13514 mutex_exit(&lop->lo_lock);
13521 13515 }
13522 13516
13523 13517 lost_rqstp->lr_putfirst = FALSE;
13524 13518 lost_rqstp->lr_op = 0;
13525 13519
13526 13520 /*
13527 13521 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13528 13522 * recovery purposes so that the lock request that was sent
13529 13523 * can be saved and re-issued later. Ditto for EIO from a forced
13530 13524 * unmount. This is done to have the client's local locking state
13531 13525 * match the v4 server's state; that is, the request was
13532 13526 * potentially received and accepted by the server but the client
13533 13527 * thinks it was not.
13534 13528 */
13535 13529 if (error == ETIMEDOUT || error == EINTR ||
13536 13530 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13537 13531 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13538 13532 nfs4_client_lock_debug), (CE_NOTE,
13539 13533 "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13540 13534 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13541 13535 (void *)lop, (void *)oop, (void *)osp));
13542 13536 if (unlock)
13543 13537 lost_rqstp->lr_op = OP_LOCKU;
13544 13538 else {
13545 13539 lost_rqstp->lr_op = OP_LOCK;
13546 13540 lost_rqstp->lr_locktype = locktype;
13547 13541 }
13548 13542 /*
13549 13543 * Objects are held and rele'd via the recovery code.
13550 13544 * See nfs4_save_lost_rqst.
13551 13545 */
13552 13546 lost_rqstp->lr_vp = vp;
13553 13547 lost_rqstp->lr_dvp = NULL;
13554 13548 lost_rqstp->lr_oop = oop;
13555 13549 lost_rqstp->lr_osp = osp;
13556 13550 lost_rqstp->lr_lop = lop;
13557 13551 lost_rqstp->lr_cr = cr;
13558 13552 switch (ctype) {
13559 13553 case NFS4_LCK_CTYPE_NORM:
13560 13554 flk->l_pid = ttoproc(curthread)->p_pid;
13561 13555 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13562 13556 break;
13563 13557 case NFS4_LCK_CTYPE_REINSTATE:
13564 13558 lost_rqstp->lr_putfirst = TRUE;
13565 13559 lost_rqstp->lr_ctype = ctype;
13566 13560 break;
13567 13561 default:
13568 13562 break;
13569 13563 }
13570 13564 lost_rqstp->lr_flk = flk;
13571 13565 }
13572 13566 }
13573 13567
13574 13568 /*
13575 13569 * Update lop's seqid. Also update the seqid stored in a resend request,
13576 13570 * if any. (Some recovery errors increment the seqid, and we may have to
13577 13571 * send the resend request again.)
13578 13572 */
13579 13573
13580 13574 static void
13581 13575 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13582 13576 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13583 13577 {
13584 13578 if (lock_args) {
13585 13579 if (lock_args->locker.new_lock_owner == TRUE)
13586 13580 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13587 13581 else {
13588 13582 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13589 13583 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13590 13584 }
13591 13585 } else if (locku_args) {
13592 13586 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13593 13587			nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13594 13588 }
13595 13589 }
13596 13590
13597 13591 /*
13598 13592 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13599 13593 * COMPOUND4 args/res for calls that need to retry.
13600 13594 * Switches the *cred_otwp to base_cr.
13601 13595 */
13602 13596 static void
13603 13597 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13604 13598 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13605 13599 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13606 13600 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13607 13601 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13608 13602 {
13609 13603 nfs4_open_owner_t *oop = *oopp;
13610 13604 nfs4_open_stream_t *osp = *ospp;
13611 13605 nfs4_lock_owner_t *lop = *lopp;
13612 13606 nfs_argop4 *argop = (*argspp)->array;
13613 13607
13614 13608 if (*did_start_fop) {
13615 13609 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13616 13610 needrecov);
13617 13611 *did_start_fop = FALSE;
13618 13612 }
13619 13613 ASSERT((*argspp)->array_len == 2);
13620 13614 if (argop[1].argop == OP_LOCK)
13621 13615 nfs4args_lock_free(&argop[1]);
13622 13616 else if (argop[1].argop == OP_LOCKT)
13623 13617 nfs4args_lockt_free(&argop[1]);
13624 13618 kmem_free(argop, 2 * sizeof (nfs_argop4));
13625 13619 if (!error)
13626 13620 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13627 13621 *argspp = NULL;
13628 13622 *respp = NULL;
13629 13623
13630 13624 if (lop) {
13631 13625 nfs4_end_lock_seqid_sync(lop);
13632 13626 lock_owner_rele(lop);
13633 13627 *lopp = NULL;
13634 13628 }
13635 13629
13636 13630 /* need to free up the reference on osp for lock args */
13637 13631 if (osp != NULL) {
13638 13632 open_stream_rele(osp, VTOR4(vp));
13639 13633 *ospp = NULL;
13640 13634 }
13641 13635
13642 13636 /* need to free up the reference on oop for lock args */
13643 13637 if (oop != NULL) {
13644 13638 nfs4_end_open_seqid_sync(oop);
13645 13639 open_owner_rele(oop);
13646 13640 *oopp = NULL;
13647 13641 }
13648 13642
13649 13643 crfree(*cred_otwp);
13650 13644 *cred_otwp = base_cr;
13651 13645 crhold(*cred_otwp);
13652 13646 }
13653 13647
13654 13648 /*
13655 13649 * Function to process the client's recovery for nfs4frlock.
13656 13650 * Returns TRUE if we should retry the lock request; FALSE otherwise.
13657 13651 *
13658 13652 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13659 13653 * COMPOUND4 args/res for calls that need to retry.
13660 13654 *
13661 13655 * Note: the rp's r_lkserlock is *not* dropped during this path.
13662 13656 */
13663 13657 static bool_t
13664 13658 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13665 13659 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13666 13660 LOCK4args *lock_args, LOCKU4args *locku_args,
13667 13661 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13668 13662 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13669 13663 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13670 13664 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13671 13665 {
13672 13666 nfs4_open_owner_t *oop = *oopp;
13673 13667 nfs4_open_stream_t *osp = *ospp;
13674 13668 nfs4_lock_owner_t *lop = *lopp;
13675 13669
13676 13670 bool_t abort, retry;
13677 13671
13678 13672 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13679 13673 ASSERT((*argspp) != NULL);
13680 13674 ASSERT((*respp) != NULL);
13681 13675 if (lock_args || locku_args)
13682 13676 ASSERT(lop != NULL);
13683 13677
13684 13678 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13685 13679 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13686 13680
13687 13681 retry = TRUE;
13688 13682 abort = FALSE;
13689 13683 if (needrecov) {
13690 13684 nfs4_bseqid_entry_t *bsep = NULL;
13691 13685 nfs_opnum4 op;
13692 13686
13693 13687 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13694 13688
13695 13689 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13696 13690 seqid4 seqid;
13697 13691
13698 13692 if (lock_args) {
13699 13693 if (lock_args->locker.new_lock_owner == TRUE)
13700 13694 seqid = lock_args->locker.locker4_u.
13701 13695 open_owner.open_seqid;
13702 13696 else
13703 13697 seqid = lock_args->locker.locker4_u.
13704 13698 lock_owner.lock_seqid;
13705 13699 } else if (locku_args) {
13706 13700 seqid = locku_args->seqid;
13707 13701 } else {
13708 13702 seqid = 0;
13709 13703 }
13710 13704
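			/*
			 * Record which owner and seqid the server rejected
			 * so the recovery thread can resynchronize them.
			 */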
13711 13705 bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13712 13706 flk->l_pid, (*argspp)->ctag, seqid);
13713 13707 }
13714 13708
13715 13709 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13716 13710 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13717 13711 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13718 13712 NULL, op, bsep, NULL, NULL);
13719 13713
13720 13714 if (bsep)
13721 13715 kmem_free(bsep, sizeof (*bsep));
13722 13716 }
13723 13717
13724 13718 /*
13725 13719	 * Return that we do not want to retry the request in three cases:
13726 13720	 * 1. We received EINTR or are bailing out because of a forced
13727 13721	 *    unmount; we came into this code path just for the sake of
13728 13722	 *    initiating recovery and now need to return the error.
13729 13723 * 2. If we have aborted recovery.
13730 13724 * 3. We received NFS4ERR_BAD_SEQID.
13731 13725 */
13732 13726 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13733 13727 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13734 13728 retry = FALSE;
13735 13729
13736 13730 if (*did_start_fop == TRUE) {
13737 13731 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13738 13732 needrecov);
13739 13733 *did_start_fop = FALSE;
13740 13734 }
13741 13735
13742 13736 if (retry == TRUE) {
13743 13737 nfs_argop4 *argop;
13744 13738
13745 13739 argop = (*argspp)->array;
13746 13740 ASSERT((*argspp)->array_len == 2);
13747 13741
13748 13742 if (argop[1].argop == OP_LOCK)
13749 13743 nfs4args_lock_free(&argop[1]);
13750 13744 else if (argop[1].argop == OP_LOCKT)
13751 13745 nfs4args_lockt_free(&argop[1]);
13752 13746 kmem_free(argop, 2 * sizeof (nfs_argop4));
13753 13747 if (!ep->error)
13754 13748 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13755 13749 *respp = NULL;
13756 13750 *argspp = NULL;
13757 13751 }
13758 13752
13759 13753 if (lop != NULL) {
13760 13754 nfs4_end_lock_seqid_sync(lop);
13761 13755 lock_owner_rele(lop);
13762 13756 }
13763 13757
13764 13758 *lopp = NULL;
13765 13759
13766 13760 /* need to free up the reference on osp for lock args */
13767 13761 if (osp != NULL) {
13768 13762 open_stream_rele(osp, rp);
13769 13763 *ospp = NULL;
13770 13764 }
13771 13765
13772 13766 /* need to free up the reference on oop for lock args */
13773 13767 if (oop != NULL) {
13774 13768 nfs4_end_open_seqid_sync(oop);
13775 13769 open_owner_rele(oop);
13776 13770 *oopp = NULL;
13777 13771 }
13778 13772
13779 13773 return (retry);
13780 13774 }
13781 13775
13782 13776 /*
13783 13777 * Handles the successful reply from the server for nfs4frlock.
13784 13778 */
13785 13779 static void
13786 13780 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13787 13781 vnode_t *vp, int flag, u_offset_t offset,
13788 13782 nfs4_lost_rqst_t *resend_rqstp)
13789 13783 {
13790 13784 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13791 13785 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13792 13786 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13793 13787 if (ctype == NFS4_LCK_CTYPE_NORM) {
13794 13788 flk->l_pid = ttoproc(curthread)->p_pid;
13795 13789 /*
13796 13790 * We do not register lost locks locally in
13797 13791 * the 'resend' case since the user/application
13798 13792 * doesn't think we have the lock.
13799 13793 */
13800 13794 ASSERT(!resend_rqstp);
13801 13795 nfs4_register_lock_locally(vp, flk, flag, offset);
13802 13796 }
13803 13797 }
13804 13798 }
13805 13799
13806 13800 /*
13807 13801 * Handle the DENIED reply from the server for nfs4frlock.
13808 13802 * Returns TRUE if we should retry the request; FALSE otherwise.
13809 13803 *
13810 13804 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13811 13805 * COMPOUND4 args/res for calls that need to retry. Can also
13812 13806 * drop and regrab the r_lkserlock.
13813 13807 */
13814 13808 static bool_t
13815 13809 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13816 13810 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13817 13811 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13818 13812 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13819 13813 nfs4_recov_state_t *recov_statep, int needrecov,
13820 13814 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13821 13815 clock_t *tick_delayp, short *whencep, int *errorp,
13822 13816 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13823 13817 bool_t *skip_get_err)
13824 13818 {
13825 13819 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13826 13820
13827 13821 if (lock_args) {
13828 13822 nfs4_open_owner_t *oop = *oopp;
13829 13823 nfs4_open_stream_t *osp = *ospp;
13830 13824 nfs4_lock_owner_t *lop = *lopp;
13831 13825 int intr;
13832 13826
13833 13827 /*
13834 13828		 * A blocking lock needs to sleep and then retry the request.
13835 13829 *
13836 13830 * Do not block and wait for 'resend' or 'reinstate'
13837 13831 * lock requests, just return the error.
13838 13832 *
13839 13833 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13840 13834 */
13841 13835 if (cmd == F_SETLKW) {
13842 13836 rnode4_t *rp = VTOR4(vp);
13843 13837 nfs_argop4 *argop = (*argspp)->array;
13844 13838
13845 13839 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13846 13840
13847 13841 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13848 13842 recov_statep, needrecov);
13849 13843 *did_start_fop = FALSE;
13850 13844 ASSERT((*argspp)->array_len == 2);
13851 13845 if (argop[1].argop == OP_LOCK)
13852 13846 nfs4args_lock_free(&argop[1]);
13853 13847 else if (argop[1].argop == OP_LOCKT)
13854 13848 nfs4args_lockt_free(&argop[1]);
13855 13849 kmem_free(argop, 2 * sizeof (nfs_argop4));
13856 13850 if (*respp)
13857 13851 xdr_free(xdr_COMPOUND4res_clnt,
13858 13852 (caddr_t)*respp);
13859 13853 *argspp = NULL;
13860 13854 *respp = NULL;
13861 13855 nfs4_end_lock_seqid_sync(lop);
13862 13856 lock_owner_rele(lop);
13863 13857 *lopp = NULL;
13864 13858 if (osp != NULL) {
13865 13859 open_stream_rele(osp, rp);
13866 13860 *ospp = NULL;
13867 13861 }
13868 13862 if (oop != NULL) {
13869 13863 nfs4_end_open_seqid_sync(oop);
13870 13864 open_owner_rele(oop);
13871 13865 *oopp = NULL;
13872 13866 }
13873 13867
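			/*
			 * Drop r_lkserlock while we sleep so other lock and
			 * recovery traffic on this rnode can make progress.
			 */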
13874 13868 nfs_rw_exit(&rp->r_lkserlock);
13875 13869
13876 13870 intr = nfs4_block_and_wait(tick_delayp, rp);
13877 13871
13878 13872 if (intr) {
13879 13873 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13880 13874 RW_WRITER, FALSE);
13881 13875 *errorp = EINTR;
13882 13876 return (FALSE);
13883 13877 }
13884 13878
13885 13879 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13886 13880 RW_WRITER, FALSE);
13887 13881
13888 13882 /*
13889 13883 * Make sure we are still safe to lock with
13890 13884			 * Make sure we are still safe to lock with
13891 13885			 * regard to mmapping.
13892 13886 if (!nfs4_safelock(vp, flk, cr)) {
13893 13887 *errorp = EAGAIN;
13894 13888 return (FALSE);
13895 13889 }
13896 13890
13897 13891 return (TRUE);
13898 13892 }
13899 13893 if (ctype == NFS4_LCK_CTYPE_NORM)
13900 13894 *errorp = EAGAIN;
13901 13895 *skip_get_err = TRUE;
13902 13896 flk->l_whence = 0;
13903 13897 *whencep = 0;
13904 13898 return (FALSE);
13905 13899 } else if (lockt_args) {
13906 13900 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13907 13901 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13908 13902
13909 13903 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13910 13904 flk, lockt_args);
13911 13905
13912 13906 /* according to NLM code */
13913 13907 *errorp = 0;
13914 13908 *whencep = 0;
13915 13909 *skip_get_err = TRUE;
13916 13910 return (FALSE);
13917 13911 }
13918 13912 return (FALSE);
13919 13913 }
13920 13914
13921 13915 /*
13922 13916 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13923 13917 */
13924 13918 static void
13925 13919 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13926 13920 {
13927 13921 switch (resp->status) {
13928 13922 case NFS4ERR_ACCESS:
13929 13923 case NFS4ERR_ADMIN_REVOKED:
13930 13924 case NFS4ERR_BADHANDLE:
13931 13925 case NFS4ERR_BAD_RANGE:
13932 13926 case NFS4ERR_BAD_SEQID:
13933 13927 case NFS4ERR_BAD_STATEID:
13934 13928 case NFS4ERR_BADXDR:
13935 13929 case NFS4ERR_DEADLOCK:
13936 13930 case NFS4ERR_DELAY:
13937 13931 case NFS4ERR_EXPIRED:
13938 13932 case NFS4ERR_FHEXPIRED:
13939 13933 case NFS4ERR_GRACE:
13940 13934 case NFS4ERR_INVAL:
13941 13935 case NFS4ERR_ISDIR:
13942 13936 case NFS4ERR_LEASE_MOVED:
13943 13937 case NFS4ERR_LOCK_NOTSUPP:
13944 13938 case NFS4ERR_LOCK_RANGE:
13945 13939 case NFS4ERR_MOVED:
13946 13940 case NFS4ERR_NOFILEHANDLE:
13947 13941 case NFS4ERR_NO_GRACE:
13948 13942 case NFS4ERR_OLD_STATEID:
13949 13943 case NFS4ERR_OPENMODE:
13950 13944 case NFS4ERR_RECLAIM_BAD:
13951 13945 case NFS4ERR_RECLAIM_CONFLICT:
13952 13946 case NFS4ERR_RESOURCE:
13953 13947 case NFS4ERR_SERVERFAULT:
13954 13948 case NFS4ERR_STALE:
13955 13949 case NFS4ERR_STALE_CLIENTID:
13956 13950 case NFS4ERR_STALE_STATEID:
13957 13951 return;
13958 13952 default:
13959 13953 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13960 13954 "nfs4frlock_results_default: got unrecognizable "
13961 13955 "res.status %d", resp->status));
13962 13956 *errorp = NFS4ERR_INVAL;
13963 13957 }
13964 13958 }
13965 13959
13966 13960 /*
13967 13961 * The lock request was successful, so update the client's state.
13968 13962 */
13969 13963 static void
13970 13964 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13971 13965 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13972 13966 vnode_t *vp, flock64_t *flk, cred_t *cr,
13973 13967 nfs4_lost_rqst_t *resend_rqstp)
13974 13968 {
13975 13969 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13976 13970
13977 13971 if (lock_args) {
13978 13972 LOCK4res *lock_res;
13979 13973
13980 13974 lock_res = &resop->nfs_resop4_u.oplock;
13981 13975 /* update the stateid with server's response */
13982 13976
13983 13977 if (lock_args->locker.new_lock_owner == TRUE) {
13984 13978 mutex_enter(&lop->lo_lock);
13985 13979 lop->lo_just_created = NFS4_PERM_CREATED;
13986 13980 mutex_exit(&lop->lo_lock);
13987 13981 }
13988 13982
13989 13983 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13990 13984
13991 13985 /*
13992 13986 * If the lock was the result of a resending a lost
13993 13987 * request, we've synched up the stateid and seqid
13994 13988 * with the server, but now the server might be out of sync
13995 13989 * with what the application thinks it has for locks.
13996 13990 * Clean that up here. It's unclear whether we should do
13997 13991 * this even if the filesystem has been forcibly unmounted.
13998 13992 * For most servers, it's probably wasted effort, but
13999 13993 * RFC 7530 lets servers require that unlocks exactly match
14000 13994 * the locks that are held.
14001 13995 */
14002 13996 if (resend_rqstp != NULL &&
14003 13997 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
14004 13998 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
14005 13999 } else {
14006 14000 flk->l_whence = 0;
14007 14001 }
14008 14002 } else if (locku_args) {
14009 14003 LOCKU4res *locku_res;
14010 14004
14011 14005 locku_res = &resop->nfs_resop4_u.oplocku;
14012 14006
14013 14007 /* Update the stateid with the server's response */
14014 14008 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
14015 14009 } else if (lockt_args) {
14016 14010 /* Switch the lock type to express success, see fcntl */
14017 14011 flk->l_type = F_UNLCK;
14018 14012 flk->l_whence = 0;
14019 14013 }
14020 14014 }
14021 14015
14022 14016 /*
14023 14017 * Do final cleanup before exiting nfs4frlock.
14024 14018 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14025 14019 * COMPOUND4 args/res for calls that haven't already.
14026 14020 */
14027 14021 static void
14028 14022 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14029 14023 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14030 14024 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14031 14025 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14032 14026 short whence, u_offset_t offset, struct lm_sysid *ls,
14033 14027 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14034 14028 bool_t did_start_fop, bool_t skip_get_err,
14035 14029 cred_t *cred_otw, cred_t *cred)
14036 14030 {
14037 14031 mntinfo4_t *mi = VTOMI4(vp);
14038 14032 rnode4_t *rp = VTOR4(vp);
14039 14033 int error = *errorp;
14040 14034 nfs_argop4 *argop;
14041 14035 int do_flush_pages = 0;
14042 14036
14043 14037 ASSERT(nfs_zone() == mi->mi_zone);
14044 14038 /*
14045 14039 * The client recovery code wants the raw status information,
14046 14040 * so don't map the NFS status code to an errno value for
14047 14041 * non-normal call types.
14048 14042 */
14049 14043 if (ctype == NFS4_LCK_CTYPE_NORM) {
14050 14044 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14051 14045 *errorp = geterrno4(resp->status);
14052 14046 if (did_start_fop == TRUE)
14053 14047 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14054 14048 needrecov);
14055 14049
14056 14050 /*
14057 14051 * We've established a new lock on the server, so invalidate
14058 14052 * the pages associated with the vnode to get the most up to
14059 14053 * date pages from the server after acquiring the lock. We
14060 14054 * want to be sure that the read operation gets the newest data.
14061 14055 * N.B.
14062 14056 * We used to do this in nfs4frlock_results_ok but that doesn't
14063 14057 * work since VOP_PUTPAGE can call nfs4_commit which calls
14064 14058 * nfs4_start_fop. We flush the pages below after calling
14065 14059	 * nfs4_end_fop above.
14066 14060 * The flush of the page cache must be done after
14067 14061 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14068 14062 */
14069 14063 if (!error && resp && resp->status == NFS4_OK)
14070 14064 do_flush_pages = 1;
14071 14065 }
14072 14066 if (argsp) {
14073 14067 ASSERT(argsp->array_len == 2);
14074 14068 argop = argsp->array;
14075 14069 if (argop[1].argop == OP_LOCK)
14076 14070 nfs4args_lock_free(&argop[1]);
14077 14071 else if (argop[1].argop == OP_LOCKT)
14078 14072 nfs4args_lockt_free(&argop[1]);
14079 14073 kmem_free(argop, 2 * sizeof (nfs_argop4));
14080 14074 if (resp)
14081 14075 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14082 14076 }
14083 14077
14084 14078 /* free the reference on the lock owner */
14085 14079 if (lop != NULL) {
14086 14080 nfs4_end_lock_seqid_sync(lop);
14087 14081 lock_owner_rele(lop);
14088 14082 }
14089 14083
14090 14084 /* need to free up the reference on osp for lock args */
14091 14085 if (osp != NULL)
14092 14086 open_stream_rele(osp, rp);
14093 14087
14094 14088 /* need to free up the reference on oop for lock args */
14095 14089 if (oop != NULL) {
14096 14090 nfs4_end_open_seqid_sync(oop);
14097 14091 open_owner_rele(oop);
14098 14092 }
14099 14093
14100 14094 if (do_flush_pages)
14101 14095 nfs4_flush_pages(vp, cred);
14102 14096
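	/* Convert flk back to the caller's original whence/offset frame. */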
14103 14097 (void) convoff(vp, flk, whence, offset);
14104 14098
14105 14099 lm_rel_sysid(ls);
14106 14100
14107 14101 /*
14108 14102 * Record debug information in the event we get EINVAL.
14109 14103 */
14110 14104 mutex_enter(&mi->mi_lock);
14111 14105 if (*errorp == EINVAL && (lock_args || locku_args) &&
14112 14106 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14113 14107 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14114 14108 zcmn_err(getzoneid(), CE_NOTE,
14115 14109 "%s operation failed with "
14116 14110 "EINVAL probably since the server, %s,"
14117 14111 " doesn't support POSIX style locking",
14118 14112 lock_args ? "LOCK" : "LOCKU",
14119 14113 mi->mi_curr_serv->sv_hostname);
14120 14114 mi->mi_flags |= MI4_LOCK_DEBUG;
14121 14115 }
14122 14116 }
14123 14117 mutex_exit(&mi->mi_lock);
14124 14118
14125 14119 if (cred_otw)
14126 14120 crfree(cred_otw);
14127 14121 }
14128 14122
14129 14123 /*
14130 14124 * This calls the server and the local locking code.
14131 14125 *
14132 14126	 * Client locks are registered locally by ORing the sysid with
14133 14127 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14134 14128 * We need to distinguish between the two to avoid collision in case one
14135 14129 * machine is used as both client and server.
14136 14130 *
14137 14131	 * Blocking lock requests will retry forever until the lock is
14138 14132	 * acquired.
14139 14133 *
14140 14134 * The ctype is defined as follows:
14141 14135 * NFS4_LCK_CTYPE_NORM: normal lock request.
14142 14136 *
14143 14137 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14144 14138 * recovery, get the pid from flk instead of curproc, and don't reregister
14145 14139 * the lock locally.
14146 14140 *
14147 14141 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14148 14142 * that we will use the information passed in via resend_rqstp to setup the
14149 14143 * lock/locku request. This resend is the exact same request as the 'lost
14150 14144 * lock', and is initiated by the recovery framework. A successful resend
14151 14145 * request can initiate one or more reinstate requests.
14152 14146 *
14153 14147 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14154 14148 * does not trigger additional reinstate requests. This lock call type is
14155 14149	 * does not trigger additional reinstate requests. This call type is
14156 14150	 * used to set the v4 server's locking state back to match the
14157 14151	 * client's local locking state in the event of a received 'lost lock'.
14158 14152 * Errors are returned via the nfs4_error_t parameter.
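 *
 * (Illustrative only: the normal VOP_FRLOCK path is expected to call this
 * roughly as
 *	nfs4_error_t e;
 *	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
 *	    cr, &e, NULL, NULL);
 * and then inspect e.error / e.stat.)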
14159 14153 */
14160 14154 void
14161 14155 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
14162 14156 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
14163 14157 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
14164 14158 {
14165 14159 COMPOUND4args_clnt args, *argsp = NULL;
14166 14160 COMPOUND4res_clnt res, *resp = NULL;
14167 14161 nfs_argop4 *argop;
14168 14162 nfs_resop4 *resop;
14169 14163 rnode4_t *rp;
14170 14164 int doqueue = 1;
14171 14165 clock_t tick_delay; /* delay in clock ticks */
14172 14166 struct lm_sysid *ls;
14173 14167 LOCK4args *lock_args = NULL;
14174 14168 LOCKU4args *locku_args = NULL;
14175 14169 LOCKT4args *lockt_args = NULL;
14176 14170 nfs4_open_owner_t *oop = NULL;
14177 14171 nfs4_open_stream_t *osp = NULL;
14178 14172 nfs4_lock_owner_t *lop = NULL;
14179 14173 bool_t needrecov = FALSE;
14180 14174 nfs4_recov_state_t recov_state;
14181 14175 short whence;
14182 14176 nfs4_op_hint_t op_hint;
14183 14177 nfs4_lost_rqst_t lost_rqst;
14184 14178 bool_t retry = FALSE;
14185 14179 bool_t did_start_fop = FALSE;
14186 14180 bool_t skip_get_err = FALSE;
14187 14181 cred_t *cred_otw = NULL;
14188 14182 bool_t recovonly; /* just queue request */
14189 14183 int frc_no_reclaim = 0;
14190 14184 #ifdef DEBUG
14191 14185 char *name;
14192 14186 #endif
14193 14187
14194 14188 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14195 14189
14196 14190 #ifdef DEBUG
14197 14191 name = fn_name(VTOSV(vp)->sv_name);
14198 14192 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
14199 14193 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
14200 14194 "length %"PRIu64", pid %d, sysid %d, call type %s, "
14201 14195 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
14202 14196 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
14203 14197 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
14204 14198 resend_rqstp ? "TRUE" : "FALSE"));
14205 14199 kmem_free(name, MAXNAMELEN);
14206 14200 #endif
14207 14201
14208 14202 nfs4_error_zinit(ep);
14209 14203 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
14210 14204 if (ep->error)
14211 14205 return;
14212 14206 ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
14213 14207 if (ep->error)
14214 14208 return;
14215 14209 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
14216 14210 vp, cr, &cred_otw);
14217 14211
14218 14212 recov_retry:
14219 14213 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
14220 14214 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
14221 14215 rp = VTOR4(vp);
14222 14216
14223 14217 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
14224 14218 &did_start_fop, &recovonly);
14225 14219
14226 14220 if (ep->error)
14227 14221 goto out;
14228 14222
14229 14223 if (recovonly) {
14230 14224 /*
14231 14225 * Leave the request for the recovery system to deal with.
14232 14226 */
14233 14227 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
14234 14228 ASSERT(cmd != F_GETLK);
14235 14229 ASSERT(flk->l_type == F_UNLCK);
14236 14230
14237 14231 nfs4_error_init(ep, EINTR);
14238 14232 needrecov = TRUE;
14239 14233 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14240 14234 if (lop != NULL) {
14241 14235 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
14242 14236 NULL, NULL, lop, flk, &lost_rqst, cr, vp);
14243 14237 (void) nfs4_start_recovery(ep,
14244 14238 VTOMI4(vp), vp, NULL, NULL,
14245 14239 (lost_rqst.lr_op == OP_LOCK ||
14246 14240 lost_rqst.lr_op == OP_LOCKU) ?
14247 14241 &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
14248 14242 lock_owner_rele(lop);
14249 14243 lop = NULL;
14250 14244 }
14251 14245 flk->l_pid = curproc->p_pid;
14252 14246 nfs4_register_lock_locally(vp, flk, flag, offset);
14253 14247 goto out;
14254 14248 }
14255 14249
14256 14250		/* putfh the target file's fh */
14257 14251 argop[0].argop = OP_CPUTFH;
14258 14252 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
14259 14253
14260 14254 /*
14261 14255 * Set up the over-the-wire arguments and get references to the
14262 14256 * open owner, etc.
14263 14257 */
14264 14258
14265 14259 if (ctype == NFS4_LCK_CTYPE_RESEND ||
14266 14260 ctype == NFS4_LCK_CTYPE_REINSTATE) {
14267 14261 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
14268 14262 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
14269 14263 } else {
14270 14264 bool_t go_otw = TRUE;
14271 14265
14272 14266 ASSERT(resend_rqstp == NULL);
14273 14267
14274 14268 switch (cmd) {
14275 14269 case F_GETLK:
14276 14270 nfs4frlock_setup_lockt_args(ctype, &argop[1],
14277 14271 &lockt_args, argsp, flk, rp);
14278 14272 break;
14279 14273 case F_SETLKW:
14280 14274 case F_SETLK:
14281 14275 if (flk->l_type == F_UNLCK)
14282 14276 nfs4frlock_setup_locku_args(ctype,
14283 14277 &argop[1], &locku_args, flk,
14284 14278 &lop, ep, argsp,
14285 14279 vp, flag, offset, cr,
14286 14280 &skip_get_err, &go_otw);
14287 14281 else
14288 14282 nfs4frlock_setup_lock_args(ctype,
14289 14283 &lock_args, &oop, &osp, &lop, &argop[1],
14290 14284 argsp, flk, cmd, vp, cr, ep);
14291 14285
14292 14286 if (ep->error)
14293 14287 goto out;
14294 14288
14295 14289 switch (ep->stat) {
14296 14290 case NFS4_OK:
14297 14291 break;
14298 14292 case NFS4ERR_DELAY:
14299 14293 /* recov thread never gets this error */
14300 14294 ASSERT(resend_rqstp == NULL);
14301 14295 ASSERT(did_start_fop);
14302 14296
14303 14297 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
14304 14298 &recov_state, TRUE);
14305 14299 did_start_fop = FALSE;
14306 14300 if (argop[1].argop == OP_LOCK)
14307 14301 nfs4args_lock_free(&argop[1]);
14308 14302 else if (argop[1].argop == OP_LOCKT)
14309 14303 nfs4args_lockt_free(&argop[1]);
14310 14304 kmem_free(argop, 2 * sizeof (nfs_argop4));
14311 14305 argsp = NULL;
14312 14306 goto recov_retry;
14313 14307 default:
14314 14308 ep->error = EIO;
14315 14309 goto out;
14316 14310 }
14317 14311 break;
14318 14312 default:
14319 14313 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14320 14314 "nfs4_frlock: invalid cmd %d", cmd));
14321 14315 ep->error = EINVAL;
14322 14316 goto out;
14323 14317 }
14324 14318
14325 14319 if (!go_otw)
14326 14320 goto out;
14327 14321 }
14328 14322
14329 14323		/* XXX should we use the local reclock as a cache? */
14330 14324 /*
14331 14325 * Unregister the lock with the local locking code before
14332 14326 * contacting the server. This avoids a potential race where
14333 14327 * another process gets notified that it has been granted a lock
14334 14328 * before we can unregister ourselves locally.
14335 14329 */
14336 14330 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
14337 14331 if (ctype == NFS4_LCK_CTYPE_NORM)
14338 14332 flk->l_pid = ttoproc(curthread)->p_pid;
14339 14333 nfs4_register_lock_locally(vp, flk, flag, offset);
14340 14334 }
14341 14335
14342 14336 /*
14343 14337 * Send the server the lock request. Continually loop with a delay
14344 14338 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
14345 14339 */
14346 14340 resp = &res;
14347 14341
14348 14342 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
14349 14343 (CE_NOTE,
14350 14344 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
14351 14345 rnode4info(rp)));
14352 14346
14353 14347 if (lock_args && frc_no_reclaim) {
14354 14348 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14355 14349 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14356 14350 "nfs4frlock: frc_no_reclaim: clearing reclaim"));
14357 14351 lock_args->reclaim = FALSE;
14358 14352 if (did_reclaimp)
14359 14353 *did_reclaimp = 0;
14360 14354 }
14361 14355
14362 14356 /*
14363 14357 * Do the OTW call.
14364 14358 */
14365 14359 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);
14366 14360
14367 14361 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14368 14362 "nfs4frlock: error %d, status %d", ep->error, resp->status));
14369 14363
14370 14364 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
14371 14365 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14372 14366 "nfs4frlock: needrecov %d", needrecov));
14373 14367
14374 14368 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
14375 14369 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
14376 14370 args.ctag);
14377 14371
14378 14372 /*
14379 14373 * Check if one of these mutually exclusive error cases has
14380 14374 * happened:
14381 14375 * need to swap credentials due to access error
14382 14376 * recovery is needed
14383 14377 * different error (only known case is missing Kerberos ticket)
14384 14378 */
14385 14379
14386 14380 if ((ep->error == EACCES ||
14387 14381 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
14388 14382 cred_otw != cr) {
14389 14383 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
14390 14384 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
14391 14385 cr, &cred_otw);
14392 14386 goto recov_retry;
14393 14387 }
14394 14388
14395 14389 if (needrecov) {
14396 14390 /*
14397 14391 * LOCKT requests don't need to recover from lost
14398 14392 * requests since they don't create/modify state.
14399 14393 */
14400 14394 if ((ep->error == EINTR ||
14401 14395 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
14402 14396 lockt_args)
14403 14397 goto out;
14404 14398 /*
14405 14399 * Do not attempt recovery for requests initiated by
14406 14400 * the recovery framework. Let the framework redrive them.
14407 14401 */
14408 14402 if (ctype != NFS4_LCK_CTYPE_NORM)
14409 14403 goto out;
14410 14404 else {
14411 14405 ASSERT(resend_rqstp == NULL);
14412 14406 }
14413 14407
14414 14408 nfs4frlock_save_lost_rqst(ctype, ep->error,
14415 14409 flk_to_locktype(cmd, flk->l_type),
14416 14410 oop, osp, lop, flk, &lost_rqst, cred_otw, vp);
14417 14411
14418 14412 retry = nfs4frlock_recovery(needrecov, ep, &argsp,
14419 14413 &resp, lock_args, locku_args, &oop, &osp, &lop,
14420 14414 rp, vp, &recov_state, op_hint, &did_start_fop,
14421 14415 cmd != F_GETLK ? &lost_rqst : NULL, flk);
14422 14416
14423 14417 if (retry) {
14424 14418 ASSERT(oop == NULL);
14425 14419 ASSERT(osp == NULL);
14426 14420 ASSERT(lop == NULL);
14427 14421 goto recov_retry;
14428 14422 }
14429 14423 goto out;
14430 14424 }
14431 14425
14432 14426 /*
14433 14427	 * Bail out if we have reached this point with ep->error set. Can
14434 14428	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
14435 14429	 * This happens if the Kerberos ticket has expired or has been
14436 14430 * destroyed.
14437 14431 */
14438 14432 if (ep->error != 0)
14439 14433 goto out;
14440 14434
14441 14435 /*
14442 14436 * Process the reply.
14443 14437 */
14444 14438 switch (resp->status) {
14445 14439 case NFS4_OK:
14446 14440 resop = &resp->array[1];
14447 14441 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
14448 14442 resend_rqstp);
14449 14443 /*
14450 14444 * Have a successful lock operation, now update state.
14451 14445 */
14452 14446 nfs4frlock_update_state(lock_args, locku_args, lockt_args,
14453 14447 resop, lop, vp, flk, cr, resend_rqstp);
14454 14448 break;
14455 14449
14456 14450 case NFS4ERR_DENIED:
14457 14451 resop = &resp->array[1];
14458 14452 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
14459 14453 &oop, &osp, &lop, cmd, vp, flk, op_hint,
14460 14454 &recov_state, needrecov, &argsp, &resp,
14461 14455 &tick_delay, &whence, &ep->error, resop, cr,
14462 14456 &did_start_fop, &skip_get_err);
14463 14457
14464 14458 if (retry) {
14465 14459 ASSERT(oop == NULL);
14466 14460 ASSERT(osp == NULL);
14467 14461 ASSERT(lop == NULL);
14468 14462 goto recov_retry;
14469 14463 }
14470 14464 break;
14471 14465 /*
14472 14466	 * If the server won't let us reclaim, fall back to trying to lock
14473 14467 * the file from scratch. Code elsewhere will check the changeinfo
14474 14468 * to ensure the file hasn't been changed.
14475 14469 */
14476 14470 case NFS4ERR_NO_GRACE:
14477 14471 if (lock_args && lock_args->reclaim == TRUE) {
14478 14472 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14479 14473 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14480 14474 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
14481 14475 frc_no_reclaim = 1;
14482 14476 /* clean up before retrying */
14483 14477 needrecov = 0;
14484 14478 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
14485 14479 lock_args, locku_args, &oop, &osp, &lop, rp, vp,
14486 14480 &recov_state, op_hint, &did_start_fop, NULL, flk);
14487 14481 goto recov_retry;
14488 14482 }
14489 14483 /* FALLTHROUGH */
14490 14484
14491 14485 default:
14492 14486 nfs4frlock_results_default(resp, &ep->error);
14493 14487 break;
14494 14488 }
14495 14489 out:
14496 14490 /*
14497 14491 * Process and cleanup from error. Make interrupted unlock
14498 14492 * requests look successful, since they will be handled by the
14499 14493 * client recovery code.
14500 14494 */
14501 14495 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
14502 14496 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
14503 14497 lock_args, locku_args, did_start_fop,
14504 14498 skip_get_err, cred_otw, cr);
14505 14499
14506 14500 if (ep->error == EINTR && flk->l_type == F_UNLCK &&
14507 14501 (cmd == F_SETLK || cmd == F_SETLKW))
14508 14502 ep->error = 0;
14509 14503 }
14510 14504
14511 14505 /*
14512 14506 * nfs4_safelock:
14513 14507 *
14514 14508 * Return non-zero if the given lock request can be handled without
14515 14509 * violating the constraints on concurrent mapping and locking.
14516 14510 */
14517 14511
14518 14512 static int
14519 14513 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14520 14514 {
14521 14515 rnode4_t *rp = VTOR4(vp);
14522 14516 struct vattr va;
14523 14517 int error;
14524 14518
14525 14519 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14526 14520 ASSERT(rp->r_mapcnt >= 0);
14527 14521 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14528 14522 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14529 14523 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14530 14524 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14531 14525
14532 14526 if (rp->r_mapcnt == 0)
14533 14527 return (1); /* always safe if not mapped */
14534 14528
14535 14529 /*
14536 14530 * If the file is already mapped and there are locks, then they
14537 14531 * should be all safe locks. So adding or removing a lock is safe
14538 14532 * as long as the new request is safe (i.e., whole-file, meaning
14539 14533 * length and starting offset are both zero).
14540 14534 */
14541 14535
14542 14536 if (bfp->l_start != 0 || bfp->l_len != 0) {
14543 14537 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14544 14538 "cannot lock a memory mapped file unless locking the "
14545 14539 "entire file: start %"PRIx64", len %"PRIx64,
14546 14540 bfp->l_start, bfp->l_len));
14547 14541 return (0);
14548 14542 }
14549 14543
14550 14544 /* mandatory locking and mapping don't mix */
14551 14545 va.va_mask = AT_MODE;
14552 14546 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14553 14547 if (error != 0) {
14554 14548 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14555 14549 "getattr error %d", error));
14556 14550 return (0); /* treat errors conservatively */
14557 14551 }
14558 14552 if (MANDLOCK(vp, va.va_mode)) {
14559 14553 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14560 14554 "cannot mandatory lock and mmap a file"));
14561 14555 return (0);
14562 14556 }
14563 14557
14564 14558 return (1);
14565 14559 }
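/*
 * Illustrative sketch (editor's example, not from this change): for
 * nfs4_safelock() to consider a lock on a mapped file safe, the
 * request must cover the whole file, i.e. l_start and l_len both 0:
 *
 *	struct flock64 bf;
 *
 *	bf.l_type = F_WRLCK;
 *	bf.l_whence = 0;		(aka SEEK_SET)
 *	bf.l_start = 0;
 *	bf.l_len = 0;
 *	if (!nfs4_safelock(vp, &bf, cr))
 *		return (EAGAIN);	(mapped file + partial range)
 */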
14566 14560
14567 14561
14568 14562 /*
14569 14563 * Register the lock locally within Solaris.
14570 14564 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14571 14565 * recording locks locally.
14572 14566 *
14573 14567 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14574 14568 * are registered locally.
14575 14569 */
14576 14570 void
14577 14571 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14578 14572 u_offset_t offset)
14579 14573 {
14580 14574 int oldsysid;
14581 14575 int error;
14582 14576 #ifdef DEBUG
14583 14577 char *name;
14584 14578 #endif
14585 14579
14586 14580 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14587 14581
14588 14582 #ifdef DEBUG
14589 14583 name = fn_name(VTOSV(vp)->sv_name);
14590 14584 NFS4_DEBUG(nfs4_client_lock_debug,
14591 14585 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14592 14586 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14593 14587 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14594 14588 flk->l_sysid));
14595 14589 kmem_free(name, MAXNAMELEN);
14596 14590 #endif
14597 14591
14598 14592 /* register the lock with local locking */
14599 14593 oldsysid = flk->l_sysid;
14600 14594 flk->l_sysid |= LM_SYSID_CLIENT;
14601 14595 error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14602 14596 #ifdef DEBUG
14603 14597 if (error != 0) {
14604 14598 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14605 14599 "nfs4_register_lock_locally: could not register with"
14606 14600 " local locking"));
14607 14601 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14608 14602 "error %d, vp 0x%p, pid %d, sysid 0x%x",
14609 14603 error, (void *)vp, flk->l_pid, flk->l_sysid));
14610 14604 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14611 14605 "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14612 14606 flk->l_type, flk->l_start, flk->l_len));
14613 14607 (void) reclock(vp, flk, 0, flag, offset, NULL);
14614 14608 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14615 14609 "blocked by pid %d sysid 0x%x type %d "
14616 14610 "off 0x%" PRIx64 " len 0x%" PRIx64,
14617 14611 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14618 14612 flk->l_len));
14619 14613 }
14620 14614 #endif
14621 14615 flk->l_sysid = oldsysid;
14622 14616 }
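/*
 * Illustrative sketch (editor's example, hypothetical caller): after a
 * successful over-the-wire LOCK, the client records the same range
 * locally so local lock queries can see it. The function handles the
 * LM_SYSID_CLIENT tagging itself and restores l_sysid before returning:
 *
 *	flk->l_pid = curproc->p_pid;
 *	nfs4_register_lock_locally(vp, flk, flag, offset);
 *
 * On return flk->l_sysid is unchanged; the lock was recorded under
 * (l_sysid | LM_SYSID_CLIENT) in the local lock manager.
 */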
14623 14617
14624 14618 /*
14625 14619 * nfs4_lockrelease:
14626 14620 *
14627 14621 * Release any locks on the given vnode that are held by the current
14628 14622 * process. Also removes the lock owner (if one exists) from the rnode's
14629 14623 * list.
14630 14624 */
14631 14625 static int
14632 14626 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
14633 14627 {
14634 14628 flock64_t ld;
14635 14629 int ret, error;
14636 14630 rnode4_t *rp;
14637 14631 nfs4_lock_owner_t *lop;
14638 14632 nfs4_recov_state_t recov_state;
14639 14633 mntinfo4_t *mi;
14640 14634 bool_t possible_orphan = FALSE;
14641 14635 bool_t recovonly;
14642 14636
14643 14637 ASSERT((uintptr_t)vp > KERNELBASE);
14644 14638 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14645 14639
14646 14640 rp = VTOR4(vp);
14647 14641 mi = VTOMI4(vp);
14648 14642
14649 14643 /*
14650 14644 * If we have not locked anything then we can
14651 14645 * just return since we have no work to do.
14652 14646 */
14653 14647 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
14654 14648 return (0);
14655 14649 }
14656 14650
14657 14651 /*
14658 14652 * Note that another thread may kick off recovery at any
14659 14653 * time, in which case the lock_owner we have stashed in
14660 14654 * lop may become invalid, so we must NOT cache it
14661 14655 * locally!
14662 14656 */
14663 14657 recov_state.rs_flags = 0;
14664 14658 recov_state.rs_num_retry_despite_err = 0;
14665 14659 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14666 14660 &recovonly);
14667 14661 if (error) {
14668 14662 mutex_enter(&rp->r_statelock);
14669 14663 rp->r_flags |= R4LODANGLERS;
14670 14664 mutex_exit(&rp->r_statelock);
14671 14665 return (error);
14672 14666 }
14673 14667
14674 14668 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14675 14669
14676 14670 /*
14677 14671 * Check if the lock owner might have a lock (request was sent but
14678 14672 * no response was received). Also check if there are any remote
14679 14673 * locks on the file. (In theory we shouldn't have to make this
14680 14674 * second check if there's no lock owner, but for now we'll be
14681 14675 * conservative and do it anyway.) If either condition is true,
14682 14676 * send an unlock for the entire file to the server.
14683 14677 *
14684 14678 * Note that no explicit synchronization is needed here. At worst,
14685 14679 * flk_has_remote_locks() will return a false positive, in which case
14686 14680 * the unlock call wastes time but doesn't harm correctness.
14687 14681 */
14688 14682
14689 14683 if (lop) {
14690 14684 mutex_enter(&lop->lo_lock);
14691 14685 possible_orphan = lop->lo_pending_rqsts;
14692 14686 mutex_exit(&lop->lo_lock);
14693 14687 lock_owner_rele(lop);
14694 14688 }
14695 14689
14696 14690 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14697 14691
14698 14692 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14699 14693 "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14700 14694 "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14701 14695 (void *)lop));
14702 14696
14703 14697 if (possible_orphan || flk_has_remote_locks(vp)) {
14704 14698 ld.l_type = F_UNLCK; /* set to unlock entire file */
14705 14699 ld.l_whence = 0; /* unlock from start of file */
14706 14700 ld.l_start = 0;
14707 14701 ld.l_len = 0; /* do entire file */
14708 14702
14709 14703 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
14710 14704 cr, NULL);
14711 14705
14712 14706 if (ret != 0) {
14713 14707 /*
14714 14708 * If VOP_FRLOCK fails, make sure we unregister
14715 14709 * local locks before we continue.
14716 14710 */
14717 14711 ld.l_pid = ttoproc(curthread)->p_pid;
14718 14712 nfs4_register_lock_locally(vp, &ld, flag, offset);
14719 14713 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14720 14714 "nfs4_lockrelease: lock release error on vp"
14721 14715 " %p: error %d.\n", (void *)vp, ret));
14722 14716 }
14723 14717 }
14724 14718
14725 14719 recov_state.rs_flags = 0;
14726 14720 recov_state.rs_num_retry_despite_err = 0;
14727 14721 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14728 14722 &recovonly);
14729 14723 if (error) {
14730 14724 mutex_enter(&rp->r_statelock);
14731 14725 rp->r_flags |= R4LODANGLERS;
14732 14726 mutex_exit(&rp->r_statelock);
14733 14727 return (error);
14734 14728 }
14735 14729
14736 14730 /*
14737 14731 * Retrieve the lock-owner again (recovery may have
14738 14732 * replaced it with a new one) and remove it from the
14739 14733 * rnode's list.
14740 14734 */
14741 14735 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14742 14736
14743 14737 if (lop) {
14744 14738 nfs4_rnode_remove_lock_owner(rp, lop);
14745 14739 lock_owner_rele(lop);
14746 14740 }
14747 14741
14748 14742 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14749 14743 return (0);
14750 14744 }
14751 14745
14752 14746 /*
14753 14747 * Wait for 'tick_delay' clock ticks.
14754 14748 * Implement exponential backoff until we hit the lease_time of this nfs4_server.
14755 14749 * NOTE: lock_lease_time is in seconds.
14756 14750 *
14757 14751 * XXX For future improvements, should implement a waiting queue scheme.
14758 14752 */
14759 14753 static int
14760 14754 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14761 14755 {
14762 14756 long milliseconds_delay;
14763 14757 time_t lock_lease_time;
14764 14758
14765 14759 /* wait tick_delay clock ticks or until interrupted by a signal */
14766 14760 if (delay_sig(*tick_delay)) {
14767 14761 return (EINTR);
14768 14762 }
14769 14763 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14770 14764 "reissue the lock request: blocked for %ld clock ticks: %ld "
14771 14765 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14772 14766
14773 14767 /* get the lease time */
14774 14768 lock_lease_time = r2lease_time(rp);
14775 14769
14776 14770 /* drv_hztousec converts ticks to microseconds */
14777 14771 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14778 14772 if (milliseconds_delay < lock_lease_time * 1000) {
14779 14773 *tick_delay = 2 * *tick_delay;
14780 14774 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14781 14775 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14782 14776 }
14783 14777 return (0);
14784 14778 }
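/*
 * Worked example of the backoff above (initial delay assumed for
 * illustration): with a starting tick_delay of ~0.5 s and a 90 s
 * lease, successive waits are roughly 0.5 s, 1 s, 2 s, 4 s, ...,
 * doubling on each call until drv_hztousec(*tick_delay) would exceed
 * 90 * 1000 * 1000 microseconds, at which point tick_delay is clamped
 * to drv_usectohz(90 * 1000 * 1000); the wait never exceeds one lease
 * period.
 */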
14785 14779
14786 14780
14787 14781 void
14788 14782 nfs4_vnops_init(void)
14789 14783 {
14790 14784 }
14791 14785
14792 14786 void
14793 14787 nfs4_vnops_fini(void)
14794 14788 {
14795 14789 }
14796 14790
14797 14791 /*
14798 14792 * Return a reference to the directory (parent) vnode for a given vnode,
14799 14793 * using the saved pathname information and the directory file handle. The
14800 14794 * caller is responsible for disposing of the reference.
14801 14795 * Returns zero or an errno value.
14802 14796 *
14803 14797 * Caller should set need_start_op to FALSE if it is the recovery
14804 14798 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14805 14799 */
14806 14800 int
14807 14801 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14808 14802 {
14809 14803 svnode_t *svnp;
14810 14804 vnode_t *dvp = NULL;
14811 14805 servinfo4_t *svp;
14812 14806 nfs4_fname_t *mfname;
14813 14807 int error;
14814 14808
14815 14809 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14816 14810
14817 14811 if (vp->v_flag & VROOT) {
14818 14812 nfs4_sharedfh_t *sfh;
14819 14813 nfs_fh4 fh;
14820 14814 mntinfo4_t *mi;
14821 14815
14822 14816 ASSERT(vp->v_type == VREG);
14823 14817
14824 14818 mi = VTOMI4(vp);
14825 14819 svp = mi->mi_curr_serv;
14826 14820 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14827 14821 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14828 14822 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14829 14823 sfh = sfh4_get(&fh, VTOMI4(vp));
14830 14824 nfs_rw_exit(&svp->sv_lock);
14831 14825 mfname = mi->mi_fname;
14832 14826 fn_hold(mfname);
14833 14827 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14834 14828 sfh4_rele(&sfh);
14835 14829
14836 14830 if (dvp->v_type == VNON)
14837 14831 dvp->v_type = VDIR;
14838 14832 *dvpp = dvp;
14839 14833 return (0);
14840 14834 }
14841 14835
14842 14836 svnp = VTOSV(vp);
14843 14837
14844 14838 if (svnp == NULL) {
14845 14839 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14846 14840 "shadow node is NULL"));
14847 14841 return (EINVAL);
14848 14842 }
14849 14843
14850 14844 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14851 14845 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14852 14846 "shadow node name or dfh val == NULL"));
14853 14847 return (EINVAL);
14854 14848 }
14855 14849
14856 14850 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14857 14851 (int)need_start_op);
14858 14852 if (error != 0) {
14859 14853 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14860 14854 "nfs4_make_dotdot returned %d", error));
14861 14855 return (error);
14862 14856 }
14863 14857 if (!dvp) {
14864 14858 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14865 14859 "nfs4_make_dotdot returned a NULL dvp"));
14866 14860 return (EIO);
14867 14861 }
14868 14862 if (dvp->v_type == VNON)
14869 14863 dvp->v_type = VDIR;
14870 14864 ASSERT(dvp->v_type == VDIR);
14871 14865 if (VTOR4(vp)->r_flags & R4ISXATTR) {
14872 14866 mutex_enter(&dvp->v_lock);
14873 14867 dvp->v_flag |= V_XATTRDIR;
14874 14868 mutex_exit(&dvp->v_lock);
14875 14869 }
14876 14870 *dvpp = dvp;
14877 14871 return (0);
14878 14872 }
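/*
 * Illustrative sketch (editor's example, hypothetical caller): vtodv()
 * returns a held reference, so the caller must VN_RELE() the parent
 * when done:
 *
 *	vnode_t *dvp;
 *	int error;
 *
 *	error = vtodv(vp, &dvp, cr, TRUE);	(TRUE: not recovery)
 *	if (error == 0) {
 *		... use dvp, a held VDIR vnode ...
 *		VN_RELE(dvp);
 *	}
 */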
14879 14873
14880 14874 /*
14881 14875 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14882 14876 * length that fnamep can accept, including the trailing null.
14883 14877 * Returns 0 if okay, returns an errno value if there was a problem.
14884 14878 */
14885 14879
14886 14880 int
14887 14881 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14888 14882 {
14889 14883 char *fn;
14890 14884 int err = 0;
14891 14885 servinfo4_t *svp;
14892 14886 svnode_t *shvp;
14893 14887
14894 14888 /*
14895 14889 * If the file being opened has VROOT set, then this is
14896 14890 * a "file" mount. sv_name will not be interesting, so
14897 14891 * go back to the servinfo4 to get the original mount
14898 14892 * path and strip off all but the final edge. Otherwise
14899 14893 * just return the name from the shadow vnode.
14900 14894 */
14901 14895
14902 14896 if (vp->v_flag & VROOT) {
14903 14897
14904 14898 svp = VTOMI4(vp)->mi_curr_serv;
14905 14899 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14906 14900
14907 14901 fn = strrchr(svp->sv_path, '/');
14908 14902 if (fn == NULL)
14909 14903 err = EINVAL;
14910 14904 else
14911 14905 fn++;
14912 14906 } else {
14913 14907 shvp = VTOSV(vp);
14914 14908 fn = fn_name(shvp->sv_name);
14915 14909 }
14916 14910
14917 14911 if (err == 0)
14918 14912 if (strlen(fn) < maxlen)
14919 14913 (void) strcpy(fnamep, fn);
14920 14914 else
14921 14915 err = ENAMETOOLONG;
14922 14916
14923 14917 if (vp->v_flag & VROOT)
14924 14918 nfs_rw_exit(&svp->sv_lock);
14925 14919 else
14926 14920 kmem_free(fn, MAXNAMELEN);
14927 14921
14928 14922 return (err);
14929 14923 }
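/*
 * Illustrative sketch (editor's example): the destination buffer must
 * have room for the trailing null, or ENAMETOOLONG is returned:
 *
 *	char name[MAXNAMELEN];
 *
 *	if (vtoname(vp, name, sizeof (name)) == 0) {
 *		... name[] now holds the final path component of vp ...
 *	}
 */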
14930 14924
14931 14925 /*
14932 14926 * Bookkeeping for a close that doesn't need to go over the wire.
14933 14927 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14934 14928 * it is left at 1.
14935 14929 */
14936 14930 void
14937 14931 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14938 14932 {
14939 14933 rnode4_t *rp;
14940 14934 mntinfo4_t *mi;
14941 14935
14942 14936 mi = VTOMI4(vp);
14943 14937 rp = VTOR4(vp);
14944 14938
14945 14939 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14946 14940 "rp=%p osp=%p", (void *)rp, (void *)osp));
14947 14941 ASSERT(nfs_zone() == mi->mi_zone);
14948 14942 ASSERT(mutex_owned(&osp->os_sync_lock));
14949 14943 ASSERT(*have_lockp);
14950 14944
14951 14945 if (!osp->os_valid ||
14952 14946 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14953 14947 return;
14954 14948 }
14955 14949
14956 14950 /*
14957 14951 * This removes the reference obtained at OPEN; i.e.,
14958 14952 * when the open stream structure was created.
14959 14953 *
14960 14954 * We don't have to worry about calling 'open_stream_rele'
14961 14955 * since we are currently holding a reference to this
14962 14956 * open stream, which means the count cannot go to 0 with
14963 14957 * this decrement.
14964 14958 */
14965 14959 ASSERT(osp->os_ref_count >= 2);
14966 14960 osp->os_ref_count--;
14967 14961 osp->os_valid = 0;
14968 14962 mutex_exit(&osp->os_sync_lock);
14969 14963 *have_lockp = 0;
14970 14964
14971 14965 nfs4_dec_state_ref_count(mi);
14972 14966 }
14973 14967
14974 14968 /*
14975 14969 * Close all remaining open streams on the rnode. These open streams
14976 14970 * could be here because:
14977 14971 * - The close attempted at either close or delmap failed
14978 14972 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14979 14973 * - Someone did mknod on a regular file but never opened it
14980 14974 */
14981 14975 int
14982 14976 nfs4close_all(vnode_t *vp, cred_t *cr)
14983 14977 {
14984 14978 nfs4_open_stream_t *osp;
14985 14979 int error;
14986 14980 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14987 14981 rnode4_t *rp;
14988 14982
14989 14983 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14990 14984
14991 14985 error = 0;
14992 14986 rp = VTOR4(vp);
14993 14987
14994 14988 /*
14995 14989 * At this point, all we know is that the last time
14996 14990 * someone called vn_rele, the count was 1. Since then,
14997 14991 * the vnode could have been re-activated. We want to
14998 14992 * loop through the open streams and close each one, but
14999 14993 * we have to be careful since once we release the rnode
15000 14994 * hash bucket lock, someone else is free to come in and
15001 14995 * re-activate the rnode and add new open streams. The
15002 14996 * strategy is take the rnode hash bucket lock, verify that
15003 14997 * the count is still 1, grab the open stream off the
15004 14998 * head of the list and mark it invalid, then release the
15005 14999 * rnode hash bucket lock and proceed with that open stream.
15006 15000 * This is ok because nfs4close_one() will acquire the proper
15007 15001 * open/create to close/destroy synchronization for open
15008 15002 * streams, and will ensure that if someone has reopened
15009 15003 * the open stream after we've dropped the hash bucket lock
15010 15004 * then we'll just simply return without destroying the
15011 15005 * open stream.
15012 15006 * Repeat until the list is empty.
15013 15007 */
15014 15008
15015 15009 for (;;) {
15016 15010
15017 15011 /* make sure vnode hasn't been reactivated */
15018 15012 rw_enter(&rp->r_hashq->r_lock, RW_READER);
15019 15013 mutex_enter(&vp->v_lock);
15020 15014 if (vp->v_count > 1) {
15021 15015 mutex_exit(&vp->v_lock);
15022 15016 rw_exit(&rp->r_hashq->r_lock);
15023 15017 break;
15024 15018 }
15025 15019 /*
15026 15020 * Grabbing r_os_lock before releasing v_lock prevents
15027 15021 * a window where the rnode/open stream could get
15028 15022 * reactivated (and os_force_close set to 0) before we
15029 15023 * had a chance to set os_force_close to 1.
15030 15024 */
15031 15025 mutex_enter(&rp->r_os_lock);
15032 15026 mutex_exit(&vp->v_lock);
15033 15027
15034 15028 osp = list_head(&rp->r_open_streams);
15035 15029 if (!osp) {
15036 15030 /* nothing left to CLOSE OTW, so return */
15037 15031 mutex_exit(&rp->r_os_lock);
15038 15032 rw_exit(&rp->r_hashq->r_lock);
15039 15033 break;
15040 15034 }
15041 15035
15042 15036 mutex_enter(&rp->r_statev4_lock);
15043 15037 /* the file can't still be mem mapped */
15044 15038 ASSERT(rp->r_mapcnt == 0);
15045 15039 if (rp->created_v4)
15046 15040 rp->created_v4 = 0;
15047 15041 mutex_exit(&rp->r_statev4_lock);
15048 15042
15049 15043 /*
15050 15044 * Grab a ref on this open stream; nfs4close_one
15051 15045 * will mark it as invalid
15052 15046 */
15053 15047 mutex_enter(&osp->os_sync_lock);
15054 15048 osp->os_ref_count++;
15055 15049 osp->os_force_close = 1;
15056 15050 mutex_exit(&osp->os_sync_lock);
15057 15051 mutex_exit(&rp->r_os_lock);
15058 15052 rw_exit(&rp->r_hashq->r_lock);
15059 15053
15060 15054 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
15061 15055
15062 15056 /* Update error if it isn't already non-zero */
15063 15057 if (error == 0) {
15064 15058 if (e.error)
15065 15059 error = e.error;
15066 15060 else if (e.stat)
15067 15061 error = geterrno4(e.stat);
15068 15062 }
15069 15063
15070 15064 #ifdef DEBUG
15071 15065 nfs4close_all_cnt++;
15072 15066 #endif
15073 15067 /* Release the ref on osp acquired above. */
15074 15068 open_stream_rele(osp, rp);
15075 15069
15076 15070 /* Proceed to the next open stream, if any */
15077 15071 }
15078 15072 return (error);
15079 15073 }
15080 15074
15081 15075 /*
15082 15076 * nfs4close_one - close one open stream for a file if needed.
15083 15077 *
15084 15078 * "close_type" indicates which close path this is:
15085 15079 * CLOSE_NORM: close initiated via VOP_CLOSE.
15086 15080 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15087 15081 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15088 15082 * the close and release of client state for this open stream
15089 15083 * (unless someone else has the open stream open).
15090 15084 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15091 15085 * (e.g., due to abort because of a signal).
15092 15086 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15093 15087 *
15094 15088 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15095 15089 * recovery. Instead, the caller is expected to deal with retries.
15096 15090 *
15097 15091 * The caller can either pass in the osp ('provided_osp') or not.
15098 15092 *
15099 15093 * 'access_bits' represents the access we are closing/downgrading.
15100 15094 *
15101 15095 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15102 15096 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15103 15097 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15104 15098 *
15105 15099 * Errors are returned via the nfs4_error_t.
15106 15100 */
15107 15101 void
15108 15102 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15109 15103 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15110 15104 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15111 15105 uint_t mmap_flags)
15112 15106 {
15113 15107 nfs4_open_owner_t *oop;
15114 15108 nfs4_open_stream_t *osp = NULL;
15115 15109 int retry = 0;
15116 15110 int num_retries = NFS4_NUM_RECOV_RETRIES;
15117 15111 rnode4_t *rp;
15118 15112 mntinfo4_t *mi;
15119 15113 nfs4_recov_state_t recov_state;
15120 15114 cred_t *cred_otw = NULL;
15121 15115 bool_t recovonly = FALSE;
15122 15116 int isrecov;
15123 15117 int force_close;
15124 15118 int close_failed = 0;
15125 15119 int did_dec_count = 0;
15126 15120 int did_start_op = 0;
15127 15121 int did_force_recovlock = 0;
15128 15122 int did_start_seqid_sync = 0;
15129 15123 int have_sync_lock = 0;
15130 15124
15131 15125 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15132 15126
15133 15127 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15134 15128 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15135 15129 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15136 15130 len, maxprot, mmap_flags, access_bits));
15137 15131
15138 15132 nfs4_error_zinit(ep);
15139 15133 rp = VTOR4(vp);
15140 15134 mi = VTOMI4(vp);
15141 15135 isrecov = (close_type == CLOSE_RESEND ||
15142 15136 close_type == CLOSE_AFTER_RESEND);
15143 15137
15144 15138 /*
15145 15139 * First, get the open owner.
15146 15140 */
15147 15141 if (!provided_osp) {
15148 15142 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15149 15143 } else {
15150 15144 oop = provided_osp->os_open_owner;
15151 15145 ASSERT(oop != NULL);
15152 15146 open_owner_hold(oop);
15153 15147 }
15154 15148
15155 15149 if (!oop) {
15156 15150 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15157 15151 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15158 15152 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15159 15153 (void *)provided_osp, close_type));
15160 15154 ep->error = EIO;
15161 15155 goto out;
15162 15156 }
15163 15157
15164 15158 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15165 15159 recov_retry:
15166 15160 osp = NULL;
15167 15161 close_failed = 0;
15168 15162 force_close = (close_type == CLOSE_FORCE);
15169 15163 retry = 0;
15170 15164 did_start_op = 0;
15171 15165 did_force_recovlock = 0;
15172 15166 did_start_seqid_sync = 0;
15173 15167 have_sync_lock = 0;
15174 15168 recovonly = FALSE;
15175 15169 recov_state.rs_flags = 0;
15176 15170 recov_state.rs_num_retry_despite_err = 0;
15177 15171
15178 15172 /*
15179 15173 * Second, synchronize with recovery.
15180 15174 */
15181 15175 if (!isrecov) {
15182 15176 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15183 15177 &recov_state, &recovonly);
15184 15178 if (!ep->error) {
15185 15179 did_start_op = 1;
15186 15180 } else {
15187 15181 close_failed = 1;
15188 15182 /*
15189 15183 * If we couldn't get start_fop, but have to
15190 15184 * cleanup state, then at least acquire the
15191 15185 * mi_recovlock so we can synchronize with
15192 15186 * recovery.
15193 15187 */
15194 15188 if (close_type == CLOSE_FORCE) {
15195 15189 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15196 15190 RW_READER, FALSE);
15197 15191 did_force_recovlock = 1;
15198 15192 } else
15199 15193 goto out;
15200 15194 }
15201 15195 }
15202 15196
15203 15197 /*
15204 15198 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15205 15199 * set 'recovonly' to TRUE since most likely this is due to
15206 15200 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
15207 15201 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15208 15202 * to retry, causing us to loop until recovery finishes. Plus we
15209 15203 * don't need protection over the open seqid since we're not going
15210 15204 * OTW, hence don't need to use the seqid.
15211 15205 */
15212 15206 if (recovonly == FALSE) {
15213 15207 /* need to grab the open owner sync before 'os_sync_lock' */
15214 15208 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15215 15209 if (ep->error == EAGAIN) {
15216 15210 ASSERT(!isrecov);
15217 15211 if (did_start_op)
15218 15212 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15219 15213 &recov_state, TRUE);
15220 15214 if (did_force_recovlock)
15221 15215 nfs_rw_exit(&mi->mi_recovlock);
15222 15216 goto recov_retry;
15223 15217 }
15224 15218 did_start_seqid_sync = 1;
15225 15219 }
15226 15220
15227 15221 /*
15228 15222 * Third, get an open stream and acquire 'os_sync_lock' to
15229 15223 * synchronize the opening/creating of an open stream with the
15230 15224 * closing/destroying of an open stream.
15231 15225 */
15232 15226 if (!provided_osp) {
15233 15227 /* returns with 'os_sync_lock' held */
15234 15228 osp = find_open_stream(oop, rp);
15235 15229 if (!osp) {
15236 15230 ep->error = EIO;
15237 15231 goto out;
15238 15232 }
15239 15233 } else {
15240 15234 osp = provided_osp;
15241 15235 open_stream_hold(osp);
15242 15236 mutex_enter(&osp->os_sync_lock);
15243 15237 }
15244 15238 have_sync_lock = 1;
15245 15239
15246 15240 ASSERT(oop == osp->os_open_owner);
15247 15241
15248 15242 /*
15249 15243 * Fourth, do any special pre-OTW CLOSE processing
15250 15244 * based on the specific close type.
15251 15245 */
15252 15246 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15253 15247 !did_dec_count) {
15254 15248 ASSERT(osp->os_open_ref_count > 0);
15255 15249 osp->os_open_ref_count--;
15256 15250 did_dec_count = 1;
15257 15251 if (osp->os_open_ref_count == 0)
15258 15252 osp->os_final_close = 1;
15259 15253 }
15260 15254
15261 15255 if (close_type == CLOSE_FORCE) {
15262 15256 /* see if somebody reopened the open stream. */
15263 15257 if (!osp->os_force_close) {
15264 15258 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15265 15259 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15266 15260 "was reopened, vp %p", (void *)osp, (void *)vp));
15267 15261 ep->error = 0;
15268 15262 ep->stat = NFS4_OK;
15269 15263 goto out;
15270 15264 }
15271 15265
15272 15266 if (!osp->os_final_close && !did_dec_count) {
15273 15267 osp->os_open_ref_count--;
15274 15268 did_dec_count = 1;
15275 15269 }
15276 15270
15277 15271 /*
15278 15272 * We can't depend on os_open_ref_count being 0 due to the
15279 15273 * way executables are opened (VN_RELE to match a VOP_OPEN).
15280 15274 */
15281 15275 #ifdef NOTYET
15282 15276 ASSERT(osp->os_open_ref_count == 0);
15283 15277 #endif
15284 15278 if (osp->os_open_ref_count != 0) {
15285 15279 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15286 15280 "nfs4close_one: should panic here on an "
15287 15281 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15288 15282 "since this is probably the exec problem."));
15289 15283
15290 15284 osp->os_open_ref_count = 0;
15291 15285 }
15292 15286
15293 15287 /*
15294 15288 * There is the possibility that nfs4close_one()
15295 15289 * for close_type == CLOSE_DELMAP couldn't find the
15296 15290 * open stream, thus couldn't decrement its os_mapcnt;
15297 15291 * therefore we can't use this ASSERT yet.
15298 15292 */
15299 15293 #ifdef NOTYET
15300 15294 ASSERT(osp->os_mapcnt == 0);
15301 15295 #endif
15302 15296 osp->os_mapcnt = 0;
15303 15297 }
15304 15298
15305 15299 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15306 15300 ASSERT(osp->os_mapcnt >= btopr(len));
15307 15301
15308 15302 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15309 15303 osp->os_mmap_write -= btopr(len);
15310 15304 if (maxprot & PROT_READ)
15311 15305 osp->os_mmap_read -= btopr(len);
15312 15306 if (maxprot & PROT_EXEC)
15313 15307 osp->os_mmap_read -= btopr(len);
15314 15308 /* mirror the PROT_NONE check in nfs4_addmap() */
15315 15309 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15316 15310 !(maxprot & PROT_EXEC))
15317 15311 osp->os_mmap_read -= btopr(len);
15318 15312 osp->os_mapcnt -= btopr(len);
15319 15313 did_dec_count = 1;
15320 15314 }
15321 15315
15322 15316 if (recovonly) {
15323 15317 nfs4_lost_rqst_t lost_rqst;
15324 15318
15325 15319 /* request should not already be in recovery queue */
15326 15320 ASSERT(lrp == NULL);
15327 15321 nfs4_error_init(ep, EINTR);
15328 15322 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15329 15323 osp, cred_otw, vp);
15330 15324 mutex_exit(&osp->os_sync_lock);
15331 15325 have_sync_lock = 0;
15332 15326 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15333 15327 lost_rqst.lr_op == OP_CLOSE ?
15334 15328 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15335 15329 close_failed = 1;
15336 15330 force_close = 0;
15337 15331 goto close_cleanup;
15338 15332 }
15339 15333
15340 15334 /*
15341 15335 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15342 15336 * we stopped operating on the open owner's <old oo_name, old seqid>
15343 15337 * space, which means we stopped operating on the open stream
15344 15338 * too. So don't go OTW (as the seqid is likely bad, and the
15345 15339 * stateid could be stale, potentially triggering a false
15346 15340 * setclientid), and just clean up the client's internal state.
15347 15341 */
15348 15342 if (osp->os_orig_oo_name != oop->oo_name) {
15349 15343 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15350 15344 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15351 15345 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15352 15346 "oo_name %" PRIx64")",
15353 15347 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15354 15348 oop->oo_name));
15355 15349 close_failed = 1;
15356 15350 }
15357 15351
15358 15352 /* If the file failed recovery, just quit. */
15359 15353 mutex_enter(&rp->r_statelock);
15360 15354 if (rp->r_flags & R4RECOVERR) {
15361 15355 close_failed = 1;
15362 15356 }
15363 15357 mutex_exit(&rp->r_statelock);
15364 15358
15365 15359 /*
15366 15360 * If the force close path failed to obtain start_fop
15367 15361 * then skip the OTW close and just remove the state.
15368 15362 */
15369 15363 if (close_failed)
15370 15364 goto close_cleanup;
15371 15365
15372 15366 /*
15373 15367 * Fifth, check to see if there are still mapped pages or other
15374 15368 * opens using this open stream. If there are then we can't
15375 15369 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15376 15370 */
15377 15371 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15378 15372 nfs4_lost_rqst_t new_lost_rqst;
15379 15373 bool_t needrecov = FALSE;
15380 15374 cred_t *odg_cred_otw = NULL;
15381 15375 seqid4 open_dg_seqid = 0;
15382 15376
15383 15377 if (osp->os_delegation) {
15384 15378 /*
15385 15379 * If this open stream was never OPENed OTW then we
15386 15380 * surely can't DOWNGRADE it (especially since the
15387 15381 * osp->open_stateid is really a delegation stateid
15388 15382 * when os_delegation is 1).
15389 15383 */
15390 15384 if (access_bits & FREAD)
15391 15385 osp->os_share_acc_read--;
15392 15386 if (access_bits & FWRITE)
15393 15387 osp->os_share_acc_write--;
15394 15388 osp->os_share_deny_none--;
15395 15389 nfs4_error_zinit(ep);
15396 15390 goto out;
15397 15391 }
15398 15392 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15399 15393 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15400 15394 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15401 15395 if (needrecov && !isrecov) {
15402 15396 bool_t abort;
15403 15397 nfs4_bseqid_entry_t *bsep = NULL;
15404 15398
15405 15399 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15406 15400 bsep = nfs4_create_bseqid_entry(oop, NULL,
15407 15401 vp, 0,
15408 15402 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15409 15403 open_dg_seqid);
15410 15404
15411 15405 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15412 15406 oop, osp, odg_cred_otw, vp, access_bits, 0);
15413 15407 mutex_exit(&osp->os_sync_lock);
15414 15408 have_sync_lock = 0;
15415 15409 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15416 15410 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15417 15411 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15418 15412 bsep, NULL, NULL);
15419 15413 if (odg_cred_otw)
15420 15414 crfree(odg_cred_otw);
15421 15415 if (bsep)
15422 15416 kmem_free(bsep, sizeof (*bsep));
15423 15417
15424 15418 if (abort == TRUE)
15425 15419 goto out;
15426 15420
15427 15421 if (did_start_seqid_sync) {
15428 15422 nfs4_end_open_seqid_sync(oop);
15429 15423 did_start_seqid_sync = 0;
15430 15424 }
15431 15425 open_stream_rele(osp, rp);
15432 15426
15433 15427 if (did_start_op)
15434 15428 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15435 15429 &recov_state, FALSE);
15436 15430 if (did_force_recovlock)
15437 15431 nfs_rw_exit(&mi->mi_recovlock);
15438 15432
15439 15433 goto recov_retry;
15440 15434 } else {
15441 15435 if (odg_cred_otw)
15442 15436 crfree(odg_cred_otw);
15443 15437 }
15444 15438 goto out;
15445 15439 }
15446 15440
15447 15441 /*
15448 15442 * If this open stream was created as the result of an open
15449 15443 * while holding a delegation, then just release it; no need
15450 15444 * to do an OTW close. Otherwise do a "normal" OTW close.
15451 15445 */
15452 15446 if (osp->os_delegation) {
15453 15447 nfs4close_notw(vp, osp, &have_sync_lock);
15454 15448 nfs4_error_zinit(ep);
15455 15449 goto out;
15456 15450 }
15457 15451
15458 15452 /*
15459 15453 * If this stream is not valid, we're done.
15460 15454 */
15461 15455 if (!osp->os_valid) {
15462 15456 nfs4_error_zinit(ep);
15463 15457 goto out;
15464 15458 }
15465 15459
15466 15460 /*
15467 15461 * Last open or mmap ref has vanished, need to do an OTW close.
15468 15462 * First check to see if a close is still necessary.
15469 15463 */
15470 15464 if (osp->os_failed_reopen) {
15471 15465 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15472 15466 "don't close OTW osp %p since reopen failed.",
15473 15467 (void *)osp));
15474 15468 /*
15475 15469 * Reopen of the open stream failed, hence the
15476 15470 * stateid of the open stream is invalid/stale, and
15477 15471 * sending this OTW would incorrectly cause another
15478 15472 * round of recovery. In this case, we need to set
15479 15473 * the 'os_valid' bit to 0 so another thread doesn't
15480 15474 * come in and re-open this open stream before
15481 15475 * this "closing" thread cleans up state (decrementing
15482 15476 * the nfs4_server_t's state_ref_count and decrementing
15483 15477 * the os_ref_count).
15484 15478 */
15485 15479 osp->os_valid = 0;
15486 15480 /*
15487 15481 * This removes the reference obtained at OPEN; ie,
15488 15482 * when the open stream structure was created.
15489 15483 *
15490 15484 * We don't have to worry about calling 'open_stream_rele'
15491 15485 * since we are currently holding a reference to this
15492 15486 * open stream, which means the count cannot go to 0 with
15493 15487 * this decrement.
15494 15488 */
15495 15489 ASSERT(osp->os_ref_count >= 2);
15496 15490 osp->os_ref_count--;
15497 15491 nfs4_error_zinit(ep);
15498 15492 close_failed = 0;
15499 15493 goto close_cleanup;
15500 15494 }
15501 15495
15502 15496 ASSERT(osp->os_ref_count > 1);
15503 15497
15504 15498 /*
15505 15499 * Sixth, try the CLOSE OTW.
15506 15500 */
15507 15501 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15508 15502 close_type, ep, &have_sync_lock);
15509 15503
15510 15504 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15511 15505 /*
15512 15506 * Let the recovery thread be responsible for
15513 15507 * removing the state for CLOSE.
15514 15508 */
15515 15509 close_failed = 1;
15516 15510 force_close = 0;
15517 15511 retry = 0;
15518 15512 }
15519 15513
15520 15514 /* See if we need to retry with a different cred */
15521 15515 if ((ep->error == EACCES ||
15522 15516 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15523 15517 cred_otw != cr) {
15524 15518 crfree(cred_otw);
15525 15519 cred_otw = cr;
15526 15520 crhold(cred_otw);
15527 15521 retry = 1;
15528 15522 }
15529 15523
15530 15524 if (ep->error || ep->stat)
15531 15525 close_failed = 1;
15532 15526
15533 15527 if (retry && !isrecov && num_retries-- > 0) {
15534 15528 if (have_sync_lock) {
15535 15529 mutex_exit(&osp->os_sync_lock);
15536 15530 have_sync_lock = 0;
15537 15531 }
15538 15532 if (did_start_seqid_sync) {
15539 15533 nfs4_end_open_seqid_sync(oop);
15540 15534 did_start_seqid_sync = 0;
15541 15535 }
15542 15536 open_stream_rele(osp, rp);
15543 15537
15544 15538 if (did_start_op)
15545 15539 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15546 15540 &recov_state, FALSE);
15547 15541 if (did_force_recovlock)
15548 15542 nfs_rw_exit(&mi->mi_recovlock);
15549 15543 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15550 15544 "nfs4close_one: need to retry the close "
15551 15545 "operation"));
15552 15546 goto recov_retry;
15553 15547 }
15554 15548 close_cleanup:
15555 15549 /*
15556 15550 * Seventh and lastly, process our results.
15557 15551 */
15558 15552 if (close_failed && force_close) {
15559 15553 /*
15560 15554 * It's ok to drop and regrab the 'os_sync_lock' since
15561 15555 * nfs4close_notw() will recheck to make sure the
15562 15556 * "close"/removal of state should happen.
15563 15557 */
15564 15558 if (!have_sync_lock) {
15565 15559 mutex_enter(&osp->os_sync_lock);
15566 15560 have_sync_lock = 1;
15567 15561 }
15568 15562 /*
15569 15563 * This is last call, remove the ref on the open
15570 15564 * stream created by open and clean everything up.
15571 15565 */
15572 15566 osp->os_pending_close = 0;
15573 15567 nfs4close_notw(vp, osp, &have_sync_lock);
15574 15568 nfs4_error_zinit(ep);
15575 15569 }
15576 15570
15577 15571 if (!close_failed) {
15578 15572 if (have_sync_lock) {
15579 15573 osp->os_pending_close = 0;
15580 15574 mutex_exit(&osp->os_sync_lock);
15581 15575 have_sync_lock = 0;
15582 15576 } else {
15583 15577 mutex_enter(&osp->os_sync_lock);
15584 15578 osp->os_pending_close = 0;
15585 15579 mutex_exit(&osp->os_sync_lock);
15586 15580 }
15587 15581 if (did_start_op && recov_state.rs_sp != NULL) {
15588 15582 mutex_enter(&recov_state.rs_sp->s_lock);
15589 15583 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15590 15584 mutex_exit(&recov_state.rs_sp->s_lock);
15591 15585 } else {
15592 15586 nfs4_dec_state_ref_count(mi);
15593 15587 }
15594 15588 nfs4_error_zinit(ep);
15595 15589 }
15596 15590
15597 15591 out:
15598 15592 if (have_sync_lock)
15599 15593 mutex_exit(&osp->os_sync_lock);
15600 15594 if (did_start_op)
15601 15595 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15602 15596 recovonly ? TRUE : FALSE);
15603 15597 if (did_force_recovlock)
15604 15598 nfs_rw_exit(&mi->mi_recovlock);
15605 15599 if (cred_otw)
15606 15600 crfree(cred_otw);
15607 15601 if (osp)
15608 15602 open_stream_rele(osp, rp);
15609 15603 if (oop) {
15610 15604 if (did_start_seqid_sync)
15611 15605 nfs4_end_open_seqid_sync(oop);
15612 15606 open_owner_rele(oop);
15613 15607 }
15614 15608 }
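/*
 * Illustrative sketch (editor's example, values assumed): a normal
 * VOP_CLOSE-initiated close of a read-only open could be issued as:
 *
 *	nfs4_error_t e;
 *
 *	nfs4close_one(vp, NULL, cr, FREAD, NULL, &e, CLOSE_NORM,
 *	    0, 0, 0);
 *	if (e.error != 0 || e.stat != NFS4_OK)
 *		... close failed; e carries errno vs. NFSv4 status ...
 *
 * Passing provided_osp == NULL makes the function look up the open
 * stream itself via the open owner found for 'cr'.
 */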
15615 15609
15616 15610 /*
15617 15611 * Convert information returned by the server in the LOCK4denied
15618 15612 * structure to the form required by fcntl.
15619 15613 */
15620 15614 static void
15621 15615 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15622 15616 {
15623 15617 nfs4_lo_name_t *lo;
15624 15618
15625 15619 #ifdef DEBUG
15626 15620 if (denied_to_flk_debug) {
15627 15621 lockt_denied_debug = lockt_denied;
15628 15622 debug_enter("lockt_denied");
15629 15623 }
15630 15624 #endif
15631 15625
15632 15626 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15633 15627 flk->l_whence = 0; /* aka SEEK_SET */
15634 15628 flk->l_start = lockt_denied->offset;
15635 15629 flk->l_len = lockt_denied->length;
15636 15630
15637 15631 /*
15638 15632 * If the blocking clientid matches our client id, then we can
15639 15633 * interpret the lockowner (since we built it). If not, then
15640 15634 * fabricate a sysid and pid. Note that the l_sysid field
15641 15635 * in *flk already has the local sysid.
15642 15636 */
15643 15637
15644 15638 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15645 15639
15646 15640 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15647 15641 lo = (nfs4_lo_name_t *)
15648 15642 lockt_denied->owner.owner_val;
15649 15643
15650 15644 flk->l_pid = lo->ln_pid;
15651 15645 } else {
15652 15646 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15653 15647 "denied_to_flk: bad lock owner length\n"));
15654 15648
15655 15649 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15656 15650 }
15657 15651 } else {
15658 15652 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15659 15653 "denied_to_flk: foreign clientid\n"));
15660 15654
15661 15655 /*
15662 15656 * Construct a new sysid which should be different from
15663 15657 * sysids of other systems.
15664 15658 */
15665 15659
15666 15660 flk->l_sysid++;
15667 15661 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15668 15662 }
15669 15663 }
15670 15664
15671 15665 static pid_t
15672 15666 lo_to_pid(lock_owner4 *lop)
15673 15667 {
15674 15668 pid_t pid = 0;
15675 15669 uchar_t *cp;
15676 15670 int i;
15677 15671
15678 15672 cp = (uchar_t *)&lop->clientid;
15679 15673
15680 15674 for (i = 0; i < sizeof (lop->clientid); i++)
15681 15675 pid += (pid_t)*cp++;
15682 15676
15683 15677 cp = (uchar_t *)lop->owner_val;
15684 15678
15685 15679 for (i = 0; i < lop->owner_len; i++)
15686 15680 pid += (pid_t)*cp++;
15687 15681
15688 15682 return (pid);
15689 15683 }
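/*
 * Worked example (editor's illustration, values assumed): for a
 * foreign clientid whose eight bytes are {0, 0, 0, 0, 0, 0, 0x01,
 * 0x02} and a 4-byte owner {0x01, 0x02, 0x03, 0x04}, lo_to_pid()
 * returns (0x01 + 0x02) + (0x01 + 0x02 + 0x03 + 0x04) = 13. The
 * fabricated pid is stable for a given owner but not guaranteed
 * unique; it only needs to identify the blocking owner well enough
 * for F_GETLK reporting.
 */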
15690 15684
15691 15685 /*
15692 15686 * Given a lock pointer, returns the end of that lock.
15693 15687 * "end" is the last locked offset that "l_len" covers from
15694 15688 * the start of the lock.
15695 15689 */
15696 15690 static off64_t
15697 15691 lock_to_end(flock64_t *lock)
15698 15692 {
15699 15693 off64_t lock_end;
15700 15694
15701 15695 if (lock->l_len == 0)
15702 15696 lock_end = (off64_t)MAXEND;
15703 15697 else
15704 15698 lock_end = lock->l_start + lock->l_len - 1;
15705 15699
15706 15700 return (lock_end);
15707 15701 }
15708 15702
15709 15703 /*
15710 15704 * Given the start and end of a lock, return the length "l_len" for that lock.
15711 15705 */
15712 15706 static off64_t
15713 15707 end_to_len(off64_t start, off64_t end)
15714 15708 {
15715 15709 off64_t lock_len;
15716 15710
15717 15711 ASSERT(end >= start);
15718 15712 if (end == MAXEND)
15719 15713 lock_len = 0;
15720 15714 else
15721 15715 lock_len = end - start + 1;
15722 15716
15723 15717 return (lock_len);
15724 15718 }
15725 15719
15726 15720 /*
15727 15721 * Given the end of a lock, return it unchanged if it is the last
15728 15722 * possible locked offset (MAXEND); otherwise add one to produce a
15729 15723 * valid start offset for the following range.
15730 15724 */
15731 15725 static off64_t
15732 15726 start_check(off64_t x)
15733 15727 {
15734 15728 if (x == MAXEND)
15735 15729 return (x);
15736 15730 else
15737 15731 return (x + 1);
15738 15732 }
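/*
 * Worked example for the three helpers above (offsets assumed for
 * illustration): a lock with l_start = 0x10 and l_len = 0x10 has
 * lock_to_end() == 0x1f (last locked offset), end_to_len(0x10, 0x1f)
 * == 0x10 (recovering the original length), and start_check(0x1f) ==
 * 0x20 (first offset past the lock). A to-EOF lock (l_len == 0) has
 * end MAXEND; end_to_len(start, MAXEND) and start_check(MAXEND)
 * preserve that encoding, returning 0 and MAXEND respectively.
 */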
15739 15733
15740 15734 /*
15741 15735 * See if these two locks overlap, and if so return 1;
15742 15736 * otherwise, return 0.
15743 15737 */
15744 15738 static int
15745 15739 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15746 15740 {
15747 15741 off64_t llfp_end, curfp_end;
15748 15742
15749 15743 llfp_end = lock_to_end(llfp);
15750 15744 curfp_end = lock_to_end(curfp);
15751 15745
15752 15746 if (((llfp_end >= curfp->l_start) &&
15753 15747 (llfp->l_start <= curfp->l_start)) ||
15754 15748 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15755 15749 return (1);
15756 15750 return (0);
15757 15751 }
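/*
 * Worked example (editor's illustration, ranges assumed): locks
 * [0x00-0x0f] and [0x08-0x17] intersect, so locks_intersect() returns
 * 1; [0x00-0x0f] and [0x10-0x17] do not, since the first lock's last
 * offset (0x0f) lies below the second's l_start, so it returns 0.
 */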
15758 15752
15759 15753 /*
15760 15754 * Determine what the intersecting lock region is, and add that to the
15761 15755 * 'nl_llpp' locklist in increasing order (by l_start).
15762 15756 */
15763 15757 static void
15764 15758 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15765 15759 locklist_t **nl_llpp, vnode_t *vp)
15766 15760 {
15767 15761 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15768 15762 off64_t lost_flp_end, local_flp_end, len, start;
15769 15763
15770 15764 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15771 15765
15772 15766 if (!locks_intersect(lost_flp, local_flp))
15773 15767 return;
15774 15768
15775 15769 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15776 15770 "locks intersect"));
15777 15771
15778 15772 lost_flp_end = lock_to_end(lost_flp);
15779 15773 local_flp_end = lock_to_end(local_flp);
15780 15774
15781 15775 /* Find the starting point of the intersecting region */
15782 15776 if (local_flp->l_start > lost_flp->l_start)
15783 15777 start = local_flp->l_start;
15784 15778 else
15785 15779 start = lost_flp->l_start;
15786 15780
15787 15781 /* Find the length of the intersecting region */
15788 15782 if (lost_flp_end < local_flp_end)
15789 15783 len = end_to_len(start, lost_flp_end);
15790 15784 else
15791 15785 len = end_to_len(start, local_flp_end);
15792 15786
15793 15787 /*
15794 15788 * Prepare the flock structure for the intersection found and insert
15795 15789 * it into the new list in increasing l_start order. This list contains
15796 15790 * intersections of locks registered by the client with the local host
15797 15791 * and the lost lock.
15798 15792 * The lock type of this lock is the same as that of the local_flp.
15799 15793 */
15800 15794 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15801 15795 intersect_llp->ll_flock.l_start = start;
15802 15796 intersect_llp->ll_flock.l_len = len;
15803 15797 intersect_llp->ll_flock.l_type = local_flp->l_type;
15804 15798 intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15805 15799 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15806 15800 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */
15807 15801 intersect_llp->ll_vp = vp;
15808 15802
15809 15803 tmp_fllp = *nl_llpp;
15810 15804 cur_fllp = NULL;
15811 15805 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15812 15806 intersect_llp->ll_flock.l_start) {
15813 15807 cur_fllp = tmp_fllp;
15814 15808 tmp_fllp = tmp_fllp->ll_next;
15815 15809 }
15816 15810 if (cur_fllp == NULL) {
15817 15811 /* first on the list */
15818 15812 intersect_llp->ll_next = *nl_llpp;
15819 15813 *nl_llpp = intersect_llp;
15820 15814 } else {
15821 15815 intersect_llp->ll_next = cur_fllp->ll_next;
15822 15816 cur_fllp->ll_next = intersect_llp;
15823 15817 }
15824 15818
15825 15819 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15826 15820 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15827 15821 intersect_llp->ll_flock.l_start,
15828 15822 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15829 15823 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15830 15824 }
15831 15825
15832 15826 /*
15833 15827 * Our local locking current state is potentially different than
15834 15828 * what the NFSv4 server thinks we have due to a lost lock that was
15835 15829 * resent and then received. We need to reset our "NFSv4" locking
15836 15830 * state to match the current local locking state for this pid, since
15837 15831 * that is the state the user/application actually observes.
15838 15832 *
15839 15833 * We cannot afford to drop the open/lock seqid sync since then we can
15840 15834 * get confused about what the current local locking state "is" versus
15841 15835 * "was".
15842 15836 *
15843 15837 * If we are unable to fix up the locks, we send SIGLOST to the affected
15844 15838 * process. This is not done if the filesystem has been forcibly
15845 15839 * unmounted, in case the process has already exited and a new process
15846 15840 * exists with the same pid.
15847 15841 */
15848 15842 static void
15849 15843 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15850 15844 nfs4_lock_owner_t *lop)
15851 15845 {
15852 15846 locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15853 15847 mntinfo4_t *mi = VTOMI4(vp);
15854 15848 const int cmd = F_SETLK;
15855 15849 off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15856 15850 flock64_t ul_fl;
15857 15851
15858 15852 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15859 15853 "nfs4_reinstitute_local_lock_state"));
15860 15854
15861 15855 /*
15862 15856 * Find active locks for this vp from the local locking code.
15863 15857 * Scan through this list and find out the locks that intersect with
15864 15858 * the lost lock. Once we find the lock that intersects, add the
15865 15859 * intersection area as a new lock to a new list "ri_llp". The lock
15866 15860 * type of the intersection region lock added to ri_llp is the same
15867 15861 * as that found in the active lock list, "list". The intersecting
15868 15862 * region locks are added to ri_llp in increasing l_start order.
15869 15863 */
15870 15864 ASSERT(nfs_zone() == mi->mi_zone);
15871 15865
15872 15866 locks = flk_active_locks_for_vp(vp);
15873 15867 ri_llp = NULL;
15874 15868
15875 15869 for (llp = locks; llp != NULL; llp = llp->ll_next) {
15876 15870 ASSERT(llp->ll_vp == vp);
15877 15871 /*
15878 15872 * Pick locks that belong to this pid/lockowner
15879 15873 */
15880 15874 if (llp->ll_flock.l_pid != lost_flp->l_pid)
15881 15875 continue;
15882 15876
15883 15877 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15884 15878 }
15885 15879
15886 15880 /*
15887 15881 * Now we have the list of intersections with the lost lock. These are
15888 15882 * the locks that were/are active before the server replied to the
15889 15883 * last/lost lock. Issue these locks to the server here. Playing these
15890 15884 * locks to the server will re-establish our current local locking state
15891 15885 * with the v4 server.
15892 15886 * If we get an error, send SIGLOST to the application for that lock.
15893 15887 */
15894 15888
15895 15889 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15896 15890 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15897 15891 "nfs4_reinstitute_local_lock_state: need to issue "
15898 15892 "flock: [%"PRIx64" - %"PRIx64"] : %s",
15899 15893 llp->ll_flock.l_start,
15900 15894 llp->ll_flock.l_start + llp->ll_flock.l_len,
15901 15895 llp->ll_flock.l_type == F_RDLCK ? "READ" :
15902 15896 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15903 15897 /*
15904 15898 * No need to relock what we already have
15905 15899 */
15906 15900 if (llp->ll_flock.l_type == lost_flp->l_type)
15907 15901 continue;
15908 15902
15909 15903 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15910 15904 }
15911 15905
15912 15906 /*
15913 15907 * Now, keeping the start of the lost lock as our reference, parse the
15914 15908 * newly created ri_llp locklist to find the ranges that we have locked
15915 15909 * with the v4 server but not in the current local locking. We need
15916 15910 * to unlock these ranges.
15917 15911 * These can also be described as the ranges where the lost lock
15918 15912 * does not overlap with the locks in ri_llp but which the server
15919 15913 * has considered locked since it replied to the lost lock.
15920 15914 */
15921 15915 cur_start = lost_flp->l_start;
15922 15916 lost_flp_end = lock_to_end(lost_flp);
15923 15917
15924 15918 ul_fl.l_type = F_UNLCK;
15925 15919 ul_fl.l_whence = 0; /* aka SEEK_SET */
15926 15920 ul_fl.l_sysid = lost_flp->l_sysid;
15927 15921 ul_fl.l_pid = lost_flp->l_pid;
15928 15922
15929 15923 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15930 15924 llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15931 15925
15932 15926 if (llp->ll_flock.l_start <= cur_start) {
15933 15927 cur_start = start_check(llp_ll_flock_end);
15934 15928 continue;
15935 15929 }
15936 15930 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15937 15931 "nfs4_reinstitute_local_lock_state: "
15938 15932 "UNLOCK [%"PRIx64" - %"PRIx64"]",
15939 15933 cur_start, llp->ll_flock.l_start));
15940 15934
15941 15935 ul_fl.l_start = cur_start;
15942 15936 ul_fl.l_len = end_to_len(cur_start,
15943 15937 (llp->ll_flock.l_start - 1));
15944 15938
15945 15939 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15946 15940 cur_start = start_check(llp_ll_flock_end);
15947 15941 }
15948 15942
15949 15943 /*
15950 15944 * In the case where the lost lock ends after all intersecting locks,
15951 15945 * unlock the last part of the lost lock range.
15952 15946 */
15953 15947 if (cur_start != start_check(lost_flp_end)) {
15954 15948 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15955 15949 "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15956 15950 "lost lock region [%"PRIx64" - %"PRIx64"]",
15957 15951 cur_start, lost_flp->l_start + lost_flp->l_len));
15958 15952
15959 15953 ul_fl.l_start = cur_start;
15960 15954 /*
15961 15955 * Is it a to-EOF lock? If so, unlock to the end.
15962 15956 */
15963 15957 if (lost_flp->l_len == 0)
15964 15958 ul_fl.l_len = 0;
15965 15959 else
15966 15960 ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15967 15961
15968 15962 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15969 15963 }
15970 15964
15971 15965 if (locks != NULL)
15972 15966 flk_free_locklist(locks);
15973 15967
15974 15968 /* Free up our newly created locklist */
15975 15969 for (llp = ri_llp; llp != NULL; ) {
15976 15970 tmp_llp = llp->ll_next;
15977 15971 kmem_free(llp, sizeof (locklist_t));
15978 15972 llp = tmp_llp;
15979 15973 }
15980 15974
15981 15975 /*
15982 15976 * Now return to the original caller, nfs4frlock(),
15983 15977 * which will naturally drop our seqid syncs.
15984 15978 */
15985 15979 }
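/*
 * Worked example (editor's illustration, ranges assumed): suppose the
 * lost lock covers [0x00-0x63] and the local lock manager shows this
 * pid holding [0x0a-0x13] and [0x28-0x31]. The code above reissues
 * those two intersections to the server (when their type differs from
 * the lost lock's), then unlocks the uncovered gaps [0x00-0x09],
 * [0x14-0x27], and [0x32-0x63], leaving the server's view of this
 * pid's locks identical to the local one.
 */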
15986 15980
15987 15981 /*
15988 15982 * Create a lost state record for the given lock reinstantiation request
15989 15983 * and push it onto the lost state queue.
15990 15984 */
15991 15985 static void
15992 15986 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15993 15987 nfs4_lock_owner_t *lop)
15994 15988 {
15995 15989 nfs4_lost_rqst_t req;
15996 15990 nfs_lock_type4 locktype;
15997 15991 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15998 15992
15999 15993 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
16000 15994
16001 15995 locktype = flk_to_locktype(cmd, flk->l_type);
16002 15996 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
16003 15997 NULL, NULL, lop, flk, &req, cr, vp);
16004 15998 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
16005 15999 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
16006 16000 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
16007 16001 NULL, NULL, NULL);
16008 16002 }