1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2016 STRATO AG. All rights reserved.
24 */
25
26 /*
27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 /*
31 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
32 * Use is subject to license terms.
33 */
34
35 /*
36 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37 * All Rights Reserved
38 */
39
40 /*
41 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
42 */
43
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/systm.h>
47 #include <sys/cred.h>
48 #include <sys/time.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vfs_opreg.h>
52 #include <sys/file.h>
53 #include <sys/filio.h>
54 #include <sys/uio.h>
55 #include <sys/buf.h>
56 #include <sys/mman.h>
57 #include <sys/pathname.h>
58 #include <sys/dirent.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/fcntl.h>
62 #include <sys/flock.h>
63 #include <sys/swap.h>
64 #include <sys/errno.h>
65 #include <sys/strsubr.h>
66 #include <sys/sysmacros.h>
67 #include <sys/kmem.h>
68 #include <sys/cmn_err.h>
69 #include <sys/pathconf.h>
70 #include <sys/utsname.h>
71 #include <sys/dnlc.h>
72 #include <sys/acl.h>
73 #include <sys/systeminfo.h>
74 #include <sys/policy.h>
75 #include <sys/sdt.h>
76 #include <sys/list.h>
77 #include <sys/stat.h>
78 #include <sys/zone.h>
79
80 #include <rpc/types.h>
81 #include <rpc/auth.h>
82 #include <rpc/clnt.h>
83
84 #include <nfs/nfs.h>
85 #include <nfs/nfs_clnt.h>
86 #include <nfs/nfs_acl.h>
87 #include <nfs/lm.h>
88 #include <nfs/nfs4.h>
89 #include <nfs/nfs4_kprot.h>
90 #include <nfs/rnode4.h>
91 #include <nfs/nfs4_clnt.h>
92
93 #include <vm/hat.h>
94 #include <vm/as.h>
95 #include <vm/page.h>
96 #include <vm/pvn.h>
97 #include <vm/seg.h>
98 #include <vm/seg_map.h>
99 #include <vm/seg_kpm.h>
100 #include <vm/seg_vn.h>
101
102 #include <fs/fs_subr.h>
103
104 #include <sys/ddi.h>
105 #include <sys/int_fmtio.h>
106 #include <sys/fs/autofs.h>
107
/*
 * Bundles the post-op directory attributes returned by an OTW call with
 * the credential and the time the call was made, for updating the
 * directory's attribute/name caches (see nfs4_update_dircaches()).
 */
typedef struct {
	nfs4_ga_res_t *di_garp;		/* getattr results for the directory */
	cred_t *di_cred;		/* credential used for the call */
	hrtime_t di_time_call;		/* time the OTW call was started */
} dirattr_info_t;

/* Distinguishes ACL retrieval from ACL modification in the helpers below. */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;
118
119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120
121 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122 char *, dirattr_info_t *);
123
124 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126 nfs4_error_t *, int *);
127 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128 cred_t *);
129 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130 stable_how4 *);
131 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132 cred_t *, bool_t, struct uio *);
133 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134 vsecattr_t *);
135 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141 int, vnode_t **, cred_t *);
142 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143 cred_t *, int, int, enum createmode4, int);
144 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145 caller_context_t *);
146 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147 vnode_t *, char *, cred_t *, nfsstat4 *);
148 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149 vnode_t *, char *, cred_t *, nfsstat4 *);
150 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154 page_t *[], size_t, struct seg *, caddr_t,
155 enum seg_rw, cred_t *);
156 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157 cred_t *);
158 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159 int, cred_t *);
160 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161 int, cred_t *);
162 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163 static void nfs4_set_mod(vnode_t *);
164 static void nfs4_get_commit(vnode_t *);
165 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169 cred_t *);
170 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171 cred_t *);
172 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173 hrtime_t, vnode_t *, cred_t *);
174 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177 u_offset_t);
178 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
180 static cred_t *state_to_cred(nfs4_open_stream_t *);
181 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182 static pid_t lo_to_pid(lock_owner4 *);
183 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184 cred_t *, nfs4_lock_owner_t *);
185 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186 nfs4_lock_owner_t *);
187 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188 static void nfs4_delmap_callback(struct as *, void *, uint_t);
189 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
190 static nfs4_delmapcall_t *nfs4_init_delmapcall();
191 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194 uid_t, gid_t, int);
195
196 /*
197 * Routines that implement the setting of v4 args for the misc. ops
198 */
199 static void nfs4args_lock_free(nfs_argop4 *);
200 static void nfs4args_lockt_free(nfs_argop4 *);
201 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202 int, rnode4_t *, cred_t *, bitmap4, int *,
203 nfs4_stateid_types_t *);
204 static void nfs4args_setattr_free(nfs_argop4 *);
205 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206 bitmap4);
207 static void nfs4args_verify_free(nfs_argop4 *);
208 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209 WRITE4args **, nfs4_stateid_types_t *);
210
211 /*
212 * These are the vnode ops functions that implement the vnode interface to
213 * the networked file system. See more comments below at nfs4_vnodeops.
214 */
215 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217 caller_context_t *);
218 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219 caller_context_t *);
220 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221 caller_context_t *);
222 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223 caller_context_t *);
224 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225 caller_context_t *);
226 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228 caller_context_t *);
229 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231 int, vnode_t **, cred_t *, int, caller_context_t *,
232 vsecattr_t *);
233 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234 int);
235 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236 caller_context_t *, int);
237 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238 caller_context_t *, int);
239 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240 cred_t *, caller_context_t *, int, vsecattr_t *);
241 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242 caller_context_t *, int);
243 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244 cred_t *, caller_context_t *, int);
245 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246 caller_context_t *, int);
247 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249 page_t *[], size_t, struct seg *, caddr_t,
250 enum seg_rw, cred_t *, caller_context_t *);
251 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252 caller_context_t *);
253 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259 struct flk_callback *, cred_t *, caller_context_t *);
260 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261 cred_t *, caller_context_t *);
262 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265 cred_t *, caller_context_t *);
266 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267 caller_context_t *);
268 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269 caller_context_t *);
270 /*
271 * These vnode ops are required to be called from outside this source file,
272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273 * as static.
274 */
275 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276 caller_context_t *);
277 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278 int nfs4_lookup(vnode_t *, char *, vnode_t **,
279 struct pathname *, int, vnode_t *, cred_t *,
280 caller_context_t *, int *, pathname_t *);
281 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
283 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286 caller_context_t *);
287 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288 caller_context_t *);
289 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290 caller_context_t *);
291
/*
 * Used for nfs4_commit_vp() to indicate if we should
 * wait on pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0	/* don't wait for pending writes */
#define	NFS4_WRITE_WAIT		1	/* block until pending writes drain */

#define	NFS4_BASE_WAIT_TIME	1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.  Negative so they cannot collide
 * with real errno values.
 */
#define	NFS_EOF			-98	/* hit end of file during readdir/read */
#define	NFS_VERF_MISMATCH	-97	/* server write verifier changed */

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 */
#define	NFS4_CLOSE_OP		0x1	/* VOP_CLOSE */
#define	NFS4_DELMAP_OP		0x2	/* VOP_DELMAP */
#define	NFS4_INACTIVE_OP	0x3	/* VOP_INACTIVE */
315
/*
 * True if vnode type t is a device or fifo node (no data goes OTW).
 * The argument is parenthesized so the macro is safe for arbitrary
 * expressions, not just simple identifiers.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
317
/*
 * ALIGN64 aligns the given buffer and adjusts the buffer size to a 64-bit
 * boundary: x receives the number of pad bytes skipped (0 if ptr was
 * already aligned), ptr is advanced by x, and sz is reduced by x.
 * Wrapped in do { } while (0) so the multi-statement expansion is safe
 * inside an unbraced if/else (CERT PRE10-C).
 */
#define	ALIGN64(x, ptr, sz)						\
	do {								\
		x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
		if (x) {						\
			x = sizeof (uint64_t) - (x);			\
			sz -= (x);					\
			ptr += (x);					\
		}							\
	} while (0)
326
327 #ifdef DEBUG
328 int nfs4_client_attr_debug = 0;
329 int nfs4_client_state_debug = 0;
330 int nfs4_client_shadow_debug = 0;
331 int nfs4_client_lock_debug = 0;
332 int nfs4_seqid_sync = 0;
333 int nfs4_client_map_debug = 0;
334 static int nfs4_pageio_debug = 0;
335 int nfs4_client_inactive_debug = 0;
336 int nfs4_client_recov_debug = 0;
337 int nfs4_client_failover_debug = 0;
338 int nfs4_client_call_debug = 0;
339 int nfs4_client_lookup_debug = 0;
340 int nfs4_client_zone_debug = 0;
341 int nfs4_lost_rqst_debug = 0;
342 int nfs4_rdattrerr_debug = 0;
343 int nfs4_open_stream_debug = 0;
344
345 int nfs4read_error_inject;
346
347 static int nfs4_create_misses = 0;
348
349 static int nfs4_readdir_cache_shorts = 0;
350 static int nfs4_readdir_readahead = 0;
351
352 static int nfs4_bio_do_stop = 0;
353
354 static int nfs4_lostpage = 0; /* number of times we lost original page */
355
356 int nfs4_mmap_debug = 0;
357
358 static int nfs4_pathconf_cache_hits = 0;
359 static int nfs4_pathconf_cache_misses = 0;
360
361 int nfs4close_all_cnt;
362 int nfs4close_one_debug = 0;
363 int nfs4close_notw_debug = 0;
364
365 int denied_to_flk_debug = 0;
366 void *lockt_denied_debug;
367
368 #endif
369
370 /*
371 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
372 * or NFS4ERR_RESOURCE.
373 */
374 static int confirm_retry_sec = 30;
375
376 static int nfs4_lookup_neg_cache = 1;
377
378 /*
379 * number of pages to read ahead
380 * optimized for 100 base-T.
381 */
382 static int nfs4_nra = 4;
383
384 static int nfs4_do_symlink_cache = 1;
385
386 static int nfs4_pathconf_disable_cache = 0;
387
388 /*
389 * These are the vnode ops routines which implement the vnode interface to
390 * the networked file system. These routines just take their parameters,
391 * make them look networkish by putting the right info into interface structs,
392 * and then calling the appropriate remote routine(s) to do the work.
393 *
394 * Note on directory name lookup cacheing: If we detect a stale fhandle,
395 * we purge the directory cache relative to that vnode. This way, the
396 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
397 * more details on rnode locking.
398 */
399
/* Filled in by vn_make_ops() from the template below at module init. */
struct vnodeops *nfs4_vnodeops;

/*
 * Dispatch table mapping each vnode operation name to its NFSv4 client
 * implementation.  Operations without a v4-specific handler fall through
 * to generic fs routines (nfs_dump, fs_vnevent_support).
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
446
447 /*
448 * The following are subroutines and definitions to set args or get res
449 * for the different nfsv4 ops
450 */
451
452 void
453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454 {
455 int i;
456
457 for (i = 0; i < arglen; i++) {
458 if (argop[i].argop == OP_LOOKUP) {
459 kmem_free(
460 argop[i].nfs_argop4_u.oplookup.
461 objname.utf8string_val,
462 argop[i].nfs_argop4_u.oplookup.
463 objname.utf8string_len);
464 }
465 }
466 }
467
468 static void
469 nfs4args_lock_free(nfs_argop4 *argop)
470 {
471 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472
473 if (locker->new_lock_owner == TRUE) {
474 open_to_lock_owner4 *open_owner;
475
476 open_owner = &locker->locker4_u.open_owner;
477 if (open_owner->lock_owner.owner_val != NULL) {
478 kmem_free(open_owner->lock_owner.owner_val,
479 open_owner->lock_owner.owner_len);
480 }
481 }
482 }
483
484 static void
485 nfs4args_lockt_free(nfs_argop4 *argop)
486 {
487 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488
489 if (lowner->owner_val != NULL) {
490 kmem_free(lowner->owner_val, lowner->owner_len);
491 }
492 }
493
494 static void
495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497 nfs4_stateid_types_t *sid_types)
498 {
499 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500 mntinfo4_t *mi;
501
502 argop->argop = OP_SETATTR;
503 /*
504 * The stateid is set to 0 if client is not modifying the size
505 * and otherwise to whatever nfs4_get_stateid() returns.
506 *
507 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508 * state struct could be found for the process/file pair. We may
509 * want to change this in the future (by OPENing the file). See
510 * bug # 4474852.
511 */
512 if (vap->va_mask & AT_SIZE) {
513
514 ASSERT(rp != NULL);
515 mi = VTOMI4(RTOV4(rp));
516
517 argop->nfs_argop4_u.opsetattr.stateid =
518 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519 OP_SETATTR, sid_types, FALSE);
520 } else {
521 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522 sizeof (stateid4));
523 }
524
525 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526 if (*error)
527 bzero(attr, sizeof (*attr));
528 }
529
/*
 * Free the fattr4 attribute data built by nfs4args_setattr().
 */
static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}
535
536 static int
537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538 bitmap4 supp)
539 {
540 fattr4 *attr;
541 int error = 0;
542
543 argop->argop = op;
544 switch (op) {
545 case OP_VERIFY:
546 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547 break;
548 case OP_NVERIFY:
549 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550 break;
551 default:
552 return (EINVAL);
553 }
554 if (!error)
555 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556 if (error)
557 bzero(attr, sizeof (*attr));
558 return (error);
559 }
560
561 static void
562 nfs4args_verify_free(nfs_argop4 *argop)
563 {
564 switch (argop->argop) {
565 case OP_VERIFY:
566 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567 break;
568 case OP_NVERIFY:
569 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570 break;
571 default:
572 break;
573 }
574 }
575
576 static void
577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579 {
580 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582
583 argop->argop = OP_WRITE;
584 wargs->stable = stable;
585 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586 mi, OP_WRITE, sid_tp);
587 wargs->mblk = NULL;
588 *wargs_pp = wargs;
589 }
590
591 void
592 nfs4args_copen_free(OPEN4cargs *open_args)
593 {
594 if (open_args->owner.owner_val) {
595 kmem_free(open_args->owner.owner_val,
596 open_args->owner.owner_len);
597 }
598 if ((open_args->opentype == OPEN4_CREATE) &&
599 (open_args->mode != EXCLUSIVE4)) {
600 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601 }
602 }
603
/*
 * Return the NFSv4 client vnodeops vector.
 *
 * XXX: This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}
612
/*
 * The OPEN operation opens a regular file.
 *
 * Non-regular files (directories, devices, fifos) are handled locally
 * via nfs4_open_non_reg_file() with no over-the-wire (OTW) call.  For
 * regular files we look up the parent directory and file name, then
 * drive the OTW OPEN through nfs4open_otw() while holding the parent
 * directory's r_rwlock as reader to serialize against directory
 * modification.  Returns 0 or an errno; EIO for a cross-zone access,
 * EINTR if the directory lock wait was interrupted.
 */
/*ARGSUSED3*/
static int
nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	vnode_t *dvp = NULL;
	rnode4_t *rp, *drp;
	int error;
	int just_been_created;
	char fn[MAXNAMELEN];

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
	/* NFS client calls must come from the zone that mounted the fs. */
	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
		return (EIO);
	rp = VTOR4(*vpp);

	/*
	 * Check to see if opening something besides a regular file;
	 * if so skip the OTW call
	 */
	if ((*vpp)->v_type != VREG) {
		error = nfs4_open_non_reg_file(vpp, flag, cr);
		return (error);
	}

	/*
	 * XXX - would like a check right here to know if the file is
	 * executable or not, so as to skip OTW
	 */

	/* vtodv() returns the parent directory with a hold (VN_RELE below). */
	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
		return (error);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * See if this file has just been CREATEd.
	 * If so, clear the flag and update the dnlc, which was previously
	 * skipped in nfs4_create.
	 * XXX need better serialization on this.
	 * XXX move this into the nf4open_otw call, after we have
	 * XXX acquired the open owner seqid sync.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, fn, *vpp);
		/* This is needed so we don't bump the open ref count */
		just_been_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_been_created = 0;
	}

	/*
	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
	 * FWRITE (to drive successful setattr(size=0) after open)
	 */
	if (flag & FTRUNC)
		flag |= FWRITE;

	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
	    just_been_created);

	/* Don't cache the root of the mount under its mountpoint name. */
	if (!error && !((*vpp)->v_flag & VROOT))
		dnlc_update(dvp, fn, *vpp);

	nfs_rw_exit(&drp->r_rwlock);

	/* release the hold from vtodv */
	VN_RELE(dvp);

	/* exchange the shadow for the master vnode, if needed */

	if (error == 0 && IS_SHADOW(*vpp, rp))
		sv_exchange(vpp);

	return (error);
}
703
704 /*
705 * See if there's a "lost open" request to be saved and recovered.
706 */
707 static void
708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710 vnode_t *dvp, OPEN4cargs *open_args)
711 {
712 vfs_t *vfsp;
713 char *srccfp;
714
715 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716
717 if (error != ETIMEDOUT && error != EINTR &&
718 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719 lost_rqstp->lr_op = 0;
720 return;
721 }
722
723 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724 "nfs4open_save_lost_rqst: error %d", error));
725
726 lost_rqstp->lr_op = OP_OPEN;
727
728 /*
729 * The vp (if it is not NULL) and dvp are held and rele'd via
730 * the recovery code. See nfs4_save_lost_rqst.
731 */
732 lost_rqstp->lr_vp = vp;
733 lost_rqstp->lr_dvp = dvp;
734 lost_rqstp->lr_oop = oop;
735 lost_rqstp->lr_osp = NULL;
736 lost_rqstp->lr_lop = NULL;
737 lost_rqstp->lr_cr = cr;
738 lost_rqstp->lr_flk = NULL;
739 lost_rqstp->lr_oacc = open_args->share_access;
740 lost_rqstp->lr_odeny = open_args->share_deny;
741 lost_rqstp->lr_oclaim = open_args->claim;
742 if (open_args->claim == CLAIM_DELEGATE_CUR) {
743 lost_rqstp->lr_ostateid =
744 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746 } else {
747 srccfp = open_args->open_claim4_u.cfile;
748 }
749 lost_rqstp->lr_ofile.utf8string_len = 0;
750 lost_rqstp->lr_ofile.utf8string_val = NULL;
751 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752 lost_rqstp->lr_putfirst = FALSE;
753 }
754
/*
 * Overlay for the 64-bit create verifier used by EXCLUSIVE4 create.
 * nfs4open_otw() copies this struct into createhow4_u.createverf; the
 * seconds half is masked to INT32_MAX there because the server may
 * store the verifier in the new file's mtime.
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
759
760 /*
761 * The OPEN operation creates and/or opens a regular file
762 *
763 * ARGSUSED
764 */
765 static int
766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768 enum createmode4 createmode, int file_just_been_created)
769 {
770 rnode4_t *rp;
771 rnode4_t *drp = VTOR4(dvp);
772 vnode_t *vp = NULL;
773 vnode_t *vpi = *vpp;
774 bool_t needrecov = FALSE;
775
776 int doqueue = 1;
777
778 COMPOUND4args_clnt args;
779 COMPOUND4res_clnt res;
780 nfs_argop4 *argop;
781 nfs_resop4 *resop;
782 int argoplist_size;
783 int idx_open, idx_fattr;
784
785 GETFH4res *gf_res = NULL;
786 OPEN4res *op_res = NULL;
787 nfs4_ga_res_t *garp;
788 fattr4 *attr = NULL;
789 struct nfs4_excl_time verf;
790 bool_t did_excl_setup = FALSE;
791 int created_osp;
792
793 OPEN4cargs *open_args;
794 nfs4_open_owner_t *oop = NULL;
795 nfs4_open_stream_t *osp = NULL;
796 seqid4 seqid = 0;
797 bool_t retry_open = FALSE;
798 nfs4_recov_state_t recov_state;
799 nfs4_lost_rqst_t lost_rqst;
800 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801 hrtime_t t;
802 int acc = 0;
803 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
804 cred_t *ncr = NULL;
805
806 nfs4_sharedfh_t *otw_sfh;
807 nfs4_sharedfh_t *orig_sfh;
808 int fh_differs = 0;
809 int numops, setgid_flag;
810 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811
812 /*
813 * Make sure we properly deal with setting the right gid on
814 * a newly created file to reflect the parent's setgid bit
815 */
816 setgid_flag = 0;
817 if (create_flag && in_va) {
818
819 /*
820 * If there is grpid mount flag used or
821 * the parent's directory has the setgid bit set
822 * _and_ the client was able to get a valid mapping
823 * for the parent dir's owner_group, we want to
824 * append NVERIFY(owner_group == dva.va_gid) and
825 * SETATTR to the CREATE compound.
826 */
827 mutex_enter(&drp->r_statelock);
828 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829 drp->r_attr.va_mode & VSGID) &&
830 drp->r_attr.va_gid != GID_NOBODY) {
831 in_va->va_mask |= AT_GID;
832 in_va->va_gid = drp->r_attr.va_gid;
833 setgid_flag = 1;
834 }
835 mutex_exit(&drp->r_statelock);
836 }
837
838 /*
839 * Normal/non-create compound:
840 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841 *
842 * Open(create) compound no setgid:
843 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844 * RESTOREFH + GETATTR
845 *
846 * Open(create) setgid:
847 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849 * NVERIFY(grp) + SETATTR
850 */
851 if (setgid_flag) {
852 numops = 10;
853 idx_open = 1;
854 idx_fattr = 3;
855 } else if (create_flag) {
856 numops = 7;
857 idx_open = 2;
858 idx_fattr = 4;
859 } else {
860 numops = 4;
861 idx_open = 1;
862 idx_fattr = 3;
863 }
864
865 args.array_len = numops;
866 argoplist_size = numops * sizeof (nfs_argop4);
867 argop = kmem_alloc(argoplist_size, KM_SLEEP);
868
869 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870 "open %s open flag 0x%x cred %p", file_name, open_flag,
871 (void *)cr));
872
873 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874 if (create_flag) {
875 /*
876 * We are to create a file. Initialize the passed in vnode
877 * pointer.
878 */
879 vpi = NULL;
880 } else {
881 /*
882 * Check to see if the client owns a read delegation and is
883 * trying to open for write. If so, then return the delegation
884 * to avoid the server doing a cb_recall and returning DELAY.
885 * NB - we don't use the statev4_lock here because we'd have
886 * to drop the lock anyway and the result would be stale.
887 */
888 if ((open_flag & FWRITE) &&
889 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891
892 /*
893 * If the file has a delegation, then do an access check up
894 * front. This avoids having to an access check later after
895 * we've already done start_op, which could deadlock.
896 */
897 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898 if (open_flag & FREAD &&
899 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900 acc |= VREAD;
901 if (open_flag & FWRITE &&
902 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903 acc |= VWRITE;
904 }
905 }
906
907 drp = VTOR4(dvp);
908
909 recov_state.rs_flags = 0;
910 recov_state.rs_num_retry_despite_err = 0;
911 cred_otw = cr;
912
913 recov_retry:
914 fh_differs = 0;
915 nfs4_error_zinit(&e);
916
917 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918 if (e.error) {
919 if (ncr != NULL)
920 crfree(ncr);
921 kmem_free(argop, argoplist_size);
922 return (e.error);
923 }
924
925 args.ctag = TAG_OPEN;
926 args.array_len = numops;
927 args.array = argop;
928
929 /* putfh directory fh */
930 argop[0].argop = OP_CPUTFH;
931 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932
933 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934 argop[idx_open].argop = OP_COPEN;
935 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936 open_args->claim = CLAIM_NULL;
937
938 /* name of file */
939 open_args->open_claim4_u.cfile = file_name;
940 open_args->owner.owner_len = 0;
941 open_args->owner.owner_val = NULL;
942
943 if (create_flag) {
944 /* CREATE a file */
945 open_args->opentype = OPEN4_CREATE;
946 open_args->mode = createmode;
947 if (createmode == EXCLUSIVE4) {
948 if (did_excl_setup == FALSE) {
949 verf.seconds = zone_get_hostid(NULL);
950 if (verf.seconds != 0)
951 verf.nseconds = newnum();
952 else {
953 timestruc_t now;
954
955 gethrestime(&now);
956 verf.seconds = now.tv_sec;
957 verf.nseconds = now.tv_nsec;
958 }
959 /*
960 * Since the server will use this value for the
961 * mtime, make sure that it can't overflow. Zero
962 * out the MSB. The actual value does not matter
963 * here, only its uniqeness.
964 */
965 verf.seconds &= INT32_MAX;
966 did_excl_setup = TRUE;
967 }
968
969 /* Now copy over verifier to OPEN4args. */
970 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971 } else {
972 int v_error;
973 bitmap4 supp_attrs;
974 servinfo4_t *svp;
975
976 attr = &open_args->createhow4_u.createattrs;
977
978 svp = drp->r_server;
979 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980 supp_attrs = svp->sv_supp_attrs;
981 nfs_rw_exit(&svp->sv_lock);
982
983 /* GUARDED4 or UNCHECKED4 */
984 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985 supp_attrs);
986 if (v_error) {
987 bzero(attr, sizeof (*attr));
988 nfs4args_copen_free(open_args);
989 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990 &recov_state, FALSE);
991 if (ncr != NULL)
992 crfree(ncr);
993 kmem_free(argop, argoplist_size);
994 return (v_error);
995 }
996 }
997 } else {
998 /* NO CREATE */
999 open_args->opentype = OPEN4_NOCREATE;
1000 }
1001
1002 if (recov_state.rs_sp != NULL) {
1003 mutex_enter(&recov_state.rs_sp->s_lock);
1004 open_args->owner.clientid = recov_state.rs_sp->clientid;
1005 mutex_exit(&recov_state.rs_sp->s_lock);
1006 } else {
1007 /* XXX should we just fail here? */
1008 open_args->owner.clientid = 0;
1009 }
1010
1011 /*
1012 * This increments oop's ref count or creates a temporary 'just_created'
1013 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014 * completes.
1015 */
1016 mutex_enter(&VTOMI4(dvp)->mi_lock);
1017
1018 /* See if a permanent or just created open owner exists */
1019 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020 if (!oop) {
1021 /*
1022 * This open owner does not exist so create a temporary
1023 * just created one.
1024 */
1025 oop = create_open_owner(cr, VTOMI4(dvp));
1026 ASSERT(oop != NULL);
1027 }
1028 mutex_exit(&VTOMI4(dvp)->mi_lock);
1029
1030 /* this length never changes, do alloc before seqid sync */
1031 open_args->owner.owner_len = sizeof (oop->oo_name);
1032 open_args->owner.owner_val =
1033 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034
1035 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036 if (e.error == EAGAIN) {
1037 open_owner_rele(oop);
1038 nfs4args_copen_free(open_args);
1039 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040 if (ncr != NULL) {
1041 crfree(ncr);
1042 ncr = NULL;
1043 }
1044 goto recov_retry;
1045 }
1046
1047 /* Check to see if we need to do the OTW call */
1048 if (!create_flag) {
1049 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050 file_just_been_created, &e.error, acc, &recov_state)) {
1051
1052 /*
1053 * The OTW open is not necessary. Either
1054 * the open can succeed without it (eg.
1055 * delegation, error == 0) or the open
1056 * must fail due to an access failure
1057 * (error != 0). In either case, tidy
1058 * up and return.
1059 */
1060
1061 nfs4_end_open_seqid_sync(oop);
1062 open_owner_rele(oop);
1063 nfs4args_copen_free(open_args);
1064 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065 if (ncr != NULL)
1066 crfree(ncr);
1067 kmem_free(argop, argoplist_size);
1068 return (e.error);
1069 }
1070 }
1071
1072 bcopy(&oop->oo_name, open_args->owner.owner_val,
1073 open_args->owner.owner_len);
1074
1075 seqid = nfs4_get_open_seqid(oop) + 1;
1076 open_args->seqid = seqid;
1077 open_args->share_access = 0;
1078 if (open_flag & FREAD)
1079 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080 if (open_flag & FWRITE)
1081 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083
1084
1085
1086 /*
1087 * getfh w/sanity check for idx_open/idx_fattr
1088 */
1089 ASSERT((idx_open + 1) == (idx_fattr - 1));
1090 argop[idx_open + 1].argop = OP_GETFH;
1091
1092 /* getattr */
1093 argop[idx_fattr].argop = OP_GETATTR;
1094 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096
1097 if (setgid_flag) {
1098 vattr_t _v;
1099 servinfo4_t *svp;
1100 bitmap4 supp_attrs;
1101
1102 svp = drp->r_server;
1103 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104 supp_attrs = svp->sv_supp_attrs;
1105 nfs_rw_exit(&svp->sv_lock);
1106
1107 /*
1108 * For setgid case, we need to:
1109 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110 */
1111 argop[4].argop = OP_SAVEFH;
1112
1113 argop[5].argop = OP_CPUTFH;
1114 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115
1116 argop[6].argop = OP_GETATTR;
1117 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119
1120 argop[7].argop = OP_RESTOREFH;
1121
1122 /*
1123 * nverify
1124 */
1125 _v.va_mask = AT_GID;
1126 _v.va_gid = in_va->va_gid;
1127 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128 supp_attrs))) {
1129
1130 /*
1131 * setattr
1132 *
1133 * We _know_ we're not messing with AT_SIZE or
1134 * AT_XTIME, so no need for stateid or flags.
1135 * Also we specify NULL rp since we're only
1136 * interested in setting owner_group attributes.
1137 */
1138 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139 supp_attrs, &e.error, 0);
1140 if (e.error)
1141 nfs4args_verify_free(&argop[8]);
1142 }
1143
1144 if (e.error) {
1145 /*
1146 * XXX - Revisit the last argument to nfs4_end_op()
1147 * once 5020486 is fixed.
1148 */
1149 nfs4_end_open_seqid_sync(oop);
1150 open_owner_rele(oop);
1151 nfs4args_copen_free(open_args);
1152 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153 if (ncr != NULL)
1154 crfree(ncr);
1155 kmem_free(argop, argoplist_size);
1156 return (e.error);
1157 }
1158 } else if (create_flag) {
1159 argop[1].argop = OP_SAVEFH;
1160
1161 argop[5].argop = OP_RESTOREFH;
1162
1163 argop[6].argop = OP_GETATTR;
1164 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166 }
1167
1168 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169 "nfs4open_otw: %s call, nm %s, rp %s",
1170 needrecov ? "recov" : "first", file_name,
1171 rnode4info(VTOR4(dvp))));
1172
1173 t = gethrtime();
1174
1175 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176
1177 if (!e.error && nfs4_need_to_bump_seqid(&res))
1178 nfs4_set_open_seqid(seqid, oop, args.ctag);
1179
1180 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181
1182 if (e.error || needrecov) {
1183 bool_t abort = FALSE;
1184
1185 if (needrecov) {
1186 nfs4_bseqid_entry_t *bsep = NULL;
1187
1188 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189 cred_otw, vpi, dvp, open_args);
1190
1191 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192 bsep = nfs4_create_bseqid_entry(oop, NULL,
1193 vpi, 0, args.ctag, open_args->seqid);
1194 num_bseqid_retry--;
1195 }
1196
1197 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198 NULL, lost_rqst.lr_op == OP_OPEN ?
1199 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200
1201 if (bsep)
1202 kmem_free(bsep, sizeof (*bsep));
1203 /* give up if we keep getting BAD_SEQID */
1204 if (num_bseqid_retry == 0)
1205 abort = TRUE;
1206 if (abort == TRUE && e.error == 0)
1207 e.error = geterrno4(res.status);
1208 }
1209 nfs4_end_open_seqid_sync(oop);
1210 open_owner_rele(oop);
1211 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212 nfs4args_copen_free(open_args);
1213 if (setgid_flag) {
1214 nfs4args_verify_free(&argop[8]);
1215 nfs4args_setattr_free(&argop[9]);
1216 }
1217 if (!e.error)
1218 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219 if (ncr != NULL) {
1220 crfree(ncr);
1221 ncr = NULL;
1222 }
1223 if (!needrecov || abort == TRUE || e.error == EINTR ||
1224 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225 kmem_free(argop, argoplist_size);
1226 return (e.error);
1227 }
1228 goto recov_retry;
1229 }
1230
1231 /*
1232 * Will check and update lease after checking the rflag for
1233 * OPEN_CONFIRM in the successful OPEN call.
1234 */
1235 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236
1237 /*
1238 * XXX what if we're crossing mount points from server1:/drp
1239 * to server2:/drp/rp.
1240 */
1241
1242 /* Signal our end of use of the open seqid */
1243 nfs4_end_open_seqid_sync(oop);
1244
1245 /*
1246 * This will destroy the open owner if it was just created,
1247 * and no one else has put a reference on it.
1248 */
1249 open_owner_rele(oop);
1250 if (create_flag && (createmode != EXCLUSIVE4) &&
1251 res.status == NFS4ERR_BADOWNER)
1252 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253
1254 e.error = geterrno4(res.status);
1255 nfs4args_copen_free(open_args);
1256 if (setgid_flag) {
1257 nfs4args_verify_free(&argop[8]);
1258 nfs4args_setattr_free(&argop[9]);
1259 }
1260 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262 /*
1263 * If the reply is NFS4ERR_ACCESS, it may be because
1264 * we are root (no root net access). If the real uid
1265 * is not root, then retry with the real uid instead.
1266 */
1267 if (ncr != NULL) {
1268 crfree(ncr);
1269 ncr = NULL;
1270 }
1271 if (res.status == NFS4ERR_ACCESS &&
1272 (ncr = crnetadjust(cred_otw)) != NULL) {
1273 cred_otw = ncr;
1274 goto recov_retry;
1275 }
1276 kmem_free(argop, argoplist_size);
1277 return (e.error);
1278 }
1279
1280 resop = &res.array[idx_open]; /* open res */
1281 op_res = &resop->nfs_resop4_u.opopen;
1282
1283 #ifdef DEBUG
1284 /*
1285 * verify attrset bitmap
1286 */
1287 if (create_flag &&
1288 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289 /* make sure attrset returned is what we asked for */
1290 /* XXX Ignore this 'error' for now */
1291 if (attr->attrmask != op_res->attrset)
1292 /* EMPTY */;
1293 }
1294 #endif
1295
1296 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297 mutex_enter(&VTOMI4(dvp)->mi_lock);
1298 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299 mutex_exit(&VTOMI4(dvp)->mi_lock);
1300 }
1301
1302 resop = &res.array[idx_open + 1]; /* getfh res */
1303 gf_res = &resop->nfs_resop4_u.opgetfh;
1304
1305 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306
1307 /*
1308 * The open stateid has been updated on the server but not
1309 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1310 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311 * WRITE call. That, however, will use the old stateid, so go ahead
1312 	 * and update the open stateid now, before any call to makenfs4node.
1313 */
1314 if (vpi) {
1315 nfs4_open_stream_t *tmp_osp;
1316 rnode4_t *tmp_rp = VTOR4(vpi);
1317
1318 tmp_osp = find_open_stream(oop, tmp_rp);
1319 if (tmp_osp) {
1320 tmp_osp->open_stateid = op_res->stateid;
1321 mutex_exit(&tmp_osp->os_sync_lock);
1322 open_stream_rele(tmp_osp, tmp_rp);
1323 }
1324
1325 /*
1326 * We must determine if the file handle given by the otw open
1327 * is the same as the file handle which was passed in with
1328 * *vpp. This case can be reached if the file we are trying
1329 * to open has been removed and another file has been created
1330 * having the same file name. The passed in vnode is released
1331 * later.
1332 */
1333 orig_sfh = VTOR4(vpi)->r_fh;
1334 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335 }
1336
1337 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338
1339 if (create_flag || fh_differs) {
1340 int rnode_err = 0;
1341
1342 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344
1345 if (e.error)
1346 PURGE_ATTRCACHE4(vp);
1347 /*
1348 * For the newly created vp case, make sure the rnode
1349 * isn't bad before using it.
1350 */
1351 mutex_enter(&(VTOR4(vp))->r_statelock);
1352 if (VTOR4(vp)->r_flags & R4RECOVERR)
1353 rnode_err = EIO;
1354 mutex_exit(&(VTOR4(vp))->r_statelock);
1355
1356 if (rnode_err) {
1357 nfs4_end_open_seqid_sync(oop);
1358 nfs4args_copen_free(open_args);
1359 if (setgid_flag) {
1360 nfs4args_verify_free(&argop[8]);
1361 nfs4args_setattr_free(&argop[9]);
1362 }
1363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365 needrecov);
1366 open_owner_rele(oop);
1367 VN_RELE(vp);
1368 if (ncr != NULL)
1369 crfree(ncr);
1370 sfh4_rele(&otw_sfh);
1371 kmem_free(argop, argoplist_size);
1372 return (EIO);
1373 }
1374 } else {
1375 vp = vpi;
1376 }
1377 sfh4_rele(&otw_sfh);
1378
1379 /*
1380 * It seems odd to get a full set of attrs and then not update
1381 * the object's attrcache in the non-create case. Create case uses
1382 * the attrs since makenfs4node checks to see if the attrs need to
1383 * be updated (and then updates them). The non-create case should
1384 * update attrs also.
1385 */
1386 if (! create_flag && ! fh_differs && !e.error) {
1387 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388 }
1389
1390 nfs4_error_zinit(&e);
1391 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392 /* This does not do recovery for vp explicitly. */
1393 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395
1396 if (e.error || e.stat) {
1397 nfs4_end_open_seqid_sync(oop);
1398 nfs4args_copen_free(open_args);
1399 if (setgid_flag) {
1400 nfs4args_verify_free(&argop[8]);
1401 nfs4args_setattr_free(&argop[9]);
1402 }
1403 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405 needrecov);
1406 open_owner_rele(oop);
1407 if (create_flag || fh_differs) {
1408 /* rele the makenfs4node */
1409 VN_RELE(vp);
1410 }
1411 if (ncr != NULL) {
1412 crfree(ncr);
1413 ncr = NULL;
1414 }
1415 if (retry_open == TRUE) {
1416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417 "nfs4open_otw: retry the open since OPEN "
1418 "CONFIRM failed with error %d stat %d",
1419 e.error, e.stat));
1420 if (create_flag && createmode == GUARDED4) {
1421 NFS4_DEBUG(nfs4_client_recov_debug,
1422 (CE_NOTE, "nfs4open_otw: switch "
1423 "createmode from GUARDED4 to "
1424 "UNCHECKED4"));
1425 createmode = UNCHECKED4;
1426 }
1427 goto recov_retry;
1428 }
1429 if (!e.error) {
1430 if (create_flag && (createmode != EXCLUSIVE4) &&
1431 e.stat == NFS4ERR_BADOWNER)
1432 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433
1434 e.error = geterrno4(e.stat);
1435 }
1436 kmem_free(argop, argoplist_size);
1437 return (e.error);
1438 }
1439 }
1440
1441 rp = VTOR4(vp);
1442
1443 mutex_enter(&rp->r_statev4_lock);
1444 if (create_flag)
1445 rp->created_v4 = 1;
1446 mutex_exit(&rp->r_statev4_lock);
1447
1448 mutex_enter(&oop->oo_lock);
1449 /* Doesn't matter if 'oo_just_created' already was set as this */
1450 oop->oo_just_created = NFS4_PERM_CREATED;
1451 if (oop->oo_cred_otw)
1452 crfree(oop->oo_cred_otw);
1453 oop->oo_cred_otw = cred_otw;
1454 crhold(oop->oo_cred_otw);
1455 mutex_exit(&oop->oo_lock);
1456
1457 /* returns with 'os_sync_lock' held */
1458 osp = find_or_create_open_stream(oop, rp, &created_osp);
1459 if (!osp) {
1460 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461 "nfs4open_otw: failed to create an open stream"));
1462 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463 "signal our end of use of the open seqid"));
1464
1465 nfs4_end_open_seqid_sync(oop);
1466 open_owner_rele(oop);
1467 nfs4args_copen_free(open_args);
1468 if (setgid_flag) {
1469 nfs4args_verify_free(&argop[8]);
1470 nfs4args_setattr_free(&argop[9]);
1471 }
1472 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474 if (create_flag || fh_differs)
1475 VN_RELE(vp);
1476 if (ncr != NULL)
1477 crfree(ncr);
1478
1479 kmem_free(argop, argoplist_size);
1480 return (EINVAL);
1481
1482 }
1483
1484 osp->open_stateid = op_res->stateid;
1485
1486 if (open_flag & FREAD)
1487 osp->os_share_acc_read++;
1488 if (open_flag & FWRITE)
1489 osp->os_share_acc_write++;
1490 osp->os_share_deny_none++;
1491
1492 /*
1493 * Need to reset this bitfield for the possible case where we were
1494 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495 * we could retry the CLOSE, OPENed the file again.
1496 */
1497 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498 osp->os_final_close = 0;
1499 osp->os_force_close = 0;
1500 #ifdef DEBUG
1501 if (osp->os_failed_reopen)
1502 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504 (void *)osp, (void *)cr, rnode4info(rp)));
1505 #endif
1506 osp->os_failed_reopen = 0;
1507
1508 mutex_exit(&osp->os_sync_lock);
1509
1510 nfs4_end_open_seqid_sync(oop);
1511
1512 if (created_osp && recov_state.rs_sp != NULL) {
1513 mutex_enter(&recov_state.rs_sp->s_lock);
1514 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515 mutex_exit(&recov_state.rs_sp->s_lock);
1516 }
1517
1518 /* get rid of our reference to find oop */
1519 open_owner_rele(oop);
1520
1521 open_stream_rele(osp, rp);
1522
1523 /* accept delegation, if any */
1524 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525
1526 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527
1528 if (createmode == EXCLUSIVE4 &&
1529 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531 " EXCLUSIVE4: sending a SETATTR"));
1532 /*
1533 * If doing an exclusive create, then generate
1534 * a SETATTR to set the initial attributes.
1535 * Try to set the mtime and the atime to the
1536 * server's current time. It is somewhat
1537 * expected that these fields will be used to
1538 * store the exclusive create cookie. If not,
1539 * server implementors will need to know that
1540 * a SETATTR will follow an exclusive create
1541 * and the cookie should be destroyed if
1542 * appropriate.
1543 *
1544 * The AT_GID and AT_SIZE bits are turned off
1545 * so that the SETATTR request will not attempt
1546 * to process these. The gid will be set
1547 * separately if appropriate. The size is turned
1548 * off because it is assumed that a new file will
1549 * be created empty and if the file wasn't empty,
1550 * then the exclusive create will have failed
1551 * because the file must have existed already.
1552 * Therefore, no truncate operation is needed.
1553 */
1554 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556
1557 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558 if (e.error) {
1559 nfs4_error_t err;
1560
1561 /*
1562 * Couldn't correct the attributes of
1563 * the newly created file and the
1564 * attributes are wrong. Remove the
1565 * file and return an error to the
1566 * application.
1567 */
1568 /* XXX will this take care of client state ? */
1569 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571 " remove file", e.error));
1572
1573 /*
1574 * The file is currently open so try to close it first.
1575 *
1576 * If we do not close the file explicitly here then the
1577 * VN_RELE() would do an (implicit and asynchronous)
1578 * close for us. But such async close could race with
1579 * the nfs4_remove() below. If the async close is
1580 * slower than nfs4_remove() then nfs4_remove()
1581 * wouldn't remove the file but rename it to .nfsXXXX
1582 * instead.
1583 */
1584 nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585 CLOSE_NORM, 0, 0, 0);
1586 VN_RELE(vp);
1587 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588
1589 /*
1590 * Since we've reled the vnode and removed
1591 * the file we now need to return the error.
1592 * At this point we don't want to update the
1593 * dircaches, call nfs4_waitfor_purge_complete
1594 * or set vpp to vp so we need to skip these
1595 * as well.
1596 */
1597 goto skip_update_dircaches;
1598 }
1599 }
1600
1601 /*
1602 * If we created or found the correct vnode, due to create_flag or
1603 * fh_differs being set, then update directory cache attribute, readdir
1604 * and dnlc caches.
1605 */
1606 if (create_flag || fh_differs) {
1607 dirattr_info_t dinfo, *dinfop;
1608
1609 /*
1610 * Make sure getattr succeeded before using results.
1611 * note: op 7 is getattr(dir) for both flavors of
1612 * open(create).
1613 */
1614 if (create_flag && res.status == NFS4_OK) {
1615 dinfo.di_time_call = t;
1616 dinfo.di_cred = cr;
1617 dinfo.di_garp =
1618 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619 dinfop = &dinfo;
1620 } else {
1621 dinfop = NULL;
1622 }
1623
1624 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625 dinfop);
1626 }
1627
1628 /*
1629 * If the page cache for this file was flushed from actions
1630 * above, it was done asynchronously and if that is true,
1631 * there is a need to wait here for it to complete. This must
1632 * be done outside of start_fop/end_fop.
1633 */
1634 (void) nfs4_waitfor_purge_complete(vp);
1635
1636 /*
1637 * It is implicit that we are in the open case (create_flag == 0) since
1638 * fh_differs can only be set to a non-zero value in the open case.
1639 */
1640 if (fh_differs != 0 && vpi != NULL)
1641 VN_RELE(vpi);
1642
1643 /*
1644 * Be sure to set *vpp to the correct value before returning.
1645 */
1646 *vpp = vp;
1647
1648 skip_update_dircaches:
1649
1650 nfs4args_copen_free(open_args);
1651 if (setgid_flag) {
1652 nfs4args_verify_free(&argop[8]);
1653 nfs4args_setattr_free(&argop[9]);
1654 }
1655 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656
1657 if (ncr)
1658 crfree(ncr);
1659 kmem_free(argop, argoplist_size);
1660 return (e.error);
1661 }
1662
1663 /*
1664 * Reopen an open instance. cf. nfs4open_otw().
1665 *
1666 * Errors are returned by the nfs4_error_t parameter.
1667 * - ep->error contains an errno value or zero.
1668 * - if it is zero, ep->stat is set to an NFS status code, if any.
1669 * If the file could not be reopened, but the caller should continue, the
1670 * file is marked dead and no error values are returned. If the caller
1671 * should stop recovering open files and start over, either the ep->error
1672 * value or ep->stat will indicate an error (either something that requires
1673 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1674 * filehandles) may be handled silently by this routine.
1675 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676 * will be started, so the caller should not do it.
1677 *
1678 * Gotos:
1679 * - kill_file : reopen failed in such a fashion to constitute marking the
1680 * file dead and setting the open stream's 'os_failed_reopen' as 1. This
1681 * is for cases where recovery is not possible.
1682 * - failed_reopen : same as above, except that the file has already been
1683 * marked dead, so no need to do it again.
1684 * - bailout : reopen failed but we are able to recover and retry the reopen -
1685 * either within this function immediately or via the calling function.
1686 */
1687
1688 void
1689 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690 open_claim_type4 claim, bool_t frc_use_claim_previous,
1691 bool_t is_recov)
1692 {
1693 COMPOUND4args_clnt args;
1694 COMPOUND4res_clnt res;
1695 nfs_argop4 argop[4];
1696 nfs_resop4 *resop;
1697 OPEN4res *op_res = NULL;
1698 OPEN4cargs *open_args;
1699 GETFH4res *gf_res;
1700 rnode4_t *rp = VTOR4(vp);
1701 int doqueue = 1;
1702 cred_t *cr = NULL, *cred_otw = NULL;
1703 nfs4_open_owner_t *oop = NULL;
1704 seqid4 seqid;
1705 nfs4_ga_res_t *garp;
1706 char fn[MAXNAMELEN];
1707 nfs4_recov_state_t recov = {NULL, 0};
1708 nfs4_lost_rqst_t lost_rqst;
1709 mntinfo4_t *mi = VTOMI4(vp);
1710 bool_t abort;
1711 char *failed_msg = "";
1712 int fh_different;
1713 hrtime_t t;
1714 nfs4_bseqid_entry_t *bsep = NULL;
1715
1716 ASSERT(nfs4_consistent_type(vp));
1717 ASSERT(nfs_zone() == mi->mi_zone);
1718
1719 nfs4_error_zinit(ep);
1720
1721 /* this is the cred used to find the open owner */
1722 cr = state_to_cred(osp);
1723 if (cr == NULL) {
1724 failed_msg = "Couldn't reopen: no cred";
1725 goto kill_file;
1726 }
1727 /* use this cred for OTW operations */
1728 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729
1730 top:
1731 nfs4_error_zinit(ep);
1732
1733 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734 /* File system has been unmounted, quit */
1735 ep->error = EIO;
1736 failed_msg = "Couldn't reopen: file system has been unmounted";
1737 goto kill_file;
1738 }
1739
1740 oop = osp->os_open_owner;
1741
1742 ASSERT(oop != NULL);
1743 if (oop == NULL) { /* be defensive in non-DEBUG */
1744 failed_msg = "can't reopen: no open owner";
1745 goto kill_file;
1746 }
1747 open_owner_hold(oop);
1748
1749 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750 if (ep->error) {
1751 open_owner_rele(oop);
1752 oop = NULL;
1753 goto bailout;
1754 }
1755
1756 /*
1757 * If the rnode has a delegation and the delegation has been
1758 * recovered and the server didn't request a recall and the caller
1759 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760 * recovery) and the rnode hasn't been marked dead, then install
1761 * the delegation stateid in the open stream. Otherwise, proceed
1762 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763 */
1764 mutex_enter(&rp->r_statev4_lock);
1765 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766 !rp->r_deleg_return_pending &&
1767 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768 !rp->r_deleg_needs_recall &&
1769 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770 !(rp->r_flags & R4RECOVERR)) {
1771 mutex_enter(&osp->os_sync_lock);
1772 osp->os_delegation = 1;
1773 osp->open_stateid = rp->r_deleg_stateid;
1774 mutex_exit(&osp->os_sync_lock);
1775 mutex_exit(&rp->r_statev4_lock);
1776 goto bailout;
1777 }
1778 mutex_exit(&rp->r_statev4_lock);
1779
1780 /*
1781 * If the file failed recovery, just quit. This failure need not
1782 * affect other reopens, so don't return an error.
1783 */
1784 mutex_enter(&rp->r_statelock);
1785 if (rp->r_flags & R4RECOVERR) {
1786 mutex_exit(&rp->r_statelock);
1787 ep->error = 0;
1788 goto failed_reopen;
1789 }
1790 mutex_exit(&rp->r_statelock);
1791
1792 /*
1793 * argop is empty here
1794 *
1795 * PUTFH, OPEN, GETATTR
1796 */
1797 args.ctag = TAG_REOPEN;
1798 args.array_len = 4;
1799 args.array = argop;
1800
1801 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802 "nfs4_reopen: file is type %d, id %s",
1803 vp->v_type, rnode4info(VTOR4(vp))));
1804
1805 argop[0].argop = OP_CPUTFH;
1806
1807 if (claim != CLAIM_PREVIOUS) {
1808 /*
1809 * if this is a file mount then
1810 * use the mntinfo parentfh
1811 */
1812 argop[0].nfs_argop4_u.opcputfh.sfh =
1813 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814 VTOSV(vp)->sv_dfh;
1815 } else {
1816 /* putfh fh to reopen */
1817 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818 }
1819
1820 argop[1].argop = OP_COPEN;
1821 open_args = &argop[1].nfs_argop4_u.opcopen;
1822 open_args->claim = claim;
1823
1824 if (claim == CLAIM_NULL) {
1825
1826 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828 "failed for vp 0x%p for CLAIM_NULL with %m",
1829 (void *)vp);
1830 failed_msg = "Couldn't reopen: vtoname failed for "
1831 "CLAIM_NULL";
1832 /* nothing allocated yet */
1833 goto kill_file;
1834 }
1835
1836 open_args->open_claim4_u.cfile = fn;
1837 } else if (claim == CLAIM_PREVIOUS) {
1838
1839 /*
1840 * We have two cases to deal with here:
1841 * 1) We're being called to reopen files in order to satisfy
1842 * a lock operation request which requires us to explicitly
1843 * reopen files which were opened under a delegation. If
1844 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1845 * that case, frc_use_claim_previous is TRUE and we must
1846 * use the rnode's current delegation type (r_deleg_type).
1847 * 2) We're reopening files during some form of recovery.
1848 * In this case, frc_use_claim_previous is FALSE and we
1849 * use the delegation type appropriate for recovery
1850 * (r_deleg_needs_recovery).
1851 */
1852 mutex_enter(&rp->r_statev4_lock);
1853 open_args->open_claim4_u.delegate_type =
1854 frc_use_claim_previous ?
1855 rp->r_deleg_type :
1856 rp->r_deleg_needs_recovery;
1857 mutex_exit(&rp->r_statev4_lock);
1858
1859 } else if (claim == CLAIM_DELEGATE_CUR) {
1860
1861 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864 "with %m", (void *)vp);
1865 failed_msg = "Couldn't reopen: vtoname failed for "
1866 "CLAIM_DELEGATE_CUR";
1867 /* nothing allocated yet */
1868 goto kill_file;
1869 }
1870
1871 mutex_enter(&rp->r_statev4_lock);
1872 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873 rp->r_deleg_stateid;
1874 mutex_exit(&rp->r_statev4_lock);
1875
1876 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877 }
1878 open_args->opentype = OPEN4_NOCREATE;
1879 open_args->owner.clientid = mi2clientid(mi);
1880 open_args->owner.owner_len = sizeof (oop->oo_name);
1881 open_args->owner.owner_val =
1882 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883 bcopy(&oop->oo_name, open_args->owner.owner_val,
1884 open_args->owner.owner_len);
1885 open_args->share_access = 0;
1886 open_args->share_deny = 0;
1887
1888 mutex_enter(&osp->os_sync_lock);
1889 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892 (void *)osp, (void *)rp, osp->os_share_acc_read,
1893 osp->os_share_acc_write, osp->os_open_ref_count,
1894 osp->os_mmap_read, osp->os_mmap_write, claim));
1895
1896 if (osp->os_share_acc_read || osp->os_mmap_read)
1897 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898 if (osp->os_share_acc_write || osp->os_mmap_write)
1899 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900 if (osp->os_share_deny_read)
1901 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902 if (osp->os_share_deny_write)
1903 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904 mutex_exit(&osp->os_sync_lock);
1905
1906 seqid = nfs4_get_open_seqid(oop) + 1;
1907 open_args->seqid = seqid;
1908
1909 /* Construct the getfh part of the compound */
1910 argop[2].argop = OP_GETFH;
1911
1912 /* Construct the getattr part of the compound */
1913 argop[3].argop = OP_GETATTR;
1914 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916
1917 t = gethrtime();
1918
1919 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920
1921 if (ep->error) {
1922 if (!is_recov && !frc_use_claim_previous &&
1923 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926 cred_otw, vp, NULL, open_args);
1927 abort = nfs4_start_recovery(ep,
1928 VTOMI4(vp), vp, NULL, NULL,
1929 lost_rqst.lr_op == OP_OPEN ?
1930 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931 nfs4args_copen_free(open_args);
1932 goto bailout;
1933 }
1934
1935 nfs4args_copen_free(open_args);
1936
1937 if (ep->error == EACCES && cred_otw != cr) {
1938 crfree(cred_otw);
1939 cred_otw = cr;
1940 crhold(cred_otw);
1941 nfs4_end_open_seqid_sync(oop);
1942 open_owner_rele(oop);
1943 oop = NULL;
1944 goto top;
1945 }
1946 if (ep->error == ETIMEDOUT)
1947 goto bailout;
1948 failed_msg = "Couldn't reopen: rpc error";
1949 goto kill_file;
1950 }
1951
1952 if (nfs4_need_to_bump_seqid(&res))
1953 nfs4_set_open_seqid(seqid, oop, args.ctag);
1954
1955 switch (res.status) {
1956 case NFS4_OK:
1957 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958 mutex_enter(&rp->r_statelock);
1959 rp->r_delay_interval = 0;
1960 mutex_exit(&rp->r_statelock);
1961 }
1962 break;
1963 case NFS4ERR_BAD_SEQID:
1964 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965 args.ctag, open_args->seqid);
1966
1967 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969 NULL, OP_OPEN, bsep, NULL, NULL);
1970
1971 nfs4args_copen_free(open_args);
1972 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973 nfs4_end_open_seqid_sync(oop);
1974 open_owner_rele(oop);
1975 oop = NULL;
1976 kmem_free(bsep, sizeof (*bsep));
1977
1978 goto kill_file;
1979 case NFS4ERR_NO_GRACE:
1980 nfs4args_copen_free(open_args);
1981 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982 nfs4_end_open_seqid_sync(oop);
1983 open_owner_rele(oop);
1984 oop = NULL;
1985 if (claim == CLAIM_PREVIOUS) {
1986 /*
1987 * Retry as a plain open. We don't need to worry about
1988 * checking the changeinfo: it is acceptable for a
1989 * client to re-open a file and continue processing
1990 * (in the absence of locks).
1991 */
1992 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994 "will retry as CLAIM_NULL"));
1995 claim = CLAIM_NULL;
1996 nfs4_mi_kstat_inc_no_grace(mi);
1997 goto top;
1998 }
1999 failed_msg =
2000 "Couldn't reopen: tried reclaim outside grace period. ";
2001 goto kill_file;
2002 case NFS4ERR_GRACE:
2003 nfs4_set_grace_wait(mi);
2004 nfs4args_copen_free(open_args);
2005 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 nfs4_end_open_seqid_sync(oop);
2007 open_owner_rele(oop);
2008 oop = NULL;
2009 ep->error = nfs4_wait_for_grace(mi, &recov);
2010 if (ep->error != 0)
2011 goto bailout;
2012 goto top;
2013 case NFS4ERR_DELAY:
2014 nfs4_set_delay_wait(vp);
2015 nfs4args_copen_free(open_args);
2016 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 nfs4_end_open_seqid_sync(oop);
2018 open_owner_rele(oop);
2019 oop = NULL;
2020 ep->error = nfs4_wait_for_delay(vp, &recov);
2021 nfs4_mi_kstat_inc_delay(mi);
2022 if (ep->error != 0)
2023 goto bailout;
2024 goto top;
2025 case NFS4ERR_FHEXPIRED:
2026 /* recover filehandle and retry */
2027 abort = nfs4_start_recovery(ep,
2028 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029 nfs4args_copen_free(open_args);
2030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031 nfs4_end_open_seqid_sync(oop);
2032 open_owner_rele(oop);
2033 oop = NULL;
2034 if (abort == FALSE)
2035 goto top;
2036 failed_msg = "Couldn't reopen: recovery aborted";
2037 goto kill_file;
2038 case NFS4ERR_RESOURCE:
2039 case NFS4ERR_STALE_CLIENTID:
2040 case NFS4ERR_WRONGSEC:
2041 case NFS4ERR_EXPIRED:
2042 /*
2043 * Do not mark the file dead and let the calling
2044 * function initiate recovery.
2045 */
2046 nfs4args_copen_free(open_args);
2047 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048 nfs4_end_open_seqid_sync(oop);
2049 open_owner_rele(oop);
2050 oop = NULL;
2051 goto bailout;
2052 case NFS4ERR_ACCESS:
2053 if (cred_otw != cr) {
2054 crfree(cred_otw);
2055 cred_otw = cr;
2056 crhold(cred_otw);
2057 nfs4args_copen_free(open_args);
2058 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059 nfs4_end_open_seqid_sync(oop);
2060 open_owner_rele(oop);
2061 oop = NULL;
2062 goto top;
2063 }
2064 /* fall through */
2065 default:
2066 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069 rnode4info(VTOR4(vp))));
2070 failed_msg = "Couldn't reopen: NFSv4 error";
2071 nfs4args_copen_free(open_args);
2072 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073 goto kill_file;
2074 }
2075
2076 resop = &res.array[1]; /* open res */
2077 op_res = &resop->nfs_resop4_u.opopen;
2078
2079 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080
2081 /*
2082 * Check if the path we reopened really is the same
2083 * file. We could end up in a situation where the file
2084 * was removed and a new file created with the same name.
2085 */
2086 resop = &res.array[2];
2087 gf_res = &resop->nfs_resop4_u.opgetfh;
2088 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090 if (fh_different) {
2091 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093 /* Oops, we don't have the same file */
2094 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095 failed_msg = "Couldn't reopen: Persistent "
2096 "file handle changed";
2097 else
2098 failed_msg = "Couldn't reopen: Volatile "
2099 "(no expire on open) file handle changed";
2100
2101 nfs4args_copen_free(open_args);
2102 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103 nfs_rw_exit(&mi->mi_fh_lock);
2104 goto kill_file;
2105
2106 } else {
2107 /*
2108 * We have volatile file handles that don't compare.
2109 * If the fids are the same then we assume that the
2110 * file handle expired but the rnode still refers to
2111 * the same file object.
2112 *
2113 * First check that we have fids or not.
2114 * If we don't we have a dumb server so we will
2115 * just assume every thing is ok for now.
2116 */
2117 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118 rp->r_attr.va_mask & AT_NODEID &&
2119 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120 /*
2121 * We have fids, but they don't
2122 * compare. So kill the file.
2123 */
2124 failed_msg =
2125 "Couldn't reopen: file handle changed"
2126 " due to mismatched fids";
2127 nfs4args_copen_free(open_args);
2128 xdr_free(xdr_COMPOUND4res_clnt,
2129 (caddr_t)&res);
2130 nfs_rw_exit(&mi->mi_fh_lock);
2131 goto kill_file;
2132 } else {
2133 /*
2134 * We have volatile file handles that refers
2135 * to the same file (at least they have the
2136 * same fid) or we don't have fids so we
2137 * can't tell. :(. We'll be a kind and accepting
2138 * client so we'll update the rnode's file
2139 * handle with the otw handle.
2140 *
2141 * We need to drop mi->mi_fh_lock since
2142 * sh4_update acquires it. Since there is
2143 * only one recovery thread there is no
2144 * race.
2145 */
2146 nfs_rw_exit(&mi->mi_fh_lock);
2147 sfh4_update(rp->r_fh, &gf_res->object);
2148 }
2149 }
2150 } else {
2151 nfs_rw_exit(&mi->mi_fh_lock);
2152 }
2153
2154 ASSERT(nfs4_consistent_type(vp));
2155
2156 /*
2157 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158 * over. Presumably if there is a persistent error it will show up
2159 * when we resend the OPEN.
2160 */
2161 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162 bool_t retry_open = FALSE;
2163
2164 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165 cred_otw, is_recov, &retry_open,
2166 oop, FALSE, ep, NULL);
2167 if (ep->error || ep->stat) {
2168 nfs4args_copen_free(open_args);
2169 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170 nfs4_end_open_seqid_sync(oop);
2171 open_owner_rele(oop);
2172 oop = NULL;
2173 goto top;
2174 }
2175 }
2176
2177 mutex_enter(&osp->os_sync_lock);
2178 osp->open_stateid = op_res->stateid;
2179 osp->os_delegation = 0;
2180 /*
2181 * Need to reset this bitfield for the possible case where we were
2182 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183 * we could retry the CLOSE, OPENed the file again.
2184 */
2185 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186 osp->os_final_close = 0;
2187 osp->os_force_close = 0;
2188 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189 osp->os_dc_openacc = open_args->share_access;
2190 mutex_exit(&osp->os_sync_lock);
2191
2192 nfs4_end_open_seqid_sync(oop);
2193
2194 /* accept delegation, if any */
2195 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196
2197 nfs4args_copen_free(open_args);
2198
2199 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200
2201 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202
2203 ASSERT(nfs4_consistent_type(vp));
2204
2205 open_owner_rele(oop);
2206 crfree(cr);
2207 crfree(cred_otw);
2208 return;
2209
2210 kill_file:
2211 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212 failed_reopen:
2213 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215 (void *)osp, (void *)cr, rnode4info(rp)));
2216 mutex_enter(&osp->os_sync_lock);
2217 osp->os_failed_reopen = 1;
2218 mutex_exit(&osp->os_sync_lock);
2219 bailout:
2220 if (oop != NULL) {
2221 nfs4_end_open_seqid_sync(oop);
2222 open_owner_rele(oop);
2223 }
2224 if (cr != NULL)
2225 crfree(cr);
2226 if (cred_otw != NULL)
2227 crfree(cred_otw);
2228 }
2229
2230 /* for . and .. OPENs */
2231 /* ARGSUSED */
2232 static int
2233 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234 {
2235 rnode4_t *rp;
2236 nfs4_ga_res_t gar;
2237
2238 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239
2240 /*
2241 * If close-to-open consistency checking is turned off or
2242 * if there is no cached data, we can avoid
2243 * the over the wire getattr. Otherwise, force a
2244 * call to the server to get fresh attributes and to
2245 * check caches. This is required for close-to-open
2246 * consistency.
2247 */
2248 rp = VTOR4(*vpp);
2249 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251 return (0);
2252
2253 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254 }
2255
2256 /*
2257 * CLOSE a file
2258 */
2259 /* ARGSUSED */
2260 static int
2261 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262 caller_context_t *ct)
2263 {
2264 rnode4_t *rp;
2265 int error = 0;
2266 int r_error = 0;
2267 int n4error = 0;
2268 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2269
2270 /*
2271 * Remove client state for this (lockowner, file) pair.
2272 * Issue otw v4 call to have the server do the same.
2273 */
2274
2275 rp = VTOR4(vp);
2276
2277 /*
2278 * zone_enter(2) prevents processes from changing zones with NFS files
2279 * open; if we happen to get here from the wrong zone we can't do
2280 * anything over the wire.
2281 */
2282 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283 /*
2284 * We could attempt to clean up locks, except we're sure
2285 * that the current process didn't acquire any locks on
2286 * the file: any attempt to lock a file belong to another zone
2287 * will fail, and one can't lock an NFS file and then change
2288 * zones, as that fails too.
2289 *
2290 * Returning an error here is the sane thing to do. A
2291 * subsequent call to VN_RELE() which translates to a
2292 * nfs4_inactive() will clean up state: if the zone of the
2293 * vnode's origin is still alive and kicking, the inactive
2294 * thread will handle the request (from the correct zone), and
2295 * everything (minus the OTW close call) should be OK. If the
2296 * zone is going away nfs4_async_inactive() will throw away
2297 * delegations, open streams and cached pages inline.
2298 */
2299 return (EIO);
2300 }
2301
2302 /*
2303 * If we are using local locking for this filesystem, then
2304 * release all of the SYSV style record locks. Otherwise,
2305 * we are doing network locking and we need to release all
2306 * of the network locks. All of the locks held by this
2307 * process on this file are released no matter what the
2308 * incoming reference count is.
2309 */
2310 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312 cleanshares(vp, ttoproc(curthread)->p_pid);
2313 } else
2314 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315
2316 if (e.error) {
2317 struct lm_sysid *lmsid;
2318 lmsid = nfs4_find_sysid(VTOMI4(vp));
2319 if (lmsid == NULL) {
2320 DTRACE_PROBE2(unknown__sysid, int, e.error,
2321 vnode_t *, vp);
2322 } else {
2323 cleanlocks(vp, ttoproc(curthread)->p_pid,
2324 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325
2326 lm_rel_sysid(lmsid);
2327 }
2328 return (e.error);
2329 }
2330
2331 if (count > 1)
2332 return (0);
2333
2334 /*
2335 * If the file has been `unlinked', then purge the
2336 * DNLC so that this vnode will get reycled quicker
2337 * and the .nfs* file on the server will get removed.
2338 */
2339 if (rp->r_unldvp != NULL)
2340 dnlc_purge_vp(vp);
2341
2342 /*
2343 * If the file was open for write and there are pages,
2344 * do a synchronous flush and commit of all of the
2345 * dirty and uncommitted pages.
2346 */
2347 ASSERT(!e.error);
2348 if ((flag & FWRITE) && nfs4_has_pages(vp))
2349 error = nfs4_putpage_commit(vp, 0, 0, cr);
2350
2351 mutex_enter(&rp->r_statelock);
2352 r_error = rp->r_error;
2353 rp->r_error = 0;
2354 mutex_exit(&rp->r_statelock);
2355
2356 /*
2357 * If this file type is one for which no explicit 'open' was
2358 * done, then bail now (ie. no need for protocol 'close'). If
2359 * there was an error w/the vm subsystem, return _that_ error,
2360 * otherwise, return any errors that may've been reported via
2361 * the rnode.
2362 */
2363 if (vp->v_type != VREG)
2364 return (error ? error : r_error);
2365
2366 /*
2367 * The sync putpage commit may have failed above, but since
2368 * we're working w/a regular file, we need to do the protocol
2369 * 'close' (nfs4close_one will figure out if an otw close is
2370 * needed or not). Report any errors _after_ doing the protocol
2371 * 'close'.
2372 */
2373 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374 n4error = e.error ? e.error : geterrno4(e.stat);
2375
2376 /*
2377 * Error reporting prio (Hi -> Lo)
2378 *
2379 * i) nfs4_putpage_commit (error)
2380 * ii) rnode's (r_error)
2381 * iii) nfs4close_one (n4error)
2382 */
2383 return (error ? error : (r_error ? r_error : n4error));
2384 }
2385
2386 /*
2387 * Initialize *lost_rqstp.
2388 */
2389
2390 static void
2391 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393 vnode_t *vp)
2394 {
2395 if (error != ETIMEDOUT && error != EINTR &&
2396 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397 lost_rqstp->lr_op = 0;
2398 return;
2399 }
2400
2401 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402 "nfs4close_save_lost_rqst: error %d", error));
2403
2404 lost_rqstp->lr_op = OP_CLOSE;
2405 /*
2406 * The vp is held and rele'd via the recovery code.
2407 * See nfs4_save_lost_rqst.
2408 */
2409 lost_rqstp->lr_vp = vp;
2410 lost_rqstp->lr_dvp = NULL;
2411 lost_rqstp->lr_oop = oop;
2412 lost_rqstp->lr_osp = osp;
2413 ASSERT(osp != NULL);
2414 ASSERT(mutex_owned(&osp->os_sync_lock));
2415 osp->os_pending_close = 1;
2416 lost_rqstp->lr_lop = NULL;
2417 lost_rqstp->lr_cr = cr;
2418 lost_rqstp->lr_flk = NULL;
2419 lost_rqstp->lr_putfirst = FALSE;
2420 }
2421
2422 /*
2423 * Assumes you already have the open seqid sync grabbed as well as the
2424 * 'os_sync_lock'. Note: this will release the open seqid sync and
2425 * 'os_sync_lock' if client recovery starts. Calling functions have to
2426 * be prepared to handle this.
2427 *
2428 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2429 * was needed and was started, and that the calling function should retry
2430 * this function; otherwise it is returned as 0.
2431 *
2432 * Errors are returned via the nfs4_error_t parameter.
2433 */
2434 static void
2435 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2436 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2437 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2438 {
2439 COMPOUND4args_clnt args;
2440 COMPOUND4res_clnt res;
2441 CLOSE4args *close_args;
2442 nfs_resop4 *resop;
2443 nfs_argop4 argop[3];
2444 int doqueue = 1;
2445 mntinfo4_t *mi;
2446 seqid4 seqid;
2447 vnode_t *vp;
2448 bool_t needrecov = FALSE;
2449 nfs4_lost_rqst_t lost_rqst;
2450 hrtime_t t;
2451
2452 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2453
2454 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2455
2456 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2457
2458 /* Only set this to 1 if recovery is started */
2459 *recov = 0;
2460
2461 /* do the OTW call to close the file */
2462
2463 if (close_type == CLOSE_RESEND)
2464 args.ctag = TAG_CLOSE_LOST;
2465 else if (close_type == CLOSE_AFTER_RESEND)
2466 args.ctag = TAG_CLOSE_UNDO;
2467 else
2468 args.ctag = TAG_CLOSE;
2469
2470 args.array_len = 3;
2471 args.array = argop;
2472
2473 vp = RTOV4(rp);
2474
2475 mi = VTOMI4(vp);
2476
2477 /* putfh target fh */
2478 argop[0].argop = OP_CPUTFH;
2479 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2480
2481 argop[1].argop = OP_GETATTR;
2482 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2483 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2484
2485 argop[2].argop = OP_CLOSE;
2486 close_args = &argop[2].nfs_argop4_u.opclose;
2487
2488 seqid = nfs4_get_open_seqid(oop) + 1;
2489
2490 close_args->seqid = seqid;
2491 close_args->open_stateid = osp->open_stateid;
2492
2493 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2494 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2495 rnode4info(rp)));
2496
2497 t = gethrtime();
2498
2499 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2500
2501 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2502 nfs4_set_open_seqid(seqid, oop, args.ctag);
2503 }
2504
2505 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2506 if (ep->error && !needrecov) {
2507 /*
2508 * if there was an error and no recovery is to be done
2509 * then then set up the file to flush its cache if
2510 * needed for the next caller.
2511 */
2512 mutex_enter(&rp->r_statelock);
2513 PURGE_ATTRCACHE4_LOCKED(rp);
2514 rp->r_flags &= ~R4WRITEMODIFIED;
2515 mutex_exit(&rp->r_statelock);
2516 return;
2517 }
2518
2519 if (needrecov) {
2520 bool_t abort;
2521 nfs4_bseqid_entry_t *bsep = NULL;
2522
2523 if (close_type != CLOSE_RESEND)
2524 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2525 osp, cred_otw, vp);
2526
2527 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2528 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2529 0, args.ctag, close_args->seqid);
2530
2531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2532 "nfs4close_otw: initiating recovery. error %d "
2533 "res.status %d", ep->error, res.status));
2534
2535 /*
2536 * Drop the 'os_sync_lock' here so we don't hit
2537 * a potential recursive mutex_enter via an
2538 * 'open_stream_hold()'.
2539 */
2540 mutex_exit(&osp->os_sync_lock);
2541 *have_sync_lockp = 0;
2542 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2543 (close_type != CLOSE_RESEND &&
2544 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2545 OP_CLOSE, bsep, NULL, NULL);
2546
2547 /* drop open seq sync, and let the calling function regrab it */
2548 nfs4_end_open_seqid_sync(oop);
2549 *did_start_seqid_syncp = 0;
2550
2551 if (bsep)
2552 kmem_free(bsep, sizeof (*bsep));
2553 /*
2554 * For signals, the caller wants to quit, so don't say to
2555 * retry. For forced unmount, if it's a user thread, it
2556 * wants to quit. If it's a recovery thread, the retry
2557 * will happen higher-up on the call stack. Either way,
2558 * don't say to retry.
2559 */
2560 if (abort == FALSE && ep->error != EINTR &&
2561 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2562 close_type != CLOSE_RESEND &&
2563 close_type != CLOSE_AFTER_RESEND)
2564 *recov = 1;
2565 else
2566 *recov = 0;
2567
2568 if (!ep->error)
2569 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2570 return;
2571 }
2572
2573 if (res.status) {
2574 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2575 return;
2576 }
2577
2578 mutex_enter(&rp->r_statev4_lock);
2579 rp->created_v4 = 0;
2580 mutex_exit(&rp->r_statev4_lock);
2581
2582 resop = &res.array[2];
2583 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2584 osp->os_valid = 0;
2585
2586 /*
2587 * This removes the reference obtained at OPEN; ie, when the
2588 * open stream structure was created.
2589 *
2590 * We don't have to worry about calling 'open_stream_rele'
2591 * since we our currently holding a reference to the open
2592 * stream which means the count cannot go to 0 with this
2593 * decrement.
2594 */
2595 ASSERT(osp->os_ref_count >= 2);
2596 osp->os_ref_count--;
2597
2598 if (ep->error == 0) {
2599 mutex_exit(&osp->os_sync_lock);
2600 *have_sync_lockp = 0;
2601
2602 nfs4_attr_cache(vp,
2603 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2604 t, cred_otw, TRUE, NULL);
2605 }
2606
2607 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2608 " returning %d", ep->error));
2609
2610 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2611 }
2612
/*
 * Read from a file (VOP_READ).
 *
 * After validating the request and checking for pending recovery, the
 * read is satisfied either directly over the wire (when caching is
 * disabled or direct I/O applies) or through the VM page cache via
 * vpm/segmap mappings, one MAXBSIZE window at a time.
 */
/* ARGSUSED */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	/* The caller (VOP layer) holds the rnode's rwlock as reader. */
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	/* Reject negative offsets and offset+length overflow. */
	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/* Fail early if the rnode is marked for recovery. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Revalidate cached data against the server's attributes. */
		if (error = nfs4_validate_caches(vp, cr))
			break;

		/* Wait out any in-progress cache purge before mapping. */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & R4INCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		/* Clip the transfer at the cached end-of-file. */
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (uint_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/* Release the mapping; ignore secondary errors. */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2737
/*
 * Write to a file (VOP_WRITE).
 *
 * Handles append serialization, RLIMIT_FSIZE enforcement, and write
 * delegations, then moves the data either directly over the wire (when
 * caching is disabled or direct I/O applies) or through the VM page
 * cache via vpm/segmap, one MAXBSIZE window at a time.  On failure the
 * uio is rewound to the state captured at the start of the failing
 * transfer.
 */
/* ARGSUSED */
static int
nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rlim64_t limit = uiop->uio_llimit;
	rnode4_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	u_offset_t offset;
	mntinfo4_t *mi;
	uint_t bsize;

	rp = VTOR4(vp);

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	/* Fail early if the rnode is marked for recovery. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.  Upgrade the caller's
		 * reader hold on r_rwlock to a writer hold.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR4(vp)))
				return (EINTR);
		}

		/* Append writes start at the server's idea of the size. */
		va.va_mask = AT_SIZE;
		error = nfs4getattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
		return (EINVAL);

	/*
	 * NOTE(review): 'limit' is clamped here but never read again;
	 * the RLIMIT_FSIZE check below compares against the raw
	 * uiop->uio_llimit instead.  Confirm whether the clamp was
	 * intended to feed that comparison.
	 */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > uiop->uio_llimit) {
		remainder = offset - uiop->uio_llimit;
		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	/* update the change attribute, if we have a write delegation */

	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
		rp->r_deleg_change++;

	mutex_exit(&rp->r_statev4_lock);

	/* Serialize against lock/IO operations on this rnode. */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t bufsize;
		int count;
		u_offset_t org_offset;
		stable_how4 stab_comm;
nfs4_fwrite:
		if (rp->r_flags & R4STALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			goto bottom;
		}

		/* Copy through a bounce buffer, at most mi_stsize per call. */
		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			if (ioflag & FDSYNC)
				stab_comm = DATA_SYNC4;
			else
				stab_comm = FILE_SYNC4;
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_loffset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfs4write(vp, base, org_offset,
				    count, cr, &stab_comm);
				if (!error) {
					/* Grow the cached size if extended. */
					mutex_enter(&rp->r_statelock);
					if (rp->r_size < uiop->uio_loffset)
						rp->r_size = uiop->uio_loffset;
					mutex_exit(&rp->r_statelock);
				}
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	bsize = vp->v_vfsp->vfs_bsize;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Snapshot uio state so a failure can rewind it at bottom. */
		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & R4STALE) {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0) {
			if (INTR4(vp)) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps /proc from stopping us. */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
					mutex_exit(&rp->r_statelock);
					if (lwp != NULL)
						lwp->lwp_nostop--;
					error = EINTR;
					goto bottom;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
				cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)n, uiop);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp4(rp, NULL, n, uiop, 0);
		} else {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				/*
				 * A page can be created (rather than
				 * faulted in) only when the write fully
				 * covers it or extends past EOF.
				 */
				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp4(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp4(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI4_NOAC)
				flags = SM_WRITE;
			else if ((uiop->uio_loffset % bsize) == 0 ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			/* Sync semantics or prior ENOSPC force a sync write. */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & R4OUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs4_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		/* Rewind the uio to the start of the failed transfer. */
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else {
		/* Restore any portion withheld by the rlimit clamp. */
		uiop->uio_resid += remainder;

		mutex_enter(&rp->r_statev4_lock);
		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
			/* Holding a write delegation: update times locally. */
			gethrestime(&rp->r_attr.va_mtime);
			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
		}
		mutex_exit(&rp->r_statev4_lock);
	}

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}
3050
3051 /*
3052 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3053 */
static int
nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr)
{
	struct buf *bp;
	int error;
	page_t *savepp;
	uchar_t fsdata;
	stable_how4 stab_comm;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	/* Build a buf describing page-level i/o over the page list. */
	bp = pageio_setup(pp, len, vp, flags);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0. This
	 * is correct since we want to do I/O on a page
	 * boundary. bp_mapin will use this addr to calculate
	 * an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)off;
	bp_mapin(bp);

	/*
	 * Async writes may go out UNSTABLE4 while memory is plentiful
	 * (freemem > desfree); everything else requests FILE_SYNC4.
	 */
	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
	    freemem > desfree)
		stab_comm = UNSTABLE4;
	else
		stab_comm = FILE_SYNC4;

	/* nfs4_bio() may update stab_comm (passed by address). */
	error = nfs4_bio(bp, &stab_comm, cr, FALSE);

	bp_mapout(bp);
	pageio_done(bp);

	/*
	 * Tag every page in the (circular) page list so later code can
	 * tell whether the page still needs an over-the-wire commit.
	 */
	if (stab_comm == UNSTABLE4)
		fsdata = C_DELAYCOMMIT;
	else
		fsdata = C_NOCOMMIT;

	savepp = pp;
	do {
		pp->p_fsdata = fsdata;
	} while ((pp = pp->p_next) != savepp);

	return (error);
}
3107
3108 /*
3109 */
static int
nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int reopen_needed;

	ASSERT(nfs_zone() == mi->mi_zone);


	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	if (!oop)
		return (EIO);

	/* returns with 'os_sync_lock' held */
	osp = find_open_stream(oop, rp);
	if (!osp) {
		open_owner_rele(oop);
		return (EIO);
	}

	/* A previous reopen attempt on this stream failed; give up. */
	if (osp->os_failed_reopen) {
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
		open_owner_rele(oop);
		return (EIO);
	}

	/*
	 * Determine whether a reopen is needed. If this
	 * is a delegation open stream, then the os_delegation bit
	 * should be set.
	 */

	reopen_needed = osp->os_delegation;

	mutex_exit(&osp->os_sync_lock);
	open_owner_rele(oop);

	if (reopen_needed) {
		/*
		 * Reopen with CLAIM_NULL, then recheck under
		 * os_sync_lock that the reopen actually succeeded.
		 */
		nfs4_error_zinit(ep);
		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
		mutex_enter(&osp->os_sync_lock);
		if (ep->error || ep->stat || osp->os_failed_reopen) {
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			return (EIO);
		}
		mutex_exit(&osp->os_sync_lock);
	}
	open_stream_rele(osp, rp);

	return (0);
}
3166
3167 /*
3168 * Write to file. Writes to remote server in largest size
3169 * chunks that the server can handle. Write is synchronous.
3170 */
3171 static int
3172 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3173 stable_how4 *stab_comm)
3174 {
3175 mntinfo4_t *mi;
3176 COMPOUND4args_clnt args;
3177 COMPOUND4res_clnt res;
3178 WRITE4args *wargs;
3179 WRITE4res *wres;
3180 nfs_argop4 argop[2];
3181 nfs_resop4 *resop;
3182 int tsize;
3183 stable_how4 stable;
3184 rnode4_t *rp;
3185 int doqueue = 1;
3186 bool_t needrecov;
3187 nfs4_recov_state_t recov_state;
3188 nfs4_stateid_types_t sid_types;
3189 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3190 int recov;
3191
3192 rp = VTOR4(vp);
3193 mi = VTOMI4(vp);
3194
3195 ASSERT(nfs_zone() == mi->mi_zone);
3196
3197 stable = *stab_comm;
3198 *stab_comm = FILE_SYNC4;
3199
3200 needrecov = FALSE;
3201 recov_state.rs_flags = 0;
3202 recov_state.rs_num_retry_despite_err = 0;
3203 nfs4_init_stateid_types(&sid_types);
3204
3205 /* Is curthread the recovery thread? */
3206 mutex_enter(&mi->mi_lock);
3207 recov = (mi->mi_recovthread == curthread);
3208 mutex_exit(&mi->mi_lock);
3209
3210 recov_retry:
3211 args.ctag = TAG_WRITE;
3212 args.array_len = 2;
3213 args.array = argop;
3214
3215 if (!recov) {
3216 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3217 &recov_state, NULL);
3218 if (e.error)
3219 return (e.error);
3220 }
3221
3222 /* 0. putfh target fh */
3223 argop[0].argop = OP_CPUTFH;
3224 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3225
3226 /* 1. write */
3227 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3228
3229 do {
3230
3231 wargs->offset = (offset4)offset;
3232 wargs->data_val = base;
3233
3234 if (mi->mi_io_kstats) {
3235 mutex_enter(&mi->mi_lock);
3236 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3237 mutex_exit(&mi->mi_lock);
3238 }
3239
3240 if ((vp->v_flag & VNOCACHE) ||
3241 (rp->r_flags & R4DIRECTIO) ||
3242 (mi->mi_flags & MI4_DIRECTIO))
3243 tsize = MIN(mi->mi_stsize, count);
3244 else
3245 tsize = MIN(mi->mi_curwrite, count);
3246 wargs->data_len = (uint_t)tsize;
3247 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3248
3249 if (mi->mi_io_kstats) {
3250 mutex_enter(&mi->mi_lock);
3251 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3252 mutex_exit(&mi->mi_lock);
3253 }
3254
3255 if (!recov) {
3256 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3257 if (e.error && !needrecov) {
3258 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3259 &recov_state, needrecov);
3260 return (e.error);
3261 }
3262 } else {
3263 if (e.error)
3264 return (e.error);
3265 }
3266
3267 /*
3268 * Do handling of OLD_STATEID outside
3269 * of the normal recovery framework.
3270 *
3271 * If write receives a BAD stateid error while using a
3272 * delegation stateid, retry using the open stateid (if it
3273 * exists). If it doesn't have an open stateid, reopen the
3274 * file first, then retry.
3275 */
3276 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3277 sid_types.cur_sid_type != SPEC_SID) {
3278 nfs4_save_stateid(&wargs->stateid, &sid_types);
3279 if (!recov)
3280 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3281 &recov_state, needrecov);
3282 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3283 goto recov_retry;
3284 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3285 sid_types.cur_sid_type == DEL_SID) {
3286 nfs4_save_stateid(&wargs->stateid, &sid_types);
3287 mutex_enter(&rp->r_statev4_lock);
3288 rp->r_deleg_return_pending = TRUE;
3289 mutex_exit(&rp->r_statev4_lock);
3290 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3291 if (!recov)
3292 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3293 &recov_state, needrecov);
3294 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3295 return (EIO);
3296 }
3297 if (!recov)
3298 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3299 &recov_state, needrecov);
3300 /* hold needed for nfs4delegreturn_thread */
3301 VN_HOLD(vp);
3302 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3303 NFS4_DR_DISCARD), FALSE);
3304 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3305 goto recov_retry;
3306 }
3307
3308 if (needrecov) {
3309 bool_t abort;
3310
3311 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3312 "nfs4write: client got error %d, res.status %d"
3313 ", so start recovery", e.error, res.status));
3314
3315 abort = nfs4_start_recovery(&e,
3316 VTOMI4(vp), vp, NULL, &wargs->stateid,
3317 NULL, OP_WRITE, NULL, NULL, NULL);
3318 if (!e.error) {
3319 e.error = geterrno4(res.status);
3320 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3321 }
3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3323 &recov_state, needrecov);
3324 if (abort == FALSE)
3325 goto recov_retry;
3326 return (e.error);
3327 }
3328
3329 if (res.status) {
3330 e.error = geterrno4(res.status);
3331 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3332 if (!recov)
3333 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3334 &recov_state, needrecov);
3335 return (e.error);
3336 }
3337
3338 resop = &res.array[1]; /* write res */
3339 wres = &resop->nfs_resop4_u.opwrite;
3340
3341 if ((int)wres->count > tsize) {
3342 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3343
3344 zcmn_err(getzoneid(), CE_WARN,
3345 "nfs4write: server wrote %u, requested was %u",
3346 (int)wres->count, tsize);
3347 if (!recov)
3348 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3349 &recov_state, needrecov);
3350 return (EIO);
3351 }
3352 if (wres->committed == UNSTABLE4) {
3353 *stab_comm = UNSTABLE4;
3354 if (wargs->stable == DATA_SYNC4 ||
3355 wargs->stable == FILE_SYNC4) {
3356 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3357 zcmn_err(getzoneid(), CE_WARN,
3358 "nfs4write: server %s did not commit "
3359 "to stable storage",
3360 rp->r_server->sv_hostname);
3361 if (!recov)
3362 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3363 OH_WRITE, &recov_state, needrecov);
3364 return (EIO);
3365 }
3366 }
3367
3368 tsize = (int)wres->count;
3369 count -= tsize;
3370 base += tsize;
3371 offset += tsize;
3372 if (mi->mi_io_kstats) {
3373 mutex_enter(&mi->mi_lock);
3374 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3375 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3376 tsize;
3377 mutex_exit(&mi->mi_lock);
3378 }
3379 lwp_stat_update(LWP_STAT_OUBLK, 1);
3380 mutex_enter(&rp->r_statelock);
3381 if (rp->r_flags & R4HAVEVERF) {
3382 if (rp->r_writeverf != wres->writeverf) {
3383 nfs4_set_mod(vp);
3384 rp->r_writeverf = wres->writeverf;
3385 }
3386 } else {
3387 rp->r_writeverf = wres->writeverf;
3388 rp->r_flags |= R4HAVEVERF;
3389 }
3390 PURGE_ATTRCACHE4_LOCKED(rp);
3391 rp->r_flags |= R4WRITEMODIFIED;
3392 gethrestime(&rp->r_attr.va_mtime);
3393 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3394 mutex_exit(&rp->r_statelock);
3395 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3396 } while (count);
3397
3398 if (!recov)
3399 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3400 needrecov);
3401
3402 return (e.error);
3403 }
3404
3405 /*
3406 * Read from a file. Reads data in largest chunks our interface can handle.
3407 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/*
		 * Chunk size: direct i/o uses the server transfer size
		 * (mi_tsize), cached i/o the current read size
		 * (mi_curread).
		 */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);

		/*
		 * Read results land either directly in the caller's uio
		 * (res_uiop) or in the supplied kernel buffer
		 * (res_data_val_alt); exactly one of the two is set.
		 */
		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		rargs->wlist = NULL;

		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef DEBUG
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework. There are two differences
		 * between async and sync reads. The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists). If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			if (sid_types.cur_sid_type == SPEC_SID) {
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			/*
			 * Bad delegation stateid: queue an asynchronous
			 * delegreturn (discarding the delegation) and retry
			 * with the next stateid type.
			 */
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));
			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL, NULL, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
			 * stateid. This avoids looping with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    sid_types.cur_sid_type == SPEC_SID)
				abort = TRUE;

			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				goto recov_retry;
			}

			if (!e.error) {
				e.error = geterrno4(res.status);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			}
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			return (e.error);
		}

		/* Advance past the bytes the server actually returned. */
		data_len = res.array[1].nfs_resop4_u.opread.data_len;
		count -= data_len;
		if (base)
			base += data_len;
		offset += data_len;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
		is_eof = res.array[1].nfs_resop4_u.opread.eof;
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	} while (count && !is_eof);

	/* Residual: non-zero if EOF was hit before count was satisfied. */
	*residp = count;

	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);

	return (e.error);
}
3634
3635 /* ARGSUSED */
3636 static int
3637 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3638 caller_context_t *ct)
3639 {
3640 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3641 return (EIO);
3642 switch (cmd) {
3643 case _FIODIRECTIO:
3644 return (nfs4_directio(vp, (int)arg, cr));
3645 default:
3646 return (ENOTTY);
3647 }
3648 }
3649
3650 /* ARGSUSED */
3651 int
3652 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3653 caller_context_t *ct)
3654 {
3655 int error;
3656 rnode4_t *rp = VTOR4(vp);
3657
3658 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3659 return (EIO);
3660 /*
3661 * If it has been specified that the return value will
3662 * just be used as a hint, and we are only being asked
3663 * for size, fsid or rdevid, then return the client's
3664 * notion of these values without checking to make sure
3665 * that the attribute cache is up to date.
3666 * The whole point is to avoid an over the wire GETATTR
3667 * call.
3668 */
3669 if (flags & ATTR_HINT) {
3670 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3671 mutex_enter(&rp->r_statelock);
3672 if (vap->va_mask & AT_SIZE)
3673 vap->va_size = rp->r_size;
3674 if (vap->va_mask & AT_FSID)
3675 vap->va_fsid = rp->r_attr.va_fsid;
3676 if (vap->va_mask & AT_RDEV)
3677 vap->va_rdev = rp->r_attr.va_rdev;
3678 mutex_exit(&rp->r_statelock);
3679 return (0);
3680 }
3681 }
3682
3683 /*
3684 * Only need to flush pages if asking for the mtime
3685 * and if there any dirty pages or any outstanding
3686 * asynchronous (write) requests for this file.
3687 */
3688 if (vap->va_mask & AT_MTIME) {
3689 rp = VTOR4(vp);
3690 if (nfs4_has_pages(vp)) {
3691 mutex_enter(&rp->r_statev4_lock);
3692 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3693 mutex_exit(&rp->r_statev4_lock);
3694 if (rp->r_flags & R4DIRTY ||
3695 rp->r_awcount > 0) {
3696 mutex_enter(&rp->r_statelock);
3697 rp->r_gcount++;
3698 mutex_exit(&rp->r_statelock);
3699 error =
3700 nfs4_putpage(vp, (u_offset_t)0,
3701 0, 0, cr, NULL);
3702 mutex_enter(&rp->r_statelock);
3703 if (error && (error == ENOSPC ||
3704 error == EDQUOT)) {
3705 if (!rp->r_error)
3706 rp->r_error = error;
3707 }
3708 if (--rp->r_gcount == 0)
3709 cv_broadcast(&rp->r_cv);
3710 mutex_exit(&rp->r_statelock);
3711 }
3712 } else {
3713 mutex_exit(&rp->r_statev4_lock);
3714 }
3715 }
3716 }
3717 return (nfs4getattr(vp, vap, cr));
3718 }
3719
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	/*
	 * The server may have cleared the setuid/setgid bits (e.g. on
	 * a chown).  Modes are considered equal if the client mode,
	 * with those two bits masked off, matches the server mode.
	 * Returns 0 for a match (OK), 1 otherwise (BAD).
	 */
	mode_t masked = on_client & ~(S_ISUID | S_ISGID);

	return (masked == from_server ? 0 : 1);
}
3734
/*ARGSUSED4*/
static int
nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	/* Refuse attributes that can never be set explicitly. */
	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/*
	 * Don't call secpolicy_vnode_setattr, the client cannot
	 * use its cached attributes to make security decisions
	 * as the server may be faking mode bits or mapping uid/gid.
	 * Always just let the server do the checking.
	 * If we provide the ability to remove basic privileges
	 * to setattr (e.g. basic without chmod) then we will
	 * need to add a check here before calling the server.
	 */
	error = nfs4setattr(vp, vap, flags, cr, NULL);

	/*
	 * On a successful size change, post the corresponding vnode
	 * event: truncate for size 0, resize otherwise.
	 */
	if (error == 0 && (vap->va_mask & AT_SIZE)) {
		if (vap->va_size == 0) {
			vnevent_truncate(vp, ct);
		} else {
			vnevent_resize(vp, ct);
		}
	}

	return (error);
}
3769
3770 /*
3771 * To replace the "guarded" version 3 setattr, we use two types of compound
3772 * setattr requests:
3773 * 1. The "normal" setattr, used when the size of the file isn't being
 *    changed - { Putfh <fh>; Setattr; Getattr }.
3775 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3776 * with only ctime as the argument. If the server ctime differs from
3777 * what is cached on the client, the verify will fail, but we would
3778 * already have the ctime from the preceding getattr, so just set it
3779 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3780 * Setattr; Getattr }.
3781 *
3782 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3783 * this setattr and NULL if they are not.
3784 */
3785 static int
3786 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3787 vsecattr_t *vsap)
3788 {
3789 COMPOUND4args_clnt args;
3790 COMPOUND4res_clnt res, *resp = NULL;
3791 nfs4_ga_res_t *garp = NULL;
3792 int numops = 3; /* { Putfh; Setattr; Getattr } */
3793 nfs_argop4 argop[5];
3794 int verify_argop = -1;
3795 int setattr_argop = 1;
3796 nfs_resop4 *resop;
3797 vattr_t va;
3798 rnode4_t *rp;
3799 int doqueue = 1;
3800 uint_t mask = vap->va_mask;
3801 mode_t omode;
3802 vsecattr_t *vsp;
3803 timestruc_t ctime;
3804 bool_t needrecov = FALSE;
3805 nfs4_recov_state_t recov_state;
3806 nfs4_stateid_types_t sid_types;
3807 stateid4 stateid;
3808 hrtime_t t;
3809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3810 servinfo4_t *svp;
3811 bitmap4 supp_attrs;
3812
3813 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3814 rp = VTOR4(vp);
3815 nfs4_init_stateid_types(&sid_types);
3816
3817 /*
3818 * Only need to flush pages if there are any pages and
3819 * if the file is marked as dirty in some fashion. The
3820 * file must be flushed so that we can accurately
3821 * determine the size of the file and the cached data
3822 * after the SETATTR returns. A file is considered to
3823 * be dirty if it is either marked with R4DIRTY, has
3824 * outstanding i/o's active, or is mmap'd. In this
3825 * last case, we can't tell whether there are dirty
3826 * pages, so we flush just to be sure.
3827 */
3828 if (nfs4_has_pages(vp) &&
3829 ((rp->r_flags & R4DIRTY) ||
3830 rp->r_count > 0 ||
3831 rp->r_mapcnt > 0)) {
3832 ASSERT(vp->v_type != VCHR);
3833 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3834 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3835 mutex_enter(&rp->r_statelock);
3836 if (!rp->r_error)
3837 rp->r_error = e.error;
3838 mutex_exit(&rp->r_statelock);
3839 }
3840 }
3841
3842 if (mask & AT_SIZE) {
3843 /*
3844 * Verification setattr compound for non-deleg AT_SIZE:
3845 * { Putfh; Getattr; Verify; Setattr; Getattr }
3846 * Set ctime local here (outside the do_again label)
3847 * so that subsequent retries (after failed VERIFY)
3848 * will use ctime from GETATTR results (from failed
3849 * verify compound) as VERIFY arg.
3850 * If file has delegation, then VERIFY(time_metadata)
3851 * is of little added value, so don't bother.
3852 */
3853 mutex_enter(&rp->r_statev4_lock);
3854 if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3855 rp->r_deleg_return_pending) {
3856 numops = 5;
3857 ctime = rp->r_attr.va_ctime;
3858 }
3859 mutex_exit(&rp->r_statev4_lock);
3860 }
3861
3862 recov_state.rs_flags = 0;
3863 recov_state.rs_num_retry_despite_err = 0;
3864
3865 args.ctag = TAG_SETATTR;
3866 do_again:
3867 recov_retry:
3868 setattr_argop = numops - 2;
3869
3870 args.array = argop;
3871 args.array_len = numops;
3872
3873 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3874 if (e.error)
3875 return (e.error);
3876
3877
3878 /* putfh target fh */
3879 argop[0].argop = OP_CPUTFH;
3880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3881
3882 if (numops == 5) {
3883 /*
3884 * We only care about the ctime, but need to get mtime
3885 * and size for proper cache update.
3886 */
3887 /* getattr */
3888 argop[1].argop = OP_GETATTR;
3889 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3890 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3891
3892 /* verify - set later in loop */
3893 verify_argop = 2;
3894 }
3895
3896 /* setattr */
3897 svp = rp->r_server;
3898 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3899 supp_attrs = svp->sv_supp_attrs;
3900 nfs_rw_exit(&svp->sv_lock);
3901
3902 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3903 supp_attrs, &e.error, &sid_types);
3904 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3905 if (e.error) {
3906 /* req time field(s) overflow - return immediately */
3907 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3908 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3909 opsetattr.obj_attributes);
3910 return (e.error);
3911 }
3912 omode = rp->r_attr.va_mode;
3913
3914 /* getattr */
3915 argop[numops-1].argop = OP_GETATTR;
3916 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3917 /*
3918 * If we are setting the ACL (indicated only by vsap != NULL), request
3919 * the ACL in this getattr. The ACL returned from this getattr will be
3920 * used in updating the ACL cache.
3921 */
3922 if (vsap != NULL)
3923 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3924 FATTR4_ACL_MASK;
3925 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3926
3927 /*
3928 * setattr iterates if the object size is set and the cached ctime
3929 * does not match the file ctime. In that case, verify the ctime first.
3930 */
3931
3932 do {
3933 if (verify_argop != -1) {
3934 /*
3935 * Verify that the ctime match before doing setattr.
3936 */
3937 va.va_mask = AT_CTIME;
3938 va.va_ctime = ctime;
3939 svp = rp->r_server;
3940 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3941 supp_attrs = svp->sv_supp_attrs;
3942 nfs_rw_exit(&svp->sv_lock);
3943 e.error = nfs4args_verify(&argop[verify_argop], &va,
3944 OP_VERIFY, supp_attrs);
3945 if (e.error) {
3946 /* req time field(s) overflow - return */
3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3948 needrecov);
3949 break;
3950 }
3951 }
3952
3953 doqueue = 1;
3954
3955 t = gethrtime();
3956
3957 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3958
3959 /*
3960 * Purge the access cache and ACL cache if changing either the
3961 * owner of the file, the group owner, or the mode. These may
3962 * change the access permissions of the file, so purge old
3963 * information and start over again.
3964 */
3965 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3966 (void) nfs4_access_purge_rp(rp);
3967 if (rp->r_secattr != NULL) {
3968 mutex_enter(&rp->r_statelock);
3969 vsp = rp->r_secattr;
3970 rp->r_secattr = NULL;
3971 mutex_exit(&rp->r_statelock);
3972 if (vsp != NULL)
3973 nfs4_acl_free_cache(vsp);
3974 }
3975 }
3976
3977 /*
3978 * If res.array_len == numops, then everything succeeded,
3979 * except for possibly the final getattr. If only the
3980 * last getattr failed, give up, and don't try recovery.
3981 */
3982 if (res.array_len == numops) {
3983 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3984 needrecov);
3985 if (! e.error)
3986 resp = &res;
3987 break;
3988 }
3989
3990 /*
3991 * if either rpc call failed or completely succeeded - done
3992 */
3993 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3994 if (e.error) {
3995 PURGE_ATTRCACHE4(vp);
3996 if (!needrecov) {
3997 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3998 needrecov);
3999 break;
4000 }
4001 }
4002
4003 /*
4004 * Do proper retry for OLD_STATEID outside of the normal
4005 * recovery framework.
4006 */
4007 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4008 sid_types.cur_sid_type != SPEC_SID &&
4009 sid_types.cur_sid_type != NO_SID) {
4010 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4011 needrecov);
4012 nfs4_save_stateid(&stateid, &sid_types);
4013 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4014 opsetattr.obj_attributes);
4015 if (verify_argop != -1) {
4016 nfs4args_verify_free(&argop[verify_argop]);
4017 verify_argop = -1;
4018 }
4019 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4020 goto recov_retry;
4021 }
4022
4023 if (needrecov) {
4024 bool_t abort;
4025
4026 abort = nfs4_start_recovery(&e,
4027 VTOMI4(vp), vp, NULL, NULL, NULL,
4028 OP_SETATTR, NULL, NULL, NULL);
4029 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4030 needrecov);
4031 /*
4032 * Do not retry if we failed with OLD_STATEID using
4033 * a special stateid. This is done to avoid looping
4034 * with a broken server.
4035 */
4036 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4037 (sid_types.cur_sid_type == SPEC_SID ||
4038 sid_types.cur_sid_type == NO_SID))
4039 abort = TRUE;
4040 if (!e.error) {
4041 if (res.status == NFS4ERR_BADOWNER)
4042 nfs4_log_badowner(VTOMI4(vp),
4043 OP_SETATTR);
4044
4045 e.error = geterrno4(res.status);
4046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4047 }
4048 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4049 opsetattr.obj_attributes);
4050 if (verify_argop != -1) {
4051 nfs4args_verify_free(&argop[verify_argop]);
4052 verify_argop = -1;
4053 }
4054 if (abort == FALSE) {
4055 /*
4056 * Need to retry all possible stateids in
4057 * case the recovery error wasn't stateid
4058 * related or the stateids have become
4059 * stale (server reboot).
4060 */
4061 nfs4_init_stateid_types(&sid_types);
4062 goto recov_retry;
4063 }
4064 return (e.error);
4065 }
4066
4067 /*
4068 * Need to call nfs4_end_op before nfs4getattr to
4069 * avoid potential nfs4_start_op deadlock. See RFE
4070 * 4777612. Calls to nfs4_invalidate_pages() and
4071 * nfs4_purge_stale_fh() might also generate over the
4072 * wire calls which my cause nfs4_start_op() deadlock.
4073 */
4074 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4075
4076 /*
4077 * Check to update lease.
4078 */
4079 resp = &res;
4080 if (res.status == NFS4_OK) {
4081 break;
4082 }
4083
4084 /*
4085 * Check if verify failed to see if try again
4086 */
4087 if ((verify_argop == -1) || (res.array_len != 3)) {
4088 /*
4089 * can't continue...
4090 */
4091 if (res.status == NFS4ERR_BADOWNER)
4092 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4093
4094 e.error = geterrno4(res.status);
4095 } else {
4096 /*
4097 * When the verify request fails, the client ctime is
4098 * not in sync with the server. This is the same as
4099 * the version 3 "not synchronized" error, and we
4100 * handle it in a similar manner (XXX do we need to???).
4101 * Use the ctime returned in the first getattr for
4102 * the input to the next verify.
4103 * If we couldn't get the attributes, then we give up
4104 * because we can't complete the operation as required.
4105 */
4106 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4107 }
4108 if (e.error) {
4109 PURGE_ATTRCACHE4(vp);
4110 nfs4_purge_stale_fh(e.error, vp, cr);
4111 } else {
4112 /*
4113 * retry with a new verify value
4114 */
4115 ctime = garp->n4g_va.va_ctime;
4116 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4117 resp = NULL;
4118 }
4119 if (!e.error) {
4120 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4121 opsetattr.obj_attributes);
4122 if (verify_argop != -1) {
4123 nfs4args_verify_free(&argop[verify_argop]);
4124 verify_argop = -1;
4125 }
4126 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4127 goto do_again;
4128 }
4129 } while (!e.error);
4130
4131 if (e.error) {
4132 /*
4133 * If we are here, rfs4call has an irrecoverable error - return
4134 */
4135 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4136 opsetattr.obj_attributes);
4137 if (verify_argop != -1) {
4138 nfs4args_verify_free(&argop[verify_argop]);
4139 verify_argop = -1;
4140 }
4141 if (resp)
4142 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4143 return (e.error);
4144 }
4145
4146
4147
4148 /*
4149 * If changing the size of the file, invalidate
4150 * any local cached data which is no longer part
4151 * of the file. We also possibly invalidate the
4152 * last page in the file. We could use
4153 * pvn_vpzero(), but this would mark the page as
4154 * modified and require it to be written back to
4155 * the server for no particularly good reason.
4156 * This way, if we access it, then we bring it
4157 * back in. A read should be cheaper than a
4158 * write.
4159 */
4160 if (mask & AT_SIZE) {
4161 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4162 }
4163
4164 /* either no error or one of the postop getattr failed */
4165
4166 /*
4167 * XXX Perform a simplified version of wcc checking. Instead of
4168 * have another getattr to get pre-op, just purge cache if
4169 * any of the ops prior to and including the getattr failed.
4170 * If the getattr succeeded then update the attrcache accordingly.
4171 */
4172
4173 garp = NULL;
4174 if (res.status == NFS4_OK) {
4175 /*
4176 * Last getattr
4177 */
4178 resop = &res.array[numops - 1];
4179 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4180 }
4181 /*
4182 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4183 * rather than filling it. See the function itself for details.
4184 */
4185 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4186 if (garp != NULL) {
4187 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4188 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4189 vs_ace4_destroy(&garp->n4g_vsa);
4190 } else {
4191 if (vsap != NULL) {
4192 /*
4193 * The ACL was supposed to be set and to be
4194 * returned in the last getattr of this
4195 * compound, but for some reason the getattr
4196 * result doesn't contain the ACL. In this
4197 * case, purge the ACL cache.
4198 */
4199 if (rp->r_secattr != NULL) {
4200 mutex_enter(&rp->r_statelock);
4201 vsp = rp->r_secattr;
4202 rp->r_secattr = NULL;
4203 mutex_exit(&rp->r_statelock);
4204 if (vsp != NULL)
4205 nfs4_acl_free_cache(vsp);
4206 }
4207 }
4208 }
4209 }
4210
4211 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4212 /*
4213 * Set the size, rather than relying on getting it updated
4214 * via a GETATTR. With delegations the client tries to
4215 * suppress GETATTR calls.
4216 */
4217 mutex_enter(&rp->r_statelock);
4218 rp->r_size = vap->va_size;
4219 mutex_exit(&rp->r_statelock);
4220 }
4221
4222 /*
4223 * Can free up request args and res
4224 */
4225 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4226 opsetattr.obj_attributes);
4227 if (verify_argop != -1) {
4228 nfs4args_verify_free(&argop[verify_argop]);
4229 verify_argop = -1;
4230 }
4231 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4232
4233 /*
4234 * Some servers will change the mode to clear the setuid
4235 * and setgid bits when changing the uid or gid. The
4236 * client needs to compensate appropriately.
4237 */
4238 if (mask & (AT_UID | AT_GID)) {
4239 int terror, do_setattr;
4240
4241 do_setattr = 0;
4242 va.va_mask = AT_MODE;
4243 terror = nfs4getattr(vp, &va, cr);
4244 if (!terror &&
4245 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4246 (!(mask & AT_MODE) && va.va_mode != omode))) {
4247 va.va_mask = AT_MODE;
4248 if (mask & AT_MODE) {
4249 /*
4250 * We asked the mode to be changed and what
4251 * we just got from the server in getattr is
4252 * not what we wanted it to be, so set it now.
4253 */
4254 va.va_mode = vap->va_mode;
4255 do_setattr = 1;
4256 } else {
4257 /*
4258 * We did not ask the mode to be changed,
4259 * Check to see that the server just cleared
4260 * I_SUID and I_GUID from it. If not then
4261 * set mode to omode with UID/GID cleared.
4262 */
4263 if (nfs4_compare_modes(va.va_mode, omode)) {
4264 omode &= ~(S_ISUID|S_ISGID);
4265 va.va_mode = omode;
4266 do_setattr = 1;
4267 }
4268 }
4269
4270 if (do_setattr)
4271 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4272 }
4273 }
4274
4275 return (e.error);
4276 }
4277
/*
 * VOP_ACCESS for NFSv4: check whether cr allows the access described by
 * mode (VREAD/VWRITE/VEXEC) on vp.  Results are satisfied from the access
 * cache when possible; otherwise an OTW ACCESS (plus GETATTR when no
 * delegation is held) is performed and the result cached.  If access is
 * denied, the check is retried once with a crnetadjust()ed credential
 * (setuid-root "at least the user's permissions" semantics).
 *
 * Returns 0 on success, EROFS for writes to a read-only fs, EACCES on
 * denial, or another errno from the OTW exchange.
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/*
	 * Translate the VREAD/VWRITE/VEXEC bits in mode into the set of
	 * ACCESS4_* bits actually being requested.  Directories use
	 * DELETE/LOOKUP where files use MODIFY-only/EXECUTE.
	 */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	/* Revalidate cached attributes/access before trusting the cache. */
	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * Ask the server about every bit relevant to this vnode type, not
	 * just acc, so one OTW round trip primes the cache for later calls.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	/* First consult the access cache for this (rnode, bits, cred). */
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released. Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr - only when no delegation suppresses the need for it */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	rpc_error = e.error;

	/* On recoverable errors, kick off recovery and redo the compound. */
	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls throught
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1]; /* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++; /* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	/* If the RPC itself failed, res was never filled in; don't free it. */
	if (!rpc_error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4491
4492 /* ARGSUSED */
4493 static int
4494 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4495 {
4496 COMPOUND4args_clnt args;
4497 COMPOUND4res_clnt res;
4498 int doqueue;
4499 rnode4_t *rp;
4500 nfs_argop4 argop[3];
4501 nfs_resop4 *resop;
4502 READLINK4res *lr_res;
4503 nfs4_ga_res_t *garp;
4504 uint_t len;
4505 char *linkdata;
4506 bool_t needrecov = FALSE;
4507 nfs4_recov_state_t recov_state;
4508 hrtime_t t;
4509 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4510
4511 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4512 return (EIO);
4513 /*
4514 * Can't readlink anything other than a symbolic link.
4515 */
4516 if (vp->v_type != VLNK)
4517 return (EINVAL);
4518
4519 rp = VTOR4(vp);
4520 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4521 e.error = nfs4_validate_caches(vp, cr);
4522 if (e.error)
4523 return (e.error);
4524 mutex_enter(&rp->r_statelock);
4525 if (rp->r_symlink.contents != NULL) {
4526 e.error = uiomove(rp->r_symlink.contents,
4527 rp->r_symlink.len, UIO_READ, uiop);
4528 mutex_exit(&rp->r_statelock);
4529 return (e.error);
4530 }
4531 mutex_exit(&rp->r_statelock);
4532 }
4533 recov_state.rs_flags = 0;
4534 recov_state.rs_num_retry_despite_err = 0;
4535
4536 recov_retry:
4537 args.array_len = 3;
4538 args.array = argop;
4539 args.ctag = TAG_READLINK;
4540
4541 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4542 if (e.error) {
4543 return (e.error);
4544 }
4545
4546 /* 0. putfh symlink fh */
4547 argop[0].argop = OP_CPUTFH;
4548 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4549
4550 /* 1. readlink */
4551 argop[1].argop = OP_READLINK;
4552
4553 /* 2. getattr */
4554 argop[2].argop = OP_GETATTR;
4555 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4556 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4557
4558 doqueue = 1;
4559
4560 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4561 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4562 rnode4info(VTOR4(vp))));
4563
4564 t = gethrtime();
4565
4566 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4567
4568 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4569 if (needrecov) {
4570 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4571 "nfs4_readlink: initiating recovery\n"));
4572
4573 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4574 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4575 if (!e.error)
4576 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4577
4578 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4579 needrecov);
4580 goto recov_retry;
4581 }
4582 }
4583
4584 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4585
4586 if (e.error)
4587 return (e.error);
4588
4589 /*
4590 * There is an path in the code below which calls
4591 * nfs4_purge_stale_fh(), which may generate otw calls through
4592 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4593 * here to avoid nfs4_start_op() deadlock.
4594 */
4595
4596 if (res.status && (res.array_len < args.array_len)) {
4597 /*
4598 * either Putfh or Link failed
4599 */
4600 e.error = geterrno4(res.status);
4601 nfs4_purge_stale_fh(e.error, vp, cr);
4602 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4603 return (e.error);
4604 }
4605
4606 resop = &res.array[1]; /* readlink res */
4607 lr_res = &resop->nfs_resop4_u.opreadlink;
4608
4609 /*
4610 * treat symlink names as data
4611 */
4612 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
4613 if (linkdata != NULL) {
4614 int uio_len = len - 1;
4615 /* len includes null byte, which we won't uiomove */
4616 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4617 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4618 mutex_enter(&rp->r_statelock);
4619 if (rp->r_symlink.contents == NULL) {
4620 rp->r_symlink.contents = linkdata;
4621 rp->r_symlink.len = uio_len;
4622 rp->r_symlink.size = len;
4623 mutex_exit(&rp->r_statelock);
4624 } else {
4625 mutex_exit(&rp->r_statelock);
4626 kmem_free(linkdata, len);
4627 }
4628 } else {
4629 kmem_free(linkdata, len);
4630 }
4631 }
4632 if (res.status == NFS4_OK) {
4633 resop++; /* getattr res */
4634 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4635 }
4636 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4637
4638 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4639
4640 /*
4641 * The over the wire error for attempting to readlink something
4642 * other than a symbolic link is ENXIO. However, we need to
4643 * return EINVAL instead of ENXIO, so we map it here.
4644 */
4645 return (e.error == ENXIO ? EINVAL : e.error);
4646 }
4647
4648 /*
4649 * Flush local dirty pages to stable storage on the server.
4650 *
4651 * If FNODSYNC is specified, then there is nothing to do because
4652 * metadata changes are not cached on the client before being
4653 * sent to the server.
4654 */
4655 /* ARGSUSED */
4656 static int
4657 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4658 {
4659 int error;
4660
4661 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4662 return (0);
4663 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4664 return (EIO);
4665 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4666 if (!error)
4667 error = VTOR4(vp)->r_error;
4668 return (error);
4669 }
4670
4671 /*
4672 * Weirdness: if the file was removed or the target of a rename
4673 * operation while it was open, it got renamed instead. Here we
4674 * remove the renamed file.
4675 */
4676 /* ARGSUSED */
4677 void
4678 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4679 {
4680 rnode4_t *rp;
4681
4682 ASSERT(vp != DNLC_NO_VNODE);
4683
4684 rp = VTOR4(vp);
4685
4686 if (IS_SHADOW(vp, rp)) {
4687 sv_inactive(vp);
4688 return;
4689 }
4690
4691 /*
4692 * If this is coming from the wrong zone, we let someone in the right
4693 * zone take care of it asynchronously. We can get here due to
4694 * VN_RELE() being called from pageout() or fsflush(). This call may
4695 * potentially turn into an expensive no-op if, for instance, v_count
4696 * gets incremented in the meantime, but it's still correct.
4697 */
4698 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4699 nfs4_async_inactive(vp, cr);
4700 return;
4701 }
4702
4703 /*
4704 * Some of the cleanup steps might require over-the-wire
4705 * operations. Since VOP_INACTIVE can get called as a result of
4706 * other over-the-wire operations (e.g., an attribute cache update
4707 * can lead to a DNLC purge), doing those steps now would lead to a
4708 * nested call to the recovery framework, which can deadlock. So
4709 * do any over-the-wire cleanups asynchronously, in a separate
4710 * thread.
4711 */
4712
4713 mutex_enter(&rp->r_os_lock);
4714 mutex_enter(&rp->r_statelock);
4715 mutex_enter(&rp->r_statev4_lock);
4716
4717 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4718 mutex_exit(&rp->r_statev4_lock);
4719 mutex_exit(&rp->r_statelock);
4720 mutex_exit(&rp->r_os_lock);
4721 nfs4_async_inactive(vp, cr);
4722 return;
4723 }
4724
4725 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4726 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4727 mutex_exit(&rp->r_statev4_lock);
4728 mutex_exit(&rp->r_statelock);
4729 mutex_exit(&rp->r_os_lock);
4730 nfs4_async_inactive(vp, cr);
4731 return;
4732 }
4733
4734 if (rp->r_unldvp != NULL) {
4735 mutex_exit(&rp->r_statev4_lock);
4736 mutex_exit(&rp->r_statelock);
4737 mutex_exit(&rp->r_os_lock);
4738 nfs4_async_inactive(vp, cr);
4739 return;
4740 }
4741 mutex_exit(&rp->r_statev4_lock);
4742 mutex_exit(&rp->r_statelock);
4743 mutex_exit(&rp->r_os_lock);
4744
4745 rp4_addfree(rp, cr);
4746 }
4747
4748 /*
4749 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4750 * various bits of state. The caller must not refer to vp after this call.
4751 */
4752
4753 void
4754 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4755 {
4756 rnode4_t *rp = VTOR4(vp);
4757 nfs4_recov_state_t recov_state;
4758 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4759 vnode_t *unldvp;
4760 char *unlname;
4761 cred_t *unlcred;
4762 COMPOUND4args_clnt args;
4763 COMPOUND4res_clnt res, *resp;
4764 nfs_argop4 argop[2];
4765 int doqueue;
4766 #ifdef DEBUG
4767 char *name;
4768 #endif
4769
4770 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4771 ASSERT(!IS_SHADOW(vp, rp));
4772
4773 #ifdef DEBUG
4774 name = fn_name(VTOSV(vp)->sv_name);
4775 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4776 "release vnode %s", name));
4777 kmem_free(name, MAXNAMELEN);
4778 #endif
4779
4780 if (vp->v_type == VREG) {
4781 bool_t recov_failed = FALSE;
4782
4783 e.error = nfs4close_all(vp, cr);
4784 if (e.error) {
4785 /* Check to see if recovery failed */
4786 mutex_enter(&(VTOMI4(vp)->mi_lock));
4787 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4788 recov_failed = TRUE;
4789 mutex_exit(&(VTOMI4(vp)->mi_lock));
4790 if (!recov_failed) {
4791 mutex_enter(&rp->r_statelock);
4792 if (rp->r_flags & R4RECOVERR)
4793 recov_failed = TRUE;
4794 mutex_exit(&rp->r_statelock);
4795 }
4796 if (recov_failed) {
4797 NFS4_DEBUG(nfs4_client_recov_debug,
4798 (CE_NOTE, "nfs4_inactive_otw: "
4799 "close failed (recovery failure)"));
4800 }
4801 }
4802 }
4803
4804 redo:
4805 if (rp->r_unldvp == NULL) {
4806 rp4_addfree(rp, cr);
4807 return;
4808 }
4809
4810 /*
4811 * Save the vnode pointer for the directory where the
4812 * unlinked-open file got renamed, then set it to NULL
4813 * to prevent another thread from getting here before
4814 * we're done with the remove. While we have the
4815 * statelock, make local copies of the pertinent rnode
4816 * fields. If we weren't to do this in an atomic way, the
4817 * the unl* fields could become inconsistent with respect
4818 * to each other due to a race condition between this
4819 * code and nfs_remove(). See bug report 1034328.
4820 */
4821 mutex_enter(&rp->r_statelock);
4822 if (rp->r_unldvp == NULL) {
4823 mutex_exit(&rp->r_statelock);
4824 rp4_addfree(rp, cr);
4825 return;
4826 }
4827
4828 unldvp = rp->r_unldvp;
4829 rp->r_unldvp = NULL;
4830 unlname = rp->r_unlname;
4831 rp->r_unlname = NULL;
4832 unlcred = rp->r_unlcred;
4833 rp->r_unlcred = NULL;
4834 mutex_exit(&rp->r_statelock);
4835
4836 /*
4837 * If there are any dirty pages left, then flush
4838 * them. This is unfortunate because they just
4839 * may get thrown away during the remove operation,
4840 * but we have to do this for correctness.
4841 */
4842 if (nfs4_has_pages(vp) &&
4843 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4844 ASSERT(vp->v_type != VCHR);
4845 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4846 if (e.error) {
4847 mutex_enter(&rp->r_statelock);
4848 if (!rp->r_error)
4849 rp->r_error = e.error;
4850 mutex_exit(&rp->r_statelock);
4851 }
4852 }
4853
4854 recov_state.rs_flags = 0;
4855 recov_state.rs_num_retry_despite_err = 0;
4856 recov_retry_remove:
4857 /*
4858 * Do the remove operation on the renamed file
4859 */
4860 args.ctag = TAG_INACTIVE;
4861
4862 /*
4863 * Remove ops: putfh dir; remove
4864 */
4865 args.array_len = 2;
4866 args.array = argop;
4867
4868 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4869 if (e.error) {
4870 kmem_free(unlname, MAXNAMELEN);
4871 crfree(unlcred);
4872 VN_RELE(unldvp);
4873 /*
4874 * Try again; this time around r_unldvp will be NULL, so we'll
4875 * just call rp4_addfree() and return.
4876 */
4877 goto redo;
4878 }
4879
4880 /* putfh directory */
4881 argop[0].argop = OP_CPUTFH;
4882 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4883
4884 /* remove */
4885 argop[1].argop = OP_CREMOVE;
4886 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4887
4888 doqueue = 1;
4889 resp = &res;
4890
4891 #if 0 /* notyet */
4892 /*
4893 * Can't do this yet. We may be being called from
4894 * dnlc_purge_XXX while that routine is holding a
4895 * mutex lock to the nc_rele list. The calls to
4896 * nfs3_cache_wcc_data may result in calls to
4897 * dnlc_purge_XXX. This will result in a deadlock.
4898 */
4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4900 if (e.error) {
4901 PURGE_ATTRCACHE4(unldvp);
4902 resp = NULL;
4903 } else if (res.status) {
4904 e.error = geterrno4(res.status);
4905 PURGE_ATTRCACHE4(unldvp);
4906 /*
4907 * This code is inactive right now
4908 * but if made active there should
4909 * be a nfs4_end_op() call before
4910 * nfs4_purge_stale_fh to avoid start_op()
4911 * deadlock. See BugId: 4948726
4912 */
4913 nfs4_purge_stale_fh(error, unldvp, cr);
4914 } else {
4915 nfs_resop4 *resop;
4916 REMOVE4res *rm_res;
4917
4918 resop = &res.array[1];
4919 rm_res = &resop->nfs_resop4_u.opremove;
4920 /*
4921 * Update directory cache attribute,
4922 * readdir and dnlc caches.
4923 */
4924 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4925 }
4926 #else
4927 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4928
4929 PURGE_ATTRCACHE4(unldvp);
4930 #endif
4931
4932 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4933 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4934 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4935 if (!e.error)
4936 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4937 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4938 &recov_state, TRUE);
4939 goto recov_retry_remove;
4940 }
4941 }
4942 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4943
4944 /*
4945 * Release stuff held for the remove
4946 */
4947 VN_RELE(unldvp);
4948 if (!e.error && resp)
4949 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4950
4951 kmem_free(unlname, MAXNAMELEN);
4952 crfree(unlcred);
4953 goto redo;
4954 }
4955
4956 /*
4957 * Remote file system operations having to do with directory manipulation.
4958 */
4959 /* ARGSUSED3 */
4960 int
4961 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4962 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4963 int *direntflags, pathname_t *realpnp)
4964 {
4965 int error;
4966 vnode_t *vp, *avp = NULL;
4967 rnode4_t *drp;
4968
4969 *vpp = NULL;
4970 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4971 return (EPERM);
4972 /*
4973 * if LOOKUP_XATTR, must replace dvp (object) with
4974 * object's attrdir before continuing with lookup
4975 */
4976 if (flags & LOOKUP_XATTR) {
4977 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4978 if (error)
4979 return (error);
4980
4981 dvp = avp;
4982
4983 /*
4984 * If lookup is for "", just return dvp now. The attrdir
4985 * has already been activated (from nfs4lookup_xattr), and
4986 * the caller will RELE the original dvp -- not
4987 * the attrdir. So, set vpp and return.
4988 * Currently, when the LOOKUP_XATTR flag is
4989 * passed to VOP_LOOKUP, the name is always empty, and
4990 * shortcircuiting here avoids 3 unneeded lock/unlock
4991 * pairs.
4992 *
4993 * If a non-empty name was provided, then it is the
4994 * attribute name, and it will be looked up below.
4995 */
4996 if (*nm == '\0') {
4997 *vpp = dvp;
4998 return (0);
4999 }
5000
5001 /*
5002 * The vfs layer never sends a name when asking for the
5003 * attrdir, so we should never get here (unless of course
5004 * name is passed at some time in future -- at which time
5005 * we'll blow up here).
5006 */
5007 ASSERT(0);
5008 }
5009
5010 drp = VTOR4(dvp);
5011 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5012 return (EINTR);
5013
5014 error = nfs4lookup(dvp, nm, vpp, cr, 0);
5015 nfs_rw_exit(&drp->r_rwlock);
5016
5017 /*
5018 * If vnode is a device, create special vnode.
5019 */
5020 if (!error && ISVDEV((*vpp)->v_type)) {
5021 vp = *vpp;
5022 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5023 VN_RELE(vp);
5024 }
5025
5026 return (error);
5027 }
5028
5029 /* ARGSUSED */
5030 static int
5031 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5032 {
5033 int error;
5034 rnode4_t *drp;
5035 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5036 mntinfo4_t *mi;
5037
5038 mi = VTOMI4(dvp);
5039 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5040 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5041 return (EINVAL);
5042
5043 drp = VTOR4(dvp);
5044 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5045 return (EINTR);
5046
5047 mutex_enter(&drp->r_statelock);
5048 /*
5049 * If the server doesn't support xattrs just return EINVAL
5050 */
5051 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5052 mutex_exit(&drp->r_statelock);
5053 nfs_rw_exit(&drp->r_rwlock);
5054 return (EINVAL);
5055 }
5056
5057 /*
5058 * If there is a cached xattr directory entry,
5059 * use it as long as the attributes are valid. If the
5060 * attributes are not valid, take the simple approach and
5061 * free the cached value and re-fetch a new value.
5062 *
5063 * We don't negative entry cache for now, if we did we
5064 * would need to check if the file has changed on every
5065 * lookup. But xattrs don't exist very often and failing
5066 * an openattr is not much more expensive than and NVERIFY or GETATTR
5067 * so do an openattr over the wire for now.
5068 */
5069 if (drp->r_xattr_dir != NULL) {
5070 if (ATTRCACHE4_VALID(dvp)) {
5071 VN_HOLD(drp->r_xattr_dir);
5072 *vpp = drp->r_xattr_dir;
5073 mutex_exit(&drp->r_statelock);
5074 nfs_rw_exit(&drp->r_rwlock);
5075 return (0);
5076 }
5077 VN_RELE(drp->r_xattr_dir);
5078 drp->r_xattr_dir = NULL;
5079 }
5080 mutex_exit(&drp->r_statelock);
5081
5082 error = nfs4openattr(dvp, vpp, cflag, cr);
5083
5084 nfs_rw_exit(&drp->r_rwlock);
5085
5086 return (error);
5087 }
5088
/*
 * Common lookup worker for nfs4_lookup(): resolve nm in dvp, consulting
 * the DNLC first (unless skipdnlc is set) and going over the wire when
 * necessary.  A DNLC hit is only trusted while dvp's attribute cache is
 * valid; otherwise the entry is revalidated against the server.  The
 * caller holds drp->r_rwlock as reader.
 */
static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* Mark the directory as having been looked up in. */
	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC.  If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			/* Let any in-progress cache purge finish first. */
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * If after the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				/* Valid negative entry: name doesn't exist. */
				if (*vpp == DNLC_NO_VNODE) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}
5214
5215 /*
5216 * Go to the server and check if the directory has changed, if
5217 * it hasn't we are done and can use the dnlc entry. If it
5218 * has changed we get a new copy of its attributes and check
5219 * the access for VEXEC, then relookup the filename and
5220 * get its filehandle and attributes.
5221 *
5222 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5223 * if the NVERIFY failed we must
5224 * purge the caches
5225 * cache new attributes (will set r_time_attr_inval)
5226 * cache new access
5227 * recheck VEXEC access
5228 * add name to dnlc, possibly negative
5229 * if LOOKUP succeeded
5230 * cache new attributes
5231 * else
5232 * set a new r_time_attr_inval for dvp
5233 * check to make sure we have access
5234 *
5235 * The vpp returned is the vnode passed in if the directory is valid,
5236 * a new vnode if successful lookup, or NULL on error.
5237 */
static int
nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_ga_res_t *garp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	/*
	 * The caller passes in the cached vnode it wants validated; this
	 * routine owns (and on any failure releases) that reference via
	 * *vpp, setting *vpp = NULL on error.
	 */
	ASSERT(*vpp != NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_VPARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP_VALID;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		VN_RELE(*vpp);
		*vpp = NULL;
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
	args.array_len = 7;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. nverify the change info */
	argop[1].argop = OP_NVERIFY;
	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	/*
	 * XDR-encode the cached change attribute so the server can compare
	 * it against the directory's current change attribute; NFS4ERR_SAME
	 * from NVERIFY means the directory has not changed.
	 */
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 2. getattr directory */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 3. access directory */
	argop[3].argop = OP_ACCESS;
	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	/* 4. lookup name */
	if (isdotdot) {
		argop[4].argop = OP_LOOKUPP;
	} else {
		argop[4].argop = OP_CLOOKUP;
		argop[4].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 5. resulting file handle */
	argop[5].argop = OP_GETFH;

	/* 6. resulting file attributes */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * NFS4ERR_MOVED: the server says the object now lives elsewhere;
	 * chase the referral instead of continuing with this compound.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* secinfo succeeded: retry with the new flavor */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			VN_RELE(*vpp);
			*vpp = NULL;
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here). The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;
		VN_RELE(*vpp);
		*vpp = NULL;
		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	/*
	 * NFS4ERR_SAME from the NVERIFY means the directory is unchanged
	 * and the cached vnode in *vpp is still good (else branch below).
	 * Any other status means the directory changed and the rest of the
	 * compound (GETATTR/ACCESS/LOOKUP/GETFH/GETATTR) carries the fresh
	 * results we must install.
	 */
	if (res.status != NFS4ERR_SAME) {
		e.error = geterrno4(res.status);

		/*
		 * The NVERIFY "failed" so the directory has changed
		 * First make sure PUTFH succeeded and NVERIFY "failed"
		 * cleanly.
		 */
		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * Install new cached attributes for the directory
		 */
		nfs4_attr_cache(dvp,
		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[3].nfs_argop4_u.opaccess.access,
		    res.array[3].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				e.error = EACCES;
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS, should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
			/*
			 * The lookup failed, probably no entry
			 */
			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
				dnlc_update(dvp, nm, DNLC_NO_VNODE);
			} else {
				/*
				 * Might be some other error, so remove
				 * the dnlc entry to make sure we start all
				 * over again, next time.
				 */
				dnlc_remove(dvp, nm);
			}
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
			/*
			 * The file exists but we can't get its fh for
			 * some unknown reason. Remove it from the dnlc
			 * and error out to be safe.
			 */
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
		if (fhp->nfs_fh4_len == 0) {
			/*
			 * The file exists but the server returned a bogus
			 * (zero-length) fh for some unknown reason. Remove
			 * it from the dnlc and error out to be safe.
			 */
			e.error = ENOENT;
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		sfhp = sfh4_get(fhp, mi);

		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;

		/*
		 * Make the new rnode
		 */
		if (isdotdot) {
			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
			if (e.error) {
				sfh4_rele(&sfhp);
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
			/*
			 * XXX if nfs4_make_dotdot uses an existing rnode
			 * XXX it doesn't update the attributes.
			 * XXX for now just save them again to save an OTW
			 */
			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
		} else {
			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
			/*
			 * If v_type == VNON, then garp was NULL because
			 * the last op in the compound failed and makenfs4node
			 * could not find the vnode for sfhp. It created
			 * a new vnode, so we have nothing to purge here.
			 */
			if (nvp->v_type == VNON) {
				vattr_t vattr;

				vattr.va_mask = AT_TYPE;
				/*
				 * N.B. We've already called nfs4_end_fop above.
				 */
				e.error = nfs4getattr(nvp, &vattr, cr);
				if (e.error) {
					sfh4_rele(&sfhp);
					VN_RELE(*vpp);
					*vpp = NULL;
					VN_RELE(nvp);
					goto exit;
				}
				nvp->v_type = vattr.va_type;
			}
		}
		sfh4_rele(&sfhp);

		/*
		 * Only enter the name in the dnlc when the rnode wasn't
		 * created by this client's v4 OPEN (created_v4).
		 */
		nrp = VTOR4(nvp);
		mutex_enter(&nrp->r_statev4_lock);
		if (!nrp->created_v4) {
			mutex_exit(&nrp->r_statev4_lock);
			dnlc_update(dvp, nm, nvp);
		} else
			mutex_exit(&nrp->r_statev4_lock);

		VN_RELE(*vpp);
		*vpp = nvp;
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			/* clamp the new validity window to [acdirmin, acdirmax] */
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);
		dnlc_update(dvp, nm, *vpp);

		/*
		 * Even though we have a valid directory attr cache
		 * and dnlc entry, we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		/* a negative dnlc entry can't be handed back to the caller */
		if (*vpp == DNLC_NO_VNODE) {
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = ENOENT;
		}
	}

exit:
	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
5655
5656 /*
5657 * We need to go over the wire to lookup the name, but
5658 * while we are there verify the directory has not
5659 * changed but if it has, get new attributes and check access
5660 *
5661 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5662 * NVERIFY GETATTR ACCESS
5663 *
5664 * With the results:
5665 * if the NVERIFY failed we must purge the caches, add new attributes,
5666 * and cache new access.
5667 * set a new r_time_attr_inval
5668 * add name to dnlc, possibly negative
5669 * if LOOKUP succeeded
5670 * cache new attributes
5671 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	/* caller must pass *vpp == NULL; the new vnode is returned in it */
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/* 6. nverify the change info */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	/*
	 * XDR-encode the cached change attribute so the server can compare
	 * it against the directory's current change attribute; NFS4ERR_SAME
	 * from NVERIFY means the directory has not changed.
	 */
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * NFS4ERR_MOVED: the server says the object now lives elsewhere;
	 * chase the referral instead of continuing with this compound.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* secinfo succeeded: retry with the new flavor */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here). The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason. Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a bogus
		 * (zero-length) fh for some unknown reason. Error out
		 * to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	/*
	 * NFS4ERR_SAME from the NVERIFY means the directory is unchanged,
	 * so the cached directory attributes are still valid (else branch
	 * below); otherwise install the fresh attributes and access bits.
	 */
	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough,
			 * so try a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			/* clamp the new validity window to [acdirmin, acdirmax] */
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Only enter the name in the dnlc when the rnode wasn't created
	 * by this client's v4 OPEN (created_v4).
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6060
6061 #ifdef DEBUG
6062 void
6063 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
6064 {
6065 uint_t i, len;
6066 zoneid_t zoneid = getzoneid();
6067 char *s;
6068
6069 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
6070 for (i = 0; i < argcnt; i++) {
6071 nfs_argop4 *op = &argbase[i];
6072 switch (op->argop) {
6073 case OP_CPUTFH:
6074 case OP_PUTFH:
6075 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
6076 break;
6077 case OP_PUTROOTFH:
6078 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
6079 break;
6080 case OP_CLOOKUP:
6081 s = op->nfs_argop4_u.opclookup.cname;
6082 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6083 break;
6084 case OP_LOOKUP:
6085 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
6086 &len, NULL);
6087 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6088 kmem_free(s, len);
6089 break;
6090 case OP_LOOKUPP:
6091 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
6092 break;
6093 case OP_GETFH:
6094 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
6095 break;
6096 case OP_GETATTR:
6097 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
6098 break;
6099 case OP_OPENATTR:
6100 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
6101 break;
6102 default:
6103 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
6104 op->argop);
6105 break;
6106 }
6107 }
6108 }
6109 #endif
6110
6111 /*
6112 * nfs4lookup_setup - constructs a multi-lookup compound request.
6113 *
6114 * Given the path "nm1/nm2/.../nmn", the following compound requests
6115 * may be created:
6116 *
 * Note: Getfh is not needed because the filehandle attr is mandatory, but it
 * is faster, for now.
6119 *
6120 * l4_getattrs indicates the type of compound requested.
6121 *
6122 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6123 *
6124 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6125 *
6126 * total number of ops is n + 1.
6127 *
6128 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6129 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6130 * before the last component, and only get attributes
6131 * for the last component. Note that the second-to-last
6132 * pathname component is XATTR_RPATH, which does NOT go
6133 * over-the-wire as a lookup.
6134 *
6135 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6136 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6137 *
6138 * and total number of ops is n + 5.
6139 *
6140 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6141 * attribute directory: create lookups plus an OPENATTR
6142 * replacing the last lookup. Note that the last pathname
6143 * component is XATTR_RPATH, which does NOT go over-the-wire
6144 * as a lookup.
6145 *
6146 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6147 * Openattr; Getfh; Getattr }
6148 *
6149 * and total number of ops is n + 5.
6150 *
6151 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6152 * nodes too.
6153 *
6154 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6155 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6156 *
6157 * and total number of ops is 3*n + 1.
6158 *
6159 * All cases: returns the index in the arg array of the final LOOKUP op, or
6160 * -1 if no LOOKUPs were used.
6161 */
int
nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
{
	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
	nfs_argop4 *argbase, *argop;
	int arglen, argcnt;
	int n = 1;	/* number of components */
	int nga = 1;	/* number of Getattr's in request */
	char c = '\0', *s, *p;
	int lookup_idx = -1;
	int argoplist_size;

	/* set lookuparg response result to 0 */
	lookupargp->resp->status = NFS4_OK;

	/* skip leading "/" or "." e.g. ".//./" if there is */
	for (; ; nm++) {
		if (*nm != '/' && *nm != '.')
			break;

		/* ".." is counted as 1 component */
		if (*nm == '.' && *(nm + 1) != '/')
			break;
	}

	/*
	 * Find n = number of components - nm must be null terminated
	 * Skip "." components.
	 * (The for loop is the lone statement of the "if"; the "else"
	 * below pairs with this "if".)
	 */
	if (*nm != '\0')
		for (n = 1, s = nm; *s != '\0'; s++) {
			if ((*s == '/') && (*(s + 1) != '/') &&
			    (*(s + 1) != '\0') &&
			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
			    *(s + 2) == '\0')))
				n++;
		}
	else
		n = 0;

	/*
	 * nga is number of components that need Getfh+Getattr
	 */
	switch (l4_getattrs) {
	case LKP4_NO_ATTRIBUTES:
		nga = 0;
		break;
	case LKP4_ALL_ATTRIBUTES:
		nga = n;
		/*
		 * Always have at least 1 getfh, getattr pair
		 */
		if (nga == 0)
			nga++;
		break;
	case LKP4_LAST_ATTRDIR:
	case LKP4_LAST_NAMED_ATTR:
		nga = n+1;
		break;
	}

	/*
	 * If change to use the filehandle attr instead of getfh
	 * the following line can be deleted.
	 */
	nga *= 2;

	/*
	 * calculate number of ops in request as
	 * header + trailer + lookups + getattrs
	 */
	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;

	argoplist_size = arglen * sizeof (nfs_argop4);
	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
	lookupargp->argsp->array = argop;

	/* leave room for the caller's header ops; start filling after them */
	argcnt = lookupargp->header_len;
	argop += argcnt;

	/*
	 * loop and create a lookup op and possibly getattr/getfh for
	 * each component. Skip "." components.
	 */
	for (s = nm; *s != '\0'; s = p) {
		/*
		 * Set up a pathname struct for each component if needed
		 */
		while (*s == '/')
			s++;
		if (*s == '\0')
			break;

		for (p = s; (*p != '/') && (*p != '\0'); p++)
			;
		/*
		 * Temporarily NUL-terminate this component in place; the
		 * saved character is restored with "*p = c" before moving
		 * to the next component.
		 */
		c = *p;
		*p = '\0';

		if (s[0] == '.' && s[1] == '\0') {
			*p = c;
			continue;
		}
		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;

			/* openattr */
			argop->argop = OP_OPENATTR;
		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* openattr */
			argop->argop = OP_OPENATTR;
			argop++;
			argcnt++;

			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
			*p = c;
			continue;
		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
			/* lookupp */
			argop->argop = OP_LOOKUPP;
		} else {
			/* lookup */
			argop->argop = OP_LOOKUP;
			(void) str_to_utf8(s,
			    &argop->nfs_argop4_u.oplookup.objname);
		}
		/* remember the index of the last LOOKUP/LOOKUPP emitted */
		lookup_idx = argcnt;
		argop++;
		argcnt++;

		*p = c;

		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
		}
	}

	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
		if (needgetfh) {
			/* stick in a post-lookup getfh */
			argop->argop = OP_GETFH;
			argcnt++;
			argop++;
		}
		/* post-lookup getattr */
		argop->argop = OP_GETATTR;
		argop->nfs_argop4_u.opgetattr.attr_request =
		    lookupargp->ga_bits;
		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
		argcnt++;
	}
	argcnt += lookupargp->trailer_len;	/* actual op count */
	lookupargp->argsp->array_len = argcnt;
	lookupargp->arglen = arglen;

#ifdef DEBUG
	if (nfs4_client_lookup_debug)
		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
#endif

	return (lookup_idx);
}
6362
/*
 * Obtain (and, if cflag is set, ask the server to create) the extended
 * attribute directory of dvp, returning it held in *avp.
 *
 * Sends the compound PUTFH(dvp), OPENATTR, GETFH, GETATTR and, on
 * success, caches the resulting xattr-dir vnode in the directory rnode
 * (r_xattr_dir).  If the server reports NFS4ERR_NOTSUPP, the sentinel
 * NFS4_XATTR_DIR_NOTSUPP is cached instead and EINVAL is returned.
 * Participates in client recovery: on a recoverable error the compound
 * is re-driven via the recov_retry label.  Returns 0 or an errno.
 */
static int
nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	GETFH4res *gf_res = NULL;
	nfs_argop4 argop[4];
	nfs_resop4 *resop = NULL;
	nfs4_sharedfh_t *sfhp;
	hrtime_t t;
	nfs4_error_t e;

	rnode4_t *drp;
	int doqueue = 1;
	vnode_t *vp;
	int needrecov = 0;
	nfs4_recov_state_t recov_state;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	*avp = NULL;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	/* COMPOUND: putfh, openattr, getfh, getattr */
	args.array_len = 4;
	args.array = argop;
	args.ctag = TAG_OPENATTR;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error)
		return (e.error);

	drp = VTOR4(dvp);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* openattr: createdir asks the server to create the attrdir */
	argop[1].argop = OP_OPENATTR;
	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);

	/* getfh */
	argop[2].argop = OP_GETFH;

	/* getattr */
	argop[3].argop = OP_GETATTR;
	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
	    rnode4info(drp)));

	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4openattr: initiating recovery\n"));

		abort = nfs4_start_recovery(&e,
		    VTOMI4(dvp), dvp, NULL, NULL, NULL,
		    OP_OPENATTR, NULL, NULL, NULL);
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		/*
		 * Only free the results if the RPC itself succeeded;
		 * otherwise there is nothing decoded to free.
		 */
		if (!e.error) {
			e.error = geterrno4(res.status);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (e.error) {
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		return (e.error);
	}

	if (res.status) {
		/*
		 * If OTW error is NOTSUPP, then it should be
		 * translated to EINVAL.  All Solaris file system
		 * implementations return EINVAL to the syscall layer
		 * when the attrdir cannot be created due to an
		 * implementation restriction or noxattr mount option.
		 */
		if (res.status == NFS4ERR_NOTSUPP) {
			/*
			 * Cache the "not supported" sentinel in the rnode
			 * so subsequent attempts can fail without an OTW
			 * round trip.
			 */
			mutex_enter(&drp->r_statelock);
			if (drp->r_xattr_dir)
				VN_RELE(drp->r_xattr_dir);
			VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
			drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
			mutex_exit(&drp->r_statelock);

			e.error = EINVAL;
		} else {
			e.error = geterrno4(res.status);
		}

		if (e.error) {
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			return (e.error);
		}
	}

	resop = &res.array[0];	/* putfh res */
	ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);

	resop = &res.array[1];	/* openattr res */
	ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);

	resop = &res.array[2];	/* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;
	if (gf_res->object.nfs_fh4_len == 0) {
		/* Server returned an empty filehandle for the attrdir. */
		*avp = NULL;
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		return (ENOENT);
	}

	sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
	vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
	    dvp->v_vfsp, t, cr, dvp,
	    fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
	sfh4_rele(&sfhp);

	if (e.error)
		PURGE_ATTRCACHE4(vp);

	mutex_enter(&vp->v_lock);
	vp->v_flag |= V_XATTRDIR;
	mutex_exit(&vp->v_lock);

	*avp = vp;

	/* Cache the attrdir vnode in the directory rnode, held. */
	mutex_enter(&drp->r_statelock);
	if (drp->r_xattr_dir)
		VN_RELE(drp->r_xattr_dir);
	VN_HOLD(vp);
	drp->r_xattr_dir = vp;

	/*
	 * Invalidate pathconf4 cache because r_xattr_dir is no longer
	 * NULL.  xattrs could be created at any time, and we have no
	 * way to update pc4_xattr_exists in the base object if/when
	 * it happens.
	 */
	drp->r_pathconf.pc4_xattr_valid = 0;

	mutex_exit(&drp->r_statelock);

	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	return (0);
}
6529
/*
 * VOP_CREATE for NFSv4.  Create (or truncate) the file nm in directory
 * dvp, returning the resulting vnode held in *vpp.
 *
 * An over-the-wire lookup is performed first to decide between the
 * "file exists" path (access check, possible truncation) and the
 * "create it" path (nfs4open_otw for regular files, nfs4mknod for
 * everything else).  Non-exclusive creates use GUARDED mode so a
 * retransmitted CREATE cannot silently truncate the file; on EEXIST
 * the whole operation is restarted from `top' (see the long comment
 * near the bottom).  Returns 0 or an errno.
 */
/* ARGSUSED */
static int
nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
    int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	int error;
	vnode_t *vp = NULL;
	rnode4_t *rp;
	struct vattr vattr;
	rnode4_t *drp;
	vnode_t *tempvp;
	enum createmode4 createmode;
	bool_t must_trunc = FALSE;
	int truncating = 0;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/* Exclusive create is not supported in an xattr directory. */
	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
		return (EINVAL);
	}

	/* . and .. have special meaning in the protocol, reject them. */

	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
		return (EISDIR);

	drp = VTOR4(dvp);

	/* Serialize directory modifications. */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

top:
	/*
	 * We make a copy of the attributes because the caller does not
	 * expect us to change what va points to.
	 */
	vattr = *va;

	/*
	 * If the pathname is "", then dvp is the root vnode of
	 * a remote file mounted over a local directory.
	 * All that needs to be done is access
	 * checking and truncation.  Note that we avoid doing
	 * open w/ create because the parent directory might
	 * be in pseudo-fs and the open would fail.
	 */
	if (*nm == '\0') {
		error = 0;
		VN_HOLD(dvp);
		vp = dvp;
		must_trunc = TRUE;
	} else {
		/*
		 * We need to go over the wire, just to be sure whether the
		 * file exists or not.  Using the DNLC can be dangerous in
		 * this case when making a decision regarding existence.
		 */
		error = nfs4lookup(dvp, nm, &vp, cr, 1);
	}

	if (exclusive)
		createmode = EXCLUSIVE4;
	else
		createmode = GUARDED4;

	/*
	 * error would be set if the file does not exist on the
	 * server, so lets go create it.
	 */
	if (error) {
		goto create_otw;
	}

	/*
	 * File does exist on the server
	 */
	if (exclusive == EXCL)
		error = EEXIST;
	else if (vp->v_type == VDIR && (mode & VWRITE))
		error = EISDIR;
	else {
		/*
		 * If vnode is a device, create special vnode.
		 */
		if (ISVDEV(vp->v_type)) {
			tempvp = vp;
			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
			VN_RELE(tempvp);
		}
		if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
			if ((vattr.va_mask & AT_SIZE) &&
			    vp->v_type == VREG) {
				rp = VTOR4(vp);
				/*
				 * Check here for large file handled
				 * by LF-unaware process (as
				 * ufs_create() does)
				 */
				if (!(flags & FOFFMAX)) {
					mutex_enter(&rp->r_statelock);
					if (rp->r_size > MAXOFF32_T)
						error = EOVERFLOW;
					mutex_exit(&rp->r_statelock);
				}

				/* if error is set then we need to return */
				if (error) {
					nfs_rw_exit(&drp->r_rwlock);
					VN_RELE(vp);
					return (error);
				}

				if (must_trunc) {
					vattr.va_mask = AT_SIZE;
					error = nfs4setattr(vp, &vattr, 0, cr,
					    NULL);
				} else {
					/*
					 * we know we have a regular file that already
					 * exists and we may end up truncating the file
					 * as a result of the open_otw, so flush out
					 * any dirty pages for this file first.
					 */
					if (nfs4_has_pages(vp) &&
					    ((rp->r_flags & R4DIRTY) ||
					    rp->r_count > 0 ||
					    rp->r_mapcnt > 0)) {
						error = nfs4_putpage(vp,
						    (offset_t)0, 0, 0, cr, ct);
						/*
						 * Latch ENOSPC/EDQUOT into the
						 * rnode so a later close can
						 * report it.
						 */
						if (error && (error == ENOSPC ||
						    error == EDQUOT)) {
							mutex_enter(
							    &rp->r_statelock);
							if (!rp->r_error)
								rp->r_error =
								    error;
							mutex_exit(
							    &rp->r_statelock);
						}
					}
					vattr.va_mask = (AT_SIZE |
					    AT_TYPE | AT_MODE);
					vattr.va_type = VREG;
					createmode = UNCHECKED4;
					truncating = 1;
					goto create_otw;
				}
			}
		}
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (error) {
		VN_RELE(vp);
	} else {
		vnode_t *tvp;
		rnode4_t *trp;
		/* Fire the event on the "real" vnode, not a shadow. */
		tvp = vp;
		if (vp->v_type == VREG) {
			trp = VTOR4(vp);
			if (IS_SHADOW(vp, trp))
				tvp = RTOV4(trp);
		}

		if (must_trunc) {
			/*
			 * existing file got truncated, notify.
			 */
			vnevent_create(tvp, ct);
		}

		*vpp = vp;
	}
	return (error);

create_otw:
	dnlc_remove(dvp, nm);

	ASSERT(vattr.va_mask & AT_TYPE);

	/*
	 * If not a regular file let nfs4mknod() handle it.
	 */
	if (vattr.va_type != VREG) {
		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * It _is_ a regular file.
	 */
	ASSERT(vattr.va_mask & AT_MODE);
	if (MANDMODE(vattr.va_mode)) {
		/* Mandatory locking modes are not supported. */
		nfs_rw_exit(&drp->r_rwlock);
		return (EACCES);
	}

	/*
	 * If this happens to be a mknod of a regular file, then flags will
	 * have neither FREAD or FWRITE.  However, we must set at least one
	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
	 * set (based on openmode specified by app).
	 */
	if ((flags & (FREAD|FWRITE)) == 0)
		flags |= (FREAD|FWRITE);

	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);

	if (vp != NULL) {
		/* if create was successful, throw away the file's pages */
		if (!error && (vattr.va_mask & AT_SIZE))
			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
			    cr);
		/* release the lookup hold */
		VN_RELE(vp);
		vp = NULL;
	}

	/*
	 * validate that we opened a regular file. This handles a misbehaving
	 * server that returns an incorrect FH.
	 */
	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
		error = EISDIR;
		VN_RELE(*vpp);
	}

	/*
	 * If this is not an exclusive create, then the CREATE
	 * request will be made with the GUARDED mode set.  This
	 * means that the server will return EEXIST if the file
	 * exists.  The file could exist because of a retransmitted
	 * request.  In this case, we recover by starting over and
	 * checking to see whether the file exists.  This second
	 * time through it should and a CREATE request will not be
	 * sent.
	 *
	 * This handles the problem of a dangling CREATE request
	 * which contains attributes which indicate that the file
	 * should be truncated.  This retransmitted request could
	 * possibly truncate valid data in the file if not caught
	 * by the duplicate request mechanism on the server or if
	 * not caught by other means.  The scenario is:
	 *
	 * Client transmits CREATE request with size = 0
	 * Client times out, retransmits request.
	 * Response to the first request arrives from the server
	 *  and the client proceeds on.
	 * Client writes data to the file.
	 * The server now processes retransmitted CREATE request
	 *  and truncates file.
	 *
	 * The use of the GUARDED CREATE request prevents this from
	 * happening because the retransmitted CREATE would fail
	 * with EEXIST and would not truncate the file.
	 */
	if (error == EEXIST && exclusive == NONEXCL) {
#ifdef DEBUG
		nfs4_create_misses++;
#endif
		goto top;
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (truncating && !error && *vpp) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * existing file got truncated, notify.
		 */
		tvp = *vpp;
		trp = VTOR4(tvp);
		if (IS_SHADOW(tvp, trp))
			tvp = RTOV4(trp);
		vnevent_create(tvp, ct);
	}
	return (error);
}
6809
6810 /*
6811 * Create compound (for mkdir, mknod, symlink):
6812 * { Putfh <dfh>; Create; Getfh; Getattr }
6813 * It's okay if setattr failed to set gid - this is not considered
6814 * an error, but purge attrs in that case.
6815 */
6816 static int
6817 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6818 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6819 {
6820 int need_end_op = FALSE;
6821 COMPOUND4args_clnt args;
6822 COMPOUND4res_clnt res, *resp = NULL;
6823 nfs_argop4 *argop;
6824 nfs_resop4 *resop;
6825 int doqueue;
6826 mntinfo4_t *mi;
6827 rnode4_t *drp = VTOR4(dvp);
6828 change_info4 *cinfo;
6829 GETFH4res *gf_res;
6830 struct vattr vattr;
6831 vnode_t *vp;
6832 fattr4 *crattr;
6833 bool_t needrecov = FALSE;
6834 nfs4_recov_state_t recov_state;
6835 nfs4_sharedfh_t *sfhp = NULL;
6836 hrtime_t t;
6837 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6838 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6839 dirattr_info_t dinfo, *dinfop;
6840 servinfo4_t *svp;
6841 bitmap4 supp_attrs;
6842
6843 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6844 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6845
6846 mi = VTOMI4(dvp);
6847
6848 /*
6849 * Make sure we properly deal with setting the right gid
6850 * on a new directory to reflect the parent's setgid bit
6851 */
6852 setgid_flag = 0;
6853 if (type == NF4DIR) {
6854 struct vattr dva;
6855
6856 va->va_mode &= ~VSGID;
6857 dva.va_mask = AT_MODE | AT_GID;
6858 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6859
6860 /*
6861 * If the parent's directory has the setgid bit set
6862 * _and_ the client was able to get a valid mapping
6863 * for the parent dir's owner_group, we want to
6864 * append NVERIFY(owner_group == dva.va_gid) and
6865 * SETTATTR to the CREATE compound.
6866 */
6867 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6868 setgid_flag = 1;
6869 va->va_mode |= VSGID;
6870 if (dva.va_gid != GID_NOBODY) {
6871 va->va_mask |= AT_GID;
6872 va->va_gid = dva.va_gid;
6873 }
6874 }
6875 }
6876 }
6877
6878 /*
6879 * Create ops:
6880 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6881 * 5:restorefh(dir) 6:getattr(dir)
6882 *
6883 * if (setgid)
6884 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6885 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6886 * 8:nverify 9:setattr
6887 */
6888 if (setgid_flag) {
6889 numops = 10;
6890 idx_create = 1;
6891 idx_fattr = 3;
6892 } else {
6893 numops = 7;
6894 idx_create = 2;
6895 idx_fattr = 4;
6896 }
6897
6898 ASSERT(nfs_zone() == mi->mi_zone);
6899 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6900 return (EINTR);
6901 }
6902 recov_state.rs_flags = 0;
6903 recov_state.rs_num_retry_despite_err = 0;
6904
6905 argoplist_size = numops * sizeof (nfs_argop4);
6906 argop = kmem_alloc(argoplist_size, KM_SLEEP);
6907
6908 recov_retry:
6909 if (type == NF4LNK)
6910 args.ctag = TAG_SYMLINK;
6911 else if (type == NF4DIR)
6912 args.ctag = TAG_MKDIR;
6913 else
6914 args.ctag = TAG_MKNOD;
6915
6916 args.array_len = numops;
6917 args.array = argop;
6918
6919 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6920 nfs_rw_exit(&drp->r_rwlock);
6921 kmem_free(argop, argoplist_size);
6922 return (e.error);
6923 }
6924 need_end_op = TRUE;
6925
6926
6927 /* 0: putfh directory */
6928 argop[0].argop = OP_CPUTFH;
6929 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6930
6931 /* 1/2: Create object */
6932 argop[idx_create].argop = OP_CCREATE;
6933 argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6934 argop[idx_create].nfs_argop4_u.opccreate.type = type;
6935 if (type == NF4LNK) {
6936 /*
6937 * symlink, treat name as data
6938 */
6939 ASSERT(data != NULL);
6940 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6941 (char *)data;
6942 }
6943 if (type == NF4BLK || type == NF4CHR) {
6944 ASSERT(data != NULL);
6945 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6946 *((specdata4 *)data);
6947 }
6948
6949 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6950
6951 svp = drp->r_server;
6952 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6953 supp_attrs = svp->sv_supp_attrs;
6954 nfs_rw_exit(&svp->sv_lock);
6955
6956 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6957 nfs_rw_exit(&drp->r_rwlock);
6958 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6959 e.error = EINVAL;
6960 kmem_free(argop, argoplist_size);
6961 return (e.error);
6962 }
6963
6964 /* 2/3: getfh fh of created object */
6965 ASSERT(idx_create + 1 == idx_fattr - 1);
6966 argop[idx_create + 1].argop = OP_GETFH;
6967
6968 /* 3/4: getattr of new object */
6969 argop[idx_fattr].argop = OP_GETATTR;
6970 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6971 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6972
6973 if (setgid_flag) {
6974 vattr_t _v;
6975
6976 argop[4].argop = OP_SAVEFH;
6977
6978 argop[5].argop = OP_CPUTFH;
6979 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6980
6981 argop[6].argop = OP_GETATTR;
6982 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6983 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6984
6985 argop[7].argop = OP_RESTOREFH;
6986
6987 /*
6988 * nverify
6989 *
6990 * XXX - Revisit the last argument to nfs4_end_op()
6991 * once 5020486 is fixed.
6992 */
6993 _v.va_mask = AT_GID;
6994 _v.va_gid = va->va_gid;
6995 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6996 supp_attrs)) {
6997 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6998 nfs_rw_exit(&drp->r_rwlock);
6999 nfs4_fattr4_free(crattr);
7000 kmem_free(argop, argoplist_size);
7001 return (e.error);
7002 }
7003
7004 /*
7005 * setattr
7006 *
7007 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
7008 * so no need for stateid or flags. Also we specify NULL
7009 * rp since we're only interested in setting owner_group
7010 * attributes.
7011 */
7012 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
7013 &e.error, 0);
7014
7015 if (e.error) {
7016 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7017 nfs_rw_exit(&drp->r_rwlock);
7018 nfs4_fattr4_free(crattr);
7019 nfs4args_verify_free(&argop[8]);
7020 kmem_free(argop, argoplist_size);
7021 return (e.error);
7022 }
7023 } else {
7024 argop[1].argop = OP_SAVEFH;
7025
7026 argop[5].argop = OP_RESTOREFH;
7027
7028 argop[6].argop = OP_GETATTR;
7029 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7030 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7031 }
7032
7033 dnlc_remove(dvp, nm);
7034
7035 doqueue = 1;
7036 t = gethrtime();
7037 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7038
7039 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7040 if (e.error) {
7041 PURGE_ATTRCACHE4(dvp);
7042 if (!needrecov)
7043 goto out;
7044 }
7045
7046 if (needrecov) {
7047 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7048 OP_CREATE, NULL, NULL, NULL) == FALSE) {
7049 nfs4_end_op(mi, dvp, NULL, &recov_state,
7050 needrecov);
7051 need_end_op = FALSE;
7052 nfs4_fattr4_free(crattr);
7053 if (setgid_flag) {
7054 nfs4args_verify_free(&argop[8]);
7055 nfs4args_setattr_free(&argop[9]);
7056 }
7057 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7058 goto recov_retry;
7059 }
7060 }
7061
7062 resp = &res;
7063
7064 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7065
7066 if (res.status == NFS4ERR_BADOWNER)
7067 nfs4_log_badowner(mi, OP_CREATE);
7068
7069 e.error = geterrno4(res.status);
7070
7071 /*
7072 * This check is left over from when create was implemented
7073 * using a setattr op (instead of createattrs). If the
7074 * putfh/create/getfh failed, the error was returned. If
7075 * setattr/getattr failed, we keep going.
7076 *
7077 * It might be better to get rid of the GETFH also, and just
7078 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7079 * Then if any of the operations failed, we could return the
7080 * error now, and remove much of the error code below.
7081 */
7082 if (res.array_len <= idx_fattr) {
7083 /*
7084 * Either Putfh, Create or Getfh failed.
7085 */
7086 PURGE_ATTRCACHE4(dvp);
7087 /*
7088 * nfs4_purge_stale_fh() may generate otw calls through
7089 * nfs4_invalidate_pages. Hence the need to call
7090 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7091 */
7092 nfs4_end_op(mi, dvp, NULL, &recov_state,
7093 needrecov);
7094 need_end_op = FALSE;
7095 nfs4_purge_stale_fh(e.error, dvp, cr);
7096 goto out;
7097 }
7098 }
7099
7100 resop = &res.array[idx_create]; /* create res */
7101 cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7102
7103 resop = &res.array[idx_create + 1]; /* getfh res */
7104 gf_res = &resop->nfs_resop4_u.opgetfh;
7105
7106 sfhp = sfh4_get(&gf_res->object, mi);
7107 if (e.error) {
7108 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7109 fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7110 if (vp->v_type == VNON) {
7111 vattr.va_mask = AT_TYPE;
7112 /*
7113 * Need to call nfs4_end_op before nfs4getattr to avoid
7114 * potential nfs4_start_op deadlock. See RFE 4777612.
7115 */
7116 nfs4_end_op(mi, dvp, NULL, &recov_state,
7117 needrecov);
7118 need_end_op = FALSE;
7119 e.error = nfs4getattr(vp, &vattr, cr);
7120 if (e.error) {
7121 VN_RELE(vp);
7122 *vpp = NULL;
7123 goto out;
7124 }
7125 vp->v_type = vattr.va_type;
7126 }
7127 e.error = 0;
7128 } else {
7129 *vpp = vp = makenfs4node(sfhp,
7130 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7131 dvp->v_vfsp, t, cr,
7132 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7133 }
7134
7135 /*
7136 * If compound succeeded, then update dir attrs
7137 */
7138 if (res.status == NFS4_OK) {
7139 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7140 dinfo.di_cred = cr;
7141 dinfo.di_time_call = t;
7142 dinfop = &dinfo;
7143 } else
7144 dinfop = NULL;
7145
7146 /* Update directory cache attribute, readdir and dnlc caches */
7147 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7148
7149 out:
7150 if (sfhp != NULL)
7151 sfh4_rele(&sfhp);
7152 nfs_rw_exit(&drp->r_rwlock);
7153 nfs4_fattr4_free(crattr);
7154 if (setgid_flag) {
7155 nfs4args_verify_free(&argop[8]);
7156 nfs4args_setattr_free(&argop[9]);
7157 }
7158 if (resp)
7159 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7160 if (need_end_op)
7161 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7162
7163 kmem_free(argop, argoplist_size);
7164 return (e.error);
7165 }
7166
7167 /* ARGSUSED */
7168 static int
7169 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7170 int mode, vnode_t **vpp, cred_t *cr)
7171 {
7172 int error;
7173 vnode_t *vp;
7174 nfs_ftype4 type;
7175 specdata4 spec, *specp = NULL;
7176
7177 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7178
7179 switch (va->va_type) {
7180 case VCHR:
7181 case VBLK:
7182 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7183 spec.specdata1 = getmajor(va->va_rdev);
7184 spec.specdata2 = getminor(va->va_rdev);
7185 specp = &spec;
7186 break;
7187
7188 case VFIFO:
7189 type = NF4FIFO;
7190 break;
7191 case VSOCK:
7192 type = NF4SOCK;
7193 break;
7194
7195 default:
7196 return (EINVAL);
7197 }
7198
7199 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7200 if (error) {
7201 return (error);
7202 }
7203
7204 /*
7205 * This might not be needed any more; special case to deal
7206 * with problematic v2/v3 servers. Since create was unable
7207 * to set group correctly, not sure what hope setattr has.
7208 */
7209 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7210 va->va_mask = AT_GID;
7211 (void) nfs4setattr(vp, va, 0, cr, NULL);
7212 }
7213
7214 /*
7215 * If vnode is a device create special vnode
7216 */
7217 if (ISVDEV(vp->v_type)) {
7218 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7219 VN_RELE(vp);
7220 } else {
7221 *vpp = vp;
7222 }
7223 return (error);
7224 }
7225
7226 /*
7227 * Remove requires that the current fh be the target directory.
7228 * After the operation, the current fh is unchanged.
7229 * The compound op structure is:
7230 * PUTFH(targetdir), REMOVE
7231 *
7232 * Weirdness: if the vnode to be removed is open
7233 * we rename it instead of removing it and nfs_inactive
7234 * will remove the new name.
7235 */
7236 /* ARGSUSED */
7237 static int
7238 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7239 {
7240 COMPOUND4args_clnt args;
7241 COMPOUND4res_clnt res, *resp = NULL;
7242 REMOVE4res *rm_res;
7243 nfs_argop4 argop[3];
7244 nfs_resop4 *resop;
7245 vnode_t *vp;
7246 char *tmpname;
7247 int doqueue;
7248 mntinfo4_t *mi;
7249 rnode4_t *rp;
7250 rnode4_t *drp;
7251 int needrecov = 0;
7252 nfs4_recov_state_t recov_state;
7253 int isopen;
7254 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7255 dirattr_info_t dinfo;
7256
7257 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7258 return (EPERM);
7259 drp = VTOR4(dvp);
7260 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7261 return (EINTR);
7262
7263 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7264 if (e.error) {
7265 nfs_rw_exit(&drp->r_rwlock);
7266 return (e.error);
7267 }
7268
7269 if (vp->v_type == VDIR) {
7270 VN_RELE(vp);
7271 nfs_rw_exit(&drp->r_rwlock);
7272 return (EISDIR);
7273 }
7274
7275 /*
7276 * First just remove the entry from the name cache, as it
7277 * is most likely the only entry for this vp.
7278 */
7279 dnlc_remove(dvp, nm);
7280
7281 rp = VTOR4(vp);
7282
7283 /*
7284 * For regular file types, check to see if the file is open by looking
7285 * at the open streams.
7286 * For all other types, check the reference count on the vnode. Since
7287 * they are not opened OTW they never have an open stream.
7288 *
7289 * If the file is open, rename it to .nfsXXXX.
7290 */
7291 if (vp->v_type != VREG) {
7292 /*
7293 * If the file has a v_count > 1 then there may be more than one
7294 * entry in the name cache due multiple links or an open file,
7295 * but we don't have the real reference count so flush all
7296 * possible entries.
7297 */
7298 if (vp->v_count > 1)
7299 dnlc_purge_vp(vp);
7300
7301 /*
7302 * Now we have the real reference count.
7303 */
7304 isopen = vp->v_count > 1;
7305 } else {
7306 mutex_enter(&rp->r_os_lock);
7307 isopen = list_head(&rp->r_open_streams) != NULL;
7308 mutex_exit(&rp->r_os_lock);
7309 }
7310
7311 mutex_enter(&rp->r_statelock);
7312 if (isopen &&
7313 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7314 mutex_exit(&rp->r_statelock);
7315 tmpname = newname();
7316 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7317 if (e.error)
7318 kmem_free(tmpname, MAXNAMELEN);
7319 else {
7320 mutex_enter(&rp->r_statelock);
7321 if (rp->r_unldvp == NULL) {
7322 VN_HOLD(dvp);
7323 rp->r_unldvp = dvp;
7324 if (rp->r_unlcred != NULL)
7325 crfree(rp->r_unlcred);
7326 crhold(cr);
7327 rp->r_unlcred = cr;
7328 rp->r_unlname = tmpname;
7329 } else {
7330 kmem_free(rp->r_unlname, MAXNAMELEN);
7331 rp->r_unlname = tmpname;
7332 }
7333 mutex_exit(&rp->r_statelock);
7334 }
7335 VN_RELE(vp);
7336 nfs_rw_exit(&drp->r_rwlock);
7337 return (e.error);
7338 }
7339 /*
7340 * Actually remove the file/dir
7341 */
7342 mutex_exit(&rp->r_statelock);
7343
7344 /*
7345 * We need to flush any dirty pages which happen to
7346 * be hanging around before removing the file.
7347 * This shouldn't happen very often since in NFSv4
7348 * we should be close to open consistent.
7349 */
7350 if (nfs4_has_pages(vp) &&
7351 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7352 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7353 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7354 mutex_enter(&rp->r_statelock);
7355 if (!rp->r_error)
7356 rp->r_error = e.error;
7357 mutex_exit(&rp->r_statelock);
7358 }
7359 }
7360
7361 mi = VTOMI4(dvp);
7362
7363 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7364 recov_state.rs_flags = 0;
7365 recov_state.rs_num_retry_despite_err = 0;
7366
7367 recov_retry:
7368 /*
7369 * Remove ops: putfh dir; remove
7370 */
7371 args.ctag = TAG_REMOVE;
7372 args.array_len = 3;
7373 args.array = argop;
7374
7375 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7376 if (e.error) {
7377 nfs_rw_exit(&drp->r_rwlock);
7378 VN_RELE(vp);
7379 return (e.error);
7380 }
7381
7382 /* putfh directory */
7383 argop[0].argop = OP_CPUTFH;
7384 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7385
7386 /* remove */
7387 argop[1].argop = OP_CREMOVE;
7388 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7389
7390 /* getattr dir */
7391 argop[2].argop = OP_GETATTR;
7392 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7393 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7394
7395 doqueue = 1;
7396 dinfo.di_time_call = gethrtime();
7397 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7398
7399 PURGE_ATTRCACHE4(vp);
7400
7401 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7402 if (e.error)
7403 PURGE_ATTRCACHE4(dvp);
7404
7405 if (needrecov) {
7406 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7407 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7408 if (!e.error)
7409 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7410 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7411 needrecov);
7412 goto recov_retry;
7413 }
7414 }
7415
7416 /*
7417 * Matching nfs4_end_op() for start_op() above.
7418 * There is a path in the code below which calls
7419 * nfs4_purge_stale_fh(), which may generate otw calls through
7420 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7421 * here to avoid nfs4_start_op() deadlock.
7422 */
7423 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7424
7425 if (!e.error) {
7426 resp = &res;
7427
7428 if (res.status) {
7429 e.error = geterrno4(res.status);
7430 PURGE_ATTRCACHE4(dvp);
7431 nfs4_purge_stale_fh(e.error, dvp, cr);
7432 } else {
7433 resop = &res.array[1]; /* remove res */
7434 rm_res = &resop->nfs_resop4_u.opremove;
7435
7436 dinfo.di_garp =
7437 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7438 dinfo.di_cred = cr;
7439
7440 /* Update directory attr, readdir and dnlc caches */
7441 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7442 &dinfo);
7443 }
7444 }
7445 nfs_rw_exit(&drp->r_rwlock);
7446 if (resp)
7447 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7448
7449 if (e.error == 0) {
7450 vnode_t *tvp;
7451 rnode4_t *trp;
7452 trp = VTOR4(vp);
7453 tvp = vp;
7454 if (IS_SHADOW(vp, trp))
7455 tvp = RTOV4(trp);
7456 vnevent_remove(tvp, dvp, nm, ct);
7457 }
7458 VN_RELE(vp);
7459 return (e.error);
7460 }
7461
7462 /*
7463 * Link requires that the current fh be the target directory and the
7464 * saved fh be the source fh. After the operation, the current fh is unchanged.
7465 * Thus the compound op structure is:
7466 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7467 * GETATTR(file)
7468 */
7469 /* ARGSUSED */
7470 static int
7471 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7472 caller_context_t *ct, int flags)
7473 {
7474 COMPOUND4args_clnt args;
7475 COMPOUND4res_clnt res, *resp = NULL;
7476 LINK4res *ln_res;
7477 int argoplist_size = 7 * sizeof (nfs_argop4);
7478 nfs_argop4 *argop;
7479 nfs_resop4 *resop;
7480 vnode_t *realvp, *nvp;
7481 int doqueue;
7482 mntinfo4_t *mi;
7483 rnode4_t *tdrp;
7484 bool_t needrecov = FALSE;
7485 nfs4_recov_state_t recov_state;
7486 hrtime_t t;
7487 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7488 dirattr_info_t dinfo;
7489
7490 ASSERT(*tnm != '\0');
7491 ASSERT(tdvp->v_type == VDIR);
7492 ASSERT(nfs4_consistent_type(tdvp));
7493 ASSERT(nfs4_consistent_type(svp));
7494
7495 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7496 return (EPERM);
7497 if (VOP_REALVP(svp, &realvp, ct) == 0) {
7498 svp = realvp;
7499 ASSERT(nfs4_consistent_type(svp));
7500 }
7501
7502 tdrp = VTOR4(tdvp);
7503 mi = VTOMI4(svp);
7504
7505 if (!(mi->mi_flags & MI4_LINK)) {
7506 return (EOPNOTSUPP);
7507 }
7508 recov_state.rs_flags = 0;
7509 recov_state.rs_num_retry_despite_err = 0;
7510
7511 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7512 return (EINTR);
7513
7514 recov_retry:
7515 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7516
7517 args.ctag = TAG_LINK;
7518
7519 /*
7520 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7521 * restorefh; getattr(fl)
7522 */
7523 args.array_len = 7;
7524 args.array = argop;
7525
7526 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7527 if (e.error) {
7528 kmem_free(argop, argoplist_size);
7529 nfs_rw_exit(&tdrp->r_rwlock);
7530 return (e.error);
7531 }
7532
7533 /* 0. putfh file */
7534 argop[0].argop = OP_CPUTFH;
7535 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7536
7537 /* 1. save current fh to free up the space for the dir */
7538 argop[1].argop = OP_SAVEFH;
7539
7540 /* 2. putfh targetdir */
7541 argop[2].argop = OP_CPUTFH;
7542 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7543
7544 /* 3. link: current_fh is targetdir, saved_fh is source */
7545 argop[3].argop = OP_CLINK;
7546 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7547
7548 /* 4. Get attributes of dir */
7549 argop[4].argop = OP_GETATTR;
7550 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7551 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7552
7553 /* 5. If link was successful, restore current vp to file */
7554 argop[5].argop = OP_RESTOREFH;
7555
7556 /* 6. Get attributes of linked object */
7557 argop[6].argop = OP_GETATTR;
7558 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7559 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7560
7561 dnlc_remove(tdvp, tnm);
7562
7563 doqueue = 1;
7564 t = gethrtime();
7565
7566 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7567
7568 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7569 if (e.error != 0 && !needrecov) {
7570 PURGE_ATTRCACHE4(tdvp);
7571 PURGE_ATTRCACHE4(svp);
7572 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7573 goto out;
7574 }
7575
7576 if (needrecov) {
7577 bool_t abort;
7578
7579 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7580 NULL, NULL, OP_LINK, NULL, NULL, NULL);
7581 if (abort == FALSE) {
7582 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7583 needrecov);
7584 kmem_free(argop, argoplist_size);
7585 if (!e.error)
7586 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7587 goto recov_retry;
7588 } else {
7589 if (e.error != 0) {
7590 PURGE_ATTRCACHE4(tdvp);
7591 PURGE_ATTRCACHE4(svp);
7592 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7593 &recov_state, needrecov);
7594 goto out;
7595 }
7596 /* fall through for res.status case */
7597 }
7598 }
7599
7600 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7601
7602 resp = &res;
7603 if (res.status) {
7604 /* If link succeeded, then don't return error */
7605 e.error = geterrno4(res.status);
7606 if (res.array_len <= 4) {
7607 /*
7608 * Either Putfh, Savefh, Putfh dir, or Link failed
7609 */
7610 PURGE_ATTRCACHE4(svp);
7611 PURGE_ATTRCACHE4(tdvp);
7612 if (e.error == EOPNOTSUPP) {
7613 mutex_enter(&mi->mi_lock);
7614 mi->mi_flags &= ~MI4_LINK;
7615 mutex_exit(&mi->mi_lock);
7616 }
7617 /* Remap EISDIR to EPERM for non-root user for SVVS */
7618 /* XXX-LP */
7619 if (e.error == EISDIR && crgetuid(cr) != 0)
7620 e.error = EPERM;
7621 goto out;
7622 }
7623 }
7624
7625 /* either no error or one of the postop getattr failed */
7626
7627 /*
7628 * XXX - if LINK succeeded, but no attrs were returned for link
7629 * file, purge its cache.
7630 *
7631 * XXX Perform a simplified version of wcc checking. Instead of
7632 * have another getattr to get pre-op, just purge cache if
7633 * any of the ops prior to and including the getattr failed.
7634 * If the getattr succeeded then update the attrcache accordingly.
7635 */
7636
7637 /*
7638 * update cache with link file postattrs.
7639 * Note: at this point resop points to link res.
7640 */
7641 resop = &res.array[3]; /* link res */
7642 ln_res = &resop->nfs_resop4_u.oplink;
7643 if (res.status == NFS4_OK)
7644 e.error = nfs4_update_attrcache(res.status,
7645 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7646 t, svp, cr);
7647
7648 /*
7649 * Call makenfs4node to create the new shadow vp for tnm.
7650 * We pass NULL attrs because we just cached attrs for
7651 * the src object. All we're trying to accomplish is to
7652 * to create the new shadow vnode.
7653 */
7654 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7655 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7656
7657 /* Update target cache attribute, readdir and dnlc caches */
7658 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7659 dinfo.di_time_call = t;
7660 dinfo.di_cred = cr;
7661
7662 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7663 ASSERT(nfs4_consistent_type(tdvp));
7664 ASSERT(nfs4_consistent_type(svp));
7665 ASSERT(nfs4_consistent_type(nvp));
7666 VN_RELE(nvp);
7667
7668 if (!e.error) {
7669 vnode_t *tvp;
7670 rnode4_t *trp;
7671 /*
7672 * Notify the source file of this link operation.
7673 */
7674 trp = VTOR4(svp);
7675 tvp = svp;
7676 if (IS_SHADOW(svp, trp))
7677 tvp = RTOV4(trp);
7678 vnevent_link(tvp, ct);
7679 }
7680 out:
7681 kmem_free(argop, argoplist_size);
7682 if (resp)
7683 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7684
7685 nfs_rw_exit(&tdrp->r_rwlock);
7686
7687 return (e.error);
7688 }
7689
7690 /* ARGSUSED */
7691 static int
7692 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7693 caller_context_t *ct, int flags)
7694 {
7695 vnode_t *realvp;
7696
7697 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7698 return (EPERM);
7699 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7700 ndvp = realvp;
7701
7702 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7703 }
7704
7705 /*
7706 * nfs4rename does the real work of renaming in NFS Version 4.
7707 *
7708 * A file handle is considered volatile for renaming purposes if either
7709 * of the volatile bits are turned on. However, the compound may differ
7710 * based on the likelihood of the filehandle to change during rename.
7711 */
7712 static int
7713 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7714 caller_context_t *ct)
7715 {
7716 int error;
7717 mntinfo4_t *mi;
7718 vnode_t *nvp = NULL;
7719 vnode_t *ovp = NULL;
7720 char *tmpname = NULL;
7721 rnode4_t *rp;
7722 rnode4_t *odrp;
7723 rnode4_t *ndrp;
7724 int did_link = 0;
7725 int do_link = 1;
7726 nfsstat4 stat = NFS4_OK;
7727
7728 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7729 ASSERT(nfs4_consistent_type(odvp));
7730 ASSERT(nfs4_consistent_type(ndvp));
7731
7732 if (onm[0] == '.' && (onm[1] == '\0' ||
7733 (onm[1] == '.' && onm[2] == '\0')))
7734 return (EINVAL);
7735
7736 if (nnm[0] == '.' && (nnm[1] == '\0' ||
7737 (nnm[1] == '.' && nnm[2] == '\0')))
7738 return (EINVAL);
7739
7740 odrp = VTOR4(odvp);
7741 ndrp = VTOR4(ndvp);
7742 if ((intptr_t)odrp < (intptr_t)ndrp) {
7743 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7744 return (EINTR);
7745 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7746 nfs_rw_exit(&odrp->r_rwlock);
7747 return (EINTR);
7748 }
7749 } else {
7750 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7751 return (EINTR);
7752 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7753 nfs_rw_exit(&ndrp->r_rwlock);
7754 return (EINTR);
7755 }
7756 }
7757
7758 /*
7759 * Lookup the target file. If it exists, it needs to be
7760 * checked to see whether it is a mount point and whether
7761 * it is active (open).
7762 */
7763 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7764 if (!error) {
7765 int isactive;
7766
7767 ASSERT(nfs4_consistent_type(nvp));
7768 /*
7769 * If this file has been mounted on, then just
7770 * return busy because renaming to it would remove
7771 * the mounted file system from the name space.
7772 */
7773 if (vn_ismntpt(nvp)) {
7774 VN_RELE(nvp);
7775 nfs_rw_exit(&odrp->r_rwlock);
7776 nfs_rw_exit(&ndrp->r_rwlock);
7777 return (EBUSY);
7778 }
7779
7780 /*
7781 * First just remove the entry from the name cache, as it
7782 * is most likely the only entry for this vp.
7783 */
7784 dnlc_remove(ndvp, nnm);
7785
7786 rp = VTOR4(nvp);
7787
7788 if (nvp->v_type != VREG) {
7789 /*
7790 * Purge the name cache of all references to this vnode
7791 * so that we can check the reference count to infer
7792 * whether it is active or not.
7793 */
7794 if (nvp->v_count > 1)
7795 dnlc_purge_vp(nvp);
7796
7797 isactive = nvp->v_count > 1;
7798 } else {
7799 mutex_enter(&rp->r_os_lock);
7800 isactive = list_head(&rp->r_open_streams) != NULL;
7801 mutex_exit(&rp->r_os_lock);
7802 }
7803
7804 /*
7805 * If the vnode is active and is not a directory,
7806 * arrange to rename it to a
7807 * temporary file so that it will continue to be
7808 * accessible. This implements the "unlink-open-file"
7809 * semantics for the target of a rename operation.
7810 * Before doing this though, make sure that the
7811 * source and target files are not already the same.
7812 */
7813 if (isactive && nvp->v_type != VDIR) {
7814 /*
7815 * Lookup the source name.
7816 */
7817 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7818
7819 /*
7820 * The source name *should* already exist.
7821 */
7822 if (error) {
7823 VN_RELE(nvp);
7824 nfs_rw_exit(&odrp->r_rwlock);
7825 nfs_rw_exit(&ndrp->r_rwlock);
7826 return (error);
7827 }
7828
7829 ASSERT(nfs4_consistent_type(ovp));
7830
7831 /*
7832 * Compare the two vnodes. If they are the same,
7833 * just release all held vnodes and return success.
7834 */
7835 if (VN_CMP(ovp, nvp)) {
7836 VN_RELE(ovp);
7837 VN_RELE(nvp);
7838 nfs_rw_exit(&odrp->r_rwlock);
7839 nfs_rw_exit(&ndrp->r_rwlock);
7840 return (0);
7841 }
7842
7843 /*
7844 * Can't mix and match directories and non-
7845 * directories in rename operations. We already
7846 * know that the target is not a directory. If
7847 * the source is a directory, return an error.
7848 */
7849 if (ovp->v_type == VDIR) {
7850 VN_RELE(ovp);
7851 VN_RELE(nvp);
7852 nfs_rw_exit(&odrp->r_rwlock);
7853 nfs_rw_exit(&ndrp->r_rwlock);
7854 return (ENOTDIR);
7855 }
7856 link_call:
7857 /*
7858 * The target file exists, is not the same as
7859 * the source file, and is active. We first
7860 * try to Link it to a temporary filename to
7861 * avoid having the server removing the file
7862 * completely (which could cause data loss to
7863 * the user's POV in the event the Rename fails
7864 * -- see bug 1165874).
7865 */
7866 /*
7867 * The do_link and did_link booleans are
7868 * introduced in the event we get NFS4ERR_FILE_OPEN
7869 * returned for the Rename. Some servers can
7870 * not Rename over an Open file, so they return
7871 * this error. The client needs to Remove the
7872 * newly created Link and do two Renames, just
7873 * as if the server didn't support LINK.
7874 */
7875 tmpname = newname();
7876 error = 0;
7877
7878 if (do_link) {
7879 error = nfs4_link(ndvp, nvp, tmpname, cr,
7880 NULL, 0);
7881 }
7882 if (error == EOPNOTSUPP || !do_link) {
7883 error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7884 cr, NULL, 0);
7885 did_link = 0;
7886 } else {
7887 did_link = 1;
7888 }
7889 if (error) {
7890 kmem_free(tmpname, MAXNAMELEN);
7891 VN_RELE(ovp);
7892 VN_RELE(nvp);
7893 nfs_rw_exit(&odrp->r_rwlock);
7894 nfs_rw_exit(&ndrp->r_rwlock);
7895 return (error);
7896 }
7897
7898 mutex_enter(&rp->r_statelock);
7899 if (rp->r_unldvp == NULL) {
7900 VN_HOLD(ndvp);
7901 rp->r_unldvp = ndvp;
7902 if (rp->r_unlcred != NULL)
7903 crfree(rp->r_unlcred);
7904 crhold(cr);
7905 rp->r_unlcred = cr;
7906 rp->r_unlname = tmpname;
7907 } else {
7908 if (rp->r_unlname)
7909 kmem_free(rp->r_unlname, MAXNAMELEN);
7910 rp->r_unlname = tmpname;
7911 }
7912 mutex_exit(&rp->r_statelock);
7913 }
7914
7915 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7916
7917 ASSERT(nfs4_consistent_type(nvp));
7918 }
7919
7920 if (ovp == NULL) {
7921 /*
7922 * When renaming directories to be a subdirectory of a
7923 * different parent, the dnlc entry for ".." will no
7924 * longer be valid, so it must be removed.
7925 *
7926 * We do a lookup here to determine whether we are renaming
7927 * a directory and we need to check if we are renaming
7928 * an unlinked file. This might have already been done
7929 * in previous code, so we check ovp == NULL to avoid
7930 * doing it twice.
7931 */
7932 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7933 /*
7934 * The source name *should* already exist.
7935 */
7936 if (error) {
7937 nfs_rw_exit(&odrp->r_rwlock);
7938 nfs_rw_exit(&ndrp->r_rwlock);
7939 if (nvp) {
7940 VN_RELE(nvp);
7941 }
7942 return (error);
7943 }
7944 ASSERT(ovp != NULL);
7945 ASSERT(nfs4_consistent_type(ovp));
7946 }
7947
7948 /*
7949 * Is the object being renamed a dir, and if so, is
7950 * it being renamed to a child of itself? The underlying
7951 * fs should ultimately return EINVAL for this case;
7952 * however, buggy beta non-Solaris NFSv4 servers at
7953 * interop testing events have allowed this behavior,
7954 * and it caused our client to panic due to a recursive
7955 * mutex_enter in fn_move.
7956 *
7957 * The tedious locking in fn_move could be changed to
7958 * deal with this case, and the client could avoid the
7959 * panic; however, the client would just confuse itself
7960 * later and misbehave. A better way to handle the broken
7961 * server is to detect this condition and return EINVAL
7962 * without ever sending the the bogus rename to the server.
7963 * We know the rename is invalid -- just fail it now.
7964 */
7965 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7966 VN_RELE(ovp);
7967 nfs_rw_exit(&odrp->r_rwlock);
7968 nfs_rw_exit(&ndrp->r_rwlock);
7969 if (nvp) {
7970 VN_RELE(nvp);
7971 }
7972 return (EINVAL);
7973 }
7974
7975 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7976
7977 /*
7978 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7979 * possible for the filehandle to change due to the rename.
7980 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7981 * the fh will not change because of the rename, but we still need
7982 * to update its rnode entry with the new name for
7983 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7984 * has no effect on these for now, but for future improvements,
7985 * we might want to use it too to simplify handling of files
7986 * that are open with that flag on. (XXX)
7987 */
7988 mi = VTOMI4(odvp);
7989 if (NFS4_VOLATILE_FH(mi))
7990 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7991 &stat);
7992 else
7993 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
7994 &stat);
7995
7996 ASSERT(nfs4_consistent_type(odvp));
7997 ASSERT(nfs4_consistent_type(ndvp));
7998 ASSERT(nfs4_consistent_type(ovp));
7999
8000 if (stat == NFS4ERR_FILE_OPEN && did_link) {
8001 do_link = 0;
8002 /*
8003 * Before the 'link_call' code, we did a nfs4_lookup
8004 * that puts a VN_HOLD on nvp. After the nfs4_link
8005 * call we call VN_RELE to match that hold. We need
8006 * to place an additional VN_HOLD here since we will
8007 * be hitting that VN_RELE again.
8008 */
8009 VN_HOLD(nvp);
8010
8011 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);
8012
8013 /* Undo the unlinked file naming stuff we just did */
8014 mutex_enter(&rp->r_statelock);
8015 if (rp->r_unldvp) {
8016 VN_RELE(ndvp);
8017 rp->r_unldvp = NULL;
8018 if (rp->r_unlcred != NULL)
8019 crfree(rp->r_unlcred);
8020 rp->r_unlcred = NULL;
8021 /* rp->r_unlanme points to tmpname */
8022 if (rp->r_unlname)
8023 kmem_free(rp->r_unlname, MAXNAMELEN);
8024 rp->r_unlname = NULL;
8025 }
8026 mutex_exit(&rp->r_statelock);
8027
8028 if (nvp) {
8029 VN_RELE(nvp);
8030 }
8031 goto link_call;
8032 }
8033
8034 if (error) {
8035 VN_RELE(ovp);
8036 nfs_rw_exit(&odrp->r_rwlock);
8037 nfs_rw_exit(&ndrp->r_rwlock);
8038 if (nvp) {
8039 VN_RELE(nvp);
8040 }
8041 return (error);
8042 }
8043
8044 /*
8045 * when renaming directories to be a subdirectory of a
8046 * different parent, the dnlc entry for ".." will no
8047 * longer be valid, so it must be removed
8048 */
8049 rp = VTOR4(ovp);
8050 if (ndvp != odvp) {
8051 if (ovp->v_type == VDIR) {
8052 dnlc_remove(ovp, "..");
8053 if (rp->r_dir != NULL)
8054 nfs4_purge_rddir_cache(ovp);
8055 }
8056 }
8057
8058 /*
8059 * If we are renaming the unlinked file, update the
8060 * r_unldvp and r_unlname as needed.
8061 */
8062 mutex_enter(&rp->r_statelock);
8063 if (rp->r_unldvp != NULL) {
8064 if (strcmp(rp->r_unlname, onm) == 0) {
8065 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
8066 rp->r_unlname[MAXNAMELEN - 1] = '\0';
8067 if (ndvp != rp->r_unldvp) {
8068 VN_RELE(rp->r_unldvp);
8069 rp->r_unldvp = ndvp;
8070 VN_HOLD(ndvp);
8071 }
8072 }
8073 }
8074 mutex_exit(&rp->r_statelock);
8075
8076 /*
8077 * Notify the rename vnevents to source vnode, and to the target
8078 * vnode if it already existed.
8079 */
8080 if (error == 0) {
8081 vnode_t *tvp, *tovp;
8082 rnode4_t *trp;
8083
8084 /*
8085 * Notify the vnode. Each links is represented by
8086 * a different vnode, in nfsv4.
8087 */
8088 if (nvp) {
8089 trp = VTOR4(nvp);
8090 tvp = nvp;
8091 if (IS_SHADOW(nvp, trp))
8092 tvp = RTOV4(trp);
8093 vnevent_rename_dest(tvp, ndvp, nnm, ct);
8094 }
8095
8096 trp = VTOR4(ovp);
8097 tovp = ovp;
8098 if (IS_SHADOW(ovp, trp))
8099 tovp = RTOV4(trp);
8100
8101 vnevent_rename_src(tovp, odvp, onm, ct);
8102
8103 trp = VTOR4(ndvp);
8104 tvp = ndvp;
8105
8106 if (IS_SHADOW(ndvp, trp))
8107 tvp = RTOV4(trp);
8108
8109 vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
8110 }
8111
8112 if (nvp) {
8113 VN_RELE(nvp);
8114 }
8115 VN_RELE(ovp);
8116
8117 nfs_rw_exit(&odrp->r_rwlock);
8118 nfs_rw_exit(&ndrp->r_rwlock);
8119
8120 return (error);
8121 }
8122
8123 /*
8124 * When the parent directory has changed, sv_dfh must be updated
8125 */
8126 static void
8127 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8128 {
8129 svnode_t *sv = VTOSV(vp);
8130 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8131 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8132
8133 sfh4_hold(new_dfh);
8134 sv->sv_dfh = new_dfh;
8135 sfh4_rele(&old_dfh);
8136 }
8137
8138 /*
8139 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8140 * when it is known that the filehandle is persistent through rename.
8141 *
8142 * Rename requires that the current fh be the target directory and the
8143 * saved fh be the source directory. After the operation, the current fh
8144 * is unchanged.
8145 * The compound op structure for persistent fh rename is:
8146 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME
8147 * Rather than bother with the directory postop args, we'll simply
8148 * update that a change occurred in the cache, so no post-op getattrs.
8149 */
8150 static int
8151 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8152 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8153 {
8154 COMPOUND4args_clnt args;
8155 COMPOUND4res_clnt res, *resp = NULL;
8156 nfs_argop4 *argop;
8157 nfs_resop4 *resop;
8158 int doqueue, argoplist_size;
8159 mntinfo4_t *mi;
8160 rnode4_t *odrp = VTOR4(odvp);
8161 rnode4_t *ndrp = VTOR4(ndvp);
8162 RENAME4res *rn_res;
8163 bool_t needrecov;
8164 nfs4_recov_state_t recov_state;
8165 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8166 dirattr_info_t dinfo, *dinfop;
8167
8168 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8169
8170 recov_state.rs_flags = 0;
8171 recov_state.rs_num_retry_despite_err = 0;
8172
8173 /*
8174 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8175 *
8176 * If source/target are different dirs, then append putfh(src); getattr
8177 */
8178 args.array_len = (odvp == ndvp) ? 5 : 7;
8179 argoplist_size = args.array_len * sizeof (nfs_argop4);
8180 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8181
8182 recov_retry:
8183 *statp = NFS4_OK;
8184
8185 /* No need to Lookup the file, persistent fh */
8186 args.ctag = TAG_RENAME;
8187
8188 mi = VTOMI4(odvp);
8189 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8190 if (e.error) {
8191 kmem_free(argop, argoplist_size);
8192 return (e.error);
8193 }
8194
8195 /* 0: putfh source directory */
8196 argop[0].argop = OP_CPUTFH;
8197 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8198
8199 /* 1: Save source fh to free up current for target */
8200 argop[1].argop = OP_SAVEFH;
8201
8202 /* 2: putfh targetdir */
8203 argop[2].argop = OP_CPUTFH;
8204 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8205
8206 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8207 argop[3].argop = OP_CRENAME;
8208 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8209 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8210
8211 /* 4: getattr (targetdir) */
8212 argop[4].argop = OP_GETATTR;
8213 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8214 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8215
8216 if (ndvp != odvp) {
8217
8218 /* 5: putfh (sourcedir) */
8219 argop[5].argop = OP_CPUTFH;
8220 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8221
8222 /* 6: getattr (sourcedir) */
8223 argop[6].argop = OP_GETATTR;
8224 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8225 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8226 }
8227
8228 dnlc_remove(odvp, onm);
8229 dnlc_remove(ndvp, nnm);
8230
8231 doqueue = 1;
8232 dinfo.di_time_call = gethrtime();
8233 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8234
8235 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8236 if (e.error) {
8237 PURGE_ATTRCACHE4(odvp);
8238 PURGE_ATTRCACHE4(ndvp);
8239 } else {
8240 *statp = res.status;
8241 }
8242
8243 if (needrecov) {
8244 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8245 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8246 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8247 if (!e.error)
8248 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8249 goto recov_retry;
8250 }
8251 }
8252
8253 if (!e.error) {
8254 resp = &res;
8255 /*
8256 * as long as OP_RENAME
8257 */
8258 if (res.status != NFS4_OK && res.array_len <= 4) {
8259 e.error = geterrno4(res.status);
8260 PURGE_ATTRCACHE4(odvp);
8261 PURGE_ATTRCACHE4(ndvp);
8262 /*
8263 * System V defines rename to return EEXIST, not
8264 * ENOTEMPTY if the target directory is not empty.
8265 * Over the wire, the error is NFSERR_ENOTEMPTY
8266 * which geterrno4 maps to ENOTEMPTY.
8267 */
8268 if (e.error == ENOTEMPTY)
8269 e.error = EEXIST;
8270 } else {
8271
8272 resop = &res.array[3]; /* rename res */
8273 rn_res = &resop->nfs_resop4_u.oprename;
8274
8275 if (res.status == NFS4_OK) {
8276 /*
8277 * Update target attribute, readdir and dnlc
8278 * caches.
8279 */
8280 dinfo.di_garp =
8281 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8282 dinfo.di_cred = cr;
8283 dinfop = &dinfo;
8284 } else
8285 dinfop = NULL;
8286
8287 nfs4_update_dircaches(&rn_res->target_cinfo,
8288 ndvp, NULL, NULL, dinfop);
8289
8290 /*
8291 * Update source attribute, readdir and dnlc caches
8292 *
8293 */
8294 if (ndvp != odvp) {
8295 update_parentdir_sfh(renvp, ndvp);
8296
8297 if (dinfop)
8298 dinfo.di_garp =
8299 &(res.array[6].nfs_resop4_u.
8300 opgetattr.ga_res);
8301
8302 nfs4_update_dircaches(&rn_res->source_cinfo,
8303 odvp, NULL, NULL, dinfop);
8304 }
8305
8306 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8307 nnm);
8308 }
8309 }
8310
8311 if (resp)
8312 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8313 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8314 kmem_free(argop, argoplist_size);
8315
8316 return (e.error);
8317 }
8318
8319 /*
8320 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8321 * it is possible for the filehandle to change due to the rename.
8322 *
8323 * The compound req in this case includes a post-rename lookup and getattr
8324 * to ensure that we have the correct fh and attributes for the object.
8325 *
8326 * Rename requires that the current fh be the target directory and the
8327 * saved fh be the source directory. After the operation, the current fh
8328 * is unchanged.
8329 *
8330 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8331 * update the filehandle for the renamed object. We also get the old
8332 * filehandle for historical reasons; this should be taken out sometime.
8333 * This results in a rather cumbersome compound...
8334 *
8335 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8336 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8337 *
8338 */
8339 static int
8340 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8341 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8342 {
8343 COMPOUND4args_clnt args;
8344 COMPOUND4res_clnt res, *resp = NULL;
8345 int argoplist_size;
8346 nfs_argop4 *argop;
8347 nfs_resop4 *resop;
8348 int doqueue;
8349 mntinfo4_t *mi;
8350 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8351 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8352 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8353 RENAME4res *rn_res;
8354 GETFH4res *ngf_res;
8355 bool_t needrecov;
8356 nfs4_recov_state_t recov_state;
8357 hrtime_t t;
8358 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8359 dirattr_info_t dinfo, *dinfop = &dinfo;
8360
8361 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8362
8363 recov_state.rs_flags = 0;
8364 recov_state.rs_num_retry_despite_err = 0;
8365
8366 recov_retry:
8367 *statp = NFS4_OK;
8368
8369 /*
8370 * There is a window between the RPC and updating the path and
8371 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8372 * code, so that it doesn't try to use the old path during that
8373 * window.
8374 */
8375 mutex_enter(&orp->r_statelock);
8376 while (orp->r_flags & R4RECEXPFH) {
8377 klwp_t *lwp = ttolwp(curthread);
8378
8379 if (lwp != NULL)
8380 lwp->lwp_nostop++;
8381 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8382 mutex_exit(&orp->r_statelock);
8383 if (lwp != NULL)
8384 lwp->lwp_nostop--;
8385 return (EINTR);
8386 }
8387 if (lwp != NULL)
8388 lwp->lwp_nostop--;
8389 }
8390 orp->r_flags |= R4RECEXPFH;
8391 mutex_exit(&orp->r_statelock);
8392
8393 mi = VTOMI4(odvp);
8394
8395 args.ctag = TAG_RENAME_VFH;
8396 args.array_len = (odvp == ndvp) ? 10 : 12;
8397 argoplist_size = args.array_len * sizeof (nfs_argop4);
8398 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8399
8400 /*
8401 * Rename ops:
8402 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8403 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8404 * LOOKUP(trgt), GETFH(new), GETATTR,
8405 *
8406 * if (odvp != ndvp)
8407 * add putfh(sourcedir), getattr(sourcedir) }
8408 */
8409 args.array = argop;
8410
8411 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8412 &recov_state, NULL);
8413 if (e.error) {
8414 kmem_free(argop, argoplist_size);
8415 mutex_enter(&orp->r_statelock);
8416 orp->r_flags &= ~R4RECEXPFH;
8417 cv_broadcast(&orp->r_cv);
8418 mutex_exit(&orp->r_statelock);
8419 return (e.error);
8420 }
8421
8422 /* 0: putfh source directory */
8423 argop[0].argop = OP_CPUTFH;
8424 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8425
8426 /* 1: Save source fh to free up current for target */
8427 argop[1].argop = OP_SAVEFH;
8428
8429 /* 2: Lookup pre-rename fh of renamed object */
8430 argop[2].argop = OP_CLOOKUP;
8431 argop[2].nfs_argop4_u.opclookup.cname = onm;
8432
8433 /* 3: getfh fh of renamed object (before rename) */
8434 argop[3].argop = OP_GETFH;
8435
8436 /* 4: putfh targetdir */
8437 argop[4].argop = OP_CPUTFH;
8438 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8439
8440 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8441 argop[5].argop = OP_CRENAME;
8442 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8443 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8444
8445 /* 6: getattr of target dir (post op attrs) */
8446 argop[6].argop = OP_GETATTR;
8447 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8448 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8449
8450 /* 7: Lookup post-rename fh of renamed object */
8451 argop[7].argop = OP_CLOOKUP;
8452 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8453
8454 /* 8: getfh fh of renamed object (after rename) */
8455 argop[8].argop = OP_GETFH;
8456
8457 /* 9: getattr of renamed object */
8458 argop[9].argop = OP_GETATTR;
8459 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8460 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8461
8462 /*
8463 * If source/target dirs are different, then get new post-op
8464 * attrs for source dir also.
8465 */
8466 if (ndvp != odvp) {
8467 /* 10: putfh (sourcedir) */
8468 argop[10].argop = OP_CPUTFH;
8469 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8470
8471 /* 11: getattr (sourcedir) */
8472 argop[11].argop = OP_GETATTR;
8473 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8474 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8475 }
8476
8477 dnlc_remove(odvp, onm);
8478 dnlc_remove(ndvp, nnm);
8479
8480 doqueue = 1;
8481 t = gethrtime();
8482 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8483
8484 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8485 if (e.error) {
8486 PURGE_ATTRCACHE4(odvp);
8487 PURGE_ATTRCACHE4(ndvp);
8488 if (!needrecov) {
8489 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8490 &recov_state, needrecov);
8491 goto out;
8492 }
8493 } else {
8494 *statp = res.status;
8495 }
8496
8497 if (needrecov) {
8498 bool_t abort;
8499
8500 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8501 OP_RENAME, NULL, NULL, NULL);
8502 if (abort == FALSE) {
8503 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8504 &recov_state, needrecov);
8505 kmem_free(argop, argoplist_size);
8506 if (!e.error)
8507 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8508 mutex_enter(&orp->r_statelock);
8509 orp->r_flags &= ~R4RECEXPFH;
8510 cv_broadcast(&orp->r_cv);
8511 mutex_exit(&orp->r_statelock);
8512 goto recov_retry;
8513 } else {
8514 if (e.error != 0) {
8515 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8516 &recov_state, needrecov);
8517 goto out;
8518 }
8519 /* fall through for res.status case */
8520 }
8521 }
8522
8523 resp = &res;
8524 /*
8525 * If OP_RENAME (or any prev op) failed, then return an error.
8526 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8527 */
8528 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8529 /*
8530 * Error in an op other than last Getattr
8531 */
8532 e.error = geterrno4(res.status);
8533 PURGE_ATTRCACHE4(odvp);
8534 PURGE_ATTRCACHE4(ndvp);
8535 /*
8536 * System V defines rename to return EEXIST, not
8537 * ENOTEMPTY if the target directory is not empty.
8538 * Over the wire, the error is NFSERR_ENOTEMPTY
8539 * which geterrno4 maps to ENOTEMPTY.
8540 */
8541 if (e.error == ENOTEMPTY)
8542 e.error = EEXIST;
8543 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8544 needrecov);
8545 goto out;
8546 }
8547
8548 /* rename results */
8549 rn_res = &res.array[5].nfs_resop4_u.oprename;
8550
8551 if (res.status == NFS4_OK) {
8552 /* Update target attribute, readdir and dnlc caches */
8553 dinfo.di_garp =
8554 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8555 dinfo.di_cred = cr;
8556 dinfo.di_time_call = t;
8557 } else
8558 dinfop = NULL;
8559
8560 /* Update source cache attribute, readdir and dnlc caches */
8561 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8562
8563 /* Update source cache attribute, readdir and dnlc caches */
8564 if (ndvp != odvp) {
8565 update_parentdir_sfh(ovp, ndvp);
8566
8567 /*
8568 * If dinfop is non-NULL, then compound succeded, so
8569 * set di_garp to attrs for source dir. dinfop is only
8570 * set to NULL when compound fails.
8571 */
8572 if (dinfop)
8573 dinfo.di_garp =
8574 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8575 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8576 dinfop);
8577 }
8578
8579 /*
8580 * Update the rnode with the new component name and args,
8581 * and if the file handle changed, also update it with the new fh.
8582 * This is only necessary if the target object has an rnode
8583 * entry and there is no need to create one for it.
8584 */
8585 resop = &res.array[8]; /* getfh new res */
8586 ngf_res = &resop->nfs_resop4_u.opgetfh;
8587
8588 /*
8589 * Update the path and filehandle for the renamed object.
8590 */
8591 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8592
8593 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8594
8595 if (res.status == NFS4_OK) {
8596 resop++; /* getattr res */
8597 e.error = nfs4_update_attrcache(res.status,
8598 &resop->nfs_resop4_u.opgetattr.ga_res,
8599 t, ovp, cr);
8600 }
8601
8602 out:
8603 kmem_free(argop, argoplist_size);
8604 if (resp)
8605 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8606 mutex_enter(&orp->r_statelock);
8607 orp->r_flags &= ~R4RECEXPFH;
8608 cv_broadcast(&orp->r_cv);
8609 mutex_exit(&orp->r_statelock);
8610
8611 return (e.error);
8612 }
8613
8614 /* ARGSUSED */
8615 static int
8616 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8617 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8618 {
8619 int error;
8620 vnode_t *vp;
8621
8622 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8623 return (EPERM);
8624 /*
8625 * As ".." has special meaning and rather than send a mkdir
8626 * over the wire to just let the server freak out, we just
8627 * short circuit it here and return EEXIST
8628 */
8629 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8630 return (EEXIST);
8631
8632 /*
8633 * Decision to get the right gid and setgid bit of the
8634 * new directory is now made in call_nfs4_create_req.
8635 */
8636 va->va_mask |= AT_MODE;
8637 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8638 if (error)
8639 return (error);
8640
8641 *vpp = vp;
8642 return (0);
8643 }
8644
8645
8646 /*
8647 * rmdir is using the same remove v4 op as does remove.
8648 * Remove requires that the current fh be the target directory.
8649 * After the operation, the current fh is unchanged.
8650 * The compound op structure is:
8651 * PUTFH(targetdir), REMOVE
8652 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * As ".." has special meaning and rather than send a rmdir
	 * over the wire to just let the server freak out, we just
	 * short circuit it here and return EEXIST
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/* Serialize against other directory modifications. */
	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If the vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged. First, try removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s). If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove
	 */
	args.array_len = 3;
	args.array = argop;

	/* Begin the over-the-wire operation; may block for recovery. */
	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	/* The removed directory's cached attributes are now stale. */
	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		/* RPC-level failure; parent dir attrs are suspect too. */
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		/*
		 * Kick off recovery; if it is not aborted, free the
		 * results, end this op and retry the whole compound.
		 */
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return error if first 2 ops (OP_REMOVE or earlier)
		 * failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY if the directory is not empty. Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1]; /* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				/* Postop attrs of the parent dir. */
				resop = &res.array[2]; /* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Fire the rmdir vnevent on the "real" vnode (not a
		 * shadow) so event watchers see the canonical vnode.
		 */
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}
8856
/* ARGSUSED */
static int
nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *vp;
	rnode4_t *rp;
	char *contents;
	mntinfo4_t *mi = VTOMI4(dvp);

	if (nfs_zone() != mi->mi_zone)
		return (EPERM);
	/* Server must have advertised symlink support. */
	if (!(mi->mi_flags & MI4_SYMLINK))
		return (EOPNOTSUPP);

	/* Create the symlink over the wire; vp is the new link's vnode. */
	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
	if (error)
		return (error);

	ASSERT(nfs4_consistent_type(vp));
	rp = VTOR4(vp);
	/*
	 * Opportunistically seed the symlink-contents cache with the
	 * target path we just created, so a subsequent readlink can be
	 * satisfied without an OTW call.  Allocation is done outside
	 * r_statelock; a second check under the lock resolves the race
	 * with another thread having filled the cache in the meantime.
	 */
	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {

		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		if (contents != NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				/* We won the race; install our copy. */
				rp->r_symlink.len = strlen(tnm);
				bcopy(tnm, contents, rp->r_symlink.len);
				rp->r_symlink.contents = contents;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				/* Lost the race; discard our buffer. */
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)contents, MAXPATHLEN);
			}
		}
	}
	VN_RELE(vp);

	return (error);
}
8901
8902
8903 /*
8904 * Read directory entries.
8905 * There are some weird things to look out for here. The uio_loffset
8906 * field is either 0 or it is the offset returned from a previous
8907 * readdir. It is an opaque value used by the server to find the
8908 * correct directory block to read. The count field is the number
8909 * of blocks to read on the server. This is advisory only, the server
8910 * may return only one block's worth of entries. Entries may be compressed
8911 * on the server.
8912 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	/* Caller (VOP layer) must already hold the rnode rwlock. */
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	/* Bytes the caller wants, capped at the max block size. */
	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once. This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry. Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		/* Mark the entry in-progress before dropping the lock. */
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead. Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated. Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead. In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	/* Readahead entry already filled (or in progress); nothing to do. */
	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}
9107
9108 static int
9109 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9110 {
9111 int error;
9112 rnode4_t *rp;
9113
9114 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9115
9116 rp = VTOR4(vp);
9117
9118 /*
9119 * Obtain the readdir results for the caller.
9120 */
9121 nfs4readdir(vp, rdc, cr);
9122
9123 mutex_enter(&rp->r_statelock);
9124 /*
9125 * The entry is now complete
9126 */
9127 rdc->flags &= ~RDDIR;
9128
9129 error = rdc->error;
9130 if (error)
9131 rdc->flags |= RDDIRREQ;
9132 rddir4_cache_rele(rp, rdc);
9133 mutex_exit(&rp->r_statelock);
9134
9135 return (error);
9136 }
9137
9138 /*
9139 * Read directory entries.
9140 * There are some weird things to look out for here. The uio_loffset
9141 * field is either 0 or it is the offset returned from a previous
9142 * readdir. It is an opaque value used by the server to find the
9143 * correct directory block to read. The count field is the number
9144 * of blocks to read on the server. This is advisory only, the server
9145 * may return only one block's worth of entries. Entries may be compressed
9146 * on the server.
9147 *
9148 * Generates the following compound request:
9149 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9150 * must include a Lookupp as well. In this case, send:
9151 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9152 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9153 *
9154 * Get complete attributes and filehandles for entries if this is the
9155 * first read of the directory. Otherwise, just get fileid's.
9156 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents. This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 *
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		if (rp->r_flags & R4READDIRWATTR) {
			/* One-shot flag: consume it under the statelock. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid. maybe we should verify that fileid is supported
		 * and request something else if not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		/* r_cookieverf4 is protected by the statelock. */
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count < than the minimum required, we return no entries
	 * and fail with EINVAL
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL, NULL, NULL);
		if (abort == FALSE) {
			/*
			 * Recovery will proceed: release our resources
			 * (results, partially-filled entries) and retry
			 * the whole compound from scratch.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here. Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x. In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm. We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR. However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* Remember the server's cookie verifier for subsequent reads. */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 * seek(cookie=0) -> "." entry with d_off = 1
	 * seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh   */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}
9524
9525
/*
 * Perform the buffer I/O described by bp: read the block from the
 * server, or write it back, retrying with alternate OTW credentials
 * on EACCES until nfs4_get_otw_cred_by_osp says there are none left.
 * Returns 0, an errno, or NFS_EOF for a read entirely past EOF.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
	read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF. Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			/* Retry the read with the next candidate cred. */
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
		write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/* Clamp the write to the current file size. */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				/* Retry the write with the next cred. */
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status. Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations. Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}
9675
9676 /* ARGSUSED */
9677 int
9678 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9679 {
9680 return (EREMOTE);
9681 }
9682
9683 /* ARGSUSED2 */
9684 int
9685 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9686 {
9687 rnode4_t *rp = VTOR4(vp);
9688
9689 if (!write_lock) {
9690 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9691 return (V_WRITELOCK_FALSE);
9692 }
9693
9694 if ((rp->r_flags & R4DIRECTIO) ||
9695 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9696 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9697 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9698 return (V_WRITELOCK_FALSE);
9699 nfs_rw_exit(&rp->r_rwlock);
9700 }
9701
9702 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9703 return (V_WRITELOCK_TRUE);
9704 }
9705
9706 /* ARGSUSED */
9707 void
9708 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9709 {
9710 rnode4_t *rp = VTOR4(vp);
9711
9712 nfs_rw_exit(&rp->r_rwlock);
9713 }
9714
9715 /* ARGSUSED */
9716 static int
9717 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9718 {
9719 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9720 return (EIO);
9721
9722 /*
9723 * Because we stuff the readdir cookie into the offset field
9724 * someone may attempt to do an lseek with the cookie which
9725 * we want to succeed.
9726 */
9727 if (vp->v_type == VDIR)
9728 return (0);
9729 if (*noffp < 0)
9730 return (EINVAL);
9731 return (0);
9732 }
9733
9734
9735 /*
9736 * Return all the pages from [off..off+len) in file
9737 */
/* ARGSUSED */
static int
nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;
	int error;
	mntinfo4_t *mi;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);
	/* Always operate on the "real" vnode, not a shadow. */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (protp != NULL)
		*protp = PROT_ALL;

	/*
	 * Now validate that the caches are up to date.
	 */
	if (error = nfs4_validate_caches(vp, cr))
		return (error);

	mi = VTOMI4(vp);
retry:
	mutex_enter(&rp->r_statelock);

	/*
	 * Don't create dirty pages faster than they
	 * can be cleaned so that the system doesn't
	 * get imbalanced. If the async queue is
	 * maxed out, then wait for it to drain before
	 * creating more dirty pages. Also, wait for
	 * any threads doing pagewalks in the vop_getattr
	 * entry points so that they don't block for
	 * long periods.
	 */
	if (rw == S_CREATE) {
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
	}

	/*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation the local file size might not be extended yet.
	 * In this case we want to be able to return pages of zeroes.
	 */
	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
		NFS4_DEBUG(nfs4_pageio_debug,
		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
		    "len=%llu, size=%llu, attrsize =%llu", off,
		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
		mutex_exit(&rp->r_statelock);
		return (EFAULT);	/* beyond EOF */
	}

	mutex_exit(&rp->r_statelock);

	error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
	    pl, plsz, seg, addr, rw, cr);
	NFS4_DEBUG(nfs4_pageio_debug && error,
	    (CE_NOTE, "getpages error %d; off=%lld, len=%lld",
	    error, off, (u_longlong_t)len));

	/*
	 * NFS_EOF means the cached file size was stale: purge the
	 * caches (keeping the DNLC) and retry with fresh attributes.
	 */
	switch (error) {
	case NFS_EOF:
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
		goto retry;
	case ESTALE:
		nfs4_purge_stale_fh(error, vp, cr);
	}

	return (error);
}
9819
9820 /*
9821 * Called from pvn_getpages to get a particular page.
9822 */
9823 /* ARGSUSED */
9824 static int
9825 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9826 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9827 enum seg_rw rw, cred_t *cr)
9828 {
9829 rnode4_t *rp;
9830 uint_t bsize;
9831 struct buf *bp;
9832 page_t *pp;
9833 u_offset_t lbn;
9834 u_offset_t io_off;
9835 u_offset_t blkoff;
9836 u_offset_t rablkoff;
9837 size_t io_len;
9838 uint_t blksize;
9839 int error;
9840 int readahead;
9841 int readahead_issued = 0;
9842 int ra_window; /* readahead window */
9843 page_t *pagefound;
9844 page_t *savepp;
9845
9846 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9847 return (EIO);
9848
9849 rp = VTOR4(vp);
9850 ASSERT(!IS_SHADOW(vp, rp));
9851 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9852
9853 reread:
9854 bp = NULL;
9855 pp = NULL;
9856 pagefound = NULL;
9857
9858 if (pl != NULL)
9859 pl[0] = NULL;
9860
9861 error = 0;
9862 lbn = off / bsize;
9863 blkoff = lbn * bsize;
9864
9865 /*
9866 * Queueing up the readahead before doing the synchronous read
9867 * results in a significant increase in read throughput because
9868 * of the increased parallelism between the async threads and
9869 * the process context.
9870 */
9871 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9872 rw != S_CREATE &&
9873 !(vp->v_flag & VNOCACHE)) {
9874 mutex_enter(&rp->r_statelock);
9875
9876 /*
9877 * Calculate the number of readaheads to do.
9878 * a) No readaheads at offset = 0.
9879 * b) Do maximum(nfs4_nra) readaheads when the readahead
9880 * window is closed.
9881 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9882 * upon how far the readahead window is open or close.
9883 * d) No readaheads if rp->r_nextr is not within the scope
9884 * of the readahead window (random i/o).
9885 */
9886
9887 if (off == 0)
9888 readahead = 0;
9889 else if (blkoff == rp->r_nextr)
9890 readahead = nfs4_nra;
9891 else if (rp->r_nextr > blkoff &&
9892 ((ra_window = (rp->r_nextr - blkoff) / bsize)
9893 <= (nfs4_nra - 1)))
9894 readahead = nfs4_nra - ra_window;
9895 else
9896 readahead = 0;
9897
9898 rablkoff = rp->r_nextr;
9899 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9900 mutex_exit(&rp->r_statelock);
9901 if (nfs4_async_readahead(vp, rablkoff + bsize,
9902 addr + (rablkoff + bsize - off),
9903 seg, cr, nfs4_readahead) < 0) {
9904 mutex_enter(&rp->r_statelock);
9905 break;
9906 }
9907 readahead--;
9908 rablkoff += bsize;
9909 /*
9910 * Indicate that we did a readahead so
9911 * readahead offset is not updated
9912 * by the synchronous read below.
9913 */
9914 readahead_issued = 1;
9915 mutex_enter(&rp->r_statelock);
9916 /*
9917 * set readahead offset to
9918 * offset of last async readahead
9919 * request.
9920 */
9921 rp->r_nextr = rablkoff;
9922 }
9923 mutex_exit(&rp->r_statelock);
9924 }
9925
9926 again:
9927 if ((pagefound = page_exists(vp, off)) == NULL) {
9928 if (pl == NULL) {
9929 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9930 nfs4_readahead);
9931 } else if (rw == S_CREATE) {
9932 /*
9933 * Block for this page is not allocated, or the offset
9934 * is beyond the current allocation size, or we're
9935 * allocating a swap slot and the page was not found,
9936 * so allocate it and return a zero page.
9937 */
9938 if ((pp = page_create_va(vp, off,
9939 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9940 cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9941 io_len = PAGESIZE;
9942 mutex_enter(&rp->r_statelock);
9943 rp->r_nextr = off + PAGESIZE;
9944 mutex_exit(&rp->r_statelock);
9945 } else {
9946 /*
9947 * Need to go to server to get a block
9948 */
9949 mutex_enter(&rp->r_statelock);
9950 if (blkoff < rp->r_size &&
9951 blkoff + bsize > rp->r_size) {
9952 /*
9953 * If less than a block left in
9954 * file read less than a block.
9955 */
9956 if (rp->r_size <= off) {
9957 /*
9958 * Trying to access beyond EOF,
9959 * set up to get at least one page.
9960 */
9961 blksize = off + PAGESIZE - blkoff;
9962 } else
9963 blksize = rp->r_size - blkoff;
9964 } else if ((off == 0) ||
9965 (off != rp->r_nextr && !readahead_issued)) {
9966 blksize = PAGESIZE;
9967 blkoff = off; /* block = page here */
9968 } else
9969 blksize = bsize;
9970 mutex_exit(&rp->r_statelock);
9971
9972 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9973 &io_len, blkoff, blksize, 0);
9974
9975 /*
9976 * Some other thread has entered the page,
9977 * so just use it.
9978 */
9979 if (pp == NULL)
9980 goto again;
9981
9982 /*
9983 * Now round the request size up to page boundaries.
9984 * This ensures that the entire page will be
9985 * initialized to zeroes if EOF is encountered.
9986 */
9987 io_len = ptob(btopr(io_len));
9988
9989 bp = pageio_setup(pp, io_len, vp, B_READ);
9990 ASSERT(bp != NULL);
9991
9992 /*
9993 * pageio_setup should have set b_addr to 0. This
9994 * is correct since we want to do I/O on a page
9995 * boundary. bp_mapin will use this addr to calculate
9996 * an offset, and then set b_addr to the kernel virtual
9997 * address it allocated for us.
9998 */
9999 ASSERT(bp->b_un.b_addr == 0);
10000
10001 bp->b_edev = 0;
10002 bp->b_dev = 0;
10003 bp->b_lblkno = lbtodb(io_off);
10004 bp->b_file = vp;
10005 bp->b_offset = (offset_t)off;
10006 bp_mapin(bp);
10007
10008 /*
10009 * If doing a write beyond what we believe is EOF,
10010 * don't bother trying to read the pages from the
10011 * server, we'll just zero the pages here. We
10012 * don't check that the rw flag is S_WRITE here
10013 * because some implementations may attempt a
10014 * read access to the buffer before copying data.
10015 */
10016 mutex_enter(&rp->r_statelock);
10017 if (io_off >= rp->r_size && seg == segkmap) {
10018 mutex_exit(&rp->r_statelock);
10019 bzero(bp->b_un.b_addr, io_len);
10020 } else {
10021 mutex_exit(&rp->r_statelock);
10022 error = nfs4_bio(bp, NULL, cr, FALSE);
10023 }
10024
10025 /*
10026 * Unmap the buffer before freeing it.
10027 */
10028 bp_mapout(bp);
10029 pageio_done(bp);
10030
10031 savepp = pp;
10032 do {
10033 pp->p_fsdata = C_NOCOMMIT;
10034 } while ((pp = pp->p_next) != savepp);
10035
10036 if (error == NFS_EOF) {
10037 /*
10038 * If doing a write system call just return
10039 * zeroed pages, else user tried to get pages
10040 * beyond EOF, return error. We don't check
10041 * that the rw flag is S_WRITE here because
10042 * some implementations may attempt a read
10043 * access to the buffer before copying data.
10044 */
10045 if (seg == segkmap)
10046 error = 0;
10047 else
10048 error = EFAULT;
10049 }
10050
10051 if (!readahead_issued && !error) {
10052 mutex_enter(&rp->r_statelock);
10053 rp->r_nextr = io_off + io_len;
10054 mutex_exit(&rp->r_statelock);
10055 }
10056 }
10057 }
10058
10059 out:
10060 if (pl == NULL)
10061 return (error);
10062
10063 if (error) {
10064 if (pp != NULL)
10065 pvn_read_done(pp, B_ERROR);
10066 return (error);
10067 }
10068
10069 if (pagefound) {
10070 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10071
10072 /*
10073 * Page exists in the cache, acquire the appropriate lock.
10074 * If this fails, start all over again.
10075 */
10076 if ((pp = page_lookup(vp, off, se)) == NULL) {
10077 #ifdef DEBUG
10078 nfs4_lostpage++;
10079 #endif
10080 goto reread;
10081 }
10082 pl[0] = pp;
10083 pl[1] = NULL;
10084 return (0);
10085 }
10086
10087 if (pp != NULL)
10088 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10089
10090 return (error);
10091 }
10092
/*
 * Asynchronous readahead worker, invoked via nfs4_async_readahead().
 * Reads the (possibly klustered) block at 'blkoff' from the server into
 * the page cache.  Runs in an async thread, so this is best-effort: any
 * failure simply abandons the readahead (pvn_read_done() destroys the
 * pages on error) and rewinds r_nextr so the data is retried on demand.
 */
static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
    cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0. This is correct since
	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
	 * to calculate an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		error = nfs4_bio(bp, NULL, cr, TRUE);
		/* Hitting EOF on a readahead is not an error. */
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	/* Mark every page in the kluster as not-committed. */
	savepp = pp;
	do {
		pp->p_fsdata = C_NOCOMMIT;
	} while ((pp = pp->p_next) != savepp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set readahead offset
	 * to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
	 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		/* Re-check under the lock; unlocked test above is a hint. */
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
10201
10202 /*
10203 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10204 * If len == 0, do from off to EOF.
10205 *
10206 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10207 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10208 * (from pageout).
10209 */
10210 /* ARGSUSED */
10211 static int
10212 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10213 caller_context_t *ct)
10214 {
10215 int error;
10216 rnode4_t *rp;
10217
10218 ASSERT(cr != NULL);
10219
10220 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10221 return (EIO);
10222
10223 rp = VTOR4(vp);
10224 if (IS_SHADOW(vp, rp))
10225 vp = RTOV4(rp);
10226
10227 /*
10228 * XXX - Why should this check be made here?
10229 */
10230 if (vp->v_flag & VNOMAP)
10231 return (ENOSYS);
10232
10233 if (len == 0 && !(flags & B_INVAL) &&
10234 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10235 return (0);
10236
10237 mutex_enter(&rp->r_statelock);
10238 rp->r_count++;
10239 mutex_exit(&rp->r_statelock);
10240 error = nfs4_putpages(vp, off, len, flags, cr);
10241 mutex_enter(&rp->r_statelock);
10242 rp->r_count--;
10243 cv_broadcast(&rp->r_cv);
10244 mutex_exit(&rp->r_statelock);
10245
10246 return (error);
10247 }
10248
10249 /*
10250 * Write out a single page, possibly klustering adjacent dirty pages.
10251 */
10252 int
10253 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10254 int flags, cred_t *cr)
10255 {
10256 u_offset_t io_off;
10257 u_offset_t lbn_off;
10258 u_offset_t lbn;
10259 size_t io_len;
10260 uint_t bsize;
10261 int error;
10262 rnode4_t *rp;
10263
10264 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10265 ASSERT(pp != NULL);
10266 ASSERT(cr != NULL);
10267 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10268
10269 rp = VTOR4(vp);
10270 ASSERT(rp->r_count > 0);
10271 ASSERT(!IS_SHADOW(vp, rp));
10272
10273 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10274 lbn = pp->p_offset / bsize;
10275 lbn_off = lbn * bsize;
10276
10277 /*
10278 * Find a kluster that fits in one block, or in
10279 * one page if pages are bigger than blocks. If
10280 * there is less file space allocated than a whole
10281 * page, we'll shorten the i/o request below.
10282 */
10283 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10284 roundup(bsize, PAGESIZE), flags);
10285
10286 /*
10287 * pvn_write_kluster shouldn't have returned a page with offset
10288 * behind the original page we were given. Verify that.
10289 */
10290 ASSERT((pp->p_offset / bsize) >= lbn);
10291
10292 /*
10293 * Now pp will have the list of kept dirty pages marked for
10294 * write back. It will also handle invalidation and freeing
10295 * of pages that are not dirty. Check for page length rounding
10296 * problems.
10297 */
10298 if (io_off + io_len > lbn_off + bsize) {
10299 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10300 io_len = lbn_off + bsize - io_off;
10301 }
10302 /*
10303 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10304 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10305 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10306 * progress and the r_size has not been made consistent with the
10307 * new size of the file. When the uiomove() completes the r_size is
10308 * updated and the R4MODINPROGRESS flag is cleared.
10309 *
10310 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10311 * consistent value of r_size. Without this handshaking, it is
10312 * possible that nfs4_bio() picks up the old value of r_size
10313 * before the uiomove() in writerp4() completes. This will result
10314 * in the write through nfs4_bio() being dropped.
10315 *
10316 * More precisely, there is a window between the time the uiomove()
10317 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10318 * operation intervenes in this window, the page will be picked up,
10319 * because it is dirty (it will be unlocked, unless it was
10320 * pagecreate'd). When the page is picked up as dirty, the dirty
10321 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10322 * checked. This will still be the old size. Therefore the page will
10323 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10324 * the page will be found to be clean and the write will be dropped.
10325 */
10326 if (rp->r_flags & R4MODINPROGRESS) {
10327 mutex_enter(&rp->r_statelock);
10328 if ((rp->r_flags & R4MODINPROGRESS) &&
10329 rp->r_modaddr + MAXBSIZE > io_off &&
10330 rp->r_modaddr < io_off + io_len) {
10331 page_t *plist;
10332 /*
10333 * A write is in progress for this region of the file.
10334 * If we did not detect R4MODINPROGRESS here then this
10335 * path through nfs_putapage() would eventually go to
10336 * nfs4_bio() and may not write out all of the data
10337 * in the pages. We end up losing data. So we decide
10338 * to set the modified bit on each page in the page
10339 * list and mark the rnode with R4DIRTY. This write
10340 * will be restarted at some later time.
10341 */
10342 plist = pp;
10343 while (plist != NULL) {
10344 pp = plist;
10345 page_sub(&plist, pp);
10346 hat_setmod(pp);
10347 page_io_unlock(pp);
10348 page_unlock(pp);
10349 }
10350 rp->r_flags |= R4DIRTY;
10351 mutex_exit(&rp->r_statelock);
10352 if (offp)
10353 *offp = io_off;
10354 if (lenp)
10355 *lenp = io_len;
10356 return (0);
10357 }
10358 mutex_exit(&rp->r_statelock);
10359 }
10360
10361 if (flags & B_ASYNC) {
10362 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10363 nfs4_sync_putapage);
10364 } else
10365 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10366
10367 if (offp)
10368 *offp = io_off;
10369 if (lenp)
10370 *lenp = io_len;
10371 return (error);
10372 }
10373
/*
 * Synchronously write the kluster [io_off, io_off+io_len) described by
 * 'pp' to the server via nfs4_rdwrlbn().  On space/permission errors
 * the rnode is marked R4OUTOFSPACE and (for non-async callers) the
 * pages are re-pushed with B_INVAL|B_FORCE so they cannot accumulate
 * in memory forever.  Completes the pages with pvn_write_done().
 */
static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & R4OUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful. This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread. It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them. Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs4_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & R4OUTOFSPACE) {
			/* A write succeeded again; clear the sticky flag. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
		/* Under memory pressure, nudge the server to commit now. */
		if (freemem < desfree)
			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
			    NFS4_WRITE_NOWAIT);
	}

	return (error);
}
10436
#ifdef DEBUG
/*
 * Debug tunable: when set, nfs4_map() fails with EIO instead of doing
 * an implicit OTW OPEN for a file that is mmap()ed without an open
 * stream (see the open_and_get_osp() path in nfs4_map()).
 */
int nfs4_force_open_before_mmap = 0;
#endif
10440
10441 /* ARGSUSED */
10442 static int
10443 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10444 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10445 caller_context_t *ct)
10446 {
10447 struct segvn_crargs vn_a;
10448 int error = 0;
10449 rnode4_t *rp = VTOR4(vp);
10450 mntinfo4_t *mi = VTOMI4(vp);
10451
10452 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10453 return (EIO);
10454
10455 if (vp->v_flag & VNOMAP)
10456 return (ENOSYS);
10457
10458 if (off < 0 || (off + len) < 0)
10459 return (ENXIO);
10460
10461 if (vp->v_type != VREG)
10462 return (ENODEV);
10463
10464 /*
10465 * If the file is delegated to the client don't do anything.
10466 * If the file is not delegated, then validate the data cache.
10467 */
10468 mutex_enter(&rp->r_statev4_lock);
10469 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10470 mutex_exit(&rp->r_statev4_lock);
10471 error = nfs4_validate_caches(vp, cr);
10472 if (error)
10473 return (error);
10474 } else {
10475 mutex_exit(&rp->r_statev4_lock);
10476 }
10477
10478 /*
10479 * Check to see if the vnode is currently marked as not cachable.
10480 * This means portions of the file are locked (through VOP_FRLOCK).
10481 * In this case the map request must be refused. We use
10482 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10483 *
10484 * Atomically increment r_inmap after acquiring r_rwlock. The
10485 * idea here is to acquire r_rwlock to block read/write and
10486 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10487 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10488 * and we can prevent the deadlock that would have occurred
10489 * when nfs4_addmap() would have acquired it out of order.
10490 *
10491 * Since we are not protecting r_inmap by any lock, we do not
10492 * hold any lock when we decrement it. We atomically decrement
10493 * r_inmap after we release r_lkserlock.
10494 */
10495
10496 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10497 return (EINTR);
10498 atomic_inc_uint(&rp->r_inmap);
10499 nfs_rw_exit(&rp->r_rwlock);
10500
10501 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10502 atomic_dec_uint(&rp->r_inmap);
10503 return (EINTR);
10504 }
10505
10506 if (vp->v_flag & VNOCACHE) {
10507 error = EAGAIN;
10508 goto done;
10509 }
10510
10511 /*
10512 * Don't allow concurrent locks and mapping if mandatory locking is
10513 * enabled.
10514 */
10515 if (flk_has_remote_locks(vp)) {
10516 struct vattr va;
10517 va.va_mask = AT_MODE;
10518 error = nfs4getattr(vp, &va, cr);
10519 if (error != 0)
10520 goto done;
10521 if (MANDLOCK(vp, va.va_mode)) {
10522 error = EAGAIN;
10523 goto done;
10524 }
10525 }
10526
10527 /*
10528 * It is possible that the rnode has a lost lock request that we
10529 * are still trying to recover, and that the request conflicts with
10530 * this map request.
10531 *
10532 * An alternative approach would be for nfs4_safemap() to consider
10533 * queued lock requests when deciding whether to set or clear
10534 * VNOCACHE. This would require the frlock code path to call
10535 * nfs4_safemap() after enqueing a lost request.
10536 */
10537 if (nfs4_map_lost_lock_conflict(vp)) {
10538 error = EAGAIN;
10539 goto done;
10540 }
10541
10542 as_rangelock(as);
10543 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10544 if (error != 0) {
10545 as_rangeunlock(as);
10546 goto done;
10547 }
10548
10549 if (vp->v_type == VREG) {
10550 /*
10551 * We need to retrieve the open stream
10552 */
10553 nfs4_open_stream_t *osp = NULL;
10554 nfs4_open_owner_t *oop = NULL;
10555
10556 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10557 if (oop != NULL) {
10558 /* returns with 'os_sync_lock' held */
10559 osp = find_open_stream(oop, rp);
10560 open_owner_rele(oop);
10561 }
10562 if (osp == NULL) {
10563 #ifdef DEBUG
10564 if (nfs4_force_open_before_mmap) {
10565 error = EIO;
10566 goto done;
10567 }
10568 #endif
10569 /* returns with 'os_sync_lock' held */
10570 error = open_and_get_osp(vp, cr, &osp);
10571 if (osp == NULL) {
10572 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10573 "nfs4_map: we tried to OPEN the file "
10574 "but again no osp, so fail with EIO"));
10575 goto done;
10576 }
10577 }
10578
10579 if (osp->os_failed_reopen) {
10580 mutex_exit(&osp->os_sync_lock);
10581 open_stream_rele(osp, rp);
10582 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10583 "nfs4_map: os_failed_reopen set on "
10584 "osp %p, cr %p, rp %s", (void *)osp,
10585 (void *)cr, rnode4info(rp)));
10586 error = EIO;
10587 goto done;
10588 }
10589 mutex_exit(&osp->os_sync_lock);
10590 open_stream_rele(osp, rp);
10591 }
10592
10593 vn_a.vp = vp;
10594 vn_a.offset = off;
10595 vn_a.type = (flags & MAP_TYPE);
10596 vn_a.prot = (uchar_t)prot;
10597 vn_a.maxprot = (uchar_t)maxprot;
10598 vn_a.flags = (flags & ~MAP_TYPE);
10599 vn_a.cred = cr;
10600 vn_a.amp = NULL;
10601 vn_a.szc = 0;
10602 vn_a.lgrp_mem_policy_flags = 0;
10603
10604 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10605 as_rangeunlock(as);
10606
10607 done:
10608 nfs_rw_exit(&rp->r_lkserlock);
10609 atomic_dec_uint(&rp->r_inmap);
10610 return (error);
10611 }
10612
10613 /*
10614 * We're most likely dealing with a kernel module that likes to READ
10615 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10616 * officially OPEN the file to create the necessary client state
10617 * for bookkeeping of os_mmap_read/write counts.
10618 *
10619 * Since VOP_MAP only passes in a pointer to the vnode rather than
10620 * a double pointer, we can't handle the case where nfs4open_otw()
10621 * returns a different vnode than the one passed into VOP_MAP (since
10622 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10623 * we return NULL and let nfs4_map() fail. Note: the only case where
10624 * this should happen is if the file got removed and replaced with the
10625 * same name on the server (in addition to the fact that we're trying
10626 * to VOP_MAP withouth VOP_OPENing the file in the first place).
10627 */
10628 static int
10629 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10630 {
10631 rnode4_t *rp, *drp;
10632 vnode_t *dvp, *open_vp;
10633 char file_name[MAXNAMELEN];
10634 int just_created;
10635 nfs4_open_stream_t *osp;
10636 nfs4_open_owner_t *oop;
10637 int error;
10638
10639 *ospp = NULL;
10640 open_vp = map_vp;
10641
10642 rp = VTOR4(open_vp);
10643 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10644 return (error);
10645 drp = VTOR4(dvp);
10646
10647 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10648 VN_RELE(dvp);
10649 return (EINTR);
10650 }
10651
10652 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10653 nfs_rw_exit(&drp->r_rwlock);
10654 VN_RELE(dvp);
10655 return (error);
10656 }
10657
10658 mutex_enter(&rp->r_statev4_lock);
10659 if (rp->created_v4) {
10660 rp->created_v4 = 0;
10661 mutex_exit(&rp->r_statev4_lock);
10662
10663 dnlc_update(dvp, file_name, open_vp);
10664 /* This is needed so we don't bump the open ref count */
10665 just_created = 1;
10666 } else {
10667 mutex_exit(&rp->r_statev4_lock);
10668 just_created = 0;
10669 }
10670
10671 VN_HOLD(map_vp);
10672
10673 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10674 just_created);
10675 if (error) {
10676 nfs_rw_exit(&drp->r_rwlock);
10677 VN_RELE(dvp);
10678 VN_RELE(map_vp);
10679 return (error);
10680 }
10681
10682 nfs_rw_exit(&drp->r_rwlock);
10683 VN_RELE(dvp);
10684
10685 /*
10686 * If nfs4open_otw() returned a different vnode then "undo"
10687 * the open and return failure to the caller.
10688 */
10689 if (!VN_CMP(open_vp, map_vp)) {
10690 nfs4_error_t e;
10691
10692 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10693 "open returned a different vnode"));
10694 /*
10695 * If there's an error, ignore it,
10696 * and let VOP_INACTIVE handle it.
10697 */
10698 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10699 CLOSE_NORM, 0, 0, 0);
10700 VN_RELE(map_vp);
10701 return (EIO);
10702 }
10703
10704 VN_RELE(map_vp);
10705
10706 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10707 if (!oop) {
10708 nfs4_error_t e;
10709
10710 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10711 "no open owner"));
10712 /*
10713 * If there's an error, ignore it,
10714 * and let VOP_INACTIVE handle it.
10715 */
10716 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10717 CLOSE_NORM, 0, 0, 0);
10718 return (EIO);
10719 }
10720 osp = find_open_stream(oop, rp);
10721 open_owner_rele(oop);
10722 *ospp = osp;
10723 return (0);
10724 }
10725
10726 /*
10727 * Please be aware that when this function is called, the address space write
10728 * a_lock is held. Do not put over the wire calls in this function.
10729 */
10730 /* ARGSUSED */
10731 static int
10732 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10733 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10734 caller_context_t *ct)
10735 {
10736 rnode4_t *rp;
10737 int error = 0;
10738 mntinfo4_t *mi;
10739
10740 mi = VTOMI4(vp);
10741 rp = VTOR4(vp);
10742
10743 if (nfs_zone() != mi->mi_zone)
10744 return (EIO);
10745 if (vp->v_flag & VNOMAP)
10746 return (ENOSYS);
10747
10748 /*
10749 * Don't need to update the open stream first, since this
10750 * mmap can't add any additional share access that isn't
10751 * already contained in the open stream (for the case where we
10752 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
10753 * take into account os_mmap_read[write] counts).
10754 */
10755 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10756
10757 if (vp->v_type == VREG) {
10758 /*
10759 * We need to retrieve the open stream and update the counts.
10760 * If there is no open stream here, something is wrong.
10761 */
10762 nfs4_open_stream_t *osp = NULL;
10763 nfs4_open_owner_t *oop = NULL;
10764
10765 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10766 if (oop != NULL) {
10767 /* returns with 'os_sync_lock' held */
10768 osp = find_open_stream(oop, rp);
10769 open_owner_rele(oop);
10770 }
10771 if (osp == NULL) {
10772 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10773 "nfs4_addmap: we should have an osp"
10774 "but we don't, so fail with EIO"));
10775 error = EIO;
10776 goto out;
10777 }
10778
10779 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10780 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10781
10782 /*
10783 * Update the map count in the open stream.
10784 * This is necessary in the case where we
10785 * open/mmap/close/, then the server reboots, and we
10786 * attempt to reopen. If the mmap doesn't add share
10787 * access then we send an invalid reopen with
10788 * access = NONE.
10789 *
10790 * We need to specifically check each PROT_* so a mmap
10791 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
10792 * read and write access. A simple comparison of prot
10793 * to ~PROT_WRITE to determine read access is insufficient
10794 * since prot can be |= with PROT_USER, etc.
10795 */
10796
10797 /*
10798 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10799 */
10800 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10801 osp->os_mmap_write += btopr(len);
10802 if (maxprot & PROT_READ)
10803 osp->os_mmap_read += btopr(len);
10804 if (maxprot & PROT_EXEC)
10805 osp->os_mmap_read += btopr(len);
10806 /*
10807 * Ensure that os_mmap_read gets incremented, even if
10808 * maxprot were to look like PROT_NONE.
10809 */
10810 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10811 !(maxprot & PROT_EXEC))
10812 osp->os_mmap_read += btopr(len);
10813 osp->os_mapcnt += btopr(len);
10814 mutex_exit(&osp->os_sync_lock);
10815 open_stream_rele(osp, rp);
10816 }
10817
10818 out:
10819 /*
10820 * If we got an error, then undo our
10821 * incrementing of 'r_mapcnt'.
10822 */
10823
10824 if (error) {
10825 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10826 ASSERT(rp->r_mapcnt >= 0);
10827 }
10828 return (error);
10829 }
10830
10831 /* ARGSUSED */
10832 static int
10833 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10834 {
10835
10836 return (VTOR4(vp1) == VTOR4(vp2));
10837 }
10838
/*
 * VOP_FRLOCK: advisory record locking.
 *
 * For local-locking mounts (MI4_LLOCK) and non-regular files the request
 * is handed to the generic fs_frlock().  Otherwise the lock is obtained
 * from the server via nfs4frlock(), serialized under r_lkserlock.
 * Returns 0 on success or an errno value.
 */
/* ARGSUSED */
static int
nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
    caller_context_t *ct)
{
	int rc;
	u_offset_t start, end;
	rnode4_t *rp;
	int error = 0, intr = INTR4(vp);
	nfs4_error_t e;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/*
		 * Unlocks must not be interruptible; otherwise we could
		 * leave an orphan lock held on the server.
		 */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/*
	 * check the validity of the lock range; note that rc is 0 after
	 * these checks succeed, which the flush code below relies on.
	 */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXEND))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock.  However, we can't call
			 * nfs4_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * nfs4_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!nfs4_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR4(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!nfs4_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish.  For new
	 * locks, this is so that the process gets the latest bits from the
	 * server.  For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked.  If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set.  But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps /proc from stopping us */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else {
				cv_wait(&rp->r_cv, &rp->r_statelock);
			}
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				/* latch the first hard error on the rnode */
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			/* a failed flush only blocks setting a lock */
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
	    cr, &e, NULL, NULL);
	rc = e.error;

	if (rc == 0)
		nfs4_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);

	return (rc);
}
10981
10982 /*
10983 * Free storage space associated with the specified vnode. The portion
10984 * to be freed is specified by bfp->l_start and bfp->l_len (already
10985 * normalized to a "whence" of 0).
10986 *
10987 * This is an experimental facility whose continued existence is not
10988 * guaranteed. Currently, we only support the special case
10989 * of l_len == 0, meaning free to end of file.
10990 */
10991 /* ARGSUSED */
10992 static int
10993 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10994 offset_t offset, cred_t *cr, caller_context_t *ct)
10995 {
10996 int error;
10997
10998 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10999 return (EIO);
11000 ASSERT(vp->v_type == VREG);
11001 if (cmd != F_FREESP)
11002 return (EINVAL);
11003
11004 error = convoff(vp, bfp, 0, offset);
11005 if (!error) {
11006 ASSERT(bfp->l_start >= 0);
11007 if (bfp->l_len == 0) {
11008 struct vattr va;
11009
11010 va.va_mask = AT_SIZE;
11011 va.va_size = bfp->l_start;
11012 error = nfs4setattr(vp, &va, 0, cr, NULL);
11013
11014 if (error == 0) {
11015 if (bfp->l_start == 0) {
11016 vnevent_truncate(vp, ct);
11017 } else {
11018 vnevent_resize(vp, ct);
11019 }
11020 }
11021 } else
11022 error = EINVAL;
11023 }
11024
11025 return (error);
11026 }
11027
11028 /* ARGSUSED */
11029 int
11030 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11031 {
11032 rnode4_t *rp;
11033 rp = VTOR4(vp);
11034
11035 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11036 vp = RTOV4(rp);
11037 }
11038 *vpp = vp;
11039 return (0);
11040 }
11041
11042 /*
11043 * Setup and add an address space callback to do the work of the delmap call.
11044 * The callback will (and must be) deleted in the actual callback function.
11045 *
11046 * This is done in order to take care of the problem that we have with holding
11047 * the address space's a_lock for a long period of time (e.g. if the NFS server
11048 * is down). Callbacks will be executed in the address space code while the
11049 * a_lock is not held. Holding the address space's a_lock causes things such
11050 * as ps and fork to hang because they are trying to acquire this lock as well.
11051 */
/*
 * VOP_DELMAP: two-phase unmap using an address space callback.
 * First call: register the callback (which does the real work) and
 * return EAGAIN.  Second call (driven by the callback machinery):
 * find our entry, return the saved result.
 */
/* ARGSUSED */
static int
nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int caller_found;
	int error;
	rnode4_t *rp;
	nfs4_delmap_args_t *dmapp;
	nfs4_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire.  Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below.  The callback will be executed by the address space code
	 * after dropping the a_lock.  When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
	 *
	 * This action of calling back into the segment driver causes
	 * nfs4_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs4_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs4_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations.  To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs4_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * Snapshot the delmap arguments; the callback runs after this
	 * frame is gone and frees dmapp itself.
	 */
	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	/*
	 * NOTE(review): if as_add_callback() ever fails here, dmapp and
	 * the list entry appear to be left behind — confirm that failure
	 * is impossible with KM_SLEEP, or that callers tolerate it.
	 */
	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	return (error ? error : EAGAIN);
}
11144
11145 static nfs4_delmapcall_t *
11146 nfs4_init_delmapcall()
11147 {
11148 nfs4_delmapcall_t *delmap_call;
11149
11150 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11151 delmap_call->call_id = curthread;
11152 delmap_call->error = 0;
11153
11154 return (delmap_call);
11155 }
11156
/*
 * Release a delmap caller-tracking structure allocated by
 * nfs4_init_delmapcall().  The caller must already have removed it
 * from the rnode's r_indelmap list.
 */
static void
nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
{
	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
}
11162
11163 /*
11164 * Searches for the current delmap caller (based on curthread) in the list of
11165 * callers. If it is found, we remove it and free the delmap caller.
11166 * Returns:
11167 * 0 if the caller wasn't found
11168 * 1 if the caller was found, removed and freed. *errp will be set
11169 * to what the result of the delmap was.
11170 */
11171 static int
11172 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11173 {
11174 nfs4_delmapcall_t *delmap_call;
11175
11176 /*
11177 * If the list doesn't exist yet, we create it and return
11178 * that the caller wasn't found. No list = no callers.
11179 */
11180 mutex_enter(&rp->r_statelock);
11181 if (!(rp->r_flags & R4DELMAPLIST)) {
11182 /* The list does not exist */
11183 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11184 offsetof(nfs4_delmapcall_t, call_node));
11185 rp->r_flags |= R4DELMAPLIST;
11186 mutex_exit(&rp->r_statelock);
11187 return (0);
11188 } else {
11189 /* The list exists so search it */
11190 for (delmap_call = list_head(&rp->r_indelmap);
11191 delmap_call != NULL;
11192 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11193 if (delmap_call->call_id == curthread) {
11194 /* current caller is in the list */
11195 *errp = delmap_call->error;
11196 list_remove(&rp->r_indelmap, delmap_call);
11197 mutex_exit(&rp->r_statelock);
11198 nfs4_free_delmapcall(delmap_call);
11199 return (1);
11200 }
11201 }
11202 }
11203 mutex_exit(&rp->r_statelock);
11204 return (0);
11205 }
11206
11207 /*
11208 * Remove some pages from an mmap'd vnode. Just update the
11209 * count of pages. If doing close-to-open, then flush and
11210 * commit all of the pages associated with this file.
11211 * Otherwise, start an asynchronous page flush to write out
11212 * any dirty pages. This will also associate a credential
11213 * with the rnode which can be used to write the pages.
11214 */
11215 /* ARGSUSED */
11216 static void
11217 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11218 {
11219 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11220 rnode4_t *rp;
11221 mntinfo4_t *mi;
11222 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;
11223
11224 rp = VTOR4(dmapp->vp);
11225 mi = VTOMI4(dmapp->vp);
11226
11227 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11228 ASSERT(rp->r_mapcnt >= 0);
11229
11230 /*
11231 * Initiate a page flush and potential commit if there are
11232 * pages, the file system was not mounted readonly, the segment
11233 * was mapped shared, and the pages themselves were writeable.
11234 */
11235 if (nfs4_has_pages(dmapp->vp) &&
11236 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11237 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11238 mutex_enter(&rp->r_statelock);
11239 rp->r_flags |= R4DIRTY;
11240 mutex_exit(&rp->r_statelock);
11241 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11242 dmapp->len, dmapp->cr);
11243 if (!e.error) {
11244 mutex_enter(&rp->r_statelock);
11245 e.error = rp->r_error;
11246 rp->r_error = 0;
11247 mutex_exit(&rp->r_statelock);
11248 }
11249 } else
11250 e.error = 0;
11251
11252 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11253 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11254 B_INVAL, dmapp->cr, NULL);
11255
11256 if (e.error) {
11257 e.stat = puterrno4(e.error);
11258 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11259 OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
11260 dmapp->caller->error = e.error;
11261 }
11262
11263 /* Check to see if we need to close the file */
11264
11265 if (dmapp->vp->v_type == VREG) {
11266 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11267 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11268
11269 if (e.error != 0 || e.stat != NFS4_OK) {
11270 /*
11271 * Since it is possible that e.error == 0 and
11272 * e.stat != NFS4_OK (and vice versa),
11273 * we do the proper checking in order to get both
11274 * e.error and e.stat reporting the correct info.
11275 */
11276 if (e.stat == NFS4_OK)
11277 e.stat = puterrno4(e.error);
11278 if (e.error == 0)
11279 e.error = geterrno4(e.stat);
11280
11281 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11282 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11283 dmapp->caller->error = e.error;
11284 }
11285 }
11286
11287 (void) as_delete_callback(as, arg);
11288 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11289 }
11290
11291
11292 static uint_t
11293 fattr4_maxfilesize_to_bits(uint64_t ll)
11294 {
11295 uint_t l = 1;
11296
11297 if (ll == 0) {
11298 return (0);
11299 }
11300
11301 if (ll & 0xffffffff00000000) {
11302 l += 32; ll >>= 32;
11303 }
11304 if (ll & 0xffff0000) {
11305 l += 16; ll >>= 16;
11306 }
11307 if (ll & 0xff00) {
11308 l += 8; ll >>= 8;
11309 }
11310 if (ll & 0xf0) {
11311 l += 4; ll >>= 4;
11312 }
11313 if (ll & 0xc) {
11314 l += 2; ll >>= 2;
11315 }
11316 if (ll & 0x2) {
11317 l += 1;
11318 }
11319 return (l);
11320 }
11321
11322 static int
11323 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11324 {
11325 vnode_t *avp = NULL;
11326 int error;
11327
11328 if ((error = nfs4lookup_xattr(vp, "", &avp,
11329 LOOKUP_XATTR, cr)) == 0)
11330 error = do_xattr_exists_check(avp, valp, cr);
11331 if (avp)
11332 VN_RELE(avp);
11333
11334 return (error);
11335 }
11336
/*
 * VOP_PATHCONF: answer pathconf queries, using cached pathconf values
 * where valid and otherwise fetching them from the server with a
 * single GETATTR (nfs4_attr_otw).
 */
/* ARGSUSED */
int
nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	hrtime_t t;
	rnode4_t *rp;
	nfs4_ga_res_t gar;
	nfs4_ga_ext_res_t ger;

	/* stack-allocated extended results for the OTW attribute call */
	gar.n4g_ext_res = &ger;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/* these two are answered locally, without touching the server */
	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
		*valp = MAXPATHLEN;
		return (0);
	}
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	rp = VTOR4(vp);
	if (cmd == _PC_XATTR_EXISTS) {
		/*
		 * The existence of the xattr directory is not sufficient
		 * for determining whether generic user attributes exists.
		 * The attribute directory could only be a transient directory
		 * used for Solaris sysattr support.  Do a small readdir
		 * to verify if the only entries are sysattrs or not.
		 *
		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
		 * is NULL.  Once the xadir vp exists, we can create xattrs,
		 * and we don't have any way to update the "base" object's
		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
		 * could help out.
		 */
		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
		    rp->r_xattr_dir == NULL) {
			return (nfs4_have_xattrs(vp, valp, cr));
		}
	} else {  /* OLD CODE */
		if (ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_pathconf.pc4_cache_valid) {
				error = 0;
				switch (cmd) {
				case _PC_FILESIZEBITS:
					*valp =
					    rp->r_pathconf.pc4_filesizebits;
					break;
				case _PC_LINK_MAX:
					*valp =
					    rp->r_pathconf.pc4_link_max;
					break;
				case _PC_NAME_MAX:
					*valp =
					    rp->r_pathconf.pc4_name_max;
					break;
				case _PC_CHOWN_RESTRICTED:
					*valp =
					    rp->r_pathconf.pc4_chown_restricted;
					break;
				case _PC_NO_TRUNC:
					*valp =
					    rp->r_pathconf.pc4_no_trunc;
					break;
				default:
					error = EINVAL;
					break;
				}
				mutex_exit(&rp->r_statelock);
#ifdef DEBUG
				nfs4_pathconf_cache_hits++;
#endif
				return (error);
			}
			mutex_exit(&rp->r_statelock);
		}
	}
#ifdef DEBUG
	nfs4_pathconf_cache_misses++;
#endif

	/* timestamp for attribute-cache consistency checks */
	t = gethrtime();

	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);

	if (error) {
		/* the fetch failed, so the cached values can't be trusted */
		mutex_enter(&rp->r_statelock);
		rp->r_pathconf.pc4_cache_valid = FALSE;
		rp->r_pathconf.pc4_xattr_valid = FALSE;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* interpret the max filesize */
	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);

	/* Store the attributes we just received */
	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);

	switch (cmd) {
	case _PC_FILESIZEBITS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
		break;
	case _PC_LINK_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
		break;
	case _PC_NAME_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
		break;
	case _PC_CHOWN_RESTRICTED:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
		break;
	case _PC_NO_TRUNC:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
		break;
	case _PC_XATTR_EXISTS:
		/* server says the xadir exists; verify real user xattrs */
		if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
			if (error = nfs4_have_xattrs(vp, valp, cr))
				return (error);
		}
		break;
	default:
		return (EINVAL);
	}

	return (0);
}
11470
11471 /*
11472 * Called by async thread to do synchronous pageio. Do the i/o, wait
11473 * for it to complete, and cleanup the page list when done.
11474 */
11475 static int
11476 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11477 int flags, cred_t *cr)
11478 {
11479 int error;
11480
11481 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11482
11483 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11484 if (flags & B_READ)
11485 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11486 else
11487 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11488 return (error);
11489 }
11490
11491 /* ARGSUSED */
11492 static int
11493 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11494 int flags, cred_t *cr, caller_context_t *ct)
11495 {
11496 int error;
11497 rnode4_t *rp;
11498
11499 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11500 return (EIO);
11501
11502 if (pp == NULL)
11503 return (EINVAL);
11504
11505 rp = VTOR4(vp);
11506 mutex_enter(&rp->r_statelock);
11507 rp->r_count++;
11508 mutex_exit(&rp->r_statelock);
11509
11510 if (flags & B_ASYNC) {
11511 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11512 nfs4_sync_pageio);
11513 } else
11514 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11515 mutex_enter(&rp->r_statelock);
11516 rp->r_count--;
11517 cv_broadcast(&rp->r_cv);
11518 mutex_exit(&rp->r_statelock);
11519 return (error);
11520 }
11521
/*
 * VOP_DISPOSE: free (fl == B_FREE) or destroy (fl == B_INVAL) a page.
 * Pages that still need a COMMIT on the server are gathered up and
 * committed (or handed to an async commit thread) before disposal.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of commiting
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.  This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/*
	 * Holding R4COMMIT serializes access to r_commit, so the
	 * fields below may be manipulated without r_statelock.
	 */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * Pageout/fsflush threads (and the wrong zone) must not issue
	 * the over-the-wire commit themselves; hand it to an async
	 * commit thread instead.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are nolonger
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
11728
11729 /*
11730 * Commit requires that the current fh be the file written to.
11731 * The compound op structure is:
11732 * PUTFH(file), COMMIT
11733 */
11734 static int
11735 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11736 {
11737 COMPOUND4args_clnt args;
11738 COMPOUND4res_clnt res;
11739 COMMIT4res *cm_res;
11740 nfs_argop4 argop[2];
11741 nfs_resop4 *resop;
11742 int doqueue;
11743 mntinfo4_t *mi;
11744 rnode4_t *rp;
11745 cred_t *cred_otw = NULL;
11746 bool_t needrecov = FALSE;
11747 nfs4_recov_state_t recov_state;
11748 nfs4_open_stream_t *osp = NULL;
11749 bool_t first_time = TRUE; /* first time getting OTW cred */
11750 bool_t last_time = FALSE; /* last time getting OTW cred */
11751 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11752
11753 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11754
11755 rp = VTOR4(vp);
11756
11757 mi = VTOMI4(vp);
11758 recov_state.rs_flags = 0;
11759 recov_state.rs_num_retry_despite_err = 0;
11760 get_commit_cred:
11761 /*
11762 * Releases the osp, if a valid open stream is provided.
11763 * Puts a hold on the cred_otw and the new osp (if found).
11764 */
11765 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11766 &first_time, &last_time);
11767 args.ctag = TAG_COMMIT;
11768 recov_retry:
11769 /*
11770 * Commit ops: putfh file; commit
11771 */
11772 args.array_len = 2;
11773 args.array = argop;
11774
11775 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11776 &recov_state, NULL);
11777 if (e.error) {
11778 crfree(cred_otw);
11779 if (osp != NULL)
11780 open_stream_rele(osp, rp);
11781 return (e.error);
11782 }
11783
11784 /* putfh directory */
11785 argop[0].argop = OP_CPUTFH;
11786 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11787
11788 /* commit */
11789 argop[1].argop = OP_COMMIT;
11790 argop[1].nfs_argop4_u.opcommit.offset = offset;
11791 argop[1].nfs_argop4_u.opcommit.count = count;
11792
11793 doqueue = 1;
11794 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11795
11796 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11797 if (!needrecov && e.error) {
11798 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11799 needrecov);
11800 crfree(cred_otw);
11801 if (e.error == EACCES && last_time == FALSE)
11802 goto get_commit_cred;
11803 if (osp != NULL)
11804 open_stream_rele(osp, rp);
11805 return (e.error);
11806 }
11807
11808 if (needrecov) {
11809 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11810 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11811 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11812 &recov_state, needrecov);
11813 if (!e.error)
11814 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11815 goto recov_retry;
11816 }
11817 if (e.error) {
11818 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11819 &recov_state, needrecov);
11820 crfree(cred_otw);
11821 if (osp != NULL)
11822 open_stream_rele(osp, rp);
11823 return (e.error);
11824 }
11825 /* fall through for res.status case */
11826 }
11827
11828 if (res.status) {
11829 e.error = geterrno4(res.status);
11830 if (e.error == EACCES && last_time == FALSE) {
11831 crfree(cred_otw);
11832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11833 &recov_state, needrecov);
11834 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11835 goto get_commit_cred;
11836 }
11837 /*
11838 * Can't do a nfs4_purge_stale_fh here because this
11839 * can cause a deadlock. nfs4_commit can
11840 * be called from nfs4_dispose which can be called
11841 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh
11842 * can call back to pvn_vplist_dirty.
11843 */
11844 if (e.error == ESTALE) {
11845 mutex_enter(&rp->r_statelock);
11846 rp->r_flags |= R4STALE;
11847 if (!rp->r_error)
11848 rp->r_error = e.error;
11849 mutex_exit(&rp->r_statelock);
11850 PURGE_ATTRCACHE4(vp);
11851 } else {
11852 mutex_enter(&rp->r_statelock);
11853 if (!rp->r_error)
11854 rp->r_error = e.error;
11855 mutex_exit(&rp->r_statelock);
11856 }
11857 } else {
11858 ASSERT(rp->r_flags & R4HAVEVERF);
11859 resop = &res.array[1]; /* commit res */
11860 cm_res = &resop->nfs_resop4_u.opcommit;
11861 mutex_enter(&rp->r_statelock);
11862 if (cm_res->writeverf == rp->r_writeverf) {
11863 mutex_exit(&rp->r_statelock);
11864 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11865 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11866 &recov_state, needrecov);
11867 crfree(cred_otw);
11868 if (osp != NULL)
11869 open_stream_rele(osp, rp);
11870 return (0);
11871 }
11872 nfs4_set_mod(vp);
11873 rp->r_writeverf = cm_res->writeverf;
11874 mutex_exit(&rp->r_statelock);
11875 e.error = NFS_VERF_MISMATCH;
11876 }
11877
11878 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11879 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11880 crfree(cred_otw);
11881 if (osp != NULL)
11882 open_stream_rele(osp, rp);
11883
11884 return (e.error);
11885 }
11886
11887 static void
11888 nfs4_set_mod(vnode_t *vp)
11889 {
11890 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11891
11892 /* make sure we're looking at the master vnode, not a shadow */
11893 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11894 }
11895
11896 /*
11897 * This function is used to gather a page list of the pages which
11898 * can be committed on the server.
11899 *
11900 * The calling thread must have set R4COMMIT. This bit is used to
11901 * serialize access to the commit structure in the rnode. As long
11902 * as the thread has set R4COMMIT, then it can manipulate the commit
11903 * structure without requiring any other locks.
11904 *
11905 * When this function is called from nfs4_dispose() the page passed
11906 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11907 * will skip it. This is not a problem since we initially add the
11908 * page to the r_commit page list.
11909 *
11910 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	/* Caller must hold the commit-serialization bit (see block comment) */
	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/* The vnode's page list (v_pages) is protected by this mutex. */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified if so then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page. If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with.. anyway just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Lets check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 *
		 * c_commbase/c_commlen track the smallest contiguous
		 * byte range covering every page gathered so far: the
		 * three cases start the range, extend it downward, or
		 * extend it upward respectively.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);	/* circular list */

	mutex_exit(vphm);
}
11998
11999 /*
12000 * This routine is used to gather together a page list of the pages
12001 * which are to be committed on the server. This routine must not
12002 * be called if the calling thread holds any locked pages.
12003 *
12004 * The calling thread must have set R4COMMIT. This bit is used to
12005 * serialize access to the commit structure in the rnode. As long
12006 * as the thread has set R4COMMIT, then it can manipulate the commit
12007 * structure without requiring any other locks.
12008 */
static void
nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{

	rnode4_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;

	ASSERT(len != 0);
	rp = VTOR4(vp);
	/* Caller must hold the commit-serialization bit (see block comment) */
	ASSERT(rp->r_flags & R4COMMIT);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;
	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;
	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.  The nowait lookup
		 * skips pages that are busy, which is fine: a locked
		 * page is being handled by someone else (possibly the
		 * caller itself — see the header comment's warning
		 * about holding locked pages).
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;
		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);
		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.
		 *
		 * Since this loop visits offsets in increasing order,
		 * a new page can only extend the commit range upward.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}
12069
12070 /*
12071 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12072 * Flushes and commits data to the server.
12073 */
static int
nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	verifier4 write_verf;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed. This may need to
	 * be done twice if the server has changed state since
	 * data was last written. The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen). This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs4_putpage
	 */

	/*
	 * Snapshot the write verifier now; if it changes by the time
	 * the flush completes, the server lost our writes and the
	 * whole flush must be redone (see the recheck below).
	 */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_writeverf;
	mutex_exit(&rp->r_statelock);

	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
	/* EAGAIN from the async pass is not fatal; the sync pass follows. */
	if (error == EAGAIN)
		error = 0;

	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);

	if (error)
		return (error);

	/* Verifier changed: server rebooted/lost state, flush again. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_writeverf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
12143
12144 /*
12145 * nfs4_commit_vp() will wait for other pending commits and
12146 * will either commit the whole file or a range, plen dictates
12147 * if we commit whole file. a value of zero indicates the whole
12148 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
12149 */
static int
nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
    cred_t *cr, int wait_on_writes)
{
	rnode4_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * before we gather commitable pages make
	 * sure there are no outstanding async writes
	 *
	 * (The unlocked r_count peek is only an optimization; the
	 * authoritative check is repeated under r_statelock.)
	 */
	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Set the `commit inprogress' state bit. We must
	 * first wait until any current one finishes.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather all of the pages which need to be
	 * committed.  plen == 0 means the whole file.
	 */
	if (plen == 0)
		nfs4_get_commit(vp);
	else
		nfs4_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered by nfs4_get_commit.
	 *
	 * Reading r_commit here without r_statelock is safe: owning
	 * the R4COMMIT bit serializes all access to the commit
	 * structure (see the header comment).
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist == NULL)
		return (0);

	/*
	 * No error occurred during the flush portion
	 * of this operation, so now attempt to commit
	 * the data to stable storage on the server.
	 *
	 * This will unlock all of the pages on the list.
	 */
	return (nfs4_sync_commit(vp, plist, offset, len, cr));
}
12227
12228 static int
12229 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12230 cred_t *cr)
12231 {
12232 int error;
12233 page_t *pp;
12234
12235 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12236
12237 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12238
12239 /*
12240 * If we got an error, then just unlock all of the pages
12241 * on the list.
12242 */
12243 if (error) {
12244 while (plist != NULL) {
12245 pp = plist;
12246 page_sub(&plist, pp);
12247 page_unlock(pp);
12248 }
12249 return (error);
12250 }
12251 /*
12252 * We've tried as hard as we can to commit the data to stable
12253 * storage on the server. We just unlock the pages and clear
12254 * the commit required state. They will get freed later.
12255 */
12256 while (plist != NULL) {
12257 pp = plist;
12258 page_sub(&plist, pp);
12259 pp->p_fsdata = C_NOCOMMIT;
12260 page_unlock(pp);
12261 }
12262
12263 return (error);
12264 }
12265
/*
 * Worker entry point for asynchronous commits: perform a synchronous
 * commit of the gathered page list and discard the result, since no
 * caller is waiting to see the error.  nfs4_sync_commit() unlocks
 * every page on plist regardless of outcome.
 */
static void
do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{

	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
}
12273
12274 /*ARGSUSED*/
12275 static int
12276 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12277 caller_context_t *ct)
12278 {
12279 int error = 0;
12280 mntinfo4_t *mi;
12281 vattr_t va;
12282 vsecattr_t nfsace4_vsap;
12283
12284 mi = VTOMI4(vp);
12285 if (nfs_zone() != mi->mi_zone)
12286 return (EIO);
12287 if (mi->mi_flags & MI4_ACL) {
12288 /* if we have a delegation, return it */
12289 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12290 (void) nfs4delegreturn(VTOR4(vp),
12291 NFS4_DR_REOPEN|NFS4_DR_PUSH);
12292
12293 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12294 NFS4_ACL_SET);
12295 if (error) /* EINVAL */
12296 return (error);
12297
12298 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12299 /*
12300 * These are aclent_t type entries.
12301 */
12302 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12303 vp->v_type == VDIR, FALSE);
12304 if (error)
12305 return (error);
12306 } else {
12307 /*
12308 * These are ace_t type entries.
12309 */
12310 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12311 FALSE);
12312 if (error)
12313 return (error);
12314 }
12315 bzero(&va, sizeof (va));
12316 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12317 vs_ace4_destroy(&nfsace4_vsap);
12318 return (error);
12319 }
12320 return (ENOSYS);
12321 }
12322
/* ARGSUSED */
int
nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	nfs4_ga_res_t gar;
	rnode4_t *rp = VTOR4(vp);

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	bzero(&gar, sizeof (gar));
	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;

	/*
	 * vsecattr->vsa_mask holds the original acl request mask.
	 * This is needed when determining what to return.
	 * (See: nfs4_create_getsecattr_return())
	 */
	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
	if (error) /* EINVAL */
		return (error);

	/*
	 * If this is a referral stub, don't try to go OTW for an ACL
	 */
	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));

	if (mi->mi_flags & MI4_ACL) {
		/*
		 * Check if the data is cached and the cache is valid. If it
		 * is we don't go over the wire.
		 *
		 * The unlocked r_secattr peek is an optimization only;
		 * it is re-verified under r_statelock before use.
		 */
		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_secattr != NULL) {
				error = nfs4_create_getsecattr_return(
				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
				    rp->r_attr.va_gid,
				    vp->v_type == VDIR);
				if (!error) { /* error == 0 - Success! */
					mutex_exit(&rp->r_statelock);
					return (error);
				}
			}
			mutex_exit(&rp->r_statelock);
		}

		/*
		 * The getattr otw call will always get both the acl, in
		 * the form of a list of nfsace4's, and the number of acl
		 * entries; independent of the value of gar.n4g_va.va_mask.
		 */
		error = nfs4_getattr_otw(vp, &gar, cr, 1);
		if (error) {
			vs_ace4_destroy(&gar.n4g_vsa);
			/* Server lacks ACL support: fabricate from the mode */
			if (error == ENOTSUP || error == EOPNOTSUPP)
				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
			/*
			 * No error was returned, but according to the response
			 * bitmap, neither was an acl.
			 */
			vs_ace4_destroy(&gar.n4g_vsa);
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		/*
		 * Update the cache with the ACL.
		 */
		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);

		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
		    vp->v_type == VDIR);
		vs_ace4_destroy(&gar.n4g_vsa);
		/*
		 * If translation to the requested aclent_t form failed
		 * (other than for access reasons), fall back to an acl
		 * fabricated from the file mode.
		 */
		if ((error) && (vsecattr->vsa_mask &
		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
		    (error != EACCES)) {
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
		}
		return (error);
	}
	/* Mount without ACL support: fabricate an acl from the mode. */
	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
	return (error);
}
12417
12418 /*
12419 * The function returns:
12420 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12421 * - EINVAL if the passed in "acl_mask" is an invalid request.
12422 *
12423 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12424 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12425 *
12426 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12427 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12428 * - We have a count field set without the corresponding acl field set. (e.g. -
12429 * VSA_ACECNT is set, but VSA_ACE is not)
12430 */
12431 static int
12432 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12433 {
12434 /* Shortcut the masks that are always valid. */
12435 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12436 return (0);
12437 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12438 return (0);
12439
12440 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12441 /*
12442 * We can't have any VSA_ACL type stuff in the mask now.
12443 */
12444 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12445 VSA_DFACLCNT))
12446 return (EINVAL);
12447
12448 if (op == NFS4_ACL_SET) {
12449 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12450 return (EINVAL);
12451 }
12452 }
12453
12454 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12455 /*
12456 * We can't have any VSA_ACE type stuff in the mask now.
12457 */
12458 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12459 return (EINVAL);
12460
12461 if (op == NFS4_ACL_SET) {
12462 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12463 return (EINVAL);
12464
12465 if ((acl_mask & VSA_DFACLCNT) &&
12466 !(acl_mask & VSA_DFACL))
12467 return (EINVAL);
12468 }
12469 }
12470 return (0);
12471 }
12472
12473 /*
12474 * The theory behind creating the correct getsecattr return is simply this:
12475 * "Don't return anything that the caller is not expecting to have to free."
12476 */
12477 static int
12478 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12479 uid_t uid, gid_t gid, int isdir)
12480 {
12481 int error = 0;
12482 /* Save the mask since the translators modify it. */
12483 uint_t orig_mask = vsap->vsa_mask;
12484
12485 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12486 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12487
12488 if (error)
12489 return (error);
12490
12491 /*
12492 * If the caller only asked for the ace count (VSA_ACECNT)
12493 * don't give them the full acl (VSA_ACE), free it.
12494 */
12495 if (!orig_mask & VSA_ACE) {
12496 if (vsap->vsa_aclentp != NULL) {
12497 kmem_free(vsap->vsa_aclentp,
12498 vsap->vsa_aclcnt * sizeof (ace_t));
12499 vsap->vsa_aclentp = NULL;
12500 }
12501 }
12502 vsap->vsa_mask = orig_mask;
12503
12504 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12505 VSA_DFACLCNT)) {
12506 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12507 isdir, FALSE);
12508
12509 if (error)
12510 return (error);
12511
12512 /*
12513 * If the caller only asked for the acl count (VSA_ACLCNT)
12514 * and/or the default acl count (VSA_DFACLCNT) don't give them
12515 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12516 */
12517 if (!orig_mask & VSA_ACL) {
12518 if (vsap->vsa_aclentp != NULL) {
12519 kmem_free(vsap->vsa_aclentp,
12520 vsap->vsa_aclcnt * sizeof (aclent_t));
12521 vsap->vsa_aclentp = NULL;
12522 }
12523 }
12524
12525 if (!orig_mask & VSA_DFACL) {
12526 if (vsap->vsa_dfaclentp != NULL) {
12527 kmem_free(vsap->vsa_dfaclentp,
12528 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12529 vsap->vsa_dfaclentp = NULL;
12530 }
12531 }
12532 vsap->vsa_mask = orig_mask;
12533 }
12534 return (0);
12535 }
12536
/* ARGSUSED */
int
nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 *
	 * NOTE(review): this is a bitwise test (cmd & F_SHARE), not an
	 * equality test, so depending on the F_* encodings it may also
	 * match F_UNSHARE; the same pattern appears in the v2/v3 client,
	 * so it looks deliberate — confirm before changing.
	 */
	if ((cmd & F_SHARE) &&
	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		/*
		 * This will be properly implemented later,
		 * see RFE: 4823948 .
		 */
		error = EAGAIN;
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
12592
12593 /*
12594 * Common code called by directory ops to update the attrcache
12595 */
12596 static int
12597 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12598 hrtime_t t, vnode_t *vp, cred_t *cr)
12599 {
12600 int error = 0;
12601
12602 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12603
12604 if (status != NFS4_OK) {
12605 /* getattr not done or failed */
12606 PURGE_ATTRCACHE4(vp);
12607 return (error);
12608 }
12609
12610 if (garp) {
12611 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12612 } else {
12613 PURGE_ATTRCACHE4(vp);
12614 }
12615 return (error);
12616 }
12617
12618 /*
12619 * Update directory caches for directory modification ops (link, rename, etc.)
12620 * When dinfo is NULL, manage dircaches in the old way.
12621 */
static void
nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
    dirattr_info_t *dinfo)
{
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/* Purge rddir cache for dir since it changed */
	if (drp->r_dir != NULL)
		nfs4_purge_rddir_cache(dvp);

	/*
	 * If caller provided dinfo, then use it to manage dir caches.
	 * dinfo carries the post-op directory attributes, so the dir's
	 * attribute cache can be updated rather than purged.
	 */
	if (dinfo != NULL) {
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag is
				 * set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "nfs4_update_dircaches: "
				    "don't update dnlc: created_v4 flag"));
			}
		}

		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
		    dinfo->di_cred, FALSE, cinfo);

		return;
	}

	/*
	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
	 * Since caller modified dir but didn't receive post-dirmod-op dir
	 * attrs, the dir's attrs must be purged.
	 *
	 * XXX this check and dnlc update/purge should really be atomic,
	 * XXX but can't use rnode statelock because it'll deadlock in
	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
	 * XXX does occur.
	 *
	 * XXX We also may want to check that atomic is true in the
	 * XXX change_info struct. If it is not, the change_info may
	 * XXX reflect changes by more than one clients which means that
	 * XXX our cache may not be valid.
	 */
	PURGE_ATTRCACHE4(dvp);
	if (drp->r_change == cinfo->before) {
		/* no changes took place in the directory prior to our link */
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX dont' update if the created_v4 flag
				 * is set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				    "nfs4_update_dircaches: don't"
				    " update dnlc: created_v4 flag"));
			}
		}
	} else {
		/* Another client modified directory - purge its dnlc cache */
		dnlc_purge_vp(dvp);
	}
}
12700
12701 /*
12702 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12703 * file.
12704 *
12705 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12706 * file (ie: client recovery) and otherwise set to FALSE.
12707 *
12708 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12709 * initiated) calling functions.
12710 *
12711 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12712 * of resending a 'lost' open request.
12713 *
12714 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12715 * server that hands out BAD_SEQID on open confirm.
12716 *
12717 * Errors are returned via the nfs4_error_t parameter.
12718 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* Caller must hold the open owner's seqid for the duration. */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* Each (re)transmission consumes the next open seqid. */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	/* Remember the seqid we used if the server processed the request. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/*
				 * Stop looping on a broken server that
				 * keeps handing out BAD_SEQID.
				 */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/* Transient failures: back off and resend the confirm. */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (res.status) {
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed open stateid back to the caller. */
	resop = &res.array[1]; /* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12825
12826 /*
12827 * Return the credentials associated with a client state object. The
12828 * caller is responsible for freeing the credentials.
12829 */
12830
12831 static cred_t *
12832 state_to_cred(nfs4_open_stream_t *osp)
12833 {
12834 cred_t *cr;
12835
12836 /*
12837 * It's ok to not lock the open stream and open owner to get
12838 * the oo_cred since this is only written once (upon creation)
12839 * and will not change.
12840 */
12841 cr = osp->os_open_owner->oo_cred;
12842 crhold(cr);
12843
12844 return (cr);
12845 }
12846
12847 /*
12848 * nfs4_find_sysid
12849 *
12850 * Find the sysid for the knetconfig associated with the given mi.
12851 */
12852 static struct lm_sysid *
12853 nfs4_find_sysid(mntinfo4_t *mi)
12854 {
12855 ASSERT(nfs_zone() == mi->mi_zone);
12856
12857 /*
12858 * Switch from RDMA knconf to original mount knconf
12859 */
12860 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12861 mi->mi_curr_serv->sv_hostname, NULL));
12862 }
12863
12864 #ifdef DEBUG
12865 /*
12866 * Return a string version of the call type for easy reading.
12867 */
12868 static char *
12869 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
12870 {
12871 switch (ctype) {
12872 case NFS4_LCK_CTYPE_NORM:
12873 return ("NORMAL");
12874 case NFS4_LCK_CTYPE_RECLAIM:
12875 return ("RECLAIM");
12876 case NFS4_LCK_CTYPE_RESEND:
12877 return ("RESEND");
12878 case NFS4_LCK_CTYPE_REINSTATE:
12879 return ("REINSTATE");
12880 default:
12881 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
12882 "type %d", ctype);
12883 return ("");
12884 }
12885 }
12886 #endif
12887
12888 /*
12889 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12890 * Unlock requests don't have an over-the-wire locktype, so we just return
12891 * something non-threatening.
12892 */
12893
12894 static nfs_lock_type4
12895 flk_to_locktype(int cmd, int l_type)
12896 {
12897 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12898
12899 switch (l_type) {
12900 case F_UNLCK:
12901 return (READ_LT);
12902 case F_RDLCK:
12903 if (cmd == F_SETLK)
12904 return (READ_LT);
12905 else
12906 return (READW_LT);
12907 case F_WRLCK:
12908 if (cmd == F_SETLK)
12909 return (WRITE_LT);
12910 else
12911 return (WRITEW_LT);
12912 }
12913 panic("flk_to_locktype");
12914 /*NOTREACHED*/
12915 }
12916
12917 /*
12918 * Do some preliminary checks for nfs4frlock.
12919 */
12920 static int
12921 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12922 u_offset_t offset)
12923 {
12924 int error = 0;
12925
12926 /*
12927 * If we are setting a lock, check that the file is opened
12928 * with the correct mode.
12929 */
12930 if (cmd == F_SETLK || cmd == F_SETLKW) {
12931 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12932 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12933 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12934 "nfs4frlock_validate_args: file was opened with "
12935 "incorrect mode"));
12936 return (EBADF);
12937 }
12938 }
12939
12940 /* Convert the offset. It may need to be restored before returning. */
12941 if (error = convoff(vp, flk, 0, offset)) {
12942 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12943 "nfs4frlock_validate_args: convoff => error= %d\n",
12944 error));
12945 return (error);
12946 }
12947
12948 return (error);
12949 }
12950
12951 /*
12952 * Set the flock64's lm_sysid for nfs4frlock.
12953 */
12954 static int
12955 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12956 {
12957 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12958
12959 /* Find the lm_sysid */
12960 *lspp = nfs4_find_sysid(VTOMI4(vp));
12961
12962 if (*lspp == NULL) {
12963 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12964 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12965 return (ENOLCK);
12966 }
12967
12968 flk->l_sysid = lm_sysidt(*lspp);
12969
12970 return (0);
12971 }
12972
12973 /*
12974 * Do the remaining preliminary setup for nfs4frlock.
12975 */
12976 static void
12977 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12978 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12979 cred_t **cred_otw)
12980 {
12981 /*
12982 * set tick_delay to the base delay time.
12983 * (NFS4_BASE_WAIT_TIME is in secs)
12984 */
12985
12986 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12987
12988 /*
12989 * If lock is relative to EOF, we need the newest length of the
12990 * file. Therefore invalidate the ATTR_CACHE.
12991 */
12992
12993 *whencep = flk->l_whence;
12994
12995 if (*whencep == 2) /* SEEK_END */
12996 PURGE_ATTRCACHE4(vp);
12997
12998 recov_statep->rs_flags = 0;
12999 recov_statep->rs_num_retry_despite_err = 0;
13000 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
13001 }
13002
13003 /*
13004 * Initialize and allocate the data structures necessary for
13005 * the nfs4frlock call.
13006 * Allocates argsp's op array.
13007 */
13008 static void
13009 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
13010 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
13011 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
13012 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
13013 {
13014 int argoplist_size;
13015 int num_ops = 2;
13016
13017 *retry = FALSE;
13018 *did_start_fop = FALSE;
13019 *skip_get_err = FALSE;
13020 lost_rqstp->lr_op = 0;
13021 argoplist_size = num_ops * sizeof (nfs_argop4);
13022 /* fill array with zero */
13023 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13024
13025 *argspp = argsp;
13026 *respp = NULL;
13027
13028 argsp->array_len = num_ops;
13029 argsp->array = *argopp;
13030
13031 /* initialize in case of error; will get real value down below */
13032 argsp->ctag = TAG_NONE;
13033
13034 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13035 *op_hintp = OH_LOCKU;
13036 else
13037 *op_hintp = OH_OTHER;
13038 }
13039
13040 /*
13041 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13042 * the proper nfs4_server_t for this instance of nfs4frlock.
13043 * Returns 0 (success) or an errno value.
13044 */
13045 static int
13046 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13047 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13048 bool_t *did_start_fop, bool_t *startrecovp)
13049 {
13050 int error = 0;
13051 rnode4_t *rp;
13052
13053 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13054
13055 if (ctype == NFS4_LCK_CTYPE_NORM) {
13056 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13057 recov_statep, startrecovp);
13058 if (error)
13059 return (error);
13060 *did_start_fop = TRUE;
13061 } else {
13062 *did_start_fop = FALSE;
13063 *startrecovp = FALSE;
13064 }
13065
13066 if (!error) {
13067 rp = VTOR4(vp);
13068
13069 /* If the file failed recovery, just quit. */
13070 mutex_enter(&rp->r_statelock);
13071 if (rp->r_flags & R4RECOVERR) {
13072 error = EIO;
13073 }
13074 mutex_exit(&rp->r_statelock);
13075 }
13076
13077 return (error);
13078 }
13079
13080 /*
13081 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
13082 * resend nfs4frlock call is initiated by the recovery framework.
13083 * Acquires the lop and oop seqid synchronization.
13084 */
13085 static void
13086 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
13087 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
13088 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13089 LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
13090 {
13091 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
13092 int error;
13093
13094 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
13095 (CE_NOTE,
13096 "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
13097 ASSERT(resend_rqstp != NULL);
13098 ASSERT(resend_rqstp->lr_op == OP_LOCK ||
13099 resend_rqstp->lr_op == OP_LOCKU);
13100
13101 *oopp = resend_rqstp->lr_oop;
13102 if (resend_rqstp->lr_oop) {
13103 open_owner_hold(resend_rqstp->lr_oop);
13104 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
13105 ASSERT(error == 0); /* recov thread always succeeds */
13106 }
13107
13108 /* Must resend this lost lock/locku request. */
13109 ASSERT(resend_rqstp->lr_lop != NULL);
13110 *lopp = resend_rqstp->lr_lop;
13111 lock_owner_hold(resend_rqstp->lr_lop);
13112 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
13113 ASSERT(error == 0); /* recov thread always succeeds */
13114
13115 *ospp = resend_rqstp->lr_osp;
13116 if (*ospp)
13117 open_stream_hold(resend_rqstp->lr_osp);
13118
13119 if (resend_rqstp->lr_op == OP_LOCK) {
13120 LOCK4args *lock_args;
13121
13122 argop->argop = OP_LOCK;
13123 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
13124 lock_args->locktype = resend_rqstp->lr_locktype;
13125 lock_args->reclaim =
13126 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
13127 lock_args->offset = resend_rqstp->lr_flk->l_start;
13128 lock_args->length = resend_rqstp->lr_flk->l_len;
13129 if (lock_args->length == 0)
13130 lock_args->length = ~lock_args->length;
13131 nfs4_setup_lock_args(*lopp, *oopp, *ospp,
13132 mi2clientid(mi), &lock_args->locker);
13133
13134 switch (resend_rqstp->lr_ctype) {
13135 case NFS4_LCK_CTYPE_RESEND:
13136 argsp->ctag = TAG_LOCK_RESEND;
13137 break;
13138 case NFS4_LCK_CTYPE_REINSTATE:
13139 argsp->ctag = TAG_LOCK_REINSTATE;
13140 break;
13141 case NFS4_LCK_CTYPE_RECLAIM:
13142 argsp->ctag = TAG_LOCK_RECLAIM;
13143 break;
13144 default:
13145 argsp->ctag = TAG_LOCK_UNKNOWN;
13146 break;
13147 }
13148 } else {
13149 LOCKU4args *locku_args;
13150 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
13151
13152 argop->argop = OP_LOCKU;
13153 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
13154 locku_args->locktype = READ_LT;
13155 locku_args->seqid = lop->lock_seqid + 1;
13156 mutex_enter(&lop->lo_lock);
13157 locku_args->lock_stateid = lop->lock_stateid;
13158 mutex_exit(&lop->lo_lock);
13159 locku_args->offset = resend_rqstp->lr_flk->l_start;
13160 locku_args->length = resend_rqstp->lr_flk->l_len;
13161 if (locku_args->length == 0)
13162 locku_args->length = ~locku_args->length;
13163
13164 switch (resend_rqstp->lr_ctype) {
13165 case NFS4_LCK_CTYPE_RESEND:
13166 argsp->ctag = TAG_LOCKU_RESEND;
13167 break;
13168 case NFS4_LCK_CTYPE_REINSTATE:
13169 argsp->ctag = TAG_LOCKU_REINSTATE;
13170 break;
13171 default:
13172 argsp->ctag = TAG_LOCK_UNKNOWN;
13173 break;
13174 }
13175 }
13176 }
13177
13178 /*
13179 * Setup the LOCKT4 arguments.
13180 */
13181 static void
13182 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13183 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13184 rnode4_t *rp)
13185 {
13186 LOCKT4args *lockt_args;
13187
13188 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13189 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13190 argop->argop = OP_LOCKT;
13191 argsp->ctag = TAG_LOCKT;
13192 lockt_args = &argop->nfs_argop4_u.oplockt;
13193
13194 /*
13195 * The locktype will be READ_LT unless it's
13196 * a write lock. We do this because the Solaris
13197 * system call allows the combination of
13198 * F_UNLCK and F_GETLK* and so in that case the
13199 * unlock is mapped to a read.
13200 */
13201 if (flk->l_type == F_WRLCK)
13202 lockt_args->locktype = WRITE_LT;
13203 else
13204 lockt_args->locktype = READ_LT;
13205
13206 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13207 /* set the lock owner4 args */
13208 nfs4_setlockowner_args(&lockt_args->owner, rp,
13209 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13210 flk->l_pid);
13211 lockt_args->offset = flk->l_start;
13212 lockt_args->length = flk->l_len;
13213 if (flk->l_len == 0)
13214 lockt_args->length = ~lockt_args->length;
13215
13216 *lockt_argsp = lockt_args;
13217 }
13218
13219 /*
13220 * If the client is holding a delegation, and the open stream to be used
13221 * with this lock request is a delegation open stream, then re-open the stream.
13222 * Sets the nfs4_error_t to all zeros unless the open stream has already
13223 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
13224 * means the caller should retry (like a recovery retry).
13225 */
13226 static void
13227 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
13228 {
13229 open_delegation_type4 dt;
13230 bool_t reopen_needed, force;
13231 nfs4_open_stream_t *osp;
13232 open_claim_type4 oclaim;
13233 rnode4_t *rp = VTOR4(vp);
13234 mntinfo4_t *mi = VTOMI4(vp);
13235
13236 ASSERT(nfs_zone() == mi->mi_zone);
13237
13238 nfs4_error_zinit(ep);
13239
13240 mutex_enter(&rp->r_statev4_lock);
13241 dt = rp->r_deleg_type;
13242 mutex_exit(&rp->r_statev4_lock);
13243
13244 if (dt != OPEN_DELEGATE_NONE) {
13245 nfs4_open_owner_t *oop;
13246
13247 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
13248 if (!oop) {
13249 ep->stat = NFS4ERR_IO;
13250 return;
13251 }
13252 /* returns with 'os_sync_lock' held */
13253 osp = find_open_stream(oop, rp);
13254 if (!osp) {
13255 open_owner_rele(oop);
13256 ep->stat = NFS4ERR_IO;
13257 return;
13258 }
13259
13260 if (osp->os_failed_reopen) {
13261 NFS4_DEBUG((nfs4_open_stream_debug ||
13262 nfs4_client_lock_debug), (CE_NOTE,
13263 "nfs4frlock_check_deleg: os_failed_reopen set "
13264 "for osp %p, cr %p, rp %s", (void *)osp,
13265 (void *)cr, rnode4info(rp)));
13266 mutex_exit(&osp->os_sync_lock);
13267 open_stream_rele(osp, rp);
13268 open_owner_rele(oop);
13269 ep->stat = NFS4ERR_IO;
13270 return;
13271 }
13272
13273 /*
13274 * Determine whether a reopen is needed. If this
13275 * is a delegation open stream, then send the open
13276 * to the server to give visibility to the open owner.
13277 * Even if it isn't a delegation open stream, we need
13278 * to check if the previous open CLAIM_DELEGATE_CUR
13279 * was sufficient.
13280 */
13281
13282 reopen_needed = osp->os_delegation ||
13283 ((lt == F_RDLCK &&
13284 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
13285 (lt == F_WRLCK &&
13286 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
13287
13288 mutex_exit(&osp->os_sync_lock);
13289 open_owner_rele(oop);
13290
13291 if (reopen_needed) {
13292 /*
13293 * Always use CLAIM_PREVIOUS after server reboot.
13294 * The server will reject CLAIM_DELEGATE_CUR if
13295 * it is used during the grace period.
13296 */
13297 mutex_enter(&mi->mi_lock);
13298 if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
13299 oclaim = CLAIM_PREVIOUS;
13300 force = TRUE;
13301 } else {
13302 oclaim = CLAIM_DELEGATE_CUR;
13303 force = FALSE;
13304 }
13305 mutex_exit(&mi->mi_lock);
13306
13307 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
13308 if (ep->error == EAGAIN) {
13309 nfs4_error_zinit(ep);
13310 ep->stat = NFS4ERR_DELAY;
13311 }
13312 }
13313 open_stream_rele(osp, rp);
13314 osp = NULL;
13315 }
13316 }
13317
13318 /*
13319 * Setup the LOCKU4 arguments.
13320 * Returns errors via the nfs4_error_t.
13321 * NFS4_OK no problems. *go_otwp is TRUE if call should go
13322 * over-the-wire. The caller must release the
13323 * reference on *lopp.
13324 * NFS4ERR_DELAY caller should retry (like recovery retry)
13325 * (other) unrecoverable error.
13326 */
13327 static void
13328 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13329 LOCKU4args **locku_argsp, flock64_t *flk,
13330 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13331 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13332 bool_t *skip_get_err, bool_t *go_otwp)
13333 {
13334 nfs4_lock_owner_t *lop = NULL;
13335 LOCKU4args *locku_args;
13336 pid_t pid;
13337 bool_t is_spec = FALSE;
13338 rnode4_t *rp = VTOR4(vp);
13339
13340 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13341 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13342
13343 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13344 if (ep->error || ep->stat)
13345 return;
13346
13347 argop->argop = OP_LOCKU;
13348 if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13349 argsp->ctag = TAG_LOCKU_REINSTATE;
13350 else
13351 argsp->ctag = TAG_LOCKU;
13352 locku_args = &argop->nfs_argop4_u.oplocku;
13353 *locku_argsp = locku_args;
13354
13355 /* locktype should be set to any legal value */
13356 locku_args->locktype = READ_LT;
13357
13358 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13359 flk->l_pid;
13360
13361 /*
13362 * Get the lock owner stateid. If no lock owner
13363 * exists, return success.
13364 */
13365 lop = find_lock_owner(rp, pid, LOWN_ANY);
13366 *lopp = lop;
13367 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13368 is_spec = TRUE;
13369 if (!lop || is_spec) {
13370 /*
13371 * No lock owner so no locks to unlock.
13372 * Return success. If there was a failed
13373 * reclaim earlier, the lock might still be
13374 * registered with the local locking code,
13375 * so notify it of the unlock.
13376 *
13377 * If the lockowner is using a special stateid,
13378 * then the original lock request (that created
13379 * this lockowner) was never successful, so we
13380 * have no lock to undo OTW.
13381 */
13382 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13383 "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13384 "(%ld) so return success", (long)pid));
13385
13386 if (ctype == NFS4_LCK_CTYPE_NORM)
13387 flk->l_pid = curproc->p_pid;
13388 nfs4_register_lock_locally(vp, flk, flag, offset);
13389 /*
13390 * Release our hold and NULL out so final_cleanup
13391 * doesn't try to end a lock seqid sync we
13392 * never started.
13393 */
13394 if (is_spec) {
13395 lock_owner_rele(lop);
13396 *lopp = NULL;
13397 }
13398 *skip_get_err = TRUE;
13399 *go_otwp = FALSE;
13400 return;
13401 }
13402
13403 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13404 if (ep->error == EAGAIN) {
13405 lock_owner_rele(lop);
13406 *lopp = NULL;
13407 return;
13408 }
13409
13410 mutex_enter(&lop->lo_lock);
13411 locku_args->lock_stateid = lop->lock_stateid;
13412 mutex_exit(&lop->lo_lock);
13413 locku_args->seqid = lop->lock_seqid + 1;
13414
13415 /* leave the ref count on lop, rele after RPC call */
13416
13417 locku_args->offset = flk->l_start;
13418 locku_args->length = flk->l_len;
13419 if (flk->l_len == 0)
13420 locku_args->length = ~locku_args->length;
13421
13422 *go_otwp = TRUE;
13423 }
13424
13425 /*
13426 * Setup the LOCK4 arguments.
13427 *
13428 * Returns errors via the nfs4_error_t.
13429 * NFS4_OK no problems
13430 * NFS4ERR_DELAY caller should retry (like recovery retry)
13431 * (other) unrecoverable error
13432 */
13433 static void
13434 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13435 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13436 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13437 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13438 {
13439 LOCK4args *lock_args;
13440 nfs4_open_owner_t *oop = NULL;
13441 nfs4_open_stream_t *osp = NULL;
13442 nfs4_lock_owner_t *lop = NULL;
13443 pid_t pid;
13444 rnode4_t *rp = VTOR4(vp);
13445
13446 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13447
13448 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13449 if (ep->error || ep->stat != NFS4_OK)
13450 return;
13451
13452 argop->argop = OP_LOCK;
13453 if (ctype == NFS4_LCK_CTYPE_NORM)
13454 argsp->ctag = TAG_LOCK;
13455 else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13456 argsp->ctag = TAG_RELOCK;
13457 else
13458 argsp->ctag = TAG_LOCK_REINSTATE;
13459 lock_args = &argop->nfs_argop4_u.oplock;
13460 lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13461 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
13462 /*
13463 * Get the lock owner. If no lock owner exists,
13464 * create a 'temporary' one and grab the open seqid
13465 * synchronization (which puts a hold on the open
13466 * owner and open stream).
13467 * This also grabs the lock seqid synchronization.
13468 */
13469 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13470 ep->stat =
13471 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13472
13473 if (ep->stat != NFS4_OK)
13474 goto out;
13475
13476 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13477 &lock_args->locker);
13478
13479 lock_args->offset = flk->l_start;
13480 lock_args->length = flk->l_len;
13481 if (flk->l_len == 0)
13482 lock_args->length = ~lock_args->length;
13483 *lock_argsp = lock_args;
13484 out:
13485 *oopp = oop;
13486 *ospp = osp;
13487 *lopp = lop;
13488 }
13489
13490 /*
13491 * After we get the reply from the server, record the proper information
13492 * for possible resend lock requests.
13493 */
13494 static void
13495 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13496 nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13497 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13498 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13499 {
13500 bool_t unlock = (flk->l_type == F_UNLCK);
13501
13502 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13503 ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13504 ctype == NFS4_LCK_CTYPE_REINSTATE);
13505
13506 if (error != 0 && !unlock) {
13507 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13508 nfs4_client_lock_debug), (CE_NOTE,
13509 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13510 " for lop %p", (void *)lop));
13511 ASSERT(lop != NULL);
13512 mutex_enter(&lop->lo_lock);
13513 lop->lo_pending_rqsts = 1;
13514 mutex_exit(&lop->lo_lock);
13515 }
13516
13517 lost_rqstp->lr_putfirst = FALSE;
13518 lost_rqstp->lr_op = 0;
13519
13520 /*
13521 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13522 * recovery purposes so that the lock request that was sent
13523 * can be saved and re-issued later. Ditto for EIO from a forced
13524 * unmount. This is done to have the client's local locking state
13525 * match the v4 server's state; that is, the request was
13526 * potentially received and accepted by the server but the client
13527 * thinks it was not.
13528 */
13529 if (error == ETIMEDOUT || error == EINTR ||
13530 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13531 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13532 nfs4_client_lock_debug), (CE_NOTE,
13533 "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13534 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13535 (void *)lop, (void *)oop, (void *)osp));
13536 if (unlock)
13537 lost_rqstp->lr_op = OP_LOCKU;
13538 else {
13539 lost_rqstp->lr_op = OP_LOCK;
13540 lost_rqstp->lr_locktype = locktype;
13541 }
13542 /*
13543 * Objects are held and rele'd via the recovery code.
13544 * See nfs4_save_lost_rqst.
13545 */
13546 lost_rqstp->lr_vp = vp;
13547 lost_rqstp->lr_dvp = NULL;
13548 lost_rqstp->lr_oop = oop;
13549 lost_rqstp->lr_osp = osp;
13550 lost_rqstp->lr_lop = lop;
13551 lost_rqstp->lr_cr = cr;
13552 switch (ctype) {
13553 case NFS4_LCK_CTYPE_NORM:
13554 flk->l_pid = ttoproc(curthread)->p_pid;
13555 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13556 break;
13557 case NFS4_LCK_CTYPE_REINSTATE:
13558 lost_rqstp->lr_putfirst = TRUE;
13559 lost_rqstp->lr_ctype = ctype;
13560 break;
13561 default:
13562 break;
13563 }
13564 lost_rqstp->lr_flk = flk;
13565 }
13566 }
13567
13568 /*
13569 * Update lop's seqid. Also update the seqid stored in a resend request,
13570 * if any. (Some recovery errors increment the seqid, and we may have to
13571 * send the resend request again.)
13572 */
13573
13574 static void
13575 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13576 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13577 {
13578 if (lock_args) {
13579 if (lock_args->locker.new_lock_owner == TRUE)
13580 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13581 else {
13582 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13583 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13584 }
13585 } else if (locku_args) {
13586 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13587 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13588 }
13589 }
13590
13591 /*
13592 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13593 * COMPOUND4 args/res for calls that need to retry.
13594 * Switches the *cred_otwp to base_cr.
13595 */
13596 static void
13597 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13598 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13599 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13600 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13601 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13602 {
13603 nfs4_open_owner_t *oop = *oopp;
13604 nfs4_open_stream_t *osp = *ospp;
13605 nfs4_lock_owner_t *lop = *lopp;
13606 nfs_argop4 *argop = (*argspp)->array;
13607
13608 if (*did_start_fop) {
13609 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13610 needrecov);
13611 *did_start_fop = FALSE;
13612 }
13613 ASSERT((*argspp)->array_len == 2);
13614 if (argop[1].argop == OP_LOCK)
13615 nfs4args_lock_free(&argop[1]);
13616 else if (argop[1].argop == OP_LOCKT)
13617 nfs4args_lockt_free(&argop[1]);
13618 kmem_free(argop, 2 * sizeof (nfs_argop4));
13619 if (!error)
13620 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13621 *argspp = NULL;
13622 *respp = NULL;
13623
13624 if (lop) {
13625 nfs4_end_lock_seqid_sync(lop);
13626 lock_owner_rele(lop);
13627 *lopp = NULL;
13628 }
13629
13630 /* need to free up the reference on osp for lock args */
13631 if (osp != NULL) {
13632 open_stream_rele(osp, VTOR4(vp));
13633 *ospp = NULL;
13634 }
13635
13636 /* need to free up the reference on oop for lock args */
13637 if (oop != NULL) {
13638 nfs4_end_open_seqid_sync(oop);
13639 open_owner_rele(oop);
13640 *oopp = NULL;
13641 }
13642
13643 crfree(*cred_otwp);
13644 *cred_otwp = base_cr;
13645 crhold(*cred_otwp);
13646 }
13647
13648 /*
13649 * Function to process the client's recovery for nfs4frlock.
13650 * Returns TRUE if we should retry the lock request; FALSE otherwise.
13651 *
13652 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13653 * COMPOUND4 args/res for calls that need to retry.
13654 *
13655 * Note: the rp's r_lkserlock is *not* dropped during this path.
13656 */
13657 static bool_t
13658 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13659 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13660 LOCK4args *lock_args, LOCKU4args *locku_args,
13661 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13662 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13663 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13664 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13665 {
13666 nfs4_open_owner_t *oop = *oopp;
13667 nfs4_open_stream_t *osp = *ospp;
13668 nfs4_lock_owner_t *lop = *lopp;
13669
13670 bool_t abort, retry;
13671
13672 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13673 ASSERT((*argspp) != NULL);
13674 ASSERT((*respp) != NULL);
13675 if (lock_args || locku_args)
13676 ASSERT(lop != NULL);
13677
13678 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13679 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13680
13681 retry = TRUE;
13682 abort = FALSE;
13683 if (needrecov) {
13684 nfs4_bseqid_entry_t *bsep = NULL;
13685 nfs_opnum4 op;
13686
13687 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13688
13689 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13690 seqid4 seqid;
13691
13692 if (lock_args) {
13693 if (lock_args->locker.new_lock_owner == TRUE)
13694 seqid = lock_args->locker.locker4_u.
13695 open_owner.open_seqid;
13696 else
13697 seqid = lock_args->locker.locker4_u.
13698 lock_owner.lock_seqid;
13699 } else if (locku_args) {
13700 seqid = locku_args->seqid;
13701 } else {
13702 seqid = 0;
13703 }
13704
13705 bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13706 flk->l_pid, (*argspp)->ctag, seqid);
13707 }
13708
13709 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13710 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13711 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13712 NULL, op, bsep, NULL, NULL);
13713
13714 if (bsep)
13715 kmem_free(bsep, sizeof (*bsep));
13716 }
13717
13718 /*
13719 * Return that we do not want to retry the request for 3 cases:
13720 * 1. If we received EINTR or are bailing out because of a forced
13721 * unmount, we came into this code path just for the sake of
13722 * initiating recovery, we now need to return the error.
13723 * 2. If we have aborted recovery.
13724 * 3. We received NFS4ERR_BAD_SEQID.
13725 */
13726 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13727 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13728 retry = FALSE;
13729
13730 if (*did_start_fop == TRUE) {
13731 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13732 needrecov);
13733 *did_start_fop = FALSE;
13734 }
13735
13736 if (retry == TRUE) {
13737 nfs_argop4 *argop;
13738
13739 argop = (*argspp)->array;
13740 ASSERT((*argspp)->array_len == 2);
13741
13742 if (argop[1].argop == OP_LOCK)
13743 nfs4args_lock_free(&argop[1]);
13744 else if (argop[1].argop == OP_LOCKT)
13745 nfs4args_lockt_free(&argop[1]);
13746 kmem_free(argop, 2 * sizeof (nfs_argop4));
13747 if (!ep->error)
13748 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13749 *respp = NULL;
13750 *argspp = NULL;
13751 }
13752
13753 if (lop != NULL) {
13754 nfs4_end_lock_seqid_sync(lop);
13755 lock_owner_rele(lop);
13756 }
13757
13758 *lopp = NULL;
13759
13760 /* need to free up the reference on osp for lock args */
13761 if (osp != NULL) {
13762 open_stream_rele(osp, rp);
13763 *ospp = NULL;
13764 }
13765
13766 /* need to free up the reference on oop for lock args */
13767 if (oop != NULL) {
13768 nfs4_end_open_seqid_sync(oop);
13769 open_owner_rele(oop);
13770 *oopp = NULL;
13771 }
13772
13773 return (retry);
13774 }
13775
13776 /*
13777 * Handles the successful reply from the server for nfs4frlock.
13778 */
13779 static void
13780 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13781 vnode_t *vp, int flag, u_offset_t offset,
13782 nfs4_lost_rqst_t *resend_rqstp)
13783 {
13784 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13785 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13786 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13787 if (ctype == NFS4_LCK_CTYPE_NORM) {
13788 flk->l_pid = ttoproc(curthread)->p_pid;
13789 /*
13790 * We do not register lost locks locally in
13791 * the 'resend' case since the user/application
13792 * doesn't think we have the lock.
13793 */
13794 ASSERT(!resend_rqstp);
13795 nfs4_register_lock_locally(vp, flk, flag, offset);
13796 }
13797 }
13798 }
13799
13800 /*
13801 * Handle the DENIED reply from the server for nfs4frlock.
13802 * Returns TRUE if we should retry the request; FALSE otherwise.
13803 *
13804 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13805 * COMPOUND4 args/res for calls that need to retry. Can also
13806 * drop and regrab the r_lkserlock.
13807 */
13808 static bool_t
13809 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13810 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13811 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13812 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13813 nfs4_recov_state_t *recov_statep, int needrecov,
13814 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13815 clock_t *tick_delayp, short *whencep, int *errorp,
13816 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13817 bool_t *skip_get_err)
13818 {
13819 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13820
13821 if (lock_args) {
13822 nfs4_open_owner_t *oop = *oopp;
13823 nfs4_open_stream_t *osp = *ospp;
13824 nfs4_lock_owner_t *lop = *lopp;
13825 int intr;
13826
13827 /*
13828 * Blocking lock needs to sleep and retry from the request.
13829 *
13830 * Do not block and wait for 'resend' or 'reinstate'
13831 * lock requests, just return the error.
13832 *
13833 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13834 */
13835 if (cmd == F_SETLKW) {
13836 rnode4_t *rp = VTOR4(vp);
13837 nfs_argop4 *argop = (*argspp)->array;
13838
13839 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13840
13841 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13842 recov_statep, needrecov);
13843 *did_start_fop = FALSE;
13844 ASSERT((*argspp)->array_len == 2);
13845 if (argop[1].argop == OP_LOCK)
13846 nfs4args_lock_free(&argop[1]);
13847 else if (argop[1].argop == OP_LOCKT)
13848 nfs4args_lockt_free(&argop[1]);
13849 kmem_free(argop, 2 * sizeof (nfs_argop4));
13850 if (*respp)
13851 xdr_free(xdr_COMPOUND4res_clnt,
13852 (caddr_t)*respp);
13853 *argspp = NULL;
13854 *respp = NULL;
13855 nfs4_end_lock_seqid_sync(lop);
13856 lock_owner_rele(lop);
13857 *lopp = NULL;
13858 if (osp != NULL) {
13859 open_stream_rele(osp, rp);
13860 *ospp = NULL;
13861 }
13862 if (oop != NULL) {
13863 nfs4_end_open_seqid_sync(oop);
13864 open_owner_rele(oop);
13865 *oopp = NULL;
13866 }
13867
13868 nfs_rw_exit(&rp->r_lkserlock);
13869
13870 intr = nfs4_block_and_wait(tick_delayp, rp);
13871
13872 if (intr) {
13873 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13874 RW_WRITER, FALSE);
13875 *errorp = EINTR;
13876 return (FALSE);
13877 }
13878
13879 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13880 RW_WRITER, FALSE);
13881
13882 /*
13883 * Make sure we are still safe to lock with
13884 * regards to mmapping.
13885 */
13886 if (!nfs4_safelock(vp, flk, cr)) {
13887 *errorp = EAGAIN;
13888 return (FALSE);
13889 }
13890
13891 return (TRUE);
13892 }
13893 if (ctype == NFS4_LCK_CTYPE_NORM)
13894 *errorp = EAGAIN;
13895 *skip_get_err = TRUE;
13896 flk->l_whence = 0;
13897 *whencep = 0;
13898 return (FALSE);
13899 } else if (lockt_args) {
13900 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13901 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13902
13903 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13904 flk, lockt_args);
13905
13906 /* according to NLM code */
13907 *errorp = 0;
13908 *whencep = 0;
13909 *skip_get_err = TRUE;
13910 return (FALSE);
13911 }
13912 return (FALSE);
13913 }
13914
13915 /*
13916 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13917 */
13918 static void
13919 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13920 {
13921 switch (resp->status) {
13922 case NFS4ERR_ACCESS:
13923 case NFS4ERR_ADMIN_REVOKED:
13924 case NFS4ERR_BADHANDLE:
13925 case NFS4ERR_BAD_RANGE:
13926 case NFS4ERR_BAD_SEQID:
13927 case NFS4ERR_BAD_STATEID:
13928 case NFS4ERR_BADXDR:
13929 case NFS4ERR_DEADLOCK:
13930 case NFS4ERR_DELAY:
13931 case NFS4ERR_EXPIRED:
13932 case NFS4ERR_FHEXPIRED:
13933 case NFS4ERR_GRACE:
13934 case NFS4ERR_INVAL:
13935 case NFS4ERR_ISDIR:
13936 case NFS4ERR_LEASE_MOVED:
13937 case NFS4ERR_LOCK_NOTSUPP:
13938 case NFS4ERR_LOCK_RANGE:
13939 case NFS4ERR_MOVED:
13940 case NFS4ERR_NOFILEHANDLE:
13941 case NFS4ERR_NO_GRACE:
13942 case NFS4ERR_OLD_STATEID:
13943 case NFS4ERR_OPENMODE:
13944 case NFS4ERR_RECLAIM_BAD:
13945 case NFS4ERR_RECLAIM_CONFLICT:
13946 case NFS4ERR_RESOURCE:
13947 case NFS4ERR_SERVERFAULT:
13948 case NFS4ERR_STALE:
13949 case NFS4ERR_STALE_CLIENTID:
13950 case NFS4ERR_STALE_STATEID:
13951 return;
13952 default:
13953 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13954 "nfs4frlock_results_default: got unrecognizable "
13955 "res.status %d", resp->status));
13956 *errorp = NFS4ERR_INVAL;
13957 }
13958 }
13959
13960 /*
13961 * The lock request was successful, so update the client's state.
13962 */
13963 static void
13964 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13965 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13966 vnode_t *vp, flock64_t *flk, cred_t *cr,
13967 nfs4_lost_rqst_t *resend_rqstp)
13968 {
13969 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13970
13971 if (lock_args) {
13972 LOCK4res *lock_res;
13973
13974 lock_res = &resop->nfs_resop4_u.oplock;
13975 /* update the stateid with server's response */
13976
13977 if (lock_args->locker.new_lock_owner == TRUE) {
13978 mutex_enter(&lop->lo_lock);
13979 lop->lo_just_created = NFS4_PERM_CREATED;
13980 mutex_exit(&lop->lo_lock);
13981 }
13982
13983 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13984
13985 /*
13986 * If the lock was the result of a resending a lost
13987 * request, we've synched up the stateid and seqid
13988 * with the server, but now the server might be out of sync
13989 * with what the application thinks it has for locks.
13990 * Clean that up here. It's unclear whether we should do
13991 * this even if the filesystem has been forcibly unmounted.
13992 * For most servers, it's probably wasted effort, but
13993 * RFC 7530 lets servers require that unlocks exactly match
13994 * the locks that are held.
13995 */
13996 if (resend_rqstp != NULL &&
13997 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13998 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13999 } else {
14000 flk->l_whence = 0;
14001 }
14002 } else if (locku_args) {
14003 LOCKU4res *locku_res;
14004
14005 locku_res = &resop->nfs_resop4_u.oplocku;
14006
14007 /* Update the stateid with the server's response */
14008 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
14009 } else if (lockt_args) {
14010 /* Switch the lock type to express success, see fcntl */
14011 flk->l_type = F_UNLCK;
14012 flk->l_whence = 0;
14013 }
14014 }
14015
14016 /*
14017 * Do final cleanup before exiting nfs4frlock.
14018 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14019 * COMPOUND4 args/res for calls that haven't already.
14020 */
static void
nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
    COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    short whence, u_offset_t offset, struct lm_sysid *ls,
    int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
    bool_t did_start_fop, bool_t skip_get_err,
    cred_t *cred_otw, cred_t *cred)
{
	mntinfo4_t	*mi = VTOMI4(vp);
	rnode4_t	*rp = VTOR4(vp);
	int		error = *errorp;
	nfs_argop4	*argop;
	int	do_flush_pages = 0;

	ASSERT(nfs_zone() == mi->mi_zone);
	/*
	 * The client recovery code wants the raw status information,
	 * so don't map the NFS status code to an errno value for
	 * non-normal call types.
	 */
	if (ctype == NFS4_LCK_CTYPE_NORM) {
		if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
			*errorp = geterrno4(resp->status);
		if (did_start_fop == TRUE)
			nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
			    needrecov);

		/*
		 * We've established a new lock on the server, so invalidate
		 * the pages associated with the vnode to get the most up to
		 * date pages from the server after acquiring the lock. We
		 * want to be sure that the read operation gets the newest data.
		 * N.B.
		 * We used to do this in nfs4frlock_results_ok but that doesn't
		 * work since VOP_PUTPAGE can call nfs4_commit which calls
		 * nfs4_start_fop. We flush the pages below after calling
		 * nfs4_end_fop above
		 * The flush of the page cache must be done after
		 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
		 */
		if (!error && resp && resp->status == NFS4_OK)
			do_flush_pages = 1;
	}
	if (argsp) {
		/* Compound is always CPUTFH + {LOCK|LOCKU|LOCKT}. */
		ASSERT(argsp->array_len == 2);
		argop = argsp->array;
		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (resp)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	}

	/* free the reference on the lock owner */
	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL)
		open_stream_rele(osp, rp);

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}

	/*
	 * Deferred page flush (see comment above): safe only now that
	 * nfs4_end_open_seqid_sync() has been called.
	 */
	if (do_flush_pages)
		nfs4_flush_pages(vp, cred);

	/* Convert flk back to the caller's original whence/offset basis. */
	(void) convoff(vp, flk, whence, offset);

	lm_rel_sysid(ls);

	/*
	 * Record debug information in the event we get EINVAL.
	 */
	mutex_enter(&mi->mi_lock);
	if (*errorp == EINVAL && (lock_args || locku_args) &&
	    (!(mi->mi_flags & MI4_POSIX_LOCK))) {
		/* Warn only once per mount (MI4_LOCK_DEBUG latches). */
		if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
			zcmn_err(getzoneid(), CE_NOTE,
			    "%s operation failed with "
			    "EINVAL probably since the server, %s,"
			    " doesn't support POSIX style locking",
			    lock_args ? "LOCK" : "LOCKU",
			    mi->mi_curr_serv->sv_hostname);
			mi->mi_flags |= MI4_LOCK_DEBUG;
		}
	}
	mutex_exit(&mi->mi_lock);

	if (cred_otw)
		crfree(cred_otw);
}
14122
14123 /*
14124 * This calls the server and the local locking code.
14125 *
 * Client locks are registered locally by OR-ing the sysid with
14127 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14128 * We need to distinguish between the two to avoid collision in case one
14129 * machine is used as both client and server.
14130 *
14131 * Blocking lock requests will continually retry to acquire the lock
14132 * forever.
14133 *
14134 * The ctype is defined as follows:
14135 * NFS4_LCK_CTYPE_NORM: normal lock request.
14136 *
14137 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14138 * recovery, get the pid from flk instead of curproc, and don't reregister
14139 * the lock locally.
14140 *
14141 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14142 * that we will use the information passed in via resend_rqstp to setup the
14143 * lock/locku request. This resend is the exact same request as the 'lost
14144 * lock', and is initiated by the recovery framework. A successful resend
14145 * request can initiate one or more reinstate requests.
14146 *
14147 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14148 * does not trigger additional reinstate requests. This lock call type is
14149 * set for setting the v4 server's locking state back to match what the
14150 * client's local locking state is in the event of a received 'lost lock'.
14151 *
14152 * Errors are returned via the nfs4_error_t parameter.
14153 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt	args, *argsp = NULL;
	COMPOUND4res_clnt	res, *resp = NULL;
	nfs_argop4	*argop;
	nfs_resop4	*resop;
	rnode4_t	*rp;
	int	doqueue = 1;
	clock_t	tick_delay;	/* delay in clock ticks */
	struct lm_sysid	*ls;
	LOCK4args	*lock_args = NULL;
	LOCKU4args	*locku_args = NULL;
	LOCKT4args	*lockt_args = NULL;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	nfs4_lock_owner_t	*lop = NULL;
	bool_t		needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short		whence;
	nfs4_op_hint_t	op_hint;
	nfs4_lost_rqst_t	lost_rqst;
	bool_t		retry = FALSE;
	bool_t		did_start_fop = FALSE;
	bool_t		skip_get_err = FALSE;
	cred_t		*cred_otw = NULL;
	bool_t		recovonly;	/* just queue request */
	int		frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Re-entry point after recovery, a credential swap, or an
	 * NFS4ERR_DELAY retry.  Each pass re-allocates the compound
	 * args/res, so cleanup must have been done before jumping here.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* CPUTFH: set the current filehandle to the file being locked */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * Tear down this attempt's fop and compound
				 * args, then retry the whole call from
				 * scratch.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A previous NFS4ERR_NO_GRACE reply forced us to give up on
	 * reclaiming; retry this reclaim as an ordinary lock request.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		/* Retry once more with the caller's own credential. */
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* recovery dropped all our state references */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set.  Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch.  Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14504
14505 /*
14506 * nfs4_safelock:
14507 *
14508 * Return non-zero if the given lock request can be handled without
14509 * violating the constraints on concurrent mapping and locking.
14510 */
14511
14512 static int
14513 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14514 {
14515 rnode4_t *rp = VTOR4(vp);
14516 struct vattr va;
14517 int error;
14518
14519 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14520 ASSERT(rp->r_mapcnt >= 0);
14521 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14522 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14523 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14524 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14525
14526 if (rp->r_mapcnt == 0)
14527 return (1); /* always safe if not mapped */
14528
14529 /*
14530 * If the file is already mapped and there are locks, then they
14531 * should be all safe locks. So adding or removing a lock is safe
14532 * as long as the new request is safe (i.e., whole-file, meaning
14533 * length and starting offset are both zero).
14534 */
14535
14536 if (bfp->l_start != 0 || bfp->l_len != 0) {
14537 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14538 "cannot lock a memory mapped file unless locking the "
14539 "entire file: start %"PRIx64", len %"PRIx64,
14540 bfp->l_start, bfp->l_len));
14541 return (0);
14542 }
14543
14544 /* mandatory locking and mapping don't mix */
14545 va.va_mask = AT_MODE;
14546 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14547 if (error != 0) {
14548 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14549 "getattr error %d", error));
14550 return (0); /* treat errors conservatively */
14551 }
14552 if (MANDLOCK(vp, va.va_mode)) {
14553 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14554 "cannot mandatory lock and mmap a file"));
14555 return (0);
14556 }
14557
14558 return (1);
14559 }
14560
14561
14562 /*
14563 * Register the lock locally within Solaris.
14564 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14565 * recording locks locally.
14566 *
14567 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14568 * are registered locally.
14569 */
void
nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
    u_offset_t offset)
{
	int oldsysid;
	int error;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug,
	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
	    flk->l_sysid));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * Register the lock with local locking.  Temporarily tag the
	 * sysid with LM_SYSID_CLIENT so client-held locks don't collide
	 * with locks the same host registers as an NFS server; the
	 * caller's flk->l_sysid is restored before returning.
	 */
	oldsysid = flk->l_sysid;
	flk->l_sysid |= LM_SYSID_CLIENT;
	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
#ifdef DEBUG
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4_register_lock_locally: could not register with"
		    " local locking"));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "error %d, vp 0x%p, pid %d, sysid 0x%x",
		    error, (void *)vp, flk->l_pid, flk->l_sysid));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_type, flk->l_start, flk->l_len));
		/*
		 * Query (cmd 0) reclock to find out which lock blocked
		 * us; the result is written back into *flk for the
		 * debug message below.
		 */
		(void) reclock(vp, flk, 0, flag, offset, NULL);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "blocked by pid %d sysid 0x%x type %d "
		    "off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
		    flk->l_len));
	}
#endif
	flk->l_sysid = oldsysid;
}
14617
14618 /*
14619 * nfs4_lockrelease:
14620 *
14621 * Release any locks on the given vnode that are held by the current
14622 * process. Also removes the lock owner (if one exists) from the rnode's
14623 * list.
14624 */
static int
nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	int ret, error;
	rnode4_t *rp;
	nfs4_lock_owner_t *lop;
	nfs4_recov_state_t recov_state;
	mntinfo4_t *mi;
	bool_t possible_orphan = FALSE;
	bool_t recovonly;

	ASSERT((uintptr_t)vp > KERNELBASE);
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we have not locked anything then we can
	 * just return since we have no work to do.
	 */
	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
		return (0);
	}

	/*
	 * We need to comprehend that another thread may
	 * kick off recovery and the lock_owner we have stashed
	 * in lop might be invalid so we should NOT cache it
	 * locally!
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		/* Flag the rnode so recovery knows lock owners may dangle. */
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	/*
	 * Check if the lock owner might have a lock (request was sent but
	 * no response was received).  Also check if there are any remote
	 * locks on the file.  (In theory we shouldn't have to make this
	 * second check if there's no lock owner, but for now we'll be
	 * conservative and do it anyway.)  If either condition is true,
	 * send an unlock for the entire file to the server.
	 *
	 * Note that no explicit synchronization is needed here.  At worst,
	 * flk_has_remote_locks() will return a false positive, in which case
	 * the unlock call wastes time but doesn't harm correctness.
	 */

	if (lop) {
		mutex_enter(&lop->lo_lock);
		possible_orphan = lop->lo_pending_rqsts;
		mutex_exit(&lop->lo_lock);
		/* Release the reference returned by find_lock_owner() */
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
	    (void *)lop));

	if (possible_orphan || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;    /* set to unlock entire file */
		ld.l_whence = 0;	/* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;		/* do entire file */

		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
		    cr, NULL);

		if (ret != 0) {
			/*
			 * If VOP_FRLOCK fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			nfs4_register_lock_locally(vp, &ld, flag, offset);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_lockrelease: lock release error on vp"
			    " %p: error %d.\n", (void *)vp, ret));
		}
	}

	/*
	 * Re-enter the fop: the VOP_FRLOCK above may have allowed
	 * recovery to run, so synchronize with it again before touching
	 * the lock-owner list.
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * So, here we're going to need to retrieve the lock-owner
	 * again (in case recovery has done a switch-a-roo) and
	 * remove it because we can.
	 */
	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	if (lop) {
		nfs4_rnode_remove_lock_owner(rp, lop);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
	return (0);
}
14745
14746 /*
14747 * Wait for 'tick_delay' clock ticks.
14748 * Implement exponential backoff until hit the lease_time of this nfs4_server.
14749 * NOTE: lock_lease_time is in seconds.
14750 *
14751 * XXX For future improvements, should implement a waiting queue scheme.
14752 */
14753 static int
14754 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14755 {
14756 long milliseconds_delay;
14757 time_t lock_lease_time;
14758
14759 /* wait tick_delay clock ticks or siginteruptus */
14760 if (delay_sig(*tick_delay)) {
14761 return (EINTR);
14762 }
14763 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14764 "reissue the lock request: blocked for %ld clock ticks: %ld "
14765 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14766
14767 /* get the lease time */
14768 lock_lease_time = r2lease_time(rp);
14769
14770 /* drv_hztousec converts ticks to microseconds */
14771 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14772 if (milliseconds_delay < lock_lease_time * 1000) {
14773 *tick_delay = 2 * *tick_delay;
14774 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14775 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14776 }
14777 return (0);
14778 }
14779
14780
/* No module-wide initialization is currently needed for the v4 vnode ops. */
void
nfs4_vnops_init(void)
{
}
14785
/* Counterpart to nfs4_vnops_init(); nothing to tear down at present. */
void
nfs4_vnops_fini(void)
{
}
14790
14791 /*
14792 * Return a reference to the directory (parent) vnode for a given vnode,
14793 * using the saved pathname information and the directory file handle. The
14794 * caller is responsible for disposing of the reference.
14795 * Returns zero or an errno value.
14796 *
14797 * Caller should set need_start_op to FALSE if it is the recovery
14798 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14799 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		/*
		 * "File" mount: vp is the root of the filesystem, so the
		 * parent filehandle comes from the servinfo4 (sv_pfhandle)
		 * rather than from shadow-vnode state.
		 */
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		mfname = mi->mi_fname;
		fn_hold(mfname);
		/*
		 * NOTE(review): dvp is dereferenced without a NULL check
		 * here; presumably makenfs4node_by_fh cannot fail in this
		 * path -- confirm against its implementation.
		 */
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	/* Look up ".." via the saved directory filehandle. */
	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		/* Parent of an extended-attribute file is the xattr dir. */
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
14873
14874 /*
14875 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14876 * length that fnamep can accept, including the trailing null.
14877 * Returns 0 if okay, returns an errno value if there was a problem.
14878 */
14879
14880 int
14881 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14882 {
14883 char *fn;
14884 int err = 0;
14885 servinfo4_t *svp;
14886 svnode_t *shvp;
14887
14888 /*
14889 * If the file being opened has VROOT set, then this is
14890 * a "file" mount. sv_name will not be interesting, so
14891 * go back to the servinfo4 to get the original mount
14892 * path and strip off all but the final edge. Otherwise
14893 * just return the name from the shadow vnode.
14894 */
14895
14896 if (vp->v_flag & VROOT) {
14897
14898 svp = VTOMI4(vp)->mi_curr_serv;
14899 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14900
14901 fn = strrchr(svp->sv_path, '/');
14902 if (fn == NULL)
14903 err = EINVAL;
14904 else
14905 fn++;
14906 } else {
14907 shvp = VTOSV(vp);
14908 fn = fn_name(shvp->sv_name);
14909 }
14910
14911 if (err == 0)
14912 if (strlen(fn) < maxlen)
14913 (void) strcpy(fnamep, fn);
14914 else
14915 err = ENAMETOOLONG;
14916
14917 if (vp->v_flag & VROOT)
14918 nfs_rw_exit(&svp->sv_lock);
14919 else
14920 kmem_free(fn, MAXNAMELEN);
14921
14922 return (err);
14923 }
14924
14925 /*
14926 * Bookkeeping for a close that doesn't need to go over the wire.
14927 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14928 * it is left at 1.
14929 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* Nothing to do if the stream is gone or still in use. */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count can not go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	/* Drop os_sync_lock and tell the caller we did so. */
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}
14967
14968 /*
14969 * Close all remaining open streams on the rnode. These open streams
14970 * could be here because:
14971 * - The close attempted at either close or delmap failed
14972 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14973 * - Someone did mknod on a regular file but never opened it
14974 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1. Since then,
	 * the vnode could have been re-activated. We want to
	 * loop through the open streams and close each one, but
	 * we have to be careful since once we release the rnode
	 * hash bucket lock, someone else is free to come in and
	 * re-activate the rnode and add new open streams. The
	 * strategy is take the rnode hash bucket lock, verify that
	 * the count is still 1, grab the open stream off the
	 * head of the list and mark it invalid, then release the
	 * rnode hash bucket lock and proceed with that open stream.
	 * This is ok because nfs4close_one() will acquire the proper
	 * open/create to close/destroy synchronization for open
	 * streams, and will ensure that if someone has reopened
	 * the open stream after we've dropped the hash bucket lock
	 * then we'll just simply return without destroying the
	 * open stream.
	 * Repeat until the list is empty.
	 */

	for (;;) {

		/* make sure vnode hasn't been reactivated */
		rw_enter(&rp->r_hashq->r_lock, RW_READER);
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			mutex_exit(&vp->v_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}
		/*
		 * Grabbing r_os_lock before releasing v_lock prevents
		 * a window where the rnode/open stream could get
		 * reactivated (and os_force_close set to 0) before we
		 * had a chance to set os_force_close to 1.
		 */
		mutex_enter(&rp->r_os_lock);
		mutex_exit(&vp->v_lock);

		osp = list_head(&rp->r_open_streams);
		if (!osp) {
			/* nothing left to CLOSE OTW, so return */
			mutex_exit(&rp->r_os_lock);
			rw_exit(&rp->r_hashq->r_lock);
			break;
		}

		mutex_enter(&rp->r_statev4_lock);
		/* the file can't still be mem mapped */
		ASSERT(rp->r_mapcnt == 0);
		/* clear any leftover "created by mknod" state */
		if (rp->created_v4)
			rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		/*
		 * Grab a ref on this open stream; nfs4close_one
		 * will mark it as invalid
		 */
		mutex_enter(&osp->os_sync_lock);
		osp->os_ref_count++;
		osp->os_force_close = 1;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_os_lock);
		rw_exit(&rp->r_hashq->r_lock);

		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);

		/* Update error if it isn't already non-zero */
		if (error == 0) {
			if (e.error)
				error = e.error;
			else if (e.stat)
				error = geterrno4(e.stat);
		}

#ifdef	DEBUG
		nfs4close_all_cnt++;
#endif
		/* Release the ref on osp acquired above. */
		open_stream_rele(osp, rp);

		/* Proceed to the next open stream, if any */
	}
	return (error);
}
15074
15075 /*
15076 * nfs4close_one - close one open stream for a file if needed.
15077 *
15078 * "close_type" indicates which close path this is:
15079 * CLOSE_NORM: close initiated via VOP_CLOSE.
15080 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15081 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15082 * the close and release of client state for this open stream
15083 * (unless someone else has the open stream open).
15084 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15085 * (e.g., due to abort because of a signal).
15086 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15087 *
15088 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15089 * recovery. Instead, the caller is expected to deal with retries.
15090 *
15091 * The caller can either pass in the osp ('provided_osp') or not.
15092 *
15093 * 'access_bits' represents the access we are closing/downgrading.
15094 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15098 *
15099 * Errors are returned via the nfs4_error_t.
15100 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	int did_dec_count = 0;
	/* cleanup flags: track what must be undone at 'out' */
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
	/* Reset all per-attempt state each time through recov_retry. */
recov_retry:
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes. Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef	NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef	NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
		close_failed = 1;
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too. So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * If the force close path failed to obtain start_fop
	 * then skip the OTW close and just remove the state.
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream. If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t new_lost_rqst;
		bool_t needrecov = FALSE;
		cred_t *odg_cred_otw = NULL;
		seqid4 open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep, NULL, NULL);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close. Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery. In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	/* Common exit: undo exactly what the flags say we acquired. */
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}
15609
15610 /*
15611 * Convert information returned by the server in the LOCK4denied
15612 * structure to the form required by fcntl.
15613 */
15614 static void
15615 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15616 {
15617 nfs4_lo_name_t *lo;
15618
15619 #ifdef DEBUG
15620 if (denied_to_flk_debug) {
15621 lockt_denied_debug = lockt_denied;
15622 debug_enter("lockt_denied");
15623 }
15624 #endif
15625
15626 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15627 flk->l_whence = 0; /* aka SEEK_SET */
15628 flk->l_start = lockt_denied->offset;
15629 flk->l_len = lockt_denied->length;
15630
15631 /*
15632 * If the blocking clientid matches our client id, then we can
15633 * interpret the lockowner (since we built it). If not, then
15634 * fabricate a sysid and pid. Note that the l_sysid field
15635 * in *flk already has the local sysid.
15636 */
15637
15638 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15639
15640 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15641 lo = (nfs4_lo_name_t *)
15642 lockt_denied->owner.owner_val;
15643
15644 flk->l_pid = lo->ln_pid;
15645 } else {
15646 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15647 "denied_to_flk: bad lock owner length\n"));
15648
15649 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15650 }
15651 } else {
15652 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15653 "denied_to_flk: foreign clientid\n"));
15654
15655 /*
15656 * Construct a new sysid which should be different from
15657 * sysids of other systems.
15658 */
15659
15660 flk->l_sysid++;
15661 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15662 }
15663 }
15664
15665 static pid_t
15666 lo_to_pid(lock_owner4 *lop)
15667 {
15668 pid_t pid = 0;
15669 uchar_t *cp;
15670 int i;
15671
15672 cp = (uchar_t *)&lop->clientid;
15673
15674 for (i = 0; i < sizeof (lop->clientid); i++)
15675 pid += (pid_t)*cp++;
15676
15677 cp = (uchar_t *)lop->owner_val;
15678
15679 for (i = 0; i < lop->owner_len; i++)
15680 pid += (pid_t)*cp++;
15681
15682 return (pid);
15683 }
15684
15685 /*
15686 * Given a lock pointer, returns the length of that lock.
15687 * "end" is the last locked offset the "l_len" covers from
15688 * the start of the lock.
15689 */
15690 static off64_t
15691 lock_to_end(flock64_t *lock)
15692 {
15693 off64_t lock_end;
15694
15695 if (lock->l_len == 0)
15696 lock_end = (off64_t)MAXEND;
15697 else
15698 lock_end = lock->l_start + lock->l_len - 1;
15699
15700 return (lock_end);
15701 }
15702
15703 /*
15704 * Given the end of a lock, it will return you the length "l_len" for that lock.
15705 */
15706 static off64_t
15707 end_to_len(off64_t start, off64_t end)
15708 {
15709 off64_t lock_len;
15710
15711 ASSERT(end >= start);
15712 if (end == MAXEND)
15713 lock_len = 0;
15714 else
15715 lock_len = end - start + 1;
15716
15717 return (lock_len);
15718 }
15719
15720 /*
15721 * On given end for a lock it determines if it is the last locked offset
15722 * or not, if so keeps it as is, else adds one to return the length for
15723 * valid start.
15724 */
15725 static off64_t
15726 start_check(off64_t x)
15727 {
15728 if (x == MAXEND)
15729 return (x);
15730 else
15731 return (x + 1);
15732 }
15733
15734 /*
15735 * See if these two locks overlap, and if so return 1;
15736 * otherwise, return 0.
15737 */
15738 static int
15739 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15740 {
15741 off64_t llfp_end, curfp_end;
15742
15743 llfp_end = lock_to_end(llfp);
15744 curfp_end = lock_to_end(curfp);
15745
15746 if (((llfp_end >= curfp->l_start) &&
15747 (llfp->l_start <= curfp->l_start)) ||
15748 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15749 return (1);
15750 return (0);
15751 }
15752
15753 /*
15754 * Determine what the intersecting lock region is, and add that to the
15755 * 'nl_llpp' locklist in increasing order (by l_start).
15756 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	/* Nothing to add if the two ranges don't overlap at all. */
	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order. This list contains
	 * intersections of locks registered by the client with the local host
	 * and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	/* Walk to the insertion point: cur_fllp trails tmp_fllp. */
	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
15825
15826 /*
15827 * Our local locking current state is potentially different than
15828 * what the NFSv4 server thinks we have due to a lost lock that was
15829 * resent and then received. We need to reset our "NFSv4" locking
15830 * state to match the current local locking state for this pid since
15831 * that is what the user/application sees as what the world is.
15832 *
15833 * We cannot afford to drop the open/lock seqid sync since then we can
15834 * get confused about what the current local locking state "is" versus
15835 * "was".
15836 *
15837 * If we are unable to fix up the locks, we send SIGLOST to the affected
15838 * process. This is not done if the filesystem has been forcibly
15839 * unmounted, in case the process has already exited and a new process
15840 * exists with the same pid.
15841 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find out the locks that intersect with
	 * the lost lock. Once we find the lock that intersects, add the
	 * intersection area as a new lock to a new list "ri_llp". The lock
	 * type of the intersection region lock added to ri_llp is the same
	 * as that found in the active lock list, "list". The intersecting
	 * region locks are added to ri_llp in increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock. These are
	 * the locks that were/are active before the server replied to the
	 * last/lost lock. Issue these locks to the server here. Playing these
	 * locks to the server will re-establish our current local locking state
	 * with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Now keeping the start of the lost lock as our reference parse the
	 * newly created ri_llp locklist to find the ranges that we have locked
	 * with the v4 server but not in the current local locking. We need
	 * to unlock these ranges.
	 * These ranges can also be referred to as those ranges, where the lost
	 * lock does not overlap with the locks in the ri_llp but are locked
	 * since the server replied to the lost lock.
	 */
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	/* Template for the unlock requests; only start/len vary below. */
	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		/* No gap before this intersection; skip past it. */
		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		/* Unlock the gap [cur_start, llp start - 1]. */
		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock? if so unlock till the end
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return back to the original calling nfs4frlock()
	 * and let us naturally drop our seqid syncs.
	 */
}
15980
15981 /*
15982 * Create a lost state record for the given lock reinstantiation request
15983 * and push it onto the lost state queue.
15984 */
15985 static void
15986 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15987 nfs4_lock_owner_t *lop)
15988 {
15989 nfs4_lost_rqst_t req;
15990 nfs_lock_type4 locktype;
15991 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15992
15993 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15994
15995 locktype = flk_to_locktype(cmd, flk->l_type);
15996 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
15997 NULL, NULL, lop, flk, &req, cr, vp);
15998 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
15999 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
16000 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
16001 NULL, NULL, NULL);
16002 }