1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2016 STRATO AG. All rights reserved.
24 */
25
26 /*
27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 /*
31 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
32 * Use is subject to license terms.
33 */
34
35 /*
36 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37 * All Rights Reserved
38 */
39
40 /*
41 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
42 */
43
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/systm.h>
47 #include <sys/cred.h>
48 #include <sys/time.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vfs_opreg.h>
52 #include <sys/file.h>
53 #include <sys/filio.h>
54 #include <sys/uio.h>
55 #include <sys/buf.h>
56 #include <sys/mman.h>
57 #include <sys/pathname.h>
58 #include <sys/dirent.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/fcntl.h>
62 #include <sys/flock.h>
63 #include <sys/swap.h>
64 #include <sys/errno.h>
65 #include <sys/strsubr.h>
66 #include <sys/sysmacros.h>
67 #include <sys/kmem.h>
68 #include <sys/cmn_err.h>
69 #include <sys/pathconf.h>
70 #include <sys/utsname.h>
71 #include <sys/dnlc.h>
72 #include <sys/acl.h>
73 #include <sys/systeminfo.h>
74 #include <sys/policy.h>
75 #include <sys/sdt.h>
76 #include <sys/list.h>
77 #include <sys/stat.h>
78 #include <sys/zone.h>
79
80 #include <rpc/types.h>
81 #include <rpc/auth.h>
82 #include <rpc/clnt.h>
83
84 #include <nfs/nfs.h>
85 #include <nfs/nfs_clnt.h>
86 #include <nfs/nfs_acl.h>
87 #include <nfs/lm.h>
88 #include <nfs/nfs4.h>
89 #include <nfs/nfs4_kprot.h>
90 #include <nfs/rnode4.h>
91 #include <nfs/nfs4_clnt.h>
92
93 #include <vm/hat.h>
94 #include <vm/as.h>
95 #include <vm/page.h>
96 #include <vm/pvn.h>
97 #include <vm/seg.h>
98 #include <vm/seg_map.h>
99 #include <vm/seg_kpm.h>
100 #include <vm/seg_vn.h>
101
102 #include <fs/fs_subr.h>
103
104 #include <sys/ddi.h>
105 #include <sys/int_fmtio.h>
106 #include <sys/fs/autofs.h>
107
/*
 * Bundle of directory-attribute information passed to
 * nfs4_update_dircaches() after an over-the-wire operation.
 */
typedef struct {
	nfs4_ga_res_t *di_garp;		/* attribute results; may be NULL */
	cred_t *di_cred;		/* credential used for the operation */
	hrtime_t di_time_call;		/* hrtime when the OTW call was made */
} dirattr_info_t;
113
/*
 * Direction of an ACL operation (get vs. set); used when validating
 * vsecattr masks (see nfs4_is_acl_mask_valid()).
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;
118
119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120
121 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122 char *, dirattr_info_t *);
123
124 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126 nfs4_error_t *, int *);
127 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128 cred_t *);
129 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130 stable_how4 *);
131 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132 cred_t *, bool_t, struct uio *);
133 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134 vsecattr_t *);
135 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141 int, vnode_t **, cred_t *);
142 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143 cred_t *, int, int, enum createmode4, int);
144 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145 caller_context_t *);
146 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147 vnode_t *, char *, cred_t *, nfsstat4 *);
148 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149 vnode_t *, char *, cred_t *, nfsstat4 *);
150 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154 page_t *[], size_t, struct seg *, caddr_t,
155 enum seg_rw, cred_t *);
156 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157 cred_t *);
158 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159 int, cred_t *);
160 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161 int, cred_t *);
162 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163 static void nfs4_set_mod(vnode_t *);
164 static void nfs4_get_commit(vnode_t *);
165 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169 cred_t *);
170 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171 cred_t *);
172 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173 hrtime_t, vnode_t *, cred_t *);
174 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177 u_offset_t);
178 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
180 static cred_t *state_to_cred(nfs4_open_stream_t *);
181 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182 static pid_t lo_to_pid(lock_owner4 *);
183 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184 cred_t *, nfs4_lock_owner_t *);
185 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186 nfs4_lock_owner_t *);
187 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188 static void nfs4_delmap_callback(struct as *, void *, uint_t);
189 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
190 static nfs4_delmapcall_t *nfs4_init_delmapcall();
191 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194 uid_t, gid_t, int);
195
196 /*
197 * Routines that implement the setting of v4 args for the misc. ops
198 */
199 static void nfs4args_lock_free(nfs_argop4 *);
200 static void nfs4args_lockt_free(nfs_argop4 *);
201 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202 int, rnode4_t *, cred_t *, bitmap4, int *,
203 nfs4_stateid_types_t *);
204 static void nfs4args_setattr_free(nfs_argop4 *);
205 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206 bitmap4);
207 static void nfs4args_verify_free(nfs_argop4 *);
208 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209 WRITE4args **, nfs4_stateid_types_t *);
210
211 /*
212 * These are the vnode ops functions that implement the vnode interface to
213 * the networked file system. See more comments below at nfs4_vnodeops.
214 */
215 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217 caller_context_t *);
218 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219 caller_context_t *);
220 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221 caller_context_t *);
222 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223 caller_context_t *);
224 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225 caller_context_t *);
226 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228 caller_context_t *);
229 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231 int, vnode_t **, cred_t *, int, caller_context_t *,
232 vsecattr_t *);
233 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234 int);
235 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236 caller_context_t *, int);
237 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238 caller_context_t *, int);
239 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240 cred_t *, caller_context_t *, int, vsecattr_t *);
241 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242 caller_context_t *, int);
243 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244 cred_t *, caller_context_t *, int);
245 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246 caller_context_t *, int);
247 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249 page_t *[], size_t, struct seg *, caddr_t,
250 enum seg_rw, cred_t *, caller_context_t *);
251 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252 caller_context_t *);
253 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259 struct flk_callback *, cred_t *, caller_context_t *);
260 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261 cred_t *, caller_context_t *);
262 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265 cred_t *, caller_context_t *);
266 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267 caller_context_t *);
268 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269 caller_context_t *);
270 /*
271 * These vnode ops are required to be called from outside this source file,
272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273 * as static.
274 */
275 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276 caller_context_t *);
277 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278 int nfs4_lookup(vnode_t *, char *, vnode_t **,
279 struct pathname *, int, vnode_t *, cred_t *,
280 caller_context_t *, int *, pathname_t *);
281 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
283 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286 caller_context_t *);
287 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288 caller_context_t *);
289 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290 caller_context_t *);
291
292 /*
293 * Used for nfs4_commit_vp() to indicate if we should
294 * wait on pending writes.
295 */
296 #define NFS4_WRITE_NOWAIT 0
297 #define NFS4_WRITE_WAIT 1
298
299 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */
300
301 /*
302 * Error flags used to pass information about certain special errors
303 * which need to be handled specially.
304 */
305 #define NFS_EOF -98
306 #define NFS_VERF_MISMATCH -97
307
308 /*
309 * Flags used to differentiate between which operation drove the
310 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
311 */
312 #define NFS4_CLOSE_OP 0x1
313 #define NFS4_DELMAP_OP 0x2
314 #define NFS4_INACTIVE_OP 0x3
315
/*
 * ISVDEV() is true for vnode types that represent device/fifo special
 * files.  The argument is parenthesized so that expression arguments
 * (e.g. a ?: expression) expand correctly.
 */
#define	ISVDEV(t) (((t) == VBLK) || ((t) == VCHR) || ((t) == VFIFO))
317
/*
 * ALIGN64 aligns the given buffer pointer up to a 64-bit boundary and
 * shrinks the buffer size to match; 'x' receives the number of pad
 * bytes skipped (0 if 'ptr' was already aligned).  Wrapped in
 * do { } while (0) so the macro expands safely as a single statement
 * (e.g. as the body of an unbraced if/else).
 */
#define	ALIGN64(x, ptr, sz)						\
	do {								\
		x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
		if (x) {						\
			x = sizeof (uint64_t) - (x);			\
			sz -= (x);					\
			ptr += (x);					\
		}							\
	} while (0)
326
#ifdef DEBUG
/*
 * DEBUG-only tunables and event counters.  These are normally examined
 * or tweaked with a kernel debugger; none of them affect correctness.
 */
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

/* NOTE(review): appears to be a read fault-injection knob; confirm in nfs4read */
int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

/* pathconf cache effectiveness counters */
static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif
369
370 /*
371 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
372 * or NFS4ERR_RESOURCE.
373 */
374 static int confirm_retry_sec = 30;
375
376 static int nfs4_lookup_neg_cache = 1;
377
378 /*
379 * number of pages to read ahead
380 * optimized for 100 base-T.
381 */
382 static int nfs4_nra = 4;
383
384 static int nfs4_do_symlink_cache = 1;
385
386 static int nfs4_pathconf_disable_cache = 0;
387
388 /*
389 * These are the vnode ops routines which implement the vnode interface to
390 * the networked file system. These routines just take their parameters,
391 * make them look networkish by putting the right info into interface structs,
392 * and then calling the appropriate remote routine(s) to do the work.
393 *
394 * Note on directory name lookup cacheing: If we detect a stale fhandle,
395 * we purge the directory cache relative to that vnode. This way, the
396 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
397 * more details on rnode locking.
398 */
399
/*
 * Vnode operations vector for NFSv4 client files; built from
 * nfs4_vnodeops_template below.
 */
struct vnodeops *nfs4_vnodeops;

const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
446
447 /*
448 * The following are subroutines and definitions to set args or get res
449 * for the different nfsv4 ops
450 */
451
452 void
453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454 {
455 int i;
456
457 for (i = 0; i < arglen; i++) {
458 if (argop[i].argop == OP_LOOKUP) {
459 kmem_free(
460 argop[i].nfs_argop4_u.oplookup.
461 objname.utf8string_val,
462 argop[i].nfs_argop4_u.oplookup.
463 objname.utf8string_len);
464 }
465 }
466 }
467
468 static void
469 nfs4args_lock_free(nfs_argop4 *argop)
470 {
471 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472
473 if (locker->new_lock_owner == TRUE) {
474 open_to_lock_owner4 *open_owner;
475
476 open_owner = &locker->locker4_u.open_owner;
477 if (open_owner->lock_owner.owner_val != NULL) {
478 kmem_free(open_owner->lock_owner.owner_val,
479 open_owner->lock_owner.owner_len);
480 }
481 }
482 }
483
484 static void
485 nfs4args_lockt_free(nfs_argop4 *argop)
486 {
487 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488
489 if (lowner->owner_val != NULL) {
490 kmem_free(lowner->owner_val, lowner->owner_len);
491 }
492 }
493
494 static void
495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497 nfs4_stateid_types_t *sid_types)
498 {
499 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500 mntinfo4_t *mi;
501
502 argop->argop = OP_SETATTR;
503 /*
504 * The stateid is set to 0 if client is not modifying the size
505 * and otherwise to whatever nfs4_get_stateid() returns.
506 *
507 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508 * state struct could be found for the process/file pair. We may
509 * want to change this in the future (by OPENing the file). See
510 * bug # 4474852.
511 */
512 if (vap->va_mask & AT_SIZE) {
513
514 ASSERT(rp != NULL);
515 mi = VTOMI4(RTOV4(rp));
516
517 argop->nfs_argop4_u.opsetattr.stateid =
518 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519 OP_SETATTR, sid_types, FALSE);
520 } else {
521 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522 sizeof (stateid4));
523 }
524
525 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526 if (*error)
527 bzero(attr, sizeof (*attr));
528 }
529
/*
 * Free the fattr4 attribute data built for an OP_SETATTR argop by
 * nfs4args_setattr().
 */
static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}
535
536 static int
537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538 bitmap4 supp)
539 {
540 fattr4 *attr;
541 int error = 0;
542
543 argop->argop = op;
544 switch (op) {
545 case OP_VERIFY:
546 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547 break;
548 case OP_NVERIFY:
549 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550 break;
551 default:
552 return (EINVAL);
553 }
554 if (!error)
555 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556 if (error)
557 bzero(attr, sizeof (*attr));
558 return (error);
559 }
560
561 static void
562 nfs4args_verify_free(nfs_argop4 *argop)
563 {
564 switch (argop->argop) {
565 case OP_VERIFY:
566 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567 break;
568 case OP_NVERIFY:
569 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570 break;
571 default:
572 break;
573 }
574 }
575
576 static void
577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579 {
580 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582
583 argop->argop = OP_WRITE;
584 wargs->stable = stable;
585 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586 mi, OP_WRITE, sid_tp);
587 wargs->mblk = NULL;
588 *wargs_pp = wargs;
589 }
590
591 void
592 nfs4args_copen_free(OPEN4cargs *open_args)
593 {
594 if (open_args->owner.owner_val) {
595 kmem_free(open_args->owner.owner_val,
596 open_args->owner.owner_len);
597 }
598 if ((open_args->opentype == OPEN4_CREATE) &&
599 (open_args->mode != EXCLUSIVE4)) {
600 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601 }
602 }
603
604 /*
605 * XXX: This is referenced in modstubs.s
606 */
607 struct vnodeops *
608 nfs4_getvnodeops(void)
609 {
610 return (nfs4_vnodeops);
611 }
612
613 /*
614 * The OPEN operation opens a regular file.
615 */
616 /*ARGSUSED3*/
617 static int
618 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
619 {
620 vnode_t *dvp = NULL;
621 rnode4_t *rp, *drp;
622 int error;
623 int just_been_created;
624 char fn[MAXNAMELEN];
625
626 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
627 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
628 return (EIO);
629 rp = VTOR4(*vpp);
630
631 /*
632 * Check to see if opening something besides a regular file;
633 * if so skip the OTW call
634 */
635 if ((*vpp)->v_type != VREG) {
636 error = nfs4_open_non_reg_file(vpp, flag, cr);
637 return (error);
638 }
639
640 /*
641 * XXX - would like a check right here to know if the file is
642 * executable or not, so as to skip OTW
643 */
644
645 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
646 return (error);
647
648 drp = VTOR4(dvp);
649 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
650 return (EINTR);
651
652 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
653 nfs_rw_exit(&drp->r_rwlock);
654 return (error);
655 }
656
657 /*
658 * See if this file has just been CREATEd.
659 * If so, clear the flag and update the dnlc, which was previously
660 * skipped in nfs4_create.
661 * XXX need better serilization on this.
662 * XXX move this into the nf4open_otw call, after we have
663 * XXX acquired the open owner seqid sync.
664 */
665 mutex_enter(&rp->r_statev4_lock);
666 if (rp->created_v4) {
667 rp->created_v4 = 0;
668 mutex_exit(&rp->r_statev4_lock);
669
670 dnlc_update(dvp, fn, *vpp);
671 /* This is needed so we don't bump the open ref count */
672 just_been_created = 1;
673 } else {
674 mutex_exit(&rp->r_statev4_lock);
675 just_been_created = 0;
676 }
677
678 /*
679 * If caller specified O_TRUNC/FTRUNC, then be sure to set
680 * FWRITE (to drive successful setattr(size=0) after open)
681 */
682 if (flag & FTRUNC)
683 flag |= FWRITE;
684
685 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
686 just_been_created);
687
688 if (!error && !((*vpp)->v_flag & VROOT))
689 dnlc_update(dvp, fn, *vpp);
690
691 nfs_rw_exit(&drp->r_rwlock);
692
693 /* release the hold from vtodv */
694 VN_RELE(dvp);
695
696 /* exchange the shadow for the master vnode, if needed */
697
698 if (error == 0 && IS_SHADOW(*vpp, rp))
699 sv_exchange(vpp);
700
701 return (error);
702 }
703
704 /*
705 * See if there's a "lost open" request to be saved and recovered.
706 */
707 static void
708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710 vnode_t *dvp, OPEN4cargs *open_args)
711 {
712 vfs_t *vfsp;
713 char *srccfp;
714
715 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716
717 if (error != ETIMEDOUT && error != EINTR &&
718 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719 lost_rqstp->lr_op = 0;
720 return;
721 }
722
723 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724 "nfs4open_save_lost_rqst: error %d", error));
725
726 lost_rqstp->lr_op = OP_OPEN;
727
728 /*
729 * The vp (if it is not NULL) and dvp are held and rele'd via
730 * the recovery code. See nfs4_save_lost_rqst.
731 */
732 lost_rqstp->lr_vp = vp;
733 lost_rqstp->lr_dvp = dvp;
734 lost_rqstp->lr_oop = oop;
735 lost_rqstp->lr_osp = NULL;
736 lost_rqstp->lr_lop = NULL;
737 lost_rqstp->lr_cr = cr;
738 lost_rqstp->lr_flk = NULL;
739 lost_rqstp->lr_oacc = open_args->share_access;
740 lost_rqstp->lr_odeny = open_args->share_deny;
741 lost_rqstp->lr_oclaim = open_args->claim;
742 if (open_args->claim == CLAIM_DELEGATE_CUR) {
743 lost_rqstp->lr_ostateid =
744 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746 } else {
747 srccfp = open_args->open_claim4_u.cfile;
748 }
749 lost_rqstp->lr_ofile.utf8string_len = 0;
750 lost_rqstp->lr_ofile.utf8string_val = NULL;
751 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752 lost_rqstp->lr_putfirst = FALSE;
753 }
754
/*
 * Holder for the 64-bit verifier sent with EXCLUSIVE4 creates; the two
 * 32-bit halves are filled with a host-unique seconds/nanoseconds pair
 * and copied into createhow4_u.createverf (see nfs4open_otw()).
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
759
760 /*
761 * The OPEN operation creates and/or opens a regular file
762 *
763 * ARGSUSED
764 */
765 static int
766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768 enum createmode4 createmode, int file_just_been_created)
769 {
770 rnode4_t *rp;
771 rnode4_t *drp = VTOR4(dvp);
772 vnode_t *vp = NULL;
773 vnode_t *vpi = *vpp;
774 bool_t needrecov = FALSE;
775
776 int doqueue = 1;
777
778 COMPOUND4args_clnt args;
779 COMPOUND4res_clnt res;
780 nfs_argop4 *argop;
781 nfs_resop4 *resop;
782 int argoplist_size;
783 int idx_open, idx_fattr;
784
785 GETFH4res *gf_res = NULL;
786 OPEN4res *op_res = NULL;
787 nfs4_ga_res_t *garp;
788 fattr4 *attr = NULL;
789 struct nfs4_excl_time verf;
790 bool_t did_excl_setup = FALSE;
791 int created_osp;
792
793 OPEN4cargs *open_args;
794 nfs4_open_owner_t *oop = NULL;
795 nfs4_open_stream_t *osp = NULL;
796 seqid4 seqid = 0;
797 bool_t retry_open = FALSE;
798 nfs4_recov_state_t recov_state;
799 nfs4_lost_rqst_t lost_rqst;
800 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801 hrtime_t t;
802 int acc = 0;
803 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
804 cred_t *ncr = NULL;
805
806 nfs4_sharedfh_t *otw_sfh;
807 nfs4_sharedfh_t *orig_sfh;
808 int fh_differs = 0;
809 int numops, setgid_flag;
810 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811
812 /*
813 * Make sure we properly deal with setting the right gid on
814 * a newly created file to reflect the parent's setgid bit
815 */
816 setgid_flag = 0;
817 if (create_flag && in_va) {
818
819 /*
820 * If there is grpid mount flag used or
821 * the parent's directory has the setgid bit set
822 * _and_ the client was able to get a valid mapping
823 * for the parent dir's owner_group, we want to
824 * append NVERIFY(owner_group == dva.va_gid) and
825 * SETATTR to the CREATE compound.
826 */
827 mutex_enter(&drp->r_statelock);
828 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829 drp->r_attr.va_mode & VSGID) &&
830 drp->r_attr.va_gid != GID_NOBODY) {
831 in_va->va_mask |= AT_GID;
832 in_va->va_gid = drp->r_attr.va_gid;
833 setgid_flag = 1;
834 }
835 mutex_exit(&drp->r_statelock);
836 }
837
838 /*
839 * Normal/non-create compound:
840 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841 *
842 * Open(create) compound no setgid:
843 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844 * RESTOREFH + GETATTR
845 *
846 * Open(create) setgid:
847 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849 * NVERIFY(grp) + SETATTR
850 */
851 if (setgid_flag) {
852 numops = 10;
853 idx_open = 1;
854 idx_fattr = 3;
855 } else if (create_flag) {
856 numops = 7;
857 idx_open = 2;
858 idx_fattr = 4;
859 } else {
860 numops = 4;
861 idx_open = 1;
862 idx_fattr = 3;
863 }
864
865 args.array_len = numops;
866 argoplist_size = numops * sizeof (nfs_argop4);
867 argop = kmem_alloc(argoplist_size, KM_SLEEP);
868
869 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870 "open %s open flag 0x%x cred %p", file_name, open_flag,
871 (void *)cr));
872
873 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874 if (create_flag) {
875 /*
876 * We are to create a file. Initialize the passed in vnode
877 * pointer.
878 */
879 vpi = NULL;
880 } else {
881 /*
882 * Check to see if the client owns a read delegation and is
883 * trying to open for write. If so, then return the delegation
884 * to avoid the server doing a cb_recall and returning DELAY.
885 * NB - we don't use the statev4_lock here because we'd have
886 * to drop the lock anyway and the result would be stale.
887 */
888 if ((open_flag & FWRITE) &&
889 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891
892 /*
893 * If the file has a delegation, then do an access check up
		 * front. This avoids having to do an access check later after
895 * we've already done start_op, which could deadlock.
896 */
897 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898 if (open_flag & FREAD &&
899 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900 acc |= VREAD;
901 if (open_flag & FWRITE &&
902 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903 acc |= VWRITE;
904 }
905 }
906
907 drp = VTOR4(dvp);
908
909 recov_state.rs_flags = 0;
910 recov_state.rs_num_retry_despite_err = 0;
911 cred_otw = cr;
912
913 recov_retry:
914 fh_differs = 0;
915 nfs4_error_zinit(&e);
916
917 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918 if (e.error) {
919 if (ncr != NULL)
920 crfree(ncr);
921 kmem_free(argop, argoplist_size);
922 return (e.error);
923 }
924
925 args.ctag = TAG_OPEN;
926 args.array_len = numops;
927 args.array = argop;
928
929 /* putfh directory fh */
930 argop[0].argop = OP_CPUTFH;
931 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932
933 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934 argop[idx_open].argop = OP_COPEN;
935 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936 open_args->claim = CLAIM_NULL;
937
938 /* name of file */
939 open_args->open_claim4_u.cfile = file_name;
940 open_args->owner.owner_len = 0;
941 open_args->owner.owner_val = NULL;
942
943 if (create_flag) {
944 /* CREATE a file */
945 open_args->opentype = OPEN4_CREATE;
946 open_args->mode = createmode;
947 if (createmode == EXCLUSIVE4) {
948 if (did_excl_setup == FALSE) {
949 verf.seconds = zone_get_hostid(NULL);
950 if (verf.seconds != 0)
951 verf.nseconds = newnum();
952 else {
953 timestruc_t now;
954
955 gethrestime(&now);
956 verf.seconds = now.tv_sec;
957 verf.nseconds = now.tv_nsec;
958 }
959 /*
960 * Since the server will use this value for the
961 * mtime, make sure that it can't overflow. Zero
962 * out the MSB. The actual value does not matter
				 * here, only its uniqueness.
964 */
965 verf.seconds &= INT32_MAX;
966 did_excl_setup = TRUE;
967 }
968
969 /* Now copy over verifier to OPEN4args. */
970 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971 } else {
972 int v_error;
973 bitmap4 supp_attrs;
974 servinfo4_t *svp;
975
976 attr = &open_args->createhow4_u.createattrs;
977
978 svp = drp->r_server;
979 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980 supp_attrs = svp->sv_supp_attrs;
981 nfs_rw_exit(&svp->sv_lock);
982
983 /* GUARDED4 or UNCHECKED4 */
984 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985 supp_attrs);
986 if (v_error) {
987 bzero(attr, sizeof (*attr));
988 nfs4args_copen_free(open_args);
989 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990 &recov_state, FALSE);
991 if (ncr != NULL)
992 crfree(ncr);
993 kmem_free(argop, argoplist_size);
994 return (v_error);
995 }
996 }
997 } else {
998 /* NO CREATE */
999 open_args->opentype = OPEN4_NOCREATE;
1000 }
1001
1002 if (recov_state.rs_sp != NULL) {
1003 mutex_enter(&recov_state.rs_sp->s_lock);
1004 open_args->owner.clientid = recov_state.rs_sp->clientid;
1005 mutex_exit(&recov_state.rs_sp->s_lock);
1006 } else {
1007 /* XXX should we just fail here? */
1008 open_args->owner.clientid = 0;
1009 }
1010
1011 /*
1012 * This increments oop's ref count or creates a temporary 'just_created'
1013 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014 * completes.
1015 */
1016 mutex_enter(&VTOMI4(dvp)->mi_lock);
1017
1018 /* See if a permanent or just created open owner exists */
1019 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020 if (!oop) {
1021 /*
1022 * This open owner does not exist so create a temporary
1023 * just created one.
1024 */
1025 oop = create_open_owner(cr, VTOMI4(dvp));
1026 ASSERT(oop != NULL);
1027 }
1028 mutex_exit(&VTOMI4(dvp)->mi_lock);
1029
1030 /* this length never changes, do alloc before seqid sync */
1031 open_args->owner.owner_len = sizeof (oop->oo_name);
1032 open_args->owner.owner_val =
1033 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034
1035 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036 if (e.error == EAGAIN) {
1037 open_owner_rele(oop);
1038 nfs4args_copen_free(open_args);
1039 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040 if (ncr != NULL) {
1041 crfree(ncr);
1042 ncr = NULL;
1043 }
1044 goto recov_retry;
1045 }
1046
1047 /* Check to see if we need to do the OTW call */
1048 if (!create_flag) {
1049 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050 file_just_been_created, &e.error, acc, &recov_state)) {
1051
1052 /*
1053 * The OTW open is not necessary. Either
1054 * the open can succeed without it (eg.
1055 * delegation, error == 0) or the open
1056 * must fail due to an access failure
1057 * (error != 0). In either case, tidy
1058 * up and return.
1059 */
1060
1061 nfs4_end_open_seqid_sync(oop);
1062 open_owner_rele(oop);
1063 nfs4args_copen_free(open_args);
1064 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065 if (ncr != NULL)
1066 crfree(ncr);
1067 kmem_free(argop, argoplist_size);
1068 return (e.error);
1069 }
1070 }
1071
1072 bcopy(&oop->oo_name, open_args->owner.owner_val,
1073 open_args->owner.owner_len);
1074
1075 seqid = nfs4_get_open_seqid(oop) + 1;
1076 open_args->seqid = seqid;
1077 open_args->share_access = 0;
1078 if (open_flag & FREAD)
1079 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080 if (open_flag & FWRITE)
1081 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083
1084
1085
1086 /*
1087 * getfh w/sanity check for idx_open/idx_fattr
1088 */
1089 ASSERT((idx_open + 1) == (idx_fattr - 1));
1090 argop[idx_open + 1].argop = OP_GETFH;
1091
1092 /* getattr */
1093 argop[idx_fattr].argop = OP_GETATTR;
1094 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096
1097 if (setgid_flag) {
1098 vattr_t _v;
1099 servinfo4_t *svp;
1100 bitmap4 supp_attrs;
1101
1102 svp = drp->r_server;
1103 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104 supp_attrs = svp->sv_supp_attrs;
1105 nfs_rw_exit(&svp->sv_lock);
1106
1107 /*
1108 * For setgid case, we need to:
1109 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110 */
1111 argop[4].argop = OP_SAVEFH;
1112
1113 argop[5].argop = OP_CPUTFH;
1114 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115
1116 argop[6].argop = OP_GETATTR;
1117 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119
1120 argop[7].argop = OP_RESTOREFH;
1121
1122 /*
1123 * nverify
1124 */
1125 _v.va_mask = AT_GID;
1126 _v.va_gid = in_va->va_gid;
1127 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128 supp_attrs))) {
1129
1130 /*
1131 * setattr
1132 *
1133 * We _know_ we're not messing with AT_SIZE or
1134 * AT_XTIME, so no need for stateid or flags.
1135 * Also we specify NULL rp since we're only
1136 * interested in setting owner_group attributes.
1137 */
1138 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139 supp_attrs, &e.error, 0);
1140 if (e.error)
1141 nfs4args_verify_free(&argop[8]);
1142 }
1143
1144 if (e.error) {
1145 /*
1146 * XXX - Revisit the last argument to nfs4_end_op()
1147 * once 5020486 is fixed.
1148 */
1149 nfs4_end_open_seqid_sync(oop);
1150 open_owner_rele(oop);
1151 nfs4args_copen_free(open_args);
1152 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153 if (ncr != NULL)
1154 crfree(ncr);
1155 kmem_free(argop, argoplist_size);
1156 return (e.error);
1157 }
1158 } else if (create_flag) {
1159 argop[1].argop = OP_SAVEFH;
1160
1161 argop[5].argop = OP_RESTOREFH;
1162
1163 argop[6].argop = OP_GETATTR;
1164 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166 }
1167
1168 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169 "nfs4open_otw: %s call, nm %s, rp %s",
1170 needrecov ? "recov" : "first", file_name,
1171 rnode4info(VTOR4(dvp))));
1172
1173 t = gethrtime();
1174
1175 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176
1177 if (!e.error && nfs4_need_to_bump_seqid(&res))
1178 nfs4_set_open_seqid(seqid, oop, args.ctag);
1179
1180 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181
1182 if (e.error || needrecov) {
1183 bool_t abort = FALSE;
1184
1185 if (needrecov) {
1186 nfs4_bseqid_entry_t *bsep = NULL;
1187
1188 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189 cred_otw, vpi, dvp, open_args);
1190
1191 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192 bsep = nfs4_create_bseqid_entry(oop, NULL,
1193 vpi, 0, args.ctag, open_args->seqid);
1194 num_bseqid_retry--;
1195 }
1196
1197 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198 NULL, lost_rqst.lr_op == OP_OPEN ?
1199 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200
1201 if (bsep)
1202 kmem_free(bsep, sizeof (*bsep));
1203 /* give up if we keep getting BAD_SEQID */
1204 if (num_bseqid_retry == 0)
1205 abort = TRUE;
1206 if (abort == TRUE && e.error == 0)
1207 e.error = geterrno4(res.status);
1208 }
1209 nfs4_end_open_seqid_sync(oop);
1210 open_owner_rele(oop);
1211 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212 nfs4args_copen_free(open_args);
1213 if (setgid_flag) {
1214 nfs4args_verify_free(&argop[8]);
1215 nfs4args_setattr_free(&argop[9]);
1216 }
1217 if (!e.error)
1218 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219 if (ncr != NULL) {
1220 crfree(ncr);
1221 ncr = NULL;
1222 }
1223 if (!needrecov || abort == TRUE || e.error == EINTR ||
1224 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225 kmem_free(argop, argoplist_size);
1226 return (e.error);
1227 }
1228 goto recov_retry;
1229 }
1230
1231 /*
1232 * Will check and update lease after checking the rflag for
1233 * OPEN_CONFIRM in the successful OPEN call.
1234 */
1235 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236
1237 /*
1238 * XXX what if we're crossing mount points from server1:/drp
1239 * to server2:/drp/rp.
1240 */
1241
1242 /* Signal our end of use of the open seqid */
1243 nfs4_end_open_seqid_sync(oop);
1244
1245 /*
1246 * This will destroy the open owner if it was just created,
1247 * and no one else has put a reference on it.
1248 */
1249 open_owner_rele(oop);
1250 if (create_flag && (createmode != EXCLUSIVE4) &&
1251 res.status == NFS4ERR_BADOWNER)
1252 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253
1254 e.error = geterrno4(res.status);
1255 nfs4args_copen_free(open_args);
1256 if (setgid_flag) {
1257 nfs4args_verify_free(&argop[8]);
1258 nfs4args_setattr_free(&argop[9]);
1259 }
1260 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262 /*
1263 * If the reply is NFS4ERR_ACCESS, it may be because
1264 * we are root (no root net access). If the real uid
1265 * is not root, then retry with the real uid instead.
1266 */
1267 if (ncr != NULL) {
1268 crfree(ncr);
1269 ncr = NULL;
1270 }
1271 if (res.status == NFS4ERR_ACCESS &&
1272 (ncr = crnetadjust(cred_otw)) != NULL) {
1273 cred_otw = ncr;
1274 goto recov_retry;
1275 }
1276 kmem_free(argop, argoplist_size);
1277 return (e.error);
1278 }
1279
1280 resop = &res.array[idx_open]; /* open res */
1281 op_res = &resop->nfs_resop4_u.opopen;
1282
1283 #ifdef DEBUG
1284 /*
1285 * verify attrset bitmap
1286 */
1287 if (create_flag &&
1288 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289 /* make sure attrset returned is what we asked for */
1290 /* XXX Ignore this 'error' for now */
1291 if (attr->attrmask != op_res->attrset)
1292 /* EMPTY */;
1293 }
1294 #endif
1295
1296 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297 mutex_enter(&VTOMI4(dvp)->mi_lock);
1298 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299 mutex_exit(&VTOMI4(dvp)->mi_lock);
1300 }
1301
1302 resop = &res.array[idx_open + 1]; /* getfh res */
1303 gf_res = &resop->nfs_resop4_u.opgetfh;
1304
1305 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306
1307 /*
1308 * The open stateid has been updated on the server but not
1309 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1310 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311 * WRITE call. That, however, will use the old stateid, so go ahead
1312 * and update the open stateid now, before any call to makenfs4node.
1313 */
1314 if (vpi) {
1315 nfs4_open_stream_t *tmp_osp;
1316 rnode4_t *tmp_rp = VTOR4(vpi);
1317
1318 tmp_osp = find_open_stream(oop, tmp_rp);
1319 if (tmp_osp) {
1320 tmp_osp->open_stateid = op_res->stateid;
1321 mutex_exit(&tmp_osp->os_sync_lock);
1322 open_stream_rele(tmp_osp, tmp_rp);
1323 }
1324
1325 /*
1326 * We must determine if the file handle given by the otw open
1327 * is the same as the file handle which was passed in with
1328 * *vpp. This case can be reached if the file we are trying
1329 * to open has been removed and another file has been created
1330 * having the same file name. The passed in vnode is released
1331 * later.
1332 */
1333 orig_sfh = VTOR4(vpi)->r_fh;
1334 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335 }
1336
1337 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338
1339 if (create_flag || fh_differs) {
1340 int rnode_err = 0;
1341
1342 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344
1345 if (e.error)
1346 PURGE_ATTRCACHE4(vp);
1347 /*
1348 * For the newly created vp case, make sure the rnode
1349 * isn't bad before using it.
1350 */
1351 mutex_enter(&(VTOR4(vp))->r_statelock);
1352 if (VTOR4(vp)->r_flags & R4RECOVERR)
1353 rnode_err = EIO;
1354 mutex_exit(&(VTOR4(vp))->r_statelock);
1355
1356 if (rnode_err) {
1357 nfs4_end_open_seqid_sync(oop);
1358 nfs4args_copen_free(open_args);
1359 if (setgid_flag) {
1360 nfs4args_verify_free(&argop[8]);
1361 nfs4args_setattr_free(&argop[9]);
1362 }
1363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365 needrecov);
1366 open_owner_rele(oop);
1367 VN_RELE(vp);
1368 if (ncr != NULL)
1369 crfree(ncr);
1370 sfh4_rele(&otw_sfh);
1371 kmem_free(argop, argoplist_size);
1372 return (EIO);
1373 }
1374 } else {
1375 vp = vpi;
1376 }
1377 sfh4_rele(&otw_sfh);
1378
1379 /*
1380 * It seems odd to get a full set of attrs and then not update
1381 * the object's attrcache in the non-create case. Create case uses
1382 * the attrs since makenfs4node checks to see if the attrs need to
1383 * be updated (and then updates them). The non-create case should
1384 * update attrs also.
1385 */
1386 if (! create_flag && ! fh_differs && !e.error) {
1387 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388 }
1389
1390 nfs4_error_zinit(&e);
1391 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392 /* This does not do recovery for vp explicitly. */
1393 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395
1396 if (e.error || e.stat) {
1397 nfs4_end_open_seqid_sync(oop);
1398 nfs4args_copen_free(open_args);
1399 if (setgid_flag) {
1400 nfs4args_verify_free(&argop[8]);
1401 nfs4args_setattr_free(&argop[9]);
1402 }
1403 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405 needrecov);
1406 open_owner_rele(oop);
1407 if (create_flag || fh_differs) {
1408 /* rele the makenfs4node */
1409 VN_RELE(vp);
1410 }
1411 if (ncr != NULL) {
1412 crfree(ncr);
1413 ncr = NULL;
1414 }
1415 if (retry_open == TRUE) {
1416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417 "nfs4open_otw: retry the open since OPEN "
1418 "CONFIRM failed with error %d stat %d",
1419 e.error, e.stat));
1420 if (create_flag && createmode == GUARDED4) {
1421 NFS4_DEBUG(nfs4_client_recov_debug,
1422 (CE_NOTE, "nfs4open_otw: switch "
1423 "createmode from GUARDED4 to "
1424 "UNCHECKED4"));
1425 createmode = UNCHECKED4;
1426 }
1427 goto recov_retry;
1428 }
1429 if (!e.error) {
1430 if (create_flag && (createmode != EXCLUSIVE4) &&
1431 e.stat == NFS4ERR_BADOWNER)
1432 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433
1434 e.error = geterrno4(e.stat);
1435 }
1436 kmem_free(argop, argoplist_size);
1437 return (e.error);
1438 }
1439 }
1440
1441 rp = VTOR4(vp);
1442
1443 mutex_enter(&rp->r_statev4_lock);
1444 if (create_flag)
1445 rp->created_v4 = 1;
1446 mutex_exit(&rp->r_statev4_lock);
1447
1448 mutex_enter(&oop->oo_lock);
1449 /* Doesn't matter if 'oo_just_created' already was set as this */
1450 oop->oo_just_created = NFS4_PERM_CREATED;
1451 if (oop->oo_cred_otw)
1452 crfree(oop->oo_cred_otw);
1453 oop->oo_cred_otw = cred_otw;
1454 crhold(oop->oo_cred_otw);
1455 mutex_exit(&oop->oo_lock);
1456
1457 /* returns with 'os_sync_lock' held */
1458 osp = find_or_create_open_stream(oop, rp, &created_osp);
1459 if (!osp) {
1460 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461 "nfs4open_otw: failed to create an open stream"));
1462 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463 "signal our end of use of the open seqid"));
1464
1465 nfs4_end_open_seqid_sync(oop);
1466 open_owner_rele(oop);
1467 nfs4args_copen_free(open_args);
1468 if (setgid_flag) {
1469 nfs4args_verify_free(&argop[8]);
1470 nfs4args_setattr_free(&argop[9]);
1471 }
1472 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474 if (create_flag || fh_differs)
1475 VN_RELE(vp);
1476 if (ncr != NULL)
1477 crfree(ncr);
1478
1479 kmem_free(argop, argoplist_size);
1480 return (EINVAL);
1481
1482 }
1483
1484 osp->open_stateid = op_res->stateid;
1485
1486 if (open_flag & FREAD)
1487 osp->os_share_acc_read++;
1488 if (open_flag & FWRITE)
1489 osp->os_share_acc_write++;
1490 osp->os_share_deny_none++;
1491
1492 /*
1493 * Need to reset this bitfield for the possible case where we were
1494 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495 * we could retry the CLOSE, OPENed the file again.
1496 */
1497 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498 osp->os_final_close = 0;
1499 osp->os_force_close = 0;
1500 #ifdef DEBUG
1501 if (osp->os_failed_reopen)
1502 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504 (void *)osp, (void *)cr, rnode4info(rp)));
1505 #endif
1506 osp->os_failed_reopen = 0;
1507
1508 mutex_exit(&osp->os_sync_lock);
1509
1510 nfs4_end_open_seqid_sync(oop);
1511
1512 if (created_osp && recov_state.rs_sp != NULL) {
1513 mutex_enter(&recov_state.rs_sp->s_lock);
1514 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515 mutex_exit(&recov_state.rs_sp->s_lock);
1516 }
1517
1518 /* get rid of our reference to find oop */
1519 open_owner_rele(oop);
1520
1521 open_stream_rele(osp, rp);
1522
1523 /* accept delegation, if any */
1524 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525
1526 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527
1528 if (createmode == EXCLUSIVE4 &&
1529 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531 " EXCLUSIVE4: sending a SETATTR"));
1532 /*
1533 * If doing an exclusive create, then generate
1534 * a SETATTR to set the initial attributes.
1535 * Try to set the mtime and the atime to the
1536 * server's current time. It is somewhat
1537 * expected that these fields will be used to
1538 * store the exclusive create cookie. If not,
1539 * server implementors will need to know that
1540 * a SETATTR will follow an exclusive create
1541 * and the cookie should be destroyed if
1542 * appropriate.
1543 *
1544 * The AT_GID and AT_SIZE bits are turned off
1545 * so that the SETATTR request will not attempt
1546 * to process these. The gid will be set
1547 * separately if appropriate. The size is turned
1548 * off because it is assumed that a new file will
1549 * be created empty and if the file wasn't empty,
1550 * then the exclusive create will have failed
1551 * because the file must have existed already.
1552 * Therefore, no truncate operation is needed.
1553 */
1554 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556
1557 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558 if (e.error) {
1559 nfs4_error_t err;
1560
1561 /*
1562 * Couldn't correct the attributes of
1563 * the newly created file and the
1564 * attributes are wrong. Remove the
1565 * file and return an error to the
1566 * application.
1567 */
1568 /* XXX will this take care of client state ? */
1569 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571 " remove file", e.error));
1572
1573 /*
1574 * The file is currently open so try to close it first.
1575 *
1576 * If we do not close the file explicitly here then the
1577 * VN_RELE() would do an (implicit and asynchronous)
1578 * close for us. But such async close could race with
1579 * the nfs4_remove() below. If the async close is
1580 * slower than nfs4_remove() then nfs4_remove()
1581 * wouldn't remove the file but rename it to .nfsXXXX
1582 * instead.
1583 */
1584 nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585 CLOSE_NORM, 0, 0, 0);
1586 VN_RELE(vp);
1587 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588
1589 /*
1590 * Since we've reled the vnode and removed
1591 * the file we now need to return the error.
1592 * At this point we don't want to update the
1593 * dircaches, call nfs4_waitfor_purge_complete
1594 * or set vpp to vp so we need to skip these
1595 * as well.
1596 */
1597 goto skip_update_dircaches;
1598 }
1599 }
1600
1601 /*
1602 * If we created or found the correct vnode, due to create_flag or
1603 * fh_differs being set, then update directory cache attribute, readdir
1604 * and dnlc caches.
1605 */
1606 if (create_flag || fh_differs) {
1607 dirattr_info_t dinfo, *dinfop;
1608
1609 /*
1610 * Make sure getattr succeeded before using results.
1611 * note: op 7 is getattr(dir) for both flavors of
1612 * open(create).
1613 */
1614 if (create_flag && res.status == NFS4_OK) {
1615 dinfo.di_time_call = t;
1616 dinfo.di_cred = cr;
1617 dinfo.di_garp =
1618 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619 dinfop = &dinfo;
1620 } else {
1621 dinfop = NULL;
1622 }
1623
1624 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625 dinfop);
1626 }
1627
1628 /*
1629 * If the page cache for this file was flushed from actions
1630 * above, it was done asynchronously and if that is true,
1631 * there is a need to wait here for it to complete. This must
1632 * be done outside of start_fop/end_fop.
1633 */
1634 (void) nfs4_waitfor_purge_complete(vp);
1635
1636 /*
1637 * It is implicit that we are in the open case (create_flag == 0) since
1638 * fh_differs can only be set to a non-zero value in the open case.
1639 */
1640 if (fh_differs != 0 && vpi != NULL)
1641 VN_RELE(vpi);
1642
1643 /*
1644 * Be sure to set *vpp to the correct value before returning.
1645 */
1646 *vpp = vp;
1647
1648 skip_update_dircaches:
1649
1650 nfs4args_copen_free(open_args);
1651 if (setgid_flag) {
1652 nfs4args_verify_free(&argop[8]);
1653 nfs4args_setattr_free(&argop[9]);
1654 }
1655 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656
1657 if (ncr)
1658 crfree(ncr);
1659 kmem_free(argop, argoplist_size);
1660 return (e.error);
1661 }
1662
1663 /*
1664 * Reopen an open instance. cf. nfs4open_otw().
1665 *
1666 * Errors are returned by the nfs4_error_t parameter.
1667 * - ep->error contains an errno value or zero.
1668 * - if it is zero, ep->stat is set to an NFS status code, if any.
1669 * If the file could not be reopened, but the caller should continue, the
1670 * file is marked dead and no error values are returned. If the caller
1671 * should stop recovering open files and start over, either the ep->error
1672 * value or ep->stat will indicate an error (either something that requires
1673 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1674 * filehandles) may be handled silently by this routine.
1675 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676 * will be started, so the caller should not do it.
1677 *
1678 * Gotos:
1679 * - kill_file : reopen failed in such a fashion as to constitute marking the
1680 * file dead and setting the open stream's 'os_failed_reopen' as 1. This
1681 * is for cases where recovery is not possible.
1682 * - failed_reopen : same as above, except that the file has already been
1683 * marked dead, so no need to do it again.
1684 * - bailout : reopen failed but we are able to recover and retry the reopen -
1685 * either within this function immediately or via the calling function.
1686 */
1687
1688 void
1689 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690 open_claim_type4 claim, bool_t frc_use_claim_previous,
1691 bool_t is_recov)
1692 {
1693 COMPOUND4args_clnt args;
1694 COMPOUND4res_clnt res;
1695 nfs_argop4 argop[4];
1696 nfs_resop4 *resop;
1697 OPEN4res *op_res = NULL;
1698 OPEN4cargs *open_args;
1699 GETFH4res *gf_res;
1700 rnode4_t *rp = VTOR4(vp);
1701 int doqueue = 1;
1702 cred_t *cr = NULL, *cred_otw = NULL;
1703 nfs4_open_owner_t *oop = NULL;
1704 seqid4 seqid;
1705 nfs4_ga_res_t *garp;
1706 char fn[MAXNAMELEN];
1707 nfs4_recov_state_t recov = {NULL, 0};
1708 nfs4_lost_rqst_t lost_rqst;
1709 mntinfo4_t *mi = VTOMI4(vp);
1710 bool_t abort;
1711 char *failed_msg = "";
1712 int fh_different;
1713 hrtime_t t;
1714 nfs4_bseqid_entry_t *bsep = NULL;
1715
1716 ASSERT(nfs4_consistent_type(vp));
1717 ASSERT(nfs_zone() == mi->mi_zone);
1718
1719 nfs4_error_zinit(ep);
1720
1721 /* this is the cred used to find the open owner */
1722 cr = state_to_cred(osp);
1723 if (cr == NULL) {
1724 failed_msg = "Couldn't reopen: no cred";
1725 goto kill_file;
1726 }
1727 /* use this cred for OTW operations */
1728 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729
1730 top:
1731 nfs4_error_zinit(ep);
1732
1733 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734 /* File system has been unmounted, quit */
1735 ep->error = EIO;
1736 failed_msg = "Couldn't reopen: file system has been unmounted";
1737 goto kill_file;
1738 }
1739
1740 oop = osp->os_open_owner;
1741
1742 ASSERT(oop != NULL);
1743 if (oop == NULL) { /* be defensive in non-DEBUG */
1744 failed_msg = "can't reopen: no open owner";
1745 goto kill_file;
1746 }
1747 open_owner_hold(oop);
1748
1749 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750 if (ep->error) {
1751 open_owner_rele(oop);
1752 oop = NULL;
1753 goto bailout;
1754 }
1755
1756 /*
1757 * If the rnode has a delegation and the delegation has been
1758 * recovered and the server didn't request a recall and the caller
1759 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760 * recovery) and the rnode hasn't been marked dead, then install
1761 * the delegation stateid in the open stream. Otherwise, proceed
1762 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763 */
1764 mutex_enter(&rp->r_statev4_lock);
1765 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766 !rp->r_deleg_return_pending &&
1767 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768 !rp->r_deleg_needs_recall &&
1769 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770 !(rp->r_flags & R4RECOVERR)) {
1771 mutex_enter(&osp->os_sync_lock);
1772 osp->os_delegation = 1;
1773 osp->open_stateid = rp->r_deleg_stateid;
1774 mutex_exit(&osp->os_sync_lock);
1775 mutex_exit(&rp->r_statev4_lock);
1776 goto bailout;
1777 }
1778 mutex_exit(&rp->r_statev4_lock);
1779
1780 /*
1781 * If the file failed recovery, just quit. This failure need not
1782 * affect other reopens, so don't return an error.
1783 */
1784 mutex_enter(&rp->r_statelock);
1785 if (rp->r_flags & R4RECOVERR) {
1786 mutex_exit(&rp->r_statelock);
1787 ep->error = 0;
1788 goto failed_reopen;
1789 }
1790 mutex_exit(&rp->r_statelock);
1791
1792 /*
1793 * argop is empty here
1794 *
1795 * PUTFH, OPEN, GETATTR
1796 */
1797 args.ctag = TAG_REOPEN;
1798 args.array_len = 4;
1799 args.array = argop;
1800
1801 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802 "nfs4_reopen: file is type %d, id %s",
1803 vp->v_type, rnode4info(VTOR4(vp))));
1804
1805 argop[0].argop = OP_CPUTFH;
1806
1807 if (claim != CLAIM_PREVIOUS) {
1808 /*
1809 * if this is a file mount then
1810 * use the mntinfo parentfh
1811 */
1812 argop[0].nfs_argop4_u.opcputfh.sfh =
1813 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814 VTOSV(vp)->sv_dfh;
1815 } else {
1816 /* putfh fh to reopen */
1817 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818 }
1819
1820 argop[1].argop = OP_COPEN;
1821 open_args = &argop[1].nfs_argop4_u.opcopen;
1822 open_args->claim = claim;
1823
1824 if (claim == CLAIM_NULL) {
1825
1826 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828 "failed for vp 0x%p for CLAIM_NULL with %m",
1829 (void *)vp);
1830 failed_msg = "Couldn't reopen: vtoname failed for "
1831 "CLAIM_NULL";
1832 /* nothing allocated yet */
1833 goto kill_file;
1834 }
1835
1836 open_args->open_claim4_u.cfile = fn;
1837 } else if (claim == CLAIM_PREVIOUS) {
1838
1839 /*
1840 * We have two cases to deal with here:
1841 * 1) We're being called to reopen files in order to satisfy
1842 * a lock operation request which requires us to explicitly
1843 * reopen files which were opened under a delegation. If
1844 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1845 * that case, frc_use_claim_previous is TRUE and we must
1846 * use the rnode's current delegation type (r_deleg_type).
1847 * 2) We're reopening files during some form of recovery.
1848 * In this case, frc_use_claim_previous is FALSE and we
1849 * use the delegation type appropriate for recovery
1850 * (r_deleg_needs_recovery).
1851 */
1852 mutex_enter(&rp->r_statev4_lock);
1853 open_args->open_claim4_u.delegate_type =
1854 frc_use_claim_previous ?
1855 rp->r_deleg_type :
1856 rp->r_deleg_needs_recovery;
1857 mutex_exit(&rp->r_statev4_lock);
1858
1859 } else if (claim == CLAIM_DELEGATE_CUR) {
1860
1861 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864 "with %m", (void *)vp);
1865 failed_msg = "Couldn't reopen: vtoname failed for "
1866 "CLAIM_DELEGATE_CUR";
1867 /* nothing allocated yet */
1868 goto kill_file;
1869 }
1870
1871 mutex_enter(&rp->r_statev4_lock);
1872 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873 rp->r_deleg_stateid;
1874 mutex_exit(&rp->r_statev4_lock);
1875
1876 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877 }
1878 open_args->opentype = OPEN4_NOCREATE;
1879 open_args->owner.clientid = mi2clientid(mi);
1880 open_args->owner.owner_len = sizeof (oop->oo_name);
1881 open_args->owner.owner_val =
1882 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883 bcopy(&oop->oo_name, open_args->owner.owner_val,
1884 open_args->owner.owner_len);
1885 open_args->share_access = 0;
1886 open_args->share_deny = 0;
1887
1888 mutex_enter(&osp->os_sync_lock);
1889 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892 (void *)osp, (void *)rp, osp->os_share_acc_read,
1893 osp->os_share_acc_write, osp->os_open_ref_count,
1894 osp->os_mmap_read, osp->os_mmap_write, claim));
1895
1896 if (osp->os_share_acc_read || osp->os_mmap_read)
1897 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898 if (osp->os_share_acc_write || osp->os_mmap_write)
1899 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900 if (osp->os_share_deny_read)
1901 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902 if (osp->os_share_deny_write)
1903 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904 mutex_exit(&osp->os_sync_lock);
1905
1906 seqid = nfs4_get_open_seqid(oop) + 1;
1907 open_args->seqid = seqid;
1908
1909 /* Construct the getfh part of the compound */
1910 argop[2].argop = OP_GETFH;
1911
1912 /* Construct the getattr part of the compound */
1913 argop[3].argop = OP_GETATTR;
1914 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916
1917 t = gethrtime();
1918
1919 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920
1921 if (ep->error) {
1922 if (!is_recov && !frc_use_claim_previous &&
1923 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926 cred_otw, vp, NULL, open_args);
1927 abort = nfs4_start_recovery(ep,
1928 VTOMI4(vp), vp, NULL, NULL,
1929 lost_rqst.lr_op == OP_OPEN ?
1930 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931 nfs4args_copen_free(open_args);
1932 goto bailout;
1933 }
1934
1935 nfs4args_copen_free(open_args);
1936
1937 if (ep->error == EACCES && cred_otw != cr) {
1938 crfree(cred_otw);
1939 cred_otw = cr;
1940 crhold(cred_otw);
1941 nfs4_end_open_seqid_sync(oop);
1942 open_owner_rele(oop);
1943 oop = NULL;
1944 goto top;
1945 }
1946 if (ep->error == ETIMEDOUT)
1947 goto bailout;
1948 failed_msg = "Couldn't reopen: rpc error";
1949 goto kill_file;
1950 }
1951
1952 if (nfs4_need_to_bump_seqid(&res))
1953 nfs4_set_open_seqid(seqid, oop, args.ctag);
1954
1955 switch (res.status) {
1956 case NFS4_OK:
1957 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958 mutex_enter(&rp->r_statelock);
1959 rp->r_delay_interval = 0;
1960 mutex_exit(&rp->r_statelock);
1961 }
1962 break;
1963 case NFS4ERR_BAD_SEQID:
1964 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965 args.ctag, open_args->seqid);
1966
1967 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969 NULL, OP_OPEN, bsep, NULL, NULL);
1970
1971 nfs4args_copen_free(open_args);
1972 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973 nfs4_end_open_seqid_sync(oop);
1974 open_owner_rele(oop);
1975 oop = NULL;
1976 kmem_free(bsep, sizeof (*bsep));
1977
1978 goto kill_file;
1979 case NFS4ERR_NO_GRACE:
1980 nfs4args_copen_free(open_args);
1981 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982 nfs4_end_open_seqid_sync(oop);
1983 open_owner_rele(oop);
1984 oop = NULL;
1985 if (claim == CLAIM_PREVIOUS) {
1986 /*
1987 * Retry as a plain open. We don't need to worry about
1988 * checking the changeinfo: it is acceptable for a
1989 * client to re-open a file and continue processing
1990 * (in the absence of locks).
1991 */
1992 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994 "will retry as CLAIM_NULL"));
1995 claim = CLAIM_NULL;
1996 nfs4_mi_kstat_inc_no_grace(mi);
1997 goto top;
1998 }
1999 failed_msg =
2000 "Couldn't reopen: tried reclaim outside grace period. ";
2001 goto kill_file;
2002 case NFS4ERR_GRACE:
2003 nfs4_set_grace_wait(mi);
2004 nfs4args_copen_free(open_args);
2005 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 nfs4_end_open_seqid_sync(oop);
2007 open_owner_rele(oop);
2008 oop = NULL;
2009 ep->error = nfs4_wait_for_grace(mi, &recov);
2010 if (ep->error != 0)
2011 goto bailout;
2012 goto top;
2013 case NFS4ERR_DELAY:
2014 nfs4_set_delay_wait(vp);
2015 nfs4args_copen_free(open_args);
2016 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 nfs4_end_open_seqid_sync(oop);
2018 open_owner_rele(oop);
2019 oop = NULL;
2020 ep->error = nfs4_wait_for_delay(vp, &recov);
2021 nfs4_mi_kstat_inc_delay(mi);
2022 if (ep->error != 0)
2023 goto bailout;
2024 goto top;
2025 case NFS4ERR_FHEXPIRED:
2026 /* recover filehandle and retry */
2027 abort = nfs4_start_recovery(ep,
2028 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029 nfs4args_copen_free(open_args);
2030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031 nfs4_end_open_seqid_sync(oop);
2032 open_owner_rele(oop);
2033 oop = NULL;
2034 if (abort == FALSE)
2035 goto top;
2036 failed_msg = "Couldn't reopen: recovery aborted";
2037 goto kill_file;
2038 case NFS4ERR_RESOURCE:
2039 case NFS4ERR_STALE_CLIENTID:
2040 case NFS4ERR_WRONGSEC:
2041 case NFS4ERR_EXPIRED:
2042 /*
2043 * Do not mark the file dead and let the calling
2044 * function initiate recovery.
2045 */
2046 nfs4args_copen_free(open_args);
2047 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048 nfs4_end_open_seqid_sync(oop);
2049 open_owner_rele(oop);
2050 oop = NULL;
2051 goto bailout;
2052 case NFS4ERR_ACCESS:
2053 if (cred_otw != cr) {
2054 crfree(cred_otw);
2055 cred_otw = cr;
2056 crhold(cred_otw);
2057 nfs4args_copen_free(open_args);
2058 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059 nfs4_end_open_seqid_sync(oop);
2060 open_owner_rele(oop);
2061 oop = NULL;
2062 goto top;
2063 }
2064 /* fall through */
2065 default:
2066 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069 rnode4info(VTOR4(vp))));
2070 failed_msg = "Couldn't reopen: NFSv4 error";
2071 nfs4args_copen_free(open_args);
2072 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073 goto kill_file;
2074 }
2075
2076 resop = &res.array[1]; /* open res */
2077 op_res = &resop->nfs_resop4_u.opopen;
2078
2079 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080
2081 /*
2082 * Check if the path we reopened really is the same
2083 * file. We could end up in a situation where the file
2084 * was removed and a new file created with the same name.
2085 */
2086 resop = &res.array[2];
2087 gf_res = &resop->nfs_resop4_u.opgetfh;
2088 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090 if (fh_different) {
2091 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093 /* Oops, we don't have the same file */
2094 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095 failed_msg = "Couldn't reopen: Persistent "
2096 "file handle changed";
2097 else
2098 failed_msg = "Couldn't reopen: Volatile "
2099 "(no expire on open) file handle changed";
2100
2101 nfs4args_copen_free(open_args);
2102 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103 nfs_rw_exit(&mi->mi_fh_lock);
2104 goto kill_file;
2105
2106 } else {
2107 /*
2108 * We have volatile file handles that don't compare.
2109 * If the fids are the same then we assume that the
2110 * file handle expired but the rnode still refers to
2111 * the same file object.
2112 *
2113 * First check that we have fids or not.
2114 * If we don't we have a dumb server so we will
2115 * just assume every thing is ok for now.
2116 */
2117 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118 rp->r_attr.va_mask & AT_NODEID &&
2119 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120 /*
2121 * We have fids, but they don't
2122 * compare. So kill the file.
2123 */
2124 failed_msg =
2125 "Couldn't reopen: file handle changed"
2126 " due to mismatched fids";
2127 nfs4args_copen_free(open_args);
2128 xdr_free(xdr_COMPOUND4res_clnt,
2129 (caddr_t)&res);
2130 nfs_rw_exit(&mi->mi_fh_lock);
2131 goto kill_file;
2132 } else {
2133 /*
2134 * We have volatile file handles that refers
2135 * to the same file (at least they have the
2136 * same fid) or we don't have fids so we
2137 * can't tell. :(. We'll be a kind and accepting
2138 * client so we'll update the rnode's file
2139 * handle with the otw handle.
2140 *
2141 * We need to drop mi->mi_fh_lock since
2142 * sh4_update acquires it. Since there is
2143 * only one recovery thread there is no
2144 * race.
2145 */
2146 nfs_rw_exit(&mi->mi_fh_lock);
2147 sfh4_update(rp->r_fh, &gf_res->object);
2148 }
2149 }
2150 } else {
2151 nfs_rw_exit(&mi->mi_fh_lock);
2152 }
2153
2154 ASSERT(nfs4_consistent_type(vp));
2155
2156 /*
2157 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158 * over. Presumably if there is a persistent error it will show up
2159 * when we resend the OPEN.
2160 */
2161 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162 bool_t retry_open = FALSE;
2163
2164 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165 cred_otw, is_recov, &retry_open,
2166 oop, FALSE, ep, NULL);
2167 if (ep->error || ep->stat) {
2168 nfs4args_copen_free(open_args);
2169 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170 nfs4_end_open_seqid_sync(oop);
2171 open_owner_rele(oop);
2172 oop = NULL;
2173 goto top;
2174 }
2175 }
2176
2177 mutex_enter(&osp->os_sync_lock);
2178 osp->open_stateid = op_res->stateid;
2179 osp->os_delegation = 0;
2180 /*
2181 * Need to reset this bitfield for the possible case where we were
2182 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183 * we could retry the CLOSE, OPENed the file again.
2184 */
2185 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186 osp->os_final_close = 0;
2187 osp->os_force_close = 0;
2188 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189 osp->os_dc_openacc = open_args->share_access;
2190 mutex_exit(&osp->os_sync_lock);
2191
2192 nfs4_end_open_seqid_sync(oop);
2193
2194 /* accept delegation, if any */
2195 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196
2197 nfs4args_copen_free(open_args);
2198
2199 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200
2201 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202
2203 ASSERT(nfs4_consistent_type(vp));
2204
2205 open_owner_rele(oop);
2206 crfree(cr);
2207 crfree(cred_otw);
2208 return;
2209
2210 kill_file:
2211 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212 failed_reopen:
2213 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215 (void *)osp, (void *)cr, rnode4info(rp)));
2216 mutex_enter(&osp->os_sync_lock);
2217 osp->os_failed_reopen = 1;
2218 mutex_exit(&osp->os_sync_lock);
2219 bailout:
2220 if (oop != NULL) {
2221 nfs4_end_open_seqid_sync(oop);
2222 open_owner_rele(oop);
2223 }
2224 if (cr != NULL)
2225 crfree(cr);
2226 if (cred_otw != NULL)
2227 crfree(cred_otw);
2228 }
2229
2230 /* for . and .. OPENs */
2231 /* ARGSUSED */
2232 static int
2233 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234 {
2235 rnode4_t *rp;
2236 nfs4_ga_res_t gar;
2237
2238 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239
2240 /*
2241 * If close-to-open consistency checking is turned off or
2242 * if there is no cached data, we can avoid
2243 * the over the wire getattr. Otherwise, force a
2244 * call to the server to get fresh attributes and to
2245 * check caches. This is required for close-to-open
2246 * consistency.
2247 */
2248 rp = VTOR4(*vpp);
2249 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251 return (0);
2252
2253 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254 }
2255
2256 /*
2257 * CLOSE a file
2258 */
2259 /* ARGSUSED */
2260 static int
2261 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262 caller_context_t *ct)
2263 {
2264 rnode4_t *rp;
2265 int error = 0;
2266 int r_error = 0;
2267 int n4error = 0;
2268 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2269
2270 /*
2271 * Remove client state for this (lockowner, file) pair.
2272 * Issue otw v4 call to have the server do the same.
2273 */
2274
2275 rp = VTOR4(vp);
2276
2277 /*
2278 * zone_enter(2) prevents processes from changing zones with NFS files
2279 * open; if we happen to get here from the wrong zone we can't do
2280 * anything over the wire.
2281 */
2282 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283 /*
2284 * We could attempt to clean up locks, except we're sure
2285 * that the current process didn't acquire any locks on
2286 * the file: any attempt to lock a file belong to another zone
2287 * will fail, and one can't lock an NFS file and then change
2288 * zones, as that fails too.
2289 *
2290 * Returning an error here is the sane thing to do. A
2291 * subsequent call to VN_RELE() which translates to a
2292 * nfs4_inactive() will clean up state: if the zone of the
2293 * vnode's origin is still alive and kicking, the inactive
2294 * thread will handle the request (from the correct zone), and
2295 * everything (minus the OTW close call) should be OK. If the
2296 * zone is going away nfs4_async_inactive() will throw away
2297 * delegations, open streams and cached pages inline.
2298 */
2299 return (EIO);
2300 }
2301
2302 /*
2303 * If we are using local locking for this filesystem, then
2304 * release all of the SYSV style record locks. Otherwise,
2305 * we are doing network locking and we need to release all
2306 * of the network locks. All of the locks held by this
2307 * process on this file are released no matter what the
2308 * incoming reference count is.
2309 */
2310 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312 cleanshares(vp, ttoproc(curthread)->p_pid);
2313 } else
2314 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315
2316 if (e.error) {
2317 struct lm_sysid *lmsid;
2318 lmsid = nfs4_find_sysid(VTOMI4(vp));
2319 if (lmsid == NULL) {
2320 DTRACE_PROBE2(unknown__sysid, int, e.error,
2321 vnode_t *, vp);
2322 } else {
2323 cleanlocks(vp, ttoproc(curthread)->p_pid,
2324 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325
2326 lm_rel_sysid(lmsid);
2327 }
2328 return (e.error);
2329 }
2330
2331 if (count > 1)
2332 return (0);
2333
2334 /*
2335 * If the file has been `unlinked', then purge the
2336 * DNLC so that this vnode will get reycled quicker
2337 * and the .nfs* file on the server will get removed.
2338 */
2339 if (rp->r_unldvp != NULL)
2340 dnlc_purge_vp(vp);
2341
2342 /*
2343 * If the file was open for write and there are pages,
2344 * do a synchronous flush and commit of all of the
2345 * dirty and uncommitted pages.
2346 */
2347 ASSERT(!e.error);
2348 if ((flag & FWRITE) && nfs4_has_pages(vp))
2349 error = nfs4_putpage_commit(vp, 0, 0, cr);
2350
2351 mutex_enter(&rp->r_statelock);
2352 r_error = rp->r_error;
2353 rp->r_error = 0;
2354 mutex_exit(&rp->r_statelock);
2355
2356 /*
2357 * If this file type is one for which no explicit 'open' was
2358 * done, then bail now (ie. no need for protocol 'close'). If
2359 * there was an error w/the vm subsystem, return _that_ error,
2360 * otherwise, return any errors that may've been reported via
2361 * the rnode.
2362 */
2363 if (vp->v_type != VREG)
2364 return (error ? error : r_error);
2365
2366 /*
2367 * The sync putpage commit may have failed above, but since
2368 * we're working w/a regular file, we need to do the protocol
2369 * 'close' (nfs4close_one will figure out if an otw close is
2370 * needed or not). Report any errors _after_ doing the protocol
2371 * 'close'.
2372 */
2373 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374 n4error = e.error ? e.error : geterrno4(e.stat);
2375
2376 /*
2377 * Error reporting prio (Hi -> Lo)
2378 *
2379 * i) nfs4_putpage_commit (error)
2380 * ii) rnode's (r_error)
2381 * iii) nfs4close_one (n4error)
2382 */
2383 return (error ? error : (r_error ? r_error : n4error));
2384 }
2385
2386 /*
2387 * Initialize *lost_rqstp.
2388 */
2389
2390 static void
2391 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393 vnode_t *vp)
2394 {
2395 if (error != ETIMEDOUT && error != EINTR &&
2396 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397 lost_rqstp->lr_op = 0;
2398 return;
2399 }
2400
2401 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402 "nfs4close_save_lost_rqst: error %d", error));
2403
2404 lost_rqstp->lr_op = OP_CLOSE;
2405 /*
2406 * The vp is held and rele'd via the recovery code.
2407 * See nfs4_save_lost_rqst.
2408 */
2409 lost_rqstp->lr_vp = vp;
2410 lost_rqstp->lr_dvp = NULL;
2411 lost_rqstp->lr_oop = oop;
2412 lost_rqstp->lr_osp = osp;
2413 ASSERT(osp != NULL);
2414 ASSERT(mutex_owned(&osp->os_sync_lock));
2415 osp->os_pending_close = 1;
2416 lost_rqstp->lr_lop = NULL;
2417 lost_rqstp->lr_cr = cr;
2418 lost_rqstp->lr_flk = NULL;
2419 lost_rqstp->lr_putfirst = FALSE;
2420 }
2421
2422 /*
2423 * Assumes you already have the open seqid sync grabbed as well as the
2424 * 'os_sync_lock'. Note: this will release the open seqid sync and
2425 * 'os_sync_lock' if client recovery starts. Calling functions have to
2426 * be prepared to handle this.
2427 *
2428 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2429 * was needed and was started, and that the calling function should retry
2430 * this function; otherwise it is returned as 0.
2431 *
2432 * Errors are returned via the nfs4_error_t parameter.
2433 */
2434 static void
2435 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2436 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2437 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2438 {
2439 COMPOUND4args_clnt args;
2440 COMPOUND4res_clnt res;
2441 CLOSE4args *close_args;
2442 nfs_resop4 *resop;
2443 nfs_argop4 argop[3];
2444 int doqueue = 1;
2445 mntinfo4_t *mi;
2446 seqid4 seqid;
2447 vnode_t *vp;
2448 bool_t needrecov = FALSE;
2449 nfs4_lost_rqst_t lost_rqst;
2450 hrtime_t t;
2451
2452 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2453
2454 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2455
2456 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2457
2458 /* Only set this to 1 if recovery is started */
2459 *recov = 0;
2460
2461 /* do the OTW call to close the file */
2462
2463 if (close_type == CLOSE_RESEND)
2464 args.ctag = TAG_CLOSE_LOST;
2465 else if (close_type == CLOSE_AFTER_RESEND)
2466 args.ctag = TAG_CLOSE_UNDO;
2467 else
2468 args.ctag = TAG_CLOSE;
2469
2470 args.array_len = 3;
2471 args.array = argop;
2472
2473 vp = RTOV4(rp);
2474
2475 mi = VTOMI4(vp);
2476
2477 /* putfh target fh */
2478 argop[0].argop = OP_CPUTFH;
2479 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2480
2481 argop[1].argop = OP_GETATTR;
2482 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2483 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2484
2485 argop[2].argop = OP_CLOSE;
2486 close_args = &argop[2].nfs_argop4_u.opclose;
2487
2488 seqid = nfs4_get_open_seqid(oop) + 1;
2489
2490 close_args->seqid = seqid;
2491 close_args->open_stateid = osp->open_stateid;
2492
2493 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2494 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2495 rnode4info(rp)));
2496
2497 t = gethrtime();
2498
2499 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2500
2501 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2502 nfs4_set_open_seqid(seqid, oop, args.ctag);
2503 }
2504
2505 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2506 if (ep->error && !needrecov) {
2507 /*
2508 * if there was an error and no recovery is to be done
2509 * then then set up the file to flush its cache if
2510 * needed for the next caller.
2511 */
2512 mutex_enter(&rp->r_statelock);
2513 PURGE_ATTRCACHE4_LOCKED(rp);
2514 rp->r_flags &= ~R4WRITEMODIFIED;
2515 mutex_exit(&rp->r_statelock);
2516 return;
2517 }
2518
2519 if (needrecov) {
2520 bool_t abort;
2521 nfs4_bseqid_entry_t *bsep = NULL;
2522
2523 if (close_type != CLOSE_RESEND)
2524 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2525 osp, cred_otw, vp);
2526
2527 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2528 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2529 0, args.ctag, close_args->seqid);
2530
2531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2532 "nfs4close_otw: initiating recovery. error %d "
2533 "res.status %d", ep->error, res.status));
2534
2535 /*
2536 * Drop the 'os_sync_lock' here so we don't hit
2537 * a potential recursive mutex_enter via an
2538 * 'open_stream_hold()'.
2539 */
2540 mutex_exit(&osp->os_sync_lock);
2541 *have_sync_lockp = 0;
2542 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2543 (close_type != CLOSE_RESEND &&
2544 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2545 OP_CLOSE, bsep, NULL, NULL);
2546
2547 /* drop open seq sync, and let the calling function regrab it */
2548 nfs4_end_open_seqid_sync(oop);
2549 *did_start_seqid_syncp = 0;
2550
2551 if (bsep)
2552 kmem_free(bsep, sizeof (*bsep));
2553 /*
2554 * For signals, the caller wants to quit, so don't say to
2555 * retry. For forced unmount, if it's a user thread, it
2556 * wants to quit. If it's a recovery thread, the retry
2557 * will happen higher-up on the call stack. Either way,
2558 * don't say to retry.
2559 */
2560 if (abort == FALSE && ep->error != EINTR &&
2561 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2562 close_type != CLOSE_RESEND &&
2563 close_type != CLOSE_AFTER_RESEND)
2564 *recov = 1;
2565 else
2566 *recov = 0;
2567
2568 if (!ep->error)
2569 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2570 return;
2571 }
2572
2573 if (res.status) {
2574 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2575 return;
2576 }
2577
2578 mutex_enter(&rp->r_statev4_lock);
2579 rp->created_v4 = 0;
2580 mutex_exit(&rp->r_statev4_lock);
2581
2582 resop = &res.array[2];
2583 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2584 osp->os_valid = 0;
2585
2586 /*
2587 * This removes the reference obtained at OPEN; ie, when the
2588 * open stream structure was created.
2589 *
2590 * We don't have to worry about calling 'open_stream_rele'
2591 * since we our currently holding a reference to the open
2592 * stream which means the count cannot go to 0 with this
2593 * decrement.
2594 */
2595 ASSERT(osp->os_ref_count >= 2);
2596 osp->os_ref_count--;
2597
2598 if (ep->error == 0) {
2599 /*
2600 * Avoid a deadlock with the r_serial thread waiting for
2601 * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
2602 * held by us. We will wait in nfs4_attr_cache() for the
2603 * completion of the r_serial thread.
2604 */
2605 mutex_exit(&osp->os_sync_lock);
2606 *have_sync_lockp = 0;
2607
2608 nfs4_attr_cache(vp,
2609 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2610 t, cred_otw, TRUE, NULL);
2611 }
2612
2613 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2614 " returning %d", ep->error));
2615
2616 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2617 }
2618
/*
 * Read from a file.  Reads go through the VM page cache (segmap or vpm)
 * unless caching is disabled or client-side direct I/O applies, in which
 * case the data is read over the wire directly into the uio.
 */
/* ARGSUSED */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/* operate on the real vnode when handed a shadow vnode */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	/* no over-the-wire activity from the wrong zone */
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	/* reject negative offsets and ranges that wrap past the maximum */
	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/* if recovery recorded a fatal error on this rnode, fail now */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	/* cached path: copy out up to one MAXBSIZE window per iteration */
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		if (error = nfs4_validate_caches(vp, cr))
			break;

		/* wait for any in-progress cache purge to finish */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & R4INCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		/* clip the transfer at the cached end-of-file */
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (uint_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/* copy failed; release the mapping without flags */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2743
2744 /* ARGSUSED */
2745 static int
2746 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2747 caller_context_t *ct)
2748 {
2749 rlim64_t limit = uiop->uio_llimit;
2750 rnode4_t *rp;
2751 u_offset_t off;
2752 caddr_t base;
2753 uint_t flags;
2754 int remainder;
2755 size_t n;
2756 int on;
2757 int error;
2758 int resid;
2759 u_offset_t offset;
2760 mntinfo4_t *mi;
2761 uint_t bsize;
2762
2763 rp = VTOR4(vp);
2764
2765 if (IS_SHADOW(vp, rp))
2766 vp = RTOV4(rp);
2767
2768 if (vp->v_type != VREG)
2769 return (EISDIR);
2770
2771 mi = VTOMI4(vp);
2772
2773 if (nfs_zone() != mi->mi_zone)
2774 return (EIO);
2775
2776 if (uiop->uio_resid == 0)
2777 return (0);
2778
2779 mutex_enter(&rp->r_statelock);
2780 if (rp->r_flags & R4RECOVERRP)
2781 error = (rp->r_error ? rp->r_error : EIO);
2782 else
2783 error = 0;
2784 mutex_exit(&rp->r_statelock);
2785 if (error)
2786 return (error);
2787
2788 if (ioflag & FAPPEND) {
2789 struct vattr va;
2790
2791 /*
2792 * Must serialize if appending.
2793 */
2794 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2795 nfs_rw_exit(&rp->r_rwlock);
2796 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2797 INTR4(vp)))
2798 return (EINTR);
2799 }
2800
2801 va.va_mask = AT_SIZE;
2802 error = nfs4getattr(vp, &va, cr);
2803 if (error)
2804 return (error);
2805 uiop->uio_loffset = va.va_size;
2806 }
2807
2808 offset = uiop->uio_loffset + uiop->uio_resid;
2809
2810 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2811 return (EINVAL);
2812
2813 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2814 limit = MAXOFFSET_T;
2815
2816 /*
2817 * Check to make sure that the process will not exceed
2818 * its limit on file size. It is okay to write up to
2819 * the limit, but not beyond. Thus, the write which
2820 * reaches the limit will be short and the next write
2821 * will return an error.
2822 */
2823 remainder = 0;
2824 if (offset > uiop->uio_llimit) {
2825 remainder = offset - uiop->uio_llimit;
2826 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2827 if (uiop->uio_resid <= 0) {
2828 proc_t *p = ttoproc(curthread);
2829
2830 uiop->uio_resid += remainder;
2831 mutex_enter(&p->p_lock);
2832 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2833 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2834 mutex_exit(&p->p_lock);
2835 return (EFBIG);
2836 }
2837 }
2838
2839 /* update the change attribute, if we have a write delegation */
2840
2841 mutex_enter(&rp->r_statev4_lock);
2842 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2843 rp->r_deleg_change++;
2844
2845 mutex_exit(&rp->r_statev4_lock);
2846
2847 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2848 return (EINTR);
2849
2850 /*
2851 * Bypass VM if caching has been disabled (e.g., locking) or if
2852 * using client-side direct I/O and the file is not mmap'd and
2853 * there are no cached pages.
2854 */
2855 if ((vp->v_flag & VNOCACHE) ||
2856 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2857 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2858 size_t bufsize;
2859 int count;
2860 u_offset_t org_offset;
2861 stable_how4 stab_comm;
2862 nfs4_fwrite:
2863 if (rp->r_flags & R4STALE) {
2864 resid = uiop->uio_resid;
2865 offset = uiop->uio_loffset;
2866 error = rp->r_error;
2867 /*
2868 * A close may have cleared r_error, if so,
2869 * propagate ESTALE error return properly
2870 */
2871 if (error == 0)
2872 error = ESTALE;
2873 goto bottom;
2874 }
2875
2876 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2877 base = kmem_alloc(bufsize, KM_SLEEP);
2878 do {
2879 if (ioflag & FDSYNC)
2880 stab_comm = DATA_SYNC4;
2881 else
2882 stab_comm = FILE_SYNC4;
2883 resid = uiop->uio_resid;
2884 offset = uiop->uio_loffset;
2885 count = MIN(uiop->uio_resid, bufsize);
2886 org_offset = uiop->uio_loffset;
2887 error = uiomove(base, count, UIO_WRITE, uiop);
2888 if (!error) {
2889 error = nfs4write(vp, base, org_offset,
2890 count, cr, &stab_comm);
2891 if (!error) {
2892 mutex_enter(&rp->r_statelock);
2893 if (rp->r_size < uiop->uio_loffset)
2894 rp->r_size = uiop->uio_loffset;
2895 mutex_exit(&rp->r_statelock);
2896 }
2897 }
2898 } while (!error && uiop->uio_resid > 0);
2899 kmem_free(base, bufsize);
2900 goto bottom;
2901 }
2902
2903 bsize = vp->v_vfsp->vfs_bsize;
2904
2905 do {
2906 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2907 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2908 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2909
2910 resid = uiop->uio_resid;
2911 offset = uiop->uio_loffset;
2912
2913 if (rp->r_flags & R4STALE) {
2914 error = rp->r_error;
2915 /*
2916 * A close may have cleared r_error, if so,
2917 * propagate ESTALE error return properly
2918 */
2919 if (error == 0)
2920 error = ESTALE;
2921 break;
2922 }
2923
2924 /*
2925 * Don't create dirty pages faster than they
2926 * can be cleaned so that the system doesn't
2927 * get imbalanced. If the async queue is
2928 * maxed out, then wait for it to drain before
2929 * creating more dirty pages. Also, wait for
2930 * any threads doing pagewalks in the vop_getattr
2931 * entry points so that they don't block for
2932 * long periods.
2933 */
2934 mutex_enter(&rp->r_statelock);
2935 while ((mi->mi_max_threads != 0 &&
2936 rp->r_awcount > 2 * mi->mi_max_threads) ||
2937 rp->r_gcount > 0) {
2938 if (INTR4(vp)) {
2939 klwp_t *lwp = ttolwp(curthread);
2940
2941 if (lwp != NULL)
2942 lwp->lwp_nostop++;
2943 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2944 mutex_exit(&rp->r_statelock);
2945 if (lwp != NULL)
2946 lwp->lwp_nostop--;
2947 error = EINTR;
2948 goto bottom;
2949 }
2950 if (lwp != NULL)
2951 lwp->lwp_nostop--;
2952 } else
2953 cv_wait(&rp->r_cv, &rp->r_statelock);
2954 }
2955 mutex_exit(&rp->r_statelock);
2956
2957 /*
2958 * Touch the page and fault it in if it is not in core
2959 * before segmap_getmapflt or vpm_data_copy can lock it.
2960 * This is to avoid the deadlock if the buffer is mapped
2961 * to the same file through mmap which we want to write.
2962 */
2963 uio_prefaultpages((long)n, uiop);
2964
2965 if (vpm_enable) {
2966 /*
2967 * It will use kpm mappings, so no need to
2968 * pass an address.
2969 */
2970 error = writerp4(rp, NULL, n, uiop, 0);
2971 } else {
2972 if (segmap_kpm) {
2973 int pon = uiop->uio_loffset & PAGEOFFSET;
2974 size_t pn = MIN(PAGESIZE - pon,
2975 uiop->uio_resid);
2976 int pagecreate;
2977
2978 mutex_enter(&rp->r_statelock);
2979 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2980 uiop->uio_loffset + pn >= rp->r_size);
2981 mutex_exit(&rp->r_statelock);
2982
2983 base = segmap_getmapflt(segkmap, vp, off + on,
2984 pn, !pagecreate, S_WRITE);
2985
2986 error = writerp4(rp, base + pon, n, uiop,
2987 pagecreate);
2988
2989 } else {
2990 base = segmap_getmapflt(segkmap, vp, off + on,
2991 n, 0, S_READ);
2992 error = writerp4(rp, base + on, n, uiop, 0);
2993 }
2994 }
2995
2996 if (!error) {
2997 if (mi->mi_flags & MI4_NOAC)
2998 flags = SM_WRITE;
2999 else if ((uiop->uio_loffset % bsize) == 0 ||
3000 IS_SWAPVP(vp)) {
3001 /*
3002 * Have written a whole block.
3003 * Start an asynchronous write
3004 * and mark the buffer to
3005 * indicate that it won't be
3006 * needed again soon.
3007 */
3008 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3009 } else
3010 flags = 0;
3011 if ((ioflag & (FSYNC|FDSYNC)) ||
3012 (rp->r_flags & R4OUTOFSPACE)) {
3013 flags &= ~SM_ASYNC;
3014 flags |= SM_WRITE;
3015 }
3016 if (vpm_enable) {
3017 error = vpm_sync_pages(vp, off, n, flags);
3018 } else {
3019 error = segmap_release(segkmap, base, flags);
3020 }
3021 } else {
3022 if (vpm_enable) {
3023 (void) vpm_sync_pages(vp, off, n, 0);
3024 } else {
3025 (void) segmap_release(segkmap, base, 0);
3026 }
3027 /*
3028 * In the event that we got an access error while
3029 * faulting in a page for a write-only file just
3030 * force a write.
3031 */
3032 if (error == EACCES)
3033 goto nfs4_fwrite;
3034 }
3035 } while (!error && uiop->uio_resid > 0);
3036
3037 bottom:
3038 if (error) {
3039 uiop->uio_resid = resid + remainder;
3040 uiop->uio_loffset = offset;
3041 } else {
3042 uiop->uio_resid += remainder;
3043
3044 mutex_enter(&rp->r_statev4_lock);
3045 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3046 gethrestime(&rp->r_attr.va_mtime);
3047 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3048 }
3049 mutex_exit(&rp->r_statev4_lock);
3050 }
3051
3052 nfs_rw_exit(&rp->r_lkserlock);
3053
3054 return (error);
3055 }
3056
3057 /*
3058 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3059 */
3060 static int
3061 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3062 int flags, cred_t *cr)
3063 {
3064 struct buf *bp;
3065 int error;
3066 page_t *savepp;
3067 uchar_t fsdata;
3068 stable_how4 stab_comm;
3069
3070 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3071 bp = pageio_setup(pp, len, vp, flags);
3072 ASSERT(bp != NULL);
3073
3074 /*
3075 * pageio_setup should have set b_addr to 0. This
3076 * is correct since we want to do I/O on a page
3077 * boundary. bp_mapin will use this addr to calculate
3078 * an offset, and then set b_addr to the kernel virtual
3079 * address it allocated for us.
3080 */
3081 ASSERT(bp->b_un.b_addr == 0);
3082
3083 bp->b_edev = 0;
3084 bp->b_dev = 0;
3085 bp->b_lblkno = lbtodb(off);
3086 bp->b_file = vp;
3087 bp->b_offset = (offset_t)off;
3088 bp_mapin(bp);
3089
3090 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3091 freemem > desfree)
3092 stab_comm = UNSTABLE4;
3093 else
3094 stab_comm = FILE_SYNC4;
3095
3096 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3097
3098 bp_mapout(bp);
3099 pageio_done(bp);
3100
3101 if (stab_comm == UNSTABLE4)
3102 fsdata = C_DELAYCOMMIT;
3103 else
3104 fsdata = C_NOCOMMIT;
3105
3106 savepp = pp;
3107 do {
3108 pp->p_fsdata = fsdata;
3109 } while ((pp = pp->p_next) != savepp);
3110
3111 return (error);
3112 }
3113
/*
 * Check whether the open stream for this file/credential pair is still
 * usable for I/O.  Called by nfs4read()/nfs4write() after a BAD stateid
 * error was received while using a delegation stateid.  If the only open
 * stream was created as part of acquiring a delegation (os_delegation set),
 * the file must be reopened (CLAIM_NULL) before I/O can be retried with an
 * open stateid.  Returns 0 if the open stream is (now) usable, EIO otherwise.
 */
static int
nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int reopen_needed;

	ASSERT(nfs_zone() == mi->mi_zone);


	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	if (!oop)
		return (EIO);

	/* returns with 'os_sync_lock' held */
	osp = find_open_stream(oop, rp);
	if (!osp) {
		open_owner_rele(oop);
		return (EIO);
	}

	/* A previous reopen attempt already failed; give up. */
	if (osp->os_failed_reopen) {
		mutex_exit(&osp->os_sync_lock);
		open_stream_rele(osp, rp);
		open_owner_rele(oop);
		return (EIO);
	}

	/*
	 * Determine whether a reopen is needed. If this
	 * is a delegation open stream, then the os_delegation bit
	 * should be set.
	 */

	reopen_needed = osp->os_delegation;

	/*
	 * Drop the lock (and the open owner hold) before the reopen; the
	 * open stream reference taken by find_open_stream keeps osp valid.
	 */
	mutex_exit(&osp->os_sync_lock);
	open_owner_rele(oop);

	if (reopen_needed) {
		nfs4_error_zinit(ep);
		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
		/* Re-check under the lock whether the reopen succeeded. */
		mutex_enter(&osp->os_sync_lock);
		if (ep->error || ep->stat || osp->os_failed_reopen) {
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			return (EIO);
		}
		mutex_exit(&osp->os_sync_lock);
	}
	open_stream_rele(osp, rp);

	return (0);
}
3172
3173 /*
3174 * Write to file. Writes to remote server in largest size
3175 * chunks that the server can handle. Write is synchronous.
3176 */
static int
nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
    stable_how4 *stab_comm)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	WRITE4args *wargs;
	WRITE4res *wres;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int tsize;
	stable_how4 stable;
	rnode4_t *rp;
	int doqueue = 1;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int recov;

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	/*
	 * Request the caller's desired stability from the server, but
	 * report FILE_SYNC4 back unless some chunk comes back committed
	 * UNSTABLE4 (see the wres->committed check below).
	 */
	stable = *stab_comm;
	*stab_comm = FILE_SYNC4;

	needrecov = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	nfs4_init_stateid_types(&sid_types);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (mi->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

recov_retry:
	args.ctag = TAG_WRITE;
	args.array_len = 2;
	args.array = argop;

	/*
	 * The recovery thread itself must not block in nfs4_start_fop
	 * waiting for the recovery it is driving.
	 */
	if (!recov) {
		e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
		    &recov_state, NULL);
		if (e.error)
			return (e.error);
	}

	/* 0. putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* 1. write */
	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);

	do {

		wargs->offset = (offset4)offset;
		wargs->data_val = base;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/*
		 * Directio writes may use the server's full transfer size;
		 * otherwise stay within the current (possibly reduced)
		 * negotiated write size.
		 */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_stsize, count);
		else
			tsize = MIN(mi->mi_curwrite, count);
		wargs->data_len = (uint_t)tsize;
		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!recov) {
			needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
			if (e.error && !needrecov) {
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
				return (e.error);
			}
		} else {
			if (e.error)
				return (e.error);
		}

		/*
		 * Do handling of OLD_STATEID outside
		 * of the normal recovery framework.
		 *
		 * If write receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&wargs->stateid, &sid_types);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			nfs4_save_stateid(&wargs->stateid, &sid_types);
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				if (!recov)
					nfs4_end_fop(mi, vp, NULL, OH_WRITE,
					    &recov_state, needrecov);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				return (EIO);
			}
			if (!recov)
				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4write: client got error %d, res.status %d"
			    ", so start recovery", e.error, res.status));

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, &wargs->stateid,
			    NULL, OP_WRITE, NULL, NULL, NULL);
			if (!e.error) {
				e.error = geterrno4(res.status);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			}
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
			    &recov_state, needrecov);
			if (abort == FALSE)
				goto recov_retry;
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			return (e.error);
		}

		resop = &res.array[1];	/* write res */
		wres = &resop->nfs_resop4_u.opwrite;

		/* A sane server never writes more than it was asked to. */
		if ((int)wres->count > tsize) {
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

			zcmn_err(getzoneid(), CE_WARN,
			    "nfs4write: server wrote %u, requested was %u",
			    (int)wres->count, tsize);
			if (!recov)
				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
				    &recov_state, needrecov);
			return (EIO);
		}
		if (wres->committed == UNSTABLE4) {
			/* Tell the caller a COMMIT will be needed later. */
			*stab_comm = UNSTABLE4;
			if (wargs->stable == DATA_SYNC4 ||
			    wargs->stable == FILE_SYNC4) {
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				zcmn_err(getzoneid(), CE_WARN,
				    "nfs4write: server %s did not commit "
				    "to stable storage",
				    rp->r_server->sv_hostname);
				if (!recov)
					nfs4_end_fop(VTOMI4(vp), vp, NULL,
					    OH_WRITE, &recov_state, needrecov);
				return (EIO);
			}
		}

		/* Advance past whatever the server accepted (may be short). */
		tsize = (int)wres->count;
		count -= tsize;
		base += tsize;
		offset += tsize;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
			    tsize;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_OUBLK, 1);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & R4HAVEVERF) {
			/*
			 * A changed write verifier means the server lost
			 * previously sent unstable data (e.g. it rebooted);
			 * mark the modified pages so they are written again.
			 */
			if (rp->r_writeverf != wres->writeverf) {
				nfs4_set_mod(vp);
				rp->r_writeverf = wres->writeverf;
			}
		} else {
			rp->r_writeverf = wres->writeverf;
			rp->r_flags |= R4HAVEVERF;
		}
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags |= R4WRITEMODIFIED;
		gethrestime(&rp->r_attr.va_mtime);
		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
		mutex_exit(&rp->r_statelock);
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} while (count);

	if (!recov)
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
		    needrecov);

	return (e.error);
}
3410
3411 /*
3412 * Read from a file. Reads data in largest chunks our interface can handle.
3413 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/*
		 * Directio reads may use the server's full transfer size;
		 * otherwise stay within the current (possibly reduced)
		 * negotiated read size.
		 */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);

		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		rargs->wlist = NULL;

		/* Decode straight into the uio if one was supplied. */
		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef DEBUG
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework. There are two differences
		 * between async and sync reads. The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists). If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			if (sid_types.cur_sid_type == SPEC_SID) {
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			/*
			 * The delegation stateid went bad: flag the
			 * delegation for return, make sure the open
			 * stream is usable, then retry with the next
			 * stateid type.
			 */
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));
			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL, NULL, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
			 * stateid. This avoids looping with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    sid_types.cur_sid_type == SPEC_SID)
				abort = TRUE;

			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				goto recov_retry;
			}

			if (!e.error) {
				e.error = geterrno4(res.status);
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			}
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			return (e.error);
		}

		/* Advance past whatever the server returned (may be short). */
		data_len = res.array[1].nfs_resop4_u.opread.data_len;
		count -= data_len;
		if (base)
			base += data_len;
		offset += data_len;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
		is_eof = res.array[1].nfs_resop4_u.opread.eof;
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	} while (count && !is_eof);

	/* Report back how much was not read (EOF case). */
	*residp = count;

	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);

	return (e.error);
}
3640
3641 /* ARGSUSED */
3642 static int
3643 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3644 caller_context_t *ct)
3645 {
3646 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3647 return (EIO);
3648 switch (cmd) {
3649 case _FIODIRECTIO:
3650 return (nfs4_directio(vp, (int)arg, cr));
3651 default:
3652 return (ENOTTY);
3653 }
3654 }
3655
3656 /* ARGSUSED */
3657 int
3658 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3659 caller_context_t *ct)
3660 {
3661 int error;
3662 rnode4_t *rp = VTOR4(vp);
3663
3664 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3665 return (EIO);
3666 /*
3667 * If it has been specified that the return value will
3668 * just be used as a hint, and we are only being asked
3669 * for size, fsid or rdevid, then return the client's
3670 * notion of these values without checking to make sure
3671 * that the attribute cache is up to date.
3672 * The whole point is to avoid an over the wire GETATTR
3673 * call.
3674 */
3675 if (flags & ATTR_HINT) {
3676 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3677 mutex_enter(&rp->r_statelock);
3678 if (vap->va_mask & AT_SIZE)
3679 vap->va_size = rp->r_size;
3680 if (vap->va_mask & AT_FSID)
3681 vap->va_fsid = rp->r_attr.va_fsid;
3682 if (vap->va_mask & AT_RDEV)
3683 vap->va_rdev = rp->r_attr.va_rdev;
3684 mutex_exit(&rp->r_statelock);
3685 return (0);
3686 }
3687 }
3688
3689 /*
3690 * Only need to flush pages if asking for the mtime
3691 * and if there any dirty pages or any outstanding
3692 * asynchronous (write) requests for this file.
3693 */
3694 if (vap->va_mask & AT_MTIME) {
3695 rp = VTOR4(vp);
3696 if (nfs4_has_pages(vp)) {
3697 mutex_enter(&rp->r_statev4_lock);
3698 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3699 mutex_exit(&rp->r_statev4_lock);
3700 if (rp->r_flags & R4DIRTY ||
3701 rp->r_awcount > 0) {
3702 mutex_enter(&rp->r_statelock);
3703 rp->r_gcount++;
3704 mutex_exit(&rp->r_statelock);
3705 error =
3706 nfs4_putpage(vp, (u_offset_t)0,
3707 0, 0, cr, NULL);
3708 mutex_enter(&rp->r_statelock);
3709 if (error && (error == ENOSPC ||
3710 error == EDQUOT)) {
3711 if (!rp->r_error)
3712 rp->r_error = error;
3713 }
3714 if (--rp->r_gcount == 0)
3715 cv_broadcast(&rp->r_cv);
3716 mutex_exit(&rp->r_statelock);
3717 }
3718 } else {
3719 mutex_exit(&rp->r_statev4_lock);
3720 }
3721 }
3722 }
3723 return (nfs4getattr(vp, vap, cr));
3724 }
3725
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	/*
	 * The server's mode is allowed to differ from the client's only
	 * by having the setuid and/or setgid bits cleared.  Return 0 (OK)
	 * when the modes match once those two bits are ignored on the
	 * client side; return 1 (BAD) otherwise.
	 */
	mode_t client_masked = on_client & ~(S_ISUID | S_ISGID);

	return (client_masked != from_server ? 1 : 0);
}
3740
3741 /*ARGSUSED4*/
3742 static int
3743 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3744 caller_context_t *ct)
3745 {
3746 int error;
3747
3748 if (vap->va_mask & AT_NOSET)
3749 return (EINVAL);
3750
3751 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3752 return (EIO);
3753
3754 /*
3755 * Don't call secpolicy_vnode_setattr, the client cannot
3756 * use its cached attributes to make security decisions
3757 * as the server may be faking mode bits or mapping uid/gid.
3758 * Always just let the server to the checking.
3759 * If we provide the ability to remove basic priviledges
3760 * to setattr (e.g. basic without chmod) then we will
3761 * need to add a check here before calling the server.
3762 */
3763 error = nfs4setattr(vp, vap, flags, cr, NULL);
3764
3765 if (error == 0 && (vap->va_mask & AT_SIZE)) {
3766 if (vap->va_size == 0) {
3767 vnevent_truncate(vp, ct);
3768 } else {
3769 vnevent_resize(vp, ct);
3770 }
3771 }
3772
3773 return (error);
3774 }
3775
3776 /*
3777 * To replace the "guarded" version 3 setattr, we use two types of compound
3778 * setattr requests:
3779 * 1. The "normal" setattr, used when the size of the file isn't being
 *	changed - { Putfh <fh>; Setattr; Getattr }.
3781 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3782 * with only ctime as the argument. If the server ctime differs from
3783 * what is cached on the client, the verify will fail, but we would
3784 * already have the ctime from the preceding getattr, so just set it
3785 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3786 * Setattr; Getattr }.
3787 *
3788 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3789 * this setattr and NULL if they are not.
3790 */
3791 static int
3792 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3793 vsecattr_t *vsap)
3794 {
3795 COMPOUND4args_clnt args;
3796 COMPOUND4res_clnt res, *resp = NULL;
3797 nfs4_ga_res_t *garp = NULL;
3798 int numops = 3; /* { Putfh; Setattr; Getattr } */
3799 nfs_argop4 argop[5];
3800 int verify_argop = -1;
3801 int setattr_argop = 1;
3802 nfs_resop4 *resop;
3803 vattr_t va;
3804 rnode4_t *rp;
3805 int doqueue = 1;
3806 uint_t mask = vap->va_mask;
3807 mode_t omode;
3808 vsecattr_t *vsp;
3809 timestruc_t ctime;
3810 bool_t needrecov = FALSE;
3811 nfs4_recov_state_t recov_state;
3812 nfs4_stateid_types_t sid_types;
3813 stateid4 stateid;
3814 hrtime_t t;
3815 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3816 servinfo4_t *svp;
3817 bitmap4 supp_attrs;
3818
3819 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3820 rp = VTOR4(vp);
3821 nfs4_init_stateid_types(&sid_types);
3822
3823 /*
3824 * Only need to flush pages if there are any pages and
3825 * if the file is marked as dirty in some fashion. The
3826 * file must be flushed so that we can accurately
3827 * determine the size of the file and the cached data
3828 * after the SETATTR returns. A file is considered to
3829 * be dirty if it is either marked with R4DIRTY, has
3830 * outstanding i/o's active, or is mmap'd. In this
3831 * last case, we can't tell whether there are dirty
3832 * pages, so we flush just to be sure.
3833 */
3834 if (nfs4_has_pages(vp) &&
3835 ((rp->r_flags & R4DIRTY) ||
3836 rp->r_count > 0 ||
3837 rp->r_mapcnt > 0)) {
3838 ASSERT(vp->v_type != VCHR);
3839 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3840 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3841 mutex_enter(&rp->r_statelock);
3842 if (!rp->r_error)
3843 rp->r_error = e.error;
3844 mutex_exit(&rp->r_statelock);
3845 }
3846 }
3847
3848 if (mask & AT_SIZE) {
3849 /*
3850 * Verification setattr compound for non-deleg AT_SIZE:
3851 * { Putfh; Getattr; Verify; Setattr; Getattr }
3852 * Set ctime local here (outside the do_again label)
3853 * so that subsequent retries (after failed VERIFY)
3854 * will use ctime from GETATTR results (from failed
3855 * verify compound) as VERIFY arg.
3856 * If file has delegation, then VERIFY(time_metadata)
3857 * is of little added value, so don't bother.
3858 */
3859 mutex_enter(&rp->r_statev4_lock);
3860 if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3861 rp->r_deleg_return_pending) {
3862 numops = 5;
3863 ctime = rp->r_attr.va_ctime;
3864 }
3865 mutex_exit(&rp->r_statev4_lock);
3866 }
3867
3868 recov_state.rs_flags = 0;
3869 recov_state.rs_num_retry_despite_err = 0;
3870
3871 args.ctag = TAG_SETATTR;
3872 do_again:
3873 recov_retry:
3874 setattr_argop = numops - 2;
3875
3876 args.array = argop;
3877 args.array_len = numops;
3878
3879 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3880 if (e.error)
3881 return (e.error);
3882
3883
3884 /* putfh target fh */
3885 argop[0].argop = OP_CPUTFH;
3886 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3887
3888 if (numops == 5) {
3889 /*
3890 * We only care about the ctime, but need to get mtime
3891 * and size for proper cache update.
3892 */
3893 /* getattr */
3894 argop[1].argop = OP_GETATTR;
3895 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3896 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3897
3898 /* verify - set later in loop */
3899 verify_argop = 2;
3900 }
3901
3902 /* setattr */
3903 svp = rp->r_server;
3904 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3905 supp_attrs = svp->sv_supp_attrs;
3906 nfs_rw_exit(&svp->sv_lock);
3907
3908 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3909 supp_attrs, &e.error, &sid_types);
3910 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3911 if (e.error) {
3912 /* req time field(s) overflow - return immediately */
3913 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3914 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3915 opsetattr.obj_attributes);
3916 return (e.error);
3917 }
3918 omode = rp->r_attr.va_mode;
3919
3920 /* getattr */
3921 argop[numops-1].argop = OP_GETATTR;
3922 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3923 /*
3924 * If we are setting the ACL (indicated only by vsap != NULL), request
3925 * the ACL in this getattr. The ACL returned from this getattr will be
3926 * used in updating the ACL cache.
3927 */
3928 if (vsap != NULL)
3929 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3930 FATTR4_ACL_MASK;
3931 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3932
3933 /*
3934 * setattr iterates if the object size is set and the cached ctime
3935 * does not match the file ctime. In that case, verify the ctime first.
3936 */
3937
3938 do {
3939 if (verify_argop != -1) {
3940 /*
3941 * Verify that the ctime match before doing setattr.
3942 */
3943 va.va_mask = AT_CTIME;
3944 va.va_ctime = ctime;
3945 svp = rp->r_server;
3946 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3947 supp_attrs = svp->sv_supp_attrs;
3948 nfs_rw_exit(&svp->sv_lock);
3949 e.error = nfs4args_verify(&argop[verify_argop], &va,
3950 OP_VERIFY, supp_attrs);
3951 if (e.error) {
3952 /* req time field(s) overflow - return */
3953 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3954 needrecov);
3955 break;
3956 }
3957 }
3958
3959 doqueue = 1;
3960
3961 t = gethrtime();
3962
3963 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3964
3965 /*
3966 * Purge the access cache and ACL cache if changing either the
3967 * owner of the file, the group owner, or the mode. These may
3968 * change the access permissions of the file, so purge old
3969 * information and start over again.
3970 */
3971 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3972 (void) nfs4_access_purge_rp(rp);
3973 if (rp->r_secattr != NULL) {
3974 mutex_enter(&rp->r_statelock);
3975 vsp = rp->r_secattr;
3976 rp->r_secattr = NULL;
3977 mutex_exit(&rp->r_statelock);
3978 if (vsp != NULL)
3979 nfs4_acl_free_cache(vsp);
3980 }
3981 }
3982
3983 /*
3984 * If res.array_len == numops, then everything succeeded,
3985 * except for possibly the final getattr. If only the
3986 * last getattr failed, give up, and don't try recovery.
3987 */
3988 if (res.array_len == numops) {
3989 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3990 needrecov);
3991 if (! e.error)
3992 resp = &res;
3993 break;
3994 }
3995
3996 /*
3997 * if either rpc call failed or completely succeeded - done
3998 */
3999 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4000 if (e.error) {
4001 PURGE_ATTRCACHE4(vp);
4002 if (!needrecov) {
4003 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4004 needrecov);
4005 break;
4006 }
4007 }
4008
4009 /*
4010 * Do proper retry for OLD_STATEID outside of the normal
4011 * recovery framework.
4012 */
4013 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4014 sid_types.cur_sid_type != SPEC_SID &&
4015 sid_types.cur_sid_type != NO_SID) {
4016 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4017 needrecov);
4018 nfs4_save_stateid(&stateid, &sid_types);
4019 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4020 opsetattr.obj_attributes);
4021 if (verify_argop != -1) {
4022 nfs4args_verify_free(&argop[verify_argop]);
4023 verify_argop = -1;
4024 }
4025 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4026 goto recov_retry;
4027 }
4028
4029 if (needrecov) {
4030 bool_t abort;
4031
4032 abort = nfs4_start_recovery(&e,
4033 VTOMI4(vp), vp, NULL, NULL, NULL,
4034 OP_SETATTR, NULL, NULL, NULL);
4035 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4036 needrecov);
4037 /*
4038 * Do not retry if we failed with OLD_STATEID using
4039 * a special stateid. This is done to avoid looping
4040 * with a broken server.
4041 */
4042 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4043 (sid_types.cur_sid_type == SPEC_SID ||
4044 sid_types.cur_sid_type == NO_SID))
4045 abort = TRUE;
4046 if (!e.error) {
4047 if (res.status == NFS4ERR_BADOWNER)
4048 nfs4_log_badowner(VTOMI4(vp),
4049 OP_SETATTR);
4050
4051 e.error = geterrno4(res.status);
4052 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4053 }
4054 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4055 opsetattr.obj_attributes);
4056 if (verify_argop != -1) {
4057 nfs4args_verify_free(&argop[verify_argop]);
4058 verify_argop = -1;
4059 }
4060 if (abort == FALSE) {
4061 /*
4062 * Need to retry all possible stateids in
4063 * case the recovery error wasn't stateid
4064 * related or the stateids have become
4065 * stale (server reboot).
4066 */
4067 nfs4_init_stateid_types(&sid_types);
4068 goto recov_retry;
4069 }
4070 return (e.error);
4071 }
4072
4073 /*
4074 * Need to call nfs4_end_op before nfs4getattr to
4075 * avoid potential nfs4_start_op deadlock. See RFE
4076 * 4777612. Calls to nfs4_invalidate_pages() and
4077 * nfs4_purge_stale_fh() might also generate over the
4078 * wire calls which my cause nfs4_start_op() deadlock.
4079 */
4080 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4081
4082 /*
4083 * Check to update lease.
4084 */
4085 resp = &res;
4086 if (res.status == NFS4_OK) {
4087 break;
4088 }
4089
4090 /*
4091 * Check if verify failed to see if try again
4092 */
4093 if ((verify_argop == -1) || (res.array_len != 3)) {
4094 /*
4095 * can't continue...
4096 */
4097 if (res.status == NFS4ERR_BADOWNER)
4098 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4099
4100 e.error = geterrno4(res.status);
4101 } else {
4102 /*
4103 * When the verify request fails, the client ctime is
4104 * not in sync with the server. This is the same as
4105 * the version 3 "not synchronized" error, and we
4106 * handle it in a similar manner (XXX do we need to???).
4107 * Use the ctime returned in the first getattr for
4108 * the input to the next verify.
4109 * If we couldn't get the attributes, then we give up
4110 * because we can't complete the operation as required.
4111 */
4112 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4113 }
4114 if (e.error) {
4115 PURGE_ATTRCACHE4(vp);
4116 nfs4_purge_stale_fh(e.error, vp, cr);
4117 } else {
4118 /*
4119 * retry with a new verify value
4120 */
4121 ctime = garp->n4g_va.va_ctime;
4122 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4123 resp = NULL;
4124 }
4125 if (!e.error) {
4126 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4127 opsetattr.obj_attributes);
4128 if (verify_argop != -1) {
4129 nfs4args_verify_free(&argop[verify_argop]);
4130 verify_argop = -1;
4131 }
4132 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4133 goto do_again;
4134 }
4135 } while (!e.error);
4136
4137 if (e.error) {
4138 /*
4139 * If we are here, rfs4call has an irrecoverable error - return
4140 */
4141 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4142 opsetattr.obj_attributes);
4143 if (verify_argop != -1) {
4144 nfs4args_verify_free(&argop[verify_argop]);
4145 verify_argop = -1;
4146 }
4147 if (resp)
4148 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4149 return (e.error);
4150 }
4151
4152
4153
4154 /*
4155 * If changing the size of the file, invalidate
4156 * any local cached data which is no longer part
4157 * of the file. We also possibly invalidate the
4158 * last page in the file. We could use
4159 * pvn_vpzero(), but this would mark the page as
4160 * modified and require it to be written back to
4161 * the server for no particularly good reason.
4162 * This way, if we access it, then we bring it
4163 * back in. A read should be cheaper than a
4164 * write.
4165 */
4166 if (mask & AT_SIZE) {
4167 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4168 }
4169
4170 /* either no error or one of the postop getattr failed */
4171
4172 /*
4173 * XXX Perform a simplified version of wcc checking. Instead of
4174 * have another getattr to get pre-op, just purge cache if
4175 * any of the ops prior to and including the getattr failed.
4176 * If the getattr succeeded then update the attrcache accordingly.
4177 */
4178
4179 garp = NULL;
4180 if (res.status == NFS4_OK) {
4181 /*
4182 * Last getattr
4183 */
4184 resop = &res.array[numops - 1];
4185 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4186 }
4187 /*
4188 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4189 * rather than filling it. See the function itself for details.
4190 */
4191 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4192 if (garp != NULL) {
4193 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4194 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4195 vs_ace4_destroy(&garp->n4g_vsa);
4196 } else {
4197 if (vsap != NULL) {
4198 /*
4199 * The ACL was supposed to be set and to be
4200 * returned in the last getattr of this
4201 * compound, but for some reason the getattr
4202 * result doesn't contain the ACL. In this
4203 * case, purge the ACL cache.
4204 */
4205 if (rp->r_secattr != NULL) {
4206 mutex_enter(&rp->r_statelock);
4207 vsp = rp->r_secattr;
4208 rp->r_secattr = NULL;
4209 mutex_exit(&rp->r_statelock);
4210 if (vsp != NULL)
4211 nfs4_acl_free_cache(vsp);
4212 }
4213 }
4214 }
4215 }
4216
4217 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4218 /*
4219 * Set the size, rather than relying on getting it updated
4220 * via a GETATTR. With delegations the client tries to
4221 * suppress GETATTR calls.
4222 */
4223 mutex_enter(&rp->r_statelock);
4224 rp->r_size = vap->va_size;
4225 mutex_exit(&rp->r_statelock);
4226 }
4227
4228 /*
4229 * Can free up request args and res
4230 */
4231 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4232 opsetattr.obj_attributes);
4233 if (verify_argop != -1) {
4234 nfs4args_verify_free(&argop[verify_argop]);
4235 verify_argop = -1;
4236 }
4237 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4238
4239 /*
4240 * Some servers will change the mode to clear the setuid
4241 * and setgid bits when changing the uid or gid. The
4242 * client needs to compensate appropriately.
4243 */
4244 if (mask & (AT_UID | AT_GID)) {
4245 int terror, do_setattr;
4246
4247 do_setattr = 0;
4248 va.va_mask = AT_MODE;
4249 terror = nfs4getattr(vp, &va, cr);
4250 if (!terror &&
4251 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4252 (!(mask & AT_MODE) && va.va_mode != omode))) {
4253 va.va_mask = AT_MODE;
4254 if (mask & AT_MODE) {
4255 /*
4256 * We asked the mode to be changed and what
4257 * we just got from the server in getattr is
4258 * not what we wanted it to be, so set it now.
4259 */
4260 va.va_mode = vap->va_mode;
4261 do_setattr = 1;
4262 } else {
4263 /*
4264 * We did not ask the mode to be changed,
4265 * Check to see that the server just cleared
4266 * I_SUID and I_GUID from it. If not then
4267 * set mode to omode with UID/GID cleared.
4268 */
4269 if (nfs4_compare_modes(va.va_mode, omode)) {
4270 omode &= ~(S_ISUID|S_ISGID);
4271 va.va_mode = omode;
4272 do_setattr = 1;
4273 }
4274 }
4275
4276 if (do_setattr)
4277 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4278 }
4279 }
4280
4281 return (e.error);
4282 }
4283
/*
 * Check access permissions (VOP_ACCESS) for vp with credentials cr.
 *
 * The requested vnode mode bits (VREAD/VWRITE/VEXEC) are mapped to
 * NFSv4 ACCESS4_* bits.  The rnode access cache is consulted first;
 * only on a cache miss is an over-the-wire ACCESS (plus, when no
 * delegation is held, a GETATTR) compound issued.  The OTW request
 * asks for every access bit relevant to this vnode type (argacc) so
 * that later checks can be answered from the cache.
 *
 * If access is denied under cr, the check is retried once with the
 * "network adjusted" credentials from crnetadjust(); see the comment
 * near the EACCES handling below.
 *
 * Returns 0 if access is allowed, otherwise an errno (EIO, EROFS,
 * EACCES, ...).
 */
/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/*
	 * Translate the requested vnode mode bits into NFSv4 access bits.
	 */
	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	/*
	 * Request all access bits that apply to this vnode type so the
	 * cached result can satisfy future access checks without
	 * another OTW call.
	 */
	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take with r_statev4_lock here. r_deleg_type could
	 * change as soon as lock is released. Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	/*
	 * Remember whether the RPC itself failed; if it did, res was
	 * never filled in and must not be xdr_free'd at "out" below.
	 */
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}
4497
4498 /* ARGSUSED */
4499 static int
4500 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4501 {
4502 COMPOUND4args_clnt args;
4503 COMPOUND4res_clnt res;
4504 int doqueue;
4505 rnode4_t *rp;
4506 nfs_argop4 argop[3];
4507 nfs_resop4 *resop;
4508 READLINK4res *lr_res;
4509 nfs4_ga_res_t *garp;
4510 uint_t len;
4511 char *linkdata;
4512 bool_t needrecov = FALSE;
4513 nfs4_recov_state_t recov_state;
4514 hrtime_t t;
4515 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4516
4517 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4518 return (EIO);
4519 /*
4520 * Can't readlink anything other than a symbolic link.
4521 */
4522 if (vp->v_type != VLNK)
4523 return (EINVAL);
4524
4525 rp = VTOR4(vp);
4526 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4527 e.error = nfs4_validate_caches(vp, cr);
4528 if (e.error)
4529 return (e.error);
4530 mutex_enter(&rp->r_statelock);
4531 if (rp->r_symlink.contents != NULL) {
4532 e.error = uiomove(rp->r_symlink.contents,
4533 rp->r_symlink.len, UIO_READ, uiop);
4534 mutex_exit(&rp->r_statelock);
4535 return (e.error);
4536 }
4537 mutex_exit(&rp->r_statelock);
4538 }
4539 recov_state.rs_flags = 0;
4540 recov_state.rs_num_retry_despite_err = 0;
4541
4542 recov_retry:
4543 args.array_len = 3;
4544 args.array = argop;
4545 args.ctag = TAG_READLINK;
4546
4547 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4548 if (e.error) {
4549 return (e.error);
4550 }
4551
4552 /* 0. putfh symlink fh */
4553 argop[0].argop = OP_CPUTFH;
4554 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4555
4556 /* 1. readlink */
4557 argop[1].argop = OP_READLINK;
4558
4559 /* 2. getattr */
4560 argop[2].argop = OP_GETATTR;
4561 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4562 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4563
4564 doqueue = 1;
4565
4566 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4567 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4568 rnode4info(VTOR4(vp))));
4569
4570 t = gethrtime();
4571
4572 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4573
4574 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4575 if (needrecov) {
4576 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4577 "nfs4_readlink: initiating recovery\n"));
4578
4579 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4580 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4581 if (!e.error)
4582 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4583
4584 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4585 needrecov);
4586 goto recov_retry;
4587 }
4588 }
4589
4590 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4591
4592 if (e.error)
4593 return (e.error);
4594
4595 /*
4596 * There is an path in the code below which calls
4597 * nfs4_purge_stale_fh(), which may generate otw calls through
4598 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4599 * here to avoid nfs4_start_op() deadlock.
4600 */
4601
4602 if (res.status && (res.array_len < args.array_len)) {
4603 /*
4604 * either Putfh or Link failed
4605 */
4606 e.error = geterrno4(res.status);
4607 nfs4_purge_stale_fh(e.error, vp, cr);
4608 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4609 return (e.error);
4610 }
4611
4612 resop = &res.array[1]; /* readlink res */
4613 lr_res = &resop->nfs_resop4_u.opreadlink;
4614
4615 /*
4616 * treat symlink names as data
4617 */
4618 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
4619 if (linkdata != NULL) {
4620 int uio_len = len - 1;
4621 /* len includes null byte, which we won't uiomove */
4622 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4623 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4624 mutex_enter(&rp->r_statelock);
4625 if (rp->r_symlink.contents == NULL) {
4626 rp->r_symlink.contents = linkdata;
4627 rp->r_symlink.len = uio_len;
4628 rp->r_symlink.size = len;
4629 mutex_exit(&rp->r_statelock);
4630 } else {
4631 mutex_exit(&rp->r_statelock);
4632 kmem_free(linkdata, len);
4633 }
4634 } else {
4635 kmem_free(linkdata, len);
4636 }
4637 }
4638 if (res.status == NFS4_OK) {
4639 resop++; /* getattr res */
4640 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4641 }
4642 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4643
4644 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4645
4646 /*
4647 * The over the wire error for attempting to readlink something
4648 * other than a symbolic link is ENXIO. However, we need to
4649 * return EINVAL instead of ENXIO, so we map it here.
4650 */
4651 return (e.error == ENXIO ? EINVAL : e.error);
4652 }
4653
4654 /*
4655 * Flush local dirty pages to stable storage on the server.
4656 *
4657 * If FNODSYNC is specified, then there is nothing to do because
4658 * metadata changes are not cached on the client before being
4659 * sent to the server.
4660 */
4661 /* ARGSUSED */
4662 static int
4663 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4664 {
4665 int error;
4666
4667 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4668 return (0);
4669 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4670 return (EIO);
4671 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4672 if (!error)
4673 error = VTOR4(vp)->r_error;
4674 return (error);
4675 }
4676
4677 /*
4678 * Weirdness: if the file was removed or the target of a rename
4679 * operation while it was open, it got renamed instead. Here we
4680 * remove the renamed file.
4681 */
4682 /* ARGSUSED */
4683 void
4684 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4685 {
4686 rnode4_t *rp;
4687
4688 ASSERT(vp != DNLC_NO_VNODE);
4689
4690 rp = VTOR4(vp);
4691
4692 if (IS_SHADOW(vp, rp)) {
4693 sv_inactive(vp);
4694 return;
4695 }
4696
4697 /*
4698 * If this is coming from the wrong zone, we let someone in the right
4699 * zone take care of it asynchronously. We can get here due to
4700 * VN_RELE() being called from pageout() or fsflush(). This call may
4701 * potentially turn into an expensive no-op if, for instance, v_count
4702 * gets incremented in the meantime, but it's still correct.
4703 */
4704 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4705 nfs4_async_inactive(vp, cr);
4706 return;
4707 }
4708
4709 /*
4710 * Some of the cleanup steps might require over-the-wire
4711 * operations. Since VOP_INACTIVE can get called as a result of
4712 * other over-the-wire operations (e.g., an attribute cache update
4713 * can lead to a DNLC purge), doing those steps now would lead to a
4714 * nested call to the recovery framework, which can deadlock. So
4715 * do any over-the-wire cleanups asynchronously, in a separate
4716 * thread.
4717 */
4718
4719 mutex_enter(&rp->r_os_lock);
4720 mutex_enter(&rp->r_statelock);
4721 mutex_enter(&rp->r_statev4_lock);
4722
4723 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4724 mutex_exit(&rp->r_statev4_lock);
4725 mutex_exit(&rp->r_statelock);
4726 mutex_exit(&rp->r_os_lock);
4727 nfs4_async_inactive(vp, cr);
4728 return;
4729 }
4730
4731 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4732 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4733 mutex_exit(&rp->r_statev4_lock);
4734 mutex_exit(&rp->r_statelock);
4735 mutex_exit(&rp->r_os_lock);
4736 nfs4_async_inactive(vp, cr);
4737 return;
4738 }
4739
4740 if (rp->r_unldvp != NULL) {
4741 mutex_exit(&rp->r_statev4_lock);
4742 mutex_exit(&rp->r_statelock);
4743 mutex_exit(&rp->r_os_lock);
4744 nfs4_async_inactive(vp, cr);
4745 return;
4746 }
4747 mutex_exit(&rp->r_statev4_lock);
4748 mutex_exit(&rp->r_statelock);
4749 mutex_exit(&rp->r_os_lock);
4750
4751 rp4_addfree(rp, cr);
4752 }
4753
4754 /*
4755 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4756 * various bits of state. The caller must not refer to vp after this call.
4757 */
4758
4759 void
4760 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4761 {
4762 rnode4_t *rp = VTOR4(vp);
4763 nfs4_recov_state_t recov_state;
4764 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4765 vnode_t *unldvp;
4766 char *unlname;
4767 cred_t *unlcred;
4768 COMPOUND4args_clnt args;
4769 COMPOUND4res_clnt res, *resp;
4770 nfs_argop4 argop[2];
4771 int doqueue;
4772 #ifdef DEBUG
4773 char *name;
4774 #endif
4775
4776 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4777 ASSERT(!IS_SHADOW(vp, rp));
4778
4779 #ifdef DEBUG
4780 name = fn_name(VTOSV(vp)->sv_name);
4781 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4782 "release vnode %s", name));
4783 kmem_free(name, MAXNAMELEN);
4784 #endif
4785
4786 if (vp->v_type == VREG) {
4787 bool_t recov_failed = FALSE;
4788
4789 e.error = nfs4close_all(vp, cr);
4790 if (e.error) {
4791 /* Check to see if recovery failed */
4792 mutex_enter(&(VTOMI4(vp)->mi_lock));
4793 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4794 recov_failed = TRUE;
4795 mutex_exit(&(VTOMI4(vp)->mi_lock));
4796 if (!recov_failed) {
4797 mutex_enter(&rp->r_statelock);
4798 if (rp->r_flags & R4RECOVERR)
4799 recov_failed = TRUE;
4800 mutex_exit(&rp->r_statelock);
4801 }
4802 if (recov_failed) {
4803 NFS4_DEBUG(nfs4_client_recov_debug,
4804 (CE_NOTE, "nfs4_inactive_otw: "
4805 "close failed (recovery failure)"));
4806 }
4807 }
4808 }
4809
4810 redo:
4811 if (rp->r_unldvp == NULL) {
4812 rp4_addfree(rp, cr);
4813 return;
4814 }
4815
4816 /*
4817 * Save the vnode pointer for the directory where the
4818 * unlinked-open file got renamed, then set it to NULL
4819 * to prevent another thread from getting here before
4820 * we're done with the remove. While we have the
4821 * statelock, make local copies of the pertinent rnode
4822 * fields. If we weren't to do this in an atomic way, the
4823 * the unl* fields could become inconsistent with respect
4824 * to each other due to a race condition between this
4825 * code and nfs_remove(). See bug report 1034328.
4826 */
4827 mutex_enter(&rp->r_statelock);
4828 if (rp->r_unldvp == NULL) {
4829 mutex_exit(&rp->r_statelock);
4830 rp4_addfree(rp, cr);
4831 return;
4832 }
4833
4834 unldvp = rp->r_unldvp;
4835 rp->r_unldvp = NULL;
4836 unlname = rp->r_unlname;
4837 rp->r_unlname = NULL;
4838 unlcred = rp->r_unlcred;
4839 rp->r_unlcred = NULL;
4840 mutex_exit(&rp->r_statelock);
4841
4842 /*
4843 * If there are any dirty pages left, then flush
4844 * them. This is unfortunate because they just
4845 * may get thrown away during the remove operation,
4846 * but we have to do this for correctness.
4847 */
4848 if (nfs4_has_pages(vp) &&
4849 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4850 ASSERT(vp->v_type != VCHR);
4851 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4852 if (e.error) {
4853 mutex_enter(&rp->r_statelock);
4854 if (!rp->r_error)
4855 rp->r_error = e.error;
4856 mutex_exit(&rp->r_statelock);
4857 }
4858 }
4859
4860 recov_state.rs_flags = 0;
4861 recov_state.rs_num_retry_despite_err = 0;
4862 recov_retry_remove:
4863 /*
4864 * Do the remove operation on the renamed file
4865 */
4866 args.ctag = TAG_INACTIVE;
4867
4868 /*
4869 * Remove ops: putfh dir; remove
4870 */
4871 args.array_len = 2;
4872 args.array = argop;
4873
4874 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4875 if (e.error) {
4876 kmem_free(unlname, MAXNAMELEN);
4877 crfree(unlcred);
4878 VN_RELE(unldvp);
4879 /*
4880 * Try again; this time around r_unldvp will be NULL, so we'll
4881 * just call rp4_addfree() and return.
4882 */
4883 goto redo;
4884 }
4885
4886 /* putfh directory */
4887 argop[0].argop = OP_CPUTFH;
4888 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4889
4890 /* remove */
4891 argop[1].argop = OP_CREMOVE;
4892 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4893
4894 doqueue = 1;
4895 resp = &res;
4896
4897 #if 0 /* notyet */
4898 /*
4899 * Can't do this yet. We may be being called from
4900 * dnlc_purge_XXX while that routine is holding a
4901 * mutex lock to the nc_rele list. The calls to
4902 * nfs3_cache_wcc_data may result in calls to
4903 * dnlc_purge_XXX. This will result in a deadlock.
4904 */
4905 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4906 if (e.error) {
4907 PURGE_ATTRCACHE4(unldvp);
4908 resp = NULL;
4909 } else if (res.status) {
4910 e.error = geterrno4(res.status);
4911 PURGE_ATTRCACHE4(unldvp);
4912 /*
4913 * This code is inactive right now
4914 * but if made active there should
4915 * be a nfs4_end_op() call before
4916 * nfs4_purge_stale_fh to avoid start_op()
4917 * deadlock. See BugId: 4948726
4918 */
4919 nfs4_purge_stale_fh(error, unldvp, cr);
4920 } else {
4921 nfs_resop4 *resop;
4922 REMOVE4res *rm_res;
4923
4924 resop = &res.array[1];
4925 rm_res = &resop->nfs_resop4_u.opremove;
4926 /*
4927 * Update directory cache attribute,
4928 * readdir and dnlc caches.
4929 */
4930 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4931 }
4932 #else
4933 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4934
4935 PURGE_ATTRCACHE4(unldvp);
4936 #endif
4937
4938 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4939 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4940 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4941 if (!e.error)
4942 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4943 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4944 &recov_state, TRUE);
4945 goto recov_retry_remove;
4946 }
4947 }
4948 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4949
4950 /*
4951 * Release stuff held for the remove
4952 */
4953 VN_RELE(unldvp);
4954 if (!e.error && resp)
4955 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4956
4957 kmem_free(unlname, MAXNAMELEN);
4958 crfree(unlcred);
4959 goto redo;
4960 }
4961
4962 /*
4963 * Remote file system operations having to do with directory manipulation.
4964 */
4965 /* ARGSUSED3 */
4966 int
4967 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4968 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4969 int *direntflags, pathname_t *realpnp)
4970 {
4971 int error;
4972 vnode_t *vp, *avp = NULL;
4973 rnode4_t *drp;
4974
4975 *vpp = NULL;
4976 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4977 return (EPERM);
4978 /*
4979 * if LOOKUP_XATTR, must replace dvp (object) with
4980 * object's attrdir before continuing with lookup
4981 */
4982 if (flags & LOOKUP_XATTR) {
4983 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4984 if (error)
4985 return (error);
4986
4987 dvp = avp;
4988
4989 /*
4990 * If lookup is for "", just return dvp now. The attrdir
4991 * has already been activated (from nfs4lookup_xattr), and
4992 * the caller will RELE the original dvp -- not
4993 * the attrdir. So, set vpp and return.
4994 * Currently, when the LOOKUP_XATTR flag is
4995 * passed to VOP_LOOKUP, the name is always empty, and
4996 * shortcircuiting here avoids 3 unneeded lock/unlock
4997 * pairs.
4998 *
4999 * If a non-empty name was provided, then it is the
5000 * attribute name, and it will be looked up below.
5001 */
5002 if (*nm == '\0') {
5003 *vpp = dvp;
5004 return (0);
5005 }
5006
5007 /*
5008 * The vfs layer never sends a name when asking for the
5009 * attrdir, so we should never get here (unless of course
5010 * name is passed at some time in future -- at which time
5011 * we'll blow up here).
5012 */
5013 ASSERT(0);
5014 }
5015
5016 drp = VTOR4(dvp);
5017 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5018 return (EINTR);
5019
5020 error = nfs4lookup(dvp, nm, vpp, cr, 0);
5021 nfs_rw_exit(&drp->r_rwlock);
5022
5023 /*
5024 * If vnode is a device, create special vnode.
5025 */
5026 if (!error && ISVDEV((*vpp)->v_type)) {
5027 vp = *vpp;
5028 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5029 VN_RELE(vp);
5030 }
5031
5032 return (error);
5033 }
5034
5035 /* ARGSUSED */
5036 static int
5037 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5038 {
5039 int error;
5040 rnode4_t *drp;
5041 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5042 mntinfo4_t *mi;
5043
5044 mi = VTOMI4(dvp);
5045 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5046 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5047 return (EINVAL);
5048
5049 drp = VTOR4(dvp);
5050 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5051 return (EINTR);
5052
5053 mutex_enter(&drp->r_statelock);
5054 /*
5055 * If the server doesn't support xattrs just return EINVAL
5056 */
5057 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5058 mutex_exit(&drp->r_statelock);
5059 nfs_rw_exit(&drp->r_rwlock);
5060 return (EINVAL);
5061 }
5062
5063 /*
5064 * If there is a cached xattr directory entry,
5065 * use it as long as the attributes are valid. If the
5066 * attributes are not valid, take the simple approach and
5067 * free the cached value and re-fetch a new value.
5068 *
5069 * We don't negative entry cache for now, if we did we
5070 * would need to check if the file has changed on every
5071 * lookup. But xattrs don't exist very often and failing
5072 * an openattr is not much more expensive than and NVERIFY or GETATTR
5073 * so do an openattr over the wire for now.
5074 */
5075 if (drp->r_xattr_dir != NULL) {
5076 if (ATTRCACHE4_VALID(dvp)) {
5077 VN_HOLD(drp->r_xattr_dir);
5078 *vpp = drp->r_xattr_dir;
5079 mutex_exit(&drp->r_statelock);
5080 nfs_rw_exit(&drp->r_rwlock);
5081 return (0);
5082 }
5083 VN_RELE(drp->r_xattr_dir);
5084 drp->r_xattr_dir = NULL;
5085 }
5086 mutex_exit(&drp->r_statelock);
5087
5088 error = nfs4openattr(dvp, vpp, cflag, cr);
5089
5090 nfs_rw_exit(&drp->r_rwlock);
5091
5092 return (error);
5093 }
5094
/*
 * Common lookup worker: look up name nm in directory dvp, returning a
 * held vnode in *vpp.
 *
 * Fast paths: "" and "." return dvp itself (the latter only after an
 * access check).  Otherwise the DNLC is consulted (unless skipdnlc is
 * set); on a miss the lookup goes over the wire via
 * nfs4lookupnew_otw().  On a hit, the cached entry is used directly
 * when dvp's attributes are still valid, and revalidated via
 * nfs4lookupvalidate_otw() when they are not.
 *
 * Returns 0 on success, or an errno with *vpp set to NULL.
 */
static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp. Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp. Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/* Mark the directory as having been a lookup target (R4LOOKUP). */
	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC. If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * If after the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				/* Valid negative entry: name doesn't exist. */
				if (*vpp == DNLC_NO_VNODE) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 * need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 * must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}
5220
5221 /*
5222 * Go to the server and check if the directory has changed, if
5223 * it hasn't we are done and can use the dnlc entry. If it
5224 * has changed we get a new copy of its attributes and check
5225 * the access for VEXEC, then relookup the filename and
5226 * get its filehandle and attributes.
5227 *
5228 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5229 * if the NVERIFY failed we must
5230 * purge the caches
5231 * cache new attributes (will set r_time_attr_inval)
5232 * cache new access
5233 * recheck VEXEC access
5234 * add name to dnlc, possibly negative
5235 * if LOOKUP succeeded
5236 * cache new attributes
5237 * else
5238 * set a new r_time_attr_inval for dvp
5239 * check to make sure we have access
5240 *
5241 * The vpp returned is the vnode passed in if the directory is valid,
5242 * a new vnode if successful lookup, or NULL on error.
5243 */
static int
nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_ga_res_t *garp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	/* Caller never passes "." and always supplies a held *vpp. */
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp != NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_VPARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP_VALID;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		/* On error the contract is *vpp == NULL; drop caller's hold. */
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		VN_RELE(*vpp);
		*vpp = NULL;
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
	args.array_len = 7;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/*
	 * 1. nverify the change info
	 *
	 * NVERIFY stops the compound with NFS4ERR_SAME when the server's
	 * change attribute still matches our cached r_change, i.e. the
	 * directory has not changed and the cached lookup is still valid.
	 */
	argop[1].argop = OP_NVERIFY;
	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	/* XDR-encode the cached change attribute for the server compare */
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 2. getattr directory */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 3. access directory */
	argop[3].argop = OP_ACCESS;
	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	/* 4. lookup name */
	if (isdotdot) {
		argop[4].argop = OP_LOOKUPP;
	} else {
		argop[4].argop = OP_CLOOKUP;
		argop[4].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 5. resulting file handle */
	argop[5].argop = OP_GETFH;

	/* 6. resulting file attributes */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * The object has moved to another server: chase the referral
	 * instead of going through recovery (non-dotdot only).
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* Security negotiated successfully: retry the OTW call */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			VN_RELE(*vpp);
			*vpp = NULL;
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;
		VN_RELE(*vpp);
		*vpp = NULL;
		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	/*
	 * NFS4ERR_SAME means the NVERIFY matched, i.e. the directory is
	 * unchanged and the cached *vpp is still the right answer (handled
	 * in the else branch).  Any other status means the directory has
	 * changed and the compound ran past the NVERIFY.
	 */
	if (res.status != NFS4ERR_SAME) {
		e.error = geterrno4(res.status);

		/*
		 * The NVERIFY "failed" so the directory has changed
		 * First make sure PUTFH succeeded and NVERIFY "failed"
		 * cleanly.
		 */
		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * Install new cached attributes for the directory
		 */
		nfs4_attr_cache(dvp,
		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[3].nfs_argop4_u.opaccess.access,
		    res.array[3].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				e.error = EACCES;
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
			/*
			 * The lookup failed, probably no entry
			 */
			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
				/* Cache the negative lookup in the dnlc */
				dnlc_update(dvp, nm, DNLC_NO_VNODE);
			} else {
				/*
				 * Might be some other error, so remove
				 * the dnlc entry to make sure we start all
				 * over again, next time.
				 */
				dnlc_remove(dvp, nm);
			}
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
			/*
			 * The file exists but we can't get its fh for
			 * some unknown reason.  Remove it from the dnlc
			 * and error out to be safe.
			 */
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
		if (fhp->nfs_fh4_len == 0) {
			/*
			 * The file exists but the server returned a
			 * zero-length (bogus) fh for some unknown reason.
			 * Remove it from the dnlc and error out to be safe.
			 */
			e.error = ENOENT;
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		sfhp = sfh4_get(fhp, mi);

		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;

		/*
		 * Make the new rnode
		 */
		if (isdotdot) {
			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
			if (e.error) {
				sfh4_rele(&sfhp);
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
			/*
			 * XXX if nfs4_make_dotdot uses an existing rnode
			 * XXX it doesn't update the attributes.
			 * XXX for now just save them again to save an OTW
			 */
			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
		} else {
			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
			/*
			 * If v_type == VNON, then garp was NULL because
			 * the last op in the compound failed and makenfs4node
			 * could not find the vnode for sfhp. It created
			 * a new vnode, so we have nothing to purge here.
			 */
			if (nvp->v_type == VNON) {
				vattr_t vattr;

				vattr.va_mask = AT_TYPE;
				/*
				 * N.B. We've already called nfs4_end_fop above.
				 */
				e.error = nfs4getattr(nvp, &vattr, cr);
				if (e.error) {
					sfh4_rele(&sfhp);
					VN_RELE(*vpp);
					*vpp = NULL;
					VN_RELE(nvp);
					goto exit;
				}
				nvp->v_type = vattr.va_type;
			}
		}
		sfh4_rele(&sfhp);

		/*
		 * NOTE(review): the name is entered in the dnlc only when
		 * created_v4 is clear -- presumably to avoid caching a name
		 * for a file this client just open-created; confirm against
		 * the created_v4 users elsewhere in the client.
		 */
		nrp = VTOR4(nvp);
		mutex_enter(&nrp->r_statev4_lock);
		if (!nrp->created_v4) {
			mutex_exit(&nrp->r_statev4_lock);
			dnlc_update(dvp, nm, nvp);
		} else
			mutex_exit(&nrp->r_statev4_lock);

		/* Swap the caller's stale hold for the freshly looked-up vnode */
		VN_RELE(*vpp);
		*vpp = nvp;
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			/* Adaptive timeout, clamped to [acdirmin, acdirmax] */
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);
		dnlc_update(dvp, nm, *vpp);

		/*
		 * Even though we have a valid directory attr cache
		 * and dnlc entry, we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		/* A cached negative entry means the name does not exist */
		if (*vpp == DNLC_NO_VNODE) {
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = ENOENT;
		}
	}

exit:
	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
5661
5662 /*
5663 * We need to go over the wire to lookup the name, but
5664 * while we are there verify the directory has not
5665 * changed but if it has, get new attributes and check access
5666 *
5667 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5668 * NVERIFY GETATTR ACCESS
5669 *
5670 * With the results:
5671 * if the NVERIFY failed we must purge the caches, add new attributes,
5672 * and cache new access.
5673 * set a new r_time_attr_inval
5674 * add name to dnlc, possibly negative
5675 * if LOOKUP succeeded
5676 * cache new attributes
5677 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	/* Caller never passes "." and, unlike the validate path, no *vpp. */
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/*
	 * 6. nverify the change info
	 *
	 * NVERIFY stops the compound with NFS4ERR_SAME when the server's
	 * change attribute still matches our cached r_change; otherwise
	 * the trailing GETATTR/ACCESS refresh the directory caches.
	 */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	/* XDR-encode the cached change attribute for the server compare */
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * The object has moved to another server: chase the referral
	 * instead of going through recovery (non-dotdot only).
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* Security negotiated successfully: retry the OTW call */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason.  Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a zero-length
		 * (bogus) fh for some unknown reason.  Error out to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/* Install the fresh directory attributes */
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS; should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			/* Adaptive timeout, clamped to [acdirmin, acdirmax] */
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * NOTE(review): the name is entered in the dnlc only when
	 * created_v4 is clear -- presumably to avoid caching a name for a
	 * file this client just open-created; confirm against the
	 * created_v4 users elsewhere in the client.
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6066
6067 #ifdef DEBUG
6068 void
6069 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
6070 {
6071 uint_t i, len;
6072 zoneid_t zoneid = getzoneid();
6073 char *s;
6074
6075 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
6076 for (i = 0; i < argcnt; i++) {
6077 nfs_argop4 *op = &argbase[i];
6078 switch (op->argop) {
6079 case OP_CPUTFH:
6080 case OP_PUTFH:
6081 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
6082 break;
6083 case OP_PUTROOTFH:
6084 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
6085 break;
6086 case OP_CLOOKUP:
6087 s = op->nfs_argop4_u.opclookup.cname;
6088 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6089 break;
6090 case OP_LOOKUP:
6091 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
6092 &len, NULL);
6093 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6094 kmem_free(s, len);
6095 break;
6096 case OP_LOOKUPP:
6097 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
6098 break;
6099 case OP_GETFH:
6100 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
6101 break;
6102 case OP_GETATTR:
6103 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
6104 break;
6105 case OP_OPENATTR:
6106 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
6107 break;
6108 default:
6109 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
6110 op->argop);
6111 break;
6112 }
6113 }
6114 }
6115 #endif
6116
6117 /*
6118 * nfs4lookup_setup - constructs a multi-lookup compound request.
6119 *
6120 * Given the path "nm1/nm2/.../nmn", the following compound requests
6121 * may be created:
6122 *
 * Note: Getfh should not be needed because the filehandle attr is
 * mandatory, but it is faster, for now.
6125 *
6126 * l4_getattrs indicates the type of compound requested.
6127 *
6128 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6129 *
6130 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6131 *
6132 * total number of ops is n + 1.
6133 *
6134 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6135 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6136 * before the last component, and only get attributes
6137 * for the last component. Note that the second-to-last
6138 * pathname component is XATTR_RPATH, which does NOT go
6139 * over-the-wire as a lookup.
6140 *
6141 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6142 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6143 *
6144 * and total number of ops is n + 5.
6145 *
6146 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6147 * attribute directory: create lookups plus an OPENATTR
6148 * replacing the last lookup. Note that the last pathname
6149 * component is XATTR_RPATH, which does NOT go over-the-wire
6150 * as a lookup.
6151 *
6152 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6153 * Openattr; Getfh; Getattr }
6154 *
6155 * and total number of ops is n + 5.
6156 *
6157 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6158 * nodes too.
6159 *
6160 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6161 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6162 *
6163 * and total number of ops is 3*n + 1.
6164 *
6165 * All cases: returns the index in the arg array of the final LOOKUP op, or
6166 * -1 if no LOOKUPs were used.
6167 */
int
nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
{
	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
	nfs_argop4 *argbase, *argop;
	int arglen, argcnt;
	int n = 1;	/* number of components */
	int nga = 1;	/* number of Getattr's in request */
	char c = '\0', *s, *p;
	int lookup_idx = -1;
	int argoplist_size;

	/* set lookuparg response result to 0 */
	lookupargp->resp->status = NFS4_OK;

	/* skip leading "/" or "." e.g. ".//./" if there is */
	for (; ; nm++) {
		if (*nm != '/' && *nm != '.')
			break;

		/* ".." is counted as 1 component */
		if (*nm == '.' && *(nm + 1) != '/')
			break;
	}

	/*
	 * Find n = number of components - nm must be null terminated
	 * Skip "." components.
	 *
	 * A '/' starts a new component only when it is not doubled
	 * ("//"), not trailing, and not followed by a "." component.
	 */
	if (*nm != '\0')
		for (n = 1, s = nm; *s != '\0'; s++) {
			if ((*s == '/') && (*(s + 1) != '/') &&
			    (*(s + 1) != '\0') &&
			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
			    *(s + 2) == '\0')))
				n++;
		}
	else
		n = 0;

	/*
	 * nga is number of components that need Getfh+Getattr
	 */
	switch (l4_getattrs) {
	case LKP4_NO_ATTRIBUTES:
		nga = 0;
		break;
	case LKP4_ALL_ATTRIBUTES:
		nga = n;
		/*
		 * Always have at least 1 getfh, getattr pair
		 */
		if (nga == 0)
			nga++;
		break;
	case LKP4_LAST_ATTRDIR:
	case LKP4_LAST_NAMED_ATTR:
		nga = n+1;
		break;
	}

	/*
	 * If change to use the filehandle attr instead of getfh
	 * the following line can be deleted.
	 *
	 * (Each attribute fetch is a Getfh+Getattr pair, hence x2.)
	 */
	nga *= 2;

	/*
	 * calculate number of ops in request as
	 * header + trailer + lookups + getattrs
	 */
	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;

	argoplist_size = arglen * sizeof (nfs_argop4);
	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
	lookupargp->argsp->array = argop;

	/* Leave room for the caller-filled header ops */
	argcnt = lookupargp->header_len;
	argop += argcnt;

	/*
	 * loop and create a lookup op and possibly getattr/getfh for
	 * each component. Skip "." components.
	 */
	for (s = nm; *s != '\0'; s = p) {
		/*
		 * Set up a pathname struct for each component if needed
		 */
		while (*s == '/')
			s++;
		if (*s == '\0')
			break;

		for (p = s; (*p != '/') && (*p != '\0'); p++)
			;
		/* Temporarily NUL-terminate this component in place */
		c = *p;
		*p = '\0';

		if (s[0] == '.' && s[1] == '\0') {
			*p = c;
			continue;
		}
		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;

			/* openattr */
			argop->argop = OP_OPENATTR;
		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
		    strcmp(s, XATTR_RPATH) == 0) {
			/* openattr */
			argop->argop = OP_OPENATTR;
			argop++;
			argcnt++;

			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
			/* Restore the separator; XATTR_RPATH emits no lookup */
			*p = c;
			continue;
		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
			/* lookupp */
			argop->argop = OP_LOOKUPP;
		} else {
			/* lookup */
			argop->argop = OP_LOOKUP;
			(void) str_to_utf8(s,
			    &argop->nfs_argop4_u.oplookup.objname);
		}
		/* Remember index of the most recent LOOKUP-ish op */
		lookup_idx = argcnt;
		argop++;
		argcnt++;

		/* Restore the separator character overwritten above */
		*p = c;

		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
			/* getfh XXX may not be needed in future */
			argop->argop = OP_GETFH;
			argop++;
			argcnt++;

			/* getattr */
			argop->argop = OP_GETATTR;
			argop->nfs_argop4_u.opgetattr.attr_request =
			    lookupargp->ga_bits;
			argop->nfs_argop4_u.opgetattr.mi =
			    lookupargp->mi;
			argop++;
			argcnt++;
		}
	}

	/*
	 * Emit the trailing Getfh/Getattr pair for the final object when
	 * attributes were requested but not already emitted per-component.
	 */
	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
		if (needgetfh) {
			/* stick in a post-lookup getfh */
			argop->argop = OP_GETFH;
			argcnt++;
			argop++;
		}
		/* post-lookup getattr */
		argop->argop = OP_GETATTR;
		argop->nfs_argop4_u.opgetattr.attr_request =
		    lookupargp->ga_bits;
		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
		argcnt++;
	}
	argcnt += lookupargp->trailer_len;	/* actual op count */
	lookupargp->argsp->array_len = argcnt;
	lookupargp->arglen = arglen;

#ifdef DEBUG
	if (nfs4_client_lookup_debug)
		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
#endif

	return (lookup_idx);
}
6368
/*
 * Look up (and, if 'cflag' is non-zero, create) the extended attribute
 * directory of 'dvp', returning it held in '*avp'.
 *
 * Over the wire this is the compound { Putfh(dvp); Openattr; Getfh;
 * Getattr }.  The routine participates in client recovery: on a
 * recoverable error it initiates recovery and retries the compound
 * unless recovery asks for an abort.
 *
 * An NFS4ERR_NOTSUPP reply is cached on the directory rnode as the
 * sentinel NFS4_XATTR_DIR_NOTSUPP and mapped to EINVAL for the caller.
 * On success the attrdir vnode is cached in drp->r_xattr_dir (with its
 * own hold) and the pathconf xattr cache is invalidated.
 */
static int
nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	GETFH4res *gf_res = NULL;
	nfs_argop4 argop[4];
	nfs_resop4 *resop = NULL;
	nfs4_sharedfh_t *sfhp;
	hrtime_t t;
	nfs4_error_t e;

	rnode4_t *drp;
	int doqueue = 1;
	vnode_t *vp;
	int needrecov = 0;
	nfs4_recov_state_t recov_state;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	*avp = NULL;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	/* COMPOUND: putfh, openattr, getfh, getattr */
	args.array_len = 4;
	args.array = argop;
	args.ctag = TAG_OPENATTR;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error)
		return (e.error);

	drp = VTOR4(dvp);

	/* putfh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* openattr */
	argop[1].argop = OP_OPENATTR;
	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);

	/* getfh */
	argop[2].argop = OP_GETFH;

	/* getattr */
	argop[3].argop = OP_GETATTR;
	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
	    rnode4info(drp)));

	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4openattr: initiating recovery\n"));

		abort = nfs4_start_recovery(&e,
		    VTOMI4(dvp), dvp, NULL, NULL, NULL,
		    OP_OPENATTR, NULL, NULL, NULL);
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		/*
		 * The RPC itself succeeded; pick up the compound status
		 * and free the decoded results before a possible retry.
		 */
		if (!e.error) {
			e.error = geterrno4(res.status);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		}
		if (abort == FALSE)
			goto recov_retry;
		return (e.error);
	}

	if (e.error) {
		/* RPC-level failure and no recovery to attempt. */
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		return (e.error);
	}

	if (res.status) {
		/*
		 * If OTW error is NOTSUPP, then it should be
		 * translated to EINVAL.  All Solaris file system
		 * implementations return EINVAL to the syscall layer
		 * when the attrdir cannot be created due to an
		 * implementation restriction or noxattr mount option.
		 */
		if (res.status == NFS4ERR_NOTSUPP) {
			/*
			 * Remember "not supported" on the rnode so later
			 * callers don't have to go over the wire again.
			 */
			mutex_enter(&drp->r_statelock);
			if (drp->r_xattr_dir)
				VN_RELE(drp->r_xattr_dir);
			VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
			drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
			mutex_exit(&drp->r_statelock);

			e.error = EINVAL;
		} else {
			e.error = geterrno4(res.status);
		}

		if (e.error) {
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			return (e.error);
		}
	}

	resop = &res.array[0];	/* putfh res */
	ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);

	resop = &res.array[1];	/* openattr res */
	ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);

	resop = &res.array[2];	/* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;
	if (gf_res->object.nfs_fh4_len == 0) {
		/* Server returned an empty filehandle for the attrdir. */
		*avp = NULL;
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
		return (ENOENT);
	}

	/* Build (or find) the vnode for the attribute directory. */
	sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
	vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
	    dvp->v_vfsp, t, cr, dvp,
	    fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
	sfh4_rele(&sfhp);

	if (e.error)
		PURGE_ATTRCACHE4(vp);

	mutex_enter(&vp->v_lock);
	vp->v_flag |= V_XATTRDIR;
	mutex_exit(&vp->v_lock);

	*avp = vp;

	/* Cache the attrdir vnode on the parent rnode (extra hold). */
	mutex_enter(&drp->r_statelock);
	if (drp->r_xattr_dir)
		VN_RELE(drp->r_xattr_dir);
	VN_HOLD(vp);
	drp->r_xattr_dir = vp;

	/*
	 * Invalidate pathconf4 cache because r_xattr_dir is no longer
	 * NULL.  xattrs could be created at any time, and we have no
	 * way to update pc4_xattr_exists in the base object if/when
	 * it happens.
	 */
	drp->r_pathconf.pc4_xattr_valid = 0;

	mutex_exit(&drp->r_statelock);

	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	return (0);
}
6535
/*
 * VOP_CREATE for NFSv4.
 *
 * First performs an over-the-wire lookup of 'nm' (the DNLC is
 * deliberately not trusted for the existence decision).  If the file
 * exists: EXCL creates fail with EEXIST; otherwise access is checked
 * and, for regular files with AT_SIZE set, the file is truncated
 * either via setattr (the "" / must_trunc case) or by falling through
 * to an UNCHECKED4 open-create.  If the file does not exist, a
 * GUARDED4 (or EXCLUSIVE4) create is sent via nfs4open_otw(), or
 * nfs4mknod() for non-regular types.
 *
 * A GUARDED create that fails with EEXIST (e.g. from a retransmitted
 * request) restarts the whole sequence at 'top'; see the long comment
 * ahead of that check.  drp->r_rwlock is held as writer across the
 * entire operation, including the retry.
 */
/* ARGSUSED */
static int
nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
    int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	int error;
	vnode_t *vp = NULL;
	rnode4_t *rp;
	struct vattr vattr;
	rnode4_t *drp;
	vnode_t *tempvp;
	enum createmode4 createmode;
	bool_t must_trunc = FALSE;
	int truncating = 0;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
		return (EINVAL);
	}

	/* . and .. have special meaning in the protocol, reject them. */

	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
		return (EISDIR);

	drp = VTOR4(dvp);

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

top:
	/*
	 * We make a copy of the attributes because the caller does not
	 * expect us to change what va points to.
	 */
	vattr = *va;

	/*
	 * If the pathname is "", then dvp is the root vnode of
	 * a remote file mounted over a local directory.
	 * All that needs to be done is access
	 * checking and truncation.  Note that we avoid doing
	 * open w/ create because the parent directory might
	 * be in pseudo-fs and the open would fail.
	 */
	if (*nm == '\0') {
		error = 0;
		VN_HOLD(dvp);
		vp = dvp;
		must_trunc = TRUE;
	} else {
		/*
		 * We need to go over the wire, just to be sure whether the
		 * file exists or not.  Using the DNLC can be dangerous in
		 * this case when making a decision regarding existence.
		 */
		error = nfs4lookup(dvp, nm, &vp, cr, 1);
	}

	if (exclusive)
		createmode = EXCLUSIVE4;
	else
		createmode = GUARDED4;

	/*
	 * error would be set if the file does not exist on the
	 * server, so lets go create it.
	 */
	if (error) {
		goto create_otw;
	}

	/*
	 * File does exist on the server
	 */
	if (exclusive == EXCL)
		error = EEXIST;
	else if (vp->v_type == VDIR && (mode & VWRITE))
		error = EISDIR;
	else {
		/*
		 * If vnode is a device, create special vnode.
		 */
		if (ISVDEV(vp->v_type)) {
			tempvp = vp;
			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
			VN_RELE(tempvp);
		}
		if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
			if ((vattr.va_mask & AT_SIZE) &&
			    vp->v_type == VREG) {
				rp = VTOR4(vp);
				/*
				 * Check here for large file handled
				 * by LF-unaware process (as
				 * ufs_create() does)
				 */
				if (!(flags & FOFFMAX)) {
					mutex_enter(&rp->r_statelock);
					if (rp->r_size > MAXOFF32_T)
						error = EOVERFLOW;
					mutex_exit(&rp->r_statelock);
				}

				/* if error is set then we need to return */
				if (error) {
					nfs_rw_exit(&drp->r_rwlock);
					VN_RELE(vp);
					return (error);
				}

				if (must_trunc) {
					/* "" case: truncate via setattr. */
					vattr.va_mask = AT_SIZE;
					error = nfs4setattr(vp, &vattr, 0, cr,
					    NULL);
				} else {
					/*
					 * we know we have a regular file that already
					 * exists and we may end up truncating the file
					 * as a result of the open_otw, so flush out
					 * any dirty pages for this file first.
					 */
					if (nfs4_has_pages(vp) &&
					    ((rp->r_flags & R4DIRTY) ||
					    rp->r_count > 0 ||
					    rp->r_mapcnt > 0)) {
						error = nfs4_putpage(vp,
						    (offset_t)0, 0, 0, cr, ct);
						if (error && (error == ENOSPC ||
						    error == EDQUOT)) {
							mutex_enter(
							    &rp->r_statelock);
							if (!rp->r_error)
								rp->r_error =
								    error;
							mutex_exit(
							    &rp->r_statelock);
						}
					}
					/*
					 * Let the open-create (UNCHECKED4)
					 * do the truncation over the wire.
					 */
					vattr.va_mask = (AT_SIZE |
					    AT_TYPE | AT_MODE);
					vattr.va_type = VREG;
					createmode = UNCHECKED4;
					truncating = 1;
					goto create_otw;
				}
			}
		}
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (error) {
		VN_RELE(vp);
	} else {
		vnode_t *tvp;
		rnode4_t *trp;
		/* Fire the vnevent on the real vnode, not a shadow. */
		tvp = vp;
		if (vp->v_type == VREG) {
			trp = VTOR4(vp);
			if (IS_SHADOW(vp, trp))
				tvp = RTOV4(trp);
		}

		if (must_trunc) {
			/*
			 * existing file got truncated, notify.
			 */
			vnevent_create(tvp, ct);
		}

		*vpp = vp;
	}
	return (error);

create_otw:
	dnlc_remove(dvp, nm);

	ASSERT(vattr.va_mask & AT_TYPE);

	/*
	 * If not a regular file let nfs4mknod() handle it.
	 */
	if (vattr.va_type != VREG) {
		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * It _is_ a regular file.
	 */
	ASSERT(vattr.va_mask & AT_MODE);
	if (MANDMODE(vattr.va_mode)) {
		/* Mandatory locking modes are not supported over NFS. */
		nfs_rw_exit(&drp->r_rwlock);
		return (EACCES);
	}

	/*
	 * If this happens to be a mknod of a regular file, then flags will
	 * have neither FREAD or FWRITE.  However, we must set at least one
	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
	 * set (based on openmode specified by app).
	 */
	if ((flags & (FREAD|FWRITE)) == 0)
		flags |= (FREAD|FWRITE);

	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);

	if (vp != NULL) {
		/* if create was successful, throw away the file's pages */
		if (!error && (vattr.va_mask & AT_SIZE))
			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
			    cr);
		/* release the lookup hold */
		VN_RELE(vp);
		vp = NULL;
	}

	/*
	 * validate that we opened a regular file.  This handles a misbehaving
	 * server that returns an incorrect FH.
	 */
	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
		error = EISDIR;
		VN_RELE(*vpp);
	}

	/*
	 * If this is not an exclusive create, then the CREATE
	 * request will be made with the GUARDED mode set.  This
	 * means that the server will return EEXIST if the file
	 * exists.  The file could exist because of a retransmitted
	 * request.  In this case, we recover by starting over and
	 * checking to see whether the file exists.  This second
	 * time through it should and a CREATE request will not be
	 * sent.
	 *
	 * This handles the problem of a dangling CREATE request
	 * which contains attributes which indicate that the file
	 * should be truncated.  This retransmitted request could
	 * possibly truncate valid data in the file if not caught
	 * by the duplicate request mechanism on the server or if
	 * not caught by other means.  The scenario is:
	 *
	 * Client transmits CREATE request with size = 0
	 * Client times out, retransmits request.
	 * Response to the first request arrives from the server
	 *  and the client proceeds on.
	 * Client writes data to the file.
	 * The server now processes retransmitted CREATE request
	 *  and truncates file.
	 *
	 * The use of the GUARDED CREATE request prevents this from
	 * happening because the retransmitted CREATE would fail
	 * with EEXIST and would not truncate the file.
	 */
	if (error == EEXIST && exclusive == NONEXCL) {
#ifdef DEBUG
		nfs4_create_misses++;
#endif
		goto top;
	}
	nfs_rw_exit(&drp->r_rwlock);
	if (truncating && !error && *vpp) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * existing file got truncated, notify.
		 */
		tvp = *vpp;
		trp = VTOR4(tvp);
		if (IS_SHADOW(tvp, trp))
			tvp = RTOV4(trp);
		vnevent_create(tvp, ct);
	}
	return (error);
}
6815
6816 /*
6817 * Create compound (for mkdir, mknod, symlink):
6818 * { Putfh <dfh>; Create; Getfh; Getattr }
6819 * It's okay if setattr failed to set gid - this is not considered
6820 * an error, but purge attrs in that case.
6821 */
6822 static int
6823 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6824 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6825 {
6826 int need_end_op = FALSE;
6827 COMPOUND4args_clnt args;
6828 COMPOUND4res_clnt res, *resp = NULL;
6829 nfs_argop4 *argop;
6830 nfs_resop4 *resop;
6831 int doqueue;
6832 mntinfo4_t *mi;
6833 rnode4_t *drp = VTOR4(dvp);
6834 change_info4 *cinfo;
6835 GETFH4res *gf_res;
6836 struct vattr vattr;
6837 vnode_t *vp;
6838 fattr4 *crattr;
6839 bool_t needrecov = FALSE;
6840 nfs4_recov_state_t recov_state;
6841 nfs4_sharedfh_t *sfhp = NULL;
6842 hrtime_t t;
6843 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6844 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6845 dirattr_info_t dinfo, *dinfop;
6846 servinfo4_t *svp;
6847 bitmap4 supp_attrs;
6848
6849 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6850 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6851
6852 mi = VTOMI4(dvp);
6853
6854 /*
6855 * Make sure we properly deal with setting the right gid
6856 * on a new directory to reflect the parent's setgid bit
6857 */
6858 setgid_flag = 0;
6859 if (type == NF4DIR) {
6860 struct vattr dva;
6861
6862 va->va_mode &= ~VSGID;
6863 dva.va_mask = AT_MODE | AT_GID;
6864 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6865
6866 /*
6867 * If the parent's directory has the setgid bit set
6868 * _and_ the client was able to get a valid mapping
6869 * for the parent dir's owner_group, we want to
6870 * append NVERIFY(owner_group == dva.va_gid) and
6871 * SETTATTR to the CREATE compound.
6872 */
6873 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6874 setgid_flag = 1;
6875 va->va_mode |= VSGID;
6876 if (dva.va_gid != GID_NOBODY) {
6877 va->va_mask |= AT_GID;
6878 va->va_gid = dva.va_gid;
6879 }
6880 }
6881 }
6882 }
6883
6884 /*
6885 * Create ops:
6886 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6887 * 5:restorefh(dir) 6:getattr(dir)
6888 *
6889 * if (setgid)
6890 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6891 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6892 * 8:nverify 9:setattr
6893 */
6894 if (setgid_flag) {
6895 numops = 10;
6896 idx_create = 1;
6897 idx_fattr = 3;
6898 } else {
6899 numops = 7;
6900 idx_create = 2;
6901 idx_fattr = 4;
6902 }
6903
6904 ASSERT(nfs_zone() == mi->mi_zone);
6905 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6906 return (EINTR);
6907 }
6908 recov_state.rs_flags = 0;
6909 recov_state.rs_num_retry_despite_err = 0;
6910
6911 argoplist_size = numops * sizeof (nfs_argop4);
6912 argop = kmem_alloc(argoplist_size, KM_SLEEP);
6913
6914 recov_retry:
6915 if (type == NF4LNK)
6916 args.ctag = TAG_SYMLINK;
6917 else if (type == NF4DIR)
6918 args.ctag = TAG_MKDIR;
6919 else
6920 args.ctag = TAG_MKNOD;
6921
6922 args.array_len = numops;
6923 args.array = argop;
6924
6925 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6926 nfs_rw_exit(&drp->r_rwlock);
6927 kmem_free(argop, argoplist_size);
6928 return (e.error);
6929 }
6930 need_end_op = TRUE;
6931
6932
6933 /* 0: putfh directory */
6934 argop[0].argop = OP_CPUTFH;
6935 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6936
6937 /* 1/2: Create object */
6938 argop[idx_create].argop = OP_CCREATE;
6939 argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6940 argop[idx_create].nfs_argop4_u.opccreate.type = type;
6941 if (type == NF4LNK) {
6942 /*
6943 * symlink, treat name as data
6944 */
6945 ASSERT(data != NULL);
6946 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6947 (char *)data;
6948 }
6949 if (type == NF4BLK || type == NF4CHR) {
6950 ASSERT(data != NULL);
6951 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6952 *((specdata4 *)data);
6953 }
6954
6955 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6956
6957 svp = drp->r_server;
6958 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6959 supp_attrs = svp->sv_supp_attrs;
6960 nfs_rw_exit(&svp->sv_lock);
6961
6962 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6963 nfs_rw_exit(&drp->r_rwlock);
6964 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6965 e.error = EINVAL;
6966 kmem_free(argop, argoplist_size);
6967 return (e.error);
6968 }
6969
6970 /* 2/3: getfh fh of created object */
6971 ASSERT(idx_create + 1 == idx_fattr - 1);
6972 argop[idx_create + 1].argop = OP_GETFH;
6973
6974 /* 3/4: getattr of new object */
6975 argop[idx_fattr].argop = OP_GETATTR;
6976 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6977 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6978
6979 if (setgid_flag) {
6980 vattr_t _v;
6981
6982 argop[4].argop = OP_SAVEFH;
6983
6984 argop[5].argop = OP_CPUTFH;
6985 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6986
6987 argop[6].argop = OP_GETATTR;
6988 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6989 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6990
6991 argop[7].argop = OP_RESTOREFH;
6992
6993 /*
6994 * nverify
6995 *
6996 * XXX - Revisit the last argument to nfs4_end_op()
6997 * once 5020486 is fixed.
6998 */
6999 _v.va_mask = AT_GID;
7000 _v.va_gid = va->va_gid;
7001 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
7002 supp_attrs)) {
7003 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7004 nfs_rw_exit(&drp->r_rwlock);
7005 nfs4_fattr4_free(crattr);
7006 kmem_free(argop, argoplist_size);
7007 return (e.error);
7008 }
7009
7010 /*
7011 * setattr
7012 *
7013 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
7014 * so no need for stateid or flags. Also we specify NULL
7015 * rp since we're only interested in setting owner_group
7016 * attributes.
7017 */
7018 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
7019 &e.error, 0);
7020
7021 if (e.error) {
7022 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7023 nfs_rw_exit(&drp->r_rwlock);
7024 nfs4_fattr4_free(crattr);
7025 nfs4args_verify_free(&argop[8]);
7026 kmem_free(argop, argoplist_size);
7027 return (e.error);
7028 }
7029 } else {
7030 argop[1].argop = OP_SAVEFH;
7031
7032 argop[5].argop = OP_RESTOREFH;
7033
7034 argop[6].argop = OP_GETATTR;
7035 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7036 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7037 }
7038
7039 dnlc_remove(dvp, nm);
7040
7041 doqueue = 1;
7042 t = gethrtime();
7043 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7044
7045 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7046 if (e.error) {
7047 PURGE_ATTRCACHE4(dvp);
7048 if (!needrecov)
7049 goto out;
7050 }
7051
7052 if (needrecov) {
7053 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7054 OP_CREATE, NULL, NULL, NULL) == FALSE) {
7055 nfs4_end_op(mi, dvp, NULL, &recov_state,
7056 needrecov);
7057 need_end_op = FALSE;
7058 nfs4_fattr4_free(crattr);
7059 if (setgid_flag) {
7060 nfs4args_verify_free(&argop[8]);
7061 nfs4args_setattr_free(&argop[9]);
7062 }
7063 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7064 goto recov_retry;
7065 }
7066 }
7067
7068 resp = &res;
7069
7070 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7071
7072 if (res.status == NFS4ERR_BADOWNER)
7073 nfs4_log_badowner(mi, OP_CREATE);
7074
7075 e.error = geterrno4(res.status);
7076
7077 /*
7078 * This check is left over from when create was implemented
7079 * using a setattr op (instead of createattrs). If the
7080 * putfh/create/getfh failed, the error was returned. If
7081 * setattr/getattr failed, we keep going.
7082 *
7083 * It might be better to get rid of the GETFH also, and just
7084 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7085 * Then if any of the operations failed, we could return the
7086 * error now, and remove much of the error code below.
7087 */
7088 if (res.array_len <= idx_fattr) {
7089 /*
7090 * Either Putfh, Create or Getfh failed.
7091 */
7092 PURGE_ATTRCACHE4(dvp);
7093 /*
7094 * nfs4_purge_stale_fh() may generate otw calls through
7095 * nfs4_invalidate_pages. Hence the need to call
7096 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7097 */
7098 nfs4_end_op(mi, dvp, NULL, &recov_state,
7099 needrecov);
7100 need_end_op = FALSE;
7101 nfs4_purge_stale_fh(e.error, dvp, cr);
7102 goto out;
7103 }
7104 }
7105
7106 resop = &res.array[idx_create]; /* create res */
7107 cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7108
7109 resop = &res.array[idx_create + 1]; /* getfh res */
7110 gf_res = &resop->nfs_resop4_u.opgetfh;
7111
7112 sfhp = sfh4_get(&gf_res->object, mi);
7113 if (e.error) {
7114 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7115 fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7116 if (vp->v_type == VNON) {
7117 vattr.va_mask = AT_TYPE;
7118 /*
7119 * Need to call nfs4_end_op before nfs4getattr to avoid
7120 * potential nfs4_start_op deadlock. See RFE 4777612.
7121 */
7122 nfs4_end_op(mi, dvp, NULL, &recov_state,
7123 needrecov);
7124 need_end_op = FALSE;
7125 e.error = nfs4getattr(vp, &vattr, cr);
7126 if (e.error) {
7127 VN_RELE(vp);
7128 *vpp = NULL;
7129 goto out;
7130 }
7131 vp->v_type = vattr.va_type;
7132 }
7133 e.error = 0;
7134 } else {
7135 *vpp = vp = makenfs4node(sfhp,
7136 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7137 dvp->v_vfsp, t, cr,
7138 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7139 }
7140
7141 /*
7142 * If compound succeeded, then update dir attrs
7143 */
7144 if (res.status == NFS4_OK) {
7145 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7146 dinfo.di_cred = cr;
7147 dinfo.di_time_call = t;
7148 dinfop = &dinfo;
7149 } else
7150 dinfop = NULL;
7151
7152 /* Update directory cache attribute, readdir and dnlc caches */
7153 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7154
7155 out:
7156 if (sfhp != NULL)
7157 sfh4_rele(&sfhp);
7158 nfs_rw_exit(&drp->r_rwlock);
7159 nfs4_fattr4_free(crattr);
7160 if (setgid_flag) {
7161 nfs4args_verify_free(&argop[8]);
7162 nfs4args_setattr_free(&argop[9]);
7163 }
7164 if (resp)
7165 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7166 if (need_end_op)
7167 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7168
7169 kmem_free(argop, argoplist_size);
7170 return (e.error);
7171 }
7172
7173 /* ARGSUSED */
7174 static int
7175 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7176 int mode, vnode_t **vpp, cred_t *cr)
7177 {
7178 int error;
7179 vnode_t *vp;
7180 nfs_ftype4 type;
7181 specdata4 spec, *specp = NULL;
7182
7183 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7184
7185 switch (va->va_type) {
7186 case VCHR:
7187 case VBLK:
7188 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7189 spec.specdata1 = getmajor(va->va_rdev);
7190 spec.specdata2 = getminor(va->va_rdev);
7191 specp = &spec;
7192 break;
7193
7194 case VFIFO:
7195 type = NF4FIFO;
7196 break;
7197 case VSOCK:
7198 type = NF4SOCK;
7199 break;
7200
7201 default:
7202 return (EINVAL);
7203 }
7204
7205 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7206 if (error) {
7207 return (error);
7208 }
7209
7210 /*
7211 * This might not be needed any more; special case to deal
7212 * with problematic v2/v3 servers. Since create was unable
7213 * to set group correctly, not sure what hope setattr has.
7214 */
7215 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7216 va->va_mask = AT_GID;
7217 (void) nfs4setattr(vp, va, 0, cr, NULL);
7218 }
7219
7220 /*
7221 * If vnode is a device create special vnode
7222 */
7223 if (ISVDEV(vp->v_type)) {
7224 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7225 VN_RELE(vp);
7226 } else {
7227 *vpp = vp;
7228 }
7229 return (error);
7230 }
7231
7232 /*
7233 * Remove requires that the current fh be the target directory.
7234 * After the operation, the current fh is unchanged.
7235 * The compound op structure is:
7236 * PUTFH(targetdir), REMOVE
7237 *
7238 * Weirdness: if the vnode to be removed is open
7239 * we rename it instead of removing it and nfs_inactive
7240 * will remove the new name.
7241 */
7242 /* ARGSUSED */
7243 static int
7244 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7245 {
7246 COMPOUND4args_clnt args;
7247 COMPOUND4res_clnt res, *resp = NULL;
7248 REMOVE4res *rm_res;
7249 nfs_argop4 argop[3];
7250 nfs_resop4 *resop;
7251 vnode_t *vp;
7252 char *tmpname;
7253 int doqueue;
7254 mntinfo4_t *mi;
7255 rnode4_t *rp;
7256 rnode4_t *drp;
7257 int needrecov = 0;
7258 nfs4_recov_state_t recov_state;
7259 int isopen;
7260 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7261 dirattr_info_t dinfo;
7262
7263 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7264 return (EPERM);
7265 drp = VTOR4(dvp);
7266 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7267 return (EINTR);
7268
7269 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7270 if (e.error) {
7271 nfs_rw_exit(&drp->r_rwlock);
7272 return (e.error);
7273 }
7274
7275 if (vp->v_type == VDIR) {
7276 VN_RELE(vp);
7277 nfs_rw_exit(&drp->r_rwlock);
7278 return (EISDIR);
7279 }
7280
7281 /*
7282 * First just remove the entry from the name cache, as it
7283 * is most likely the only entry for this vp.
7284 */
7285 dnlc_remove(dvp, nm);
7286
7287 rp = VTOR4(vp);
7288
7289 /*
7290 * For regular file types, check to see if the file is open by looking
7291 * at the open streams.
7292 * For all other types, check the reference count on the vnode. Since
7293 * they are not opened OTW they never have an open stream.
7294 *
7295 * If the file is open, rename it to .nfsXXXX.
7296 */
7297 if (vp->v_type != VREG) {
7298 /*
7299 * If the file has a v_count > 1 then there may be more than one
7300 * entry in the name cache due multiple links or an open file,
7301 * but we don't have the real reference count so flush all
7302 * possible entries.
7303 */
7304 if (vp->v_count > 1)
7305 dnlc_purge_vp(vp);
7306
7307 /*
7308 * Now we have the real reference count.
7309 */
7310 isopen = vp->v_count > 1;
7311 } else {
7312 mutex_enter(&rp->r_os_lock);
7313 isopen = list_head(&rp->r_open_streams) != NULL;
7314 mutex_exit(&rp->r_os_lock);
7315 }
7316
7317 mutex_enter(&rp->r_statelock);
7318 if (isopen &&
7319 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7320 mutex_exit(&rp->r_statelock);
7321 tmpname = newname();
7322 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7323 if (e.error)
7324 kmem_free(tmpname, MAXNAMELEN);
7325 else {
7326 mutex_enter(&rp->r_statelock);
7327 if (rp->r_unldvp == NULL) {
7328 VN_HOLD(dvp);
7329 rp->r_unldvp = dvp;
7330 if (rp->r_unlcred != NULL)
7331 crfree(rp->r_unlcred);
7332 crhold(cr);
7333 rp->r_unlcred = cr;
7334 rp->r_unlname = tmpname;
7335 } else {
7336 kmem_free(rp->r_unlname, MAXNAMELEN);
7337 rp->r_unlname = tmpname;
7338 }
7339 mutex_exit(&rp->r_statelock);
7340 }
7341 VN_RELE(vp);
7342 nfs_rw_exit(&drp->r_rwlock);
7343 return (e.error);
7344 }
7345 /*
7346 * Actually remove the file/dir
7347 */
7348 mutex_exit(&rp->r_statelock);
7349
7350 /*
7351 * We need to flush any dirty pages which happen to
7352 * be hanging around before removing the file.
7353 * This shouldn't happen very often since in NFSv4
7354 * we should be close to open consistent.
7355 */
7356 if (nfs4_has_pages(vp) &&
7357 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7358 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7359 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7360 mutex_enter(&rp->r_statelock);
7361 if (!rp->r_error)
7362 rp->r_error = e.error;
7363 mutex_exit(&rp->r_statelock);
7364 }
7365 }
7366
7367 mi = VTOMI4(dvp);
7368
7369 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7370 recov_state.rs_flags = 0;
7371 recov_state.rs_num_retry_despite_err = 0;
7372
7373 recov_retry:
7374 /*
7375 * Remove ops: putfh dir; remove
7376 */
7377 args.ctag = TAG_REMOVE;
7378 args.array_len = 3;
7379 args.array = argop;
7380
7381 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7382 if (e.error) {
7383 nfs_rw_exit(&drp->r_rwlock);
7384 VN_RELE(vp);
7385 return (e.error);
7386 }
7387
7388 /* putfh directory */
7389 argop[0].argop = OP_CPUTFH;
7390 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7391
7392 /* remove */
7393 argop[1].argop = OP_CREMOVE;
7394 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7395
7396 /* getattr dir */
7397 argop[2].argop = OP_GETATTR;
7398 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7399 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7400
7401 doqueue = 1;
7402 dinfo.di_time_call = gethrtime();
7403 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7404
7405 PURGE_ATTRCACHE4(vp);
7406
7407 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7408 if (e.error)
7409 PURGE_ATTRCACHE4(dvp);
7410
7411 if (needrecov) {
7412 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7413 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7414 if (!e.error)
7415 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7416 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7417 needrecov);
7418 goto recov_retry;
7419 }
7420 }
7421
7422 /*
7423 * Matching nfs4_end_op() for start_op() above.
7424 * There is a path in the code below which calls
7425 * nfs4_purge_stale_fh(), which may generate otw calls through
7426 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7427 * here to avoid nfs4_start_op() deadlock.
7428 */
7429 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7430
7431 if (!e.error) {
7432 resp = &res;
7433
7434 if (res.status) {
7435 e.error = geterrno4(res.status);
7436 PURGE_ATTRCACHE4(dvp);
7437 nfs4_purge_stale_fh(e.error, dvp, cr);
7438 } else {
7439 resop = &res.array[1]; /* remove res */
7440 rm_res = &resop->nfs_resop4_u.opremove;
7441
7442 dinfo.di_garp =
7443 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7444 dinfo.di_cred = cr;
7445
7446 /* Update directory attr, readdir and dnlc caches */
7447 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7448 &dinfo);
7449 }
7450 }
7451 nfs_rw_exit(&drp->r_rwlock);
7452 if (resp)
7453 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7454
7455 if (e.error == 0) {
7456 vnode_t *tvp;
7457 rnode4_t *trp;
7458 trp = VTOR4(vp);
7459 tvp = vp;
7460 if (IS_SHADOW(vp, trp))
7461 tvp = RTOV4(trp);
7462 vnevent_remove(tvp, dvp, nm, ct);
7463 }
7464 VN_RELE(vp);
7465 return (e.error);
7466 }
7467
7468 /*
7469 * Link requires that the current fh be the target directory and the
7470 * saved fh be the source fh. After the operation, the current fh is unchanged.
7471 * Thus the compound op structure is:
7472 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7473 * GETATTR(file)
7474 */
7475 /* ARGSUSED */
7476 static int
7477 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7478 caller_context_t *ct, int flags)
7479 {
7480 COMPOUND4args_clnt args;
7481 COMPOUND4res_clnt res, *resp = NULL;
7482 LINK4res *ln_res;
7483 int argoplist_size = 7 * sizeof (nfs_argop4);
7484 nfs_argop4 *argop;
7485 nfs_resop4 *resop;
7486 vnode_t *realvp, *nvp;
7487 int doqueue;
7488 mntinfo4_t *mi;
7489 rnode4_t *tdrp;
7490 bool_t needrecov = FALSE;
7491 nfs4_recov_state_t recov_state;
7492 hrtime_t t;
7493 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7494 dirattr_info_t dinfo;
7495
7496 ASSERT(*tnm != '\0');
7497 ASSERT(tdvp->v_type == VDIR);
7498 ASSERT(nfs4_consistent_type(tdvp));
7499 ASSERT(nfs4_consistent_type(svp));
7500
7501 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7502 return (EPERM);
7503 if (VOP_REALVP(svp, &realvp, ct) == 0) {
7504 svp = realvp;
7505 ASSERT(nfs4_consistent_type(svp));
7506 }
7507
7508 tdrp = VTOR4(tdvp);
7509 mi = VTOMI4(svp);
7510
7511 if (!(mi->mi_flags & MI4_LINK)) {
7512 return (EOPNOTSUPP);
7513 }
7514 recov_state.rs_flags = 0;
7515 recov_state.rs_num_retry_despite_err = 0;
7516
7517 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7518 return (EINTR);
7519
7520 recov_retry:
7521 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7522
7523 args.ctag = TAG_LINK;
7524
7525 /*
7526 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7527 * restorefh; getattr(fl)
7528 */
7529 args.array_len = 7;
7530 args.array = argop;
7531
7532 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7533 if (e.error) {
7534 kmem_free(argop, argoplist_size);
7535 nfs_rw_exit(&tdrp->r_rwlock);
7536 return (e.error);
7537 }
7538
7539 /* 0. putfh file */
7540 argop[0].argop = OP_CPUTFH;
7541 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7542
7543 /* 1. save current fh to free up the space for the dir */
7544 argop[1].argop = OP_SAVEFH;
7545
7546 /* 2. putfh targetdir */
7547 argop[2].argop = OP_CPUTFH;
7548 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7549
7550 /* 3. link: current_fh is targetdir, saved_fh is source */
7551 argop[3].argop = OP_CLINK;
7552 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7553
7554 /* 4. Get attributes of dir */
7555 argop[4].argop = OP_GETATTR;
7556 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7557 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7558
7559 /* 5. If link was successful, restore current vp to file */
7560 argop[5].argop = OP_RESTOREFH;
7561
7562 /* 6. Get attributes of linked object */
7563 argop[6].argop = OP_GETATTR;
7564 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7565 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7566
7567 dnlc_remove(tdvp, tnm);
7568
7569 doqueue = 1;
7570 t = gethrtime();
7571
7572 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7573
7574 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7575 if (e.error != 0 && !needrecov) {
7576 PURGE_ATTRCACHE4(tdvp);
7577 PURGE_ATTRCACHE4(svp);
7578 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7579 goto out;
7580 }
7581
7582 if (needrecov) {
7583 bool_t abort;
7584
7585 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7586 NULL, NULL, OP_LINK, NULL, NULL, NULL);
7587 if (abort == FALSE) {
7588 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7589 needrecov);
7590 kmem_free(argop, argoplist_size);
7591 if (!e.error)
7592 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7593 goto recov_retry;
7594 } else {
7595 if (e.error != 0) {
7596 PURGE_ATTRCACHE4(tdvp);
7597 PURGE_ATTRCACHE4(svp);
7598 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7599 &recov_state, needrecov);
7600 goto out;
7601 }
7602 /* fall through for res.status case */
7603 }
7604 }
7605
7606 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7607
7608 resp = &res;
7609 if (res.status) {
7610 /* If link succeeded, then don't return error */
7611 e.error = geterrno4(res.status);
7612 if (res.array_len <= 4) {
7613 /*
7614 * Either Putfh, Savefh, Putfh dir, or Link failed
7615 */
7616 PURGE_ATTRCACHE4(svp);
7617 PURGE_ATTRCACHE4(tdvp);
7618 if (e.error == EOPNOTSUPP) {
7619 mutex_enter(&mi->mi_lock);
7620 mi->mi_flags &= ~MI4_LINK;
7621 mutex_exit(&mi->mi_lock);
7622 }
7623 /* Remap EISDIR to EPERM for non-root user for SVVS */
7624 /* XXX-LP */
7625 if (e.error == EISDIR && crgetuid(cr) != 0)
7626 e.error = EPERM;
7627 goto out;
7628 }
7629 }
7630
7631 /* either no error or one of the postop getattr failed */
7632
7633 /*
7634 * XXX - if LINK succeeded, but no attrs were returned for link
7635 * file, purge its cache.
7636 *
7637 * XXX Perform a simplified version of wcc checking. Instead of
7638 * have another getattr to get pre-op, just purge cache if
7639 * any of the ops prior to and including the getattr failed.
7640 * If the getattr succeeded then update the attrcache accordingly.
7641 */
7642
7643 /*
7644 * update cache with link file postattrs.
7645 * Note: at this point resop points to link res.
7646 */
7647 resop = &res.array[3]; /* link res */
7648 ln_res = &resop->nfs_resop4_u.oplink;
7649 if (res.status == NFS4_OK)
7650 e.error = nfs4_update_attrcache(res.status,
7651 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7652 t, svp, cr);
7653
7654 /*
7655 * Call makenfs4node to create the new shadow vp for tnm.
7656 * We pass NULL attrs because we just cached attrs for
7657 * the src object. All we're trying to accomplish is to
7658 * to create the new shadow vnode.
7659 */
7660 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7661 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7662
7663 /* Update target cache attribute, readdir and dnlc caches */
7664 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7665 dinfo.di_time_call = t;
7666 dinfo.di_cred = cr;
7667
7668 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7669 ASSERT(nfs4_consistent_type(tdvp));
7670 ASSERT(nfs4_consistent_type(svp));
7671 ASSERT(nfs4_consistent_type(nvp));
7672 VN_RELE(nvp);
7673
7674 if (!e.error) {
7675 vnode_t *tvp;
7676 rnode4_t *trp;
7677 /*
7678 * Notify the source file of this link operation.
7679 */
7680 trp = VTOR4(svp);
7681 tvp = svp;
7682 if (IS_SHADOW(svp, trp))
7683 tvp = RTOV4(trp);
7684 vnevent_link(tvp, ct);
7685 }
7686 out:
7687 kmem_free(argop, argoplist_size);
7688 if (resp)
7689 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7690
7691 nfs_rw_exit(&tdrp->r_rwlock);
7692
7693 return (e.error);
7694 }
7695
7696 /* ARGSUSED */
7697 static int
7698 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7699 caller_context_t *ct, int flags)
7700 {
7701 vnode_t *realvp;
7702
7703 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7704 return (EPERM);
7705 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7706 ndvp = realvp;
7707
7708 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7709 }
7710
7711 /*
7712 * nfs4rename does the real work of renaming in NFS Version 4.
7713 *
7714 * A file handle is considered volatile for renaming purposes if either
7715 * of the volatile bits are turned on. However, the compound may differ
7716 * based on the likelihood of the filehandle to change during rename.
7717 */
7718 static int
7719 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7720 caller_context_t *ct)
7721 {
7722 int error;
7723 mntinfo4_t *mi;
7724 vnode_t *nvp = NULL;
7725 vnode_t *ovp = NULL;
7726 char *tmpname = NULL;
7727 rnode4_t *rp;
7728 rnode4_t *odrp;
7729 rnode4_t *ndrp;
7730 int did_link = 0;
7731 int do_link = 1;
7732 nfsstat4 stat = NFS4_OK;
7733
7734 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7735 ASSERT(nfs4_consistent_type(odvp));
7736 ASSERT(nfs4_consistent_type(ndvp));
7737
7738 if (onm[0] == '.' && (onm[1] == '\0' ||
7739 (onm[1] == '.' && onm[2] == '\0')))
7740 return (EINVAL);
7741
7742 if (nnm[0] == '.' && (nnm[1] == '\0' ||
7743 (nnm[1] == '.' && nnm[2] == '\0')))
7744 return (EINVAL);
7745
7746 odrp = VTOR4(odvp);
7747 ndrp = VTOR4(ndvp);
7748 if ((intptr_t)odrp < (intptr_t)ndrp) {
7749 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7750 return (EINTR);
7751 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7752 nfs_rw_exit(&odrp->r_rwlock);
7753 return (EINTR);
7754 }
7755 } else {
7756 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7757 return (EINTR);
7758 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7759 nfs_rw_exit(&ndrp->r_rwlock);
7760 return (EINTR);
7761 }
7762 }
7763
7764 /*
7765 * Lookup the target file. If it exists, it needs to be
7766 * checked to see whether it is a mount point and whether
7767 * it is active (open).
7768 */
7769 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7770 if (!error) {
7771 int isactive;
7772
7773 ASSERT(nfs4_consistent_type(nvp));
7774 /*
7775 * If this file has been mounted on, then just
7776 * return busy because renaming to it would remove
7777 * the mounted file system from the name space.
7778 */
7779 if (vn_ismntpt(nvp)) {
7780 VN_RELE(nvp);
7781 nfs_rw_exit(&odrp->r_rwlock);
7782 nfs_rw_exit(&ndrp->r_rwlock);
7783 return (EBUSY);
7784 }
7785
7786 /*
7787 * First just remove the entry from the name cache, as it
7788 * is most likely the only entry for this vp.
7789 */
7790 dnlc_remove(ndvp, nnm);
7791
7792 rp = VTOR4(nvp);
7793
7794 if (nvp->v_type != VREG) {
7795 /*
7796 * Purge the name cache of all references to this vnode
7797 * so that we can check the reference count to infer
7798 * whether it is active or not.
7799 */
7800 if (nvp->v_count > 1)
7801 dnlc_purge_vp(nvp);
7802
7803 isactive = nvp->v_count > 1;
7804 } else {
7805 mutex_enter(&rp->r_os_lock);
7806 isactive = list_head(&rp->r_open_streams) != NULL;
7807 mutex_exit(&rp->r_os_lock);
7808 }
7809
7810 /*
7811 * If the vnode is active and is not a directory,
7812 * arrange to rename it to a
7813 * temporary file so that it will continue to be
7814 * accessible. This implements the "unlink-open-file"
7815 * semantics for the target of a rename operation.
7816 * Before doing this though, make sure that the
7817 * source and target files are not already the same.
7818 */
7819 if (isactive && nvp->v_type != VDIR) {
7820 /*
7821 * Lookup the source name.
7822 */
7823 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7824
7825 /*
7826 * The source name *should* already exist.
7827 */
7828 if (error) {
7829 VN_RELE(nvp);
7830 nfs_rw_exit(&odrp->r_rwlock);
7831 nfs_rw_exit(&ndrp->r_rwlock);
7832 return (error);
7833 }
7834
7835 ASSERT(nfs4_consistent_type(ovp));
7836
7837 /*
7838 * Compare the two vnodes. If they are the same,
7839 * just release all held vnodes and return success.
7840 */
7841 if (VN_CMP(ovp, nvp)) {
7842 VN_RELE(ovp);
7843 VN_RELE(nvp);
7844 nfs_rw_exit(&odrp->r_rwlock);
7845 nfs_rw_exit(&ndrp->r_rwlock);
7846 return (0);
7847 }
7848
7849 /*
7850 * Can't mix and match directories and non-
7851 * directories in rename operations. We already
7852 * know that the target is not a directory. If
7853 * the source is a directory, return an error.
7854 */
7855 if (ovp->v_type == VDIR) {
7856 VN_RELE(ovp);
7857 VN_RELE(nvp);
7858 nfs_rw_exit(&odrp->r_rwlock);
7859 nfs_rw_exit(&ndrp->r_rwlock);
7860 return (ENOTDIR);
7861 }
7862 link_call:
7863 /*
7864 * The target file exists, is not the same as
7865 * the source file, and is active. We first
7866 * try to Link it to a temporary filename to
7867 * avoid having the server removing the file
7868 * completely (which could cause data loss to
7869 * the user's POV in the event the Rename fails
7870 * -- see bug 1165874).
7871 */
7872 /*
7873 * The do_link and did_link booleans are
7874 * introduced in the event we get NFS4ERR_FILE_OPEN
7875 * returned for the Rename. Some servers can
7876 * not Rename over an Open file, so they return
7877 * this error. The client needs to Remove the
7878 * newly created Link and do two Renames, just
7879 * as if the server didn't support LINK.
7880 */
7881 tmpname = newname();
7882 error = 0;
7883
7884 if (do_link) {
7885 error = nfs4_link(ndvp, nvp, tmpname, cr,
7886 NULL, 0);
7887 }
7888 if (error == EOPNOTSUPP || !do_link) {
7889 error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7890 cr, NULL, 0);
7891 did_link = 0;
7892 } else {
7893 did_link = 1;
7894 }
7895 if (error) {
7896 kmem_free(tmpname, MAXNAMELEN);
7897 VN_RELE(ovp);
7898 VN_RELE(nvp);
7899 nfs_rw_exit(&odrp->r_rwlock);
7900 nfs_rw_exit(&ndrp->r_rwlock);
7901 return (error);
7902 }
7903
7904 mutex_enter(&rp->r_statelock);
7905 if (rp->r_unldvp == NULL) {
7906 VN_HOLD(ndvp);
7907 rp->r_unldvp = ndvp;
7908 if (rp->r_unlcred != NULL)
7909 crfree(rp->r_unlcred);
7910 crhold(cr);
7911 rp->r_unlcred = cr;
7912 rp->r_unlname = tmpname;
7913 } else {
7914 if (rp->r_unlname)
7915 kmem_free(rp->r_unlname, MAXNAMELEN);
7916 rp->r_unlname = tmpname;
7917 }
7918 mutex_exit(&rp->r_statelock);
7919 }
7920
7921 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7922
7923 ASSERT(nfs4_consistent_type(nvp));
7924 }
7925
7926 if (ovp == NULL) {
7927 /*
7928 * When renaming directories to be a subdirectory of a
7929 * different parent, the dnlc entry for ".." will no
7930 * longer be valid, so it must be removed.
7931 *
7932 * We do a lookup here to determine whether we are renaming
7933 * a directory and we need to check if we are renaming
7934 * an unlinked file. This might have already been done
7935 * in previous code, so we check ovp == NULL to avoid
7936 * doing it twice.
7937 */
7938 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7939 /*
7940 * The source name *should* already exist.
7941 */
7942 if (error) {
7943 nfs_rw_exit(&odrp->r_rwlock);
7944 nfs_rw_exit(&ndrp->r_rwlock);
7945 if (nvp) {
7946 VN_RELE(nvp);
7947 }
7948 return (error);
7949 }
7950 ASSERT(ovp != NULL);
7951 ASSERT(nfs4_consistent_type(ovp));
7952 }
7953
7954 /*
7955 * Is the object being renamed a dir, and if so, is
7956 * it being renamed to a child of itself? The underlying
7957 * fs should ultimately return EINVAL for this case;
7958 * however, buggy beta non-Solaris NFSv4 servers at
7959 * interop testing events have allowed this behavior,
7960 * and it caused our client to panic due to a recursive
7961 * mutex_enter in fn_move.
7962 *
7963 * The tedious locking in fn_move could be changed to
7964 * deal with this case, and the client could avoid the
7965 * panic; however, the client would just confuse itself
7966 * later and misbehave. A better way to handle the broken
7967 * server is to detect this condition and return EINVAL
7968 * without ever sending the the bogus rename to the server.
7969 * We know the rename is invalid -- just fail it now.
7970 */
7971 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7972 VN_RELE(ovp);
7973 nfs_rw_exit(&odrp->r_rwlock);
7974 nfs_rw_exit(&ndrp->r_rwlock);
7975 if (nvp) {
7976 VN_RELE(nvp);
7977 }
7978 return (EINVAL);
7979 }
7980
7981 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7982
7983 /*
7984 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7985 * possible for the filehandle to change due to the rename.
7986 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7987 * the fh will not change because of the rename, but we still need
7988 * to update its rnode entry with the new name for
7989 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7990 * has no effect on these for now, but for future improvements,
7991 * we might want to use it too to simplify handling of files
7992 * that are open with that flag on. (XXX)
7993 */
7994 mi = VTOMI4(odvp);
7995 if (NFS4_VOLATILE_FH(mi))
7996 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7997 &stat);
7998 else
7999 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
8000 &stat);
8001
8002 ASSERT(nfs4_consistent_type(odvp));
8003 ASSERT(nfs4_consistent_type(ndvp));
8004 ASSERT(nfs4_consistent_type(ovp));
8005
8006 if (stat == NFS4ERR_FILE_OPEN && did_link) {
8007 do_link = 0;
8008 /*
8009 * Before the 'link_call' code, we did a nfs4_lookup
8010 * that puts a VN_HOLD on nvp. After the nfs4_link
8011 * call we call VN_RELE to match that hold. We need
8012 * to place an additional VN_HOLD here since we will
8013 * be hitting that VN_RELE again.
8014 */
8015 VN_HOLD(nvp);
8016
8017 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);
8018
8019 /* Undo the unlinked file naming stuff we just did */
8020 mutex_enter(&rp->r_statelock);
8021 if (rp->r_unldvp) {
8022 VN_RELE(ndvp);
8023 rp->r_unldvp = NULL;
8024 if (rp->r_unlcred != NULL)
8025 crfree(rp->r_unlcred);
8026 rp->r_unlcred = NULL;
8027 /* rp->r_unlanme points to tmpname */
8028 if (rp->r_unlname)
8029 kmem_free(rp->r_unlname, MAXNAMELEN);
8030 rp->r_unlname = NULL;
8031 }
8032 mutex_exit(&rp->r_statelock);
8033
8034 if (nvp) {
8035 VN_RELE(nvp);
8036 }
8037 goto link_call;
8038 }
8039
8040 if (error) {
8041 VN_RELE(ovp);
8042 nfs_rw_exit(&odrp->r_rwlock);
8043 nfs_rw_exit(&ndrp->r_rwlock);
8044 if (nvp) {
8045 VN_RELE(nvp);
8046 }
8047 return (error);
8048 }
8049
8050 /*
8051 * when renaming directories to be a subdirectory of a
8052 * different parent, the dnlc entry for ".." will no
8053 * longer be valid, so it must be removed
8054 */
8055 rp = VTOR4(ovp);
8056 if (ndvp != odvp) {
8057 if (ovp->v_type == VDIR) {
8058 dnlc_remove(ovp, "..");
8059 if (rp->r_dir != NULL)
8060 nfs4_purge_rddir_cache(ovp);
8061 }
8062 }
8063
8064 /*
8065 * If we are renaming the unlinked file, update the
8066 * r_unldvp and r_unlname as needed.
8067 */
8068 mutex_enter(&rp->r_statelock);
8069 if (rp->r_unldvp != NULL) {
8070 if (strcmp(rp->r_unlname, onm) == 0) {
8071 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
8072 rp->r_unlname[MAXNAMELEN - 1] = '\0';
8073 if (ndvp != rp->r_unldvp) {
8074 VN_RELE(rp->r_unldvp);
8075 rp->r_unldvp = ndvp;
8076 VN_HOLD(ndvp);
8077 }
8078 }
8079 }
8080 mutex_exit(&rp->r_statelock);
8081
8082 /*
8083 * Notify the rename vnevents to source vnode, and to the target
8084 * vnode if it already existed.
8085 */
8086 if (error == 0) {
8087 vnode_t *tvp, *tovp;
8088 rnode4_t *trp;
8089
8090 /*
8091 * Notify the vnode. Each links is represented by
8092 * a different vnode, in nfsv4.
8093 */
8094 if (nvp) {
8095 trp = VTOR4(nvp);
8096 tvp = nvp;
8097 if (IS_SHADOW(nvp, trp))
8098 tvp = RTOV4(trp);
8099 vnevent_rename_dest(tvp, ndvp, nnm, ct);
8100 }
8101
8102 trp = VTOR4(ovp);
8103 tovp = ovp;
8104 if (IS_SHADOW(ovp, trp))
8105 tovp = RTOV4(trp);
8106
8107 vnevent_rename_src(tovp, odvp, onm, ct);
8108
8109 trp = VTOR4(ndvp);
8110 tvp = ndvp;
8111
8112 if (IS_SHADOW(ndvp, trp))
8113 tvp = RTOV4(trp);
8114
8115 vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
8116 }
8117
8118 if (nvp) {
8119 VN_RELE(nvp);
8120 }
8121 VN_RELE(ovp);
8122
8123 nfs_rw_exit(&odrp->r_rwlock);
8124 nfs_rw_exit(&ndrp->r_rwlock);
8125
8126 return (error);
8127 }
8128
8129 /*
8130 * When the parent directory has changed, sv_dfh must be updated
8131 */
8132 static void
8133 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8134 {
8135 svnode_t *sv = VTOSV(vp);
8136 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8137 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8138
8139 sfh4_hold(new_dfh);
8140 sv->sv_dfh = new_dfh;
8141 sfh4_rele(&old_dfh);
8142 }
8143
8144 /*
8145 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8146 * when it is known that the filehandle is persistent through rename.
8147 *
8148 * Rename requires that the current fh be the target directory and the
8149 * saved fh be the source directory. After the operation, the current fh
8150 * is unchanged.
8151 * The compound op structure for persistent fh rename is:
8152 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME
8153 * Rather than bother with the directory postop args, we'll simply
8154 * update that a change occurred in the cache, so no post-op getattrs.
8155 */
8156 static int
8157 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8158 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8159 {
8160 COMPOUND4args_clnt args;
8161 COMPOUND4res_clnt res, *resp = NULL;
8162 nfs_argop4 *argop;
8163 nfs_resop4 *resop;
8164 int doqueue, argoplist_size;
8165 mntinfo4_t *mi;
8166 rnode4_t *odrp = VTOR4(odvp);
8167 rnode4_t *ndrp = VTOR4(ndvp);
8168 RENAME4res *rn_res;
8169 bool_t needrecov;
8170 nfs4_recov_state_t recov_state;
8171 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8172 dirattr_info_t dinfo, *dinfop;
8173
8174 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8175
8176 recov_state.rs_flags = 0;
8177 recov_state.rs_num_retry_despite_err = 0;
8178
8179 /*
8180 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8181 *
8182 * If source/target are different dirs, then append putfh(src); getattr
8183 */
8184 args.array_len = (odvp == ndvp) ? 5 : 7;
8185 argoplist_size = args.array_len * sizeof (nfs_argop4);
8186 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8187
8188 recov_retry:
8189 *statp = NFS4_OK;
8190
8191 /* No need to Lookup the file, persistent fh */
8192 args.ctag = TAG_RENAME;
8193
8194 mi = VTOMI4(odvp);
8195 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8196 if (e.error) {
8197 kmem_free(argop, argoplist_size);
8198 return (e.error);
8199 }
8200
8201 /* 0: putfh source directory */
8202 argop[0].argop = OP_CPUTFH;
8203 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8204
8205 /* 1: Save source fh to free up current for target */
8206 argop[1].argop = OP_SAVEFH;
8207
8208 /* 2: putfh targetdir */
8209 argop[2].argop = OP_CPUTFH;
8210 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8211
8212 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8213 argop[3].argop = OP_CRENAME;
8214 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8215 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8216
8217 /* 4: getattr (targetdir) */
8218 argop[4].argop = OP_GETATTR;
8219 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8220 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8221
8222 if (ndvp != odvp) {
8223
8224 /* 5: putfh (sourcedir) */
8225 argop[5].argop = OP_CPUTFH;
8226 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8227
8228 /* 6: getattr (sourcedir) */
8229 argop[6].argop = OP_GETATTR;
8230 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8231 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8232 }
8233
8234 dnlc_remove(odvp, onm);
8235 dnlc_remove(ndvp, nnm);
8236
8237 doqueue = 1;
8238 dinfo.di_time_call = gethrtime();
8239 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8240
8241 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8242 if (e.error) {
8243 PURGE_ATTRCACHE4(odvp);
8244 PURGE_ATTRCACHE4(ndvp);
8245 } else {
8246 *statp = res.status;
8247 }
8248
8249 if (needrecov) {
8250 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8251 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8252 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8253 if (!e.error)
8254 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8255 goto recov_retry;
8256 }
8257 }
8258
8259 if (!e.error) {
8260 resp = &res;
8261 /*
8262 * as long as OP_RENAME
8263 */
8264 if (res.status != NFS4_OK && res.array_len <= 4) {
8265 e.error = geterrno4(res.status);
8266 PURGE_ATTRCACHE4(odvp);
8267 PURGE_ATTRCACHE4(ndvp);
8268 /*
8269 * System V defines rename to return EEXIST, not
8270 * ENOTEMPTY if the target directory is not empty.
8271 * Over the wire, the error is NFSERR_ENOTEMPTY
8272 * which geterrno4 maps to ENOTEMPTY.
8273 */
8274 if (e.error == ENOTEMPTY)
8275 e.error = EEXIST;
8276 } else {
8277
8278 resop = &res.array[3]; /* rename res */
8279 rn_res = &resop->nfs_resop4_u.oprename;
8280
8281 if (res.status == NFS4_OK) {
8282 /*
8283 * Update target attribute, readdir and dnlc
8284 * caches.
8285 */
8286 dinfo.di_garp =
8287 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8288 dinfo.di_cred = cr;
8289 dinfop = &dinfo;
8290 } else
8291 dinfop = NULL;
8292
8293 nfs4_update_dircaches(&rn_res->target_cinfo,
8294 ndvp, NULL, NULL, dinfop);
8295
8296 /*
8297 * Update source attribute, readdir and dnlc caches
8298 *
8299 */
8300 if (ndvp != odvp) {
8301 update_parentdir_sfh(renvp, ndvp);
8302
8303 if (dinfop)
8304 dinfo.di_garp =
8305 &(res.array[6].nfs_resop4_u.
8306 opgetattr.ga_res);
8307
8308 nfs4_update_dircaches(&rn_res->source_cinfo,
8309 odvp, NULL, NULL, dinfop);
8310 }
8311
8312 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8313 nnm);
8314 }
8315 }
8316
8317 if (resp)
8318 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8319 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8320 kmem_free(argop, argoplist_size);
8321
8322 return (e.error);
8323 }
8324
8325 /*
8326 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8327 * it is possible for the filehandle to change due to the rename.
8328 *
8329 * The compound req in this case includes a post-rename lookup and getattr
8330 * to ensure that we have the correct fh and attributes for the object.
8331 *
8332 * Rename requires that the current fh be the target directory and the
8333 * saved fh be the source directory. After the operation, the current fh
8334 * is unchanged.
8335 *
8336 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8337 * update the filehandle for the renamed object. We also get the old
8338 * filehandle for historical reasons; this should be taken out sometime.
8339 * This results in a rather cumbersome compound...
8340 *
8341 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8342 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8343 *
8344 */
8345 static int
8346 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8347 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8348 {
8349 COMPOUND4args_clnt args;
8350 COMPOUND4res_clnt res, *resp = NULL;
8351 int argoplist_size;
8352 nfs_argop4 *argop;
8353 nfs_resop4 *resop;
8354 int doqueue;
8355 mntinfo4_t *mi;
8356 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8357 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8358 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8359 RENAME4res *rn_res;
8360 GETFH4res *ngf_res;
8361 bool_t needrecov;
8362 nfs4_recov_state_t recov_state;
8363 hrtime_t t;
8364 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8365 dirattr_info_t dinfo, *dinfop = &dinfo;
8366
8367 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8368
8369 recov_state.rs_flags = 0;
8370 recov_state.rs_num_retry_despite_err = 0;
8371
8372 recov_retry:
8373 *statp = NFS4_OK;
8374
8375 /*
8376 * There is a window between the RPC and updating the path and
8377 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8378 * code, so that it doesn't try to use the old path during that
8379 * window.
8380 */
8381 mutex_enter(&orp->r_statelock);
8382 while (orp->r_flags & R4RECEXPFH) {
8383 klwp_t *lwp = ttolwp(curthread);
8384
8385 if (lwp != NULL)
8386 lwp->lwp_nostop++;
8387 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8388 mutex_exit(&orp->r_statelock);
8389 if (lwp != NULL)
8390 lwp->lwp_nostop--;
8391 return (EINTR);
8392 }
8393 if (lwp != NULL)
8394 lwp->lwp_nostop--;
8395 }
8396 orp->r_flags |= R4RECEXPFH;
8397 mutex_exit(&orp->r_statelock);
8398
8399 mi = VTOMI4(odvp);
8400
8401 args.ctag = TAG_RENAME_VFH;
8402 args.array_len = (odvp == ndvp) ? 10 : 12;
8403 argoplist_size = args.array_len * sizeof (nfs_argop4);
8404 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8405
8406 /*
8407 * Rename ops:
8408 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8409 * PUTFH(targetdir), RENAME, GETATTR(targetdir)
8410 * LOOKUP(trgt), GETFH(new), GETATTR,
8411 *
8412 * if (odvp != ndvp)
8413 * add putfh(sourcedir), getattr(sourcedir) }
8414 */
8415 args.array = argop;
8416
8417 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8418 &recov_state, NULL);
8419 if (e.error) {
8420 kmem_free(argop, argoplist_size);
8421 mutex_enter(&orp->r_statelock);
8422 orp->r_flags &= ~R4RECEXPFH;
8423 cv_broadcast(&orp->r_cv);
8424 mutex_exit(&orp->r_statelock);
8425 return (e.error);
8426 }
8427
8428 /* 0: putfh source directory */
8429 argop[0].argop = OP_CPUTFH;
8430 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8431
8432 /* 1: Save source fh to free up current for target */
8433 argop[1].argop = OP_SAVEFH;
8434
8435 /* 2: Lookup pre-rename fh of renamed object */
8436 argop[2].argop = OP_CLOOKUP;
8437 argop[2].nfs_argop4_u.opclookup.cname = onm;
8438
8439 /* 3: getfh fh of renamed object (before rename) */
8440 argop[3].argop = OP_GETFH;
8441
8442 /* 4: putfh targetdir */
8443 argop[4].argop = OP_CPUTFH;
8444 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8445
8446 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8447 argop[5].argop = OP_CRENAME;
8448 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8449 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8450
8451 /* 6: getattr of target dir (post op attrs) */
8452 argop[6].argop = OP_GETATTR;
8453 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8454 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8455
8456 /* 7: Lookup post-rename fh of renamed object */
8457 argop[7].argop = OP_CLOOKUP;
8458 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8459
8460 /* 8: getfh fh of renamed object (after rename) */
8461 argop[8].argop = OP_GETFH;
8462
8463 /* 9: getattr of renamed object */
8464 argop[9].argop = OP_GETATTR;
8465 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8466 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8467
8468 /*
8469 * If source/target dirs are different, then get new post-op
8470 * attrs for source dir also.
8471 */
8472 if (ndvp != odvp) {
8473 /* 10: putfh (sourcedir) */
8474 argop[10].argop = OP_CPUTFH;
8475 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8476
8477 /* 11: getattr (sourcedir) */
8478 argop[11].argop = OP_GETATTR;
8479 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8480 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8481 }
8482
8483 dnlc_remove(odvp, onm);
8484 dnlc_remove(ndvp, nnm);
8485
8486 doqueue = 1;
8487 t = gethrtime();
8488 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8489
8490 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8491 if (e.error) {
8492 PURGE_ATTRCACHE4(odvp);
8493 PURGE_ATTRCACHE4(ndvp);
8494 if (!needrecov) {
8495 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8496 &recov_state, needrecov);
8497 goto out;
8498 }
8499 } else {
8500 *statp = res.status;
8501 }
8502
8503 if (needrecov) {
8504 bool_t abort;
8505
8506 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8507 OP_RENAME, NULL, NULL, NULL);
8508 if (abort == FALSE) {
8509 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8510 &recov_state, needrecov);
8511 kmem_free(argop, argoplist_size);
8512 if (!e.error)
8513 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8514 mutex_enter(&orp->r_statelock);
8515 orp->r_flags &= ~R4RECEXPFH;
8516 cv_broadcast(&orp->r_cv);
8517 mutex_exit(&orp->r_statelock);
8518 goto recov_retry;
8519 } else {
8520 if (e.error != 0) {
8521 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8522 &recov_state, needrecov);
8523 goto out;
8524 }
8525 /* fall through for res.status case */
8526 }
8527 }
8528
8529 resp = &res;
8530 /*
8531 * If OP_RENAME (or any prev op) failed, then return an error.
8532 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8533 */
8534 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8535 /*
8536 * Error in an op other than last Getattr
8537 */
8538 e.error = geterrno4(res.status);
8539 PURGE_ATTRCACHE4(odvp);
8540 PURGE_ATTRCACHE4(ndvp);
8541 /*
8542 * System V defines rename to return EEXIST, not
8543 * ENOTEMPTY if the target directory is not empty.
8544 * Over the wire, the error is NFSERR_ENOTEMPTY
8545 * which geterrno4 maps to ENOTEMPTY.
8546 */
8547 if (e.error == ENOTEMPTY)
8548 e.error = EEXIST;
8549 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8550 needrecov);
8551 goto out;
8552 }
8553
8554 /* rename results */
8555 rn_res = &res.array[5].nfs_resop4_u.oprename;
8556
8557 if (res.status == NFS4_OK) {
8558 /* Update target attribute, readdir and dnlc caches */
8559 dinfo.di_garp =
8560 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8561 dinfo.di_cred = cr;
8562 dinfo.di_time_call = t;
8563 } else
8564 dinfop = NULL;
8565
8566 /* Update source cache attribute, readdir and dnlc caches */
8567 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8568
8569 /* Update source cache attribute, readdir and dnlc caches */
8570 if (ndvp != odvp) {
8571 update_parentdir_sfh(ovp, ndvp);
8572
8573 /*
8574 * If dinfop is non-NULL, then compound succeded, so
8575 * set di_garp to attrs for source dir. dinfop is only
8576 * set to NULL when compound fails.
8577 */
8578 if (dinfop)
8579 dinfo.di_garp =
8580 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8581 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8582 dinfop);
8583 }
8584
8585 /*
8586 * Update the rnode with the new component name and args,
8587 * and if the file handle changed, also update it with the new fh.
8588 * This is only necessary if the target object has an rnode
8589 * entry and there is no need to create one for it.
8590 */
8591 resop = &res.array[8]; /* getfh new res */
8592 ngf_res = &resop->nfs_resop4_u.opgetfh;
8593
8594 /*
8595 * Update the path and filehandle for the renamed object.
8596 */
8597 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8598
8599 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8600
8601 if (res.status == NFS4_OK) {
8602 resop++; /* getattr res */
8603 e.error = nfs4_update_attrcache(res.status,
8604 &resop->nfs_resop4_u.opgetattr.ga_res,
8605 t, ovp, cr);
8606 }
8607
8608 out:
8609 kmem_free(argop, argoplist_size);
8610 if (resp)
8611 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8612 mutex_enter(&orp->r_statelock);
8613 orp->r_flags &= ~R4RECEXPFH;
8614 cv_broadcast(&orp->r_cv);
8615 mutex_exit(&orp->r_statelock);
8616
8617 return (e.error);
8618 }
8619
8620 /* ARGSUSED */
8621 static int
8622 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8623 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8624 {
8625 int error;
8626 vnode_t *vp;
8627
8628 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8629 return (EPERM);
8630 /*
8631 * As ".." has special meaning and rather than send a mkdir
8632 * over the wire to just let the server freak out, we just
8633 * short circuit it here and return EEXIST
8634 */
8635 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8636 return (EEXIST);
8637
8638 /*
8639 * Decision to get the right gid and setgid bit of the
8640 * new directory is now made in call_nfs4_create_req.
8641 */
8642 va->va_mask |= AT_MODE;
8643 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8644 if (error)
8645 return (error);
8646
8647 *vpp = vp;
8648 return (0);
8649 }
8650
8651
8652 /*
8653 * rmdir is using the same remove v4 op as does remove.
8654 * Remove requires that the current fh be the target directory.
8655 * After the operation, the current fh is unchanged.
8656 * The compound op structure is:
8657 * PUTFH(targetdir), REMOVE
8658 */
8659 /*ARGSUSED4*/
8660 static int
8661 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
8662 caller_context_t *ct, int flags)
8663 {
8664 int need_end_op = FALSE;
8665 COMPOUND4args_clnt args;
8666 COMPOUND4res_clnt res, *resp = NULL;
8667 REMOVE4res *rm_res;
8668 nfs_argop4 argop[3];
8669 nfs_resop4 *resop;
8670 vnode_t *vp;
8671 int doqueue;
8672 mntinfo4_t *mi;
8673 rnode4_t *drp;
8674 bool_t needrecov = FALSE;
8675 nfs4_recov_state_t recov_state;
8676 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8677 dirattr_info_t dinfo, *dinfop;
8678
8679 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8680 return (EPERM);
8681 /*
8682 * As ".." has special meaning and rather than send a rmdir
8683 * over the wire to just let the server freak out, we just
8684 * short circuit it here and return EEXIST
8685 */
8686 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8687 return (EEXIST);
8688
8689 drp = VTOR4(dvp);
8690 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8691 return (EINTR);
8692
8693 /*
8694 * Attempt to prevent a rmdir(".") from succeeding.
8695 */
8696 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8697 if (e.error) {
8698 nfs_rw_exit(&drp->r_rwlock);
8699 return (e.error);
8700 }
8701 if (vp == cdir) {
8702 VN_RELE(vp);
8703 nfs_rw_exit(&drp->r_rwlock);
8704 return (EINVAL);
8705 }
8706
8707 /*
8708 * Since nfsv4 remove op works on both files and directories,
8709 * check that the removed object is indeed a directory.
8710 */
8711 if (vp->v_type != VDIR) {
8712 VN_RELE(vp);
8713 nfs_rw_exit(&drp->r_rwlock);
8714 return (ENOTDIR);
8715 }
8716
8717 /*
8718 * First just remove the entry from the name cache, as it
8719 * is most likely an entry for this vp.
8720 */
8721 dnlc_remove(dvp, nm);
8722
8723 /*
8724 * If there vnode reference count is greater than one, then
8725 * there may be additional references in the DNLC which will
8726 * need to be purged. First, trying removing the entry for
8727 * the parent directory and see if that removes the additional
8728 * reference(s). If that doesn't do it, then use dnlc_purge_vp
8729 * to completely remove any references to the directory which
8730 * might still exist in the DNLC.
8731 */
8732 if (vp->v_count > 1) {
8733 dnlc_remove(vp, "..");
8734 if (vp->v_count > 1)
8735 dnlc_purge_vp(vp);
8736 }
8737
8738 mi = VTOMI4(dvp);
8739 recov_state.rs_flags = 0;
8740 recov_state.rs_num_retry_despite_err = 0;
8741
8742 recov_retry:
8743 args.ctag = TAG_RMDIR;
8744
8745 /*
8746 * Rmdir ops: putfh dir; remove
8747 */
8748 args.array_len = 3;
8749 args.array = argop;
8750
8751 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8752 if (e.error) {
8753 nfs_rw_exit(&drp->r_rwlock);
8754 return (e.error);
8755 }
8756 need_end_op = TRUE;
8757
8758 /* putfh directory */
8759 argop[0].argop = OP_CPUTFH;
8760 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8761
8762 /* remove */
8763 argop[1].argop = OP_CREMOVE;
8764 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8765
8766 /* getattr (postop attrs for dir that contained removed dir) */
8767 argop[2].argop = OP_GETATTR;
8768 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8769 argop[2].nfs_argop4_u.opgetattr.mi = mi;
8770
8771 dinfo.di_time_call = gethrtime();
8772 doqueue = 1;
8773 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8774
8775 PURGE_ATTRCACHE4(vp);
8776
8777 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8778 if (e.error) {
8779 PURGE_ATTRCACHE4(dvp);
8780 }
8781
8782 if (needrecov) {
8783 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8784 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
8785 if (!e.error)
8786 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8787
8788 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8789 needrecov);
8790 need_end_op = FALSE;
8791 goto recov_retry;
8792 }
8793 }
8794
8795 if (!e.error) {
8796 resp = &res;
8797
8798 /*
8799 * Only return error if first 2 ops (OP_REMOVE or earlier)
8800 * failed.
8801 */
8802 if (res.status != NFS4_OK && res.array_len <= 2) {
8803 e.error = geterrno4(res.status);
8804 PURGE_ATTRCACHE4(dvp);
8805 nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8806 &recov_state, needrecov);
8807 need_end_op = FALSE;
8808 nfs4_purge_stale_fh(e.error, dvp, cr);
8809 /*
8810 * System V defines rmdir to return EEXIST, not
8811 * ENOTEMPTY if the directory is not empty. Over
8812 * the wire, the error is NFSERR_ENOTEMPTY which
8813 * geterrno4 maps to ENOTEMPTY.
8814 */
8815 if (e.error == ENOTEMPTY)
8816 e.error = EEXIST;
8817 } else {
8818 resop = &res.array[1]; /* remove res */
8819 rm_res = &resop->nfs_resop4_u.opremove;
8820
8821 if (res.status == NFS4_OK) {
8822 resop = &res.array[2]; /* dir attrs */
8823 dinfo.di_garp =
8824 &resop->nfs_resop4_u.opgetattr.ga_res;
8825 dinfo.di_cred = cr;
8826 dinfop = &dinfo;
8827 } else
8828 dinfop = NULL;
8829
8830 /* Update dir attribute, readdir and dnlc caches */
8831 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8832 dinfop);
8833
8834 /* destroy rddir cache for dir that was removed */
8835 if (VTOR4(vp)->r_dir != NULL)
8836 nfs4_purge_rddir_cache(vp);
8837 }
8838 }
8839
8840 if (need_end_op)
8841 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8842
8843 nfs_rw_exit(&drp->r_rwlock);
8844
8845 if (resp)
8846 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8847
8848 if (e.error == 0) {
8849 vnode_t *tvp;
8850 rnode4_t *trp;
8851 trp = VTOR4(vp);
8852 tvp = vp;
8853 if (IS_SHADOW(vp, trp))
8854 tvp = RTOV4(trp);
8855 vnevent_rmdir(tvp, dvp, nm, ct);
8856 }
8857
8858 VN_RELE(vp);
8859
8860 return (e.error);
8861 }
8862
/*
 * Create the symbolic link lnm in directory dvp with target tnm and
 * attributes tva.  Optionally seed the rnode symlink contents cache so
 * a subsequent readlink need not go over the wire.
 */
/* ARGSUSED */
static int
nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *vp;
	rnode4_t *rp;
	char *contents;
	mntinfo4_t *mi = VTOMI4(dvp);

	/* Refuse cross-zone access. */
	if (nfs_zone() != mi->mi_zone)
		return (EPERM);
	/* The server must have advertised symlink support. */
	if (!(mi->mi_flags & MI4_SYMLINK))
		return (EOPNOTSUPP);

	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
	if (error)
		return (error);

	ASSERT(nfs4_consistent_type(vp));
	rp = VTOR4(vp);
	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {

		/* KM_SLEEP never returns NULL; the check below is defensive. */
		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		if (contents != NULL) {
			mutex_enter(&rp->r_statelock);
			/* Re-check under the lock: another thread may have won. */
			if (rp->r_symlink.contents == NULL) {
				/*
				 * Cached contents are not NUL-terminated;
				 * r_symlink.len records the valid length.
				 */
				rp->r_symlink.len = strlen(tnm);
				bcopy(tnm, contents, rp->r_symlink.len);
				rp->r_symlink.contents = contents;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)contents, MAXPATHLEN);
			}
		}
	}
	VN_RELE(vp);

	return (error);
}
8907
8908
8909 /*
8910 * Read directory entries.
8911 * There are some weird things to look out for here. The uio_loffset
8912 * field is either 0 or it is the offset returned from a previous
8913 * readdir. It is an opaque value used by the server to find the
8914 * correct directory block to read. The count field is the number
8915 * of blocks to read on the server. This is advisory only, the server
8916 * may return only one block's worth of entries. Entries may be compressed
8917 * on the server.
8918 */
8919 /* ARGSUSED */
8920 static int
8921 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
8922 caller_context_t *ct, int flags)
8923 {
8924 int error;
8925 uint_t count;
8926 rnode4_t *rp;
8927 rddir4_cache *rdc;
8928 rddir4_cache *rrdc;
8929
8930 if (nfs_zone() != VTOMI4(vp)->mi_zone)
8931 return (EIO);
8932 rp = VTOR4(vp);
8933
8934 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8935
8936 /*
8937 * Make sure that the directory cache is valid.
8938 */
8939 if (rp->r_dir != NULL) {
8940 if (nfs_disable_rddir_cache != 0) {
8941 /*
8942 * Setting nfs_disable_rddir_cache in /etc/system
8943 * allows interoperability with servers that do not
8944 * properly update the attributes of directories.
8945 * Any cached information gets purged before an
8946 * access is made to it.
8947 */
8948 nfs4_purge_rddir_cache(vp);
8949 }
8950
8951 error = nfs4_validate_caches(vp, cr);
8952 if (error)
8953 return (error);
8954 }
8955
8956 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8957
8958 /*
8959 * Short circuit last readdir which always returns 0 bytes.
8960 * This can be done after the directory has been read through
8961 * completely at least once. This will set r_direof which
8962 * can be used to find the value of the last cookie.
8963 */
8964 mutex_enter(&rp->r_statelock);
8965 if (rp->r_direof != NULL &&
8966 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8967 mutex_exit(&rp->r_statelock);
8968 #ifdef DEBUG
8969 nfs4_readdir_cache_shorts++;
8970 #endif
8971 if (eofp)
8972 *eofp = 1;
8973 return (0);
8974 }
8975
8976 /*
8977 * Look for a cache entry. Cache entries are identified
8978 * by the NFS cookie value and the byte count requested.
8979 */
8980 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8981
8982 /*
8983 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8984 */
8985 if (rdc == NULL) {
8986 mutex_exit(&rp->r_statelock);
8987 return (EINTR);
8988 }
8989
8990 /*
8991 * Check to see if we need to fill this entry in.
8992 */
8993 if (rdc->flags & RDDIRREQ) {
8994 rdc->flags &= ~RDDIRREQ;
8995 rdc->flags |= RDDIR;
8996 mutex_exit(&rp->r_statelock);
8997
8998 /*
8999 * Do the readdir.
9000 */
9001 nfs4readdir(vp, rdc, cr);
9002
9003 /*
9004 * Reacquire the lock, so that we can continue
9005 */
9006 mutex_enter(&rp->r_statelock);
9007 /*
9008 * The entry is now complete
9009 */
9010 rdc->flags &= ~RDDIR;
9011 }
9012
9013 ASSERT(!(rdc->flags & RDDIR));
9014
9015 /*
9016 * If an error occurred while attempting
9017 * to fill the cache entry, mark the entry invalid and
9018 * just return the error.
9019 */
9020 if (rdc->error) {
9021 error = rdc->error;
9022 rdc->flags |= RDDIRREQ;
9023 rddir4_cache_rele(rp, rdc);
9024 mutex_exit(&rp->r_statelock);
9025 return (error);
9026 }
9027
9028 /*
9029 * The cache entry is complete and good,
9030 * copyout the dirent structs to the calling
9031 * thread.
9032 */
9033 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
9034
9035 /*
9036 * If no error occurred during the copyout,
9037 * update the offset in the uio struct to
9038 * contain the value of the next NFS 4 cookie
9039 * and set the eof value appropriately.
9040 */
9041 if (!error) {
9042 uiop->uio_loffset = rdc->nfs4_ncookie;
9043 if (eofp)
9044 *eofp = rdc->eof;
9045 }
9046
9047 /*
9048 * Decide whether to do readahead. Don't if we
9049 * have already read to the end of directory.
9050 */
9051 if (rdc->eof) {
9052 /*
9053 * Make the entry the direof only if it is cached
9054 */
9055 if (rdc->flags & RDDIRCACHED)
9056 rp->r_direof = rdc;
9057 rddir4_cache_rele(rp, rdc);
9058 mutex_exit(&rp->r_statelock);
9059 return (error);
9060 }
9061
9062 /* Determine if a readdir readahead should be done */
9063 if (!(rp->r_flags & R4LOOKUP)) {
9064 rddir4_cache_rele(rp, rdc);
9065 mutex_exit(&rp->r_statelock);
9066 return (error);
9067 }
9068
9069 /*
9070 * Now look for a readahead entry.
9071 *
9072 * Check to see whether we found an entry for the readahead.
9073 * If so, we don't need to do anything further, so free the new
9074 * entry if one was allocated. Otherwise, allocate a new entry, add
9075 * it to the cache, and then initiate an asynchronous readdir
9076 * operation to fill it.
9077 */
9078 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
9079
9080 /*
9081 * A readdir cache entry could not be obtained for the readahead. In
9082 * this case we skip the readahead and return.
9083 */
9084 if (rrdc == NULL) {
9085 rddir4_cache_rele(rp, rdc);
9086 mutex_exit(&rp->r_statelock);
9087 return (error);
9088 }
9089
9090 /*
9091 * Check to see if we need to fill this entry in.
9092 */
9093 if (rrdc->flags & RDDIRREQ) {
9094 rrdc->flags &= ~RDDIRREQ;
9095 rrdc->flags |= RDDIR;
9096 rddir4_cache_rele(rp, rdc);
9097 mutex_exit(&rp->r_statelock);
9098 #ifdef DEBUG
9099 nfs4_readdir_readahead++;
9100 #endif
9101 /*
9102 * Do the readdir.
9103 */
9104 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
9105 return (error);
9106 }
9107
9108 rddir4_cache_rele(rp, rrdc);
9109 rddir4_cache_rele(rp, rdc);
9110 mutex_exit(&rp->r_statelock);
9111 return (error);
9112 }
9113
9114 static int
9115 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9116 {
9117 int error;
9118 rnode4_t *rp;
9119
9120 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9121
9122 rp = VTOR4(vp);
9123
9124 /*
9125 * Obtain the readdir results for the caller.
9126 */
9127 nfs4readdir(vp, rdc, cr);
9128
9129 mutex_enter(&rp->r_statelock);
9130 /*
9131 * The entry is now complete
9132 */
9133 rdc->flags &= ~RDDIR;
9134
9135 error = rdc->error;
9136 if (error)
9137 rdc->flags |= RDDIRREQ;
9138 rddir4_cache_rele(rp, rdc);
9139 mutex_exit(&rp->r_statelock);
9140
9141 return (error);
9142 }
9143
9144 /*
9145 * Read directory entries.
9146 * There are some weird things to look out for here. The uio_loffset
9147 * field is either 0 or it is the offset returned from a previous
9148 * readdir. It is an opaque value used by the server to find the
9149 * correct directory block to read. The count field is the number
9150 * of blocks to read on the server. This is advisory only, the server
9151 * may return only one block's worth of entries. Entries may be compressed
9152 * on the server.
9153 *
9154 * Generates the following compound request:
9155 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9156 * must include a Lookupp as well. In this case, send:
9157 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9158 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9159 *
9160 * Get complete attributes and filehandles for entries if this is the
9161 * first read of the directory. Otherwise, just get fileid's.
9162 */
9163 static void
9164 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9165 {
9166 COMPOUND4args_clnt args;
9167 COMPOUND4res_clnt res;
9168 READDIR4args *rargs;
9169 READDIR4res_clnt *rd_res;
9170 bitmap4 rd_bitsval;
9171 nfs_argop4 argop[5];
9172 nfs_resop4 *resop;
9173 rnode4_t *rp = VTOR4(vp);
9174 mntinfo4_t *mi = VTOMI4(vp);
9175 int doqueue;
9176 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9177 vnode_t *dvp;
9178 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9179 int num_ops, res_opcnt;
9180 bool_t needrecov = FALSE;
9181 nfs4_recov_state_t recov_state;
9182 hrtime_t t;
9183 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9184
9185 ASSERT(nfs_zone() == mi->mi_zone);
9186 ASSERT(rdc->flags & RDDIR);
9187 ASSERT(rdc->entries == NULL);
9188
9189 /*
9190 * If rp were a stub, it should have triggered and caused
9191 * a mount for us to get this far.
9192 */
9193 ASSERT(!RP_ISSTUB(rp));
9194
9195 num_ops = 2;
9196 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9197 /*
9198 * Since nfsv4 readdir may not return entries for "." and "..",
9199 * the client must recreate them:
9200 * To find the correct nodeid, do the following:
9201 * For current node, get nodeid from dnlc.
9202 * - if current node is rootvp, set pnodeid to nodeid.
9203 * - else if parent is in the dnlc, get its nodeid from there.
9204 * - else add LOOKUPP+GETATTR to compound.
9205 */
9206 nodeid = rp->r_attr.va_nodeid;
9207 if (vp->v_flag & VROOT) {
9208 pnodeid = nodeid; /* root of mount point */
9209 } else {
9210 dvp = dnlc_lookup(vp, "..");
9211 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9212 /* parent in dnlc cache - no need for otw */
9213 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9214 } else {
9215 /*
9216 * parent not in dnlc cache,
9217 * do lookupp to get its id
9218 */
9219 num_ops = 5;
9220 pnodeid = 0; /* set later by getattr parent */
9221 }
9222 if (dvp)
9223 VN_RELE(dvp);
9224 }
9225 }
9226 recov_state.rs_flags = 0;
9227 recov_state.rs_num_retry_despite_err = 0;
9228
9229 /* Save the original mount point security flavor */
9230 (void) save_mnt_secinfo(mi->mi_curr_serv);
9231
9232 recov_retry:
9233 args.ctag = TAG_READDIR;
9234
9235 args.array = argop;
9236 args.array_len = num_ops;
9237
9238 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9239 &recov_state, NULL)) {
9240 /*
9241 * If readdir a node that is a stub for a crossed mount point,
9242 * keep the original secinfo flavor for the current file
9243 * system, not the crossed one.
9244 */
9245 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9246 rdc->error = e.error;
9247 return;
9248 }
9249
9250 /*
9251 * Determine which attrs to request for dirents. This code
9252 * must be protected by nfs4_start/end_fop because of r_server
9253 * (which will change during failover recovery).
9254 *
9255 */
9256 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9257 /*
9258 * Get all vattr attrs plus filehandle and rdattr_error
9259 */
9260 rd_bitsval = NFS4_VATTR_MASK |
9261 FATTR4_RDATTR_ERROR_MASK |
9262 FATTR4_FILEHANDLE_MASK;
9263
9264 if (rp->r_flags & R4READDIRWATTR) {
9265 mutex_enter(&rp->r_statelock);
9266 rp->r_flags &= ~R4READDIRWATTR;
9267 mutex_exit(&rp->r_statelock);
9268 }
9269 } else {
9270 servinfo4_t *svp = rp->r_server;
9271
9272 /*
9273 * Already read directory. Use readdir with
9274 * no attrs (except for mounted_on_fileid) for updates.
9275 */
9276 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9277
9278 /*
9279 * request mounted on fileid if supported, else request
9280 * fileid. maybe we should verify that fileid is supported
9281 * and request something else if not.
9282 */
9283 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9284 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9285 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9286 nfs_rw_exit(&svp->sv_lock);
9287 }
9288
9289 /* putfh directory fh */
9290 argop[0].argop = OP_CPUTFH;
9291 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9292
9293 argop[1].argop = OP_READDIR;
9294 rargs = &argop[1].nfs_argop4_u.opreaddir;
9295 /*
9296 * 1 and 2 are reserved for client "." and ".." entry offset.
9297 * cookie 0 should be used over-the-wire to start reading at
9298 * the beginning of the directory excluding "." and "..".
9299 */
9300 if (rdc->nfs4_cookie == 0 ||
9301 rdc->nfs4_cookie == 1 ||
9302 rdc->nfs4_cookie == 2) {
9303 rargs->cookie = (nfs_cookie4)0;
9304 rargs->cookieverf = 0;
9305 } else {
9306 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9307 mutex_enter(&rp->r_statelock);
9308 rargs->cookieverf = rp->r_cookieverf4;
9309 mutex_exit(&rp->r_statelock);
9310 }
9311 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9312 rargs->maxcount = mi->mi_tsize;
9313 rargs->attr_request = rd_bitsval;
9314 rargs->rdc = rdc;
9315 rargs->dvp = vp;
9316 rargs->mi = mi;
9317 rargs->cr = cr;
9318
9319
9320 /*
9321 * If count < than the minimum required, we return no entries
9322 * and fail with EINVAL
9323 */
9324 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9325 rdc->error = EINVAL;
9326 goto out;
9327 }
9328
9329 if (args.array_len == 5) {
9330 /*
9331 * Add lookupp and getattr for parent nodeid.
9332 */
9333 argop[2].argop = OP_LOOKUPP;
9334
9335 argop[3].argop = OP_GETFH;
9336
9337 /* getattr parent */
9338 argop[4].argop = OP_GETATTR;
9339 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9340 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9341 }
9342
9343 doqueue = 1;
9344
9345 if (mi->mi_io_kstats) {
9346 mutex_enter(&mi->mi_lock);
9347 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9348 mutex_exit(&mi->mi_lock);
9349 }
9350
9351 /* capture the time of this call */
9352 rargs->t = t = gethrtime();
9353
9354 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9355
9356 if (mi->mi_io_kstats) {
9357 mutex_enter(&mi->mi_lock);
9358 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9359 mutex_exit(&mi->mi_lock);
9360 }
9361
9362 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9363
9364 /*
9365 * If RPC error occurred and it isn't an error that
9366 * triggers recovery, then go ahead and fail now.
9367 */
9368 if (e.error != 0 && !needrecov) {
9369 rdc->error = e.error;
9370 goto out;
9371 }
9372
9373 if (needrecov) {
9374 bool_t abort;
9375
9376 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9377 "nfs4readdir: initiating recovery.\n"));
9378
9379 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9380 NULL, OP_READDIR, NULL, NULL, NULL);
9381 if (abort == FALSE) {
9382 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9383 &recov_state, needrecov);
9384 if (!e.error)
9385 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9386 if (rdc->entries != NULL) {
9387 kmem_free(rdc->entries, rdc->entlen);
9388 rdc->entries = NULL;
9389 }
9390 goto recov_retry;
9391 }
9392
9393 if (e.error != 0) {
9394 rdc->error = e.error;
9395 goto out;
9396 }
9397
9398 /* fall through for res.status case */
9399 }
9400
9401 res_opcnt = res.array_len;
9402
9403 /*
9404 * If compound failed first 2 ops (PUTFH+READDIR), then return
9405 * failure here. Subsequent ops are for filling out dot-dot
9406 * dirent, and if they fail, we still want to give the caller
9407 * the dirents returned by (the successful) READDIR op, so we need
9408 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9409 *
9410 * One example where PUTFH+READDIR ops would succeed but
9411 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9412 * but lacks x. In this case, a POSIX server's VOP_READDIR
9413 * would succeed; however, VOP_LOOKUP(..) would fail since no
9414 * x perm. We need to come up with a non-vendor-specific way
9415 * for a POSIX server to return d_ino from dotdot's dirent if
9416 * client only requests mounted_on_fileid, and just say the
9417 * LOOKUPP succeeded and fill out the GETATTR. However, if
9418 * client requested any mandatory attrs, server would be required
9419 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9420 * for dotdot.
9421 */
9422
9423 if (res.status) {
9424 if (res_opcnt <= 2) {
9425 e.error = geterrno4(res.status);
9426 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9427 &recov_state, needrecov);
9428 nfs4_purge_stale_fh(e.error, vp, cr);
9429 rdc->error = e.error;
9430 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9431 if (rdc->entries != NULL) {
9432 kmem_free(rdc->entries, rdc->entlen);
9433 rdc->entries = NULL;
9434 }
9435 /*
9436 * If readdir a node that is a stub for a
9437 * crossed mount point, keep the original
9438 * secinfo flavor for the current file system,
9439 * not the crossed one.
9440 */
9441 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9442 return;
9443 }
9444 }
9445
9446 resop = &res.array[1]; /* readdir res */
9447 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9448
9449 mutex_enter(&rp->r_statelock);
9450 rp->r_cookieverf4 = rd_res->cookieverf;
9451 mutex_exit(&rp->r_statelock);
9452
9453 /*
9454 * For "." and ".." entries
9455 * e.g.
9456 * seek(cookie=0) -> "." entry with d_off = 1
9457 * seek(cookie=1) -> ".." entry with d_off = 2
9458 */
9459 if (cookie == (nfs_cookie4) 0) {
9460 if (rd_res->dotp)
9461 rd_res->dotp->d_ino = nodeid;
9462 if (rd_res->dotdotp)
9463 rd_res->dotdotp->d_ino = pnodeid;
9464 }
9465 if (cookie == (nfs_cookie4) 1) {
9466 if (rd_res->dotdotp)
9467 rd_res->dotdotp->d_ino = pnodeid;
9468 }
9469
9470
9471 /* LOOKUPP+GETATTR attemped */
9472 if (args.array_len == 5 && rd_res->dotdotp) {
9473 if (res.status == NFS4_OK && res_opcnt == 5) {
9474 nfs_fh4 *fhp;
9475 nfs4_sharedfh_t *sfhp;
9476 vnode_t *pvp;
9477 nfs4_ga_res_t *garp;
9478
9479 resop++; /* lookupp */
9480 resop++; /* getfh */
9481 fhp = &resop->nfs_resop4_u.opgetfh.object;
9482
9483 resop++; /* getattr of parent */
9484
9485 /*
9486 * First, take care of finishing the
9487 * readdir results.
9488 */
9489 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9490 /*
9491 * The d_ino of .. must be the inode number
9492 * of the mounted filesystem.
9493 */
9494 if (garp->n4g_va.va_mask & AT_NODEID)
9495 rd_res->dotdotp->d_ino =
9496 garp->n4g_va.va_nodeid;
9497
9498
9499 /*
9500 * Next, create the ".." dnlc entry
9501 */
9502 sfhp = sfh4_get(fhp, mi);
9503 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9504 dnlc_update(vp, "..", pvp);
9505 VN_RELE(pvp);
9506 }
9507 sfh4_rele(&sfhp);
9508 }
9509 }
9510
9511 if (mi->mi_io_kstats) {
9512 mutex_enter(&mi->mi_lock);
9513 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9514 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9515 mutex_exit(&mi->mi_lock);
9516 }
9517
9518 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9519
9520 out:
9521 /*
9522 * If readdir a node that is a stub for a crossed mount point,
9523 * keep the original secinfo flavor for the current file system,
9524 * not the crossed one.
9525 */
9526 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9527
9528 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9529 }
9530
9531
/*
 * Perform buffered I/O for the given buf: read (possibly readahead) or
 * write the buffer's range over the wire, retrying with alternate
 * over-the-wire credentials on EACCES.  Returns 0, an errno, or NFS_EOF
 * for a read entirely past end-of-file.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
	read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF. Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			/* Retry with a different otw credential. */
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
		write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/* Clamp the write so it does not extend past r_size. */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status. Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations. Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}
9681
/*
 * File-ids for NFS files live on the server; there is no meaningful
 * local fid to hand out, so always report EREMOTE.
 */
/* ARGSUSED */
int
nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	return (EREMOTE);
}
9688
9689 /* ARGSUSED2 */
9690 int
9691 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9692 {
9693 rnode4_t *rp = VTOR4(vp);
9694
9695 if (!write_lock) {
9696 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9697 return (V_WRITELOCK_FALSE);
9698 }
9699
9700 if ((rp->r_flags & R4DIRECTIO) ||
9701 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9702 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9703 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9704 return (V_WRITELOCK_FALSE);
9705 nfs_rw_exit(&rp->r_rwlock);
9706 }
9707
9708 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9709 return (V_WRITELOCK_TRUE);
9710 }
9711
9712 /* ARGSUSED */
9713 void
9714 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9715 {
9716 rnode4_t *rp = VTOR4(vp);
9717
9718 nfs_rw_exit(&rp->r_rwlock);
9719 }
9720
9721 /* ARGSUSED */
9722 static int
9723 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9724 {
9725 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9726 return (EIO);
9727
9728 /*
9729 * Because we stuff the readdir cookie into the offset field
9730 * someone may attempt to do an lseek with the cookie which
9731 * we want to succeed.
9732 */
9733 if (vp->v_type == VDIR)
9734 return (0);
9735 if (*noffp < 0)
9736 return (EINVAL);
9737 return (0);
9738 }
9739
9740
9741 /*
9742 * Return all the pages from [off..off+len) in file
9743 */
9744 /* ARGSUSED */
9745 static int
9746 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9747 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9748 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9749 {
9750 rnode4_t *rp;
9751 int error;
9752 mntinfo4_t *mi;
9753
9754 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9755 return (EIO);
9756 rp = VTOR4(vp);
9757 if (IS_SHADOW(vp, rp))
9758 vp = RTOV4(rp);
9759
9760 if (vp->v_flag & VNOMAP)
9761 return (ENOSYS);
9762
9763 if (protp != NULL)
9764 *protp = PROT_ALL;
9765
9766 /*
9767 * Now validate that the caches are up to date.
9768 */
9769 if (error = nfs4_validate_caches(vp, cr))
9770 return (error);
9771
9772 mi = VTOMI4(vp);
9773 retry:
9774 mutex_enter(&rp->r_statelock);
9775
9776 /*
9777 * Don't create dirty pages faster than they
9778 * can be cleaned so that the system doesn't
9779 * get imbalanced. If the async queue is
9780 * maxed out, then wait for it to drain before
9781 * creating more dirty pages. Also, wait for
9782 * any threads doing pagewalks in the vop_getattr
9783 * entry points so that they don't block for
9784 * long periods.
9785 */
9786 if (rw == S_CREATE) {
9787 while ((mi->mi_max_threads != 0 &&
9788 rp->r_awcount > 2 * mi->mi_max_threads) ||
9789 rp->r_gcount > 0)
9790 cv_wait(&rp->r_cv, &rp->r_statelock);
9791 }
9792
9793 /*
9794 * If we are getting called as a side effect of an nfs_write()
9795 * operation the local file size might not be extended yet.
9796 * In this case we want to be able to return pages of zeroes.
9797 */
9798 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9799 NFS4_DEBUG(nfs4_pageio_debug,
9800 (CE_NOTE, "getpage beyond EOF: off=%lld, "
9801 "len=%llu, size=%llu, attrsize =%llu", off,
9802 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9803 mutex_exit(&rp->r_statelock);
9804 return (EFAULT); /* beyond EOF */
9805 }
9806
9807 mutex_exit(&rp->r_statelock);
9808
9809 error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9810 pl, plsz, seg, addr, rw, cr);
9811 NFS4_DEBUG(nfs4_pageio_debug && error,
9812 (CE_NOTE, "getpages error %d; off=%lld, len=%lld",
9813 error, off, (u_longlong_t)len));
9814
9815 switch (error) {
9816 case NFS_EOF:
9817 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9818 goto retry;
9819 case ESTALE:
9820 nfs4_purge_stale_fh(error, vp, cr);
9821 }
9822
9823 return (error);
9824 }
9825
9826 /*
9827 * Called from pvn_getpages to get a particular page.
9828 */
9829 /* ARGSUSED */
9830 static int
9831 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9832 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9833 enum seg_rw rw, cred_t *cr)
9834 {
9835 rnode4_t *rp;
9836 uint_t bsize;
9837 struct buf *bp;
9838 page_t *pp;
9839 u_offset_t lbn;
9840 u_offset_t io_off;
9841 u_offset_t blkoff;
9842 u_offset_t rablkoff;
9843 size_t io_len;
9844 uint_t blksize;
9845 int error;
9846 int readahead;
9847 int readahead_issued = 0;
9848 int ra_window; /* readahead window */
9849 page_t *pagefound;
9850 page_t *savepp;
9851
9852 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9853 return (EIO);
9854
9855 rp = VTOR4(vp);
9856 ASSERT(!IS_SHADOW(vp, rp));
9857 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9858
9859 reread:
9860 bp = NULL;
9861 pp = NULL;
9862 pagefound = NULL;
9863
9864 if (pl != NULL)
9865 pl[0] = NULL;
9866
9867 error = 0;
9868 lbn = off / bsize;
9869 blkoff = lbn * bsize;
9870
9871 /*
9872 * Queueing up the readahead before doing the synchronous read
9873 * results in a significant increase in read throughput because
9874 * of the increased parallelism between the async threads and
9875 * the process context.
9876 */
9877 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9878 rw != S_CREATE &&
9879 !(vp->v_flag & VNOCACHE)) {
9880 mutex_enter(&rp->r_statelock);
9881
9882 /*
9883 * Calculate the number of readaheads to do.
9884 * a) No readaheads at offset = 0.
9885 * b) Do maximum(nfs4_nra) readaheads when the readahead
9886 * window is closed.
9887 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9888 * upon how far the readahead window is open or close.
9889 * d) No readaheads if rp->r_nextr is not within the scope
9890 * of the readahead window (random i/o).
9891 */
9892
9893 if (off == 0)
9894 readahead = 0;
9895 else if (blkoff == rp->r_nextr)
9896 readahead = nfs4_nra;
9897 else if (rp->r_nextr > blkoff &&
9898 ((ra_window = (rp->r_nextr - blkoff) / bsize)
9899 <= (nfs4_nra - 1)))
9900 readahead = nfs4_nra - ra_window;
9901 else
9902 readahead = 0;
9903
9904 rablkoff = rp->r_nextr;
9905 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9906 mutex_exit(&rp->r_statelock);
9907 if (nfs4_async_readahead(vp, rablkoff + bsize,
9908 addr + (rablkoff + bsize - off),
9909 seg, cr, nfs4_readahead) < 0) {
9910 mutex_enter(&rp->r_statelock);
9911 break;
9912 }
9913 readahead--;
9914 rablkoff += bsize;
9915 /*
9916 * Indicate that we did a readahead so
9917 * readahead offset is not updated
9918 * by the synchronous read below.
9919 */
9920 readahead_issued = 1;
9921 mutex_enter(&rp->r_statelock);
9922 /*
9923 * set readahead offset to
9924 * offset of last async readahead
9925 * request.
9926 */
9927 rp->r_nextr = rablkoff;
9928 }
9929 mutex_exit(&rp->r_statelock);
9930 }
9931
9932 again:
9933 if ((pagefound = page_exists(vp, off)) == NULL) {
9934 if (pl == NULL) {
9935 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9936 nfs4_readahead);
9937 } else if (rw == S_CREATE) {
9938 /*
9939 * Block for this page is not allocated, or the offset
9940 * is beyond the current allocation size, or we're
9941 * allocating a swap slot and the page was not found,
9942 * so allocate it and return a zero page.
9943 */
9944 if ((pp = page_create_va(vp, off,
9945 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9946 cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9947 io_len = PAGESIZE;
9948 mutex_enter(&rp->r_statelock);
9949 rp->r_nextr = off + PAGESIZE;
9950 mutex_exit(&rp->r_statelock);
9951 } else {
9952 /*
9953 * Need to go to server to get a block
9954 */
9955 mutex_enter(&rp->r_statelock);
9956 if (blkoff < rp->r_size &&
9957 blkoff + bsize > rp->r_size) {
9958 /*
9959 * If less than a block left in
9960 * file read less than a block.
9961 */
9962 if (rp->r_size <= off) {
9963 /*
9964 * Trying to access beyond EOF,
9965 * set up to get at least one page.
9966 */
9967 blksize = off + PAGESIZE - blkoff;
9968 } else
9969 blksize = rp->r_size - blkoff;
9970 } else if ((off == 0) ||
9971 (off != rp->r_nextr && !readahead_issued)) {
9972 blksize = PAGESIZE;
9973 blkoff = off; /* block = page here */
9974 } else
9975 blksize = bsize;
9976 mutex_exit(&rp->r_statelock);
9977
9978 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9979 &io_len, blkoff, blksize, 0);
9980
9981 /*
9982 * Some other thread has entered the page,
9983 * so just use it.
9984 */
9985 if (pp == NULL)
9986 goto again;
9987
9988 /*
9989 * Now round the request size up to page boundaries.
9990 * This ensures that the entire page will be
9991 * initialized to zeroes if EOF is encountered.
9992 */
9993 io_len = ptob(btopr(io_len));
9994
9995 bp = pageio_setup(pp, io_len, vp, B_READ);
9996 ASSERT(bp != NULL);
9997
9998 /*
9999 * pageio_setup should have set b_addr to 0. This
10000 * is correct since we want to do I/O on a page
10001 * boundary. bp_mapin will use this addr to calculate
10002 * an offset, and then set b_addr to the kernel virtual
10003 * address it allocated for us.
10004 */
10005 ASSERT(bp->b_un.b_addr == 0);
10006
10007 bp->b_edev = 0;
10008 bp->b_dev = 0;
10009 bp->b_lblkno = lbtodb(io_off);
10010 bp->b_file = vp;
10011 bp->b_offset = (offset_t)off;
10012 bp_mapin(bp);
10013
10014 /*
10015 * If doing a write beyond what we believe is EOF,
10016 * don't bother trying to read the pages from the
10017 * server, we'll just zero the pages here. We
10018 * don't check that the rw flag is S_WRITE here
10019 * because some implementations may attempt a
10020 * read access to the buffer before copying data.
10021 */
10022 mutex_enter(&rp->r_statelock);
10023 if (io_off >= rp->r_size && seg == segkmap) {
10024 mutex_exit(&rp->r_statelock);
10025 bzero(bp->b_un.b_addr, io_len);
10026 } else {
10027 mutex_exit(&rp->r_statelock);
10028 error = nfs4_bio(bp, NULL, cr, FALSE);
10029 }
10030
10031 /*
10032 * Unmap the buffer before freeing it.
10033 */
10034 bp_mapout(bp);
10035 pageio_done(bp);
10036
10037 savepp = pp;
10038 do {
10039 pp->p_fsdata = C_NOCOMMIT;
10040 } while ((pp = pp->p_next) != savepp);
10041
10042 if (error == NFS_EOF) {
10043 /*
10044 * If doing a write system call just return
10045 * zeroed pages, else user tried to get pages
10046 * beyond EOF, return error. We don't check
10047 * that the rw flag is S_WRITE here because
10048 * some implementations may attempt a read
10049 * access to the buffer before copying data.
10050 */
10051 if (seg == segkmap)
10052 error = 0;
10053 else
10054 error = EFAULT;
10055 }
10056
10057 if (!readahead_issued && !error) {
10058 mutex_enter(&rp->r_statelock);
10059 rp->r_nextr = io_off + io_len;
10060 mutex_exit(&rp->r_statelock);
10061 }
10062 }
10063 }
10064
10065 out:
10066 if (pl == NULL)
10067 return (error);
10068
10069 if (error) {
10070 if (pp != NULL)
10071 pvn_read_done(pp, B_ERROR);
10072 return (error);
10073 }
10074
10075 if (pagefound) {
10076 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10077
10078 /*
10079 * Page exists in the cache, acquire the appropriate lock.
10080 * If this fails, start all over again.
10081 */
10082 if ((pp = page_lookup(vp, off, se)) == NULL) {
10083 #ifdef DEBUG
10084 nfs4_lostpage++;
10085 #endif
10086 goto reread;
10087 }
10088 pl[0] = pp;
10089 pl[1] = NULL;
10090 return (0);
10091 }
10092
10093 if (pp != NULL)
10094 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10095
10096 return (error);
10097 }
10098
/*
 * Read-ahead worker: read the block at `blkoff' into newly created
 * pages.  Invoked via nfs4_async_readahead() (see nfs4_getapage()),
 * so errors are not propagated to a caller; failed pages are simply
 * destroyed by pvn_read_done(B_ERROR).
 */
static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
    cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This is correct since
	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
	 * to calculate an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		/* readahead flag TRUE: nfs4_bio knows this is speculative */
		error = nfs4_bio(bp, NULL, cr, TRUE);
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	/* Mark pages so a later flush does not require an OTW commit. */
	savepp = pp;
	do {
		pp->p_fsdata = C_NOCOMMIT;
	} while ((pp = pp->p_next) != savepp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set readahead offset
	 * to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
	 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		/* re-check under r_statelock; the unlocked test was racy */
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
10207
10208 /*
10209 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10210 * If len == 0, do from off to EOF.
10211 *
10212 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10213 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10214 * (from pageout).
10215 */
10216 /* ARGSUSED */
10217 static int
10218 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10219 caller_context_t *ct)
10220 {
10221 int error;
10222 rnode4_t *rp;
10223
10224 ASSERT(cr != NULL);
10225
10226 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10227 return (EIO);
10228
10229 rp = VTOR4(vp);
10230 if (IS_SHADOW(vp, rp))
10231 vp = RTOV4(rp);
10232
10233 /*
10234 * XXX - Why should this check be made here?
10235 */
10236 if (vp->v_flag & VNOMAP)
10237 return (ENOSYS);
10238
10239 if (len == 0 && !(flags & B_INVAL) &&
10240 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10241 return (0);
10242
10243 mutex_enter(&rp->r_statelock);
10244 rp->r_count++;
10245 mutex_exit(&rp->r_statelock);
10246 error = nfs4_putpages(vp, off, len, flags, cr);
10247 mutex_enter(&rp->r_statelock);
10248 rp->r_count--;
10249 cv_broadcast(&rp->r_cv);
10250 mutex_exit(&rp->r_statelock);
10251
10252 return (error);
10253 }
10254
10255 /*
10256 * Write out a single page, possibly klustering adjacent dirty pages.
10257 */
10258 int
10259 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10260 int flags, cred_t *cr)
10261 {
10262 u_offset_t io_off;
10263 u_offset_t lbn_off;
10264 u_offset_t lbn;
10265 size_t io_len;
10266 uint_t bsize;
10267 int error;
10268 rnode4_t *rp;
10269
10270 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10271 ASSERT(pp != NULL);
10272 ASSERT(cr != NULL);
10273 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10274
10275 rp = VTOR4(vp);
10276 ASSERT(rp->r_count > 0);
10277 ASSERT(!IS_SHADOW(vp, rp));
10278
10279 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10280 lbn = pp->p_offset / bsize;
10281 lbn_off = lbn * bsize;
10282
10283 /*
10284 * Find a kluster that fits in one block, or in
10285 * one page if pages are bigger than blocks. If
10286 * there is less file space allocated than a whole
10287 * page, we'll shorten the i/o request below.
10288 */
10289 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10290 roundup(bsize, PAGESIZE), flags);
10291
10292 /*
10293 * pvn_write_kluster shouldn't have returned a page with offset
10294 * behind the original page we were given. Verify that.
10295 */
10296 ASSERT((pp->p_offset / bsize) >= lbn);
10297
10298 /*
10299 * Now pp will have the list of kept dirty pages marked for
10300 * write back. It will also handle invalidation and freeing
10301 * of pages that are not dirty. Check for page length rounding
10302 * problems.
10303 */
10304 if (io_off + io_len > lbn_off + bsize) {
10305 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10306 io_len = lbn_off + bsize - io_off;
10307 }
10308 /*
10309 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10310 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10311 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10312 * progress and the r_size has not been made consistent with the
10313 * new size of the file. When the uiomove() completes the r_size is
10314 * updated and the R4MODINPROGRESS flag is cleared.
10315 *
10316 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10317 * consistent value of r_size. Without this handshaking, it is
10318 * possible that nfs4_bio() picks up the old value of r_size
10319 * before the uiomove() in writerp4() completes. This will result
10320 * in the write through nfs4_bio() being dropped.
10321 *
10322 * More precisely, there is a window between the time the uiomove()
10323 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10324 * operation intervenes in this window, the page will be picked up,
10325 * because it is dirty (it will be unlocked, unless it was
10326 * pagecreate'd). When the page is picked up as dirty, the dirty
10327 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10328 * checked. This will still be the old size. Therefore the page will
10329 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10330 * the page will be found to be clean and the write will be dropped.
10331 */
10332 if (rp->r_flags & R4MODINPROGRESS) {
10333 mutex_enter(&rp->r_statelock);
10334 if ((rp->r_flags & R4MODINPROGRESS) &&
10335 rp->r_modaddr + MAXBSIZE > io_off &&
10336 rp->r_modaddr < io_off + io_len) {
10337 page_t *plist;
10338 /*
10339 * A write is in progress for this region of the file.
10340 * If we did not detect R4MODINPROGRESS here then this
10341 * path through nfs_putapage() would eventually go to
10342 * nfs4_bio() and may not write out all of the data
10343 * in the pages. We end up losing data. So we decide
10344 * to set the modified bit on each page in the page
10345 * list and mark the rnode with R4DIRTY. This write
10346 * will be restarted at some later time.
10347 */
10348 plist = pp;
10349 while (plist != NULL) {
10350 pp = plist;
10351 page_sub(&plist, pp);
10352 hat_setmod(pp);
10353 page_io_unlock(pp);
10354 page_unlock(pp);
10355 }
10356 rp->r_flags |= R4DIRTY;
10357 mutex_exit(&rp->r_statelock);
10358 if (offp)
10359 *offp = io_off;
10360 if (lenp)
10361 *lenp = io_len;
10362 return (0);
10363 }
10364 mutex_exit(&rp->r_statelock);
10365 }
10366
10367 if (flags & B_ASYNC) {
10368 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10369 nfs4_sync_putapage);
10370 } else
10371 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10372
10373 if (offp)
10374 *offp = io_off;
10375 if (lenp)
10376 *lenp = io_len;
10377 return (error);
10378 }
10379
/*
 * Synchronously write the page list `pp' ([io_off, io_off+io_len)).
 * On space/permission errors (ENOSPC, EDQUOT, EFBIG, EACCES) the rnode
 * is marked R4OUTOFSPACE and, for non-async callers, the write is
 * retried with B_INVAL|B_FORCE so the pages are destroyed rather than
 * left to accumulate in memory.
 */
static int
nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	flags |= B_WRITE;

	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR4(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
	    error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		if (!(rp->r_flags & R4OUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful.  This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread.  It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them.  Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs4_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & R4OUTOFSPACE) {
			/* the write succeeded again; clear the sticky flag */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4OUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
		/* Under memory pressure, nudge the server to commit. */
		if (freemem < desfree)
			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
			    NFS4_WRITE_NOWAIT);
	}

	return (error);
}
10442
#ifdef DEBUG
/*
 * Debug knob: when nonzero, nfs4_map() fails with EIO instead of
 * implicitly OPENing a file that is mmap'd without a prior VOP_OPEN
 * (see the osp == NULL path in nfs4_map()).
 */
int nfs4_force_open_before_mmap = 0;
#endif
10446
10447 /* ARGSUSED */
10448 static int
10449 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10450 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10451 caller_context_t *ct)
10452 {
10453 struct segvn_crargs vn_a;
10454 int error = 0;
10455 rnode4_t *rp = VTOR4(vp);
10456 mntinfo4_t *mi = VTOMI4(vp);
10457
10458 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10459 return (EIO);
10460
10461 if (vp->v_flag & VNOMAP)
10462 return (ENOSYS);
10463
10464 if (off < 0 || (off + len) < 0)
10465 return (ENXIO);
10466
10467 if (vp->v_type != VREG)
10468 return (ENODEV);
10469
10470 /*
10471 * If the file is delegated to the client don't do anything.
10472 * If the file is not delegated, then validate the data cache.
10473 */
10474 mutex_enter(&rp->r_statev4_lock);
10475 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10476 mutex_exit(&rp->r_statev4_lock);
10477 error = nfs4_validate_caches(vp, cr);
10478 if (error)
10479 return (error);
10480 } else {
10481 mutex_exit(&rp->r_statev4_lock);
10482 }
10483
10484 /*
10485 * Check to see if the vnode is currently marked as not cachable.
10486 * This means portions of the file are locked (through VOP_FRLOCK).
10487 * In this case the map request must be refused. We use
10488 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10489 *
10490 * Atomically increment r_inmap after acquiring r_rwlock. The
10491 * idea here is to acquire r_rwlock to block read/write and
10492 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10493 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10494 * and we can prevent the deadlock that would have occurred
10495 * when nfs4_addmap() would have acquired it out of order.
10496 *
10497 * Since we are not protecting r_inmap by any lock, we do not
10498 * hold any lock when we decrement it. We atomically decrement
10499 * r_inmap after we release r_lkserlock.
10500 */
10501
10502 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10503 return (EINTR);
10504 atomic_inc_uint(&rp->r_inmap);
10505 nfs_rw_exit(&rp->r_rwlock);
10506
10507 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10508 atomic_dec_uint(&rp->r_inmap);
10509 return (EINTR);
10510 }
10511
10512 if (vp->v_flag & VNOCACHE) {
10513 error = EAGAIN;
10514 goto done;
10515 }
10516
10517 /*
10518 * Don't allow concurrent locks and mapping if mandatory locking is
10519 * enabled.
10520 */
10521 if (flk_has_remote_locks(vp)) {
10522 struct vattr va;
10523 va.va_mask = AT_MODE;
10524 error = nfs4getattr(vp, &va, cr);
10525 if (error != 0)
10526 goto done;
10527 if (MANDLOCK(vp, va.va_mode)) {
10528 error = EAGAIN;
10529 goto done;
10530 }
10531 }
10532
10533 /*
10534 * It is possible that the rnode has a lost lock request that we
10535 * are still trying to recover, and that the request conflicts with
10536 * this map request.
10537 *
10538 * An alternative approach would be for nfs4_safemap() to consider
10539 * queued lock requests when deciding whether to set or clear
10540 * VNOCACHE. This would require the frlock code path to call
10541 * nfs4_safemap() after enqueing a lost request.
10542 */
10543 if (nfs4_map_lost_lock_conflict(vp)) {
10544 error = EAGAIN;
10545 goto done;
10546 }
10547
10548 as_rangelock(as);
10549 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10550 if (error != 0) {
10551 as_rangeunlock(as);
10552 goto done;
10553 }
10554
10555 if (vp->v_type == VREG) {
10556 /*
10557 * We need to retrieve the open stream
10558 */
10559 nfs4_open_stream_t *osp = NULL;
10560 nfs4_open_owner_t *oop = NULL;
10561
10562 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10563 if (oop != NULL) {
10564 /* returns with 'os_sync_lock' held */
10565 osp = find_open_stream(oop, rp);
10566 open_owner_rele(oop);
10567 }
10568 if (osp == NULL) {
10569 #ifdef DEBUG
10570 if (nfs4_force_open_before_mmap) {
10571 error = EIO;
10572 goto done;
10573 }
10574 #endif
10575 /* returns with 'os_sync_lock' held */
10576 error = open_and_get_osp(vp, cr, &osp);
10577 if (osp == NULL) {
10578 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10579 "nfs4_map: we tried to OPEN the file "
10580 "but again no osp, so fail with EIO"));
10581 goto done;
10582 }
10583 }
10584
10585 if (osp->os_failed_reopen) {
10586 mutex_exit(&osp->os_sync_lock);
10587 open_stream_rele(osp, rp);
10588 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10589 "nfs4_map: os_failed_reopen set on "
10590 "osp %p, cr %p, rp %s", (void *)osp,
10591 (void *)cr, rnode4info(rp)));
10592 error = EIO;
10593 goto done;
10594 }
10595 mutex_exit(&osp->os_sync_lock);
10596 open_stream_rele(osp, rp);
10597 }
10598
10599 vn_a.vp = vp;
10600 vn_a.offset = off;
10601 vn_a.type = (flags & MAP_TYPE);
10602 vn_a.prot = (uchar_t)prot;
10603 vn_a.maxprot = (uchar_t)maxprot;
10604 vn_a.flags = (flags & ~MAP_TYPE);
10605 vn_a.cred = cr;
10606 vn_a.amp = NULL;
10607 vn_a.szc = 0;
10608 vn_a.lgrp_mem_policy_flags = 0;
10609
10610 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10611 as_rangeunlock(as);
10612
10613 done:
10614 nfs_rw_exit(&rp->r_lkserlock);
10615 atomic_dec_uint(&rp->r_inmap);
10616 return (error);
10617 }
10618
10619 /*
10620 * We're most likely dealing with a kernel module that likes to READ
10621 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10622 * officially OPEN the file to create the necessary client state
10623 * for bookkeeping of os_mmap_read/write counts.
10624 *
10625 * Since VOP_MAP only passes in a pointer to the vnode rather than
10626 * a double pointer, we can't handle the case where nfs4open_otw()
10627 * returns a different vnode than the one passed into VOP_MAP (since
10628 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10629 * we return NULL and let nfs4_map() fail. Note: the only case where
10630 * this should happen is if the file got removed and replaced with the
10631 * same name on the server (in addition to the fact that we're trying
10632 * to VOP_MAP withouth VOP_OPENing the file in the first place).
10633 */
10634 static int
10635 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10636 {
10637 rnode4_t *rp, *drp;
10638 vnode_t *dvp, *open_vp;
10639 char file_name[MAXNAMELEN];
10640 int just_created;
10641 nfs4_open_stream_t *osp;
10642 nfs4_open_owner_t *oop;
10643 int error;
10644
10645 *ospp = NULL;
10646 open_vp = map_vp;
10647
10648 rp = VTOR4(open_vp);
10649 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10650 return (error);
10651 drp = VTOR4(dvp);
10652
10653 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10654 VN_RELE(dvp);
10655 return (EINTR);
10656 }
10657
10658 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10659 nfs_rw_exit(&drp->r_rwlock);
10660 VN_RELE(dvp);
10661 return (error);
10662 }
10663
10664 mutex_enter(&rp->r_statev4_lock);
10665 if (rp->created_v4) {
10666 rp->created_v4 = 0;
10667 mutex_exit(&rp->r_statev4_lock);
10668
10669 dnlc_update(dvp, file_name, open_vp);
10670 /* This is needed so we don't bump the open ref count */
10671 just_created = 1;
10672 } else {
10673 mutex_exit(&rp->r_statev4_lock);
10674 just_created = 0;
10675 }
10676
10677 VN_HOLD(map_vp);
10678
10679 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10680 just_created);
10681 if (error) {
10682 nfs_rw_exit(&drp->r_rwlock);
10683 VN_RELE(dvp);
10684 VN_RELE(map_vp);
10685 return (error);
10686 }
10687
10688 nfs_rw_exit(&drp->r_rwlock);
10689 VN_RELE(dvp);
10690
10691 /*
10692 * If nfs4open_otw() returned a different vnode then "undo"
10693 * the open and return failure to the caller.
10694 */
10695 if (!VN_CMP(open_vp, map_vp)) {
10696 nfs4_error_t e;
10697
10698 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10699 "open returned a different vnode"));
10700 /*
10701 * If there's an error, ignore it,
10702 * and let VOP_INACTIVE handle it.
10703 */
10704 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10705 CLOSE_NORM, 0, 0, 0);
10706 VN_RELE(map_vp);
10707 return (EIO);
10708 }
10709
10710 VN_RELE(map_vp);
10711
10712 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10713 if (!oop) {
10714 nfs4_error_t e;
10715
10716 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10717 "no open owner"));
10718 /*
10719 * If there's an error, ignore it,
10720 * and let VOP_INACTIVE handle it.
10721 */
10722 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10723 CLOSE_NORM, 0, 0, 0);
10724 return (EIO);
10725 }
10726 osp = find_open_stream(oop, rp);
10727 open_owner_rele(oop);
10728 *ospp = osp;
10729 return (0);
10730 }
10731
10732 /*
10733 * Please be aware that when this function is called, the address space write
10734 * a_lock is held. Do not put over the wire calls in this function.
10735 */
10736 /* ARGSUSED */
10737 static int
10738 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10739 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10740 caller_context_t *ct)
10741 {
10742 rnode4_t *rp;
10743 int error = 0;
10744 mntinfo4_t *mi;
10745
10746 mi = VTOMI4(vp);
10747 rp = VTOR4(vp);
10748
10749 if (nfs_zone() != mi->mi_zone)
10750 return (EIO);
10751 if (vp->v_flag & VNOMAP)
10752 return (ENOSYS);
10753
10754 /*
10755 * Don't need to update the open stream first, since this
10756 * mmap can't add any additional share access that isn't
10757 * already contained in the open stream (for the case where we
10758 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
10759 * take into account os_mmap_read[write] counts).
10760 */
10761 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10762
10763 if (vp->v_type == VREG) {
10764 /*
10765 * We need to retrieve the open stream and update the counts.
10766 * If there is no open stream here, something is wrong.
10767 */
10768 nfs4_open_stream_t *osp = NULL;
10769 nfs4_open_owner_t *oop = NULL;
10770
10771 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10772 if (oop != NULL) {
10773 /* returns with 'os_sync_lock' held */
10774 osp = find_open_stream(oop, rp);
10775 open_owner_rele(oop);
10776 }
10777 if (osp == NULL) {
10778 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10779 "nfs4_addmap: we should have an osp"
10780 "but we don't, so fail with EIO"));
10781 error = EIO;
10782 goto out;
10783 }
10784
10785 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10786 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10787
10788 /*
10789 * Update the map count in the open stream.
10790 * This is necessary in the case where we
10791 * open/mmap/close/, then the server reboots, and we
10792 * attempt to reopen. If the mmap doesn't add share
10793 * access then we send an invalid reopen with
10794 * access = NONE.
10795 *
10796 * We need to specifically check each PROT_* so a mmap
10797 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
10798 * read and write access. A simple comparison of prot
10799 * to ~PROT_WRITE to determine read access is insufficient
10800 * since prot can be |= with PROT_USER, etc.
10801 */
10802
10803 /*
10804 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10805 */
10806 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10807 osp->os_mmap_write += btopr(len);
10808 if (maxprot & PROT_READ)
10809 osp->os_mmap_read += btopr(len);
10810 if (maxprot & PROT_EXEC)
10811 osp->os_mmap_read += btopr(len);
10812 /*
10813 * Ensure that os_mmap_read gets incremented, even if
10814 * maxprot were to look like PROT_NONE.
10815 */
10816 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10817 !(maxprot & PROT_EXEC))
10818 osp->os_mmap_read += btopr(len);
10819 osp->os_mapcnt += btopr(len);
10820 mutex_exit(&osp->os_sync_lock);
10821 open_stream_rele(osp, rp);
10822 }
10823
10824 out:
10825 /*
10826 * If we got an error, then undo our
10827 * incrementing of 'r_mapcnt'.
10828 */
10829
10830 if (error) {
10831 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10832 ASSERT(rp->r_mapcnt >= 0);
10833 }
10834 return (error);
10835 }
10836
10837 /* ARGSUSED */
10838 static int
10839 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10840 {
10841
10842 return (VTOR4(vp1) == VTOR4(vp2));
10843 }
10844
/*
 * VOP_FRLOCK for NFSv4: advisory record locking.
 *
 * cmd is F_GETLK, F_SETLK or F_SETLKW; bfp describes the requested lock.
 * Filesystems mounted with local locking (MI4_LLOCK), and non-regular
 * files, are handed to the local locking code via fs_frlock().  Otherwise
 * the request is sent over the wire through nfs4frlock().
 *
 * Returns 0 on success or an errno (EIO wrong zone, EINVAL bad cmd/type,
 * EBADF mode mismatch, EAGAIN mapping conflict, EINTR, ENOLCK, ...).
 */
/* ARGSUSED */
static int
nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
    offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
    caller_context_t *ct)
{
	int rc;
	u_offset_t start, end;
	rnode4_t *rp;
	int error = 0, intr = INTR4(vp);
	nfs4_error_t e;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/*
		 * Never interrupt an unlock request; giving up partway
		 * could leave an orphan lock held on the server.
		 */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXEND))
		return (rc);
	/* rc is 0 from here on unless explicitly set below */

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock.  However, we can't call
			 * nfs4_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * nfs4_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!nfs4_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR4(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.  r_lkserlock serializes lock requests
	 * against mmap activity on this file.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!nfs4_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish.  For new
	 * locks, this is so that the process gets the latest bits from the
	 * server.  For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked.  If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set.  But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		mutex_enter(&rp->r_statelock);
		/* wait for outstanding page I/O (r_count) to drain */
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				/* lwp_nostop keeps /proc from stopping us mid-wait */
				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv,
				    &rp->r_statelock) == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else {
				cv_wait(&rp->r_cv, &rp->r_statelock);
			}
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				/* record the first hard error on the rnode */
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			/* flush failure only blocks lock sets, not unlocks */
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
	    cr, &e, NULL, NULL);
	rc = e.error;

	if (rc == 0)
		nfs4_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);

	return (rc);
}
10987
10988 /*
10989 * Free storage space associated with the specified vnode. The portion
10990 * to be freed is specified by bfp->l_start and bfp->l_len (already
10991 * normalized to a "whence" of 0).
10992 *
10993 * This is an experimental facility whose continued existence is not
10994 * guaranteed. Currently, we only support the special case
10995 * of l_len == 0, meaning free to end of file.
10996 */
10997 /* ARGSUSED */
10998 static int
10999 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
11000 offset_t offset, cred_t *cr, caller_context_t *ct)
11001 {
11002 int error;
11003
11004 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11005 return (EIO);
11006 ASSERT(vp->v_type == VREG);
11007 if (cmd != F_FREESP)
11008 return (EINVAL);
11009
11010 error = convoff(vp, bfp, 0, offset);
11011 if (!error) {
11012 ASSERT(bfp->l_start >= 0);
11013 if (bfp->l_len == 0) {
11014 struct vattr va;
11015
11016 va.va_mask = AT_SIZE;
11017 va.va_size = bfp->l_start;
11018 error = nfs4setattr(vp, &va, 0, cr, NULL);
11019
11020 if (error == 0) {
11021 if (bfp->l_start == 0) {
11022 vnevent_truncate(vp, ct);
11023 } else {
11024 vnevent_resize(vp, ct);
11025 }
11026 }
11027 } else
11028 error = EINVAL;
11029 }
11030
11031 return (error);
11032 }
11033
11034 /* ARGSUSED */
11035 int
11036 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11037 {
11038 rnode4_t *rp;
11039 rp = VTOR4(vp);
11040
11041 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11042 vp = RTOV4(rp);
11043 }
11044 *vpp = vp;
11045 return (0);
11046 }
11047
/*
 * Setup and add an address space callback to do the work of the delmap call.
 * The callback will (and must be) deleted in the actual callback function.
 *
 * This is done in order to take care of the problem that we have with holding
 * the address space's a_lock for a long period of time (e.g. if the NFS server
 * is down). Callbacks will be executed in the address space code while the
 * a_lock is not held. Holding the address space's a_lock causes things such
 * as ps and fork to hang because they are trying to acquire this lock as well.
 */
/* ARGSUSED */
static int
nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	int caller_found;
	int error;
	rnode4_t *rp;
	nfs4_delmap_args_t *dmapp;
	nfs4_delmapcall_t *delmap_call;

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/*
	 * A process may not change zones if it has NFS pages mmap'ed
	 * in, so we can't legitimately get here from the wrong zone.
	 */
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * The way that the address space of this process deletes its mapping
	 * of this file is via the following call chains:
	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
	 *
	 * With the use of address space callbacks we are allowed to drop the
	 * address space lock, a_lock, while executing the NFS operations that
	 * need to go over the wire.  Returning EAGAIN to the caller of this
	 * function is what drives the execution of the callback that we add
	 * below.  The callback will be executed by the address space code
	 * after dropping the a_lock.  When the callback is finished, since
	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
	 * is called again on the same segment to finish the rest of the work
	 * that needs to happen during unmapping.
	 *
	 * This action of calling back into the segment driver causes
	 * nfs4_delmap() to get called again, but since the callback was
	 * already executed at this point, it already did the work and there
	 * is nothing left for us to do.
	 *
	 * To Summarize:
	 * - The first time nfs4_delmap is called by the current thread is when
	 * we add the caller associated with this delmap to the delmap caller
	 * list, add the callback, and return EAGAIN.
	 * - The second time in this call chain when nfs4_delmap is called we
	 * will find this caller in the delmap caller list and realize there
	 * is no more work to do thus removing this caller from the list and
	 * returning the error that was set in the callback execution.
	 */
	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
	if (caller_found) {
		/*
		 * 'error' is from the actual delmap operations.  To avoid
		 * hangs, we need to handle the return of EAGAIN differently
		 * since this is what drives the callback execution.
		 * In this case, we don't want to return EAGAIN and do the
		 * callback execution because there are none to execute.
		 */
		if (error == EAGAIN)
			return (0);
		else
			return (error);
	}

	/* current caller was not in the list */
	delmap_call = nfs4_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * Package up the arguments for the callback.  dmapp (and the
	 * callback registration) is freed by nfs4_delmap_callback().
	 */
	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);

	dmapp->vp = vp;
	dmapp->off = off;
	dmapp->addr = addr;
	dmapp->len = len;
	dmapp->prot = prot;
	dmapp->maxprot = maxprot;
	dmapp->flags = flags;
	dmapp->cr = cr;
	dmapp->caller = delmap_call;

	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);

	/* EAGAIN tells the address space layer to run the callback */
	return (error ? error : EAGAIN);
}
11150
11151 static nfs4_delmapcall_t *
11152 nfs4_init_delmapcall()
11153 {
11154 nfs4_delmapcall_t *delmap_call;
11155
11156 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11157 delmap_call->call_id = curthread;
11158 delmap_call->error = 0;
11159
11160 return (delmap_call);
11161 }
11162
/*
 * Release a delmap caller entry allocated by nfs4_init_delmapcall().
 */
static void
nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
{
	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
}
11168
11169 /*
11170 * Searches for the current delmap caller (based on curthread) in the list of
11171 * callers. If it is found, we remove it and free the delmap caller.
11172 * Returns:
11173 * 0 if the caller wasn't found
11174 * 1 if the caller was found, removed and freed. *errp will be set
11175 * to what the result of the delmap was.
11176 */
11177 static int
11178 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11179 {
11180 nfs4_delmapcall_t *delmap_call;
11181
11182 /*
11183 * If the list doesn't exist yet, we create it and return
11184 * that the caller wasn't found. No list = no callers.
11185 */
11186 mutex_enter(&rp->r_statelock);
11187 if (!(rp->r_flags & R4DELMAPLIST)) {
11188 /* The list does not exist */
11189 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11190 offsetof(nfs4_delmapcall_t, call_node));
11191 rp->r_flags |= R4DELMAPLIST;
11192 mutex_exit(&rp->r_statelock);
11193 return (0);
11194 } else {
11195 /* The list exists so search it */
11196 for (delmap_call = list_head(&rp->r_indelmap);
11197 delmap_call != NULL;
11198 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11199 if (delmap_call->call_id == curthread) {
11200 /* current caller is in the list */
11201 *errp = delmap_call->error;
11202 list_remove(&rp->r_indelmap, delmap_call);
11203 mutex_exit(&rp->r_statelock);
11204 nfs4_free_delmapcall(delmap_call);
11205 return (1);
11206 }
11207 }
11208 }
11209 mutex_exit(&rp->r_statelock);
11210 return (0);
11211 }
11212
/*
 * Remove some pages from an mmap'd vnode.  Just update the
 * count of pages.  If doing close-to-open, then flush and
 * commit all of the pages associated with this file.
 * Otherwise, start an asynchronous page flush to write out
 * any dirty pages.  This will also associate a credential
 * with the rnode which can be used to write the pages.
 *
 * Executed by the address space code after a_lock has been dropped;
 * 'arg' is the nfs4_delmap_args_t packaged by nfs4_delmap(), and is
 * freed here along with the callback registration.  Errors are
 * reported back to nfs4_delmap() via dmapp->caller->error.
 */
/* ARGSUSED */
static void
nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
{
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;

	rp = VTOR4(dmapp->vp);
	mi = VTOMI4(dmapp->vp);

	/* drop the mapped-page count added by nfs4_addmap() */
	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
	ASSERT(rp->r_mapcnt >= 0);

	/*
	 * Initiate a page flush and potential commit if there are
	 * pages, the file system was not mounted readonly, the segment
	 * was mapped shared, and the pages themselves were writeable.
	 *
	 * NOTE: dmapp->flags is compared for exact equality with
	 * MAP_SHARED, so any additional MAP_* bits skip this path.
	 */
	if (nfs4_has_pages(dmapp->vp) &&
	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4DIRTY;
		mutex_exit(&rp->r_statelock);
		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
		    dmapp->len, dmapp->cr);
		if (!e.error) {
			/* pick up any error recorded on the rnode */
			mutex_enter(&rp->r_statelock);
			e.error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else
		e.error = 0;

	/* for directio, invalidate the cached pages outright */
	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
		    B_INVAL, dmapp->cr, NULL);

	if (e.error) {
		e.stat = puterrno4(e.error);
		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
		dmapp->caller->error = e.error;
	}

	/* Check to see if we need to close the file */

	if (dmapp->vp->v_type == VREG) {
		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);

		if (e.error != 0 || e.stat != NFS4_OK) {
			/*
			 * Since it is possible that e.error == 0 and
			 * e.stat != NFS4_OK (and vice versa),
			 * we do the proper checking in order to get both
			 * e.error and e.stat reporting the correct info.
			 */
			if (e.stat == NFS4_OK)
				e.stat = puterrno4(e.error);
			if (e.error == 0)
				e.error = geterrno4(e.stat);

			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
			dmapp->caller->error = e.error;
		}
	}

	/* callback and its argument are single-shot: tear them down */
	(void) as_delete_callback(as, arg);
	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
}
11296
11297
11298 static uint_t
11299 fattr4_maxfilesize_to_bits(uint64_t ll)
11300 {
11301 uint_t l = 1;
11302
11303 if (ll == 0) {
11304 return (0);
11305 }
11306
11307 if (ll & 0xffffffff00000000) {
11308 l += 32; ll >>= 32;
11309 }
11310 if (ll & 0xffff0000) {
11311 l += 16; ll >>= 16;
11312 }
11313 if (ll & 0xff00) {
11314 l += 8; ll >>= 8;
11315 }
11316 if (ll & 0xf0) {
11317 l += 4; ll >>= 4;
11318 }
11319 if (ll & 0xc) {
11320 l += 2; ll >>= 2;
11321 }
11322 if (ll & 0x2) {
11323 l += 1;
11324 }
11325 return (l);
11326 }
11327
11328 static int
11329 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11330 {
11331 vnode_t *avp = NULL;
11332 int error;
11333
11334 if ((error = nfs4lookup_xattr(vp, "", &avp,
11335 LOOKUP_XATTR, cr)) == 0)
11336 error = do_xattr_exists_check(avp, valp, cr);
11337 if (avp)
11338 VN_RELE(avp);
11339
11340 return (error);
11341 }
11342
/*
 * VOP_PATHCONF for NFSv4.  Answers _PC_* queries, serving from the
 * cached pathconf info in the rnode when it is valid and otherwise
 * fetching fresh attributes over the wire.  _PC_XATTR_EXISTS requires
 * an additional check of the xattr directory contents.
 */
/* ARGSUSED */
int
nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	hrtime_t t;
	rnode4_t *rp;
	nfs4_ga_res_t gar;
	nfs4_ga_ext_res_t ger;

	gar.n4g_ext_res = &ger;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/* queries with fixed answers need no OTW call */
	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
		*valp = MAXPATHLEN;
		return (0);
	}
	if (cmd == _PC_ACL_ENABLED) {
		*valp = _ACL_ACE_ENABLED;
		return (0);
	}

	rp = VTOR4(vp);
	if (cmd == _PC_XATTR_EXISTS) {
		/*
		 * The existence of the xattr directory is not sufficient
		 * for determining whether generic user attributes exists.
		 * The attribute directory could only be a transient directory
		 * used for Solaris sysattr support.  Do a small readdir
		 * to verify if the only entries are sysattrs or not.
		 *
		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
		 * is NULL.  Once the xadir vp exists, we can create xattrs,
		 * and we don't have any way to update the "base" object's
		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
		 * could help out.
		 *
		 * NOTE(review): this reads pc4_xattr_valid/r_xattr_dir
		 * without r_statelock, unlike the cached path below --
		 * presumably a racy-but-benign check; confirm.
		 */
		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
		    rp->r_xattr_dir == NULL) {
			return (nfs4_have_xattrs(vp, valp, cr));
		}
	} else {  /* OLD CODE */
		if (ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_pathconf.pc4_cache_valid) {
				error = 0;
				switch (cmd) {
				case _PC_FILESIZEBITS:
					*valp =
					    rp->r_pathconf.pc4_filesizebits;
					break;
				case _PC_LINK_MAX:
					*valp =
					    rp->r_pathconf.pc4_link_max;
					break;
				case _PC_NAME_MAX:
					*valp =
					    rp->r_pathconf.pc4_name_max;
					break;
				case _PC_CHOWN_RESTRICTED:
					*valp =
					    rp->r_pathconf.pc4_chown_restricted;
					break;
				case _PC_NO_TRUNC:
					*valp =
					    rp->r_pathconf.pc4_no_trunc;
					break;
				default:
					error = EINVAL;
					break;
				}
				mutex_exit(&rp->r_statelock);
#ifdef DEBUG
				nfs4_pathconf_cache_hits++;
#endif
				return (error);
			}
			mutex_exit(&rp->r_statelock);
		}
	}
#ifdef DEBUG
	nfs4_pathconf_cache_misses++;
#endif

	/* cache miss: fetch pathconf attributes over the wire */
	t = gethrtime();

	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);

	if (error) {
		/* fetch failed: mark cached pathconf info untrustworthy */
		mutex_enter(&rp->r_statelock);
		rp->r_pathconf.pc4_cache_valid = FALSE;
		rp->r_pathconf.pc4_xattr_valid = FALSE;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* interpret the max filesize */
	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);

	/* Store the attributes we just received */
	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);

	switch (cmd) {
	case _PC_FILESIZEBITS:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
		break;
	case _PC_LINK_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
		break;
	case _PC_NAME_MAX:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
		break;
	case _PC_CHOWN_RESTRICTED:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
		break;
	case _PC_NO_TRUNC:
		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
		break;
	case _PC_XATTR_EXISTS:
		/*
		 * NOTE(review): when pc4_xattr_exists is false, *valp is
		 * left untouched here -- presumably the caller initializes
		 * it; confirm.
		 */
		if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
			if (error = nfs4_have_xattrs(vp, valp, cr))
				return (error);
		}
		break;
	default:
		return (EINVAL);
	}

	return (0);
}
11476
11477 /*
11478 * Called by async thread to do synchronous pageio. Do the i/o, wait
11479 * for it to complete, and cleanup the page list when done.
11480 */
11481 static int
11482 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11483 int flags, cred_t *cr)
11484 {
11485 int error;
11486
11487 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11488
11489 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11490 if (flags & B_READ)
11491 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11492 else
11493 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11494 return (error);
11495 }
11496
11497 /* ARGSUSED */
11498 static int
11499 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11500 int flags, cred_t *cr, caller_context_t *ct)
11501 {
11502 int error;
11503 rnode4_t *rp;
11504
11505 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11506 return (EIO);
11507
11508 if (pp == NULL)
11509 return (EINVAL);
11510
11511 rp = VTOR4(vp);
11512 mutex_enter(&rp->r_statelock);
11513 rp->r_count++;
11514 mutex_exit(&rp->r_statelock);
11515
11516 if (flags & B_ASYNC) {
11517 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11518 nfs4_sync_pageio);
11519 } else
11520 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11521 mutex_enter(&rp->r_statelock);
11522 rp->r_count--;
11523 cv_broadcast(&rp->r_cv);
11524 mutex_exit(&rp->r_statelock);
11525 return (error);
11526 }
11527
/*
 * VOP_DISPOSE for NFSv4: free (fl == B_FREE) or destroy (fl == B_INVAL)
 * a page, first committing it to the server if its p_fsdata marks it as
 * needing a commit.  To amortize the cost, a commit gathers as many
 * eligible pages as possible (via nfs4_get_commit) into one COMMIT op.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of commiting
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.  This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/* R4COMMIT is ours; the commit struct needs no further locking */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout/fsflush daemons and foreign zones must not issue the
	 * OTW commit themselves; hand it to an async worker instead.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are nolonger
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, needs to
	 * be rewritten so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
11734
/*
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *      PUTFH(file), COMMIT
 *
 * Sends a COMMIT for [offset, offset+count) over the wire, retrying
 * through the recovery framework as needed and re-fetching the OTW
 * credential on EACCES.  On success the server's write verifier is
 * compared against the cached one; a mismatch marks the cached pages
 * modified (nfs4_set_mod) and returns NFS_VERF_MISMATCH so the data
 * gets rewritten.  Returns 0 or an errno.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		/* retry once more with a fresh OTW credential */
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* verifier matches: the commit is good */
			mutex_exit(&rp->r_statelock);
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * Verifier changed (e.g. server reboot): data must be
		 * rewritten, so mark the pages modified.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
11892
11893 static void
11894 nfs4_set_mod(vnode_t *vp)
11895 {
11896 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11897
11898 /* make sure we're looking at the master vnode, not a shadow */
11899 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11900 }
11901
11902 /*
11903 * This function is used to gather a page list of the pages which
11904 * can be committed on the server.
11905 *
11906 * The calling thread must have set R4COMMIT. This bit is used to
11907 * serialize access to the commit structure in the rnode. As long
11908 * as the thread has set R4COMMIT, then it can manipulate the commit
11909 * structure without requiring any other locks.
11910 *
11911 * When this function is called from nfs4_dispose() the page passed
11912 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11913 * will skip it. This is not a problem since we initially add the
11914 * page to the r_commit page list.
11915 *
11916 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	/* Caller must hold the commit-serialization bit (see block comment). */
	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/* The v_pages list is protected by the page vnode mutex. */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified if so then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page. If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with.. anyway just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Lets check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.  The (c_commbase, c_commlen) pair is
		 * grown to cover every page added to the list.
		 */
		if (rp->r_commit.c_pages == NULL) {
			/* First page: range covers exactly this page. */
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			/* Page lies below the range: extend downward. */
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			/* Page lies above the range: extend upward. */
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);

	mutex_exit(vphm);
}
12004
12005 /*
12006 * This routine is used to gather together a page list of the pages
12007 * which are to be committed on the server. This routine must not
12008 * be called if the calling thread holds any locked pages.
12009 *
12010 * The calling thread must have set R4COMMIT. This bit is used to
12011 * serialize access to the commit structure in the rnode. As long
12012 * as the thread has set R4COMMIT, then it can manipulate the commit
12013 * structure without requiring any other locks.
12014 */
static void
nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
{

	rnode4_t *rp;
	page_t *pp;
	u_offset_t end;
	u_offset_t off;
	ASSERT(len != 0);
	rp = VTOR4(vp);
	/* Caller must hold the commit-serialization bit (see block comment). */
	ASSERT(rp->r_flags & R4COMMIT);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL)
		return;
	/*
	 * Calculate the ending offset.
	 */
	end = soff + len;
	for (off = soff; off < end; off += PAGESIZE) {
		/*
		 * Lookup each page by vp, offset.  nowait keeps us from
		 * blocking on a page someone else holds (or the page
		 * nfs4_dispose was called with).
		 */
		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
			continue;
		/*
		 * If this page does not need to be committed or is
		 * modified, then just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		ASSERT(PP_ISFREE(pp) == 0);
		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_pages.  Offsets only grow here, so the range
		 * is extended upward as pages are found.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	}
}
12075
12076 /*
12077 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12078 * Flushes and commits data to the server.
12079 */
static int
nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
{
	int error;
	verifier4 write_verf;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/*
	 * Flush the data portion of the file and then commit any
	 * portions which need to be committed. This may need to
	 * be done twice if the server has changed state since
	 * data was last written. The data will need to be
	 * rewritten to the server and then a new commit done.
	 *
	 * In fact, this may need to be done several times if the
	 * server is having problems and crashing while we are
	 * attempting to do this.
	 */

top:
	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen). This starts all of the
	 * i/o operations which will be waited for in the next
	 * call to nfs4_putpage
	 */

	/* Snapshot the write verifier to detect a server restart below. */
	mutex_enter(&rp->r_statelock);
	write_verf = rp->r_writeverf;
	mutex_exit(&rp->r_statelock);

	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
	if (error == EAGAIN)
		error = 0;	/* EAGAIN from the async pass is not fatal */

	/*
	 * Do a flush based on the poff and plen arguments. This
	 * will synchronously write out any modified pages in the
	 * range specified by (poff, plen) and wait until all of
	 * the asynchronous i/o's in that range are done as well.
	 */
	if (!error)
		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);

	if (error)
		return (error);

	/*
	 * If the verifier changed while we were flushing, the server
	 * may have lost the writes; go back and flush again.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_writeverf != write_verf) {
		mutex_exit(&rp->r_statelock);
		goto top;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Now commit any pages which might need to be committed.
	 * If the error, NFS_VERF_MISMATCH, is returned, then
	 * start over with the flush operation.
	 */
	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);

	if (error == NFS_VERF_MISMATCH)
		goto top;

	return (error);
}
12149
12150 /*
12151 * nfs4_commit_vp() will wait for other pending commits and
12152 * will either commit the whole file or a range, plen dictates
12153 * if we commit whole file. a value of zero indicates the whole
12154 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
12155 */
static int
nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
    cred_t *cr, int wait_on_writes)
{
	rnode4_t *rp;
	page_t *plist;
	offset3 offset;
	count3 len;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * before we gather commitable pages make
	 * sure there are no outstanding async writes
	 */
	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
		mutex_enter(&rp->r_statelock);
		while (rp->r_count > 0) {
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Set the `commit inprogress' state bit. We must
	 * first wait until any current one finishes.
	 */
	mutex_enter(&rp->r_statelock);
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);

	/*
	 * Gather all of the pages which need to be
	 * committed.
	 */
	if (plen == 0)
		nfs4_get_commit(vp);
	else
		nfs4_get_commit_range(vp, poff, plen);

	/*
	 * Clear the `commit inprogress' bit and disconnect the
	 * page list which was gathered by nfs4_get_commit.
	 * Holding R4COMMIT is what makes it safe to read and
	 * clear r_commit here without r_statelock.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * If any pages need to be committed, commit them and
	 * then unlock them so that they can be freed some
	 * time later.
	 */
	if (plist == NULL)
		return (0);

	/*
	 * No error occurred during the flush portion
	 * of this operation, so now attempt to commit
	 * the data to stable storage on the server.
	 *
	 * This will unlock all of the pages on the list.
	 */
	return (nfs4_sync_commit(vp, plist, offset, len, cr));
}
12233
12234 static int
12235 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12236 cred_t *cr)
12237 {
12238 int error;
12239 page_t *pp;
12240
12241 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12242
12243 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12244
12245 /*
12246 * If we got an error, then just unlock all of the pages
12247 * on the list.
12248 */
12249 if (error) {
12250 while (plist != NULL) {
12251 pp = plist;
12252 page_sub(&plist, pp);
12253 page_unlock(pp);
12254 }
12255 return (error);
12256 }
12257 /*
12258 * We've tried as hard as we can to commit the data to stable
12259 * storage on the server. We just unlock the pages and clear
12260 * the commit required state. They will get freed later.
12261 */
12262 while (plist != NULL) {
12263 pp = plist;
12264 page_sub(&plist, pp);
12265 pp->p_fsdata = C_NOCOMMIT;
12266 page_unlock(pp);
12267 }
12268
12269 return (error);
12270 }
12271
/*
 * Async commit worker: perform a synchronous commit of the page list
 * and discard the result.  nfs4_sync_commit() unlocks all of the
 * pages whether or not the commit succeeds.
 */
static void
do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr)
{

	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
}
12279
/*ARGSUSED*/
static int
nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error = 0;
	mntinfo4_t *mi;
	vattr_t va;
	vsecattr_t nfsace4_vsap;

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI4_ACL) {
		/* if we have a delegation, return it */
		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
			(void) nfs4delegreturn(VTOR4(vp),
			    NFS4_DR_REOPEN|NFS4_DR_PUSH);

		/* Reject mixed or inconsistent ACL mask combinations. */
		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
		    NFS4_ACL_SET);
		if (error) /* EINVAL */
			return (error);

		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
			/*
			 * These are aclent_t type entries.
			 */
			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
			    vp->v_type == VDIR, FALSE);
			if (error)
				return (error);
		} else {
			/*
			 * These are ace_t type entries.
			 */
			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
			    FALSE);
			if (error)
				return (error);
		}
		/* Push the translated nfsace4 acl via setattr (empty vattr). */
		bzero(&va, sizeof (va));
		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
		vs_ace4_destroy(&nfsace4_vsap);
		return (error);
	}
	/* Server/mount does not support ACLs. */
	return (ENOSYS);
}
12328
/* ARGSUSED */
int
nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	nfs4_ga_res_t gar;
	rnode4_t *rp = VTOR4(vp);

	mi = VTOMI4(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	bzero(&gar, sizeof (gar));
	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;

	/*
	 * vsecattr->vsa_mask holds the original acl request mask.
	 * This is needed when determining what to return.
	 * (See: nfs4_create_getsecattr_return())
	 */
	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
	if (error) /* EINVAL */
		return (error);

	/*
	 * If this is a referral stub, don't try to go OTW for an ACL
	 */
	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));

	if (mi->mi_flags & MI4_ACL) {
		/*
		 * Check if the data is cached and the cache is valid. If it
		 * is we don't go over the wire.  The unlocked pre-check is
		 * just an optimization; r_secattr is re-tested under
		 * r_statelock before it is used.
		 */
		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_secattr != NULL) {
				error = nfs4_create_getsecattr_return(
				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
				    rp->r_attr.va_gid,
				    vp->v_type == VDIR);
				if (!error) { /* error == 0 - Success! */
					mutex_exit(&rp->r_statelock);
					return (error);
				}
			}
			mutex_exit(&rp->r_statelock);
		}

		/*
		 * The getattr otw call will always get both the acl, in
		 * the form of a list of nfsace4's, and the number of acl
		 * entries; independent of the value of gar.n4g_va.va_mask.
		 */
		error = nfs4_getattr_otw(vp, &gar, cr, 1);
		if (error) {
			vs_ace4_destroy(&gar.n4g_vsa);
			/* Fall back to a fabricated acl if unsupported. */
			if (error == ENOTSUP || error == EOPNOTSUPP)
				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
			/*
			 * No error was returned, but according to the response
			 * bitmap, neither was an acl.
			 */
			vs_ace4_destroy(&gar.n4g_vsa);
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
			return (error);
		}

		/*
		 * Update the cache with the ACL.
		 */
		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);

		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
		    vp->v_type == VDIR);
		vs_ace4_destroy(&gar.n4g_vsa);
		/*
		 * On translation failure (other than EACCES) for an
		 * acl/count request, fabricate an acl from the mode bits.
		 */
		if ((error) && (vsecattr->vsa_mask &
		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
		    (error != EACCES)) {
			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
		}
		return (error);
	}
	/* No ACL support on this mount: fabricate one from the mode bits. */
	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
	return (error);
}
12423
12424 /*
12425 * The function returns:
12426 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12427 * - EINVAL if the passed in "acl_mask" is an invalid request.
12428 *
12429 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12430 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12431 *
12432 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12433 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12434 * - We have a count field set without the corresponding acl field set. (e.g. -
12435 * VSA_ACECNT is set, but VSA_ACE is not)
12436 */
12437 static int
12438 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12439 {
12440 /* Shortcut the masks that are always valid. */
12441 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12442 return (0);
12443 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12444 return (0);
12445
12446 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12447 /*
12448 * We can't have any VSA_ACL type stuff in the mask now.
12449 */
12450 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12451 VSA_DFACLCNT))
12452 return (EINVAL);
12453
12454 if (op == NFS4_ACL_SET) {
12455 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12456 return (EINVAL);
12457 }
12458 }
12459
12460 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12461 /*
12462 * We can't have any VSA_ACE type stuff in the mask now.
12463 */
12464 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12465 return (EINVAL);
12466
12467 if (op == NFS4_ACL_SET) {
12468 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12469 return (EINVAL);
12470
12471 if ((acl_mask & VSA_DFACLCNT) &&
12472 !(acl_mask & VSA_DFACL))
12473 return (EINVAL);
12474 }
12475 }
12476 return (0);
12477 }
12478
12479 /*
12480 * The theory behind creating the correct getsecattr return is simply this:
12481 * "Don't return anything that the caller is not expecting to have to free."
12482 */
12483 static int
12484 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12485 uid_t uid, gid_t gid, int isdir)
12486 {
12487 int error = 0;
12488 /* Save the mask since the translators modify it. */
12489 uint_t orig_mask = vsap->vsa_mask;
12490
12491 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12492 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12493
12494 if (error)
12495 return (error);
12496
12497 /*
12498 * If the caller only asked for the ace count (VSA_ACECNT)
12499 * don't give them the full acl (VSA_ACE), free it.
12500 */
12501 if (!orig_mask & VSA_ACE) {
12502 if (vsap->vsa_aclentp != NULL) {
12503 kmem_free(vsap->vsa_aclentp,
12504 vsap->vsa_aclcnt * sizeof (ace_t));
12505 vsap->vsa_aclentp = NULL;
12506 }
12507 }
12508 vsap->vsa_mask = orig_mask;
12509
12510 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12511 VSA_DFACLCNT)) {
12512 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12513 isdir, FALSE);
12514
12515 if (error)
12516 return (error);
12517
12518 /*
12519 * If the caller only asked for the acl count (VSA_ACLCNT)
12520 * and/or the default acl count (VSA_DFACLCNT) don't give them
12521 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12522 */
12523 if (!orig_mask & VSA_ACL) {
12524 if (vsap->vsa_aclentp != NULL) {
12525 kmem_free(vsap->vsa_aclentp,
12526 vsap->vsa_aclcnt * sizeof (aclent_t));
12527 vsap->vsa_aclentp = NULL;
12528 }
12529 }
12530
12531 if (!orig_mask & VSA_DFACL) {
12532 if (vsap->vsa_dfaclentp != NULL) {
12533 kmem_free(vsap->vsa_dfaclentp,
12534 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12535 vsap->vsa_dfaclentp = NULL;
12536 }
12537 }
12538 vsap->vsa_mask = orig_mask;
12539 }
12540 return (0);
12541 }
12542
/* ARGSUSED */
int
nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 * NOTE(review): uses (cmd & F_SHARE) rather than (cmd == F_SHARE);
	 * presumably safe given the F_* command encodings -- confirm
	 * against sys/fcntl.h.
	 */
	if ((cmd & F_SHARE) &&
	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		/*
		 * This will be properly implemented later,
		 * see RFE: 4823948 .
		 */
		error = EAGAIN;
		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
12598
12599 /*
12600 * Common code called by directory ops to update the attrcache
12601 */
12602 static int
12603 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12604 hrtime_t t, vnode_t *vp, cred_t *cr)
12605 {
12606 int error = 0;
12607
12608 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12609
12610 if (status != NFS4_OK) {
12611 /* getattr not done or failed */
12612 PURGE_ATTRCACHE4(vp);
12613 return (error);
12614 }
12615
12616 if (garp) {
12617 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12618 } else {
12619 PURGE_ATTRCACHE4(vp);
12620 }
12621 return (error);
12622 }
12623
12624 /*
12625 * Update directory caches for directory modification ops (link, rename, etc.)
12626 * When dinfo is NULL, manage dircaches in the old way.
12627 */
static void
nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
    dirattr_info_t *dinfo)
{
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/* Purge rddir cache for dir since it changed */
	if (drp->r_dir != NULL)
		nfs4_purge_rddir_cache(dvp);

	/*
	 * If caller provided dinfo, then use it to manage dir caches.
	 */
	if (dinfo != NULL) {
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX don't update if the created_v4 flag is
				 * set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug,
				    (CE_NOTE, "nfs4_update_dircaches: "
				    "don't update dnlc: created_v4 flag"));
			}
		}

		/* Cache the post-op directory attributes from the caller. */
		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
		    dinfo->di_cred, FALSE, cinfo);

		return;
	}

	/*
	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
	 * Since caller modified dir but didn't receive post-dirmod-op dir
	 * attrs, the dir's attrs must be purged.
	 *
	 * XXX this check and dnlc update/purge should really be atomic,
	 * XXX but can't use rnode statelock because it'll deadlock in
	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
	 * XXX does occur.
	 *
	 * XXX We also may want to check that atomic is true in the
	 * XXX change_info struct. If it is not, the change_info may
	 * XXX reflect changes by more than one clients which means that
	 * XXX our cache may not be valid.
	 */
	PURGE_ATTRCACHE4(dvp);
	if (drp->r_change == cinfo->before) {
		/* no changes took place in the directory prior to our link */
		if (vp != NULL) {
			mutex_enter(&VTOR4(vp)->r_statev4_lock);
			if (!VTOR4(vp)->created_v4) {
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				dnlc_update(dvp, nm, vp);
			} else {
				/*
				 * XXX dont' update if the created_v4 flag
				 * is set
				 */
				mutex_exit(&VTOR4(vp)->r_statev4_lock);
				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
				    "nfs4_update_dircaches: don't"
				    " update dnlc: created_v4 flag"));
			}
		}
	} else {
		/* Another client modified directory - purge its dnlc cache */
		dnlc_purge_vp(dvp);
	}
}
12706
12707 /*
12708 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12709 * file.
12710 *
12711 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12712 * file (ie: client recovery) and otherwise set to FALSE.
12713 *
12714 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12715 * initiated) calling functions.
12716 *
12717 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12718 * of resending a 'lost' open request.
12719 *
12720 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12721 * server that hands out BAD_SEQID on open confirm.
12722 *
12723 * Errors are returned via the nfs4_error_t parameter.
12724 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* The open owner's seqid must be held across this call. */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	/* Compound is: PUTFH, OPEN_CONFIRM */
	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* Each (re)try consumes the next open seqid. */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/* Give up after too many BAD_SEQID retries. */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/* Transient failures: wait a bit and reissue the confirm. */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	if (res.status) {
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed open stateid back to the caller. */
	resop = &res.array[1]; /* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
12831
12832 /*
12833 * Return the credentials associated with a client state object. The
12834 * caller is responsible for freeing the credentials.
12835 */
12836
12837 static cred_t *
12838 state_to_cred(nfs4_open_stream_t *osp)
12839 {
12840 cred_t *cr;
12841
12842 /*
12843 * It's ok to not lock the open stream and open owner to get
12844 * the oo_cred since this is only written once (upon creation)
12845 * and will not change.
12846 */
12847 cr = osp->os_open_owner->oo_cred;
12848 crhold(cr);
12849
12850 return (cr);
12851 }
12852
12853 /*
12854 * nfs4_find_sysid
12855 *
12856 * Find the sysid for the knetconfig associated with the given mi.
12857 */
static struct lm_sysid *
nfs4_find_sysid(mntinfo4_t *mi)
{
	ASSERT(nfs_zone() == mi->mi_zone);

	/*
	 * Switch from RDMA knconf to original mount knconf.  May return
	 * NULL; the caller is responsible for releasing a non-NULL result.
	 */
	return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
	    mi->mi_curr_serv->sv_hostname, NULL));
}
12869
12870 #ifdef DEBUG
12871 /*
12872 * Return a string version of the call type for easy reading.
12873 */
12874 static char *
12875 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
12876 {
12877 switch (ctype) {
12878 case NFS4_LCK_CTYPE_NORM:
12879 return ("NORMAL");
12880 case NFS4_LCK_CTYPE_RECLAIM:
12881 return ("RECLAIM");
12882 case NFS4_LCK_CTYPE_RESEND:
12883 return ("RESEND");
12884 case NFS4_LCK_CTYPE_REINSTATE:
12885 return ("REINSTATE");
12886 default:
12887 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
12888 "type %d", ctype);
12889 return ("");
12890 }
12891 }
12892 #endif
12893
12894 /*
12895 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12896 * Unlock requests don't have an over-the-wire locktype, so we just return
12897 * something non-threatening.
12898 */
12899
12900 static nfs_lock_type4
12901 flk_to_locktype(int cmd, int l_type)
12902 {
12903 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12904
12905 switch (l_type) {
12906 case F_UNLCK:
12907 return (READ_LT);
12908 case F_RDLCK:
12909 if (cmd == F_SETLK)
12910 return (READ_LT);
12911 else
12912 return (READW_LT);
12913 case F_WRLCK:
12914 if (cmd == F_SETLK)
12915 return (WRITE_LT);
12916 else
12917 return (WRITEW_LT);
12918 }
12919 panic("flk_to_locktype");
12920 /*NOTREACHED*/
12921 }
12922
12923 /*
12924 * Do some preliminary checks for nfs4frlock.
12925 */
12926 static int
12927 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12928 u_offset_t offset)
12929 {
12930 int error = 0;
12931
12932 /*
12933 * If we are setting a lock, check that the file is opened
12934 * with the correct mode.
12935 */
12936 if (cmd == F_SETLK || cmd == F_SETLKW) {
12937 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12938 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12939 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 "nfs4frlock_validate_args: file was opened with "
12941 "incorrect mode"));
12942 return (EBADF);
12943 }
12944 }
12945
12946 /* Convert the offset. It may need to be restored before returning. */
12947 if (error = convoff(vp, flk, 0, offset)) {
12948 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12949 "nfs4frlock_validate_args: convoff => error= %d\n",
12950 error));
12951 return (error);
12952 }
12953
12954 return (error);
12955 }
12956
12957 /*
12958 * Set the flock64's lm_sysid for nfs4frlock.
12959 */
12960 static int
12961 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12962 {
12963 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12964
12965 /* Find the lm_sysid */
12966 *lspp = nfs4_find_sysid(VTOMI4(vp));
12967
12968 if (*lspp == NULL) {
12969 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12970 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12971 return (ENOLCK);
12972 }
12973
12974 flk->l_sysid = lm_sysidt(*lspp);
12975
12976 return (0);
12977 }
12978
12979 /*
12980 * Do the remaining preliminary setup for nfs4frlock.
12981 */
12982 static void
12983 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12984 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12985 cred_t **cred_otw)
12986 {
12987 /*
12988 * set tick_delay to the base delay time.
12989 * (NFS4_BASE_WAIT_TIME is in secs)
12990 */
12991
12992 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12993
12994 /*
12995 * If lock is relative to EOF, we need the newest length of the
12996 * file. Therefore invalidate the ATTR_CACHE.
12997 */
12998
12999 *whencep = flk->l_whence;
13000
13001 if (*whencep == 2) /* SEEK_END */
13002 PURGE_ATTRCACHE4(vp);
13003
13004 recov_statep->rs_flags = 0;
13005 recov_statep->rs_num_retry_despite_err = 0;
13006 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
13007 }
13008
13009 /*
13010 * Initialize and allocate the data structures necessary for
13011 * the nfs4frlock call.
13012 * Allocates argsp's op array.
13013 */
13014 static void
13015 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
13016 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
13017 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
13018 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
13019 {
13020 int argoplist_size;
13021 int num_ops = 2;
13022
13023 *retry = FALSE;
13024 *did_start_fop = FALSE;
13025 *skip_get_err = FALSE;
13026 lost_rqstp->lr_op = 0;
13027 argoplist_size = num_ops * sizeof (nfs_argop4);
13028 /* fill array with zero */
13029 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13030
13031 *argspp = argsp;
13032 *respp = NULL;
13033
13034 argsp->array_len = num_ops;
13035 argsp->array = *argopp;
13036
13037 /* initialize in case of error; will get real value down below */
13038 argsp->ctag = TAG_NONE;
13039
13040 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13041 *op_hintp = OH_LOCKU;
13042 else
13043 *op_hintp = OH_OTHER;
13044 }
13045
13046 /*
13047 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign
13048 * the proper nfs4_server_t for this instance of nfs4frlock.
13049 * Returns 0 (success) or an errno value.
13050 */
13051 static int
13052 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13053 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13054 bool_t *did_start_fop, bool_t *startrecovp)
13055 {
13056 int error = 0;
13057 rnode4_t *rp;
13058
13059 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13060
13061 if (ctype == NFS4_LCK_CTYPE_NORM) {
13062 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13063 recov_statep, startrecovp);
13064 if (error)
13065 return (error);
13066 *did_start_fop = TRUE;
13067 } else {
13068 *did_start_fop = FALSE;
13069 *startrecovp = FALSE;
13070 }
13071
13072 if (!error) {
13073 rp = VTOR4(vp);
13074
13075 /* If the file failed recovery, just quit. */
13076 mutex_enter(&rp->r_statelock);
13077 if (rp->r_flags & R4RECOVERR) {
13078 error = EIO;
13079 }
13080 mutex_exit(&rp->r_statelock);
13081 }
13082
13083 return (error);
13084 }
13085
/*
 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
 * resend nfs4frlock call is initiated by the recovery framework.
 * Acquires the lop and oop seqid synchronization (holds on oop/lop/osp
 * are taken here and released by the caller's cleanup path).
 */
static void
nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
    COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
{
	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
	int error;

	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
	ASSERT(resend_rqstp != NULL);
	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
	    resend_rqstp->lr_op == OP_LOCKU);

	/*
	 * Hold the open owner (if any) and acquire its open seqid sync
	 * before touching the lock owner, mirroring the normal path.
	 */
	*oopp = resend_rqstp->lr_oop;
	if (resend_rqstp->lr_oop) {
		open_owner_hold(resend_rqstp->lr_oop);
		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
		ASSERT(error == 0);	/* recov thread always succeeds */
	}

	/* Must resend this lost lock/locku request. */
	ASSERT(resend_rqstp->lr_lop != NULL);
	*lopp = resend_rqstp->lr_lop;
	lock_owner_hold(resend_rqstp->lr_lop);
	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
	ASSERT(error == 0);	/* recov thread always succeeds */

	/* Hold the open stream (if any) for the duration of the resend. */
	*ospp = resend_rqstp->lr_osp;
	if (*ospp)
		open_stream_hold(resend_rqstp->lr_osp);

	if (resend_rqstp->lr_op == OP_LOCK) {
		LOCK4args *lock_args;

		argop->argop = OP_LOCK;
		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
		lock_args->locktype = resend_rqstp->lr_locktype;
		/* A reclaim-type resend sets the LOCK4 reclaim flag. */
		lock_args->reclaim =
		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
		lock_args->offset = resend_rqstp->lr_flk->l_start;
		lock_args->length = resend_rqstp->lr_flk->l_len;
		/* l_len == 0 means "to EOF": use the maximum length. */
		if (lock_args->length == 0)
			lock_args->length = ~lock_args->length;
		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
		    mi2clientid(mi), &lock_args->locker);

		/* Tag the compound by the kind of resend. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCK_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCK_REINSTATE;
			break;
		case NFS4_LCK_CTYPE_RECLAIM:
			argsp->ctag = TAG_LOCK_RECLAIM;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	} else {
		LOCKU4args *locku_args;
		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;

		argop->argop = OP_LOCKU;
		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
		/* locktype is not meaningful for LOCKU; any legal value. */
		locku_args->locktype = READ_LT;
		locku_args->seqid = lop->lock_seqid + 1;
		/* lock_stateid is protected by lo_lock. */
		mutex_enter(&lop->lo_lock);
		locku_args->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		locku_args->offset = resend_rqstp->lr_flk->l_start;
		locku_args->length = resend_rqstp->lr_flk->l_len;
		/* l_len == 0 means "to EOF": use the maximum length. */
		if (locku_args->length == 0)
			locku_args->length = ~locku_args->length;

		/* Tag the compound by the kind of resend. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCKU_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCKU_REINSTATE;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	}
}
13183
/*
 * Setup the LOCKT4 arguments (test for a conflicting lock).
 */
static void
nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
    rnode4_t *rp)
{
	LOCKT4args *lockt_args;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
	argop->argop = OP_LOCKT;
	argsp->ctag = TAG_LOCKT;
	lockt_args = &argop->nfs_argop4_u.oplockt;

	/*
	 * The locktype will be READ_LT unless it's
	 * a write lock. We do this because the Solaris
	 * system call allows the combination of
	 * F_UNLCK and F_GETLK* and so in that case the
	 * unlock is mapped to a read.
	 */
	if (flk->l_type == F_WRLCK)
		lockt_args->locktype = WRITE_LT;
	else
		lockt_args->locktype = READ_LT;

	lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
	/*
	 * set the lock owner4 args.
	 * NOTE(review): the ASSERT above restricts ctype to
	 * NFS4_LCK_CTYPE_NORM, so the flk->l_pid arm of this
	 * conditional appears unreachable — confirm before relying on it.
	 */
	nfs4_setlockowner_args(&lockt_args->owner, rp,
	    ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid);
	lockt_args->offset = flk->l_start;
	lockt_args->length = flk->l_len;
	/* l_len == 0 means "to EOF": use the maximum length. */
	if (flk->l_len == 0)
		lockt_args->length = ~lockt_args->length;

	*lockt_argsp = lockt_args;
}
13224
/*
 * If the client is holding a delegation, and the open stream to be used
 * with this lock request is a delegation open stream, then re-open the stream.
 * Sets the nfs4_error_t to all zeros unless the open stream has already
 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
 * means the caller should retry (like a recovery retry).
 */
static void
nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
{
	open_delegation_type4 dt;
	bool_t reopen_needed, force;
	nfs4_open_stream_t *osp;
	open_claim_type4 oclaim;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* Snapshot the delegation type under the v4 state lock. */
	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt != OPEN_DELEGATE_NONE) {
		nfs4_open_owner_t *oop;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (!oop) {
			ep->stat = NFS4ERR_IO;
			return;
		}
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/* A stream that already failed a reopen is unusable. */
		if (osp->os_failed_reopen) {
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4frlock_check_deleg: os_failed_reopen set "
			    "for osp %p, cr %p, rp %s", (void *)osp,
			    (void *)cr, rnode4info(rp)));
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/*
		 * Determine whether a reopen is needed. If this
		 * is a delegation open stream, then send the open
		 * to the server to give visibility to the open owner.
		 * Even if it isn't a delegation open stream, we need
		 * to check if the previous open CLAIM_DELEGATE_CUR
		 * was sufficient.
		 */
		reopen_needed = osp->os_delegation ||
		    ((lt == F_RDLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
		    (lt == F_WRLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));

		mutex_exit(&osp->os_sync_lock);
		open_owner_rele(oop);

		if (reopen_needed) {
			/*
			 * Always use CLAIM_PREVIOUS after server reboot.
			 * The server will reject CLAIM_DELEGATE_CUR if
			 * it is used during the grace period.
			 */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
				oclaim = CLAIM_PREVIOUS;
				force = TRUE;
			} else {
				oclaim = CLAIM_DELEGATE_CUR;
				force = FALSE;
			}
			mutex_exit(&mi->mi_lock);

			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
			/* EAGAIN from the reopen maps to retryable DELAY. */
			if (ep->error == EAGAIN) {
				nfs4_error_zinit(ep);
				ep->stat = NFS4ERR_DELAY;
			}
		}
		open_stream_rele(osp, rp);
		osp = NULL;
	}
}
13323
/*
 * Setup the LOCKU4 arguments.
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
 *			over-the-wire.  The caller must release the
 *			reference on *lopp.
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error.
 */
static void
nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKU4args **locku_argsp, flock64_t *flk,
    nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
    vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
    bool_t *skip_get_err, bool_t *go_otwp)
{
	nfs4_lock_owner_t *lop = NULL;
	LOCKU4args *locku_args;
	pid_t pid;
	bool_t is_spec = FALSE;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

	/* Re-open a delegation open stream first, if necessary. */
	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
	if (ep->error || ep->stat)
		return;

	argop->argop = OP_LOCKU;
	/*
	 * NOTE(review): the ASSERT above restricts ctype to
	 * NFS4_LCK_CTYPE_NORM, so the REINSTATE branch here appears
	 * unreachable — confirm before relying on it.
	 */
	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
		argsp->ctag = TAG_LOCKU_REINSTATE;
	else
		argsp->ctag = TAG_LOCKU;
	locku_args = &argop->nfs_argop4_u.oplocku;
	*locku_argsp = locku_args;

	/* locktype should be set to any legal value */
	locku_args->locktype = READ_LT;

	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid;

	/*
	 * Get the lock owner stateid.  If no lock owner
	 * exists, return success.
	 */
	lop = find_lock_owner(rp, pid, LOWN_ANY);
	*lopp = lop;
	/* A "special" stateid means the original LOCK never succeeded. */
	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
		is_spec = TRUE;
	if (!lop || is_spec) {
		/*
		 * No lock owner so no locks to unlock.
		 * Return success.  If there was a failed
		 * reclaim earlier, the lock might still be
		 * registered with the local locking code,
		 * so notify it of the unlock.
		 *
		 * If the lockowner is using a special stateid,
		 * then the original lock request (that created
		 * this lockowner) was never successful, so we
		 * have no lock to undo OTW.
		 */
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
		    "(%ld) so return success", (long)pid));

		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		/*
		 * Release our hold and NULL out so final_cleanup
		 * doesn't try to end a lock seqid sync we
		 * never started.
		 */
		if (is_spec) {
			lock_owner_rele(lop);
			*lopp = NULL;
		}
		*skip_get_err = TRUE;
		*go_otwp = FALSE;
		return;
	}

	/* EAGAIN here means the caller should retry (recovery-style). */
	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
	if (ep->error == EAGAIN) {
		lock_owner_rele(lop);
		*lopp = NULL;
		return;
	}

	/* lock_stateid is protected by lo_lock. */
	mutex_enter(&lop->lo_lock);
	locku_args->lock_stateid = lop->lock_stateid;
	mutex_exit(&lop->lo_lock);
	locku_args->seqid = lop->lock_seqid + 1;

	/* leave the ref count on lop, rele after RPC call */

	locku_args->offset = flk->l_start;
	locku_args->length = flk->l_len;
	/* l_len == 0 means "to EOF": use the maximum length. */
	if (flk->l_len == 0)
		locku_args->length = ~locku_args->length;

	*go_otwp = TRUE;
}
13430
/*
 * Setup the LOCK4 arguments.
 *
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error
 *
 * On return *oopp/*ospp/*lopp carry whatever owners/streams were
 * acquired (possibly NULL); the caller is responsible for releasing
 * them and ending the seqid syncs.
 */
static void
nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
    flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
{
	LOCK4args *lock_args;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	pid_t pid;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	/* Re-open a delegation open stream first, if necessary. */
	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
	if (ep->error || ep->stat != NFS4_OK)
		return;

	argop->argop = OP_LOCK;
	/* Tag the compound by the kind of lock call. */
	if (ctype == NFS4_LCK_CTYPE_NORM)
		argsp->ctag = TAG_LOCK;
	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
		argsp->ctag = TAG_RELOCK;
	else
		argsp->ctag = TAG_LOCK_REINSTATE;
	lock_args = &argop->nfs_argop4_u.oplock;
	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
	/*
	 * Get the lock owner. If no lock owner exists,
	 * create a 'temporary' one and grab the open seqid
	 * synchronization (which puts a hold on the open
	 * owner and open stream).
	 * This also grabs the lock seqid synchronization.
	 */
	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
	ep->stat =
	    nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);

	if (ep->stat != NFS4_OK)
		goto out;

	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
	    &lock_args->locker);

	lock_args->offset = flk->l_start;
	lock_args->length = flk->l_len;
	/* l_len == 0 means "to EOF": use the maximum length. */
	if (flk->l_len == 0)
		lock_args->length = ~lock_args->length;
	*lock_argsp = lock_args;
out:
	/* Pass back whatever was acquired, even on failure (may be NULL). */
	*oopp = oop;
	*ospp = osp;
	*lopp = lop;
}
13495
/*
 * After we get the reply from the server, record the proper information
 * for possible resend lock requests.
 */
static void
nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
    nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
{
	/* F_UNLCK corresponds to a lost LOCKU; anything else to LOCK. */
	bool_t unlock = (flk->l_type == F_UNLCK);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE);

	/* A failed lock (not unlock) leaves a pending request on the owner. */
	if (error != 0 && !unlock) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
		    " for lop %p", (void *)lop));
		ASSERT(lop != NULL);
		mutex_enter(&lop->lo_lock);
		lop->lo_pending_rqsts = 1;
		mutex_exit(&lop->lo_lock);
	}

	lost_rqstp->lr_putfirst = FALSE;
	lost_rqstp->lr_op = 0;

	/*
	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
	 * recovery purposes so that the lock request that was sent
	 * can be saved and re-issued later. Ditto for EIO from a forced
	 * unmount. This is done to have the client's local locking state
	 * match the v4 server's state; that is, the request was
	 * potentially received and accepted by the server but the client
	 * thinks it was not.
	 */
	if (error == ETIMEDOUT || error == EINTR ||
	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
		NFS4_DEBUG((nfs4_lost_rqst_debug ||
		    nfs4_client_lock_debug), (CE_NOTE,
		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
		    (void *)lop, (void *)oop, (void *)osp));
		if (unlock)
			lost_rqstp->lr_op = OP_LOCKU;
		else {
			lost_rqstp->lr_op = OP_LOCK;
			lost_rqstp->lr_locktype = locktype;
		}
		/*
		 * Objects are held and rele'd via the recovery code.
		 * See nfs4_save_lost_rqst.
		 */
		lost_rqstp->lr_vp = vp;
		lost_rqstp->lr_dvp = NULL;
		lost_rqstp->lr_oop = oop;
		lost_rqstp->lr_osp = osp;
		lost_rqstp->lr_lop = lop;
		lost_rqstp->lr_cr = cr;
		switch (ctype) {
		case NFS4_LCK_CTYPE_NORM:
			flk->l_pid = ttoproc(curthread)->p_pid;
			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			/* Reinstate requests go to the front of the queue. */
			lost_rqstp->lr_putfirst = TRUE;
			lost_rqstp->lr_ctype = ctype;
			break;
		default:
			break;
		}
		lost_rqstp->lr_flk = flk;
	}
}
13573
13574 /*
13575 * Update lop's seqid. Also update the seqid stored in a resend request,
13576 * if any. (Some recovery errors increment the seqid, and we may have to
13577 * send the resend request again.)
13578 */
13579
13580 static void
13581 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13582 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13583 {
13584 if (lock_args) {
13585 if (lock_args->locker.new_lock_owner == TRUE)
13586 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13587 else {
13588 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13589 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13590 }
13591 } else if (locku_args) {
13592 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13593 nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13594 }
13595 }
13596
/*
 * Cleanup helper for the retry-with-new-credential path of nfs4frlock:
 * calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 * Switches the *cred_otwp to base_cr (drops the old reference and takes
 * a new hold on base_cr).
 */
static void
nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
    nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;
	nfs_argop4 *argop = (*argspp)->array;

	if (*did_start_fop) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}
	ASSERT((*argspp)->array_len == 2);
	/* Free the op-specific portion of the second (lock) op. */
	if (argop[1].argop == OP_LOCK)
		nfs4args_lock_free(&argop[1]);
	else if (argop[1].argop == OP_LOCKT)
		nfs4args_lockt_free(&argop[1]);
	kmem_free(argop, 2 * sizeof (nfs_argop4));
	/* A decoded result exists only when the call itself succeeded. */
	if (!error)
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
	*argspp = NULL;
	*respp = NULL;

	if (lop) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
		*lopp = NULL;
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, VTOR4(vp));
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	/* Swap the OTW credential for base_cr. */
	crfree(*cred_otwp);
	*cred_otwp = base_cr;
	crhold(*cred_otwp);
}
13653
/*
 * Function to process the client's recovery for nfs4frlock.
 * Returns TRUE if we should retry the lock request; FALSE otherwise.
 *
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.
 *
 * Note: the rp's r_lkserlock is *not* dropped during this path.
 */
static bool_t
nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    LOCK4args *lock_args, LOCKU4args *locku_args,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
    nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
    bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
{
	nfs4_open_owner_t *oop = *oopp;
	nfs4_open_stream_t *osp = *ospp;
	nfs4_lock_owner_t *lop = *lopp;

	bool_t abort, retry;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT((*argspp) != NULL);
	ASSERT((*respp) != NULL);
	if (lock_args || locku_args)
		ASSERT(lop != NULL);

	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));

	retry = TRUE;
	abort = FALSE;
	if (needrecov) {
		nfs4_bseqid_entry_t *bsep = NULL;
		nfs_opnum4 op;

		/* Which args were built determines the failed op. */
		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;

		/*
		 * On NFS4ERR_BAD_SEQID, record the seqid that was sent
		 * so the recovery framework can resynchronize it.
		 */
		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
			seqid4 seqid;

			if (lock_args) {
				if (lock_args->locker.new_lock_owner == TRUE)
					seqid = lock_args->locker.locker4_u.
					    open_owner.open_seqid;
				else
					seqid = lock_args->locker.locker4_u.
					    lock_owner.lock_seqid;
			} else if (locku_args) {
				seqid = locku_args->seqid;
			} else {
				seqid = 0;
			}

			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
			    flk->l_pid, (*argspp)->ctag, seqid);
		}

		/* Only hand over a lost request that is a LOCK/LOCKU. */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
		    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
		    NULL, op, bsep, NULL, NULL);

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
	}

	/*
	 * Return that we do not want to retry the request for 3 cases:
	 * 1. If we received EINTR or are bailing out because of a forced
	 * unmount, we came into this code path just for the sake of
	 * initiating recovery, we now need to return the error.
	 * 2. If we have aborted recovery.
	 * 3. We received NFS4ERR_BAD_SEQID.
	 */
	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
		retry = FALSE;

	if (*did_start_fop == TRUE) {
		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
		    needrecov);
		*did_start_fop = FALSE;
	}

	/* On retry, free the compound args/res; the caller rebuilds them. */
	if (retry == TRUE) {
		nfs_argop4 *argop;

		argop = (*argspp)->array;
		ASSERT((*argspp)->array_len == 2);

		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
		*respp = NULL;
		*argspp = NULL;
	}

	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	*lopp = NULL;

	/* need to free up the reference on osp for lock args */
	if (osp != NULL) {
		open_stream_rele(osp, rp);
		*ospp = NULL;
	}

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		*oopp = NULL;
	}

	return (retry);
}
13781
13782 /*
13783 * Handles the successful reply from the server for nfs4frlock.
13784 */
13785 static void
13786 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13787 vnode_t *vp, int flag, u_offset_t offset,
13788 nfs4_lost_rqst_t *resend_rqstp)
13789 {
13790 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13791 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13792 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13793 if (ctype == NFS4_LCK_CTYPE_NORM) {
13794 flk->l_pid = ttoproc(curthread)->p_pid;
13795 /*
13796 * We do not register lost locks locally in
13797 * the 'resend' case since the user/application
13798 * doesn't think we have the lock.
13799 */
13800 ASSERT(!resend_rqstp);
13801 nfs4_register_lock_locally(vp, flk, flag, offset);
13802 }
13803 }
13804 }
13805
/*
 * Handle the DENIED reply from the server for nfs4frlock.
 * Returns TRUE if we should retry the request; FALSE otherwise.
 *
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry. Can also
 * drop and regrab the r_lkserlock.
 */
static bool_t
nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
    LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
    vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    clock_t *tick_delayp, short *whencep, int *errorp,
    nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
    bool_t *skip_get_err)
{
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (lock_args) {
		nfs4_open_owner_t *oop = *oopp;
		nfs4_open_stream_t *osp = *ospp;
		nfs4_lock_owner_t *lop = *lopp;
		int intr;

		/*
		 * Blocking lock needs to sleep and retry from the request.
		 *
		 * Do not block and wait for 'resend' or 'reinstate'
		 * lock requests, just return the error.
		 *
		 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
		 */
		if (cmd == F_SETLKW) {
			rnode4_t *rp = VTOR4(vp);
			nfs_argop4 *argop = (*argspp)->array;

			ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

			/*
			 * Release all per-call state (fop, args/res,
			 * seqid syncs, owner/stream refs) before
			 * sleeping; a retry rebuilds everything.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
			    recov_statep, needrecov);
			*did_start_fop = FALSE;
			ASSERT((*argspp)->array_len == 2);
			if (argop[1].argop == OP_LOCK)
				nfs4args_lock_free(&argop[1]);
			else if (argop[1].argop == OP_LOCKT)
				nfs4args_lockt_free(&argop[1]);
			kmem_free(argop, 2 * sizeof (nfs_argop4));
			if (*respp)
				xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)*respp);
			*argspp = NULL;
			*respp = NULL;
			nfs4_end_lock_seqid_sync(lop);
			lock_owner_rele(lop);
			*lopp = NULL;
			if (osp != NULL) {
				open_stream_rele(osp, rp);
				*ospp = NULL;
			}
			if (oop != NULL) {
				nfs4_end_open_seqid_sync(oop);
				open_owner_rele(oop);
				*oopp = NULL;
			}

			/* Drop the serialization lock while we sleep. */
			nfs_rw_exit(&rp->r_lkserlock);

			intr = nfs4_block_and_wait(tick_delayp, rp);

			/* Re-acquire the lock before returning either way. */
			if (intr) {
				(void) nfs_rw_enter_sig(&rp->r_lkserlock,
				    RW_WRITER, FALSE);
				*errorp = EINTR;
				return (FALSE);
			}

			(void) nfs_rw_enter_sig(&rp->r_lkserlock,
			    RW_WRITER, FALSE);

			/*
			 * Make sure we are still safe to lock with
			 * regards to mmapping.
			 */
			if (!nfs4_safelock(vp, flk, cr)) {
				*errorp = EAGAIN;
				return (FALSE);
			}

			return (TRUE);
		}
		/* A denied non-blocking set-lock maps to EAGAIN. */
		if (ctype == NFS4_LCK_CTYPE_NORM)
			*errorp = EAGAIN;
		*skip_get_err = TRUE;
		flk->l_whence = 0;
		*whencep = 0;
		return (FALSE);
	} else if (lockt_args) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_results_denied: OP_LOCKT DENIED"));

		/* Report the conflicting lock back through the flock. */
		denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
		    flk, lockt_args);

		/* according to NLM code */
		*errorp = 0;
		*whencep = 0;
		*skip_get_err = TRUE;
		return (FALSE);
	}
	return (FALSE);
}
13920
/*
 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
 * Statuses recognized here are left to the normal error-mapping path
 * (*errorp is untouched); an unrecognized status sets *errorp.
 */
static void
nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
{
	switch (resp->status) {
	case NFS4ERR_ACCESS:
	case NFS4ERR_ADMIN_REVOKED:
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_BAD_RANGE:
	case NFS4ERR_BAD_SEQID:
	case NFS4ERR_BAD_STATEID:
	case NFS4ERR_BADXDR:
	case NFS4ERR_DEADLOCK:
	case NFS4ERR_DELAY:
	case NFS4ERR_EXPIRED:
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_GRACE:
	case NFS4ERR_INVAL:
	case NFS4ERR_ISDIR:
	case NFS4ERR_LEASE_MOVED:
	case NFS4ERR_LOCK_NOTSUPP:
	case NFS4ERR_LOCK_RANGE:
	case NFS4ERR_MOVED:
	case NFS4ERR_NOFILEHANDLE:
	case NFS4ERR_NO_GRACE:
	case NFS4ERR_OLD_STATEID:
	case NFS4ERR_OPENMODE:
	case NFS4ERR_RECLAIM_BAD:
	case NFS4ERR_RECLAIM_CONFLICT:
	case NFS4ERR_RESOURCE:
	case NFS4ERR_SERVERFAULT:
	case NFS4ERR_STALE:
	case NFS4ERR_STALE_CLIENTID:
	case NFS4ERR_STALE_STATEID:
		return;
	default:
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_results_default: got unrecognizable "
		    "res.status %d", resp->status));
		/*
		 * NOTE(review): this stores an NFSv4 status constant in
		 * an errno-style output — presumably a deliberate
		 * catch-all; confirm how callers interpret *errorp.
		 */
		*errorp = NFS4ERR_INVAL;
	}
}
13965
/*
 * The lock request was successful, so update the client's state.
 */
static void
nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
    LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
    vnode_t *vp, flock64_t *flk, cred_t *cr,
    nfs4_lost_rqst_t *resend_rqstp)
{
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (lock_args) {
		LOCK4res *lock_res;

		lock_res = &resop->nfs_resop4_u.oplock;
		/* update the stateid with server's response */

		/* A successful LOCK makes a new lock owner permanent. */
		if (lock_args->locker.new_lock_owner == TRUE) {
			mutex_enter(&lop->lo_lock);
			lop->lo_just_created = NFS4_PERM_CREATED;
			mutex_exit(&lop->lo_lock);
		}

		nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);

		/*
		 * If the lock was the result of a resending a lost
		 * request, we've synched up the stateid and seqid
		 * with the server, but now the server might be out of sync
		 * with what the application thinks it has for locks.
		 * Clean that up here. It's unclear whether we should do
		 * this even if the filesystem has been forcibly unmounted.
		 * For most servers, it's probably wasted effort, but
		 * RFC 7530 lets servers require that unlocks exactly match
		 * the locks that are held.
		 */
		if (resend_rqstp != NULL &&
		    resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
			nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
		} else {
			flk->l_whence = 0;
		}
	} else if (locku_args) {
		LOCKU4res *locku_res;

		locku_res = &resop->nfs_resop4_u.oplocku;

		/* Update the stateid with the server's response */
		nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
	} else if (lockt_args) {
		/* Switch the lock type to express success, see fcntl */
		flk->l_type = F_UNLCK;
		flk->l_whence = 0;
	}
}
14021
14022 /*
14023 * Do final cleanup before exiting nfs4frlock.
14024 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14025 * COMPOUND4 args/res for calls that haven't already.
14026 */
static void
nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
    COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
    short whence, u_offset_t offset, struct lm_sysid *ls,
    int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
    bool_t did_start_fop, bool_t skip_get_err,
    cred_t *cred_otw, cred_t *cred)
{
	mntinfo4_t *mi = VTOMI4(vp);
	rnode4_t *rp = VTOR4(vp);
	int error = *errorp;
	nfs_argop4 *argop;
	int do_flush_pages = 0;	/* set if the page cache must be flushed */

	ASSERT(nfs_zone() == mi->mi_zone);
	/*
	 * The client recovery code wants the raw status information,
	 * so don't map the NFS status code to an errno value for
	 * non-normal call types.
	 */
	if (ctype == NFS4_LCK_CTYPE_NORM) {
		if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
			*errorp = geterrno4(resp->status);
		if (did_start_fop == TRUE)
			nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
			    needrecov);

		/*
		 * We've established a new lock on the server, so invalidate
		 * the pages associated with the vnode to get the most up to
		 * date pages from the server after acquiring the lock. We
		 * want to be sure that the read operation gets the newest data.
		 * N.B.
		 * We used to do this in nfs4frlock_results_ok but that doesn't
		 * work since VOP_PUTPAGE can call nfs4_commit which calls
		 * nfs4_start_fop. We flush the pages below after calling
		 * nfs4_end_fop above
		 * The flush of the page cache must be done after
		 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
		 */
		if (!error && resp && resp->status == NFS4_OK)
			do_flush_pages = 1;
	}
	/*
	 * Free the COMPOUND args/res for calls that haven't already done
	 * so.  The array is always PUTFH at [0] plus one lock op at [1]
	 * (see the ASSERT below), and only the lock op carries
	 * dynamically allocated arguments.
	 */
	if (argsp) {
		ASSERT(argsp->array_len == 2);
		argop = argsp->array;
		if (argop[1].argop == OP_LOCK)
			nfs4args_lock_free(&argop[1]);
		else if (argop[1].argop == OP_LOCKT)
			nfs4args_lockt_free(&argop[1]);
		kmem_free(argop, 2 * sizeof (nfs_argop4));
		if (resp)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	}

	/* free the reference on the lock owner */
	if (lop != NULL) {
		nfs4_end_lock_seqid_sync(lop);
		lock_owner_rele(lop);
	}

	/* need to free up the reference on osp for lock args */
	if (osp != NULL)
		open_stream_rele(osp, rp);

	/* need to free up the reference on oop for lock args */
	if (oop != NULL) {
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}

	/* deferred until after nfs4_end_open_seqid_sync(); see N.B. above */
	if (do_flush_pages)
		nfs4_flush_pages(vp, cred);

	/* convert the flock offsets back to the caller's frame of reference */
	(void) convoff(vp, flk, whence, offset);

	lm_rel_sysid(ls);

	/*
	 * Record debug information in the event we get EINVAL.
	 */
	mutex_enter(&mi->mi_lock);
	if (*errorp == EINVAL && (lock_args || locku_args) &&
	    (!(mi->mi_flags & MI4_POSIX_LOCK))) {
		/* only log this complaint once per mount */
		if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
			zcmn_err(getzoneid(), CE_NOTE,
			    "%s operation failed with "
			    "EINVAL probably since the server, %s,"
			    " doesn't support POSIX style locking",
			    lock_args ? "LOCK" : "LOCKU",
			    mi->mi_curr_serv->sv_hostname);
			mi->mi_flags |= MI4_LOCK_DEBUG;
		}
	}
	mutex_exit(&mi->mi_lock);

	if (cred_otw)
		crfree(cred_otw);
}
14128
14129 /*
14130 * This calls the server and the local locking code.
14131 *
 * Client locks are registered locally by OR-ing the sysid with
14133 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14134 * We need to distinguish between the two to avoid collision in case one
14135 * machine is used as both client and server.
14136 *
14137 * Blocking lock requests will continually retry to acquire the lock
14138 * forever.
14139 *
14140 * The ctype is defined as follows:
14141 * NFS4_LCK_CTYPE_NORM: normal lock request.
14142 *
14143 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14144 * recovery, get the pid from flk instead of curproc, and don't reregister
14145 * the lock locally.
14146 *
14147 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14148 * that we will use the information passed in via resend_rqstp to setup the
14149 * lock/locku request. This resend is the exact same request as the 'lost
14150 * lock', and is initiated by the recovery framework. A successful resend
14151 * request can initiate one or more reinstate requests.
14152 *
14153 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14154 * does not trigger additional reinstate requests. This lock call type is
14155 * set for setting the v4 server's locking state back to match what the
14156 * client's local locking state is in the event of a received 'lost lock'.
14157 *
14158 * Errors are returned via the nfs4_error_t parameter.
14159 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay;	/* delay in clock ticks */
	struct lm_sysid *ls;
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short whence;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly;	/* just queue request */
	int frc_no_reclaim = 0;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Re-entry point: we come back here after recovery, after a
	 * credential swap, after NFS4ERR_DELAY during setup, or when a
	 * blocking lock request is to be reissued.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * Undo the setup done so far, then retry
				 * the whole call from scratch.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server. This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request. Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		/* retry the call with the caller's own credentials */
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework. Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* recovery released our state references */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			/* blocking lock: delay done, reissue the request */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error. Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14510
14511 /*
14512 * nfs4_safelock:
14513 *
14514 * Return non-zero if the given lock request can be handled without
14515 * violating the constraints on concurrent mapping and locking.
14516 */
14517
14518 static int
14519 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14520 {
14521 rnode4_t *rp = VTOR4(vp);
14522 struct vattr va;
14523 int error;
14524
14525 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14526 ASSERT(rp->r_mapcnt >= 0);
14527 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14528 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14529 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14530 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14531
14532 if (rp->r_mapcnt == 0)
14533 return (1); /* always safe if not mapped */
14534
14535 /*
14536 * If the file is already mapped and there are locks, then they
14537 * should be all safe locks. So adding or removing a lock is safe
14538 * as long as the new request is safe (i.e., whole-file, meaning
14539 * length and starting offset are both zero).
14540 */
14541
14542 if (bfp->l_start != 0 || bfp->l_len != 0) {
14543 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14544 "cannot lock a memory mapped file unless locking the "
14545 "entire file: start %"PRIx64", len %"PRIx64,
14546 bfp->l_start, bfp->l_len));
14547 return (0);
14548 }
14549
14550 /* mandatory locking and mapping don't mix */
14551 va.va_mask = AT_MODE;
14552 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14553 if (error != 0) {
14554 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14555 "getattr error %d", error));
14556 return (0); /* treat errors conservatively */
14557 }
14558 if (MANDLOCK(vp, va.va_mode)) {
14559 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14560 "cannot mandatory lock and mmap a file"));
14561 return (0);
14562 }
14563
14564 return (1);
14565 }
14566
14567
14568 /*
14569 * Register the lock locally within Solaris.
14570 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14571 * recording locks locally.
14572 *
14573 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14574 * are registered locally.
14575 */
void
nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
    u_offset_t offset)
{
	int oldsysid;	/* caller's l_sysid, restored before returning */
	int error;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug,
	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
	    flk->l_sysid));
	kmem_free(name, MAXNAMELEN);
#endif

	/* register the lock with local locking */
	oldsysid = flk->l_sysid;
	/* mark as a client-side lock so it can't collide with server locks */
	flk->l_sysid |= LM_SYSID_CLIENT;
	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
#ifdef DEBUG
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4_register_lock_locally: could not register with"
		    " local locking"));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "error %d, vp 0x%p, pid %d, sysid 0x%x",
		    error, (void *)vp, flk->l_pid, flk->l_sysid));
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_type, flk->l_start, flk->l_len));
		/*
		 * Query (flags == 0) the local lock manager so the
		 * conflicting lock's details can be reported below;
		 * this overwrites *flk with the blocker's description.
		 */
		(void) reclock(vp, flk, 0, flag, offset, NULL);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
		    "blocked by pid %d sysid 0x%x type %d "
		    "off 0x%" PRIx64 " len 0x%" PRIx64,
		    flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
		    flk->l_len));
	}
#endif
	/* undo the temporary LM_SYSID_CLIENT marking for the caller */
	flk->l_sysid = oldsysid;
}
14623
14624 /*
14625 * nfs4_lockrelease:
14626 *
14627 * Release any locks on the given vnode that are held by the current
14628 * process. Also removes the lock owner (if one exists) from the rnode's
14629 * list.
14630 */
static int
nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
{
	flock64_t ld;
	int ret, error;
	rnode4_t *rp;
	nfs4_lock_owner_t *lop;
	nfs4_recov_state_t recov_state;
	mntinfo4_t *mi;
	bool_t possible_orphan = FALSE;
	bool_t recovonly;

	ASSERT((uintptr_t)vp > KERNELBASE);
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);
	mi = VTOMI4(vp);

	/*
	 * If we have not locked anything then we can
	 * just return since we have no work to do.
	 */
	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
		return (0);
	}

	/*
	 * We need to comprehend that another thread may
	 * kick off recovery and the lock_owner we have stashed
	 * in lop might be invalid so we should NOT cache it
	 * locally!
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		/* flag the rnode so recovery can clean up the lock owners */
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	/*
	 * Check if the lock owner might have a lock (request was sent but
	 * no response was received).  Also check if there are any remote
	 * locks on the file.  (In theory we shouldn't have to make this
	 * second check if there's no lock owner, but for now we'll be
	 * conservative and do it anyway.)  If either condition is true,
	 * send an unlock for the entire file to the server.
	 *
	 * Note that no explicit synchronization is needed here.  At worst,
	 * flk_has_remote_locks() will return a false positive, in which case
	 * the unlock call wastes time but doesn't harm correctness.
	 */

	if (lop) {
		mutex_enter(&lop->lo_lock);
		possible_orphan = lop->lo_pending_rqsts;
		mutex_exit(&lop->lo_lock);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
	    (void *)lop));

	if (possible_orphan || flk_has_remote_locks(vp)) {
		ld.l_type = F_UNLCK;    /* set to unlock entire file */
		ld.l_whence = 0;        /* unlock from start of file */
		ld.l_start = 0;
		ld.l_len = 0;           /* do entire file */

		/* issue the whole-file unlock through the normal vnode path */
		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
		    cr, NULL);

		if (ret != 0) {
			/*
			 * If VOP_FRLOCK fails, make sure we unregister
			 * local locks before we continue.
			 */
			ld.l_pid = ttoproc(curthread)->p_pid;
			nfs4_register_lock_locally(vp, &ld, flag, offset);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_lockrelease: lock release error on vp"
			    " %p: error %d.\n", (void *)vp, ret));
		}
	}

	/*
	 * Re-enter the fop barrier; the unlock above may have triggered
	 * recovery, so the lock-owner list must be re-examined under a
	 * fresh start_fop.
	 */
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
	    &recovonly);
	if (error) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags |= R4LODANGLERS;
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * So, here we're going to need to retrieve the lock-owner
	 * again (in case recovery has done a switch-a-roo) and
	 * remove it because we can.
	 */
	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);

	if (lop) {
		nfs4_rnode_remove_lock_owner(rp, lop);
		lock_owner_rele(lop);
	}

	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
	return (0);
}
14751
14752 /*
14753 * Wait for 'tick_delay' clock ticks.
14754 * Implement exponential backoff until hit the lease_time of this nfs4_server.
14755 * NOTE: lock_lease_time is in seconds.
14756 *
14757 * XXX For future improvements, should implement a waiting queue scheme.
14758 */
14759 static int
14760 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14761 {
14762 long milliseconds_delay;
14763 time_t lock_lease_time;
14764
14765 /* wait tick_delay clock ticks or siginteruptus */
14766 if (delay_sig(*tick_delay)) {
14767 return (EINTR);
14768 }
14769 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14770 "reissue the lock request: blocked for %ld clock ticks: %ld "
14771 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14772
14773 /* get the lease time */
14774 lock_lease_time = r2lease_time(rp);
14775
14776 /* drv_hztousec converts ticks to microseconds */
14777 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14778 if (milliseconds_delay < lock_lease_time * 1000) {
14779 *tick_delay = 2 * *tick_delay;
14780 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14781 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14782 }
14783 return (0);
14784 }
14785
14786
void
nfs4_vnops_init(void)
{
	/* Intentionally empty: no global vnode-ops state to initialize. */
}
14791
void
nfs4_vnops_fini(void)
{
	/* Intentionally empty: nfs4_vnops_init() allocates nothing. */
}
14796
14797 /*
14798 * Return a reference to the directory (parent) vnode for a given vnode,
14799 * using the saved pathname information and the directory file handle. The
14800 * caller is responsible for disposing of the reference.
14801 * Returns zero or an errno value.
14802 *
14803 * Caller should set need_start_op to FALSE if it is the recovery
14804 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14805 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		/*
		 * "File" mount: the mount root is a regular file, so the
		 * parent comes from the servinfo's parent file handle
		 * rather than from shadow-vnode information.
		 */
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		mfname = mi->mi_fname;
		/* makenfs4node_by_fh consumes the fname hold taken here */
		fn_hold(mfname);
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	/* look up ".." via the saved directory file handle */
	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		/* parent of an extended-attribute file is the xattr dir */
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
14879
14880 /*
14881 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14882 * length that fnamep can accept, including the trailing null.
14883 * Returns 0 if okay, returns an errno value if there was a problem.
14884 */
14885
14886 int
14887 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14888 {
14889 char *fn;
14890 int err = 0;
14891 servinfo4_t *svp;
14892 svnode_t *shvp;
14893
14894 /*
14895 * If the file being opened has VROOT set, then this is
14896 * a "file" mount. sv_name will not be interesting, so
14897 * go back to the servinfo4 to get the original mount
14898 * path and strip off all but the final edge. Otherwise
14899 * just return the name from the shadow vnode.
14900 */
14901
14902 if (vp->v_flag & VROOT) {
14903
14904 svp = VTOMI4(vp)->mi_curr_serv;
14905 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14906
14907 fn = strrchr(svp->sv_path, '/');
14908 if (fn == NULL)
14909 err = EINVAL;
14910 else
14911 fn++;
14912 } else {
14913 shvp = VTOSV(vp);
14914 fn = fn_name(shvp->sv_name);
14915 }
14916
14917 if (err == 0)
14918 if (strlen(fn) < maxlen)
14919 (void) strcpy(fnamep, fn);
14920 else
14921 err = ENAMETOOLONG;
14922
14923 if (vp->v_flag & VROOT)
14924 nfs_rw_exit(&svp->sv_lock);
14925 else
14926 kmem_free(fn, MAXNAMELEN);
14927
14928 return (err);
14929 }
14930
14931 /*
14932 * Bookkeeping for a close that doesn't need to go over the wire.
14933 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14934 * it is left at 1.
14935 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	/* nothing to do if the stream is gone or is still in use */
	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream, which means the count cannot go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;	/* tell the caller we dropped os_sync_lock */

	nfs4_dec_state_ref_count(mi);
}
14973
14974 /*
14975 * Close all remaining open streams on the rnode. These open streams
14976 * could be here because:
14977 * - The close attempted at either close or delmap failed
14978 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14979 * - Someone did mknod on a regular file but never opened it
14980 */
14981 int
14982 nfs4close_all(vnode_t *vp, cred_t *cr)
14983 {
14984 nfs4_open_stream_t *osp;
14985 int error;
14986 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14987 rnode4_t *rp;
14988
14989 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14990
14991 error = 0;
14992 rp = VTOR4(vp);
14993
14994 /*
14995 * At this point, all we know is that the last time
14996 * someone called vn_rele, the count was 1. Since then,
14997 * the vnode could have been re-activated. We want to
14998 * loop through the open streams and close each one, but
14999 * we have to be careful since once we release the rnode
15000 * hash bucket lock, someone else is free to come in and
15001 * re-activate the rnode and add new open streams. The
15002 * strategy is take the rnode hash bucket lock, verify that
15003 * the count is still 1, grab the open stream off the
15004 * head of the list and mark it invalid, then release the
15005 * rnode hash bucket lock and proceed with that open stream.
15006 * This is ok because nfs4close_one() will acquire the proper
15007 * open/create to close/destroy synchronization for open
15008 * streams, and will ensure that if someone has reopened
15009 * the open stream after we've dropped the hash bucket lock
15010 * then we'll just simply return without destroying the
15011 * open stream.
15012 * Repeat until the list is empty.
15013 */
15014
15015 for (;;) {
15016
15017 /* make sure vnode hasn't been reactivated */
15018 rw_enter(&rp->r_hashq->r_lock, RW_READER);
15019 mutex_enter(&vp->v_lock);
15020 if (vp->v_count > 1) {
15021 mutex_exit(&vp->v_lock);
15022 rw_exit(&rp->r_hashq->r_lock);
15023 break;
15024 }
15025 /*
15026 * Grabbing r_os_lock before releasing v_lock prevents
15027 * a window where the rnode/open stream could get
15028 * reactivated (and os_force_close set to 0) before we
15029 * had a chance to set os_force_close to 1.
15030 */
15031 mutex_enter(&rp->r_os_lock);
15032 mutex_exit(&vp->v_lock);
15033
15034 osp = list_head(&rp->r_open_streams);
15035 if (!osp) {
15036 /* nothing left to CLOSE OTW, so return */
15037 mutex_exit(&rp->r_os_lock);
15038 rw_exit(&rp->r_hashq->r_lock);
15039 break;
15040 }
15041
15042 mutex_enter(&rp->r_statev4_lock);
15043 /* the file can't still be mem mapped */
15044 ASSERT(rp->r_mapcnt == 0);
15045 if (rp->created_v4)
15046 rp->created_v4 = 0;
15047 mutex_exit(&rp->r_statev4_lock);
15048
15049 /*
15050 * Grab a ref on this open stream; nfs4close_one
15051 * will mark it as invalid
15052 */
15053 mutex_enter(&osp->os_sync_lock);
15054 osp->os_ref_count++;
15055 osp->os_force_close = 1;
15056 mutex_exit(&osp->os_sync_lock);
15057 mutex_exit(&rp->r_os_lock);
15058 rw_exit(&rp->r_hashq->r_lock);
15059
15060 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
15061
15062 /* Update error if it isn't already non-zero */
15063 if (error == 0) {
15064 if (e.error)
15065 error = e.error;
15066 else if (e.stat)
15067 error = geterrno4(e.stat);
15068 }
15069
15070 #ifdef DEBUG
15071 nfs4close_all_cnt++;
15072 #endif
15073 /* Release the ref on osp acquired above. */
15074 open_stream_rele(osp, rp);
15075
15076 /* Proceed to the next open stream, if any */
15077 }
15078 return (error);
15079 }
15080
15081 /*
15082 * nfs4close_one - close one open stream for a file if needed.
15083 *
15084 * "close_type" indicates which close path this is:
15085 * CLOSE_NORM: close initiated via VOP_CLOSE.
15086 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15087 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15088 * the close and release of client state for this open stream
15089 * (unless someone else has the open stream open).
15090 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15091 * (e.g., due to abort because of a signal).
15092 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15093 *
15094 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15095 * recovery. Instead, the caller is expected to deal with retries.
15096 *
15097 * The caller can either pass in the osp ('provided_osp') or not.
15098 *
15099 * 'access_bits' represents the access we are closing/downgrading.
15100 *
15101 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15102 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15103 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15104 *
15105 * Errors are returned via the nfs4_error_t.
15106 */
15107 void
15108 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15109 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15110 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15111 uint_t mmap_flags)
15112 {
15113 nfs4_open_owner_t *oop;
15114 nfs4_open_stream_t *osp = NULL;
15115 int retry = 0;
15116 int num_retries = NFS4_NUM_RECOV_RETRIES;
15117 rnode4_t *rp;
15118 mntinfo4_t *mi;
15119 nfs4_recov_state_t recov_state;
15120 cred_t *cred_otw = NULL;
15121 bool_t recovonly = FALSE;
15122 int isrecov;
15123 int force_close;
15124 int close_failed = 0;
15125 int did_dec_count = 0;
15126 int did_start_op = 0;
15127 int did_force_recovlock = 0;
15128 int did_start_seqid_sync = 0;
15129 int have_sync_lock = 0;
15130
15131 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15132
15133 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15134 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15135 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15136 len, maxprot, mmap_flags, access_bits));
15137
15138 nfs4_error_zinit(ep);
15139 rp = VTOR4(vp);
15140 mi = VTOMI4(vp);
15141 isrecov = (close_type == CLOSE_RESEND ||
15142 close_type == CLOSE_AFTER_RESEND);
15143
15144 /*
15145 * First get the open owner.
15146 */
15147 if (!provided_osp) {
15148 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15149 } else {
15150 oop = provided_osp->os_open_owner;
15151 ASSERT(oop != NULL);
15152 open_owner_hold(oop);
15153 }
15154
15155 if (!oop) {
15156 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15157 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15158 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15159 (void *)provided_osp, close_type));
15160 ep->error = EIO;
15161 goto out;
15162 }
15163
15164 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15165 recov_retry:
15166 osp = NULL;
15167 close_failed = 0;
15168 force_close = (close_type == CLOSE_FORCE);
15169 retry = 0;
15170 did_start_op = 0;
15171 did_force_recovlock = 0;
15172 did_start_seqid_sync = 0;
15173 have_sync_lock = 0;
15174 recovonly = FALSE;
15175 recov_state.rs_flags = 0;
15176 recov_state.rs_num_retry_despite_err = 0;
15177
15178 /*
15179 * Second synchronize with recovery.
15180 */
15181 if (!isrecov) {
15182 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15183 &recov_state, &recovonly);
15184 if (!ep->error) {
15185 did_start_op = 1;
15186 } else {
15187 close_failed = 1;
15188 /*
15189 * If we couldn't get start_fop, but have to
15190 * cleanup state, then at least acquire the
15191 * mi_recovlock so we can synchronize with
15192 * recovery.
15193 */
15194 if (close_type == CLOSE_FORCE) {
15195 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15196 RW_READER, FALSE);
15197 did_force_recovlock = 1;
15198 } else
15199 goto out;
15200 }
15201 }
15202
15203 /*
15204 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15205 * set 'recovonly' to TRUE since most likely this is due to
15206 * reovery being active (MI4_RECOV_ACTIV). If recovery is active,
15207 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15208 * to retry, causing us to loop until recovery finishes. Plus we
15209 * don't need protection over the open seqid since we're not going
15210 * OTW, hence don't need to use the seqid.
15211 */
15212 if (recovonly == FALSE) {
15213 /* need to grab the open owner sync before 'os_sync_lock' */
15214 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15215 if (ep->error == EAGAIN) {
15216 ASSERT(!isrecov);
15217 if (did_start_op)
15218 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15219 &recov_state, TRUE);
15220 if (did_force_recovlock)
15221 nfs_rw_exit(&mi->mi_recovlock);
15222 goto recov_retry;
15223 }
15224 did_start_seqid_sync = 1;
15225 }
15226
15227 /*
15228 * Third get an open stream and acquire 'os_sync_lock' to
15229 * sychronize the opening/creating of an open stream with the
15230 * closing/destroying of an open stream.
15231 */
15232 if (!provided_osp) {
15233 /* returns with 'os_sync_lock' held */
15234 osp = find_open_stream(oop, rp);
15235 if (!osp) {
15236 ep->error = EIO;
15237 goto out;
15238 }
15239 } else {
15240 osp = provided_osp;
15241 open_stream_hold(osp);
15242 mutex_enter(&osp->os_sync_lock);
15243 }
15244 have_sync_lock = 1;
15245
15246 ASSERT(oop == osp->os_open_owner);
15247
15248 /*
15249 * Fourth, do any special pre-OTW CLOSE processing
15250 * based on the specific close type.
15251 */
15252 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15253 !did_dec_count) {
15254 ASSERT(osp->os_open_ref_count > 0);
15255 osp->os_open_ref_count--;
15256 did_dec_count = 1;
15257 if (osp->os_open_ref_count == 0)
15258 osp->os_final_close = 1;
15259 }
15260
15261 if (close_type == CLOSE_FORCE) {
15262 /* see if somebody reopened the open stream. */
15263 if (!osp->os_force_close) {
15264 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15265 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15266 "was reopened, vp %p", (void *)osp, (void *)vp));
15267 ep->error = 0;
15268 ep->stat = NFS4_OK;
15269 goto out;
15270 }
15271
15272 if (!osp->os_final_close && !did_dec_count) {
15273 osp->os_open_ref_count--;
15274 did_dec_count = 1;
15275 }
15276
15277 /*
15278 * We can't depend on os_open_ref_count being 0 due to the
15279 * way executables are opened (VN_RELE to match a VOP_OPEN).
15280 */
15281 #ifdef NOTYET
15282 ASSERT(osp->os_open_ref_count == 0);
15283 #endif
15284 if (osp->os_open_ref_count != 0) {
15285 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15286 "nfs4close_one: should panic here on an "
15287 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15288 "since this is probably the exec problem."));
15289
15290 osp->os_open_ref_count = 0;
15291 }
15292
15293 /*
15294 * There is the possibility that nfs4close_one()
15295 * for close_type == CLOSE_DELMAP couldn't find the
15296 * open stream, thus couldn't decrement its os_mapcnt;
15297 * therefore we can't use this ASSERT yet.
15298 */
15299 #ifdef NOTYET
15300 ASSERT(osp->os_mapcnt == 0);
15301 #endif
15302 osp->os_mapcnt = 0;
15303 }
15304
15305 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15306 ASSERT(osp->os_mapcnt >= btopr(len));
15307
15308 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15309 osp->os_mmap_write -= btopr(len);
15310 if (maxprot & PROT_READ)
15311 osp->os_mmap_read -= btopr(len);
15312 if (maxprot & PROT_EXEC)
15313 osp->os_mmap_read -= btopr(len);
15314 /* mirror the PROT_NONE check in nfs4_addmap() */
15315 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15316 !(maxprot & PROT_EXEC))
15317 osp->os_mmap_read -= btopr(len);
15318 osp->os_mapcnt -= btopr(len);
15319 did_dec_count = 1;
15320 }
15321
15322 if (recovonly) {
15323 nfs4_lost_rqst_t lost_rqst;
15324
15325 /* request should not already be in recovery queue */
15326 ASSERT(lrp == NULL);
15327 nfs4_error_init(ep, EINTR);
15328 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15329 osp, cred_otw, vp);
15330 mutex_exit(&osp->os_sync_lock);
15331 have_sync_lock = 0;
15332 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15333 lost_rqst.lr_op == OP_CLOSE ?
15334 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15335 close_failed = 1;
15336 force_close = 0;
15337 goto close_cleanup;
15338 }
15339
15340 /*
15341 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15342 * we stopped operating on the open owner's <old oo_name, old seqid>
15343 * space, which means we stopped operating on the open stream
15344 * too. So don't go OTW (as the seqid is likely bad, and the
15345 * stateid could be stale, potentially triggering a false
15346 * setclientid), and just clean up the client's internal state.
15347 */
15348 if (osp->os_orig_oo_name != oop->oo_name) {
15349 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15350 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15351 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15352 "oo_name %" PRIx64")",
15353 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15354 oop->oo_name));
15355 close_failed = 1;
15356 }
15357
15358 /* If the file failed recovery, just quit. */
15359 mutex_enter(&rp->r_statelock);
15360 if (rp->r_flags & R4RECOVERR) {
15361 close_failed = 1;
15362 }
15363 mutex_exit(&rp->r_statelock);
15364
15365 /*
15366 * If the force close path failed to obtain start_fop
15367 * then skip the OTW close and just remove the state.
15368 */
15369 if (close_failed)
15370 goto close_cleanup;
15371
15372 /*
15373 * Fifth, check to see if there are still mapped pages or other
15374 * opens using this open stream. If there are then we can't
15375 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15376 */
15377 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15378 nfs4_lost_rqst_t new_lost_rqst;
15379 bool_t needrecov = FALSE;
15380 cred_t *odg_cred_otw = NULL;
15381 seqid4 open_dg_seqid = 0;
15382
15383 if (osp->os_delegation) {
15384 /*
15385 * If this open stream was never OPENed OTW then we
15386 * surely can't DOWNGRADE it (especially since the
15387 * osp->open_stateid is really a delegation stateid
15388 * when os_delegation is 1).
15389 */
15390 if (access_bits & FREAD)
15391 osp->os_share_acc_read--;
15392 if (access_bits & FWRITE)
15393 osp->os_share_acc_write--;
15394 osp->os_share_deny_none--;
15395 nfs4_error_zinit(ep);
15396 goto out;
15397 }
15398 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15399 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15400 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15401 if (needrecov && !isrecov) {
15402 bool_t abort;
15403 nfs4_bseqid_entry_t *bsep = NULL;
15404
15405 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15406 bsep = nfs4_create_bseqid_entry(oop, NULL,
15407 vp, 0,
15408 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15409 open_dg_seqid);
15410
15411 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15412 oop, osp, odg_cred_otw, vp, access_bits, 0);
15413 mutex_exit(&osp->os_sync_lock);
15414 have_sync_lock = 0;
15415 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15416 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15417 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15418 bsep, NULL, NULL);
15419 if (odg_cred_otw)
15420 crfree(odg_cred_otw);
15421 if (bsep)
15422 kmem_free(bsep, sizeof (*bsep));
15423
15424 if (abort == TRUE)
15425 goto out;
15426
15427 if (did_start_seqid_sync) {
15428 nfs4_end_open_seqid_sync(oop);
15429 did_start_seqid_sync = 0;
15430 }
15431 open_stream_rele(osp, rp);
15432
15433 if (did_start_op)
15434 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15435 &recov_state, FALSE);
15436 if (did_force_recovlock)
15437 nfs_rw_exit(&mi->mi_recovlock);
15438
15439 goto recov_retry;
15440 } else {
15441 if (odg_cred_otw)
15442 crfree(odg_cred_otw);
15443 }
15444 goto out;
15445 }
15446
15447 /*
15448 * If this open stream was created as the results of an open
15449 * while holding a delegation, then just release it; no need
15450 * to do an OTW close. Otherwise do a "normal" OTW close.
15451 */
15452 if (osp->os_delegation) {
15453 nfs4close_notw(vp, osp, &have_sync_lock);
15454 nfs4_error_zinit(ep);
15455 goto out;
15456 }
15457
15458 /*
15459 * If this stream is not valid, we're done.
15460 */
15461 if (!osp->os_valid) {
15462 nfs4_error_zinit(ep);
15463 goto out;
15464 }
15465
15466 /*
15467 * Last open or mmap ref has vanished, need to do an OTW close.
15468 * First check to see if a close is still necessary.
15469 */
15470 if (osp->os_failed_reopen) {
15471 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15472 "don't close OTW osp %p since reopen failed.",
15473 (void *)osp));
15474 /*
15475 * Reopen of the open stream failed, hence the
15476 * stateid of the open stream is invalid/stale, and
15477 * sending this OTW would incorrectly cause another
15478 * round of recovery. In this case, we need to set
15479 * the 'os_valid' bit to 0 so another thread doesn't
15480 * come in and re-open this open stream before
15481 * this "closing" thread cleans up state (decrementing
15482 * the nfs4_server_t's state_ref_count and decrementing
15483 * the os_ref_count).
15484 */
15485 osp->os_valid = 0;
15486 /*
15487 * This removes the reference obtained at OPEN; ie,
15488 * when the open stream structure was created.
15489 *
15490 * We don't have to worry about calling 'open_stream_rele'
15491 * since we our currently holding a reference to this
15492 * open stream which means the count can not go to 0 with
15493 * this decrement.
15494 */
15495 ASSERT(osp->os_ref_count >= 2);
15496 osp->os_ref_count--;
15497 nfs4_error_zinit(ep);
15498 close_failed = 0;
15499 goto close_cleanup;
15500 }
15501
15502 ASSERT(osp->os_ref_count > 1);
15503
15504 /*
15505 * Sixth, try the CLOSE OTW.
15506 */
15507 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15508 close_type, ep, &have_sync_lock);
15509
15510 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15511 /*
15512 * Let the recovery thread be responsible for
15513 * removing the state for CLOSE.
15514 */
15515 close_failed = 1;
15516 force_close = 0;
15517 retry = 0;
15518 }
15519
15520 /* See if we need to retry with a different cred */
15521 if ((ep->error == EACCES ||
15522 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15523 cred_otw != cr) {
15524 crfree(cred_otw);
15525 cred_otw = cr;
15526 crhold(cred_otw);
15527 retry = 1;
15528 }
15529
15530 if (ep->error || ep->stat)
15531 close_failed = 1;
15532
15533 if (retry && !isrecov && num_retries-- > 0) {
15534 if (have_sync_lock) {
15535 mutex_exit(&osp->os_sync_lock);
15536 have_sync_lock = 0;
15537 }
15538 if (did_start_seqid_sync) {
15539 nfs4_end_open_seqid_sync(oop);
15540 did_start_seqid_sync = 0;
15541 }
15542 open_stream_rele(osp, rp);
15543
15544 if (did_start_op)
15545 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15546 &recov_state, FALSE);
15547 if (did_force_recovlock)
15548 nfs_rw_exit(&mi->mi_recovlock);
15549 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15550 "nfs4close_one: need to retry the close "
15551 "operation"));
15552 goto recov_retry;
15553 }
15554 close_cleanup:
15555 /*
15556 * Seventh and lastly, process our results.
15557 */
15558 if (close_failed && force_close) {
15559 /*
15560 * It's ok to drop and regrab the 'os_sync_lock' since
15561 * nfs4close_notw() will recheck to make sure the
15562 * "close"/removal of state should happen.
15563 */
15564 if (!have_sync_lock) {
15565 mutex_enter(&osp->os_sync_lock);
15566 have_sync_lock = 1;
15567 }
15568 /*
15569 * This is last call, remove the ref on the open
15570 * stream created by open and clean everything up.
15571 */
15572 osp->os_pending_close = 0;
15573 nfs4close_notw(vp, osp, &have_sync_lock);
15574 nfs4_error_zinit(ep);
15575 }
15576
15577 if (!close_failed) {
15578 if (have_sync_lock) {
15579 osp->os_pending_close = 0;
15580 mutex_exit(&osp->os_sync_lock);
15581 have_sync_lock = 0;
15582 } else {
15583 mutex_enter(&osp->os_sync_lock);
15584 osp->os_pending_close = 0;
15585 mutex_exit(&osp->os_sync_lock);
15586 }
15587 if (did_start_op && recov_state.rs_sp != NULL) {
15588 mutex_enter(&recov_state.rs_sp->s_lock);
15589 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15590 mutex_exit(&recov_state.rs_sp->s_lock);
15591 } else {
15592 nfs4_dec_state_ref_count(mi);
15593 }
15594 nfs4_error_zinit(ep);
15595 }
15596
15597 out:
15598 if (have_sync_lock)
15599 mutex_exit(&osp->os_sync_lock);
15600 if (did_start_op)
15601 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15602 recovonly ? TRUE : FALSE);
15603 if (did_force_recovlock)
15604 nfs_rw_exit(&mi->mi_recovlock);
15605 if (cred_otw)
15606 crfree(cred_otw);
15607 if (osp)
15608 open_stream_rele(osp, rp);
15609 if (oop) {
15610 if (did_start_seqid_sync)
15611 nfs4_end_open_seqid_sync(oop);
15612 open_owner_rele(oop);
15613 }
15614 }
15615
15616 /*
15617 * Convert information returned by the server in the LOCK4denied
15618 * structure to the form required by fcntl.
15619 */
15620 static void
15621 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15622 {
15623 nfs4_lo_name_t *lo;
15624
15625 #ifdef DEBUG
15626 if (denied_to_flk_debug) {
15627 lockt_denied_debug = lockt_denied;
15628 debug_enter("lockt_denied");
15629 }
15630 #endif
15631
15632 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15633 flk->l_whence = 0; /* aka SEEK_SET */
15634 flk->l_start = lockt_denied->offset;
15635 flk->l_len = lockt_denied->length;
15636
15637 /*
15638 * If the blocking clientid matches our client id, then we can
15639 * interpret the lockowner (since we built it). If not, then
15640 * fabricate a sysid and pid. Note that the l_sysid field
15641 * in *flk already has the local sysid.
15642 */
15643
15644 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15645
15646 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15647 lo = (nfs4_lo_name_t *)
15648 lockt_denied->owner.owner_val;
15649
15650 flk->l_pid = lo->ln_pid;
15651 } else {
15652 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15653 "denied_to_flk: bad lock owner length\n"));
15654
15655 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15656 }
15657 } else {
15658 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15659 "denied_to_flk: foreign clientid\n"));
15660
15661 /*
15662 * Construct a new sysid which should be different from
15663 * sysids of other systems.
15664 */
15665
15666 flk->l_sysid++;
15667 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15668 }
15669 }
15670
15671 static pid_t
15672 lo_to_pid(lock_owner4 *lop)
15673 {
15674 pid_t pid = 0;
15675 uchar_t *cp;
15676 int i;
15677
15678 cp = (uchar_t *)&lop->clientid;
15679
15680 for (i = 0; i < sizeof (lop->clientid); i++)
15681 pid += (pid_t)*cp++;
15682
15683 cp = (uchar_t *)lop->owner_val;
15684
15685 for (i = 0; i < lop->owner_len; i++)
15686 pid += (pid_t)*cp++;
15687
15688 return (pid);
15689 }
15690
15691 /*
15692 * Given a lock pointer, returns the length of that lock.
15693 * "end" is the last locked offset the "l_len" covers from
15694 * the start of the lock.
15695 */
15696 static off64_t
15697 lock_to_end(flock64_t *lock)
15698 {
15699 off64_t lock_end;
15700
15701 if (lock->l_len == 0)
15702 lock_end = (off64_t)MAXEND;
15703 else
15704 lock_end = lock->l_start + lock->l_len - 1;
15705
15706 return (lock_end);
15707 }
15708
15709 /*
15710 * Given the end of a lock, it will return you the length "l_len" for that lock.
15711 */
15712 static off64_t
15713 end_to_len(off64_t start, off64_t end)
15714 {
15715 off64_t lock_len;
15716
15717 ASSERT(end >= start);
15718 if (end == MAXEND)
15719 lock_len = 0;
15720 else
15721 lock_len = end - start + 1;
15722
15723 return (lock_len);
15724 }
15725
15726 /*
15727 * On given end for a lock it determines if it is the last locked offset
15728 * or not, if so keeps it as is, else adds one to return the length for
15729 * valid start.
15730 */
15731 static off64_t
15732 start_check(off64_t x)
15733 {
15734 if (x == MAXEND)
15735 return (x);
15736 else
15737 return (x + 1);
15738 }
15739
15740 /*
15741 * See if these two locks overlap, and if so return 1;
15742 * otherwise, return 0.
15743 */
15744 static int
15745 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15746 {
15747 off64_t llfp_end, curfp_end;
15748
15749 llfp_end = lock_to_end(llfp);
15750 curfp_end = lock_to_end(curfp);
15751
15752 if (((llfp_end >= curfp->l_start) &&
15753 (llfp->l_start <= curfp->l_start)) ||
15754 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15755 return (1);
15756 return (0);
15757 }
15758
15759 /*
15760 * Determine what the intersecting lock region is, and add that to the
15761 * 'nl_llpp' locklist in increasing order (by l_start).
15762 */
15763 static void
15764 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15765 locklist_t **nl_llpp, vnode_t *vp)
15766 {
15767 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15768 off64_t lost_flp_end, local_flp_end, len, start;
15769
15770 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15771
15772 if (!locks_intersect(lost_flp, local_flp))
15773 return;
15774
15775 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15776 "locks intersect"));
15777
15778 lost_flp_end = lock_to_end(lost_flp);
15779 local_flp_end = lock_to_end(local_flp);
15780
15781 /* Find the starting point of the intersecting region */
15782 if (local_flp->l_start > lost_flp->l_start)
15783 start = local_flp->l_start;
15784 else
15785 start = lost_flp->l_start;
15786
15787 /* Find the lenght of the intersecting region */
15788 if (lost_flp_end < local_flp_end)
15789 len = end_to_len(start, lost_flp_end);
15790 else
15791 len = end_to_len(start, local_flp_end);
15792
15793 /*
15794 * Prepare the flock structure for the intersection found and insert
15795 * it into the new list in increasing l_start order. This list contains
15796 * intersections of locks registered by the client with the local host
15797 * and the lost lock.
15798 * The lock type of this lock is the same as that of the local_flp.
15799 */
15800 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15801 intersect_llp->ll_flock.l_start = start;
15802 intersect_llp->ll_flock.l_len = len;
15803 intersect_llp->ll_flock.l_type = local_flp->l_type;
15804 intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15805 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15806 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */
15807 intersect_llp->ll_vp = vp;
15808
15809 tmp_fllp = *nl_llpp;
15810 cur_fllp = NULL;
15811 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15812 intersect_llp->ll_flock.l_start) {
15813 cur_fllp = tmp_fllp;
15814 tmp_fllp = tmp_fllp->ll_next;
15815 }
15816 if (cur_fllp == NULL) {
15817 /* first on the list */
15818 intersect_llp->ll_next = *nl_llpp;
15819 *nl_llpp = intersect_llp;
15820 } else {
15821 intersect_llp->ll_next = cur_fllp->ll_next;
15822 cur_fllp->ll_next = intersect_llp;
15823 }
15824
15825 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15826 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15827 intersect_llp->ll_flock.l_start,
15828 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15829 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15830 }
15831
15832 /*
15833 * Our local locking current state is potentially different than
15834 * what the NFSv4 server thinks we have due to a lost lock that was
15835 * resent and then received. We need to reset our "NFSv4" locking
15836 * state to match the current local locking state for this pid since
15837 * that is what the user/application sees as what the world is.
15838 *
15839 * We cannot afford to drop the open/lock seqid sync since then we can
15840 * get confused about what the current local locking state "is" versus
15841 * "was".
15842 *
15843 * If we are unable to fix up the locks, we send SIGLOST to the affected
15844 * process. This is not done if the filesystem has been forcibly
15845 * unmounted, in case the process has already exited and a new process
15846 * exists with the same pid.
15847 */
15848 static void
15849 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15850 nfs4_lock_owner_t *lop)
15851 {
15852 locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15853 mntinfo4_t *mi = VTOMI4(vp);
15854 const int cmd = F_SETLK;
15855 off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15856 flock64_t ul_fl;
15857
15858 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15859 "nfs4_reinstitute_local_lock_state"));
15860
15861 /*
15862 * Find active locks for this vp from the local locking code.
15863 * Scan through this list and find out the locks that intersect with
15864 * the lost lock. Once we find the lock that intersects, add the
15865 * intersection area as a new lock to a new list "ri_llp". The lock
15866 * type of the intersection region lock added to ri_llp is the same
15867 * as that found in the active lock list, "list". The intersecting
15868 * region locks are added to ri_llp in increasing l_start order.
15869 */
15870 ASSERT(nfs_zone() == mi->mi_zone);
15871
15872 locks = flk_active_locks_for_vp(vp);
15873 ri_llp = NULL;
15874
15875 for (llp = locks; llp != NULL; llp = llp->ll_next) {
15876 ASSERT(llp->ll_vp == vp);
15877 /*
15878 * Pick locks that belong to this pid/lockowner
15879 */
15880 if (llp->ll_flock.l_pid != lost_flp->l_pid)
15881 continue;
15882
15883 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15884 }
15885
15886 /*
15887 * Now we have the list of intersections with the lost lock. These are
15888 * the locks that were/are active before the server replied to the
15889 * last/lost lock. Issue these locks to the server here. Playing these
15890 * locks to the server will re-establish our current local locking state
15891 * with the v4 server.
15892 * If we get an error, send SIGLOST to the application for that lock.
15893 */
15894
15895 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15896 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15897 "nfs4_reinstitute_local_lock_state: need to issue "
15898 "flock: [%"PRIx64" - %"PRIx64"] : %s",
15899 llp->ll_flock.l_start,
15900 llp->ll_flock.l_start + llp->ll_flock.l_len,
15901 llp->ll_flock.l_type == F_RDLCK ? "READ" :
15902 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15903 /*
15904 * No need to relock what we already have
15905 */
15906 if (llp->ll_flock.l_type == lost_flp->l_type)
15907 continue;
15908
15909 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15910 }
15911
15912 /*
15913 * Now keeping the start of the lost lock as our reference parse the
15914 * newly created ri_llp locklist to find the ranges that we have locked
15915 * with the v4 server but not in the current local locking. We need
15916 * to unlock these ranges.
15917 * These ranges can also be reffered to as those ranges, where the lost
15918 * lock does not overlap with the locks in the ri_llp but are locked
15919 * since the server replied to the lost lock.
15920 */
15921 cur_start = lost_flp->l_start;
15922 lost_flp_end = lock_to_end(lost_flp);
15923
15924 ul_fl.l_type = F_UNLCK;
15925 ul_fl.l_whence = 0; /* aka SEEK_SET */
15926 ul_fl.l_sysid = lost_flp->l_sysid;
15927 ul_fl.l_pid = lost_flp->l_pid;
15928
15929 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15930 llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15931
15932 if (llp->ll_flock.l_start <= cur_start) {
15933 cur_start = start_check(llp_ll_flock_end);
15934 continue;
15935 }
15936 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15937 "nfs4_reinstitute_local_lock_state: "
15938 "UNLOCK [%"PRIx64" - %"PRIx64"]",
15939 cur_start, llp->ll_flock.l_start));
15940
15941 ul_fl.l_start = cur_start;
15942 ul_fl.l_len = end_to_len(cur_start,
15943 (llp->ll_flock.l_start - 1));
15944
15945 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15946 cur_start = start_check(llp_ll_flock_end);
15947 }
15948
15949 /*
15950 * In the case where the lost lock ends after all intersecting locks,
15951 * unlock the last part of the lost lock range.
15952 */
15953 if (cur_start != start_check(lost_flp_end)) {
15954 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15955 "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15956 "lost lock region [%"PRIx64" - %"PRIx64"]",
15957 cur_start, lost_flp->l_start + lost_flp->l_len));
15958
15959 ul_fl.l_start = cur_start;
15960 /*
15961 * Is it an to-EOF lock? if so unlock till the end
15962 */
15963 if (lost_flp->l_len == 0)
15964 ul_fl.l_len = 0;
15965 else
15966 ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15967
15968 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15969 }
15970
15971 if (locks != NULL)
15972 flk_free_locklist(locks);
15973
15974 /* Free up our newly created locklist */
15975 for (llp = ri_llp; llp != NULL; ) {
15976 tmp_llp = llp->ll_next;
15977 kmem_free(llp, sizeof (locklist_t));
15978 llp = tmp_llp;
15979 }
15980
15981 /*
15982 * Now return back to the original calling nfs4frlock()
15983 * and let us naturally drop our seqid syncs.
15984 */
15985 }
15986
15987 /*
15988 * Create a lost state record for the given lock reinstantiation request
15989 * and push it onto the lost state queue.
15990 */
15991 static void
15992 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15993 nfs4_lock_owner_t *lop)
15994 {
15995 nfs4_lost_rqst_t req;
15996 nfs_lock_type4 locktype;
15997 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15998
15999 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
16000
16001 locktype = flk_to_locktype(cmd, flk->l_type);
16002 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
16003 NULL, NULL, lop, flk, &req, cr, vp);
16004 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
16005 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
16006 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
16007 NULL, NULL, NULL);
16008 }