1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2017 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 * All Rights Reserved
30 */
31
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/thread.h>
36 #include <sys/t_lock.h>
37 #include <sys/time.h>
38 #include <sys/vnode.h>
39 #include <sys/vfs.h>
40 #include <sys/errno.h>
41 #include <sys/buf.h>
42 #include <sys/stat.h>
43 #include <sys/cred.h>
44 #include <sys/kmem.h>
45 #include <sys/debug.h>
46 #include <sys/dnlc.h>
47 #include <sys/vmsystm.h>
48 #include <sys/flock.h>
49 #include <sys/share.h>
50 #include <sys/cmn_err.h>
51 #include <sys/tiuser.h>
52 #include <sys/sysmacros.h>
53 #include <sys/callb.h>
54 #include <sys/acl.h>
55 #include <sys/kstat.h>
56 #include <sys/signal.h>
57 #include <sys/disp.h>
58 #include <sys/atomic.h>
59 #include <sys/list.h>
60 #include <sys/sdt.h>
61
62 #include <rpc/types.h>
63 #include <rpc/xdr.h>
64 #include <rpc/auth.h>
65 #include <rpc/clnt.h>
66
67 #include <nfs/nfs.h>
68 #include <nfs/nfs_clnt.h>
69 #include <nfs/nfs_acl.h>
70
71 #include <nfs/nfs4.h>
72 #include <nfs/rnode4.h>
73 #include <nfs/nfs4_clnt.h>
74
75 #include <vm/hat.h>
76 #include <vm/as.h>
77 #include <vm/page.h>
78 #include <vm/pvn.h>
79 #include <vm/seg.h>
80 #include <vm/seg_map.h>
81 #include <vm/seg_vn.h>
82
83 #include <sys/ddi.h>
84
85 /*
86 * Arguments to page-flush thread.
87 */
88 typedef struct {
89 vnode_t *vp;
90 cred_t *cr;
91 } pgflush_t;
92
93 #ifdef DEBUG
94 int nfs4_client_lease_debug;
95 int nfs4_sharedfh_debug;
96 int nfs4_fname_debug;
97
98 /* temporary: panic if v_type is inconsistent with r_attr va_type */
99 int nfs4_vtype_debug;
100
101 uint_t nfs4_tsd_key;
102 #endif
103
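/*
 * CPR (suspend/resume) state used by nfs4_client_cpr_callb(): the time at
 * which the client last resumed, and the id of the registered callback.
 */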
104 static time_t nfs4_client_resumed = 0;
105 static callb_id_t cid = 0;
106
107 static int nfs4renew(nfs4_server_t *);
108 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
109 static void nfs4_pgflush_thread(pgflush_t *);
110
111 static boolean_t nfs4_client_cpr_callb(void *, int);
112
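/*
 * Per-zone bookkeeping for NFSv4 mounts; an instance is associated with
 * each zone via mi4_list_key (declared below).
 */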
113 struct mi4_globals {
114 kmutex_t mig_lock; /* lock protecting mig_list */
115 list_t mig_list; /* list of NFS v4 mounts in zone */
116 boolean_t mig_destructor_called;
117 };
118
119 static zone_key_t mi4_list_key;
120
121 /*
122 * Attributes caching:
123 *
124 * Attributes are cached in the rnode in struct vattr form.
125 * There is a time associated with the cached attributes (r_time_attr_inval)
126 * which tells whether the attributes are valid. The time is initialized
127 * to the difference between current time and the modify time of the vnode
128 * when new attributes are cached. This allows the attributes for
129 * files that have changed recently to be timed out sooner than for files
130 * that have not changed for a long time. There are minimum and maximum
131 * timeout values that can be set per mount point.
132 */
133
134 /*
135 * If a cache purge is in progress, wait for it to finish.
136 *
137 * The current thread must not be in the middle of an
138 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
139 * between this thread, a recovery thread, and the page flush thread.
140 */
141 int
142 nfs4_waitfor_purge_complete(vnode_t *vp)
143 {
144 rnode4_t *rp;
145 k_sigset_t smask;
146
147 rp = VTOR4(vp);
148 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
149 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
150 mutex_enter(&rp->r_statelock);
151 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
152 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
153 ((rp->r_flags & R4PGFLUSH) &&
154 rp->r_pgflush != curthread)) {
155 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
156 sigunintr(&smask);
157 mutex_exit(&rp->r_statelock);
158 return (EINTR);
159 }
160 }
161 sigunintr(&smask);
162 mutex_exit(&rp->r_statelock);
163 }
164 return (0);
165 }
166
167 /*
168 * Validate caches by checking cached attributes. If they have timed out,
169 * then get new attributes from the server. As a side effect, cache
170 * invalidation is done if the attributes have changed.
171 *
172 * If the attributes have not timed out and if there is a cache
173 * invalidation being done by some other thread, then wait until that
174 * thread has completed the cache invalidation.
175 */
176 int
177 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
178 {
179 int error;
180 nfs4_ga_res_t gar;
181
182 if (ATTRCACHE4_VALID(vp)) {
183 error = nfs4_waitfor_purge_complete(vp);
184 if (error)
185 return (error);
186 return (0);
187 }
188
189 return (nfs4_getattr_otw(vp, &gar, cr, 0));
190 }
191
192 /*
193  * Fill in the attributes from the cache.
194 * If valid, then return 0 to indicate that no error occurred,
195 * otherwise return 1 to indicate that an error occurred.
196 */
197 static int
198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
199 {
200 rnode4_t *rp;
201
202 rp = VTOR4(vp);
203 mutex_enter(&rp->r_statelock);
204 mutex_enter(&rp->r_statev4_lock);
205 if (ATTRCACHE4_VALID(vp)) {
206 mutex_exit(&rp->r_statev4_lock);
207 /*
208 * Cached attributes are valid
209 */
210 *vap = rp->r_attr;
211 mutex_exit(&rp->r_statelock);
212 return (0);
213 }
214 mutex_exit(&rp->r_statev4_lock);
215 mutex_exit(&rp->r_statelock);
216 return (1);
217 }
218
219
220 /*
221  * If the returned error is ESTALE, flush all caches. The nfs4_purge_caches()
222  * call can be synchronous because all the pages were already invalidated
223  * by the nfs4_invalidate_pages() call.
224 */
225 void
226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
227 {
228 struct rnode4 *rp = VTOR4(vp);
229
230 /* Ensure that the ..._end_op() call has been done */
231 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
232
233 if (errno != ESTALE)
234 return;
235
236 mutex_enter(&rp->r_statelock);
237 rp->r_flags |= R4STALE;
238 if (!rp->r_error)
239 rp->r_error = errno;
240 mutex_exit(&rp->r_statelock);
241 if (nfs4_has_pages(vp))
242 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
243 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
244 }
245
246 /*
247 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
248 * page purge is done asynchronously.
249 */
250 void
251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
252 {
253 rnode4_t *rp;
254 char *contents;
255 vnode_t *xattr;
256 int size;
257 int pgflush; /* are we the page flush thread? */
258
259 /*
260 * Purge the DNLC for any entries which refer to this file.
261 */
262 if (vp->v_count > 1 &&
263 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
264 dnlc_purge_vp(vp);
265
266 /*
267 * Clear any readdir state bits and purge the readlink response cache.
268 */
269 rp = VTOR4(vp);
270 mutex_enter(&rp->r_statelock);
271 rp->r_flags &= ~R4LOOKUP;
272 contents = rp->r_symlink.contents;
273 size = rp->r_symlink.size;
274 rp->r_symlink.contents = NULL;
275
276 xattr = rp->r_xattr_dir;
277 rp->r_xattr_dir = NULL;
278
279 /*
280 * Purge pathconf cache too.
281 */
282 rp->r_pathconf.pc4_xattr_valid = 0;
283 rp->r_pathconf.pc4_cache_valid = 0;
284
285 pgflush = (curthread == rp->r_pgflush);
286 mutex_exit(&rp->r_statelock);
287
288 if (contents != NULL) {
290 kmem_free((void *)contents, size);
291 }
292
293 if (xattr != NULL)
294 VN_RELE(xattr);
295
296 /*
297 * Flush the page cache. If the current thread is the page flush
298 * thread, don't initiate a new page flush. There's no need for
299 * it, and doing it correctly is hard.
300 */
301 if (nfs4_has_pages(vp) && !pgflush) {
302 if (!asyncpg) {
303 (void) nfs4_waitfor_purge_complete(vp);
304 nfs4_flush_pages(vp, cr);
305 } else {
306 pgflush_t *args;
307
308 /*
309 * We don't hold r_statelock while creating the
310 * thread, in case the call blocks. So we use a
311 * flag to indicate that a page flush thread is
312 * active.
313 */
314 mutex_enter(&rp->r_statelock);
315 if (rp->r_flags & R4PGFLUSH) {
316 mutex_exit(&rp->r_statelock);
317 } else {
318 rp->r_flags |= R4PGFLUSH;
319 mutex_exit(&rp->r_statelock);
320
321 args = kmem_alloc(sizeof (pgflush_t),
322 KM_SLEEP);
323 args->vp = vp;
324 VN_HOLD(args->vp);
325 args->cr = cr;
326 crhold(args->cr);
327 (void) zthread_create(NULL, 0,
328 nfs4_pgflush_thread, args, 0,
329 minclsyspri);
330 }
331 }
332 }
333
334 /*
335 * Flush the readdir response cache.
336 */
337 nfs4_purge_rddir_cache(vp);
338 }
339
340 /*
341 * Invalidate all pages for the given file, after writing back the dirty
342 * ones.
343 */
344
345 void
346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
347 {
348 int error;
349 rnode4_t *rp = VTOR4(vp);
350
351 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
352 if (error == ENOSPC || error == EDQUOT) {
353 mutex_enter(&rp->r_statelock);
354 if (!rp->r_error)
355 rp->r_error = error;
356 mutex_exit(&rp->r_statelock);
357 }
358 }
359
360 /*
361 * Page flush thread.
362 */
363
364 static void
365 nfs4_pgflush_thread(pgflush_t *args)
366 {
367 rnode4_t *rp = VTOR4(args->vp);
368
369 /* remember which thread we are, so we don't deadlock ourselves */
370 mutex_enter(&rp->r_statelock);
371 ASSERT(rp->r_pgflush == NULL);
372 rp->r_pgflush = curthread;
373 mutex_exit(&rp->r_statelock);
374
375 nfs4_flush_pages(args->vp, args->cr);
376
377 mutex_enter(&rp->r_statelock);
378 rp->r_pgflush = NULL;
379 rp->r_flags &= ~R4PGFLUSH;
380 cv_broadcast(&rp->r_cv);
381 mutex_exit(&rp->r_statelock);
382
383 VN_RELE(args->vp);
384 crfree(args->cr);
385 kmem_free(args, sizeof (pgflush_t));
386 zthread_exit();
387 }
388
389 /*
390 * Purge the readdir cache of all entries which are not currently
391 * being filled.
392 */
393 void
394 nfs4_purge_rddir_cache(vnode_t *vp)
395 {
396 rnode4_t *rp;
397
398 rp = VTOR4(vp);
399
400 mutex_enter(&rp->r_statelock);
401 rp->r_direof = NULL;
402 rp->r_flags &= ~R4LOOKUP;
403 rp->r_flags |= R4READDIRWATTR;
404 rddir4_cache_purge(rp);
405 mutex_exit(&rp->r_statelock);
406 }
407
408 /*
409 * Set attributes cache for given vnode using virtual attributes. There is
410 * no cache validation, but if the attributes are deemed to be stale, they
411 * are ignored. This corresponds to nfs3_attrcache().
412 *
413 * Set the timeout value on the attribute cache and fill it
414 * with the passed in attributes.
415 */
416 void
417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
418 {
419 rnode4_t *rp = VTOR4(vp);
420
421 mutex_enter(&rp->r_statelock);
422 if (rp->r_time_attr_saved <= t)
423 nfs4_attrcache_va(vp, garp, FALSE);
424 mutex_exit(&rp->r_statelock);
425 }
426
427 /*
428 * Use the passed in virtual attributes to check to see whether the
429 * data and metadata caches are valid, cache the new attributes, and
430 * then do the cache invalidation if required.
431 *
432 * The cache validation and caching of the new attributes is done
433 * atomically via the use of the mutex, r_statelock. If required,
434 * the cache invalidation is done atomically w.r.t. the cache
435 * validation and caching of the attributes via the pseudo lock,
436 * r_serial.
437 *
438 * This routine is used to do cache validation and attributes caching
439 * for operations with a single set of post operation attributes.
440 */
441
442 void
443 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
444 hrtime_t t, cred_t *cr, int async,
445 change_info4 *cinfo)
446 {
447 rnode4_t *rp;
448 int mtime_changed = 0;
449 int ctime_changed = 0;
450 vsecattr_t *vsp;
451 int was_serial, set_time_cache_inval, recov;
452 vattr_t *vap = &garp->n4g_va;
453 mntinfo4_t *mi = VTOMI4(vp);
454 len_t preattr_rsize;
455 boolean_t writemodify_set = B_FALSE;
456 boolean_t cachepurge_set = B_FALSE;
457
458 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
459
460 /* Is curthread the recovery thread? */
461 mutex_enter(&mi->mi_lock);
462 recov = (VTOMI4(vp)->mi_recovthread == curthread);
463 mutex_exit(&mi->mi_lock);
464
465 rp = VTOR4(vp);
466 mutex_enter(&rp->r_statelock);
467 was_serial = (rp->r_serial == curthread);
468 if (rp->r_serial && !was_serial) {
469 klwp_t *lwp = ttolwp(curthread);
470
471 /*
472 * If we're the recovery thread, then purge current attrs
473 * and bail out to avoid potential deadlock between another
474 * thread caching attrs (r_serial thread), recov thread,
475 * and an async writer thread.
476 */
477 if (recov) {
478 PURGE_ATTRCACHE4_LOCKED(rp);
479 mutex_exit(&rp->r_statelock);
480 return;
481 }
482
483 if (lwp != NULL)
484 lwp->lwp_nostop++;
485 while (rp->r_serial != NULL) {
486 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
487 mutex_exit(&rp->r_statelock);
488 if (lwp != NULL)
489 lwp->lwp_nostop--;
490 return;
491 }
492 }
493 if (lwp != NULL)
494 lwp->lwp_nostop--;
495 }
496
497 /*
498 * If there is a page flush thread, the current thread needs to
499 * bail out, to prevent a possible deadlock between the current
500 * thread (which might be in a start_op/end_op region), the
501 * recovery thread, and the page flush thread. Expire the
502 * attribute cache, so that any attributes the current thread was
503 * going to set are not lost.
504 */
505 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
506 PURGE_ATTRCACHE4_LOCKED(rp);
507 mutex_exit(&rp->r_statelock);
508 return;
509 }
510
511 if (rp->r_time_attr_saved > t) {
512 /*
513 		 * Newer attributes have been cached since the time these attributes
514 		 * were probably generated. If they are inconsistent with what is
515 		 * cached, purge the cached attributes; if not, don't act on them.
516 */
517 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
518 PURGE_ATTRCACHE4_LOCKED(rp);
519 mutex_exit(&rp->r_statelock);
520 return;
521 }
522 set_time_cache_inval = 0;
523 if (cinfo) {
524 /*
525 * Only directory modifying callers pass non-NULL cinfo.
526 */
527 ASSERT(vp->v_type == VDIR);
528 /*
529 		 * If the cache timeout either doesn't exist or hasn't expired,
530 		 * and the dir didn't change on the server before the dirmod op,
531 		 * and the dir didn't change after the dirmod op but before the
532 		 * getattr, then there's a chance that the client's cached data
533 		 * for this object is current (not stale). No immediate cache
534 		 * flush is required.
536 */
537 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
538 cinfo->before == rp->r_change &&
539 (garp->n4g_change_valid &&
540 cinfo->after == garp->n4g_change)) {
541
542 /*
543 * If atomic isn't set, then the before/after info
544 * cannot be blindly trusted. For this case, we tell
545 * nfs4_attrcache_va to cache the attrs but also
546 * establish an absolute maximum cache timeout. When
547 * the timeout is reached, caches will be flushed.
548 */
549 if (! cinfo->atomic)
550 set_time_cache_inval = 1;
551 } else {
552
553 /*
554 			 * We're not sure exactly what changed, but we know
555 			 * what to do: flush all caches for the dir and remove
556 			 * the attr timeout.
557 *
558 * a) timeout expired. flush all caches.
559 * b) r_change != cinfo.before. flush all caches.
560 * c) r_change == cinfo.before, but cinfo.after !=
561 * post-op getattr(change). flush all caches.
562 * d) post-op getattr(change) not provided by server.
563 * flush all caches.
564 */
565 mtime_changed = 1;
566 ctime_changed = 1;
567 rp->r_time_cache_inval = 0;
568 }
569 } else {
570 /*
571 		 * After writing data to the file on the remote server, the write
572 		 * thread always sets R4WRITEMODIFIED to indicate that the file on
573 		 * the remote server was modified with a WRITE operation and will
574 		 * have marked the attribute cache as timed out. If R4WRITEMODIFIED
575 		 * is set, then do not check for mtime and ctime changes.
576 */
577 if (!(rp->r_flags & R4WRITEMODIFIED)) {
578 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
579 mtime_changed = 1;
580
581 if (rp->r_attr.va_ctime.tv_sec !=
582 vap->va_ctime.tv_sec ||
583 rp->r_attr.va_ctime.tv_nsec !=
584 vap->va_ctime.tv_nsec)
585 ctime_changed = 1;
586
587 /*
588 * If the change attribute was not provided by server
589 * or it differs, then flush all caches.
590 */
591 if (!garp->n4g_change_valid ||
592 rp->r_change != garp->n4g_change) {
593 mtime_changed = 1;
594 ctime_changed = 1;
595 }
596 } else {
597 writemodify_set = B_TRUE;
598 }
599 }
600
601 preattr_rsize = rp->r_size;
602
603 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
604
605 /*
606 	 * If we have updated the file size in nfs4_attrcache_va, then as soon
607 	 * as we drop the statelock we will be in transition, purging all of
608 	 * our caches and updating them. It is possible for another thread to
609 	 * pick up this new file size and read in zeroed data. Stall other
610 	 * threads until the cache purge is complete.
611 */
612 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
613 /*
614 		 * If R4WRITEMODIFIED was set and we have updated the file
615 		 * size, the server's returned file size is not necessarily
616 		 * the result of this client's WRITE. We need to purge all
617 		 * caches.
618 */
619 if (writemodify_set)
620 mtime_changed = 1;
621
622 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
623 rp->r_flags |= R4INCACHEPURGE;
624 cachepurge_set = B_TRUE;
625 }
626 }
627
628 if (!mtime_changed && !ctime_changed) {
629 mutex_exit(&rp->r_statelock);
630 return;
631 }
632
633 rp->r_serial = curthread;
634
635 mutex_exit(&rp->r_statelock);
636
637 /*
638 * If we're the recov thread, then force async nfs4_purge_caches
639 * to avoid potential deadlock.
640 */
641 if (mtime_changed)
642 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
643
644 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
645 mutex_enter(&rp->r_statelock);
646 rp->r_flags &= ~R4INCACHEPURGE;
647 cv_broadcast(&rp->r_cv);
648 mutex_exit(&rp->r_statelock);
649 cachepurge_set = B_FALSE;
650 }
651
652 if (ctime_changed) {
653 (void) nfs4_access_purge_rp(rp);
654 if (rp->r_secattr != NULL) {
655 mutex_enter(&rp->r_statelock);
656 vsp = rp->r_secattr;
657 rp->r_secattr = NULL;
658 mutex_exit(&rp->r_statelock);
659 if (vsp != NULL)
660 nfs4_acl_free_cache(vsp);
661 }
662 }
663
664 if (!was_serial) {
665 mutex_enter(&rp->r_statelock);
666 rp->r_serial = NULL;
667 cv_broadcast(&rp->r_cv);
668 mutex_exit(&rp->r_statelock);
669 }
670 }
671
672 /*
673 * Set attributes cache for given vnode using virtual attributes.
674 *
675 * Set the timeout value on the attribute cache and fill it
676 * with the passed in attributes.
677 *
678 * The caller must be holding r_statelock.
679 */
680 static void
681 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
682 {
683 rnode4_t *rp;
684 mntinfo4_t *mi;
685 hrtime_t delta;
686 hrtime_t now;
687 vattr_t *vap = &garp->n4g_va;
688
689 rp = VTOR4(vp);
690
691 ASSERT(MUTEX_HELD(&rp->r_statelock));
692 ASSERT(vap->va_mask == AT_ALL);
693
694 /* Switch to master before checking v_flag */
695 if (IS_SHADOW(vp, rp))
696 vp = RTOV4(rp);
697
698 now = gethrtime();
699
700 mi = VTOMI4(vp);
701
702 /*
703 * Only establish a new cache timeout (if requested). Never
704 * extend a timeout. Never clear a timeout. Clearing a timeout
705 	 * is done by nfs4_update_dircaches (an ancestor in our call chain).
706 */
707 if (set_cache_timeout && ! rp->r_time_cache_inval)
708 rp->r_time_cache_inval = now + mi->mi_acdirmax;
709
710 /*
711 * Delta is the number of nanoseconds that we will
712 * cache the attributes of the file. It is based on
713 * the number of nanoseconds since the last time that
714 * we detected a change. The assumption is that files
715 * that changed recently are likely to change again.
716 	 * Minimum and maximum bounds are enforced, though, both for
717 	 * regular files and for directories.
718 *
719 * Using the time since last change was detected
720 * eliminates direct comparison or calculation
721 * using mixed client and server times. NFS does
722 * not make any assumptions regarding the client
723 * and server clocks being synchronized.
724 */
725 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
726 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
727 vap->va_size != rp->r_attr.va_size) {
728 rp->r_time_attr_saved = now;
729 }
730
731 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
732 delta = 0;
733 else {
734 delta = now - rp->r_time_attr_saved;
735 if (vp->v_type == VDIR) {
736 if (delta < mi->mi_acdirmin)
737 delta = mi->mi_acdirmin;
738 else if (delta > mi->mi_acdirmax)
739 delta = mi->mi_acdirmax;
740 } else {
741 if (delta < mi->mi_acregmin)
742 delta = mi->mi_acregmin;
743 else if (delta > mi->mi_acregmax)
744 delta = mi->mi_acregmax;
745 }
746 }
747 rp->r_time_attr_inval = now + delta;
748
749 rp->r_attr = *vap;
750 if (garp->n4g_change_valid)
751 rp->r_change = garp->n4g_change;
752
753 /*
754 * The attributes that were returned may be valid and can
755 * be used, but they may not be allowed to be cached.
756 * Reset the timers to cause immediate invalidation and
757 	 * clear r_change so no VERIFY operations will succeed.
758 */
759 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
760 rp->r_time_attr_inval = now;
761 rp->r_time_attr_saved = now;
762 rp->r_change = 0;
763 }
764
765 /*
766 	 * If mounted_on_fileid was returned AND the object is a stub,
767 	 * then set the object's va_nodeid to the mounted-over fid
768 	 * returned by the server.
769 	 *
770 	 * If mounted_on_fileid is not provided/supported, then
771 	 * just set it to 0 for now. Eventually it would be
772 * better to set it to a hashed version of FH. This
773 * would probably be good enough to provide a unique
774 * fid/d_ino within a dir.
775 *
776 * We don't need to carry mounted_on_fileid in the
777 * rnode as long as the client never requests fileid
778 * without also requesting mounted_on_fileid. For
779 * now, it stays.
780 */
781 if (garp->n4g_mon_fid_valid) {
782 rp->r_mntd_fid = garp->n4g_mon_fid;
783
784 if (RP_ISSTUB(rp))
785 rp->r_attr.va_nodeid = rp->r_mntd_fid;
786 }
787
788 /*
789 * Check to see if there are valid pathconf bits to
790 * cache in the rnode.
791 */
792 if (garp->n4g_ext_res) {
793 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
794 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
795 } else {
796 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
797 rp->r_pathconf.pc4_xattr_valid = TRUE;
798 rp->r_pathconf.pc4_xattr_exists =
799 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
800 }
801 }
802 }
803 /*
804 * Update the size of the file if there is no cached data or if
805 * the cached data is clean and there is no data being written
806 * out.
807 */
808 if (rp->r_size != vap->va_size &&
809 (!vn_has_cached_data(vp) ||
810 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
811 rp->r_size = vap->va_size;
812 }
813 nfs_setswaplike(vp, vap);
814 rp->r_flags &= ~R4WRITEMODIFIED;
815 }
816
817 /*
818 * Get attributes over-the-wire and update attributes cache
819 * if no error occurred in the over-the-wire operation.
820 * Return 0 if successful, otherwise error.
821 */
822 int
823 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
824 {
825 mntinfo4_t *mi = VTOMI4(vp);
826 hrtime_t t;
827 nfs4_recov_state_t recov_state;
828 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
829
830 recov_state.rs_flags = 0;
831 recov_state.rs_num_retry_despite_err = 0;
832
833 /* Save the original mount point security flavor */
834 (void) save_mnt_secinfo(mi->mi_curr_serv);
835
836 recov_retry:
837
838 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
839 &recov_state, NULL))) {
840 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
841 return (e.error);
842 }
843
844 t = gethrtime();
845
846 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
847
848 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
849 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
850 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
851 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
852 &recov_state, 1);
853 goto recov_retry;
854 }
855 }
856
857 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
858
859 if (!e.error) {
860 if (e.stat == NFS4_OK) {
861 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
862 } else {
863 e.error = geterrno4(e.stat);
864
865 nfs4_purge_stale_fh(e.error, vp, cr);
866 }
867 }
868
869 /*
870 	 * If we did a getattr on a node that is a stub for a crossed
871 	 * mount point, keep the original secinfo flavor for the
872 	 * current file system, not the crossed one.
873 */
874 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
875
876 return (e.error);
877 }
878
879 /*
880 * Generate a compound to get attributes over-the-wire.
881 */
882 void
883 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
884 nfs4_error_t *ep, cred_t *cr, int get_acl)
885 {
886 COMPOUND4args_clnt args;
887 COMPOUND4res_clnt res;
888 int doqueue;
889 rnode4_t *rp = VTOR4(vp);
890 nfs_argop4 argop[2];
891
892 args.ctag = TAG_GETATTR;
893
894 args.array_len = 2;
895 args.array = argop;
896
897 /* putfh */
898 argop[0].argop = OP_CPUTFH;
899 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
900
901 /* getattr */
902 /*
903 	 * Unlike NFS versions 2 and 3, where getattr returns all the
904 	 * attributes, NFS version 4 returns only the ones explicitly
905 	 * asked for. This creates problems, as some system functions
906 	 * (e.g. the cache check) require certain attributes, and if the
907 	 * cached node lacks some of them, such as uid/gid, it can
908 	 * affect system utilities (e.g. "ls") that rely on the information
909 * to be there. This can lead to anything from system crashes to
910 * corrupted information processed by user apps.
911 * So to ensure that all bases are covered, request at least
912 * the AT_ALL attribute mask.
913 */
914 argop[1].argop = OP_GETATTR;
915 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
916 if (get_acl)
917 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
918 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
919
920 doqueue = 1;
921
922 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
923
924 if (ep->error)
925 return;
926
927 if (res.status != NFS4_OK) {
928 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
929 return;
930 }
931
932 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
933
934 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
935 }
936
937 /*
938  * Return either cached or remote attributes. If we get the remote attrs,
939  * use them to check and invalidate the caches, then cache the new attributes.
940 */
941 int
942 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
943 {
944 int error;
945 rnode4_t *rp;
946 nfs4_ga_res_t gar;
947
948 ASSERT(nfs4_consistent_type(vp));
949
950 /*
951 * If we've got cached attributes, we're done, otherwise go
952 * to the server to get attributes, which will update the cache
953 * in the process. Either way, use the cached attributes for
954 * the caller's vattr_t.
955 *
956 * Note that we ignore the gar set by the OTW call: the attr caching
957 * code may make adjustments when storing to the rnode, and we want
958 * to see those changes here.
959 */
960 rp = VTOR4(vp);
961 error = 0;
962 mutex_enter(&rp->r_statelock);
963 if (!ATTRCACHE4_VALID(vp)) {
964 mutex_exit(&rp->r_statelock);
965 error = nfs4_getattr_otw(vp, &gar, cr, 0);
966 mutex_enter(&rp->r_statelock);
967 }
968
969 if (!error)
970 *vap = rp->r_attr;
971
972 /* Return the client's view of file size */
973 vap->va_size = rp->r_size;
974
975 mutex_exit(&rp->r_statelock);
976
977 ASSERT(nfs4_consistent_type(vp));
978
979 return (error);
980 }
981
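/*
 * Fetch the requested attribute bitmap over the wire (PUTFH + GETATTR),
 * handling recovery and retries, and copy the result into the caller's
 * garp while preserving its n4g_ext_res buffer. Unlike nfs4_getattr_otw(),
 * the attribute cache is not updated. Returns 0 on success, else an errno.
 */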
982 int
983 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
984 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
985 {
986 COMPOUND4args_clnt args;
987 COMPOUND4res_clnt res;
988 int doqueue;
989 nfs_argop4 argop[2];
990 mntinfo4_t *mi = VTOMI4(vp);
991 bool_t needrecov = FALSE;
992 nfs4_recov_state_t recov_state;
993 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
994 nfs4_ga_ext_res_t *gerp;
995
996 recov_state.rs_flags = 0;
997 recov_state.rs_num_retry_despite_err = 0;
998
999 recov_retry:
1000 args.ctag = tag_type;
1001
1002 args.array_len = 2;
1003 args.array = argop;
1004
1005 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1006 if (e.error)
1007 return (e.error);
1008
1009 /* putfh */
1010 argop[0].argop = OP_CPUTFH;
1011 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1012
1013 /* getattr */
1014 argop[1].argop = OP_GETATTR;
1015 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1016 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1017
1018 doqueue = 1;
1019
1020 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1021 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1022 rnode4info(VTOR4(vp))));
1023
1024 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1025
1026 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1027 if (!needrecov && e.error) {
1028 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1029 needrecov);
1030 return (e.error);
1031 }
1032
1033 if (needrecov) {
1034 bool_t abort;
1035
1036 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1037 "nfs4_attr_otw: initiating recovery\n"));
1038
1039 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1040 NULL, OP_GETATTR, NULL, NULL, NULL);
1041 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1042 needrecov);
1043 if (!e.error) {
1044 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1045 e.error = geterrno4(res.status);
1046 }
1047 if (abort == FALSE)
1048 goto recov_retry;
1049 return (e.error);
1050 }
1051
1052 if (res.status) {
1053 e.error = geterrno4(res.status);
1054 } else {
1055 gerp = garp->n4g_ext_res;
1056 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1057 garp, sizeof (nfs4_ga_res_t));
1058 garp->n4g_ext_res = gerp;
1059 if (garp->n4g_ext_res &&
1060 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1061 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1062 ga_res.n4g_ext_res,
1063 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1064 }
1065 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1066 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1067 needrecov);
1068 return (e.error);
1069 }
1070
1071 /*
1072 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1073 * for the demand-based allocation of async threads per-mount. The
1074 * nfs_async_timeout is the amount of time a thread will live after it
1075 * becomes idle, unless new I/O requests are received before the thread
1076 * dies. See nfs4_async_putpage and nfs4_async_start.
1077 */
1078
1079 static void nfs4_async_start(struct vfs *);
1080 static void nfs4_async_pgops_start(struct vfs *);
1081 static void nfs4_async_common_start(struct vfs *, int);
1082
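/*
 * Tear down a completed async request: except for NFS4_INACTIVE requests,
 * drop the rnode counts taken when the request was queued, wake any
 * waiters, and release the vnode hold; then free the credential and the
 * request structure itself.
 */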
1083 static void
1084 free_async_args4(struct nfs4_async_reqs *args)
1085 {
1086 rnode4_t *rp;
1087
1088 if (args->a_io != NFS4_INACTIVE) {
1089 rp = VTOR4(args->a_vp);
1090 mutex_enter(&rp->r_statelock);
1091 rp->r_count--;
1092 if (args->a_io == NFS4_PUTAPAGE ||
1093 args->a_io == NFS4_PAGEIO)
1094 rp->r_awcount--;
1095 cv_broadcast(&rp->r_cv);
1096 mutex_exit(&rp->r_statelock);
1097 VN_RELE(args->a_vp);
1098 }
1099 crfree(args->a_cred);
1100 kmem_free(args, sizeof (*args));
1101 }
1102
1103 /*
1104  * Cross-zone thread creation and NFS access are disallowed, yet fsflush() and
1105  * pageout(), running in the global zone, have legitimate reasons to do
1106  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1107  * using a per-mount "asynchronous requests manager thread" which is
1108 * signaled by the various asynchronous work routines when there is
1109 * asynchronous work to be done. It is responsible for creating new
1110 * worker threads if necessary, and notifying existing worker threads
1111 * that there is work to be done.
1112 *
1113 * In other words, it will "take the specifications from the customers and
1114 * give them to the engineers."
1115 *
1116 * Worker threads die off of their own accord if they are no longer
1117 * needed.
1118 *
1119 * This thread is killed when the zone is going away or the filesystem
1120 * is being unmounted.
1121 */
1122 void
1123 nfs4_async_manager(vfs_t *vfsp)
1124 {
1125 callb_cpr_t cprinfo;
1126 mntinfo4_t *mi;
1127 uint_t max_threads;
1128
1129 mi = VFTOMI4(vfsp);
1130
1131 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1132 "nfs4_async_manager");
1133
1134 mutex_enter(&mi->mi_async_lock);
1135 /*
1136 * We want to stash the max number of threads that this mount was
1137 * allowed so we can use it later when the variable is set to zero as
1138 * part of the zone/mount going away.
1139 *
1140 * We want to be able to create at least one thread to handle
1141 * asynchronous inactive calls.
1142 */
1143 max_threads = MAX(mi->mi_max_threads, 1);
1144 /*
1145 * We don't want to wait for mi_max_threads to go to zero, since that
1146 * happens as part of a failed unmount, but this thread should only
1147 * exit when the mount is really going away.
1148 *
1149 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1150 * attempted: the various _async_*() functions know to do things
1151 * inline if mi_max_threads == 0. Henceforth we just drain out the
1152 * outstanding requests.
1153 *
1154 * Note that we still create zthreads even if we notice the zone is
1155 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1156 * shutdown sequence to take slightly longer in some cases, but
1157 * doesn't violate the protocol, as all threads will exit as soon as
1158 * they're done processing the remaining requests.
1159 */
1160 for (;;) {
1161 while (mi->mi_async_req_count > 0) {
1162 /*
1163 * Paranoia: If the mount started out having
1164 * (mi->mi_max_threads == 0), and the value was
1165 * later changed (via a debugger or somesuch),
1166 * we could be confused since we will think we
1167 * can't create any threads, and the calling
1168 * code (which looks at the current value of
1169 * mi->mi_max_threads, now non-zero) thinks we
1170 * can.
1171 *
1172 * So, because we're paranoid, we create threads
1173 * up to the maximum of the original and the
1174 * current value. This means that future
1175 * (debugger-induced) alterations of
1176 * mi->mi_max_threads are ignored for our
1177 * purposes, but who told them they could change
1178 * random values on a live kernel anyhow?
1179 */
1180 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1181 MAX(mi->mi_max_threads, max_threads)) {
1182 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1183 mutex_exit(&mi->mi_async_lock);
1184 MI4_HOLD(mi);
1185 VFS_HOLD(vfsp); /* hold for new thread */
1186 (void) zthread_create(NULL, 0, nfs4_async_start,
1187 vfsp, 0, minclsyspri);
1188 mutex_enter(&mi->mi_async_lock);
1189 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1190 NUM_ASYNC_PGOPS_THREADS) {
1191 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1192 mutex_exit(&mi->mi_async_lock);
1193 MI4_HOLD(mi);
1194 VFS_HOLD(vfsp); /* hold for new thread */
1195 (void) zthread_create(NULL, 0,
1196 nfs4_async_pgops_start, vfsp, 0,
1197 minclsyspri);
1198 mutex_enter(&mi->mi_async_lock);
1199 }
1200 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1201 ASSERT(mi->mi_async_req_count != 0);
1202 mi->mi_async_req_count--;
1203 }
1204
1205 mutex_enter(&mi->mi_lock);
1206 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1207 mutex_exit(&mi->mi_lock);
1208 break;
1209 }
1210 mutex_exit(&mi->mi_lock);
1211
1212 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1213 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1214 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1215 }
1216
1217 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1218 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1219 /*
1220 * Let everyone know we're done.
1221 */
1222 mi->mi_manager_thread = NULL;
1223 /*
1224 * Wake up the inactive thread.
1225 */
1226 cv_broadcast(&mi->mi_inact_req_cv);
1227 /*
1228 * Wake up anyone sitting in nfs4_async_manager_stop()
1229 */
1230 cv_broadcast(&mi->mi_async_cv);
1231 /*
1232 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1233 * since CALLB_CPR_EXIT is actually responsible for releasing
1234 * 'mi_async_lock'.
1235 */
1236 CALLB_CPR_EXIT(&cprinfo);
1237 VFS_RELE(vfsp); /* release thread's hold */
1238 MI4_RELE(mi);
1239 zthread_exit();
1240 }
1241
1242 /*
1243 * Signal (and wait for) the async manager thread to clean up and go away.
1244 */
1245 void
1246 nfs4_async_manager_stop(vfs_t *vfsp)
1247 {
1248 mntinfo4_t *mi = VFTOMI4(vfsp);
1249
1250 mutex_enter(&mi->mi_async_lock);
1251 mutex_enter(&mi->mi_lock);
1252 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1253 mutex_exit(&mi->mi_lock);
1254 cv_broadcast(&mi->mi_async_reqs_cv);
1255 /*
1256 * Wait for the async manager thread to die.
1257 */
1258 while (mi->mi_manager_thread != NULL)
1259 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1260 mutex_exit(&mi->mi_async_lock);
1261 }
1262
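/*
 * Queue an asynchronous readahead request for the given vnode. Returns 0
 * if the request was queued, or -1 if the readahead was skipped (address
 * outside the segment, no memory, a pending lock operation, or asyncio
 * disabled).
 */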
1263 int
1264 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1265 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1266 u_offset_t, caddr_t, struct seg *, cred_t *))
1267 {
1268 rnode4_t *rp;
1269 mntinfo4_t *mi;
1270 struct nfs4_async_reqs *args;
1271
1272 rp = VTOR4(vp);
1273 ASSERT(rp->r_freef == NULL);
1274
1275 mi = VTOMI4(vp);
1276
1277 /*
1278 * If addr falls in a different segment, don't bother doing readahead.
1279 */
1280 if (addr >= seg->s_base + seg->s_size)
1281 return (-1);
1282
1283 /*
1284 * If we can't allocate a request structure, punt on the readahead.
1285 */
1286 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1287 return (-1);
1288
1289 /*
1290 * If a lock operation is pending, don't initiate any new
1291 * readaheads. Otherwise, bump r_count to indicate the new
1292 * asynchronous I/O.
1293 */
1294 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1295 kmem_free(args, sizeof (*args));
1296 return (-1);
1297 }
1298 mutex_enter(&rp->r_statelock);
1299 rp->r_count++;
1300 mutex_exit(&rp->r_statelock);
1301 nfs_rw_exit(&rp->r_lkserlock);
1302
1303 args->a_next = NULL;
1304 #ifdef DEBUG
1305 args->a_queuer = curthread;
1306 #endif
1307 VN_HOLD(vp);
1308 args->a_vp = vp;
1309 ASSERT(cr != NULL);
1310 crhold(cr);
1311 args->a_cred = cr;
1312 args->a_io = NFS4_READ_AHEAD;
1313 args->a_nfs4_readahead = readahead;
1314 args->a_nfs4_blkoff = blkoff;
1315 args->a_nfs4_seg = seg;
1316 args->a_nfs4_addr = addr;
1317
1318 mutex_enter(&mi->mi_async_lock);
1319
1320 /*
1321 	 * If asyncio has been disabled, don't bother with the readahead.
1322 */
1323 if (mi->mi_max_threads == 0) {
1324 mutex_exit(&mi->mi_async_lock);
1325 goto noasync;
1326 }
1327
1328 /*
1329 * Link request structure into the async list and
1330 * wakeup async thread to do the i/o.
1331 */
1332 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1333 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1334 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1335 } else {
1336 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1337 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1338 }
1339
1340 if (mi->mi_io_kstats) {
1341 mutex_enter(&mi->mi_lock);
1342 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1343 mutex_exit(&mi->mi_lock);
1344 }
1345
1346 mi->mi_async_req_count++;
1347 ASSERT(mi->mi_async_req_count != 0);
1348 cv_signal(&mi->mi_async_reqs_cv);
1349 mutex_exit(&mi->mi_async_lock);
1350 return (0);
1351
1352 noasync:
1353 mutex_enter(&rp->r_statelock);
1354 rp->r_count--;
1355 cv_broadcast(&rp->r_cv);
1356 mutex_exit(&rp->r_statelock);
1357 VN_RELE(vp);
1358 crfree(cr);
1359 kmem_free(args, sizeof (*args));
1360 return (-1);
1361 }
1362
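/*
 * Worker-thread entry points for the two async queues; both simply call
 * nfs4_async_common_start() with the appropriate queue identifier.
 */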
1363 static void
1364 nfs4_async_start(struct vfs *vfsp)
1365 {
1366 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1367 }
1368
1369 static void
1370 nfs4_async_pgops_start(struct vfs *vfsp)
1371 {
1372 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1373 }
1374
1375 /*
1376 * The async queues for each mounted file system are arranged as a
1377 * set of queues, one for each async i/o type. Requests are taken
1378 * from the queues in a round-robin fashion. A number of consecutive
1379 * requests are taken from each queue before moving on to the next
1380 * queue. This functionality may allow the NFS Version 2 server to do
1381 * write clustering, even if the client is mixing writes and reads
1382 * because it will take multiple write requests from the queue
1383 * before processing any of the other async i/o types.
1384 *
1385 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1386 * model defined by cpr to suspend the system. Specifically over the
1387 * wire calls are cpr-unsafe. The thread should be reevaluated in
1388 * case of future updates to the cpr model.
1389 */
1390 static void
1391 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1392 {
1393 struct nfs4_async_reqs *args;
1394 mntinfo4_t *mi = VFTOMI4(vfsp);
1395 clock_t time_left = 1;
1396 callb_cpr_t cprinfo;
1397 int i;
1398 extern volatile int nfs_async_timeout;
1399 int async_types;
1400 kcondvar_t *async_work_cv;
1401
1402 if (async_queue == NFS4_ASYNC_QUEUE) {
1403 async_types = NFS4_ASYNC_TYPES;
1404 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1405 } else {
1406 async_types = NFS4_ASYNC_PGOPS_TYPES;
1407 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1408 }
1409
1410 /*
1411 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1412 * built in an implementation independent manner.
1413 */
1414 if (nfs_async_timeout == -1)
1415 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1416
1417 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1418
1419 mutex_enter(&mi->mi_async_lock);
1420 for (;;) {
1421 /*
1422 * Find the next queue containing an entry. We start
1423 * at the current queue pointer and then round robin
1424 * through all of them until we either find a non-empty
1425 * queue or have looked through all of them.
1426 */
1427 for (i = 0; i < async_types; i++) {
1428 args = *mi->mi_async_curr[async_queue];
1429 if (args != NULL)
1430 break;
1431 mi->mi_async_curr[async_queue]++;
1432 if (mi->mi_async_curr[async_queue] ==
1433 &mi->mi_async_reqs[async_types]) {
1434 mi->mi_async_curr[async_queue] =
1435 &mi->mi_async_reqs[0];
1436 }
1437 }
1438 /*
1439 		 * If we didn't find an entry, then block until woken up
1440 * again and then look through the queues again.
1441 */
1442 if (args == NULL) {
1443 /*
1444 * Exiting is considered to be safe for CPR as well
1445 */
1446 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1447
1448 /*
1449 * Wakeup thread waiting to unmount the file
1450 * system only if all async threads are inactive.
1451 *
1452 * If we've timed-out and there's nothing to do,
1453 * then get rid of this thread.
1454 */
1455 if (mi->mi_max_threads == 0 || time_left <= 0) {
1456 --mi->mi_threads[async_queue];
1457
1458 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1459 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1460 cv_signal(&mi->mi_async_cv);
1461 CALLB_CPR_EXIT(&cprinfo);
1462 VFS_RELE(vfsp); /* release thread's hold */
1463 MI4_RELE(mi);
1464 zthread_exit();
1465 /* NOTREACHED */
1466 }
1467 time_left = cv_reltimedwait(async_work_cv,
1468 &mi->mi_async_lock, nfs_async_timeout,
1469 TR_CLOCK_TICK);
1470
1471 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1472
1473 continue;
1474 } else {
1475 time_left = 1;
1476 }
1477
1478 /*
1479 * Remove the request from the async queue and then
1480 * update the current async request queue pointer. If
1481 * the current queue is empty or we have removed enough
1482 * consecutive entries from it, then reset the counter
1483 * for this queue and then move the current pointer to
1484 * the next queue.
1485 */
1486 *mi->mi_async_curr[async_queue] = args->a_next;
1487 if (*mi->mi_async_curr[async_queue] == NULL ||
1488 --mi->mi_async_clusters[args->a_io] == 0) {
1489 mi->mi_async_clusters[args->a_io] =
1490 mi->mi_async_init_clusters;
1491 mi->mi_async_curr[async_queue]++;
1492 if (mi->mi_async_curr[async_queue] ==
1493 &mi->mi_async_reqs[async_types]) {
1494 mi->mi_async_curr[async_queue] =
1495 &mi->mi_async_reqs[0];
1496 }
1497 }
1498
1499 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1500 mutex_enter(&mi->mi_lock);
1501 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1502 mutex_exit(&mi->mi_lock);
1503 }
1504
1505 mutex_exit(&mi->mi_async_lock);
1506
1507 /*
1508 * Obtain arguments from the async request structure.
1509 */
1510 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1511 (*args->a_nfs4_readahead)(args->a_vp,
1512 args->a_nfs4_blkoff, args->a_nfs4_addr,
1513 args->a_nfs4_seg, args->a_cred);
1514 } else if (args->a_io == NFS4_PUTAPAGE) {
1515 (void) (*args->a_nfs4_putapage)(args->a_vp,
1516 args->a_nfs4_pp, args->a_nfs4_off,
1517 args->a_nfs4_len, args->a_nfs4_flags,
1518 args->a_cred);
1519 } else if (args->a_io == NFS4_PAGEIO) {
1520 (void) (*args->a_nfs4_pageio)(args->a_vp,
1521 args->a_nfs4_pp, args->a_nfs4_off,
1522 args->a_nfs4_len, args->a_nfs4_flags,
1523 args->a_cred);
1524 } else if (args->a_io == NFS4_READDIR) {
1525 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1526 args->a_nfs4_rdc, args->a_cred));
1527 } else if (args->a_io == NFS4_COMMIT) {
1528 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1529 args->a_nfs4_offset, args->a_nfs4_count,
1530 args->a_cred);
1531 } else if (args->a_io == NFS4_INACTIVE) {
1532 nfs4_inactive_otw(args->a_vp, args->a_cred);
1533 }
1534
1535 /*
1536 * Now, release the vnode and free the credentials
1537 * structure.
1538 */
1539 free_async_args4(args);
1540 /*
1541 * Reacquire the mutex because it will be needed above.
1542 */
1543 mutex_enter(&mi->mi_async_lock);
1544 }
1545 }
1546
1547 /*
1548 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1549 * part of VOP_INACTIVE.
1550 */
1551
1552 void
1553 nfs4_inactive_thread(mntinfo4_t *mi)
1554 {
1555 struct nfs4_async_reqs *args;
1556 callb_cpr_t cprinfo;
1557 vfs_t *vfsp = mi->mi_vfsp;
1558
1559 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1560 "nfs4_inactive_thread");
1561
1562 for (;;) {
1563 mutex_enter(&mi->mi_async_lock);
1564 args = mi->mi_async_reqs[NFS4_INACTIVE];
1565 if (args == NULL) {
1566 mutex_enter(&mi->mi_lock);
1567 /*
1568 * We don't want to exit until the async manager is done
1569 * with its work; hence the check for mi_manager_thread
1570 * being NULL.
1571 *
1572 * The async manager thread will cv_broadcast() on
1573 * mi_inact_req_cv when it's done, at which point we'll
1574 * wake up and exit.
1575 */
1576 if (mi->mi_manager_thread == NULL)
1577 goto die;
1578 mi->mi_flags |= MI4_INACTIVE_IDLE;
1579 mutex_exit(&mi->mi_lock);
1580 cv_signal(&mi->mi_async_cv);
1581 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1582 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1583 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1584 mutex_exit(&mi->mi_async_lock);
1585 } else {
1586 mutex_enter(&mi->mi_lock);
1587 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1588 mutex_exit(&mi->mi_lock);
1589 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1590 mutex_exit(&mi->mi_async_lock);
1591 nfs4_inactive_otw(args->a_vp, args->a_cred);
1592 crfree(args->a_cred);
1593 kmem_free(args, sizeof (*args));
1594 }
1595 }
1596 die:
1597 mutex_exit(&mi->mi_lock);
1598 mi->mi_inactive_thread = NULL;
1599 cv_signal(&mi->mi_async_cv);
1600
1601 /*
1602 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1603 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1604 */
1605 CALLB_CPR_EXIT(&cprinfo);
1606
1607 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1608 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1609
1610 MI4_RELE(mi);
1611 zthread_exit();
1612 /* NOTREACHED */
1613 }
1614
1615 /*
1616  * nfs4_async_stop:
1617 * Wait for all outstanding putpage operations and the inactive thread to
1618 * complete; nfs4_async_stop_sig() without interruptibility.
1619 */
1620 void
1621 nfs4_async_stop(struct vfs *vfsp)
1622 {
1623 mntinfo4_t *mi = VFTOMI4(vfsp);
1624
1625 /*
1626 * Wait for all outstanding async operations to complete and for
1627 * worker threads to exit.
1628 */
1629 mutex_enter(&mi->mi_async_lock);
1630 mi->mi_max_threads = 0;
1631 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1632 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1633 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1634 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635
1636 /*
1637 * Wait for the inactive thread to finish doing what it's doing. It
1638 * won't exit until the last reference to the vfs_t goes away.
1639 */
1640 if (mi->mi_inactive_thread != NULL) {
1641 mutex_enter(&mi->mi_lock);
1642 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1643 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1644 mutex_exit(&mi->mi_lock);
1645 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1646 mutex_enter(&mi->mi_lock);
1647 }
1648 mutex_exit(&mi->mi_lock);
1649 }
1650 mutex_exit(&mi->mi_async_lock);
1651 }
1652
1653 /*
1654  * nfs4_async_stop_sig:
1655 * Wait for all outstanding putpage operations and the inactive thread to
1656 * complete. If a signal is delivered we will abort and return non-zero;
1657 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1658 * need to make it interruptible.
1659 */
1660 int
1661 nfs4_async_stop_sig(struct vfs *vfsp)
1662 {
1663 mntinfo4_t *mi = VFTOMI4(vfsp);
1664 ushort_t omax;
1665 bool_t intr = FALSE;
1666
1667 /*
1668 * Wait for all outstanding putpage operations to complete and for
1669 * worker threads to exit.
1670 */
1671 mutex_enter(&mi->mi_async_lock);
1672 omax = mi->mi_max_threads;
1673 mi->mi_max_threads = 0;
1674 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1675 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1676 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1677 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1678 intr = TRUE;
1679 goto interrupted;
1680 }
1681 }
1682
1683 /*
1684 * Wait for the inactive thread to finish doing what it's doing. It
1685 	 * won't exit until the last reference to the vfs_t goes away.
1686 */
1687 if (mi->mi_inactive_thread != NULL) {
1688 mutex_enter(&mi->mi_lock);
1689 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1690 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1691 mutex_exit(&mi->mi_lock);
1692 if (!cv_wait_sig(&mi->mi_async_cv,
1693 &mi->mi_async_lock)) {
1694 intr = TRUE;
1695 goto interrupted;
1696 }
1697 mutex_enter(&mi->mi_lock);
1698 }
1699 mutex_exit(&mi->mi_lock);
1700 }
1701 interrupted:
1702 if (intr)
1703 mi->mi_max_threads = omax;
1704 mutex_exit(&mi->mi_async_lock);
1705
1706 return (intr);
1707 }
1708
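/*
 * Queue an asynchronous putpage request. If the request cannot be queued
 * (no memory, or asyncio disabled), fall back: pageout/fsflush and
 * cross-zone callers re-mark the pages dirty via pvn_write_done(B_ERROR),
 * while other callers do the putpage synchronously in this thread's context.
 */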
1709 int
1710 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1711 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1712 u_offset_t, size_t, int, cred_t *))
1713 {
1714 rnode4_t *rp;
1715 mntinfo4_t *mi;
1716 struct nfs4_async_reqs *args;
1717
1718 ASSERT(flags & B_ASYNC);
1719 ASSERT(vp->v_vfsp != NULL);
1720
1721 rp = VTOR4(vp);
1722 ASSERT(rp->r_count > 0);
1723
1724 mi = VTOMI4(vp);
1725
1726 /*
1727 * If we can't allocate a request structure, do the putpage
1728 * operation synchronously in this thread's context.
1729 */
1730 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1731 goto noasync;
1732
1733 args->a_next = NULL;
1734 #ifdef DEBUG
1735 args->a_queuer = curthread;
1736 #endif
1737 VN_HOLD(vp);
1738 args->a_vp = vp;
1739 ASSERT(cr != NULL);
1740 crhold(cr);
1741 args->a_cred = cr;
1742 args->a_io = NFS4_PUTAPAGE;
1743 args->a_nfs4_putapage = putapage;
1744 args->a_nfs4_pp = pp;
1745 args->a_nfs4_off = off;
1746 args->a_nfs4_len = (uint_t)len;
1747 args->a_nfs4_flags = flags;
1748
1749 mutex_enter(&mi->mi_async_lock);
1750
1751 /*
1752 * If asyncio has been disabled, then make a synchronous request.
1753 	 * This check is done a second time in case asyncio was disabled
1754 * while this thread was blocked waiting for memory pressure to
1755 * reduce or for the queue to drain.
1756 */
1757 if (mi->mi_max_threads == 0) {
1758 mutex_exit(&mi->mi_async_lock);
1759
1760 VN_RELE(vp);
1761 crfree(cr);
1762 kmem_free(args, sizeof (*args));
1763 goto noasync;
1764 }
1765
1766 /*
1767 * Link request structure into the async list and
1768 * wakeup async thread to do the i/o.
1769 */
1770 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1771 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1772 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1773 } else {
1774 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1775 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1776 }
1777
1778 mutex_enter(&rp->r_statelock);
1779 rp->r_count++;
1780 rp->r_awcount++;
1781 mutex_exit(&rp->r_statelock);
1782
1783 if (mi->mi_io_kstats) {
1784 mutex_enter(&mi->mi_lock);
1785 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1786 mutex_exit(&mi->mi_lock);
1787 }
1788
1789 mi->mi_async_req_count++;
1790 ASSERT(mi->mi_async_req_count != 0);
1791 cv_signal(&mi->mi_async_reqs_cv);
1792 mutex_exit(&mi->mi_async_lock);
1793 return (0);
1794
1795 noasync:
1796
1797 if (curproc == proc_pageout || curproc == proc_fsflush) {
1798 /*
1799 		 * If we get here in the context of pageout/fsflush,
1800 		 * or we have run out of memory, or we're attempting to
1801 		 * unmount, we refuse to do a sync write, because this may
1802 		 * hang pageout/fsflush and the machine. In this case,
1803 * we just re-mark the page as dirty and punt on the page.
1804 *
1805 * Make sure B_FORCE isn't set. We can re-mark the
1806 * pages as dirty and unlock the pages in one swoop by
1807 * passing in B_ERROR to pvn_write_done(). However,
1808 * we should make sure B_FORCE isn't set - we don't
1809 * want the page tossed before it gets written out.
1810 */
1811 if (flags & B_FORCE)
1812 flags &= ~(B_INVAL | B_FORCE);
1813 pvn_write_done(pp, flags | B_ERROR);
1814 return (0);
1815 }
1816
1817 if (nfs_zone() != mi->mi_zone) {
1818 /*
1819 * So this was a cross-zone sync putpage.
1820 *
1821 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1822 * as dirty and unlock them.
1823 *
1824 * We don't want to clear B_FORCE here as the caller presumably
1825 * knows what they're doing if they set it.
1826 */
1827 pvn_write_done(pp, flags | B_ERROR);
1828 return (EPERM);
1829 }
1830 return ((*putapage)(vp, pp, off, len, flags, cr));
1831 }
1832
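/*
 * Queue an asynchronous pageio request. If the request cannot be queued,
 * reads are failed back via pvn_read_done(B_ERROR); writes are done
 * synchronously, except for pageout/fsflush and cross-zone callers, whose
 * pages are re-marked dirty via pvn_write_done(B_ERROR).
 */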
1833 int
1834 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1835 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1836 size_t, int, cred_t *))
1837 {
1838 rnode4_t *rp;
1839 mntinfo4_t *mi;
1840 struct nfs4_async_reqs *args;
1841
1842 ASSERT(flags & B_ASYNC);
1843 ASSERT(vp->v_vfsp != NULL);
1844
1845 rp = VTOR4(vp);
1846 ASSERT(rp->r_count > 0);
1847
1848 mi = VTOMI4(vp);
1849
1850 /*
1851 * If we can't allocate a request structure, do the pageio
1852 * request synchronously in this thread's context.
1853 */
1854 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1855 goto noasync;
1856
1857 args->a_next = NULL;
1858 #ifdef DEBUG
1859 args->a_queuer = curthread;
1860 #endif
1861 VN_HOLD(vp);
1862 args->a_vp = vp;
1863 ASSERT(cr != NULL);
1864 crhold(cr);
1865 args->a_cred = cr;
1866 args->a_io = NFS4_PAGEIO;
1867 args->a_nfs4_pageio = pageio;
1868 args->a_nfs4_pp = pp;
1869 args->a_nfs4_off = io_off;
1870 args->a_nfs4_len = (uint_t)io_len;
1871 args->a_nfs4_flags = flags;
1872
1873 mutex_enter(&mi->mi_async_lock);
1874
1875 /*
1876 * If asyncio has been disabled, then make a synchronous request.
1877 	 * This check is done a second time in case asyncio was disabled
1878 * while this thread was blocked waiting for memory pressure to
1879 * reduce or for the queue to drain.
1880 */
1881 if (mi->mi_max_threads == 0) {
1882 mutex_exit(&mi->mi_async_lock);
1883
1884 VN_RELE(vp);
1885 crfree(cr);
1886 kmem_free(args, sizeof (*args));
1887 goto noasync;
1888 }
1889
1890 /*
1891 * Link request structure into the async list and
1892 * wakeup async thread to do the i/o.
1893 */
1894 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1895 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1896 mi->mi_async_tail[NFS4_PAGEIO] = args;
1897 } else {
1898 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1899 mi->mi_async_tail[NFS4_PAGEIO] = args;
1900 }
1901
1902 mutex_enter(&rp->r_statelock);
1903 rp->r_count++;
1904 rp->r_awcount++;
1905 mutex_exit(&rp->r_statelock);
1906
1907 if (mi->mi_io_kstats) {
1908 mutex_enter(&mi->mi_lock);
1909 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1910 mutex_exit(&mi->mi_lock);
1911 }
1912
1913 mi->mi_async_req_count++;
1914 ASSERT(mi->mi_async_req_count != 0);
1915 cv_signal(&mi->mi_async_reqs_cv);
1916 mutex_exit(&mi->mi_async_lock);
1917 return (0);
1918
1919 noasync:
1920 /*
1921 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1922 * the page list), for writes we do it synchronously, except for
1923 * proc_pageout/proc_fsflush as described below.
1924 */
1925 if (flags & B_READ) {
1926 pvn_read_done(pp, flags | B_ERROR);
1927 return (0);
1928 }
1929
1930 if (curproc == proc_pageout || curproc == proc_fsflush) {
1931 /*
1932 * If we get here in the context of the pageout/fsflush,
1933 * we refuse to do a sync write, because this may hang
1934 * pageout/fsflush (and the machine). In this case, we just
1935 * re-mark the page as dirty and punt on the page.
1936 *
1937 * Make sure B_FORCE isn't set. We can re-mark the
1938 * pages as dirty and unlock the pages in one swoop by
1939 * passing in B_ERROR to pvn_write_done(). However,
1940 * we should make sure B_FORCE isn't set - we don't
1941 * want the page tossed before it gets written out.
1942 */
1943 if (flags & B_FORCE)
1944 flags &= ~(B_INVAL | B_FORCE);
1945 pvn_write_done(pp, flags | B_ERROR);
1946 return (0);
1947 }
1948
1949 if (nfs_zone() != mi->mi_zone) {
1950 /*
1951 * So this was a cross-zone sync pageio. We pass in B_ERROR
1952 * to pvn_write_done() to re-mark the pages as dirty and unlock
1953 * them.
1954 *
1955 * We don't want to clear B_FORCE here as the caller presumably
1956 * knows what they're doing if they set it.
1957 */
1958 pvn_write_done(pp, flags | B_ERROR);
1959 return (EPERM);
1960 }
1961 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1962 }
1963
1964 void
1965 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1966 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1967 {
1968 rnode4_t *rp;
1969 mntinfo4_t *mi;
1970 struct nfs4_async_reqs *args;
1971
1972 rp = VTOR4(vp);
1973 ASSERT(rp->r_freef == NULL);
1974
1975 mi = VTOMI4(vp);
1976
1977 /*
1978 * If we can't allocate a request structure, skip the readdir.
1979 */
1980 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1981 goto noasync;
1982
1983 args->a_next = NULL;
1984 #ifdef DEBUG
1985 args->a_queuer = curthread;
1986 #endif
1987 VN_HOLD(vp);
1988 args->a_vp = vp;
1989 ASSERT(cr != NULL);
1990 crhold(cr);
1991 args->a_cred = cr;
1992 args->a_io = NFS4_READDIR;
1993 args->a_nfs4_readdir = readdir;
1994 args->a_nfs4_rdc = rdc;
1995
1996 mutex_enter(&mi->mi_async_lock);
1997
1998 /*
1999 * If asyncio has been disabled, then skip this request
2000 */
2001 if (mi->mi_max_threads == 0) {
2002 mutex_exit(&mi->mi_async_lock);
2003
2004 VN_RELE(vp);
2005 crfree(cr);
2006 kmem_free(args, sizeof (*args));
2007 goto noasync;
2008 }
2009
2010 /*
2011 * Link request structure into the async list and
2012 * wakeup async thread to do the i/o.
2013 */
2014 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2015 mi->mi_async_reqs[NFS4_READDIR] = args;
2016 mi->mi_async_tail[NFS4_READDIR] = args;
2017 } else {
2018 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2019 mi->mi_async_tail[NFS4_READDIR] = args;
2020 }
2021
2022 mutex_enter(&rp->r_statelock);
2023 rp->r_count++;
2024 mutex_exit(&rp->r_statelock);
2025
2026 if (mi->mi_io_kstats) {
2027 mutex_enter(&mi->mi_lock);
2028 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2029 mutex_exit(&mi->mi_lock);
2030 }
2031
2032 mi->mi_async_req_count++;
2033 ASSERT(mi->mi_async_req_count != 0);
2034 cv_signal(&mi->mi_async_reqs_cv);
2035 mutex_exit(&mi->mi_async_lock);
2036 return;
2037
2038 noasync:
2039 mutex_enter(&rp->r_statelock);
2040 rdc->entries = NULL;
2041 /*
2042 * Indicate that no one is trying to fill this entry and
2043 * it still needs to be filled.
2044 */
2045 rdc->flags &= ~RDDIR;
2046 rdc->flags |= RDDIRREQ;
2047 rddir4_cache_rele(rp, rdc);
2048 mutex_exit(&rp->r_statelock);
2049 }
2050
2051 void
2052 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2053 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2054 cred_t *))
2055 {
2056 rnode4_t *rp;
2057 mntinfo4_t *mi;
2058 struct nfs4_async_reqs *args;
2059 page_t *pp;
2060
2061 rp = VTOR4(vp);
2062 mi = VTOMI4(vp);
2063
2064 /*
2065 * If we can't allocate a request structure, do the commit
2066 * operation synchronously in this thread's context.
2067 */
2068 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2069 goto noasync;
2070
2071 args->a_next = NULL;
2072 #ifdef DEBUG
2073 args->a_queuer = curthread;
2074 #endif
2075 VN_HOLD(vp);
2076 args->a_vp = vp;
2077 ASSERT(cr != NULL);
2078 crhold(cr);
2079 args->a_cred = cr;
2080 args->a_io = NFS4_COMMIT;
2081 args->a_nfs4_commit = commit;
2082 args->a_nfs4_plist = plist;
2083 args->a_nfs4_offset = offset;
2084 args->a_nfs4_count = count;
2085
2086 mutex_enter(&mi->mi_async_lock);
2087
2088 /*
2089 * If asyncio has been disabled, then make a synchronous request.
2090 * This check is done a second time in case async io was disabled
2091 * while this thread was blocked waiting for memory pressure to
2092 * reduce or for the queue to drain.
2093 */
2094 if (mi->mi_max_threads == 0) {
2095 mutex_exit(&mi->mi_async_lock);
2096
2097 VN_RELE(vp);
2098 crfree(cr);
2099 kmem_free(args, sizeof (*args));
2100 goto noasync;
2101 }
2102
2103 /*
2104 * Link request structure into the async list and
2105 * wakeup async thread to do the i/o.
2106 */
2107 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2108 mi->mi_async_reqs[NFS4_COMMIT] = args;
2109 mi->mi_async_tail[NFS4_COMMIT] = args;
2110 } else {
2111 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2112 mi->mi_async_tail[NFS4_COMMIT] = args;
2113 }
2114
2115 mutex_enter(&rp->r_statelock);
2116 rp->r_count++;
2117 mutex_exit(&rp->r_statelock);
2118
2119 if (mi->mi_io_kstats) {
2120 mutex_enter(&mi->mi_lock);
2121 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2122 mutex_exit(&mi->mi_lock);
2123 }
2124
2125 mi->mi_async_req_count++;
2126 ASSERT(mi->mi_async_req_count != 0);
2127 cv_signal(&mi->mi_async_reqs_cv);
2128 mutex_exit(&mi->mi_async_lock);
2129 return;
2130
2131 noasync:
2132 if (curproc == proc_pageout || curproc == proc_fsflush ||
2133 nfs_zone() != mi->mi_zone) {
2134 while (plist != NULL) {
2135 pp = plist;
2136 page_sub(&plist, pp);
2137 pp->p_fsdata = C_COMMIT;
2138 page_unlock(pp);
2139 }
2140 return;
2141 }
2142 (*commit)(vp, plist, offset, count, cr);
2143 }
2144
2145 /*
2146 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2147 * reference to the vnode is handed over to the thread; the caller should
2148 * no longer refer to the vnode.
2149 *
2150 * Unlike most of the async routines, this handoff is needed for
2151 * correctness reasons, not just performance. So doing operations in the
2152 * context of the current thread is not an option.
2153 */
2154 void
2155 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2156 {
2157 mntinfo4_t *mi;
2158 struct nfs4_async_reqs *args;
2159 boolean_t signal_inactive_thread = B_FALSE;
2160
2161 mi = VTOMI4(vp);
2162
2163 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2164 args->a_next = NULL;
2165 #ifdef DEBUG
2166 args->a_queuer = curthread;
2167 #endif
2168 args->a_vp = vp;
2169 ASSERT(cr != NULL);
2170 crhold(cr);
2171 args->a_cred = cr;
2172 args->a_io = NFS4_INACTIVE;
2173
2174 /*
2175 * Note that we don't check mi->mi_max_threads here, since we
2176 * *need* to get rid of this vnode regardless of whether someone
2177 * set nfs4_max_threads to zero in /etc/system.
2178 *
2179 * The manager thread knows about this and is willing to create
2180 * at least one thread to accommodate us.
2181 */
2182 mutex_enter(&mi->mi_async_lock);
2183 if (mi->mi_inactive_thread == NULL) {
2184 rnode4_t *rp;
2185 vnode_t *unldvp = NULL;
2186 char *unlname;
2187 cred_t *unlcred;
2188
2189 mutex_exit(&mi->mi_async_lock);
2190 /*
2191 * We just need to free up the memory associated with the
2192 * vnode, which can be safely done from within the current
2193 * context.
2194 */
2195 crfree(cr); /* drop our reference */
2196 kmem_free(args, sizeof (*args));
2197 rp = VTOR4(vp);
2198 mutex_enter(&rp->r_statelock);
2199 if (rp->r_unldvp != NULL) {
2200 unldvp = rp->r_unldvp;
2201 rp->r_unldvp = NULL;
2202 unlname = rp->r_unlname;
2203 rp->r_unlname = NULL;
2204 unlcred = rp->r_unlcred;
2205 rp->r_unlcred = NULL;
2206 }
2207 mutex_exit(&rp->r_statelock);
2208 /*
2209 * No need to explicitly throw away any cached pages. The
2210 * eventual r4inactive() will attempt a synchronous
2211 * VOP_PUTPAGE() which will immediately fail since the request
2212 * is coming from the wrong zone, and then will proceed to call
2213 * nfs4_invalidate_pages() which will clean things up for us.
2214 *
2215 * Throw away the delegation here so rp4_addfree()'s attempt to
2216 * return any existing delegations becomes a no-op.
2217 */
2218 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2219 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2220 FALSE);
2221 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2222 nfs_rw_exit(&mi->mi_recovlock);
2223 }
2224 nfs4_clear_open_streams(rp);
2225
2226 rp4_addfree(rp, cr);
2227 if (unldvp != NULL) {
2228 kmem_free(unlname, MAXNAMELEN);
2229 VN_RELE(unldvp);
2230 crfree(unlcred);
2231 }
2232 return;
2233 }
2234
2235 if (mi->mi_manager_thread == NULL) {
2236 /*
2237 * No async manager thread, so talk to the inactive thread directly.
2238 */
2239 signal_inactive_thread = B_TRUE;
2240 }
2241
2242 /*
2243 * Enqueue the vnode and wake up either the special thread (empty
2244 * list) or an async thread.
2245 */
2246 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2247 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2248 mi->mi_async_tail[NFS4_INACTIVE] = args;
2249 signal_inactive_thread = B_TRUE;
2250 } else {
2251 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2252 mi->mi_async_tail[NFS4_INACTIVE] = args;
2253 }
2254 if (signal_inactive_thread) {
2255 cv_signal(&mi->mi_inact_req_cv);
2256 } else {
2257 mi->mi_async_req_count++;
2258 ASSERT(mi->mi_async_req_count != 0);
2259 cv_signal(&mi->mi_async_reqs_cv);
2260 }
2261
2262 mutex_exit(&mi->mi_async_lock);
2263 }
2264
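/*
 * writerp4 - copy "tcount" bytes from "uio" into the pages backing
 * rnode "rp", starting at the mapping address "base".  Data is moved
 * in at most PAGESIZE chunks (see the comment in the loop below).  A
 * nonzero "pgcreated" indicates that the caller has already created
 * and mapped the first page at "base".  Returns 0 or an errno value.
 *
 * The caller must hold rp->r_rwlock as a writer.
 */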
2265 int
2266 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2267 {
2268 int pagecreate;
2269 int n;
2270 int saved_n;
2271 caddr_t saved_base;
2272 u_offset_t offset;
2273 int error;
2274 int sm_error;
2275 vnode_t *vp = RTOV(rp);
2276
2277 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2278 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2279 if (!vpm_enable) {
2280 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2281 }
2282
2283 /*
2284 * Move bytes in at most PAGESIZE chunks. We must avoid
2285 * spanning pages in uiomove() because page faults may cause
2286 * the cache to be invalidated out from under us. The r_size is not
2287 * updated until after the uiomove. If we push the last page of a
2288 * file before r_size is correct, we will lose the data written past
2289 * the current (and invalid) r_size.
2290 */
2291 do {
2292 offset = uio->uio_loffset;
2293 pagecreate = 0;
2294
2295 /*
2296 * n is the number of bytes required to satisfy the request
2297 * or the number of bytes to fill out the page.
2298 */
2299 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2300
2301 /*
2302 * Check to see if we can skip reading in the page
2303 * and just allocate the memory. We can do this
2304 * if we are going to rewrite the entire mapping
2305 * or if we are going to write to or beyond the current
2306 * end of file from the beginning of the mapping.
2307 *
2308 * The read of r_size is now protected by r_statelock.
2309 */
2310 mutex_enter(&rp->r_statelock);
2311 /*
2312 * When pgcreated is nonzero the caller has already done
2313 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2314 * segkpm this means we already have at least one page
2315 * created and mapped at base.
2316 */
2317 pagecreate = pgcreated ||
2318 ((offset & PAGEOFFSET) == 0 &&
2319 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2320
2321 mutex_exit(&rp->r_statelock);
2322
2323 if (!vpm_enable && pagecreate) {
2324 /*
2325 * The last argument tells segmap_pagecreate() to
2326 * always lock the page, as opposed to sometimes
2327 * returning with the page locked. This way we avoid a
2328 * fault on the ensuing uiomove(), but also
2329 * more importantly (to fix bug 1094402) we can
2330 * call segmap_fault() to unlock the page in all
2331 * cases. An alternative would be to modify
2332 * segmap_pagecreate() to tell us when it is
2333 * locking a page, but that's a fairly major
2334 * interface change.
2335 */
2336 if (pgcreated == 0)
2337 (void) segmap_pagecreate(segkmap, base,
2338 (uint_t)n, 1);
2339 saved_base = base;
2340 saved_n = n;
2341 }
2342
2343 /*
2344 * The number of bytes of data in the last page cannot be
2345 * accurately determined while the page is being uiomove'd to
2346 * and the size of the file is being updated.
2347 * Thus, inform threads which need to know accurately
2348 * how much data is in the last page of the file. They
2349 * will not do the i/o immediately, but will arrange for
2350 * the i/o to happen later when this modify operation
2351 * has finished.
2352 */
2353 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2354 mutex_enter(&rp->r_statelock);
2355 rp->r_flags |= R4MODINPROGRESS;
2356 rp->r_modaddr = (offset & MAXBMASK);
2357 mutex_exit(&rp->r_statelock);
2358
2359 if (vpm_enable) {
2360 /*
2361 * Copy data. If new pages are created, part of
2362 * the page that is not written will be initialized
2363 * with zeros.
2364 */
2365 error = vpm_data_copy(vp, offset, n, uio,
2366 !pagecreate, NULL, 0, S_WRITE);
2367 } else {
2368 error = uiomove(base, n, UIO_WRITE, uio);
2369 }
2370
2371 /*
2372 * r_size is the maximum number of
2373 * bytes known to be in the file.
2374 * Make sure it is at least as high as the
2375 * first unwritten byte pointed to by uio_loffset.
2376 */
2377 mutex_enter(&rp->r_statelock);
2378 if (rp->r_size < uio->uio_loffset)
2379 rp->r_size = uio->uio_loffset;
2380 rp->r_flags &= ~R4MODINPROGRESS;
2381 rp->r_flags |= R4DIRTY;
2382 mutex_exit(&rp->r_statelock);
2383
2384 /* n = # of bytes written */
2385 n = (int)(uio->uio_loffset - offset);
2386
2387 if (!vpm_enable) {
2388 base += n;
2389 }
2390
2391 tcount -= n;
2392 /*
2393 * If we created pages w/o initializing them completely,
2394 * we need to zero the part that wasn't set up.
2395 * This happens in most EOF write cases and if
2396 * we had some sort of error during the uiomove.
2397 */
2398 if (!vpm_enable && pagecreate) {
2399 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2400 (void) kzero(base, PAGESIZE - n);
2401
2402 if (pgcreated) {
2403 /*
2404 * Caller is responsible for this page,
2405 * it was not created in this loop.
2406 */
2407 pgcreated = 0;
2408 } else {
2409 /*
2410 * For bug 1094402: segmap_pagecreate locks
2411 * page. Unlock it. This also unlocks the
2412 * pages allocated by page_create_va() in
2413 * segmap_pagecreate().
2414 */
2415 sm_error = segmap_fault(kas.a_hat, segkmap,
2416 saved_base, saved_n,
2417 F_SOFTUNLOCK, S_WRITE);
2418 if (error == 0)
2419 error = sm_error;
2420 }
2421 }
2422 } while (tcount > 0 && error == 0);
2423
2424 return (error);
2425 }
2426
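/*
 * nfs4_putpages - flush dirty pages of "vp" in the range
 * [off, off + len).  A length of zero means the whole file: in that
 * case pvn_vplist_dirty() walks the entire page list and the R4DIRTY
 * flag is managed as described below.  Otherwise each page in the
 * range is looked up and pushed via rp->r_putapage.  Returns 0 or an
 * errno value from the page pushes.
 */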
2427 int
2428 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2429 {
2430 rnode4_t *rp;
2431 page_t *pp;
2432 u_offset_t eoff;
2433 u_offset_t io_off;
2434 size_t io_len;
2435 int error;
2436 int rdirty;
2437 int err;
2438
2439 rp = VTOR4(vp);
2440 ASSERT(rp->r_count > 0);
2441
2442 if (!nfs4_has_pages(vp))
2443 return (0);
2444
2445 ASSERT(vp->v_type != VCHR);
2446
2447 /*
2448 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2449 * writes. B_FORCE is set to force the VM system to actually
2450 * invalidate the pages, even if the i/o failed. The pages
2451 * need to get invalidated because they can't be written out
2452 * because there isn't any space left on either the server's
2453 * file system or in the user's disk quota. The B_FREE bit
2454 * is cleared to avoid confusion as to whether this is a
2455 * request to place the page on the freelist or to destroy
2456 * it.
2457 */
2458 if ((rp->r_flags & R4OUTOFSPACE) ||
2459 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2460 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2461
2462 if (len == 0) {
2463 /*
2464 * If doing a full file synchronous operation, then clear
2465 * the R4DIRTY bit. If a page gets dirtied while the flush
2466 * is happening, then R4DIRTY will get set again. The
2467 * R4DIRTY bit must get cleared before the flush so that
2468 * we don't lose this information.
2469 *
2470 * If there are no full file async write operations
2471 * pending and the R4DIRTY bit is set, clear it.
2472 */
2473 if (off == (u_offset_t)0 &&
2474 !(flags & B_ASYNC) &&
2475 (rp->r_flags & R4DIRTY)) {
2476 mutex_enter(&rp->r_statelock);
2477 rdirty = (rp->r_flags & R4DIRTY);
2478 rp->r_flags &= ~R4DIRTY;
2479 mutex_exit(&rp->r_statelock);
2480 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2481 mutex_enter(&rp->r_statelock);
2482 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2483 rdirty = (rp->r_flags & R4DIRTY);
2484 rp->r_flags &= ~R4DIRTY;
2485 }
2486 mutex_exit(&rp->r_statelock);
2487 } else
2488 rdirty = 0;
2489
2490 /*
2491 * Search the entire vp list for pages >= off, and flush
2492 * the dirty pages.
2493 */
2494 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2495 flags, cr);
2496
2497 /*
2498 * If an error occurred and the file was marked as dirty
2499 * before and we aren't forcibly invalidating pages, then
2500 * reset the R4DIRTY flag.
2501 */
2502 if (error && rdirty &&
2503 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2504 mutex_enter(&rp->r_statelock);
2505 rp->r_flags |= R4DIRTY;
2506 mutex_exit(&rp->r_statelock);
2507 }
2508 } else {
2509 /*
2510 * Do a range from [off...off + len) looking for pages
2511 * to deal with.
2512 */
2513 error = 0;
2514 io_len = 0;
2515 eoff = off + len;
2516 mutex_enter(&rp->r_statelock);
2517 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2518 io_off += io_len) {
2519 mutex_exit(&rp->r_statelock);
2520 /*
2521 * If we are not invalidating, synchronously
2522 * freeing, or writing pages, use the routine
2523 * page_lookup_nowait() to prevent reclaiming
2524 * them from the free list.
2525 */
2526 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2527 pp = page_lookup(vp, io_off,
2528 (flags & (B_INVAL | B_FREE)) ?
2529 SE_EXCL : SE_SHARED);
2530 } else {
2531 pp = page_lookup_nowait(vp, io_off,
2532 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2533 }
2534
2535 if (pp == NULL || !pvn_getdirty(pp, flags))
2536 io_len = PAGESIZE;
2537 else {
2538 err = (*rp->r_putapage)(vp, pp, &io_off,
2539 &io_len, flags, cr);
2540 if (!error)
2541 error = err;
2542 /*
2543 * "io_off" and "io_len" are returned as
2544 * the range of pages we actually wrote.
2545 * This allows us to skip ahead more quickly
2546 * since several pages may've been dealt
2547 * with by this iteration of the loop.
2548 */
2549 }
2550 mutex_enter(&rp->r_statelock);
2551 }
2552 mutex_exit(&rp->r_statelock);
2553 }
2554
2555 return (error);
2556 }
2557
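/*
 * nfs4_invalidate_pages - throw away all cached pages of "vp" at or
 * beyond offset "off".  Serializes with concurrent truncations via the
 * R4TRUNCATE flag and r_cv, then calls pvn_vplist_dirty() with
 * B_INVAL | B_TRUNC.  An offset of zero also clears R4DIRTY and, unless
 * the rnode is stale, any saved write error.
 */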
2558 void
2559 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2560 {
2561 rnode4_t *rp;
2562
2563 rp = VTOR4(vp);
2564 if (IS_SHADOW(vp, rp))
2565 vp = RTOV4(rp);
2566 mutex_enter(&rp->r_statelock);
2567 while (rp->r_flags & R4TRUNCATE)
2568 cv_wait(&rp->r_cv, &rp->r_statelock);
2569 rp->r_flags |= R4TRUNCATE;
2570 if (off == (u_offset_t)0) {
2571 rp->r_flags &= ~R4DIRTY;
2572 if (!(rp->r_flags & R4STALE))
2573 rp->r_error = 0;
2574 }
2575 rp->r_truncaddr = off;
2576 mutex_exit(&rp->r_statelock);
2577 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2578 B_INVAL | B_TRUNC, cr);
2579 mutex_enter(&rp->r_statelock);
2580 rp->r_flags &= ~R4TRUNCATE;
2581 cv_broadcast(&rp->r_cv);
2582 mutex_exit(&rp->r_statelock);
2583 }
2584
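/*
 * kstat update routine for the read-only "mntinfo" kstat.  Fills in
 * the mntinfo_kstat structure from the mount's mntinfo4 and its
 * current server information.
 */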
2585 static int
2586 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2587 {
2588 mntinfo4_t *mi;
2589 struct mntinfo_kstat *mik;
2590 vfs_t *vfsp;
2591
2592 /* this is a read-only kstat. Bail out on a write */
2593 if (rw == KSTAT_WRITE)
2594 return (EACCES);
2595
2596
2597 /*
2598 * We don't want to wait here as kstat_chain_lock could be held by
2599 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2600 * and thus could lead to a deadlock.
2601 */
2602 vfsp = (struct vfs *)ksp->ks_private;
2603
2604 mi = VFTOMI4(vfsp);
2605 mik = (struct mntinfo_kstat *)ksp->ks_data;
2606
2607 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2608
2609 mik->mik_vers = (uint32_t)mi->mi_vers;
2610 mik->mik_flags = mi->mi_flags;
2611 /*
2612 * The sv_secdata holds the flavor the client specifies.
2613 * If the client uses default and a security negotiation
2614 * occurs, sv_currsec will point to the current flavor
2615 * selected from the server flavor list.
2616 * sv_currsec is NULL if no security negotiation takes place.
2617 */
2618 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2619 mi->mi_curr_serv->sv_currsec->secmod :
2620 mi->mi_curr_serv->sv_secdata->secmod;
2621 mik->mik_curread = (uint32_t)mi->mi_curread;
2622 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2623 mik->mik_retrans = mi->mi_retrans;
2624 mik->mik_timeo = mi->mi_timeo;
2625 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2626 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2627 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2628 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2629 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2630 mik->mik_failover = (uint32_t)mi->mi_failover;
2631 mik->mik_remap = (uint32_t)mi->mi_remap;
2632
2633 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2634
2635 return (0);
2636 }
2637
2638 void
2639 nfs4_mnt_kstat_init(struct vfs *vfsp)
2640 {
2641 mntinfo4_t *mi = VFTOMI4(vfsp);
2642
2643 /*
2644 * PSARC 2001/697 Contract Private Interface
2645 * All nfs kstats are under SunMC contract
2646 * Please refer to the PSARC listed above and contact
2647 * SunMC before making any changes!
2648 *
2649 * Changes must be reviewed by Solaris File Sharing
2650 * Changes must be communicated to contract-2001-697@sun.com
2651 *
2652 */
2653
2654 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2655 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2656 if (mi->mi_io_kstats) {
2657 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2658 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2659 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2660 kstat_install(mi->mi_io_kstats);
2661 }
2662
2663 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2664 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2665 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2666 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2667 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2668 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2669 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2670 kstat_install(mi->mi_ro_kstats);
2671 }
2672
2673 nfs4_mnt_recov_kstat_init(vfsp);
2674 }
2675
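/*
 * nfs4_write_error - report an NFS write error on the console of the
 * mount's zone.  Messages are suppressed for forced unmounts and for
 * mounts marked unrecoverable (MI4_RECOV_FAIL); ENOSPC/EDQUOT messages
 * are rate-limited per filesystem via mi_printftime.
 */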
2676 void
2677 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2678 {
2679 mntinfo4_t *mi;
2680 clock_t now = ddi_get_lbolt();
2681
2682 mi = VTOMI4(vp);
2683 /*
2684 * In case of forced unmount, do not print any messages
2685 * since it can flood the console with error messages.
2686 */
2687 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2688 return;
2689
2690 /*
2691 * If the mount point is dead (not recoverable), do not
2692 * print error messages that can flood the console.
2693 */
2694 if (mi->mi_flags & MI4_RECOV_FAIL)
2695 return;
2696
2697 /*
2698 * No use in flooding the console with ENOSPC
2699 * messages from the same file system.
2700 */
2701 if ((error != ENOSPC && error != EDQUOT) ||
2702 now - mi->mi_printftime > 0) {
2703 zoneid_t zoneid = mi->mi_zone->zone_id;
2704
2705 #ifdef DEBUG
2706 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2707 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2708 #else
2709 nfs_perror(error, "NFS write error on host %s: %m.\n",
2710 VTOR4(vp)->r_server->sv_hostname, NULL);
2711 #endif
2712 if (error == ENOSPC || error == EDQUOT) {
2713 zcmn_err(zoneid, CE_CONT,
2714 "^File: userid=%d, groupid=%d\n",
2715 crgetuid(cr), crgetgid(cr));
2716 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2717 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2718 zcmn_err(zoneid, CE_CONT,
2719 "^User: userid=%d, groupid=%d\n",
2720 crgetuid(curthread->t_cred),
2721 crgetgid(curthread->t_cred));
2722 }
2723 mi->mi_printftime = now +
2724 nfs_write_error_interval * hz;
2725 }
2726 sfh4_printfhandle(VTOR4(vp)->r_fh);
2727 #ifdef DEBUG
2728 if (error == EACCES) {
2729 zcmn_err(zoneid, CE_CONT,
2730 "nfs_bio: cred is%s kcred\n",
2731 cr == kcred ? "" : " not");
2732 }
2733 #endif
2734 }
2735 }
2736
2737 /*
2738 * Return non-zero if the given file can be safely memory mapped. Locks
2739 * are safe if whole-file (length and offset are both zero).
2740 */
2741
2742 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
2743
2744 static int
2745 nfs4_safemap(const vnode_t *vp)
2746 {
2747 locklist_t *llp, *next_llp;
2748 int safe = 1;
2749 rnode4_t *rp = VTOR4(vp);
2750
2751 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2752
2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2754 "vp = %p", (void *)vp));
2755
2756 /*
2757 * Review all the locks for the vnode, both ones that have been
2758 * acquired and ones that are pending. We assume that
2759 * flk_active_locks_for_vp() has merged any locks that can be
2760 * merged (so that if a process has the entire file locked, it is
2761 * represented as a single lock).
2762 *
2763 * Note that we can't bail out of the loop if we find a non-safe
2764 * lock, because we have to free all the elements in the llp list.
2765 * We might be able to speed up this code slightly by not looking
2766 * at each lock's l_start and l_len fields once we've found a
2767 * non-safe lock.
2768 */
2769
2770 llp = flk_active_locks_for_vp(vp);
2771 while (llp) {
2772 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2773 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2774 llp->ll_flock.l_start, llp->ll_flock.l_len));
2775 if (!SAFE_LOCK(llp->ll_flock)) {
2776 safe = 0;
2777 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2778 "nfs4_safemap: unsafe active lock (%" PRId64
2779 ", %" PRId64 ")", llp->ll_flock.l_start,
2780 llp->ll_flock.l_len));
2781 }
2782 next_llp = llp->ll_next;
2783 VN_RELE(llp->ll_vp);
2784 kmem_free(llp, sizeof (*llp));
2785 llp = next_llp;
2786 }
2787
2788 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2789 safe ? "safe" : "unsafe"));
2790 return (safe);
2791 }
2792
2793 /*
2794 * Return whether there is a lost LOCK or LOCKU queued up for the given
2795 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2796 */
2797
2798 bool_t
2799 nfs4_map_lost_lock_conflict(vnode_t *vp)
2800 {
2801 bool_t conflict = FALSE;
2802 nfs4_lost_rqst_t *lrp;
2803 mntinfo4_t *mi = VTOMI4(vp);
2804
2805 mutex_enter(&mi->mi_lock);
2806 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2807 lrp = list_next(&mi->mi_lost_state, lrp)) {
2808 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2809 continue;
2810 ASSERT(lrp->lr_vp != NULL);
2811 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2812 continue; /* different file */
2813 if (!SAFE_LOCK(*lrp->lr_flk)) {
2814 conflict = TRUE;
2815 break;
2816 }
2817 }
2818
2819 mutex_exit(&mi->mi_lock);
2820 return (conflict);
2821 }
2822
2823 /*
2824 * nfs_lockcompletion:
2825 *
2826 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2827 * as non-cacheable (set the VNOCACHE bit).
2828 */
2829
2830 void
2831 nfs4_lockcompletion(vnode_t *vp, int cmd)
2832 {
2833 rnode4_t *rp = VTOR4(vp);
2834
2835 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2836 ASSERT(!IS_SHADOW(vp, rp));
2837
2838 if (cmd == F_SETLK || cmd == F_SETLKW) {
2839
2840 if (!nfs4_safemap(vp)) {
2841 mutex_enter(&vp->v_lock);
2842 vp->v_flag |= VNOCACHE;
2843 mutex_exit(&vp->v_lock);
2844 } else {
2845 mutex_enter(&vp->v_lock);
2846 vp->v_flag &= ~VNOCACHE;
2847 mutex_exit(&vp->v_lock);
2848 }
2849 }
2850 /*
2851 * The cached attributes of the file are stale after acquiring
2852 * the lock on the file. They were updated when the file was
2853 * opened, but not updated when the lock was acquired. Therefore the
2854 * cached attributes are invalidated after the lock is obtained.
2855 */
2856 PURGE_ATTRCACHE4(vp);
2857 }
2858
2859 /* ARGSUSED */
2860 static void *
2861 nfs4_mi_init(zoneid_t zoneid)
2862 {
2863 struct mi4_globals *mig;
2864
2865 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2866 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2867 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2868 offsetof(mntinfo4_t, mi_zone_node));
2869 mig->mig_destructor_called = B_FALSE;
2870 return (mig);
2871 }
2872
2873 /*
2874 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2875 * state and killing off threads.
2876 */
2877 /* ARGSUSED */
2878 static void
2879 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2880 {
2881 struct mi4_globals *mig = data;
2882 mntinfo4_t *mi;
2883 nfs4_server_t *np;
2884
2885 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886 "nfs4_mi_shutdown zone %d\n", zoneid));
2887 ASSERT(mig != NULL);
2888 for (;;) {
2889 mutex_enter(&mig->mig_lock);
2890 mi = list_head(&mig->mig_list);
2891 if (mi == NULL) {
2892 mutex_exit(&mig->mig_lock);
2893 break;
2894 }
2895
2896 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2897 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2898 /*
2899 * purge the DNLC for this filesystem
2900 */
2901 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2902 /*
2903 * Tell existing async worker threads to exit.
2904 */
2905 mutex_enter(&mi->mi_async_lock);
2906 mi->mi_max_threads = 0;
2907 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2908 /*
2909 * Set the appropriate flags, signal and wait for both the
2910 * async manager and the inactive thread to exit when they're
2911 * done with their current work.
2912 */
2913 mutex_enter(&mi->mi_lock);
2914 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2915 mutex_exit(&mi->mi_lock);
2916 mutex_exit(&mi->mi_async_lock);
2917 if (mi->mi_manager_thread) {
2918 nfs4_async_manager_stop(mi->mi_vfsp);
2919 }
2920 if (mi->mi_inactive_thread) {
2921 mutex_enter(&mi->mi_async_lock);
2922 cv_signal(&mi->mi_inact_req_cv);
2923 /*
2924 * Wait for the inactive thread to exit.
2925 */
2926 while (mi->mi_inactive_thread != NULL) {
2927 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2928 }
2929 mutex_exit(&mi->mi_async_lock);
2930 }
2931 /*
2932 * Wait for the recovery thread to complete, that is, it will
2933 * signal when it is done using the "mi" structure and about
2934 * to exit
2935 */
2936 mutex_enter(&mi->mi_lock);
2937 while (mi->mi_in_recovery > 0)
2938 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2939 mutex_exit(&mi->mi_lock);
2940 /*
2941 * We're done when every mi has been done or the list is empty.
2942 * This one is done, remove it from the list.
2943 */
2944 list_remove(&mig->mig_list, mi);
2945 mutex_exit(&mig->mig_lock);
2946 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2947
2948 /*
2949 * Release hold on vfs and mi done to prevent race with zone
2950 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2951 */
2952 VFS_RELE(mi->mi_vfsp);
2953 MI4_RELE(mi);
2954 }
2955 /*
2956 * Tell each renew thread in the zone to exit
2957 */
2958 mutex_enter(&nfs4_server_lst_lock);
2959 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2960 mutex_enter(&np->s_lock);
2961 if (np->zoneid == zoneid) {
2962 /*
2963 * We add another hold onto the nfs4_server_t
2964 * because this will make sure that the nfs4_server_t
2965 * stays around until nfs4_callback_fini_zone destroys
2966 * the zone. This way, the renew thread can
2967 * unconditionally release its holds on the
2968 * nfs4_server_t.
2969 */
2970 np->s_refcnt++;
2971 nfs4_mark_srv_dead(np);
2972 }
2973 mutex_exit(&np->s_lock);
2974 }
2975 mutex_exit(&nfs4_server_lst_lock);
2976 }
2977
2978 static void
2979 nfs4_mi_free_globals(struct mi4_globals *mig)
2980 {
2981 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2982 mutex_destroy(&mig->mig_lock);
2983 kmem_free(mig, sizeof (*mig));
2984 }
2985
2986 /* ARGSUSED */
2987 static void
2988 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2989 {
2990 struct mi4_globals *mig = data;
2991
2992 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2993 "nfs4_mi_destroy zone %d\n", zoneid));
2994 ASSERT(mig != NULL);
2995 mutex_enter(&mig->mig_lock);
2996 if (list_head(&mig->mig_list) != NULL) {
2997 /* Still waiting for VFS_FREEVFS() */
2998 mig->mig_destructor_called = B_TRUE;
2999 mutex_exit(&mig->mig_lock);
3000 return;
3001 }
3002 nfs4_mi_free_globals(mig);
3003 }
3004
3005 /*
3006 * Add an NFS mount to the per-zone list of NFS mounts.
3007 */
3008 void
3009 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3010 {
3011 struct mi4_globals *mig;
3012
3013 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3014 mutex_enter(&mig->mig_lock);
3015 list_insert_head(&mig->mig_list, mi);
3016 /*
3017 * Hold added to eliminate a race with zone shutdown; this will be
3018 * released in nfs4_mi_shutdown()
3019 */
3020 MI4_HOLD(mi);
3021 VFS_HOLD(mi->mi_vfsp);
3022 mutex_exit(&mig->mig_lock);
3023 }
3024
3025 /*
3026 * Remove an NFS mount from the per-zone list of NFS mounts.
3027 */
3028 int
3029 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3030 {
3031 struct mi4_globals *mig;
3032 int ret = 0;
3033
3034 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3035 mutex_enter(&mig->mig_lock);
3036 mutex_enter(&mi->mi_lock);
3037 /* if this mi is marked dead, then the zone already released it */
3038 if (!(mi->mi_flags & MI4_DEAD)) {
3039 list_remove(&mig->mig_list, mi);
3040 mutex_exit(&mi->mi_lock);
3041
3042 /* release the holds put on in zonelist_add(). */
3043 VFS_RELE(mi->mi_vfsp);
3044 MI4_RELE(mi);
3045 ret = 1;
3046 } else {
3047 mutex_exit(&mi->mi_lock);
3048 }
3049
3050 /*
3051 * We can be called asynchronously by VFS_FREEVFS() after the zone
3052 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3053 * mi globals.
3054 */
3055 if (list_head(&mig->mig_list) == NULL &&
3056 mig->mig_destructor_called == B_TRUE) {
3057 nfs4_mi_free_globals(mig);
3058 return (ret);
3059 }
3060 mutex_exit(&mig->mig_lock);
3061 return (ret);
3062 }
3063
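/*
 * nfs_free_mi4 - free a mntinfo4 and everything hanging off of it
 * (kstats, message list, open owner lists, filehandle table, locks and
 * condition variables).  Called from mi_rele() when the last reference
 * is dropped; the async worker, manager, inactive and recovery threads
 * must already be gone.
 */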
3064 void
3065 nfs_free_mi4(mntinfo4_t *mi)
3066 {
3067 nfs4_open_owner_t *foop;
3068 nfs4_oo_hash_bucket_t *bucketp;
3069 nfs4_debug_msg_t *msgp;
3070 int i;
3071 servinfo4_t *svp;
3072
3073 /*
3074 * Code introduced here should be carefully evaluated to make
3075 * sure none of the freed resources are accessed either directly
3076 * or indirectly after freeing them. For example: introducing calls
3077 * to NFS4_DEBUG that use a mntinfo4_t structure member after the
3078 * structure members have been freed, or other routines calling back
3079 * into NFS and accessing a freed mntinfo4_t structure member.
3080 */
3081 mutex_enter(&mi->mi_lock);
3082 ASSERT(mi->mi_recovthread == NULL);
3083 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3084 mutex_exit(&mi->mi_lock);
3085 mutex_enter(&mi->mi_async_lock);
3086 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3087 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3088 ASSERT(mi->mi_manager_thread == NULL);
3089 mutex_exit(&mi->mi_async_lock);
3090 if (mi->mi_io_kstats) {
3091 kstat_delete(mi->mi_io_kstats);
3092 mi->mi_io_kstats = NULL;
3093 }
3094 if (mi->mi_ro_kstats) {
3095 kstat_delete(mi->mi_ro_kstats);
3096 mi->mi_ro_kstats = NULL;
3097 }
3098 if (mi->mi_recov_ksp) {
3099 kstat_delete(mi->mi_recov_ksp);
3100 mi->mi_recov_ksp = NULL;
3101 }
3102 mutex_enter(&mi->mi_msg_list_lock);
3103 while (msgp = list_head(&mi->mi_msg_list)) {
3104 list_remove(&mi->mi_msg_list, msgp);
3105 nfs4_free_msg(msgp);
3106 }
3107 mutex_exit(&mi->mi_msg_list_lock);
3108 list_destroy(&mi->mi_msg_list);
3109 if (mi->mi_fname != NULL)
3110 fn_rele(&mi->mi_fname);
3111 if (mi->mi_rootfh != NULL)
3112 sfh4_rele(&mi->mi_rootfh);
3113 if (mi->mi_srvparentfh != NULL)
3114 sfh4_rele(&mi->mi_srvparentfh);
3115 svp = mi->mi_servers;
3116 sv4_free(svp);
3117 mutex_destroy(&mi->mi_lock);
3118 mutex_destroy(&mi->mi_async_lock);
3119 mutex_destroy(&mi->mi_msg_list_lock);
3120 nfs_rw_destroy(&mi->mi_recovlock);
3121 nfs_rw_destroy(&mi->mi_rename_lock);
3122 nfs_rw_destroy(&mi->mi_fh_lock);
3123 cv_destroy(&mi->mi_failover_cv);
3124 cv_destroy(&mi->mi_async_reqs_cv);
3125 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3126 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3127 cv_destroy(&mi->mi_async_cv);
3128 cv_destroy(&mi->mi_inact_req_cv);
3129 /*
3130 * Destroy the oo hash lists and mutexes for the cred hash table.
3131 */
3132 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3133 bucketp = &(mi->mi_oo_list[i]);
3134 /* Destroy any remaining open owners on the list */
3135 foop = list_head(&bucketp->b_oo_hash_list);
3136 while (foop != NULL) {
3137 list_remove(&bucketp->b_oo_hash_list, foop);
3138 nfs4_destroy_open_owner(foop);
3139 foop = list_head(&bucketp->b_oo_hash_list);
3140 }
3141 list_destroy(&bucketp->b_oo_hash_list);
3142 mutex_destroy(&bucketp->b_lock);
3143 }
3144 /*
3145 * Empty and destroy the freed open owner list.
3146 */
3147 foop = list_head(&mi->mi_foo_list);
3148 while (foop != NULL) {
3149 list_remove(&mi->mi_foo_list, foop);
3150 nfs4_destroy_open_owner(foop);
3151 foop = list_head(&mi->mi_foo_list);
3152 }
3153 list_destroy(&mi->mi_foo_list);
3154 list_destroy(&mi->mi_bseqid_list);
3155 list_destroy(&mi->mi_lost_state);
3156 avl_destroy(&mi->mi_filehandles);
3157 kmem_free(mi, sizeof (*mi));
3158 }
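
/*
 * mi_hold/mi_rele - manage the reference count on a mntinfo4 (cf. the
 * MI4_HOLD/MI4_RELE macros used elsewhere in this file).  When the
 * count drops to zero, the structure is freed via nfs_free_mi4().
 */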
3159 void
3160 mi_hold(mntinfo4_t *mi)
3161 {
3162 atomic_inc_32(&mi->mi_count);
3163 ASSERT(mi->mi_count != 0);
3164 }
3165
3166 void
3167 mi_rele(mntinfo4_t *mi)
3168 {
3169 ASSERT(mi->mi_count != 0);
3170 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3171 nfs_free_mi4(mi);
3172 }
3173 }
3174
3175 vnode_t nfs4_xattr_notsupp_vnode;
3176
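/*
 * One-time initialization of the NFSv4 client: set up the vnode ops,
 * rnode, shadow vnode and access caches, ACL and idmap support,
 * callbacks and secinfo, register the CPR callback used to refresh the
 * client lease across suspend/resume, and create the per-zone key for
 * the list of NFSv4 mounts.
 */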
3177 void
3178 nfs4_clnt_init(void)
3179 {
3180 nfs4_vnops_init();
3181 (void) nfs4_rnode_init();
3182 (void) nfs4_shadow_init();
3183 (void) nfs4_acache_init();
3184 (void) nfs4_subr_init();
3185 nfs4_acl_init();
3186 nfs_idmap_init();
3187 nfs4_callback_init();
3188 nfs4_secinfo_init();
3189 #ifdef DEBUG
3190 tsd_create(&nfs4_tsd_key, NULL);
3191 #endif
3192
3193 /*
3194 * Add a CPR callback so that we can update client
3195 * lease after a suspend and resume.
3196 */
3197 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3198
3199 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3200 nfs4_mi_destroy);
3201
3202 /*
3203 * Initialize the reference count of the notsupp xattr cache vnode to 1
3204 * so that it never goes away (VOP_INACTIVE isn't called on it).
3205 */
3206 vn_reinit(&nfs4_xattr_notsupp_vnode);
3207 }
3208
3209 void
3210 nfs4_clnt_fini(void)
3211 {
3212 (void) zone_key_delete(mi4_list_key);
3213 nfs4_vnops_fini();
3214 (void) nfs4_rnode_fini();
3215 (void) nfs4_shadow_fini();
3216 (void) nfs4_acache_fini();
3217 (void) nfs4_subr_fini();
3218 nfs_idmap_fini();
3219 nfs4_callback_fini();
3220 nfs4_secinfo_fini();
3221 #ifdef DEBUG
3222 tsd_destroy(&nfs4_tsd_key);
3223 #endif
3224 if (cid)
3225 (void) callb_delete(cid);
3226 }
3227
3228 /*ARGSUSED*/
3229 static boolean_t
3230 nfs4_client_cpr_callb(void *arg, int code)
3231 {
3232 /*
3233 * We get called for Suspend and Resume events.
3234 * For the suspend case we simply don't care!
3235 */
3236 if (code == CB_CODE_CPR_CHKPT) {
3237 return (B_TRUE);
3238 }
3239
3240 /*
3241 * When we get to here we are in the process of
3242 * resuming the system from a previous suspend.
3243 */
3244 nfs4_client_resumed = gethrestime_sec();
3245 return (B_TRUE);
3246 }
3247
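/*
 * nfs4_renew_lease_thread - per-nfs4_server thread that keeps the
 * client's lease alive.  While there is state to maintain, it sleeps
 * for roughly half the lease period (adjusted for the observed
 * propagation delay) and then issues a RENEW via nfs4renew(), unless
 * some other over-the-wire call has renewed the lease implicitly in
 * the meantime.  The thread exits when s_thread_exit is set to
 * NFS4_THREAD_EXIT.
 */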
3248 void
3249 nfs4_renew_lease_thread(nfs4_server_t *sp)
3250 {
3251 int error = 0;
3252 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3253 clock_t tick_delay = 0;
3254 clock_t time_left = 0;
3255 callb_cpr_t cpr_info;
3256 kmutex_t cpr_lock;
3257
3258 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3259 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3260 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3261 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3262
3263 mutex_enter(&sp->s_lock);
3264 /* sp->s_lease_time is set via a GETATTR */
3265 sp->last_renewal_time = gethrestime_sec();
3266 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3267 ASSERT(sp->s_refcnt >= 1);
3268
3269 for (;;) {
3270 if (!sp->state_ref_count ||
3271 sp->lease_valid != NFS4_LEASE_VALID) {
3272
3273 kip_secs = MAX((sp->s_lease_time >> 1) -
3274 (3 * sp->propagation_delay.tv_sec), 1);
3275
3276 tick_delay = SEC_TO_TICK(kip_secs);
3277
3278 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3279 "nfs4_renew_lease_thread: no renew : thread "
3280 "wait %ld secs", kip_secs));
3281
3282 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3283 "nfs4_renew_lease_thread: no renew : "
3284 "state_ref_count %d, lease_valid %d",
3285 sp->state_ref_count, sp->lease_valid));
3286
3287 mutex_enter(&cpr_lock);
3288 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3289 mutex_exit(&cpr_lock);
3290 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3291 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3292 mutex_enter(&cpr_lock);
3293 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3294 mutex_exit(&cpr_lock);
3295
3296 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3297 "nfs4_renew_lease_thread: no renew: "
3298 "time left %ld", time_left));
3299
3300 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3301 goto die;
3302 continue;
3303 }
3304
3305 tmp_last_renewal_time = sp->last_renewal_time;
3306
3307 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3308 (3 * sp->propagation_delay.tv_sec);
3309
3310 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3311 "nfs4_renew_lease_thread: tmp_time %ld, "
3312 "sp->last_renewal_time %ld", tmp_time,
3313 sp->last_renewal_time));
3314
3315 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3316
3317 tick_delay = SEC_TO_TICK(kip_secs);
3318
3319 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3320 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3321 "secs", kip_secs));
3322
3323 mutex_enter(&cpr_lock);
3324 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3325 mutex_exit(&cpr_lock);
3326 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3327 tick_delay, TR_CLOCK_TICK);
3328 mutex_enter(&cpr_lock);
3329 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3330 mutex_exit(&cpr_lock);
3331
3332 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3333 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3334 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3335 "tmp_last_renewal_time %ld", time_left,
3336 sp->last_renewal_time, nfs4_client_resumed,
3337 tmp_last_renewal_time));
3338
3339 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3340 goto die;
3341
3342 if (tmp_last_renewal_time == sp->last_renewal_time ||
3343 (nfs4_client_resumed != 0 &&
3344 nfs4_client_resumed > sp->last_renewal_time)) {
3345 /*
3346 * Issue RENEW op since we haven't renewed the lease
3347 * since we slept.
3348 */
3349 tmp_now_time = gethrestime_sec();
3350 error = nfs4renew(sp);
3351 /*
3352 * Need to re-acquire sp's lock, nfs4renew()
3353 * relinquishes it.
3354 */
3355 mutex_enter(&sp->s_lock);
3356
3357 /*
3358 * See if someone changed s_thread_exit while we gave
3359 * up s_lock.
3360 */
3361 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3362 goto die;
3363
3364 if (!error) {
3365 /*
3366 * check to see if we implicitly renewed while
3367 * we waited for a reply for our RENEW call.
3368 */
3369 if (tmp_last_renewal_time ==
3370 sp->last_renewal_time) {
3371 /* no implicit renew came */
3372 sp->last_renewal_time = tmp_now_time;
3373 } else {
3374 NFS4_DEBUG(nfs4_client_lease_debug,
3375 (CE_NOTE, "renew_thread: did "
3376 "implicit renewal before reply "
3377 "from server for RENEW"));
3378 }
3379 } else {
3380 /* figure out error */
3381 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3382 "renew_thread: nfs4renew returned error"
3383 " %d", error));
3384 }
3385
3386 }
3387 }
3388
3389 die:
3390 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3391 "nfs4_renew_lease_thread: thread exiting"));
3392
3393 while (sp->s_otw_call_count != 0) {
3394 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3395 "nfs4_renew_lease_thread: waiting for outstanding "
3396 "otw calls to finish for sp 0x%p, current "
3397 "s_otw_call_count %d", (void *)sp,
3398 sp->s_otw_call_count));
3399 mutex_enter(&cpr_lock);
3400 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3401 mutex_exit(&cpr_lock);
3402 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3403 mutex_enter(&cpr_lock);
3404 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3405 mutex_exit(&cpr_lock);
3406 }
3407 mutex_exit(&sp->s_lock);
3408
3409 nfs4_server_rele(sp); /* free the thread's reference */
3410 nfs4_server_rele(sp); /* free the list's reference */
3411 sp = NULL;
3412
3413 done:
3414 mutex_enter(&cpr_lock);
3415 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3416 mutex_destroy(&cpr_lock);
3417
3418 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3419 "nfs4_renew_lease_thread: renew thread exit officially"));
3420
3421 zthread_exit();
3422 /* NOT REACHED */
3423 }
3424
3425 /*
3426 * Send out a RENEW op to the server.
3427 * Assumes sp is locked down.
3428 */
3429 static int
3430 nfs4renew(nfs4_server_t *sp)
3431 {
3432 COMPOUND4args_clnt args;
3433 COMPOUND4res_clnt res;
3434 nfs_argop4 argop[1];
3435 int doqueue = 1;
3436 int rpc_error;
3437 cred_t *cr;
3438 mntinfo4_t *mi;
3439 timespec_t prop_time, after_time;
3440 int needrecov = FALSE;
3441 nfs4_recov_state_t recov_state;
3442 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3443
3444 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3445
3446 recov_state.rs_flags = 0;
3447 recov_state.rs_num_retry_despite_err = 0;
3448
3449 recov_retry:
3450 mi = sp->mntinfo4_list;
3451 VFS_HOLD(mi->mi_vfsp);
3452 mutex_exit(&sp->s_lock);
3453 ASSERT(mi != NULL);
3454
3455 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3456 if (e.error) {
3457 VFS_RELE(mi->mi_vfsp);
3458 return (e.error);
3459 }
3460
3461 /* Check to see if we're dealing with a marked-dead sp */
3462 mutex_enter(&sp->s_lock);
3463 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3464 mutex_exit(&sp->s_lock);
3465 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3466 VFS_RELE(mi->mi_vfsp);
3467 return (0);
3468 }
3469
3470 /* Make sure mi hasn't changed on us */
3471 if (mi != sp->mntinfo4_list) {
3472 /* Must drop sp's lock to avoid a recursive mutex enter */
3473 mutex_exit(&sp->s_lock);
3474 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3475 VFS_RELE(mi->mi_vfsp);
3476 mutex_enter(&sp->s_lock);
3477 goto recov_retry;
3478 }
3479 mutex_exit(&sp->s_lock);
3480
3481 args.ctag = TAG_RENEW;
3482
3483 args.array_len = 1;
3484 args.array = argop;
3485
3486 argop[0].argop = OP_RENEW;
3487
3488 mutex_enter(&sp->s_lock);
3489 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3490 cr = sp->s_cred;
3491 crhold(cr);
3492 mutex_exit(&sp->s_lock);
3493
3494 ASSERT(cr != NULL);
3495
3496 /* used to figure out RTT for sp */
3497 gethrestime(&prop_time);
3498
3499 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3500 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3501 (void*)sp));
3502 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3503 prop_time.tv_sec, prop_time.tv_nsec));
3504
3505 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3506 mntinfo4_t *, mi);
3507
3508 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3509 crfree(cr);
3510
3511 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3512 mntinfo4_t *, mi);
3513
3514 gethrestime(&after_time);
3515
3516 mutex_enter(&sp->s_lock);
3517 sp->propagation_delay.tv_sec =
3518 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3519 mutex_exit(&sp->s_lock);
3520
3521 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3522 after_time.tv_sec, after_time.tv_nsec));
3523
3524 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3526 nfs4_delegreturn_all(sp);
3527 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3528 VFS_RELE(mi->mi_vfsp);
3529 /*
3530 * If the server returns CB_PATH_DOWN, it has renewed
3531 * the lease and informed us that the callback path is
3532 * down. Since the lease is renewed, just return 0 and
3533 * let the renew thread proceed as normal.
3534 */
3535 return (0);
3536 }
3537
3538 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3539 if (!needrecov && e.error) {
3540 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3541 VFS_RELE(mi->mi_vfsp);
3542 return (e.error);
3543 }
3544
3545 rpc_error = e.error;
3546
3547 if (needrecov) {
3548 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3549 "nfs4renew: initiating recovery\n"));
3550
3551 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3552 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3553 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3554 VFS_RELE(mi->mi_vfsp);
3555 if (!e.error)
3556 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3557 mutex_enter(&sp->s_lock);
3558 goto recov_retry;
3559 }
3560 /* fall through for res.status case */
3561 }
3562
3563 if (res.status) {
3564 if (res.status == NFS4ERR_LEASE_MOVED) {
3565 /*EMPTY*/
3566 /*
3567 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3568 * to renew the lease on that server
3569 */
3570 }
3571 e.error = geterrno4(res.status);
3572 }
3573
3574 if (!rpc_error)
3575 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3576
3577 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3578
3579 VFS_RELE(mi->mi_vfsp);
3580
3581 return (e.error);
3582 }
3583
3584 void
3585 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3586 {
3587 nfs4_server_t *sp;
3588
3589 /* this locks down sp if it is found */
3590 sp = find_nfs4_server(mi);
3591
3592 if (sp != NULL) {
3593 nfs4_inc_state_ref_count_nolock(sp, mi);
3594 mutex_exit(&sp->s_lock);
3595 nfs4_server_rele(sp);
3596 }
3597 }
3598
3599 /*
3600 * Bump the number of OPEN files (ie: those with state) so we know if this
3601 * nfs4_server has any state to maintain a lease for or not.
3602 *
3603 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3604 */
3605 void
3606 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3607 {
3608 ASSERT(mutex_owned(&sp->s_lock));
3609
3610 sp->state_ref_count++;
3611 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3612 "nfs4_inc_state_ref_count: state_ref_count now %d",
3613 sp->state_ref_count));
3614
3615 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3616 sp->lease_valid = NFS4_LEASE_VALID;
3617
3618 /*
3619 * If this call caused the lease to be marked valid and/or
3620 * took the state_ref_count from 0 to 1, then start the time
3621 * on lease renewal.
3622 */
3623 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3624 sp->last_renewal_time = gethrestime_sec();
3625
3626 /* update the number of open files for mi */
3627 mi->mi_open_files++;
3628 }
3629
3630 void
3631 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3632 {
3633 nfs4_server_t *sp;
3634
3635 /* this locks down sp if it is found */
3636 sp = find_nfs4_server_all(mi, 1);
3637
3638 if (sp != NULL) {
3639 nfs4_dec_state_ref_count_nolock(sp, mi);
3640 mutex_exit(&sp->s_lock);
3641 nfs4_server_rele(sp);
3642 }
3643 }
3644
3645 /*
3646 * Decrement the number of OPEN files (ie: those with state) so we know if
3647 * this nfs4_server has any state to maintain a lease for or not.
3648 */
3649 void
3650 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3651 {
3652 ASSERT(mutex_owned(&sp->s_lock));
3653 ASSERT(sp->state_ref_count != 0);
3654 sp->state_ref_count--;
3655
3656 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3657 "nfs4_dec_state_ref_count: state ref count now %d",
3658 sp->state_ref_count));
3659
3660 mi->mi_open_files--;
3661 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3662 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3663 mi->mi_open_files, mi->mi_flags));
3664
3665 /* We don't have to hold the mi_lock to test mi_flags */
3666 if (mi->mi_open_files == 0 &&
3667 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3668 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3669 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3670 "we have closed the last open file", (void*)mi));
3671 nfs4_remove_mi_from_server(mi, sp);
3672 }
3673 }
3674
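/*
 * Return TRUE if the lease for the given nfs4_server is believed to
 * still be valid, i.e. it has been marked valid and fewer than
 * s_lease_time seconds have passed since the last renewal.  The caller
 * must hold sp->s_lock.
 */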
3675 bool_t
3676 inlease(nfs4_server_t *sp)
3677 {
3678 bool_t result;
3679
3680 ASSERT(mutex_owned(&sp->s_lock));
3681
3682 if (sp->lease_valid == NFS4_LEASE_VALID &&
3683 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3684 result = TRUE;
3685 else
3686 result = FALSE;
3687
3688 return (result);
3689 }
3690
3691
3692 /*
3693 * Return non-zero if the given nfs4_server_t is going through recovery.
3694 */
3695
3696 int
3697 nfs4_server_in_recovery(nfs4_server_t *sp)
3698 {
3699 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3700 }
3701
3702 /*
3703 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3704 * first is less than, equal to, or greater than the second.
3705 */
3706
3707 int
3708 sfh4cmp(const void *p1, const void *p2)
3709 {
3710 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3711 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3712
3713 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3714 }
3715
3716 /*
3717 * Create a table for shared filehandle objects.
3718 */
3719
3720 void
3721 sfh4_createtab(avl_tree_t *tab)
3722 {
3723 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3724 offsetof(nfs4_sharedfh_t, sfh_tree));
3725 }
3726
3727 /*
3728 * Return a shared filehandle object for the given filehandle. The caller
3729 * is responsible for eventually calling sfh4_rele().
3730 */
3731
3732 nfs4_sharedfh_t *
3733 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3734 {
3735 nfs4_sharedfh_t *sfh, *nsfh;
3736 avl_index_t where;
3737 nfs4_sharedfh_t skey;
3738
3739 if (!key) {
3740 skey.sfh_fh = *fh;
3741 key = &skey;
3742 }
3743
3744 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3745 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3746 /*
3747 * We allocate the largest possible filehandle size because it's
3748 * not that big, and it saves us from possibly having to resize the
3749 * buffer later.
3750 */
3751 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3752 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3753 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3754 nsfh->sfh_refcnt = 1;
3755 nsfh->sfh_flags = SFH4_IN_TREE;
3756 nsfh->sfh_mi = mi;
3757 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3758 (void *)nsfh));
3759
3760 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3761 sfh = avl_find(&mi->mi_filehandles, key, &where);
3762 if (sfh != NULL) {
3763 mutex_enter(&sfh->sfh_lock);
3764 sfh->sfh_refcnt++;
3765 mutex_exit(&sfh->sfh_lock);
3766 nfs_rw_exit(&mi->mi_fh_lock);
3767 /* free our speculative allocs */
3768 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3769 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3770 return (sfh);
3771 }
3772
3773 avl_insert(&mi->mi_filehandles, nsfh, where);
3774 nfs_rw_exit(&mi->mi_fh_lock);
3775
3776 return (nsfh);
3777 }
3778
3779 /*
3780 * Return a shared filehandle object for the given filehandle. The caller
3781 * is responsible for eventually calling sfh4_rele().
3782 */
3783
3784 nfs4_sharedfh_t *
3785 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3786 {
3787 nfs4_sharedfh_t *sfh;
3788 nfs4_sharedfh_t key;
3789
3790 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3791
3792 #ifdef DEBUG
3793 if (nfs4_sharedfh_debug) {
3794 nfs4_fhandle_t fhandle;
3795
3796 fhandle.fh_len = fh->nfs_fh4_len;
3797 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3798 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3799 nfs4_printfhandle(&fhandle);
3800 }
3801 #endif
3802
3803 /*
3804 * If there's already an object for the given filehandle, bump the
3805 * reference count and return it. Otherwise, create a new object
3806 * and add it to the AVL tree.
3807 */
3808
3809 key.sfh_fh = *fh;
3810
3811 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3812 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3813 if (sfh != NULL) {
3814 mutex_enter(&sfh->sfh_lock);
3815 sfh->sfh_refcnt++;
3816 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3817 "sfh4_get: found existing %p, new refcnt=%d",
3818 (void *)sfh, sfh->sfh_refcnt));
3819 mutex_exit(&sfh->sfh_lock);
3820 nfs_rw_exit(&mi->mi_fh_lock);
3821 return (sfh);
3822 }
3823 nfs_rw_exit(&mi->mi_fh_lock);
3824
3825 return (sfh4_put(fh, mi, &key));
3826 }
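
/*
 * Typical shared filehandle usage (a sketch; "mi" and "fh" stand for
 * whatever mount and filehandle the caller already has in hand):
 *
 *	nfs4_sharedfh_t *sfh = sfh4_get(&fh, mi);
 *	... use sfh->sfh_fh ...
 *	sfh4_rele(&sfh);	(sfh is set to NULL on return)
 *
 * Additional references are taken with sfh4_hold() and each must be
 * balanced by a call to sfh4_rele().
 */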

/*
 * Get a reference to the given shared filehandle object.
 */

void
sfh4_hold(nfs4_sharedfh_t *sfh)
{
	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt++;
	NFS4_DEBUG(nfs4_sharedfh_debug,
	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
	    (void *)sfh, sfh->sfh_refcnt));
	mutex_exit(&sfh->sfh_lock);
}

/*
 * Release a reference to the given shared filehandle object and null out
 * the given pointer.
 */

void
sfh4_rele(nfs4_sharedfh_t **sfhpp)
{
	mntinfo4_t *mi;
	nfs4_sharedfh_t *sfh = *sfhpp;

	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_refcnt > 1) {
		sfh->sfh_refcnt--;
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		goto finish;
	}
	mutex_exit(&sfh->sfh_lock);

	/*
	 * Possibly the last reference, so get the lock for the table in
	 * case it's time to remove the object from the table.
	 */
	mi = sfh->sfh_mi;
	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt--;
	if (sfh->sfh_refcnt > 0) {
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		nfs_rw_exit(&mi->mi_fh_lock);
		goto finish;
	}

	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
	    "sfh4_rele %p, last ref", (void *)sfh));
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	nfs_rw_exit(&mi->mi_fh_lock);
	mutex_destroy(&sfh->sfh_lock);
	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
	kmem_free(sfh, sizeof (nfs4_sharedfh_t));

finish:
	*sfhpp = NULL;
}

/*
 * Update the filehandle for the given shared filehandle object.
 */

int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */

void
sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
{
	mntinfo4_t *mi = sfh->sfh_mi;
	nfs4_sharedfh_t *dupsfh;
	avl_index_t where;
	nfs4_sharedfh_t key;

#ifdef DEBUG
	mutex_enter(&sfh->sfh_lock);
	ASSERT(sfh->sfh_refcnt > 0);
	mutex_exit(&sfh->sfh_lock);
#endif
	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);

	/*
	 * The basic plan is to remove the shared filehandle object from
	 * the table, update it to have the new filehandle, then reinsert
	 * it.
	 */

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
	    sfh->sfh_fh.nfs_fh4_len);

	/*
	 * XXX If there is already a shared filehandle object with the new
	 * filehandle, we're in trouble, because the rnode code assumes
	 * that there is only one shared filehandle object for a given
	 * filehandle. So issue a warning (for read-write mounts only)
	 * and don't try to re-insert the given object into the table.
	 * Hopefully the given object will quickly go away and everyone
	 * will use the new object.
	 */
	key.sfh_fh = *newfh;
	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
	if (dupsfh != NULL) {
		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
			    "duplicate filehandle detected");
			sfh4_printfhandle(dupsfh);
		}
	} else {
		avl_insert(&mi->mi_filehandles, sfh, where);
		mutex_enter(&sfh->sfh_lock);
		sfh->sfh_flags |= SFH4_IN_TREE;
		mutex_exit(&sfh->sfh_lock);
	}
	nfs_rw_exit(&mi->mi_fh_lock);
}
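
/*
 * Illustrative sketch, not part of the build: swapping in a new filehandle
 * for an existing object, e.g. when the server returns a replacement
 * filehandle for the same file. "rp->r_fh" is assumed to be the rnode's
 * shared filehandle pointer and "newfh" a validated nfs_fh4.
 *
 *	ASSERT(newfh.nfs_fh4_len <= NFS4_FHSIZE);
 *	sfh4_update(rp->r_fh, &newfh);
 *	(existing holders keep their references; only the bytes change)
 */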

/*
 * Copy out the current filehandle for the given shared filehandle object.
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}

/*
 * Print out the filehandle for the given shared filehandle object.
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}

/*
 * Compare two fnames. Returns -1 if the first is "less" than the second,
 * 0 if they're the same, +1 if the first is "greater" than the second.
 * The caller (or whoever's calling the AVL package) is responsible for
 * handling locking issues.
 */

static int
fncmp(const void *p1, const void *p2)
{
	const nfs4_fname_t *f1 = p1;
	const nfs4_fname_t *f2 = p2;
	int res;

	res = strcmp(f1->fn_name, f2->fn_name);
	/*
	 * The AVL package wants +/-1, not arbitrary positive or negative
	 * integers.
	 */
	if (res > 0)
		res = 1;
	else if (res < 0)
		res = -1;
	return (res);
}

/*
 * Get or create an fname with the given name, as a child of the given
 * fname. The caller is responsible for eventually releasing the reference
 * (fn_rele()). parent may be NULL.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it. Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * The fname entry we are looking for must match both the name and
	 * the sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released later, in case
			 * this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry; return it with
				 * the hold taken above.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to a
			 * server-side rename. We will remove this entry
			 * and retry the lookup to make sure no such
			 * entries remain.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			fn_rele(&fnp);
			goto again;
		}
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is released later, when we do the final
	 * fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
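
/*
 * Illustrative sketch, not part of the build: building a short naming
 * chain and releasing it. "rootfn", "dirsfh" and "filesfh" are assumed
 * to already exist.
 *
 *	nfs4_fname_t *dirfn, *filefn;
 *
 *	dirfn = fn_get(rootfn, "dir", dirsfh);
 *	filefn = fn_get(dirfn, "file", filesfh);
 *	...
 *	fn_rele(&filefn);
 *	fn_rele(&dirfn);
 *
 * Each fn_rele() also drops the hold its fname took on its parent once
 * the reference count reaches zero.
 */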

void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_inc_32(&fnp->fn_refcnt);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}

/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero. Nulls out the given pointer.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock); /* prevent new references */
	newref = atomic_dec_32_nv(&fnp->fn_refcnt);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	sfh4_rele(&fnp->fn_sfh);
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		fnpp = &parent;
		goto recur;
	}
}

/*
 * Returns the single component name of the given fname, in a MAXNAMELEN
 * string buffer, which the caller is responsible for freeing. Note that
 * the name may become invalid as a result of fn_move().
 */

char *
fn_name(nfs4_fname_t *fnp)
{
	char *name;

	ASSERT(fnp->fn_len < MAXNAMELEN);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	mutex_enter(&fnp->fn_lock);
	(void) strcpy(name, fnp->fn_name);
	mutex_exit(&fnp->fn_lock);

	return (name);
}
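
/*
 * Illustrative sketch, not part of the build: the buffer returned by
 * fn_name() is always MAXNAMELEN bytes, so that is the size to free.
 *
 *	char *name = fn_name(fnp);
 *	... use name ...
 *	kmem_free(name, MAXNAMELEN);
 */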


/*
 * fn_path_realloc
 *
 * This function, used only by fn_path, constructs a new string that
 * looks like "prepend" + "/" + "current", allocating a new string and
 * freeing the old one.
 */
static void
fn_path_realloc(char **curses, char *prepend)
{
	int len, curlen = 0;
	char *news;

	if (*curses == NULL) {
		/*
		 * Prime the pump: allocate just the space for prepend
		 * and return that.
		 */
		len = strlen(prepend) + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
	} else {
		/*
		 * Allocate the space for the new string; the two extra
		 * bytes are for the "/" separator and the terminating
		 * NUL byte.
		 */
		curlen = strlen(*curses);
		len = curlen + strlen(prepend) + 1 + 1;
		news = kmem_alloc(len, KM_SLEEP);
		(void) strncpy(news, prepend, len);
		(void) strcat(news, "/");
		(void) strcat(news, *curses);
		kmem_free(*curses, curlen + 1);
	}
	*curses = news;
}
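
/*
 * Worked example of fn_path_realloc(): starting from path == NULL and
 * prepending components while walking from the leaf toward the root,
 *
 *	fn_path_realloc(&path, "c");	path is now "c"
 *	fn_path_realloc(&path, "b");	path is now "b/c"
 *	fn_path_realloc(&path, "a");	path is now "a/b/c"
 *
 * which is exactly the order fn_path() below visits the fnames.
 */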

/*
 * Returns the path name (starting from the fs root) for the given fname.
 * The caller is responsible for freeing. Note that the path may be or
 * become invalid as a result of fn_move().
 */

char *
fn_path(nfs4_fname_t *fnp)
{
	char *path;
	nfs4_fname_t *nextfnp;

	if (fnp == NULL)
		return (NULL);

	path = NULL;

	/* Walk up the tree constructing the pathname. */

	fn_hold(fnp); /* adjust for later rele */
	do {
		mutex_enter(&fnp->fn_lock);
		/*
		 * Add fn_name in front of the current path
		 */
		fn_path_realloc(&path, fnp->fn_name);
		nextfnp = fnp->fn_parent;
		if (nextfnp != NULL)
			fn_hold(nextfnp);
		mutex_exit(&fnp->fn_lock);
		fn_rele(&fnp);
		fnp = nextfnp;
	} while (fnp != NULL);

	return (path);
}
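
/*
 * Illustrative sketch, not part of the build: the string returned by
 * fn_path() is sized exactly to the path, so it is presumably freed with
 * strlen() + 1. "VTOSV(vp)->sv_name" is assumed to be one place an fname
 * is kept.
 *
 *	char *path = fn_path(VTOSV(vp)->sv_name);
 *	if (path != NULL) {
 *		... log or compare the path ...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 */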

/*
 * Return a reference to the parent of the given fname, which the caller is
 * responsible for eventually releasing.
 */

nfs4_fname_t *
fn_parent(nfs4_fname_t *fnp)
{
	nfs4_fname_t *parent;

	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		fn_hold(parent);
	mutex_exit(&fnp->fn_lock);

	return (parent);
}

/*
 * Update fnp so that its parent is newparent and its name is newname.
 */

void
fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
{
	nfs4_fname_t *parent, *tmpfnp;
	ssize_t newlen;
	nfs4_fname_t key;
	avl_index_t where;

	/*
	 * This assert exists to catch the client trying to rename
	 * a dir to be a child of itself. This happened at a recent
	 * bakeoff against a 3rd party (broken) server which allowed
	 * the rename to succeed. If it trips it means that:
	 *	a) the code in nfs4rename that detects this case is broken
	 *	b) the server is broken (since it allowed the bogus rename)
	 *
	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
	 * panic below from: mutex_enter(&newparent->fn_lock);
	 */
	ASSERT(fnp != newparent);

	/*
	 * Remove fnp from its current parent, change its name, then add it
	 * to newparent. It might happen that fnp was replaced by another
	 * nfs4_fname_t with the same fn_name in parent->fn_children.
	 * In such a case, fnp->fn_parent is NULL and we skip the removal
	 * of fnp from its current parent.
	 */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
		fn_rele(&fnp->fn_parent);
	}

	newlen = strlen(newname);
	if (newlen != fnp->fn_len) {
		ASSERT(newlen < MAXNAMELEN);
		kmem_free(fnp->fn_name, fnp->fn_len + 1);
		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
		fnp->fn_len = newlen;
	}
	(void) strcpy(fnp->fn_name, newname);

again:
	mutex_enter(&newparent->fn_lock);
	key.fn_name = fnp->fn_name;
	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
	if (tmpfnp != NULL) {
		/*
		 * This could be due to a file that was unlinked while
		 * open, or perhaps the rnode is in the free list. Remove
		 * it from newparent and let it go away on its own. The
		 * contorted code is to deal with lock order issues and
		 * race conditions.
		 */
		fn_hold(tmpfnp);
		mutex_exit(&newparent->fn_lock);
		mutex_enter(&tmpfnp->fn_lock);
		if (tmpfnp->fn_parent == newparent) {
			mutex_enter(&newparent->fn_lock);
			avl_remove(&newparent->fn_children, tmpfnp);
			mutex_exit(&newparent->fn_lock);
			fn_rele(&tmpfnp->fn_parent);
		}
		mutex_exit(&tmpfnp->fn_lock);
		fn_rele(&tmpfnp);
		goto again;
	}
	fnp->fn_parent = newparent;
	fn_hold(newparent);
	avl_insert(&newparent->fn_children, fnp, where);
	mutex_exit(&newparent->fn_lock);
	mutex_exit(&fnp->fn_lock);
}
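
/*
 * Illustrative sketch, not part of the build: how a client-side rename
 * might update the naming graph once the over-the-wire RENAME succeeds.
 * "ovp" (old vnode), "ndvp" (new directory vnode) and "nnm" (new name)
 * are assumed, as is the VTOSV() accessor for the vnode's fname.
 *
 *	fn_move(VTOSV(ovp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
 *
 * After this, fn_path() on the moved fname (and on any of its children)
 * reflects the new location.
 */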

#ifdef DEBUG
/*
 * Return non-zero if the type information makes sense for the given vnode.
 * Otherwise panic.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	if (nfs4_vtype_debug && vp->v_type != VNON &&
	    rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
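
/*
 * Illustrative sketch, not part of the build: nfs4_consistent_type()
 * either panics or returns 1, so it is intended to be wrapped in ASSERT()
 * and compiled away entirely in non-DEBUG kernels:
 *
 *	ASSERT(nfs4_consistent_type(vp));
 */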