1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
29 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
30 */
31
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/proc.h>
37 #include <sys/user.h>
38 #include <sys/time.h>
39 #include <sys/buf.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/socket.h>
43 #include <sys/uio.h>
44 #include <sys/tiuser.h>
45 #include <sys/swap.h>
46 #include <sys/errno.h>
47 #include <sys/debug.h>
48 #include <sys/kmem.h>
49 #include <sys/kstat.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vtrace.h>
52 #include <sys/session.h>
53 #include <sys/dnlc.h>
54 #include <sys/bitmap.h>
55 #include <sys/acl.h>
56 #include <sys/ddi.h>
57 #include <sys/pathname.h>
58 #include <sys/flock.h>
59 #include <sys/dirent.h>
61 #include <sys/callb.h>
62 #include <sys/atomic.h>
63 #include <sys/list.h>
64 #include <sys/tsol/tnet.h>
65 #include <sys/priv.h>
66 #include <sys/sdt.h>
67 #include <sys/attr.h>
68
69 #include <inet/ip6.h>
70
71 #include <rpc/types.h>
72 #include <rpc/xdr.h>
73 #include <rpc/auth.h>
74 #include <rpc/clnt.h>
75
76 #include <nfs/nfs.h>
77 #include <nfs/nfs4.h>
78 #include <nfs/nfs_clnt.h>
79 #include <nfs/rnode.h>
80 #include <nfs/nfs_acl.h>
81
82 #include <sys/tsol/label.h>
83
84 /*
85 * The hash queues for access to active and cached rnodes
86 * are organized as doubly linked lists. A reader/writer lock
87 * for each hash bucket is used to control access and to synchronize
88 * lookups, additions, and deletions from the hash queue.
89 *
90 * The rnode freelist is organized as a doubly linked list with
91 * a head pointer. Additions and deletions are synchronized via
92 * a single mutex.
93 *
94 * In order to add an rnode to the freelist, it must be hashed into
95 * a hash queue and the exclusive lock for that hash queue must be
96 * held. If an rnode is not hashed into a hash queue, then it is
97 * destroyed because it holds no reusable information about the
98 * file. The exclusive lock on the hash queue must be held in order
99 * to prevent a lookup in the hash queue from finding the rnode,
100 * using it, and assuming that the rnode is not on the freelist.
101 * The lookup in the hash queue will have the hash queue locked,
102 * either exclusive or shared.
103 *
104 * The vnode reference count for each rnode is not allowed to drop
105 * below 1. This prevents external entities, such as the VM
106 * subsystem, from acquiring references to vnodes already on the
107 * freelist and then trying to place them back on the freelist
108 * when their reference is released. This means that when an
109 * rnode is looked up in the hash queues, either the rnode is
110 * removed from the freelist and its reference is transferred to
111 * the new holder, or the vnode reference count must be incremented
112 * accordingly. The mutex for the freelist must be held in order to
113 * accurately test whether the rnode is on the freelist or not.
114 * The hash queue lock might be held shared and it is possible that
115 * two different threads may race to remove the rnode from the
116 * freelist. This race can be resolved by holding the mutex for the
117 * freelist. Please note that the mutex for the freelist does not
118 * need to be held if the rnode is not on the freelist. It cannot be
119 * placed on the freelist due to the requirement that the thread
120 * putting the rnode on the freelist must hold the exclusive lock
121 * on the hash queue while the thread doing the lookup in the hash
122 * queue is holding either a shared or exclusive lock on the hash
123 * queue.
124 *
125 * The lock ordering is:
126 *
127 * hash bucket lock -> vnode lock
128 * hash bucket lock -> freelist lock
129 */
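/*
 * To make the rules above concrete, the lookup path can be pictured
 * roughly as follows. This is only an illustrative sketch, not the
 * actual implementation (see the rnode lookup helpers declared and
 * defined later in this file); the bucket search is elided and the
 * lock and field names are approximate:
 *
 *	rw_enter(&bucket->r_lock, RW_READER);
 *	rp = <find the rnode with a matching filehandle in this bucket>;
 *	if (rp != NULL) {
 *		mutex_enter(&rpfreelist_lock);
 *		if (<rp is linked on the freelist>)
 *			rp_rmfree(rp);		(adopt the freelist's hold)
 *		else
 *			VN_HOLD(RTOV(rp));	(take an additional hold)
 *		mutex_exit(&rpfreelist_lock);
 *	}
 *	rw_exit(&bucket->r_lock);
 *
 * Note that this respects the lock ordering listed above: the hash
 * bucket lock is taken before the freelist lock.
 */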
130 static rhashq_t *rtable;
131
132 static kmutex_t rpfreelist_lock;
133 static rnode_t *rpfreelist = NULL;
134 static long rnew = 0;
135 volatile long nrnode = 0;
136
137 static int rtablesize;
138 static int rtablemask;
139
140 static int hashlen = 4;
141
142 static struct kmem_cache *rnode_cache;
143
144 /*
145 * Mutex to protect the following variables:
146 * nfs_major
147 * nfs_minor
148 */
149 kmutex_t nfs_minor_lock;
150 int nfs_major;
151 int nfs_minor;
152
153 /*
154 * Do we allow preepoch (negative) time values otw?
155 * default: do not allow preepoch
156 */
157 volatile bool_t nfs_allow_preepoch_time = FALSE;
158
159 /*
160 * Access cache
161 */
162 static acache_hash_t *acache;
163 volatile long nacache; /* used strictly to size the number of hash queues */
164
165 static int acachesize;
166 static int acachemask;
167 static struct kmem_cache *acache_cache;
168
169 /*
170 * Client side utilities
171 */
172
173 /*
174 * client side statistics
175 */
176 static const struct clstat clstat_tmpl = {
177 { "calls", KSTAT_DATA_UINT64 },
178 { "badcalls", KSTAT_DATA_UINT64 },
179 { "clgets", KSTAT_DATA_UINT64 },
180 { "cltoomany", KSTAT_DATA_UINT64 },
181 #ifdef DEBUG
182 { "clalloc", KSTAT_DATA_UINT64 },
183 { "noresponse", KSTAT_DATA_UINT64 },
184 { "failover", KSTAT_DATA_UINT64 },
185 { "remap", KSTAT_DATA_UINT64 },
186 #endif
187 };
188
189 /*
190 * The following are statistics that describe behavior of the system as a whole
191 * and don't correspond to any one particular zone.
192 */
193 #ifdef DEBUG
194 static struct clstat_debug {
195 kstat_named_t nrnode; /* number of allocated rnodes */
196 kstat_named_t access; /* size of access cache */
197 kstat_named_t dirent; /* size of readdir cache */
198 kstat_named_t dirents; /* size of readdir buf cache */
199 kstat_named_t reclaim; /* number of reclaims */
200 kstat_named_t clreclaim; /* number of cl reclaims */
201 kstat_named_t f_reclaim; /* number of free reclaims */
202 kstat_named_t a_reclaim; /* number of active reclaims */
203 kstat_named_t r_reclaim; /* number of rnode reclaims */
204 kstat_named_t rpath; /* bytes used to store rpaths */
205 } clstat_debug = {
206 { "nrnode", KSTAT_DATA_UINT64 },
207 { "access", KSTAT_DATA_UINT64 },
208 { "dirent", KSTAT_DATA_UINT64 },
209 { "dirents", KSTAT_DATA_UINT64 },
210 { "reclaim", KSTAT_DATA_UINT64 },
211 { "clreclaim", KSTAT_DATA_UINT64 },
212 { "f_reclaim", KSTAT_DATA_UINT64 },
213 { "a_reclaim", KSTAT_DATA_UINT64 },
214 { "r_reclaim", KSTAT_DATA_UINT64 },
215 { "r_path", KSTAT_DATA_UINT64 },
216 };
217 #endif /* DEBUG */
218
219 /*
220 * We keep a global list of per-zone client data, so we can clean up all zones
221 * if we get low on memory.
222 */
223 static list_t nfs_clnt_list;
224 static kmutex_t nfs_clnt_list_lock;
225 static zone_key_t nfsclnt_zone_key;
226
227 static struct kmem_cache *chtab_cache;
228
229 /*
230 * Some servers do not properly update the attributes of the
231 * directory when changes are made. To allow interoperability
232 * with these broken servers, the nfs_disable_rddir_cache
233 * parameter must be set in /etc/system
234 */
235 volatile int nfs_disable_rddir_cache = 0;
236
237 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
238 struct chtab **);
239 void clfree(CLIENT *, struct chtab *);
240 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
241 struct chtab **, struct nfs_clnt *);
242 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
243 struct chtab **, struct nfs_clnt *);
244 static void clreclaim(void *);
245 static int nfs_feedback(int, int, mntinfo_t *);
246 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
247 caddr_t, cred_t *, int *, enum clnt_stat *, int,
248 failinfo_t *);
249 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
250 caddr_t, cred_t *, int *, int, failinfo_t *);
251 static void rinactive(rnode_t *, cred_t *);
252 static int rtablehash(nfs_fhandle *);
253 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
254 struct vnodeops *,
255 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
256 cred_t *),
257 int (*)(const void *, const void *), int *, cred_t *,
258 char *, char *);
259 static void rp_rmfree(rnode_t *);
260 static void rp_addhash(rnode_t *);
261 static void rp_rmhash_locked(rnode_t *);
262 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
263 static void destroy_rnode(rnode_t *);
264 static void rddir_cache_free(rddir_cache *);
265 static int nfs_free_data_reclaim(rnode_t *);
266 static int nfs_active_data_reclaim(rnode_t *);
267 static int nfs_free_reclaim(void);
268 static int nfs_active_reclaim(void);
269 static int nfs_rnode_reclaim(void);
270 static void nfs_reclaim(void *);
271 static int failover_safe(failinfo_t *);
272 static void failover_newserver(mntinfo_t *mi);
273 static void failover_thread(mntinfo_t *mi);
274 static int failover_wait(mntinfo_t *);
275 static int failover_remap(failinfo_t *);
276 static int failover_lookup(char *, vnode_t *,
277 int (*)(vnode_t *, char *, vnode_t **,
278 struct pathname *, int, vnode_t *, cred_t *, int),
279 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
280 vnode_t **);
281 static void nfs_free_r_path(rnode_t *);
282 static void nfs_set_vroot(vnode_t *);
283 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
284
285 /*
286 * from rpcsec module (common/rpcsec)
287 */
288 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
289 extern void sec_clnt_freeh(AUTH *);
290 extern void sec_clnt_freeinfo(struct sec_data *);
291
292 /*
293 * used in mount policy
294 */
295 extern ts_label_t *getflabel_cipso(vfs_t *);
296
297 /*
298 * EIO or EINTR are not recoverable errors.
299 */
300 #define IS_RECOVERABLE_ERROR(error) (!((error) == EINTR || (error) == EIO))
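/*
 * For example, IS_RECOVERABLE_ERROR(ETIMEDOUT) is true, so a timed-out
 * failover_remap() on a hard mount is retried against a new server in
 * rfscall()/aclcall(), while EINTR and EIO cause the call to give up.
 */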
301
302 #ifdef DEBUG
303 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
304 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
305 #else
306 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
307 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
308 #endif
309 /*
310 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
311 */
312 static int
313 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
314 struct chtab **chp, struct nfs_clnt *nfscl)
315 {
316 struct chhead *ch, *newch;
317 struct chhead **plistp;
318 struct chtab *cp;
319 int error;
320 k_sigset_t smask;
321
322 if (newcl == NULL || chp == NULL || ci == NULL)
323 return (EINVAL);
324
325 *newcl = NULL;
326 *chp = NULL;
327
328 /*
329 * Find an unused handle or create one
330 */
331 newch = NULL;
332 nfscl->nfscl_stat.clgets.value.ui64++;
333 top:
334 /*
335 * Find the correct entry in the cache to check for free
336 * client handles. The search is based on the RPC program
337 * number, program version number, dev_t for the transport
338 * device, and the protocol family.
339 */
340 mutex_enter(&nfscl->nfscl_chtable_lock);
341 plistp = &nfscl->nfscl_chtable;
342 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
343 if (ch->ch_prog == ci->cl_prog &&
344 ch->ch_vers == ci->cl_vers &&
345 ch->ch_dev == svp->sv_knconf->knc_rdev &&
346 (strcmp(ch->ch_protofmly,
347 svp->sv_knconf->knc_protofmly) == 0))
348 break;
349 plistp = &ch->ch_next;
350 }
351
352 /*
353 * If we didn't find a cache entry for this quadruple, then
354 * create one. If we don't have one already preallocated,
355 * then drop the cache lock, create one, and then start over.
356 * If we did have a preallocated entry, then just add it to
357 * the front of the list.
358 */
359 if (ch == NULL) {
360 if (newch == NULL) {
361 mutex_exit(&nfscl->nfscl_chtable_lock);
362 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
363 newch->ch_timesused = 0;
364 newch->ch_prog = ci->cl_prog;
365 newch->ch_vers = ci->cl_vers;
366 newch->ch_dev = svp->sv_knconf->knc_rdev;
367 newch->ch_protofmly = kmem_alloc(
368 strlen(svp->sv_knconf->knc_protofmly) + 1,
369 KM_SLEEP);
370 (void) strcpy(newch->ch_protofmly,
371 svp->sv_knconf->knc_protofmly);
372 newch->ch_list = NULL;
373 goto top;
374 }
375 ch = newch;
376 newch = NULL;
377 ch->ch_next = nfscl->nfscl_chtable;
378 nfscl->nfscl_chtable = ch;
379 /*
380 * We found a cache entry, but if it isn't on the front of the
381 * list, then move it to the front of the list to try to take
382 * advantage of locality of operations.
383 */
384 } else if (ch != nfscl->nfscl_chtable) {
385 *plistp = ch->ch_next;
386 ch->ch_next = nfscl->nfscl_chtable;
387 nfscl->nfscl_chtable = ch;
388 }
389
390 /*
391 * If there was a free client handle cached, then remove it
392 * from the list, init it, and use it.
393 */
394 if (ch->ch_list != NULL) {
395 cp = ch->ch_list;
396 ch->ch_list = cp->ch_list;
397 mutex_exit(&nfscl->nfscl_chtable_lock);
398 if (newch != NULL) {
399 kmem_free(newch->ch_protofmly,
400 strlen(newch->ch_protofmly) + 1);
401 kmem_free(newch, sizeof (*newch));
402 }
403 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
404 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
405 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
406 &cp->ch_client->cl_auth);
407 if (error || cp->ch_client->cl_auth == NULL) {
408 CLNT_DESTROY(cp->ch_client);
409 kmem_cache_free(chtab_cache, cp);
410 return ((error != 0) ? error : EINTR);
411 }
412 ch->ch_timesused++;
413 *newcl = cp->ch_client;
414 *chp = cp;
415 return (0);
416 }
417
418 /*
419 * There weren't any free client handles which fit, so allocate
420 * a new one and use that.
421 */
422 #ifdef DEBUG
423 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
424 #endif
425 mutex_exit(&nfscl->nfscl_chtable_lock);
426
427 nfscl->nfscl_stat.cltoomany.value.ui64++;
428 if (newch != NULL) {
429 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
430 kmem_free(newch, sizeof (*newch));
431 }
432
433 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
434 cp->ch_head = ch;
435
436 sigintr(&smask, (int)ci->cl_flags & MI_INT);
437 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
438 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
439 sigunintr(&smask);
440
441 if (error != 0) {
442 kmem_cache_free(chtab_cache, cp);
443 #ifdef DEBUG
444 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
445 #endif
446 /*
447 * Warning is unnecessary if error is EINTR.
448 */
449 if (error != EINTR) {
450 nfs_cmn_err(error, CE_WARN,
451 "clget: couldn't create handle: %m\n");
452 }
453 return (error);
454 }
455 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
456 auth_destroy(cp->ch_client->cl_auth);
457 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
458 &cp->ch_client->cl_auth);
459 if (error || cp->ch_client->cl_auth == NULL) {
460 CLNT_DESTROY(cp->ch_client);
461 kmem_cache_free(chtab_cache, cp);
462 #ifdef DEBUG
463 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
464 #endif
465 return ((error != 0) ? error : EINTR);
466 }
467 ch->ch_timesused++;
468 *newcl = cp->ch_client;
469 ASSERT(cp->ch_client->cl_nosignal == FALSE);
470 *chp = cp;
471 return (0);
472 }
473
474 int
475 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
476 struct chtab **chp)
477 {
478 struct nfs_clnt *nfscl;
479
480 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
481 ASSERT(nfscl != NULL);
482
483 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
484 }
485
486 static int
487 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
488 struct chtab **chp, struct nfs_clnt *nfscl)
489 {
490 clinfo_t ci;
491 int error;
492
493 /*
494 * Set read buffer size to rsize
495 * and add room for RPC headers.
496 */
497 ci.cl_readsize = mi->mi_tsize;
498 if (ci.cl_readsize != 0)
499 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
500
501 /*
502 * If soft mount and server is down just try once.
503 * meaning: do not retransmit.
504 */
505 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
506 ci.cl_retrans = 0;
507 else
508 ci.cl_retrans = mi->mi_retrans;
509
510 ci.cl_prog = NFS_ACL_PROGRAM;
511 ci.cl_vers = mi->mi_vers;
512 ci.cl_flags = mi->mi_flags;
513
514 /*
515 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
516 * security flavor, the client tries to establish a security context
517 * by contacting the server. If the connection is timed out or reset,
518 * e.g. server reboot, we will try again.
519 */
520 do {
521 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
522
523 if (error == 0)
524 break;
525
526 /*
527 * For forced unmount or zone shutdown, bail out, no retry.
528 */
529 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
530 error = EIO;
531 break;
532 }
533
534 /* do not retry for softmount */
535 if (!(mi->mi_flags & MI_HARD))
536 break;
537
538 /* let the caller deal with the failover case */
539 if (FAILOVER_MOUNT(mi))
540 break;
541
542 } while (error == ETIMEDOUT || error == ECONNRESET);
543
544 return (error);
545 }
546
547 static int
548 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
549 struct chtab **chp, struct nfs_clnt *nfscl)
550 {
551 clinfo_t ci;
552 int error;
553
554 /*
555 * Set read buffer size to rsize
556 * and add room for RPC headers.
557 */
558 ci.cl_readsize = mi->mi_tsize;
559 if (ci.cl_readsize != 0)
560 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
561
562 /*
563 * If soft mount and server is down just try once.
564 * meaning: do not retransmit.
565 */
566 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
567 ci.cl_retrans = 0;
568 else
569 ci.cl_retrans = mi->mi_retrans;
570
571 ci.cl_prog = mi->mi_prog;
572 ci.cl_vers = mi->mi_vers;
573 ci.cl_flags = mi->mi_flags;
574
575 /*
576 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
577 * security flavor, the client tries to establish a security context
578 * by contacting the server. If the connection is timed out or reset,
579 * e.g. server reboot, we will try again.
580 */
581 do {
582 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
583
584 if (error == 0)
585 break;
586
587 /*
588 * For forced unmount or zone shutdown, bail out, no retry.
589 */
590 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
591 error = EIO;
592 break;
593 }
594
595 /* do not retry for softmount */
596 if (!(mi->mi_flags & MI_HARD))
597 break;
598
599 /* let the caller deal with the failover case */
600 if (FAILOVER_MOUNT(mi))
601 break;
602
603 } while (error == ETIMEDOUT || error == ECONNRESET);
604
605 return (error);
606 }
607
608 static void
609 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
610 {
611 if (cl->cl_auth != NULL) {
612 sec_clnt_freeh(cl->cl_auth);
613 cl->cl_auth = NULL;
614 }
615
616 /*
617 * Timestamp this cache entry so that we know when it was last
618 * used.
619 */
620 cp->ch_freed = gethrestime_sec();
621
622 /*
623 * Add the free client handle to the front of the list.
624 * This way, the list will be sorted in youngest to oldest
625 * order.
626 */
627 mutex_enter(&nfscl->nfscl_chtable_lock);
628 cp->ch_list = cp->ch_head->ch_list;
629 cp->ch_head->ch_list = cp;
630 mutex_exit(&nfscl->nfscl_chtable_lock);
631 }
632
633 void
634 clfree(CLIENT *cl, struct chtab *cp)
635 {
636 struct nfs_clnt *nfscl;
637
638 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
639 ASSERT(nfscl != NULL);
640
641 clfree_impl(cl, cp, nfscl);
642 }
643
644 #define CL_HOLDTIME 60 /* time to hold client handles */
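/*
 * For example, with CL_HOLDTIME of 60 a reclaim pass at time t only
 * destroys chtab entries whose ch_freed timestamp is t - 60 or older;
 * handles returned by clfree_impl() more recently than that stay
 * cached for reuse by clget_impl().
 */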
645
646 static void
647 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
648 {
649 struct chhead *ch;
650 struct chtab *cp; /* list of objects that can be reclaimed */
651 struct chtab *cpe;
652 struct chtab *cpl;
653 struct chtab **cpp;
654 #ifdef DEBUG
655 int n = 0;
656 #endif
657
658 /*
659 * Need to reclaim some memory, so step through the cache
660 * looking through the lists for entries which can be freed.
661 */
662 cp = NULL;
663
664 mutex_enter(&nfscl->nfscl_chtable_lock);
665
666 /*
667 * Here we step through each non-NULL quadruple and start to
668 * construct the reclaim list pointed to by cp. Note that
669 * cp will contain all eligible chtab entries. When this traversal
670 * completes, chtab entries from the last quadruple will be at the
671 * front of cp and entries from previously inspected quadruples have
672 * been appended to the rear of cp.
673 */
674 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
675 if (ch->ch_list == NULL)
676 continue;
677 /*
678 * Search each list for entries older than
679 * cl_holdtime seconds. The lists are maintained
680 * in youngest to oldest order so that when the
681 * first entry is found which is old enough, then
682 * all of the rest of the entries on the list will
683 * be old enough as well.
684 */
685 cpl = ch->ch_list;
686 cpp = &ch->ch_list;
687 while (cpl != NULL &&
688 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
689 cpp = &cpl->ch_list;
690 cpl = cpl->ch_list;
691 }
692 if (cpl != NULL) {
693 *cpp = NULL;
694 if (cp != NULL) {
695 cpe = cpl;
696 while (cpe->ch_list != NULL)
697 cpe = cpe->ch_list;
698 cpe->ch_list = cp;
699 }
700 cp = cpl;
701 }
702 }
703
704 mutex_exit(&nfscl->nfscl_chtable_lock);
705
706 /*
707 * If cp is empty, then there is nothing to reclaim here.
708 */
709 if (cp == NULL)
710 return;
711
712 /*
713 * Step through the list of entries to free, destroying each client
714 * handle and kmem_free'ing the memory for each entry.
715 */
716 while (cp != NULL) {
717 #ifdef DEBUG
718 n++;
719 #endif
720 CLNT_DESTROY(cp->ch_client);
721 cpl = cp->ch_list;
722 kmem_cache_free(chtab_cache, cp);
723 cp = cpl;
724 }
725
726 #ifdef DEBUG
727 /*
728 * Update clalloc so that nfsstat shows the current number
729 * of allocated client handles.
730 */
731 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
732 #endif
733 }
734
735 /* ARGSUSED */
736 static void
737 clreclaim(void *all)
738 {
739 struct nfs_clnt *nfscl;
740
741 #ifdef DEBUG
742 clstat_debug.clreclaim.value.ui64++;
743 #endif
744 /*
745 * The system is low on memory; go through and try to reclaim some from
746 * every zone on the system.
747 */
748 mutex_enter(&nfs_clnt_list_lock);
749 nfscl = list_head(&nfs_clnt_list);
750 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
751 clreclaim_zone(nfscl, CL_HOLDTIME);
752 mutex_exit(&nfs_clnt_list_lock);
753 }
754
755 /*
756 * Minimum time-out values indexed by call type
757 * These units are in "eighths" of a second to avoid multiplies
758 */
759 static unsigned int minimum_timeo[] = {
760 6, 7, 10
761 };
762
763 /*
764 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
765 */
766 #define MAXTIMO (20*hz)
767 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
768 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
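/*
 * For example, with hz = 1000 a 2 second (2000 tick) timeout backs off
 * through 4000, 8000 and 16000 ticks and is then clamped at MAXTIMO
 * (20 seconds) for any further retransmissions.
 */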
769
770 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
771 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
772 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
773
774 /*
775 * Function called when rfscall notices that we have been
776 * re-transmitting, or when we get a response without retransmissions.
777 * Return 1 if the transfer size was adjusted down - 0 if no change.
778 */
779 static int
780 nfs_feedback(int flag, int which, mntinfo_t *mi)
781 {
782 int kind;
783 int r = 0;
784
785 mutex_enter(&mi->mi_lock);
786 if (flag == FEEDBACK_REXMIT1) {
787 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
788 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
789 goto done;
790 if (mi->mi_curread > MIN_NFS_TSIZE) {
791 mi->mi_curread /= 2;
792 if (mi->mi_curread < MIN_NFS_TSIZE)
793 mi->mi_curread = MIN_NFS_TSIZE;
794 r = 1;
795 }
796
797 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
798 mi->mi_curwrite /= 2;
799 if (mi->mi_curwrite < MIN_NFS_TSIZE)
800 mi->mi_curwrite = MIN_NFS_TSIZE;
801 r = 1;
802 }
803 } else if (flag == FEEDBACK_OK) {
804 kind = mi->mi_timer_type[which];
805 if (kind == 0 ||
806 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
807 goto done;
808 if (kind == 1) {
809 if (mi->mi_curread >= mi->mi_tsize)
810 goto done;
811 mi->mi_curread += MIN_NFS_TSIZE;
812 if (mi->mi_curread > mi->mi_tsize/2)
813 mi->mi_curread = mi->mi_tsize;
814 } else if (kind == 2) {
815 if (mi->mi_curwrite >= mi->mi_stsize)
816 goto done;
817 mi->mi_curwrite += MIN_NFS_TSIZE;
818 if (mi->mi_curwrite > mi->mi_stsize/2)
819 mi->mi_curwrite = mi->mi_stsize;
820 }
821 }
822 done:
823 mutex_exit(&mi->mi_lock);
824 return (r);
825 }
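/*
 * To illustrate the adjustment above: with an 8 KB mi_curread, repeated
 * retransmissions (FEEDBACK_REXMIT1) normally halve the read size
 * 8192 -> 4096 -> ... down to MIN_NFS_TSIZE (512), unless the current
 * retransmission timeout is already under REDUCE_NFS_TIME. Prompt
 * replies (FEEDBACK_OK) grow it back in MIN_NFS_TSIZE increments and
 * snap it to the full mi_tsize once it exceeds mi_tsize / 2.
 */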
826
827 #ifdef DEBUG
828 static int rfs2call_hits = 0;
829 static int rfs2call_misses = 0;
830 #endif
831
832 int
833 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
834 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
835 enum nfsstat *statusp, int flags, failinfo_t *fi)
836 {
837 int rpcerror;
838 enum clnt_stat rpc_status;
839
840 ASSERT(statusp != NULL);
841
842 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
843 cr, douprintf, &rpc_status, flags, fi);
844 if (!rpcerror) {
845 /*
846 * See crnetadjust() for comments.
847 */
848 if (*statusp == NFSERR_ACCES &&
849 (cr = crnetadjust(cr)) != NULL) {
850 #ifdef DEBUG
851 rfs2call_hits++;
852 #endif
853 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
854 resp, cr, douprintf, NULL, flags, fi);
855 crfree(cr);
856 #ifdef DEBUG
857 if (*statusp == NFSERR_ACCES)
858 rfs2call_misses++;
859 #endif
860 }
861 } else if (rpc_status == RPC_PROCUNAVAIL) {
862 *statusp = NFSERR_OPNOTSUPP;
863 rpcerror = 0;
864 }
865
866 return (rpcerror);
867 }
868
869 #define NFS3_JUKEBOX_DELAY (10 * hz)
870
871 volatile clock_t nfs3_jukebox_delay = 0;
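/*
 * hz is not a compile-time constant, so nfs3_jukebox_delay is presumably
 * set to NFS3_JUKEBOX_DELAY during client initialization rather than
 * statically initialized here.
 */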
872
873 #ifdef DEBUG
874 static int rfs3call_hits = 0;
875 static int rfs3call_misses = 0;
876 #endif
877
878 int
879 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
880 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
881 nfsstat3 *statusp, int flags, failinfo_t *fi)
882 {
883 int rpcerror;
884 int user_informed;
885
886 user_informed = 0;
887 do {
888 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
889 cr, douprintf, NULL, flags, fi);
890 if (!rpcerror) {
891 cred_t *crr;
892 if (*statusp == NFS3ERR_JUKEBOX) {
893 if (ttoproc(curthread) == &p0) {
894 rpcerror = EAGAIN;
895 break;
896 }
897 if (!user_informed) {
898 user_informed = 1;
899 uprintf(
900 "file temporarily unavailable on the server, retrying...\n");
901 }
902 delay(nfs3_jukebox_delay);
903 }
904 /*
905 * See crnetadjust() for comments.
906 */
907 else if (*statusp == NFS3ERR_ACCES &&
908 (crr = crnetadjust(cr)) != NULL) {
909 #ifdef DEBUG
910 rfs3call_hits++;
911 #endif
912 rpcerror = rfscall(mi, which, xdrargs, argsp,
913 xdrres, resp, crr, douprintf,
914 NULL, flags, fi);
915
916 crfree(crr);
917 #ifdef DEBUG
918 if (*statusp == NFS3ERR_ACCES)
919 rfs3call_misses++;
920 #endif
921 }
922 }
923 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
924
925 return (rpcerror);
926 }
927
928 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
929 #define INC_READERS(mi) { \
930 mi->mi_readers++; \
931 }
932 #define DEC_READERS(mi) { \
933 mi->mi_readers--; \
934 if (mi->mi_readers == 0) \
935 cv_broadcast(&mi->mi_failover_cv); \
936 }
937
938 static int
939 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
940 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
941 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
942 {
943 CLIENT *client;
944 struct chtab *ch;
945 cred_t *cr = icr;
946 enum clnt_stat status;
947 struct rpc_err rpcerr, rpcerr_tmp;
948 struct timeval wait;
949 int timeo; /* in units of hz */
950 int my_rsize, my_wsize;
951 bool_t tryagain;
952 bool_t cred_cloned = FALSE;
953 k_sigset_t smask;
954 servinfo_t *svp;
955 struct nfs_clnt *nfscl;
956 zoneid_t zoneid = getzoneid();
957 char *msg;
958 #ifdef DEBUG
959 char *bufp;
960 #endif
961
962
963 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
964 "rfscall_start:which %d mi %p", which, mi);
965
966 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
967 ASSERT(nfscl != NULL);
968
969 nfscl->nfscl_stat.calls.value.ui64++;
970 mi->mi_reqs[which].value.ui64++;
971
972 rpcerr.re_status = RPC_SUCCESS;
973
974 /*
975 * In case of forced unmount or zone shutdown, return EIO.
976 */
977
978 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
979 rpcerr.re_status = RPC_FAILED;
980 rpcerr.re_errno = EIO;
981 return (rpcerr.re_errno);
982 }
983
984 /*
985 * Remember the transfer sizes in case
986 * nfs_feedback changes them underneath us.
987 */
988 my_rsize = mi->mi_curread;
989 my_wsize = mi->mi_curwrite;
990
991 /*
992 * NFS client failover support
993 *
994 * If this rnode is not in sync with the current server (VALID_FH),
995 * we'd like to do a remap to get in sync. We can be interrupted
996 * in failover_remap(), and if so we'll bail. Otherwise, we'll
997 * use the best info we have to try the RPC. Part of that is
998 * unconditionally updating the filehandle copy kept for V3.
999 *
1000 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1001 * rw_enter(); we're trying to keep the current server from being
1002 * changed on us until we're done with the remapping and have a
1003 * matching client handle. We don't want to send a filehandle
1004 * to the wrong host.
1005 */
1006 failoverretry:
1007 if (FAILOVER_MOUNT(mi)) {
1008 mutex_enter(&mi->mi_lock);
1009 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1010 if (failover_wait(mi)) {
1011 mutex_exit(&mi->mi_lock);
1012 return (EINTR);
1013 }
1014 }
1015 INC_READERS(mi);
1016 mutex_exit(&mi->mi_lock);
1017 if (fi) {
1018 if (!VALID_FH(fi) &&
1019 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1020 int remaperr;
1021
1022 svp = mi->mi_curr_serv;
1023 remaperr = failover_remap(fi);
1024 if (remaperr != 0) {
1025 #ifdef DEBUG
1026 if (remaperr != EINTR)
1027 nfs_cmn_err(remaperr, CE_WARN,
1028 "rfscall couldn't failover: %m");
1029 #endif
1030 mutex_enter(&mi->mi_lock);
1031 DEC_READERS(mi);
1032 mutex_exit(&mi->mi_lock);
1033 /*
1034 * If failover_remap returns ETIMEDOUT
1035 * and the filesystem is hard mounted
1036 * we have to retry the call with a new
1037 * server.
1038 */
1039 if ((mi->mi_flags & MI_HARD) &&
1040 IS_RECOVERABLE_ERROR(remaperr)) {
1041 if (svp == mi->mi_curr_serv)
1042 failover_newserver(mi);
1043 rpcerr.re_status = RPC_SUCCESS;
1044 goto failoverretry;
1045 }
1046 rpcerr.re_errno = remaperr;
1047 return (remaperr);
1048 }
1049 }
1050 if (fi->fhp && fi->copyproc)
1051 (*fi->copyproc)(fi->fhp, fi->vp);
1052 }
1053 }
1054
1055 /* For TSOL, use a new cred which has net_mac_aware flag */
1056 if (!cred_cloned && is_system_labeled()) {
1057 cred_cloned = TRUE;
1058 cr = crdup(icr);
1059 (void) setpflags(NET_MAC_AWARE, 1, cr);
1060 }
1061
1062 /*
1063 * clget() calls clnt_tli_kinit() which clears the xid, so we
1064 * are guaranteed to reprocess the retry as a new request.
1065 */
1066 svp = mi->mi_curr_serv;
1067 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1068
1069 if (FAILOVER_MOUNT(mi)) {
1070 mutex_enter(&mi->mi_lock);
1071 DEC_READERS(mi);
1072 mutex_exit(&mi->mi_lock);
1073
1074 if ((rpcerr.re_errno == ETIMEDOUT ||
1075 rpcerr.re_errno == ECONNRESET) &&
1076 failover_safe(fi)) {
1077 if (svp == mi->mi_curr_serv)
1078 failover_newserver(mi);
1079 goto failoverretry;
1080 }
1081 }
1082 if (rpcerr.re_errno != 0)
1083 return (rpcerr.re_errno);
1084
1085 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1086 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1087 timeo = (mi->mi_timeo * hz) / 10;
1088 } else {
1089 mutex_enter(&mi->mi_lock);
1090 timeo = CLNT_SETTIMERS(client,
1091 &(mi->mi_timers[mi->mi_timer_type[which]]),
1092 &(mi->mi_timers[NFS_CALLTYPES]),
1093 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1094 (void (*)())NULL, (caddr_t)mi, 0);
1095 mutex_exit(&mi->mi_lock);
1096 }
1097
1098 /*
1099 * If hard mounted fs, retry call forever unless hard error occurs.
1100 */
1101 do {
1102 tryagain = FALSE;
1103
1104 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1105 status = RPC_FAILED;
1106 rpcerr.re_status = RPC_FAILED;
1107 rpcerr.re_errno = EIO;
1108 break;
1109 }
1110
1111 TICK_TO_TIMEVAL(timeo, &wait);
1112
1113 /*
1114 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1115 * and SIGTERM. (Preserving the existing masks).
1116 * Mask out SIGINT if mount option nointr is specified.
1117 */
1118 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1119 if (!(mi->mi_flags & MI_INT))
1120 client->cl_nosignal = TRUE;
1121
1122 /*
1123 * If there is a current signal, then don't bother
1124 * even trying to send out the request because we
1125 * won't be able to block waiting for the response.
1126 * Simply assume RPC_INTR and get on with it.
1127 */
1128 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1129 status = RPC_INTR;
1130 else {
1131 status = CLNT_CALL(client, which, xdrargs, argsp,
1132 xdrres, resp, wait);
1133 }
1134
1135 if (!(mi->mi_flags & MI_INT))
1136 client->cl_nosignal = FALSE;
1137 /*
1138 * restore original signal mask
1139 */
1140 sigunintr(&smask);
1141
1142 switch (status) {
1143 case RPC_SUCCESS:
1144 if ((mi->mi_flags & MI_DYNAMIC) &&
1145 mi->mi_timer_type[which] != 0 &&
1146 (mi->mi_curread != my_rsize ||
1147 mi->mi_curwrite != my_wsize))
1148 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1149 break;
1150
1151 case RPC_INTR:
1152 /*
1153 * There is no way to recover from this error,
1154 * even if mount option nointr is specified.
1155 * SIGKILL, for example, cannot be blocked.
1156 */
1157 rpcerr.re_status = RPC_INTR;
1158 rpcerr.re_errno = EINTR;
1159 break;
1160
1161 case RPC_UDERROR:
1162 /*
1163 * If the NFS server is local (vold) and
1164 * it goes away then we get RPC_UDERROR.
1165 * This is a retryable error, so we would
1166 * normally loop; check whether the specific
1167 * error was ECONNRESET, indicating that the
1168 * target did not exist at all. If so,
1169 * return with RPC_PROGUNAVAIL and
1170 * ECONNRESET to indicate why.
1171 */
1172 CLNT_GETERR(client, &rpcerr);
1173 if (rpcerr.re_errno == ECONNRESET) {
1174 rpcerr.re_status = RPC_PROGUNAVAIL;
1175 rpcerr.re_errno = ECONNRESET;
1176 break;
1177 }
1178 /*FALLTHROUGH*/
1179
1180 default: /* probably RPC_TIMEDOUT */
1181 if (IS_UNRECOVERABLE_RPC(status))
1182 break;
1183
1184 /*
1185 * increment server not responding count
1186 */
1187 mutex_enter(&mi->mi_lock);
1188 mi->mi_noresponse++;
1189 mutex_exit(&mi->mi_lock);
1190 #ifdef DEBUG
1191 nfscl->nfscl_stat.noresponse.value.ui64++;
1192 #endif
1193
1194 if (!(mi->mi_flags & MI_HARD)) {
1195 if (!(mi->mi_flags & MI_SEMISOFT) ||
1196 (mi->mi_ss_call_type[which] == 0))
1197 break;
1198 }
1199
1200 /*
1201 * The call is in progress (over COTS).
1202 * Try the CLNT_CALL again, but don't
1203 * print a noisy error message.
1204 */
1205 if (status == RPC_INPROGRESS) {
1206 tryagain = TRUE;
1207 break;
1208 }
1209
1210 if (flags & RFSCALL_SOFT)
1211 break;
1212
1213 /*
1214 * On zone shutdown, just move on.
1215 */
1216 if (zone_status_get(curproc->p_zone) >=
1217 ZONE_IS_SHUTTING_DOWN) {
1218 rpcerr.re_status = RPC_FAILED;
1219 rpcerr.re_errno = EIO;
1220 break;
1221 }
1222
1223 /*
1224 * NFS client failover support
1225 *
1226 * If the current server just failed us, we'll
1227 * start the process of finding a new server.
1228 * After that, we can just retry.
1229 */
1230 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1231 if (svp == mi->mi_curr_serv)
1232 failover_newserver(mi);
1233 clfree_impl(client, ch, nfscl);
1234 goto failoverretry;
1235 }
1236
1237 tryagain = TRUE;
1238 timeo = backoff(timeo);
1239
1240 CLNT_GETERR(client, &rpcerr_tmp);
1241 if ((status == RPC_CANTSEND) &&
1242 (rpcerr_tmp.re_errno == ENOBUFS))
1243 msg = SRV_QFULL_MSG;
1244 else
1245 msg = SRV_NOTRESP_MSG;
1246
1247 mutex_enter(&mi->mi_lock);
1248 if (!(mi->mi_flags & MI_PRINTED)) {
1249 mi->mi_flags |= MI_PRINTED;
1250 mutex_exit(&mi->mi_lock);
1251 #ifdef DEBUG
1252 zprintf(zoneid, msg, mi->mi_vers,
1253 svp->sv_hostname);
1254 #else
1255 zprintf(zoneid, msg, svp->sv_hostname);
1256 #endif
1257 } else
1258 mutex_exit(&mi->mi_lock);
1259 if (*douprintf && nfs_has_ctty()) {
1260 *douprintf = 0;
1261 if (!(mi->mi_flags & MI_NOPRINT))
1262 #ifdef DEBUG
1263 uprintf(msg, mi->mi_vers,
1264 svp->sv_hostname);
1265 #else
1266 uprintf(msg, svp->sv_hostname);
1267 #endif
1268 }
1269
1270 /*
1271 * If doing dynamic adjustment of transfer
1272 * size and if it's a read or write call
1273 * and if the transfer size changed while
1274 * retransmitting or if the feedback routine
1275 * changed the transfer size,
1276 * then exit rfscall so that the transfer
1277 * size can be adjusted at the vnops level.
1278 */
1279 if ((mi->mi_flags & MI_DYNAMIC) &&
1280 mi->mi_timer_type[which] != 0 &&
1281 (mi->mi_curread != my_rsize ||
1282 mi->mi_curwrite != my_wsize ||
1283 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1284 /*
1285 * On read or write calls, return
1286 * back to the vnode ops level if
1287 * the transfer size changed.
1288 */
1289 clfree_impl(client, ch, nfscl);
1290 if (cred_cloned)
1291 crfree(cr);
1292 return (ENFS_TRYAGAIN);
1293 }
1294 }
1295 } while (tryagain);
1296
1297 if (status != RPC_SUCCESS) {
1298 /*
1299 * Let soft mounts use the timed out message.
1300 */
1301 if (status == RPC_INPROGRESS)
1302 status = RPC_TIMEDOUT;
1303 nfscl->nfscl_stat.badcalls.value.ui64++;
1304 if (status != RPC_INTR) {
1305 mutex_enter(&mi->mi_lock);
1306 mi->mi_flags |= MI_DOWN;
1307 mutex_exit(&mi->mi_lock);
1308 CLNT_GETERR(client, &rpcerr);
1309 #ifdef DEBUG
1310 bufp = clnt_sperror(client, svp->sv_hostname);
1311 zprintf(zoneid, "NFS%d %s failed for %s\n",
1312 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1313 if (nfs_has_ctty()) {
1314 if (!(mi->mi_flags & MI_NOPRINT)) {
1315 uprintf("NFS%d %s failed for %s\n",
1316 mi->mi_vers, mi->mi_rfsnames[which],
1317 bufp);
1318 }
1319 }
1320 kmem_free(bufp, MAXPATHLEN);
1321 #else
1322 zprintf(zoneid,
1323 "NFS %s failed for server %s: error %d (%s)\n",
1324 mi->mi_rfsnames[which], svp->sv_hostname,
1325 status, clnt_sperrno(status));
1326 if (nfs_has_ctty()) {
1327 if (!(mi->mi_flags & MI_NOPRINT)) {
1328 uprintf(
1329 "NFS %s failed for server %s: error %d (%s)\n",
1330 mi->mi_rfsnames[which],
1331 svp->sv_hostname, status,
1332 clnt_sperrno(status));
1333 }
1334 }
1335 #endif
1336 /*
1337 * when CLNT_CALL() fails with RPC_AUTHERROR,
1338 * re_errno is set appropriately depending on
1339 * the authentication error
1340 */
1341 if (status == RPC_VERSMISMATCH ||
1342 status == RPC_PROGVERSMISMATCH)
1343 rpcerr.re_errno = EIO;
1344 }
1345 } else {
1346 /*
1347 * Test the value of mi_down and mi_printed without
1348 * holding the mi_lock mutex. If they are both zero,
1349 * then it is okay to skip the down and printed
1350 * processing. This saves on a mutex_enter and
1351 * mutex_exit pair for a normal, successful RPC.
1352 * This was just complete overhead.
1353 */
1354 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1355 mutex_enter(&mi->mi_lock);
1356 mi->mi_flags &= ~MI_DOWN;
1357 if (mi->mi_flags & MI_PRINTED) {
1358 mi->mi_flags &= ~MI_PRINTED;
1359 mutex_exit(&mi->mi_lock);
1360 #ifdef DEBUG
1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 zprintf(zoneid, "NFS%d server %s ok\n",
1363 mi->mi_vers, svp->sv_hostname);
1364 #else
1365 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1366 zprintf(zoneid, "NFS server %s ok\n",
1367 svp->sv_hostname);
1368 #endif
1369 } else
1370 mutex_exit(&mi->mi_lock);
1371 }
1372
1373 if (*douprintf == 0) {
1374 if (!(mi->mi_flags & MI_NOPRINT))
1375 #ifdef DEBUG
1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377 uprintf("NFS%d server %s ok\n",
1378 mi->mi_vers, svp->sv_hostname);
1379 #else
1380 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1381 uprintf("NFS server %s ok\n", svp->sv_hostname);
1382 #endif
1383 *douprintf = 1;
1384 }
1385 }
1386
1387 clfree_impl(client, ch, nfscl);
1388 if (cred_cloned)
1389 crfree(cr);
1390
1391 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1392
1393 if (rpc_status != NULL)
1394 *rpc_status = rpcerr.re_status;
1395
1396 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1397 rpcerr.re_errno);
1398
1399 return (rpcerr.re_errno);
1400 }
1401
1402 #ifdef DEBUG
1403 static int acl2call_hits = 0;
1404 static int acl2call_misses = 0;
1405 #endif
1406
1407 int
1408 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1409 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1410 enum nfsstat *statusp, int flags, failinfo_t *fi)
1411 {
1412 int rpcerror;
1413
1414 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1415 cr, douprintf, flags, fi);
1416 if (!rpcerror) {
1417 /*
1418 * See comments with crnetadjust().
1419 */
1420 if (*statusp == NFSERR_ACCES &&
1421 (cr = crnetadjust(cr)) != NULL) {
1422 #ifdef DEBUG
1423 acl2call_hits++;
1424 #endif
1425 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1426 resp, cr, douprintf, flags, fi);
1427 crfree(cr);
1428 #ifdef DEBUG
1429 if (*statusp == NFSERR_ACCES)
1430 acl2call_misses++;
1431 #endif
1432 }
1433 }
1434
1435 return (rpcerror);
1436 }
1437
1438 #ifdef DEBUG
1439 static int acl3call_hits = 0;
1440 static int acl3call_misses = 0;
1441 #endif
1442
1443 int
1444 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1445 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1446 nfsstat3 *statusp, int flags, failinfo_t *fi)
1447 {
1448 int rpcerror;
1449 int user_informed;
1450
1451 user_informed = 0;
1452
1453 do {
1454 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1455 cr, douprintf, flags, fi);
1456 if (!rpcerror) {
1457 cred_t *crr;
1458 if (*statusp == NFS3ERR_JUKEBOX) {
1459 if (!user_informed) {
1460 user_informed = 1;
1461 uprintf(
1462 "file temporarily unavailable on the server, retrying...\n");
1463 }
1464 delay(nfs3_jukebox_delay);
1465 }
1466 /*
1467 * See crnetadjust() for comments.
1468 */
1469 else if (*statusp == NFS3ERR_ACCES &&
1470 (crr = crnetadjust(cr)) != NULL) {
1471 #ifdef DEBUG
1472 acl3call_hits++;
1473 #endif
1474 rpcerror = aclcall(mi, which, xdrargs, argsp,
1475 xdrres, resp, crr, douprintf, flags, fi);
1476
1477 crfree(crr);
1478 #ifdef DEBUG
1479 if (*statusp == NFS3ERR_ACCES)
1480 acl3call_misses++;
1481 #endif
1482 }
1483 }
1484 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1485
1486 return (rpcerror);
1487 }
1488
1489 static int
1490 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1491 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1492 int flags, failinfo_t *fi)
1493 {
1494 CLIENT *client;
1495 struct chtab *ch;
1496 cred_t *cr = icr;
1497 bool_t cred_cloned = FALSE;
1498 enum clnt_stat status;
1499 struct rpc_err rpcerr;
1500 struct timeval wait;
1501 int timeo; /* in units of hz */
1502 #if 0 /* notyet */
1503 int my_rsize, my_wsize;
1504 #endif
1505 bool_t tryagain;
1506 k_sigset_t smask;
1507 servinfo_t *svp;
1508 struct nfs_clnt *nfscl;
1509 zoneid_t zoneid = getzoneid();
1510 #ifdef DEBUG
1511 char *bufp;
1512 #endif
1513
1514 #if 0 /* notyet */
1515 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1516 "rfscall_start:which %d mi %p", which, mi);
1517 #endif
1518
1519 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1520 ASSERT(nfscl != NULL);
1521
1522 nfscl->nfscl_stat.calls.value.ui64++;
1523 mi->mi_aclreqs[which].value.ui64++;
1524
1525 rpcerr.re_status = RPC_SUCCESS;
1526
1527 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1528 rpcerr.re_status = RPC_FAILED;
1529 rpcerr.re_errno = EIO;
1530 return (rpcerr.re_errno);
1531 }
1532
1533 #if 0 /* notyet */
1534 /*
1535 * Remember the transfer sizes in case
1536 * nfs_feedback changes them underneath us.
1537 */
1538 my_rsize = mi->mi_curread;
1539 my_wsize = mi->mi_curwrite;
1540 #endif
1541
1542 /*
1543 * NFS client failover support
1544 *
1545 * If this rnode is not in sync with the current server (VALID_FH),
1546 * we'd like to do a remap to get in sync. We can be interrupted
1547 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1548 * use the best info we have to try the RPC. Part of that is
1549 * unconditionally updating the filehandle copy kept for V3.
1550 *
1551 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1552 * rw_enter(); we're trying to keep the current server from being
1553 * changed on us until we're done with the remapping and have a
1554 * matching client handle. We don't want to send a filehandle
1555 * to the wrong host.
1556 */
1557 failoverretry:
1558 if (FAILOVER_MOUNT(mi)) {
1559 mutex_enter(&mi->mi_lock);
1560 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1561 if (failover_wait(mi)) {
1562 mutex_exit(&mi->mi_lock);
1563 return (EINTR);
1564 }
1565 }
1566 INC_READERS(mi);
1567 mutex_exit(&mi->mi_lock);
1568 if (fi) {
1569 if (!VALID_FH(fi) &&
1570 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1571 int remaperr;
1572
1573 svp = mi->mi_curr_serv;
1574 remaperr = failover_remap(fi);
1575 if (remaperr != 0) {
1576 #ifdef DEBUG
1577 if (remaperr != EINTR)
1578 nfs_cmn_err(remaperr, CE_WARN,
1579 "aclcall couldn't failover: %m");
1580 #endif
1581 mutex_enter(&mi->mi_lock);
1582 DEC_READERS(mi);
1583 mutex_exit(&mi->mi_lock);
1584
1585 /*
1586 * If failover_remap returns ETIMEDOUT
1587 * and the filesystem is hard mounted
1588 * we have to retry the call with a new
1589 * server.
1590 */
1591 if ((mi->mi_flags & MI_HARD) &&
1592 IS_RECOVERABLE_ERROR(remaperr)) {
1593 if (svp == mi->mi_curr_serv)
1594 failover_newserver(mi);
1595 rpcerr.re_status = RPC_SUCCESS;
1596 goto failoverretry;
1597 }
1598 return (remaperr);
1599 }
1600 }
1601 if (fi->fhp && fi->copyproc)
1602 (*fi->copyproc)(fi->fhp, fi->vp);
1603 }
1604 }
1605
1606 /* For TSOL, use a new cred which has net_mac_aware flag */
1607 if (!cred_cloned && is_system_labeled()) {
1608 cred_cloned = TRUE;
1609 cr = crdup(icr);
1610 (void) setpflags(NET_MAC_AWARE, 1, cr);
1611 }
1612
1613 /*
1614 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1615 * are guaranteed to reprocess the retry as a new request.
1616 */
1617 svp = mi->mi_curr_serv;
1618 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1619 if (FAILOVER_MOUNT(mi)) {
1620 mutex_enter(&mi->mi_lock);
1621 DEC_READERS(mi);
1622 mutex_exit(&mi->mi_lock);
1623
1624 if ((rpcerr.re_errno == ETIMEDOUT ||
1625 rpcerr.re_errno == ECONNRESET) &&
1626 failover_safe(fi)) {
1627 if (svp == mi->mi_curr_serv)
1628 failover_newserver(mi);
1629 goto failoverretry;
1630 }
1631 }
1632 if (rpcerr.re_errno != 0) {
1633 if (cred_cloned)
1634 crfree(cr);
1635 return (rpcerr.re_errno);
1636 }
1637
1638 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1639 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1640 timeo = (mi->mi_timeo * hz) / 10;
1641 } else {
1642 mutex_enter(&mi->mi_lock);
1643 timeo = CLNT_SETTIMERS(client,
1644 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1645 &(mi->mi_timers[NFS_CALLTYPES]),
1646 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1647 (void (*)()) 0, (caddr_t)mi, 0);
1648 mutex_exit(&mi->mi_lock);
1649 }
1650
1651 /*
1652 * If hard mounted fs, retry call forever unless hard error occurs.
1653 */
1654 do {
1655 tryagain = FALSE;
1656
1657 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1658 status = RPC_FAILED;
1659 rpcerr.re_status = RPC_FAILED;
1660 rpcerr.re_errno = EIO;
1661 break;
1662 }
1663
1664 TICK_TO_TIMEVAL(timeo, &wait);
1665
1666 /*
1667 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1668 * and SIGTERM. (Preserving the existing masks).
1669 * Mask out SIGINT if mount option nointr is specified.
1670 */
1671 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1672 if (!(mi->mi_flags & MI_INT))
1673 client->cl_nosignal = TRUE;
1674
1675 /*
1676 * If there is a current signal, then don't bother
1677 * even trying to send out the request because we
1678 * won't be able to block waiting for the response.
1679 * Simply assume RPC_INTR and get on with it.
1680 */
1681 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1682 status = RPC_INTR;
1683 else {
1684 status = CLNT_CALL(client, which, xdrargs, argsp,
1685 xdrres, resp, wait);
1686 }
1687
1688 if (!(mi->mi_flags & MI_INT))
1689 client->cl_nosignal = FALSE;
1690 /*
1691 * restore original signal mask
1692 */
1693 sigunintr(&smask);
1694
1695 switch (status) {
1696 case RPC_SUCCESS:
1697 #if 0 /* notyet */
1698 if ((mi->mi_flags & MI_DYNAMIC) &&
1699 mi->mi_timer_type[which] != 0 &&
1700 (mi->mi_curread != my_rsize ||
1701 mi->mi_curwrite != my_wsize))
1702 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1703 #endif
1704 break;
1705
1706 /*
1707 * Unfortunately, there are servers in the world which
1708 * are not coded correctly. They are not prepared to
1709 * handle RPC requests to the NFS port which are not
1710 * NFS requests. Thus, they may try to process the
1711 * NFS_ACL request as if it were an NFS request. This
1712 * does not work. Generally, an error will be generated
1713 * on the client because it will not be able to decode
1714 * the response from the server. However, it seems
1715 * possible that the server may not be able to decode
1716 * the arguments. Thus, the criterion for deciding
1717 * whether the server supports NFS_ACL or not is whether
1718 * the following RPC errors are returned from CLNT_CALL.
1719 */
1720 case RPC_CANTDECODERES:
1721 case RPC_PROGUNAVAIL:
1722 case RPC_CANTDECODEARGS:
1723 case RPC_PROGVERSMISMATCH:
1724 mutex_enter(&mi->mi_lock);
1725 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1726 mutex_exit(&mi->mi_lock);
1727 break;
1728
1729 /*
1730 * If the server supports NFS_ACL but not the new ops
1731 * for extended attributes, make sure we don't retry.
1732 */
1733 case RPC_PROCUNAVAIL:
1734 mutex_enter(&mi->mi_lock);
1735 mi->mi_flags &= ~MI_EXTATTR;
1736 mutex_exit(&mi->mi_lock);
1737 break;
1738
1739 case RPC_INTR:
1740 /*
1741 * There is no way to recover from this error,
1742 * even if mount option nointr is specified.
1743 * SIGKILL, for example, cannot be blocked.
1744 */
1745 rpcerr.re_status = RPC_INTR;
1746 rpcerr.re_errno = EINTR;
1747 break;
1748
1749 case RPC_UDERROR:
1750 /*
1751 * If the NFS server is local (vold) and
1752 * it goes away then we get RPC_UDERROR.
1753 * This is a retryable error, so we would
1754 * normally loop; check whether the specific
1755 * error was ECONNRESET, indicating that the
1756 * target did not exist at all. If so,
1757 * return with RPC_PROGUNAVAIL and
1758 * ECONNRESET to indicate why.
1759 */
1760 CLNT_GETERR(client, &rpcerr);
1761 if (rpcerr.re_errno == ECONNRESET) {
1762 rpcerr.re_status = RPC_PROGUNAVAIL;
1763 rpcerr.re_errno = ECONNRESET;
1764 break;
1765 }
1766 /*FALLTHROUGH*/
1767
1768 default: /* probably RPC_TIMEDOUT */
1769 if (IS_UNRECOVERABLE_RPC(status))
1770 break;
1771
1772 /*
1773 * increment server not responding count
1774 */
1775 mutex_enter(&mi->mi_lock);
1776 mi->mi_noresponse++;
1777 mutex_exit(&mi->mi_lock);
1778 #ifdef DEBUG
1779 nfscl->nfscl_stat.noresponse.value.ui64++;
1780 #endif
1781
1782 if (!(mi->mi_flags & MI_HARD)) {
1783 if (!(mi->mi_flags & MI_SEMISOFT) ||
1784 (mi->mi_acl_ss_call_type[which] == 0))
1785 break;
1786 }
1787
1788 /*
1789 * The call is in progress (over COTS).
1790 * Try the CLNT_CALL again, but don't
1791 * print a noisy error message.
1792 */
1793 if (status == RPC_INPROGRESS) {
1794 tryagain = TRUE;
1795 break;
1796 }
1797
1798 if (flags & RFSCALL_SOFT)
1799 break;
1800
1801 /*
1802 * On zone shutdown, just move on.
1803 */
1804 if (zone_status_get(curproc->p_zone) >=
1805 ZONE_IS_SHUTTING_DOWN) {
1806 rpcerr.re_status = RPC_FAILED;
1807 rpcerr.re_errno = EIO;
1808 break;
1809 }
1810
1811 /*
1812 * NFS client failover support
1813 *
1814 * If the current server just failed us, we'll
1815 * start the process of finding a new server.
1816 * After that, we can just retry.
1817 */
1818 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1819 if (svp == mi->mi_curr_serv)
1820 failover_newserver(mi);
1821 clfree_impl(client, ch, nfscl);
1822 goto failoverretry;
1823 }
1824
1825 tryagain = TRUE;
1826 timeo = backoff(timeo);
1827 mutex_enter(&mi->mi_lock);
1828 if (!(mi->mi_flags & MI_PRINTED)) {
1829 mi->mi_flags |= MI_PRINTED;
1830 mutex_exit(&mi->mi_lock);
1831 #ifdef DEBUG
1832 zprintf(zoneid,
1833 "NFS_ACL%d server %s not responding still trying\n",
1834 mi->mi_vers, svp->sv_hostname);
1835 #else
1836 zprintf(zoneid,
1837 "NFS server %s not responding still trying\n",
1838 svp->sv_hostname);
1839 #endif
1840 } else
1841 mutex_exit(&mi->mi_lock);
1842 if (*douprintf && nfs_has_ctty()) {
1843 *douprintf = 0;
1844 if (!(mi->mi_flags & MI_NOPRINT))
1845 #ifdef DEBUG
1846 uprintf(
1847 "NFS_ACL%d server %s not responding still trying\n",
1848 mi->mi_vers, svp->sv_hostname);
1849 #else
1850 uprintf(
1851 "NFS server %s not responding still trying\n",
1852 svp->sv_hostname);
1853 #endif
1854 }
1855
1856 #if 0 /* notyet */
1857 /*
1858 * If doing dynamic adjustment of transfer
1859 * size and if it's a read or write call
1860 * and if the transfer size changed while
1861 * retransmitting or if the feedback routine
1862 * changed the transfer size,
1863 * then exit rfscall so that the transfer
1864 * size can be adjusted at the vnops level.
1865 */
1866 if ((mi->mi_flags & MI_DYNAMIC) &&
1867 mi->mi_acl_timer_type[which] != 0 &&
1868 (mi->mi_curread != my_rsize ||
1869 mi->mi_curwrite != my_wsize ||
1870 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1871 /*
1872 * On read or write calls, return
1873 * back to the vnode ops level if
1874 * the transfer size changed.
1875 */
1876 clfree_impl(client, ch, nfscl);
1877 if (cred_cloned)
1878 crfree(cr);
1879 return (ENFS_TRYAGAIN);
1880 }
1881 #endif
1882 }
1883 } while (tryagain);
1884
1885 if (status != RPC_SUCCESS) {
1886 /*
1887 * Let soft mounts use the timed out message.
1888 */
1889 if (status == RPC_INPROGRESS)
1890 status = RPC_TIMEDOUT;
1891 nfscl->nfscl_stat.badcalls.value.ui64++;
1892 if (status == RPC_CANTDECODERES ||
1893 status == RPC_PROGUNAVAIL ||
1894 status == RPC_PROCUNAVAIL ||
1895 status == RPC_CANTDECODEARGS ||
1896 status == RPC_PROGVERSMISMATCH)
1897 CLNT_GETERR(client, &rpcerr);
1898 else if (status != RPC_INTR) {
1899 mutex_enter(&mi->mi_lock);
1900 mi->mi_flags |= MI_DOWN;
1901 mutex_exit(&mi->mi_lock);
1902 CLNT_GETERR(client, &rpcerr);
1903 #ifdef DEBUG
1904 bufp = clnt_sperror(client, svp->sv_hostname);
1905 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1906 mi->mi_vers, mi->mi_aclnames[which], bufp);
1907 if (nfs_has_ctty()) {
1908 if (!(mi->mi_flags & MI_NOPRINT)) {
1909 uprintf("NFS_ACL%d %s failed for %s\n",
1910 mi->mi_vers, mi->mi_aclnames[which],
1911 bufp);
1912 }
1913 }
1914 kmem_free(bufp, MAXPATHLEN);
1915 #else
1916 zprintf(zoneid,
1917 "NFS %s failed for server %s: error %d (%s)\n",
1918 mi->mi_aclnames[which], svp->sv_hostname,
1919 status, clnt_sperrno(status));
1920 if (nfs_has_ctty()) {
1921 if (!(mi->mi_flags & MI_NOPRINT))
1922 uprintf(
1923 "NFS %s failed for server %s: error %d (%s)\n",
1924 mi->mi_aclnames[which],
1925 svp->sv_hostname, status,
1926 clnt_sperrno(status));
1927 }
1928 #endif
1929 /*
1930 * when CLNT_CALL() fails with RPC_AUTHERROR,
1931 * re_errno is set appropriately depending on
1932 * the authentication error
1933 */
1934 if (status == RPC_VERSMISMATCH ||
1935 status == RPC_PROGVERSMISMATCH)
1936 rpcerr.re_errno = EIO;
1937 }
1938 } else {
1939 /*
1940 * Test the value of mi_down and mi_printed without
1941 * holding the mi_lock mutex. If they are both zero,
1942 * then it is okay to skip the down and printed
1943 * processing. This saves on a mutex_enter and
1944 * mutex_exit pair for a normal, successful RPC.
1945 * This was just complete overhead.
1946 */
1947 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1948 mutex_enter(&mi->mi_lock);
1949 mi->mi_flags &= ~MI_DOWN;
1950 if (mi->mi_flags & MI_PRINTED) {
1951 mi->mi_flags &= ~MI_PRINTED;
1952 mutex_exit(&mi->mi_lock);
1953 #ifdef DEBUG
1954 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1955 mi->mi_vers, svp->sv_hostname);
1956 #else
1957 zprintf(zoneid, "NFS server %s ok\n",
1958 svp->sv_hostname);
1959 #endif
1960 } else
1961 mutex_exit(&mi->mi_lock);
1962 }
1963
1964 if (*douprintf == 0) {
1965 if (!(mi->mi_flags & MI_NOPRINT))
1966 #ifdef DEBUG
1967 uprintf("NFS_ACL%d server %s ok\n",
1968 mi->mi_vers, svp->sv_hostname);
1969 #else
1970 uprintf("NFS server %s ok\n", svp->sv_hostname);
1971 #endif
1972 *douprintf = 1;
1973 }
1974 }
1975
1976 clfree_impl(client, ch, nfscl);
1977 if (cred_cloned)
1978 crfree(cr);
1979
1980 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1981
1982 #if 0 /* notyet */
1983 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1984 rpcerr.re_errno);
1985 #endif
1986
1987 return (rpcerr.re_errno);
1988 }
1989
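/*
 * Convert a vattr into an NFS Version 2 sattr.  Attributes which are
 * not set in va_mask are encoded as -1 so that the server will leave
 * them unchanged.  Returns EOVERFLOW if a requested time value cannot
 * be represented in the over-the-wire format.
 */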
1990 int
1991 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1992 {
1993 uint_t mask = vap->va_mask;
1994
1995 if (!(mask & AT_MODE))
1996 sa->sa_mode = (uint32_t)-1;
1997 else
1998 sa->sa_mode = vap->va_mode;
1999 if (!(mask & AT_UID))
2000 sa->sa_uid = (uint32_t)-1;
2001 else
2002 sa->sa_uid = (uint32_t)vap->va_uid;
2003 if (!(mask & AT_GID))
2004 sa->sa_gid = (uint32_t)-1;
2005 else
2006 sa->sa_gid = (uint32_t)vap->va_gid;
2007 if (!(mask & AT_SIZE))
2008 sa->sa_size = (uint32_t)-1;
2009 else
2010 sa->sa_size = (uint32_t)vap->va_size;
2011 if (!(mask & AT_ATIME))
2012 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2013 else {
2014 /* check time validity */
2015 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2016 return (EOVERFLOW);
2017 }
2018 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2019 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2020 }
2021 if (!(mask & AT_MTIME))
2022 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2023 else {
2024 /* check time validity */
2025 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2026 return (EOVERFLOW);
2027 }
2028 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2029 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2030 }
2031 return (0);
2032 }
2033
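/*
 * Convert a vattr into an NFS Version 3 sattr3.  Attributes which are
 * not set in va_mask have their set_it fields cleared (or set to
 * DONT_CHANGE for the times).  Returns EOVERFLOW if a requested time
 * value cannot be represented in the over-the-wire format.
 */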
2034 int
2035 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2036 {
2037 uint_t mask = vap->va_mask;
2038
2039 if (!(mask & AT_MODE))
2040 sa->mode.set_it = FALSE;
2041 else {
2042 sa->mode.set_it = TRUE;
2043 sa->mode.mode = (mode3)vap->va_mode;
2044 }
2045 if (!(mask & AT_UID))
2046 sa->uid.set_it = FALSE;
2047 else {
2048 sa->uid.set_it = TRUE;
2049 sa->uid.uid = (uid3)vap->va_uid;
2050 }
2051 if (!(mask & AT_GID))
2052 sa->gid.set_it = FALSE;
2053 else {
2054 sa->gid.set_it = TRUE;
2055 sa->gid.gid = (gid3)vap->va_gid;
2056 }
2057 if (!(mask & AT_SIZE))
2058 sa->size.set_it = FALSE;
2059 else {
2060 sa->size.set_it = TRUE;
2061 sa->size.size = (size3)vap->va_size;
2062 }
2063 if (!(mask & AT_ATIME))
2064 sa->atime.set_it = DONT_CHANGE;
2065 else {
2066 /* check time validity */
2067 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2068 return (EOVERFLOW);
2069 }
2070 sa->atime.set_it = SET_TO_CLIENT_TIME;
2071 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2072 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2073 }
2074 if (!(mask & AT_MTIME))
2075 sa->mtime.set_it = DONT_CHANGE;
2076 else {
2077 /* check time validity */
2078 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2079 return (EOVERFLOW);
2080 }
2081 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2082 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2083 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2084 }
2085 return (0);
2086 }
2087
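/*
 * setdiropargs() and setdiropargs3() fill in the directory operation
 * arguments (parent directory filehandle and component name) used by
 * NFS Version 2 and Version 3 requests, respectively.
 */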
2088 void
2089 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2090 {
2091
2092 da->da_fhandle = VTOFH(dvp);
2093 da->da_name = nm;
2094 da->da_flags = 0;
2095 }
2096
2097 void
2098 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2099 {
2100
2101 da->dirp = VTOFH3(dvp);
2102 da->name = nm;
2103 }
2104
2105 int
2106 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2107 {
2108 int error;
2109 rnode_t *rp;
2110 struct vattr va;
2111
2112 va.va_mask = AT_MODE | AT_GID;
2113 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2114 if (error)
2115 return (error);
2116
2117 /*
2118 * To determine the expected group-id of the created file:
2119 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2120 * GRPID option, and the directory's set-gid bit is clear,
2121 * then use the process's gid.
2122 * 2) Otherwise, set the group-id to the gid of the parent directory.
2123 */
2124 rp = VTOR(dvp);
2125 mutex_enter(&rp->r_statelock);
2126 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2127 *gidp = crgetgid(cr);
2128 else
2129 *gidp = va.va_gid;
2130 mutex_exit(&rp->r_statelock);
2131 return (0);
2132 }
2133
2134 int
2135 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2136 {
2137 int error;
2138 struct vattr va;
2139
2140 va.va_mask = AT_MODE;
2141 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2142 if (error)
2143 return (error);
2144
2145 /*
 * Modify the expected mode (*omp) so that the set-gid bit matches
2147 * that of the parent directory (dvp).
2148 */
2149 if (va.va_mode & VSGID)
2150 *omp |= VSGID;
2151 else
2152 *omp &= ~VSGID;
2153 return (0);
2154 }
2155
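/*
 * Set or clear the VSWAPLIKE flag on the vnode.  A regular file whose
 * mode has the VSVTX bit set and the VEXEC bit clear is marked
 * VSWAPLIKE; otherwise the flag is cleared.
 */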
2156 void
2157 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2158 {
2159
2160 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2161 if (!(vp->v_flag & VSWAPLIKE)) {
2162 mutex_enter(&vp->v_lock);
2163 vp->v_flag |= VSWAPLIKE;
2164 mutex_exit(&vp->v_lock);
2165 }
2166 } else {
2167 if (vp->v_flag & VSWAPLIKE) {
2168 mutex_enter(&vp->v_lock);
2169 vp->v_flag &= ~VSWAPLIKE;
2170 mutex_exit(&vp->v_lock);
2171 }
2172 }
2173 }
2174
2175 /*
2176 * Free the resources associated with an rnode.
2177 */
2178 static void
2179 rinactive(rnode_t *rp, cred_t *cr)
2180 {
2181 vnode_t *vp;
2182 cred_t *cred;
2183 char *contents;
2184 int size;
2185 vsecattr_t *vsp;
2186 int error;
2187 nfs3_pathconf_info *info;
2188
2189 /*
2190 * Before freeing anything, wait until all asynchronous
2191 * activity is done on this rnode. This will allow all
2192 * asynchronous read ahead and write behind i/o's to
2193 * finish.
2194 */
2195 mutex_enter(&rp->r_statelock);
2196 while (rp->r_count > 0)
2197 cv_wait(&rp->r_cv, &rp->r_statelock);
2198 mutex_exit(&rp->r_statelock);
2199
2200 /*
2201 * Flush and invalidate all pages associated with the vnode.
2202 */
2203 vp = RTOV(rp);
2204 if (vn_has_cached_data(vp)) {
2205 ASSERT(vp->v_type != VCHR);
2206 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2207 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2208 if (error && (error == ENOSPC || error == EDQUOT)) {
2209 mutex_enter(&rp->r_statelock);
2210 if (!rp->r_error)
2211 rp->r_error = error;
2212 mutex_exit(&rp->r_statelock);
2213 }
2214 }
2215 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2216 }
2217
2218 /*
2219 * Free any held credentials and caches which may be associated
2220 * with this rnode.
2221 */
2222 mutex_enter(&rp->r_statelock);
2223 cred = rp->r_cred;
2224 rp->r_cred = NULL;
2225 contents = rp->r_symlink.contents;
2226 size = rp->r_symlink.size;
2227 rp->r_symlink.contents = NULL;
2228 vsp = rp->r_secattr;
2229 rp->r_secattr = NULL;
2230 info = rp->r_pathconf;
2231 rp->r_pathconf = NULL;
2232 mutex_exit(&rp->r_statelock);
2233
2234 /*
2235 * Free the held credential.
2236 */
2237 if (cred != NULL)
2238 crfree(cred);
2239
2240 /*
2241 * Free the access cache entries.
2242 */
2243 (void) nfs_access_purge_rp(rp);
2244
2245 /*
2246 * Free the readdir cache entries.
2247 */
2248 if (HAVE_RDDIR_CACHE(rp))
2249 nfs_purge_rddir_cache(vp);
2250
2251 /*
2252 * Free the symbolic link cache.
2253 */
if (contents != NULL)
	kmem_free((void *)contents, size);
2258
2259 /*
2260 * Free any cached ACL.
2261 */
2262 if (vsp != NULL)
2263 nfs_acl_free(vsp);
2264
2265 /*
2266 * Free any cached pathconf information.
2267 */
2268 if (info != NULL)
2269 kmem_free(info, sizeof (*info));
2270 }
2271
2272 /*
2273 * Return a vnode for the given NFS Version 2 file handle.
2274 * If no rnode exists for this fhandle, create one and put it
2275 * into the hash queues. If the rnode for this fhandle
2276 * already exists, return it.
2277 *
2278 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2279 */
2280 vnode_t *
2281 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2282 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2283 {
2284 int newnode;
2285 int index;
2286 vnode_t *vp;
2287 nfs_fhandle nfh;
2288 vattr_t va;
2289
2290 nfh.fh_len = NFS_FHSIZE;
2291 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2292
2293 index = rtablehash(&nfh);
2294 rw_enter(&rtable[index].r_lock, RW_READER);
2295
2296 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2297 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2298
2299 if (attr != NULL) {
2300 if (!newnode) {
2301 rw_exit(&rtable[index].r_lock);
2302 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2303 } else {
2304 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2305 vp->v_type = VBAD;
2306 else
2307 vp->v_type = n2v_type(attr);
2308 /*
2309 * A translation here seems to be necessary
2310 * because this function can be called
2311 * with `attr' that has come from the wire,
2312 * and been operated on by vattr_to_nattr().
 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2314 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2315 * ->makenfsnode().
2316 */
2317 if ((attr->na_rdev & 0xffff0000) == 0)
2318 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2319 else
2320 vp->v_rdev = expldev(n2v_rdev(attr));
2321 nfs_attrcache(vp, attr, t);
2322 rw_exit(&rtable[index].r_lock);
2323 }
2324 } else {
2325 if (newnode) {
2326 PURGE_ATTRCACHE(vp);
2327 }
2328 rw_exit(&rtable[index].r_lock);
2329 }
2330
2331 return (vp);
2332 }
2333
2334 /*
2335 * Return a vnode for the given NFS Version 3 file handle.
2336 * If no rnode exists for this fhandle, create one and put it
2337 * into the hash queues. If the rnode for this fhandle
2338 * already exists, return it.
2339 *
2340 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2341 */
2342 vnode_t *
2343 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2344 cred_t *cr, char *dnm, char *nm)
2345 {
2346 int newnode;
2347 int index;
2348 vnode_t *vp;
2349
2350 index = rtablehash((nfs_fhandle *)fh);
2351 rw_enter(&rtable[index].r_lock, RW_READER);
2352
2353 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2354 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2355 dnm, nm);
2356
2357 if (vap == NULL) {
2358 if (newnode) {
2359 PURGE_ATTRCACHE(vp);
2360 }
2361 rw_exit(&rtable[index].r_lock);
2362 return (vp);
2363 }
2364
2365 if (!newnode) {
2366 rw_exit(&rtable[index].r_lock);
2367 nfs_attr_cache(vp, vap, t, cr);
2368 } else {
2369 rnode_t *rp = VTOR(vp);
2370
2371 vp->v_type = vap->va_type;
2372 vp->v_rdev = vap->va_rdev;
2373
2374 mutex_enter(&rp->r_statelock);
2375 if (rp->r_mtime <= t)
2376 nfs_attrcache_va(vp, vap);
2377 mutex_exit(&rp->r_statelock);
2378 rw_exit(&rtable[index].r_lock);
2379 }
2380
2381 return (vp);
2382 }
2383
2384 vnode_t *
2385 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2386 cred_t *cr, char *dnm, char *nm)
2387 {
2388 int newnode;
2389 int index;
2390 vnode_t *vp;
2391 vattr_t va;
2392
2393 index = rtablehash((nfs_fhandle *)fh);
2394 rw_enter(&rtable[index].r_lock, RW_READER);
2395
2396 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2397 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2398 dnm, nm);
2399
2400 if (attr == NULL) {
2401 if (newnode) {
2402 PURGE_ATTRCACHE(vp);
2403 }
2404 rw_exit(&rtable[index].r_lock);
2405 return (vp);
2406 }
2407
2408 if (!newnode) {
2409 rw_exit(&rtable[index].r_lock);
2410 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2411 } else {
2412 if (attr->type < NF3REG || attr->type > NF3FIFO)
2413 vp->v_type = VBAD;
2414 else
2415 vp->v_type = nf3_to_vt[attr->type];
2416 vp->v_rdev = makedevice(attr->rdev.specdata1,
2417 attr->rdev.specdata2);
2418 nfs3_attrcache(vp, attr, t);
2419 rw_exit(&rtable[index].r_lock);
2420 }
2421
2422 return (vp);
2423 }
2424
2425 /*
2426 * Read this comment before making changes to rtablehash()!
2427 * This is a hash function in which seemingly obvious and harmless
 * changes can cause escalations costing millions of dollars!
2429 * Know what you are doing.
2430 *
2431 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2432 * algorithm is currently detailed here:
2433 *
2434 * http://burtleburtle.net/bob/hash/doobs.html
2435 *
2436 * Of course, the above link may not be valid by the time you are reading
2437 * this, but suffice it to say that the one-at-a-time algorithm works well in
2438 * almost all cases. If you are changing the algorithm be sure to verify that
2439 * the hash algorithm still provides even distribution in all cases and with
2440 * any server returning filehandles in whatever order (sequential or random).
2441 */
2442 static int
2443 rtablehash(nfs_fhandle *fh)
2444 {
2445 ulong_t hash, len, i;
2446 char *key;
2447
2448 key = fh->fh_buf;
2449 len = (ulong_t)fh->fh_len;
2450 for (hash = 0, i = 0; i < len; i++) {
2451 hash += key[i];
2452 hash += (hash << 10);
2453 hash ^= (hash >> 6);
2454 }
2455 hash += (hash << 3);
2456 hash ^= (hash >> 11);
2457 hash += (hash << 15);
2458 return (hash & rtablemask);
2459 }
2460
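/*
 * Find or create an rnode for the given filehandle.  If a matching
 * rnode is already hashed, it is returned with *newnode set to 0.
 * Otherwise an rnode is either recycled from the freelist (when the
 * number of rnodes has reached its limit) or freshly allocated, then
 * initialized, hashed in, and returned with *newnode set to 1.  The
 * caller must hold the hash bucket lock as a reader; the lock may be
 * dropped, reacquired, and upgraded to a writer lock here.
 */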
2461 static vnode_t *
2462 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2463 struct vnodeops *vops,
2464 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2465 int (*compar)(const void *, const void *),
2466 int *newnode, cred_t *cr, char *dnm, char *nm)
2467 {
2468 rnode_t *rp;
2469 rnode_t *trp;
2470 vnode_t *vp;
2471 mntinfo_t *mi;
2472
2473 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2474
2475 mi = VFTOMI(vfsp);
2476 start:
2477 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2478 vp = RTOV(rp);
2479 nfs_set_vroot(vp);
2480 *newnode = 0;
2481 return (vp);
2482 }
2483 rw_exit(&rhtp->r_lock);
2484
2485 mutex_enter(&rpfreelist_lock);
2486 if (rpfreelist != NULL && rnew >= nrnode) {
2487 rp = rpfreelist;
2488 rp_rmfree(rp);
2489 mutex_exit(&rpfreelist_lock);
2490
2491 vp = RTOV(rp);
2492
2493 if (rp->r_flags & RHASHED) {
2494 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2495 mutex_enter(&vp->v_lock);
2496 if (vp->v_count > 1) {
2497 VN_RELE_LOCKED(vp);
2498 mutex_exit(&vp->v_lock);
2499 rw_exit(&rp->r_hashq->r_lock);
2500 rw_enter(&rhtp->r_lock, RW_READER);
2501 goto start;
2502 }
2503 mutex_exit(&vp->v_lock);
2504 rp_rmhash_locked(rp);
2505 rw_exit(&rp->r_hashq->r_lock);
2506 }
2507
2508 rinactive(rp, cr);
2509
2510 mutex_enter(&vp->v_lock);
2511 if (vp->v_count > 1) {
2512 VN_RELE_LOCKED(vp);
2513 mutex_exit(&vp->v_lock);
2514 rw_enter(&rhtp->r_lock, RW_READER);
2515 goto start;
2516 }
2517 mutex_exit(&vp->v_lock);
2518 vn_invalid(vp);
2519 /*
 * Destroy the old locks before bzero'ing and
 * recreating the locks below.
2522 */
2523 nfs_rw_destroy(&rp->r_rwlock);
2524 nfs_rw_destroy(&rp->r_lkserlock);
2525 mutex_destroy(&rp->r_statelock);
2526 cv_destroy(&rp->r_cv);
2527 cv_destroy(&rp->r_commit.c_cv);
2528 nfs_free_r_path(rp);
2529 avl_destroy(&rp->r_dir);
2530 /*
 * Make sure that if the rnode is recycled, the
 * VFS reference count is decremented properly
 * before reuse.
2534 */
2535 VFS_RELE(vp->v_vfsp);
2536 vn_reinit(vp);
2537 } else {
2538 vnode_t *new_vp;
2539
2540 mutex_exit(&rpfreelist_lock);
2541
2542 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2543 new_vp = vn_alloc(KM_SLEEP);
2544
2545 atomic_inc_ulong((ulong_t *)&rnew);
2546 #ifdef DEBUG
2547 clstat_debug.nrnode.value.ui64++;
2548 #endif
2549 vp = new_vp;
2550 }
2551
2552 bzero(rp, sizeof (*rp));
2553 rp->r_vnode = vp;
2554 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2555 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2556 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2557 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2558 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2559 rp->r_fh.fh_len = fh->fh_len;
2560 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2561 rp->r_server = mi->mi_curr_serv;
2562 if (FAILOVER_MOUNT(mi)) {
2563 /*
2564 * If replicated servers, stash pathnames
2565 */
2566 if (dnm != NULL && nm != NULL) {
2567 char *s, *p;
2568 uint_t len;
2569
2570 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2571 rp->r_path = kmem_alloc(len, KM_SLEEP);
2572 #ifdef DEBUG
2573 clstat_debug.rpath.value.ui64 += len;
2574 #endif
2575 s = rp->r_path;
2576 for (p = dnm; *p; p++)
2577 *s++ = *p;
2578 *s++ = '/';
2579 for (p = nm; *p; p++)
2580 *s++ = *p;
2581 *s = '\0';
2582 } else {
2583 /* special case for root */
2584 rp->r_path = kmem_alloc(2, KM_SLEEP);
2585 #ifdef DEBUG
2586 clstat_debug.rpath.value.ui64 += 2;
2587 #endif
2588 *rp->r_path = '.';
2589 *(rp->r_path + 1) = '\0';
2590 }
2591 }
2592 VFS_HOLD(vfsp);
2593 rp->r_putapage = putapage;
2594 rp->r_hashq = rhtp;
2595 rp->r_flags = RREADDIRPLUS;
2596 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2597 offsetof(rddir_cache, tree));
2598 vn_setops(vp, vops);
2599 vp->v_data = (caddr_t)rp;
2600 vp->v_vfsp = vfsp;
2601 vp->v_type = VNON;
2602 vp->v_flag |= VMODSORT;
2603 nfs_set_vroot(vp);
2604
2605 /*
2606 * There is a race condition if someone else
 * allocates the rnode while no locks are held, so we
2608 * check again and recover if found.
2609 */
2610 rw_enter(&rhtp->r_lock, RW_WRITER);
2611 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2612 vp = RTOV(trp);
2613 nfs_set_vroot(vp);
2614 *newnode = 0;
2615 rw_exit(&rhtp->r_lock);
2616 rp_addfree(rp, cr);
2617 rw_enter(&rhtp->r_lock, RW_READER);
2618 return (vp);
2619 }
2620 rp_addhash(rp);
2621 *newnode = 1;
2622 return (vp);
2623 }
2624
2625 /*
2626 * Callback function to check if the page should be marked as
2627 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2628 */
2629 int
2630 nfs_setmod_check(page_t *pp)
2631 {
2632 if (pp->p_fsdata != C_NOCOMMIT) {
2633 pp->p_fsdata = C_NOCOMMIT;
2634 return (1);
2635 }
2636 return (0);
2637 }
2638
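/*
 * Mark the vnode with VROOT if its filehandle matches the filehandle
 * of the server's root, i.e. if it represents the root of the mount.
 */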
2639 static void
2640 nfs_set_vroot(vnode_t *vp)
2641 {
2642 rnode_t *rp;
2643 nfs_fhandle *rootfh;
2644
2645 rp = VTOR(vp);
2646 rootfh = &rp->r_server->sv_fhandle;
2647 if (rootfh->fh_len == rp->r_fh.fh_len &&
2648 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2649 if (!(vp->v_flag & VROOT)) {
2650 mutex_enter(&vp->v_lock);
2651 vp->v_flag |= VROOT;
2652 mutex_exit(&vp->v_lock);
2653 }
2654 }
2655 }
2656
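/*
 * Free the pathname stashed in the rnode for failover, if any.
 */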
2657 static void
2658 nfs_free_r_path(rnode_t *rp)
2659 {
2660 char *path;
2661 size_t len;
2662
2663 path = rp->r_path;
2664 if (path) {
2665 rp->r_path = NULL;
2666 len = strlen(path) + 1;
2667 kmem_free(path, len);
2668 #ifdef DEBUG
2669 clstat_debug.rpath.value.ui64 -= len;
2670 #endif
2671 }
2672 }
2673
2674 /*
2675 * Put an rnode on the free list.
2676 *
2677 * Rnodes which were allocated above and beyond the normal limit
2678 * are immediately freed.
2679 */
2680 void
2681 rp_addfree(rnode_t *rp, cred_t *cr)
2682 {
2683 vnode_t *vp;
2684 struct vfs *vfsp;
2685
2686 vp = RTOV(rp);
2687 ASSERT(vp->v_count >= 1);
2688 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2689
2690 /*
2691 * If we have too many rnodes allocated and there are no
2692 * references to this rnode, or if the rnode is no longer
 * accessible because it does not reside in the hash queues,
2694 * or if an i/o error occurred while writing to the file,
2695 * then just free it instead of putting it on the rnode
2696 * freelist.
2697 */
2698 vfsp = vp->v_vfsp;
2699 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2700 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2701 if (rp->r_flags & RHASHED) {
2702 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2703 mutex_enter(&vp->v_lock);
2704 if (vp->v_count > 1) {
2705 VN_RELE_LOCKED(vp);
2706 mutex_exit(&vp->v_lock);
2707 rw_exit(&rp->r_hashq->r_lock);
2708 return;
2709 }
2710 mutex_exit(&vp->v_lock);
2711 rp_rmhash_locked(rp);
2712 rw_exit(&rp->r_hashq->r_lock);
2713 }
2714
2715 rinactive(rp, cr);
2716
2717 /*
2718 * Recheck the vnode reference count. We need to
2719 * make sure that another reference has not been
2720 * acquired while we were not holding v_lock. The
2721 * rnode is not in the rnode hash queues, so the
2722 * only way for a reference to have been acquired
2723 * is for a VOP_PUTPAGE because the rnode was marked
2724 * with RDIRTY or for a modified page. This
2725 * reference may have been acquired before our call
2726 * to rinactive. The i/o may have been completed,
2727 * thus allowing rinactive to complete, but the
2728 * reference to the vnode may not have been released
 * yet. In any case, the rnode cannot be destroyed
2730 * until the other references to this vnode have been
2731 * released. The other references will take care of
2732 * either destroying the rnode or placing it on the
2733 * rnode freelist. If there are no other references,
2734 * then the rnode may be safely destroyed.
2735 */
2736 mutex_enter(&vp->v_lock);
2737 if (vp->v_count > 1) {
2738 VN_RELE_LOCKED(vp);
2739 mutex_exit(&vp->v_lock);
2740 return;
2741 }
2742 mutex_exit(&vp->v_lock);
2743
2744 destroy_rnode(rp);
2745 return;
2746 }
2747
2748 /*
2749 * Lock the hash queue and then recheck the reference count
 * to ensure that no other thread has acquired a reference
 * which would indicate that the rnode should not be placed
 * on the freelist.  If another reference has been acquired, then
2753 * just release this one and let the other thread complete
2754 * the processing of adding this rnode to the freelist.
2755 */
2756 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2757
2758 mutex_enter(&vp->v_lock);
2759 if (vp->v_count > 1) {
2760 VN_RELE_LOCKED(vp);
2761 mutex_exit(&vp->v_lock);
2762 rw_exit(&rp->r_hashq->r_lock);
2763 return;
2764 }
2765 mutex_exit(&vp->v_lock);
2766
2767 /*
2768 * If there is no cached data or metadata for this file, then
2769 * put the rnode on the front of the freelist so that it will
2770 * be reused before other rnodes which may have cached data or
2771 * metadata associated with them.
2772 */
2773 mutex_enter(&rpfreelist_lock);
2774 if (rpfreelist == NULL) {
2775 rp->r_freef = rp;
2776 rp->r_freeb = rp;
2777 rpfreelist = rp;
2778 } else {
2779 rp->r_freef = rpfreelist;
2780 rp->r_freeb = rpfreelist->r_freeb;
2781 rpfreelist->r_freeb->r_freef = rp;
2782 rpfreelist->r_freeb = rp;
2783 if (!vn_has_cached_data(vp) &&
2784 !HAVE_RDDIR_CACHE(rp) &&
2785 rp->r_symlink.contents == NULL &&
2786 rp->r_secattr == NULL &&
2787 rp->r_pathconf == NULL)
2788 rpfreelist = rp;
2789 }
2790 mutex_exit(&rpfreelist_lock);
2791
2792 rw_exit(&rp->r_hashq->r_lock);
2793 }
2794
2795 /*
2796 * Remove an rnode from the free list.
2797 *
2798 * The caller must be holding rpfreelist_lock and the rnode
2799 * must be on the freelist.
2800 */
2801 static void
2802 rp_rmfree(rnode_t *rp)
2803 {
2804
2805 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2806 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2807
2808 if (rp == rpfreelist) {
2809 rpfreelist = rp->r_freef;
2810 if (rp == rpfreelist)
2811 rpfreelist = NULL;
2812 }
2813
2814 rp->r_freeb->r_freef = rp->r_freef;
2815 rp->r_freef->r_freeb = rp->r_freeb;
2816
2817 rp->r_freef = rp->r_freeb = NULL;
2818 }
2819
2820 /*
 * Put an rnode in the hash table.
2822 *
2823 * The caller must be holding the exclusive hash queue lock.
2824 */
2825 static void
2826 rp_addhash(rnode_t *rp)
2827 {
2828
2829 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2830 ASSERT(!(rp->r_flags & RHASHED));
2831
2832 rp->r_hashf = rp->r_hashq->r_hashf;
2833 rp->r_hashq->r_hashf = rp;
2834 rp->r_hashb = (rnode_t *)rp->r_hashq;
2835 rp->r_hashf->r_hashb = rp;
2836
2837 mutex_enter(&rp->r_statelock);
2838 rp->r_flags |= RHASHED;
2839 mutex_exit(&rp->r_statelock);
2840 }
2841
2842 /*
 * Remove an rnode from the hash table.
2844 *
2845 * The caller must be holding the hash queue lock.
2846 */
2847 static void
2848 rp_rmhash_locked(rnode_t *rp)
2849 {
2850
2851 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2852 ASSERT(rp->r_flags & RHASHED);
2853
2854 rp->r_hashb->r_hashf = rp->r_hashf;
2855 rp->r_hashf->r_hashb = rp->r_hashb;
2856
2857 mutex_enter(&rp->r_statelock);
2858 rp->r_flags &= ~RHASHED;
2859 mutex_exit(&rp->r_statelock);
2860 }
2861
2862 /*
 * Remove an rnode from the hash table.
2864 *
2865 * The caller must not be holding the hash queue lock.
2866 */
2867 void
2868 rp_rmhash(rnode_t *rp)
2869 {
2870
2871 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2872 rp_rmhash_locked(rp);
2873 rw_exit(&rp->r_hashq->r_lock);
2874 }
2875
2876 /*
 * Look up an rnode by fhandle.
2878 *
2879 * The caller must be holding the hash queue lock, either shared or exclusive.
2880 */
2881 static rnode_t *
2882 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2883 {
2884 rnode_t *rp;
2885 vnode_t *vp;
2886
2887 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2888
2889 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2890 vp = RTOV(rp);
2891 if (vp->v_vfsp == vfsp &&
2892 rp->r_fh.fh_len == fh->fh_len &&
2893 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2894 /*
2895 * remove rnode from free list, if necessary.
2896 */
2897 if (rp->r_freef != NULL) {
2898 mutex_enter(&rpfreelist_lock);
2899 /*
2900 * If the rnode is on the freelist,
2901 * then remove it and use that reference
 * as the new reference.  Otherwise, we
 * need to increment the reference count.
2904 */
2905 if (rp->r_freef != NULL) {
2906 rp_rmfree(rp);
2907 mutex_exit(&rpfreelist_lock);
2908 } else {
2909 mutex_exit(&rpfreelist_lock);
2910 VN_HOLD(vp);
2911 }
2912 } else
2913 VN_HOLD(vp);
2914 return (rp);
2915 }
2916 }
2917 return (NULL);
2918 }
2919
2920 /*
 * Return 1 if there is an active vnode belonging to this vfs in the
2922 * rtable cache.
2923 *
2924 * Several of these checks are done without holding the usual
2925 * locks. This is safe because destroy_rtable(), rp_addfree(),
2926 * etc. will redo the necessary checks before actually destroying
2927 * any rnodes.
2928 */
2929 int
2930 check_rtable(struct vfs *vfsp)
2931 {
2932 int index;
2933 rnode_t *rp;
2934 vnode_t *vp;
2935
2936 for (index = 0; index < rtablesize; index++) {
2937 rw_enter(&rtable[index].r_lock, RW_READER);
2938 for (rp = rtable[index].r_hashf;
2939 rp != (rnode_t *)(&rtable[index]);
2940 rp = rp->r_hashf) {
2941 vp = RTOV(rp);
2942 if (vp->v_vfsp == vfsp) {
2943 if (rp->r_freef == NULL ||
2944 (vn_has_cached_data(vp) &&
2945 (rp->r_flags & RDIRTY)) ||
2946 rp->r_count > 0) {
2947 rw_exit(&rtable[index].r_lock);
2948 return (1);
2949 }
2950 }
2951 }
2952 rw_exit(&rtable[index].r_lock);
2953 }
2954 return (0);
2955 }
2956
2957 /*
2958 * Destroy inactive vnodes from the hash queues which belong to this
2959 * vfs. It is essential that we destroy all inactive vnodes during a
2960 * forced unmount as well as during a normal unmount.
2961 */
2962 void
2963 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2964 {
2965 int index;
2966 rnode_t *rp;
2967 rnode_t *rlist;
2968 rnode_t *r_hashf;
2969 vnode_t *vp;
2970
2971 rlist = NULL;
2972
2973 for (index = 0; index < rtablesize; index++) {
2974 rw_enter(&rtable[index].r_lock, RW_WRITER);
2975 for (rp = rtable[index].r_hashf;
2976 rp != (rnode_t *)(&rtable[index]);
2977 rp = r_hashf) {
2978 /* save the hash pointer before destroying */
2979 r_hashf = rp->r_hashf;
2980 vp = RTOV(rp);
2981 if (vp->v_vfsp == vfsp) {
2982 mutex_enter(&rpfreelist_lock);
2983 if (rp->r_freef != NULL) {
2984 rp_rmfree(rp);
2985 mutex_exit(&rpfreelist_lock);
2986 rp_rmhash_locked(rp);
2987 rp->r_hashf = rlist;
2988 rlist = rp;
2989 } else
2990 mutex_exit(&rpfreelist_lock);
2991 }
2992 }
2993 rw_exit(&rtable[index].r_lock);
2994 }
2995
2996 for (rp = rlist; rp != NULL; rp = rlist) {
2997 rlist = rp->r_hashf;
2998 /*
2999 * This call to rp_addfree will end up destroying the
3000 * rnode, but in a safe way with the appropriate set
3001 * of checks done.
3002 */
3003 rp_addfree(rp, cr);
3004 }
}
3007
3008 /*
3009 * This routine destroys all the resources associated with the rnode
3010 * and then the rnode itself.
3011 */
3012 static void
3013 destroy_rnode(rnode_t *rp)
3014 {
3015 vnode_t *vp;
3016 vfs_t *vfsp;
3017
3018 vp = RTOV(rp);
3019 vfsp = vp->v_vfsp;
3020
3021 ASSERT(vp->v_count == 1);
3022 ASSERT(rp->r_count == 0);
3023 ASSERT(rp->r_lmpl == NULL);
3024 ASSERT(rp->r_mapcnt == 0);
3025 ASSERT(!(rp->r_flags & RHASHED));
3026 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3027 atomic_dec_ulong((ulong_t *)&rnew);
3028 #ifdef DEBUG
3029 clstat_debug.nrnode.value.ui64--;
3030 #endif
3031 nfs_rw_destroy(&rp->r_rwlock);
3032 nfs_rw_destroy(&rp->r_lkserlock);
3033 mutex_destroy(&rp->r_statelock);
3034 cv_destroy(&rp->r_cv);
3035 cv_destroy(&rp->r_commit.c_cv);
3036 if (rp->r_flags & RDELMAPLIST)
3037 list_destroy(&rp->r_indelmap);
3038 nfs_free_r_path(rp);
3039 avl_destroy(&rp->r_dir);
3040 vn_invalid(vp);
3041 vn_free(vp);
3042 kmem_cache_free(rnode_cache, rp);
3043 VFS_RELE(vfsp);
3044 }
3045
3046 /*
3047 * Flush all vnodes in this (or every) vfs.
3048 * Used by nfs_sync and by nfs_unmount.
3049 */
3050 void
3051 rflush(struct vfs *vfsp, cred_t *cr)
3052 {
3053 int index;
3054 rnode_t *rp;
3055 vnode_t *vp, **vplist;
3056 long num, cnt;
3057
3058 /*
3059 * Check to see whether there is anything to do.
3060 */
3061 num = rnew;
3062 if (num == 0)
3063 return;
3064
3065 /*
3066 * Allocate a slot for all currently active rnodes on the
3067 * supposition that they all may need flushing.
3068 */
3069 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3070 cnt = 0;
3071
3072 /*
3073 * Walk the hash queues looking for rnodes with page
3074 * lists associated with them. Make a list of these
3075 * files.
3076 */
3077 for (index = 0; index < rtablesize; index++) {
3078 rw_enter(&rtable[index].r_lock, RW_READER);
3079 for (rp = rtable[index].r_hashf;
3080 rp != (rnode_t *)(&rtable[index]);
3081 rp = rp->r_hashf) {
3082 vp = RTOV(rp);
3083 /*
 * Don't bother sync'ing a vp if it
 * is part of a virtual swap device or
 * if the VFS is read-only
3087 */
3088 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3089 continue;
3090 /*
3091 * If flushing all mounted file systems or
3092 * the vnode belongs to this vfs, has pages
3093 * and is marked as either dirty or mmap'd,
3094 * hold and add this vnode to the list of
3095 * vnodes to flush.
3096 */
3097 if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3098 vn_has_cached_data(vp) &&
3099 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3100 VN_HOLD(vp);
3101 vplist[cnt++] = vp;
3102 if (cnt == num) {
3103 rw_exit(&rtable[index].r_lock);
3104 goto toomany;
3105 }
3106 }
3107 }
3108 rw_exit(&rtable[index].r_lock);
3109 }
3110 toomany:
3111
3112 /*
3113 * Flush and release all of the files on the list.
3114 */
3115 while (cnt-- > 0) {
3116 vp = vplist[cnt];
3117 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3118 VN_RELE(vp);
3119 }
3120
3121 /*
3122 * Free the space allocated to hold the list.
3123 */
3124 kmem_free(vplist, num * sizeof (*vplist));
3125 }
3126
3127 /*
3128 * This probably needs to be larger than or equal to
3129 * log2(sizeof (struct rnode)) due to the way that rnodes are
3130 * allocated.
3131 */
3132 #define ACACHE_SHIFT_BITS 9
3133
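/*
 * Hash an (rnode, cred) pair into the access cache.  The rnode address
 * is shifted to discard the low-order bits, which are common to all
 * rnodes due to allocation alignment, and the uid from the credential
 * is then folded in.
 */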
3134 static int
3135 acachehash(rnode_t *rp, cred_t *cr)
3136 {
3137
3138 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3139 acachemask);
3140 }
3141
3142 #ifdef DEBUG
3143 static long nfs_access_cache_hits = 0;
3144 static long nfs_access_cache_misses = 0;
3145 #endif
3146
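/*
 * Look up the requested access bits for this rnode and credential in
 * the access cache.  Returns NFS_ACCESS_ALLOWED or NFS_ACCESS_DENIED
 * if a cached entry covers all of the requested bits, and
 * NFS_ACCESS_UNKNOWN if the attribute cache is not valid or no cached
 * entry covers the request.
 */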
3147 nfs_access_type_t
3148 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3149 {
3150 vnode_t *vp;
3151 acache_t *ap;
3152 acache_hash_t *hp;
3153 nfs_access_type_t all;
3154
3155 vp = RTOV(rp);
3156 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3157 return (NFS_ACCESS_UNKNOWN);
3158
3159 if (rp->r_acache != NULL) {
3160 hp = &acache[acachehash(rp, cr)];
3161 rw_enter(&hp->lock, RW_READER);
3162 ap = hp->next;
3163 while (ap != (acache_t *)hp) {
3164 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3165 if ((ap->known & acc) == acc) {
3166 #ifdef DEBUG
3167 nfs_access_cache_hits++;
3168 #endif
3169 if ((ap->allowed & acc) == acc)
3170 all = NFS_ACCESS_ALLOWED;
3171 else
3172 all = NFS_ACCESS_DENIED;
3173 } else {
3174 #ifdef DEBUG
3175 nfs_access_cache_misses++;
3176 #endif
3177 all = NFS_ACCESS_UNKNOWN;
3178 }
3179 rw_exit(&hp->lock);
3180 return (all);
3181 }
3182 ap = ap->next;
3183 }
3184 rw_exit(&hp->lock);
3185 }
3186
3187 #ifdef DEBUG
3188 nfs_access_cache_misses++;
3189 #endif
3190 return (NFS_ACCESS_UNKNOWN);
3191 }
3192
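/*
 * Cache the results of an over-the-wire access check for this rnode
 * and credential.  An existing entry is updated in place; otherwise a
 * new entry is allocated (without sleeping) and linked into both the
 * hash bucket and the rnode's list of access cache entries.
 */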
3193 void
3194 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3195 {
3196 acache_t *ap;
3197 acache_t *nap;
3198 acache_hash_t *hp;
3199
3200 hp = &acache[acachehash(rp, cr)];
3201
3202 /*
 * Allocate now, on the assumption that an allocation will most
 * likely be required.  This allows the allocation to happen without
3205 * holding the hash bucket locked.
3206 */
3207 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3208 if (nap != NULL) {
3209 nap->known = acc;
3210 nap->allowed = resacc;
3211 nap->rnode = rp;
3212 crhold(cr);
3213 nap->cred = cr;
3214 nap->hashq = hp;
3215 }
3216
3217 rw_enter(&hp->lock, RW_WRITER);
3218
3219 if (rp->r_acache != NULL) {
3220 ap = hp->next;
3221 while (ap != (acache_t *)hp) {
3222 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3223 ap->known |= acc;
3224 ap->allowed &= ~acc;
3225 ap->allowed |= resacc;
3226 rw_exit(&hp->lock);
3227 if (nap != NULL) {
3228 crfree(nap->cred);
3229 kmem_cache_free(acache_cache, nap);
3230 }
3231 return;
3232 }
3233 ap = ap->next;
3234 }
3235 }
3236
3237 if (nap != NULL) {
3238 #ifdef DEBUG
3239 clstat_debug.access.value.ui64++;
3240 #endif
3241 nap->next = hp->next;
3242 hp->next = nap;
3243 nap->next->prev = nap;
3244 nap->prev = (acache_t *)hp;
3245
3246 mutex_enter(&rp->r_statelock);
3247 nap->list = rp->r_acache;
3248 rp->r_acache = nap;
3249 mutex_exit(&rp->r_statelock);
3250 }
3251
3252 rw_exit(&hp->lock);
3253 }
3254
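/*
 * Throw away all of the access cache entries associated with this
 * rnode.  Returns 1 if any entries were freed and 0 otherwise.
 */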
3255 int
3256 nfs_access_purge_rp(rnode_t *rp)
3257 {
3258 acache_t *ap;
3259 acache_t *tmpap;
3260 acache_t *rplist;
3261
3262 /*
3263 * If there aren't any cached entries, then there is nothing
3264 * to free.
3265 */
3266 if (rp->r_acache == NULL)
3267 return (0);
3268
3269 mutex_enter(&rp->r_statelock);
3270 rplist = rp->r_acache;
3271 rp->r_acache = NULL;
3272 mutex_exit(&rp->r_statelock);
3273
3274 /*
3275 * Loop through each entry in the list pointed to in the
3276 * rnode. Remove each of these entries from the hash
3277 * queue that it is on and remove it from the list in
3278 * the rnode.
3279 */
3280 for (ap = rplist; ap != NULL; ap = tmpap) {
3281 rw_enter(&ap->hashq->lock, RW_WRITER);
3282 ap->prev->next = ap->next;
3283 ap->next->prev = ap->prev;
3284 rw_exit(&ap->hashq->lock);
3285
3286 tmpap = ap->list;
3287 crfree(ap->cred);
3288 kmem_cache_free(acache_cache, ap);
3289 #ifdef DEBUG
3290 clstat_debug.access.value.ui64--;
3291 #endif
3292 }
3293
3294 return (1);
3295 }
3296
3297 static const char prefix[] = ".nfs";
3298
3299 static kmutex_t newnum_lock;
3300
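/*
 * Generate a (reasonably) unique number for use in constructing
 * temporary file names.  The sequence is seeded from the current time
 * on first use.
 */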
3301 int
3302 newnum(void)
3303 {
3304 static uint_t newnum = 0;
3305 uint_t id;
3306
3307 mutex_enter(&newnum_lock);
3308 if (newnum == 0)
3309 newnum = gethrestime_sec() & 0xffff;
3310 id = newnum++;
3311 mutex_exit(&newnum_lock);
3312 return (id);
3313 }
3314
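/*
 * Construct a temporary file name of the form ".nfsXXXX" from a
 * newnum() value.  The returned buffer is MAXNAMELEN bytes long and
 * must be freed by the caller.
 */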
3315 char *
3316 newname(void)
3317 {
3318 char *news;
3319 char *s;
3320 const char *p;
3321 uint_t id;
3322
3323 id = newnum();
3324 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3325 s = news;
3326 p = prefix;
3327 while (*p != '\0')
3328 *s++ = *p++;
3329 while (id != 0) {
3330 *s++ = "0123456789ABCDEF"[id & 0x0f];
3331 id >>= 4;
3332 }
3333 *s = '\0';
3334 return (news);
3335 }
3336
3337 /*
3338 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3339 * framework.
3340 */
3341 static int
3342 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3343 {
3344 ksp->ks_snaptime = gethrtime();
3345 if (rw == KSTAT_WRITE) {
3346 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3347 #ifdef DEBUG
3348 /*
3349 * Currently only the global zone can write to kstats, but we
3350 * add the check just for paranoia.
3351 */
3352 if (INGLOBALZONE(curproc))
3353 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3354 sizeof (clstat_debug));
3355 #endif
3356 } else {
3357 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3358 #ifdef DEBUG
3359 /*
3360 * If we're displaying the "global" debug kstat values, we
3361 * display them as-is to all zones since in fact they apply to
3362 * the system as a whole.
3363 */
3364 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3365 sizeof (clstat_debug));
3366 #endif
3367 }
3368 return (0);
3369 }
3370
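/*
 * ZSD create callback: allocate and initialize the per-zone NFS client
 * data, create its "nfs_client" kstat, and add it to the global list
 * of per-zone client data.
 */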
3371 static void *
3372 clinit_zone(zoneid_t zoneid)
3373 {
3374 kstat_t *nfs_client_kstat;
3375 struct nfs_clnt *nfscl;
3376 uint_t ndata;
3377
3378 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3379 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3380 nfscl->nfscl_chtable = NULL;
3381 nfscl->nfscl_zoneid = zoneid;
3382
3383 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3384 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3385 #ifdef DEBUG
3386 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3387 #endif
3388 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3389 "misc", KSTAT_TYPE_NAMED, ndata,
3390 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3391 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3392 nfs_client_kstat->ks_snapshot = cl_snapshot;
3393 kstat_install(nfs_client_kstat);
3394 }
3395 mutex_enter(&nfs_clnt_list_lock);
3396 list_insert_head(&nfs_clnt_list, nfscl);
3397 mutex_exit(&nfs_clnt_list_lock);
3398 return (nfscl);
3399 }
3400
3401 /*ARGSUSED*/
3402 static void
3403 clfini_zone(zoneid_t zoneid, void *arg)
3404 {
3405 struct nfs_clnt *nfscl = arg;
3406 chhead_t *chp, *next;
3407
3408 if (nfscl == NULL)
3409 return;
3410 mutex_enter(&nfs_clnt_list_lock);
3411 list_remove(&nfs_clnt_list, nfscl);
3412 mutex_exit(&nfs_clnt_list_lock);
3413 clreclaim_zone(nfscl, 0);
3414 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3415 ASSERT(chp->ch_list == NULL);
3416 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3417 next = chp->ch_next;
3418 kmem_free(chp, sizeof (*chp));
3419 }
3420 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3421 mutex_destroy(&nfscl->nfscl_chtable_lock);
3422 kmem_free(nfscl, sizeof (*nfscl));
3423 }
3424
3425 /*
3426 * Called by endpnt_destructor to make sure the client handles are
3427 * cleaned up before the RPC endpoints. This becomes a no-op if
3428 * clfini_zone (above) is called first. This function is needed
3429 * (rather than relying on clfini_zone to clean up) because the ZSD
3430 * callbacks have no ordering mechanism, so we have no way to ensure
3431 * that clfini_zone is called before endpnt_destructor.
3432 */
3433 void
3434 clcleanup_zone(zoneid_t zoneid)
3435 {
3436 struct nfs_clnt *nfscl;
3437
3438 mutex_enter(&nfs_clnt_list_lock);
3439 nfscl = list_head(&nfs_clnt_list);
3440 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3441 if (nfscl->nfscl_zoneid == zoneid) {
3442 clreclaim_zone(nfscl, 0);
3443 break;
3444 }
3445 }
3446 mutex_exit(&nfs_clnt_list_lock);
3447 }
3448
3449 int
3450 nfs_subrinit(void)
3451 {
3452 int i;
3453 ulong_t nrnode_max;
3454
3455 /*
3456 * Allocate and initialize the rnode hash queues
3457 */
3458 if (nrnode <= 0)
3459 nrnode = ncsize;
3460 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3461 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3462 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3463 "!setting nrnode to max value of %ld", nrnode_max);
3464 nrnode = nrnode_max;
3465 }
3466
3467 rtablesize = 1 << highbit(nrnode / hashlen);
3468 rtablemask = rtablesize - 1;
3469 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3470 for (i = 0; i < rtablesize; i++) {
3471 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3472 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3473 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3474 }
3475 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3476 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3477
3478 /*
3479 * Allocate and initialize the access cache
3480 */
3481
3482 /*
 * The initial guess is one access cache entry per rnode, unless
 * nacache is set to a non-zero value, in which case it is used
 * as the guess at the number of access cache entries.
3486 */
3487 if (nacache > 0)
3488 acachesize = 1 << highbit(nacache / hashlen);
3489 else
3490 acachesize = rtablesize;
3491 acachemask = acachesize - 1;
3492 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3493 for (i = 0; i < acachesize; i++) {
3494 acache[i].next = (acache_t *)&acache[i];
3495 acache[i].prev = (acache_t *)&acache[i];
3496 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3497 }
3498 acache_cache = kmem_cache_create("nfs_access_cache",
3499 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3500 /*
3501 * Allocate and initialize the client handle cache
3502 */
3503 chtab_cache = kmem_cache_create("client_handle_cache",
3504 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3505 /*
3506 * Initialize the list of per-zone client handles (and associated data).
3507 * This needs to be done before we call zone_key_create().
3508 */
3509 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3510 offsetof(struct nfs_clnt, nfscl_node));
3511 /*
3512 * Initialize the zone_key for per-zone client handle lists.
3513 */
3514 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3515 /*
3516 * Initialize the various mutexes and reader/writer locks
3517 */
3518 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3519 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3520 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3521
3522 /*
3523 * Assign unique major number for all nfs mounts
3524 */
3525 if ((nfs_major = getudev()) == -1) {
3526 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3527 "nfs: init: can't get unique device number");
3528 nfs_major = 0;
3529 }
3530 nfs_minor = 0;
3531
3532 if (nfs3_jukebox_delay == 0)
3533 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3534
3535 return (0);
3536 }
3537
3538 void
3539 nfs_subrfini(void)
3540 {
3541 int i;
3542
3543 /*
3544 * Deallocate the rnode hash queues
3545 */
3546 kmem_cache_destroy(rnode_cache);
3547
3548 for (i = 0; i < rtablesize; i++)
3549 rw_destroy(&rtable[i].r_lock);
3550 kmem_free(rtable, rtablesize * sizeof (*rtable));
3551
3552 /*
 * Deallocate the access cache
3554 */
3555 kmem_cache_destroy(acache_cache);
3556
3557 for (i = 0; i < acachesize; i++)
3558 rw_destroy(&acache[i].lock);
3559 kmem_free(acache, acachesize * sizeof (*acache));
3560
3561 /*
3562 * Deallocate the client handle cache
3563 */
3564 kmem_cache_destroy(chtab_cache);
3565
3566 /*
3567 * Destroy the various mutexes and reader/writer locks
3568 */
3569 mutex_destroy(&rpfreelist_lock);
3570 mutex_destroy(&newnum_lock);
3571 mutex_destroy(&nfs_minor_lock);
3572 (void) zone_key_delete(nfsclnt_zone_key);
3573 }
3574
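/*
 * Map a local errno value into an NFS Version 2 status code.  Values
 * with no special mapping are passed through unchanged; geterrno()
 * below performs the reverse mapping.
 */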
3575 enum nfsstat
3576 puterrno(int error)
3577 {
3578
3579 switch (error) {
3580 case EOPNOTSUPP:
3581 return (NFSERR_OPNOTSUPP);
3582 case ENAMETOOLONG:
3583 return (NFSERR_NAMETOOLONG);
3584 case ENOTEMPTY:
3585 return (NFSERR_NOTEMPTY);
3586 case EDQUOT:
3587 return (NFSERR_DQUOT);
3588 case ESTALE:
3589 return (NFSERR_STALE);
3590 case EREMOTE:
3591 return (NFSERR_REMOTE);
3592 case ENOSYS:
3593 return (NFSERR_OPNOTSUPP);
3594 case EOVERFLOW:
3595 return (NFSERR_INVAL);
3596 default:
3597 return ((enum nfsstat)error);
3598 }
3599 /* NOTREACHED */
3600 }
3601
3602 int
3603 geterrno(enum nfsstat status)
3604 {
3605
3606 switch (status) {
3607 case NFSERR_OPNOTSUPP:
3608 return (EOPNOTSUPP);
3609 case NFSERR_NAMETOOLONG:
3610 return (ENAMETOOLONG);
3611 case NFSERR_NOTEMPTY:
3612 return (ENOTEMPTY);
3613 case NFSERR_DQUOT:
3614 return (EDQUOT);
3615 case NFSERR_STALE:
3616 return (ESTALE);
3617 case NFSERR_REMOTE:
3618 return (EREMOTE);
3619 case NFSERR_WFLUSH:
3620 return (EIO);
3621 default:
3622 return ((int)status);
3623 }
3624 /* NOTREACHED */
3625 }
3626
3627 enum nfsstat3
3628 puterrno3(int error)
3629 {
3630
3631 #ifdef DEBUG
3632 switch (error) {
3633 case 0:
3634 return (NFS3_OK);
3635 case EPERM:
3636 return (NFS3ERR_PERM);
3637 case ENOENT:
3638 return (NFS3ERR_NOENT);
3639 case EIO:
3640 return (NFS3ERR_IO);
3641 case ENXIO:
3642 return (NFS3ERR_NXIO);
3643 case EACCES:
3644 return (NFS3ERR_ACCES);
3645 case EEXIST:
3646 return (NFS3ERR_EXIST);
3647 case EXDEV:
3648 return (NFS3ERR_XDEV);
3649 case ENODEV:
3650 return (NFS3ERR_NODEV);
3651 case ENOTDIR:
3652 return (NFS3ERR_NOTDIR);
3653 case EISDIR:
3654 return (NFS3ERR_ISDIR);
3655 case EINVAL:
3656 return (NFS3ERR_INVAL);
3657 case EFBIG:
3658 return (NFS3ERR_FBIG);
3659 case ENOSPC:
3660 return (NFS3ERR_NOSPC);
3661 case EROFS:
3662 return (NFS3ERR_ROFS);
3663 case EMLINK:
3664 return (NFS3ERR_MLINK);
3665 case ENAMETOOLONG:
3666 return (NFS3ERR_NAMETOOLONG);
3667 case ENOTEMPTY:
3668 return (NFS3ERR_NOTEMPTY);
3669 case EDQUOT:
3670 return (NFS3ERR_DQUOT);
3671 case ESTALE:
3672 return (NFS3ERR_STALE);
3673 case EREMOTE:
3674 return (NFS3ERR_REMOTE);
3675 case ENOSYS:
3676 case EOPNOTSUPP:
3677 return (NFS3ERR_NOTSUPP);
3678 case EOVERFLOW:
3679 return (NFS3ERR_INVAL);
3680 default:
3681 zcmn_err(getzoneid(), CE_WARN,
3682 "puterrno3: got error %d", error);
3683 return ((enum nfsstat3)error);
3684 }
3685 #else
3686 switch (error) {
3687 case ENAMETOOLONG:
3688 return (NFS3ERR_NAMETOOLONG);
3689 case ENOTEMPTY:
3690 return (NFS3ERR_NOTEMPTY);
3691 case EDQUOT:
3692 return (NFS3ERR_DQUOT);
3693 case ESTALE:
3694 return (NFS3ERR_STALE);
3695 case ENOSYS:
3696 case EOPNOTSUPP:
3697 return (NFS3ERR_NOTSUPP);
3698 case EREMOTE:
3699 return (NFS3ERR_REMOTE);
3700 case EOVERFLOW:
3701 return (NFS3ERR_INVAL);
3702 default:
3703 return ((enum nfsstat3)error);
3704 }
3705 #endif
3706 }
3707
3708 int
3709 geterrno3(enum nfsstat3 status)
3710 {
3711
3712 #ifdef DEBUG
3713 switch (status) {
3714 case NFS3_OK:
3715 return (0);
3716 case NFS3ERR_PERM:
3717 return (EPERM);
3718 case NFS3ERR_NOENT:
3719 return (ENOENT);
3720 case NFS3ERR_IO:
3721 return (EIO);
3722 case NFS3ERR_NXIO:
3723 return (ENXIO);
3724 case NFS3ERR_ACCES:
3725 return (EACCES);
3726 case NFS3ERR_EXIST:
3727 return (EEXIST);
3728 case NFS3ERR_XDEV:
3729 return (EXDEV);
3730 case NFS3ERR_NODEV:
3731 return (ENODEV);
3732 case NFS3ERR_NOTDIR:
3733 return (ENOTDIR);
3734 case NFS3ERR_ISDIR:
3735 return (EISDIR);
3736 case NFS3ERR_INVAL:
3737 return (EINVAL);
3738 case NFS3ERR_FBIG:
3739 return (EFBIG);
3740 case NFS3ERR_NOSPC:
3741 return (ENOSPC);
3742 case NFS3ERR_ROFS:
3743 return (EROFS);
3744 case NFS3ERR_MLINK:
3745 return (EMLINK);
3746 case NFS3ERR_NAMETOOLONG:
3747 return (ENAMETOOLONG);
3748 case NFS3ERR_NOTEMPTY:
3749 return (ENOTEMPTY);
3750 case NFS3ERR_DQUOT:
3751 return (EDQUOT);
3752 case NFS3ERR_STALE:
3753 return (ESTALE);
3754 case NFS3ERR_REMOTE:
3755 return (EREMOTE);
3756 case NFS3ERR_BADHANDLE:
3757 return (ESTALE);
3758 case NFS3ERR_NOT_SYNC:
3759 return (EINVAL);
3760 case NFS3ERR_BAD_COOKIE:
3761 return (ENOENT);
3762 case NFS3ERR_NOTSUPP:
3763 return (EOPNOTSUPP);
3764 case NFS3ERR_TOOSMALL:
3765 return (EINVAL);
3766 case NFS3ERR_SERVERFAULT:
3767 return (EIO);
3768 case NFS3ERR_BADTYPE:
3769 return (EINVAL);
3770 case NFS3ERR_JUKEBOX:
3771 return (ENXIO);
3772 default:
3773 zcmn_err(getzoneid(), CE_WARN,
3774 "geterrno3: got status %d", status);
3775 return ((int)status);
3776 }
3777 #else
3778 switch (status) {
3779 case NFS3ERR_NAMETOOLONG:
3780 return (ENAMETOOLONG);
3781 case NFS3ERR_NOTEMPTY:
3782 return (ENOTEMPTY);
3783 case NFS3ERR_DQUOT:
3784 return (EDQUOT);
3785 case NFS3ERR_STALE:
3786 case NFS3ERR_BADHANDLE:
3787 return (ESTALE);
3788 case NFS3ERR_NOTSUPP:
3789 return (EOPNOTSUPP);
3790 case NFS3ERR_REMOTE:
3791 return (EREMOTE);
3792 case NFS3ERR_NOT_SYNC:
3793 case NFS3ERR_TOOSMALL:
3794 case NFS3ERR_BADTYPE:
3795 return (EINVAL);
3796 case NFS3ERR_BAD_COOKIE:
3797 return (ENOENT);
3798 case NFS3ERR_SERVERFAULT:
3799 return (EIO);
3800 case NFS3ERR_JUKEBOX:
3801 return (ENXIO);
3802 default:
3803 return ((int)status);
3804 }
3805 #endif
3806 }
3807
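/*
 * Allocate a reference-counted readdir cache entry.  The entry is
 * created with a single reference; rddir_cache_hold() and
 * rddir_cache_rele() manage the reference count, and the entry is
 * freed when the last reference is released.
 */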
3808 rddir_cache *
3809 rddir_cache_alloc(int flags)
3810 {
3811 rddir_cache *rc;
3812
3813 rc = kmem_alloc(sizeof (*rc), flags);
3814 if (rc != NULL) {
3815 rc->entries = NULL;
3816 rc->flags = RDDIR;
3817 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3818 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3819 rc->count = 1;
3820 #ifdef DEBUG
3821 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3822 #endif
3823 }
3824 return (rc);
3825 }
3826
3827 static void
3828 rddir_cache_free(rddir_cache *rc)
3829 {
3830
3831 #ifdef DEBUG
3832 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3833 #endif
3834 if (rc->entries != NULL) {
3835 #ifdef DEBUG
3836 rddir_cache_buf_free(rc->entries, rc->buflen);
3837 #else
3838 kmem_free(rc->entries, rc->buflen);
3839 #endif
3840 }
3841 cv_destroy(&rc->cv);
3842 mutex_destroy(&rc->lock);
3843 kmem_free(rc, sizeof (*rc));
3844 }
3845
3846 void
3847 rddir_cache_hold(rddir_cache *rc)
3848 {
3849
3850 mutex_enter(&rc->lock);
3851 rc->count++;
3852 mutex_exit(&rc->lock);
3853 }
3854
3855 void
3856 rddir_cache_rele(rddir_cache *rc)
3857 {
3858
3859 mutex_enter(&rc->lock);
3860 ASSERT(rc->count > 0);
3861 if (--rc->count == 0) {
3862 mutex_exit(&rc->lock);
3863 rddir_cache_free(rc);
3864 } else
3865 mutex_exit(&rc->lock);
3866 }
3867
3868 #ifdef DEBUG
3869 char *
3870 rddir_cache_buf_alloc(size_t size, int flags)
3871 {
3872 char *rc;
3873
3874 rc = kmem_alloc(size, flags);
3875 if (rc != NULL)
3876 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3877 return (rc);
3878 }
3879
3880 void
3881 rddir_cache_buf_free(void *addr, size_t size)
3882 {
3883
3884 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3885 kmem_free(addr, size);
3886 }
3887 #endif
3888
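/*
 * Release the cached state (credential, access cache, readdir cache,
 * symlink contents, ACL, and pathconf information) hanging off an
 * rnode.  This and nfs_active_data_reclaim() below are helpers for
 * the kmem reclaim path and return non-zero if anything was freed.
 */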
3889 static int
3890 nfs_free_data_reclaim(rnode_t *rp)
3891 {
3892 char *contents;
3893 int size;
3894 vsecattr_t *vsp;
3895 nfs3_pathconf_info *info;
3896 int freed;
3897 cred_t *cred;
3898
3899 /*
3900 * Free any held credentials and caches which
3901 * may be associated with this rnode.
3902 */
3903 mutex_enter(&rp->r_statelock);
3904 cred = rp->r_cred;
3905 rp->r_cred = NULL;
3906 contents = rp->r_symlink.contents;
3907 size = rp->r_symlink.size;
3908 rp->r_symlink.contents = NULL;
3909 vsp = rp->r_secattr;
3910 rp->r_secattr = NULL;
3911 info = rp->r_pathconf;
3912 rp->r_pathconf = NULL;
3913 mutex_exit(&rp->r_statelock);
3914
3915 if (cred != NULL)
3916 crfree(cred);
3917
3918 /*
3919 * Free the access cache entries.
3920 */
3921 freed = nfs_access_purge_rp(rp);
3922
3923 if (!HAVE_RDDIR_CACHE(rp) &&
3924 contents == NULL &&
3925 vsp == NULL &&
3926 info == NULL)
3927 return (freed);
3928
3929 /*
3930 * Free the readdir cache entries
3931 */
3932 if (HAVE_RDDIR_CACHE(rp))
3933 nfs_purge_rddir_cache(RTOV(rp));
3934
3935 /*
3936 * Free the symbolic link cache.
3937 */
if (contents != NULL)
	kmem_free((void *)contents, size);
3942
3943 /*
3944 * Free any cached ACL.
3945 */
3946 if (vsp != NULL)
3947 nfs_acl_free(vsp);
3948
3949 /*
3950 * Free any cached pathconf information.
3951 */
3952 if (info != NULL)
3953 kmem_free(info, sizeof (*info));
3954
3955 return (1);
3956 }
3957
3958 static int
3959 nfs_active_data_reclaim(rnode_t *rp)
3960 {
3961 char *contents;
3962 int size;
3963 vsecattr_t *vsp;
3964 nfs3_pathconf_info *info;
3965 int freed;
3966
3967 /*
3968 * Free any held credentials and caches which
3969 * may be associated with this rnode.
3970 */
3971 if (!mutex_tryenter(&rp->r_statelock))
3972 return (0);
3973 contents = rp->r_symlink.contents;
3974 size = rp->r_symlink.size;
3975 rp->r_symlink.contents = NULL;
3976 vsp = rp->r_secattr;
3977 rp->r_secattr = NULL;
3978 info = rp->r_pathconf;
3979 rp->r_pathconf = NULL;
3980 mutex_exit(&rp->r_statelock);
3981
3982 /*
3983 * Free the access cache entries.
3984 */
3985 freed = nfs_access_purge_rp(rp);
3986
3987 if (!HAVE_RDDIR_CACHE(rp) &&
3988 contents == NULL &&
3989 vsp == NULL &&
3990 info == NULL)
3991 return (freed);
3992
3993 /*
3994 * Free the readdir cache entries
3995 */
3996 if (HAVE_RDDIR_CACHE(rp))
3997 nfs_purge_rddir_cache(RTOV(rp));
3998
3999 /*
4000 * Free the symbolic link cache.
4001 */
if (contents != NULL)
	kmem_free((void *)contents, size);
4006
4007 /*
4008 * Free any cached ACL.
4009 */
4010 if (vsp != NULL)
4011 nfs_acl_free(vsp);
4012
4013 /*
4014 * Free any cached pathconf information.
4015 */
4016 if (info != NULL)
4017 kmem_free(info, sizeof (*info));
4018
4019 return (1);
4020 }
4021
4022 static int
4023 nfs_free_reclaim(void)
4024 {
4025 int freed;
4026 rnode_t *rp;
4027
4028 #ifdef DEBUG
4029 clstat_debug.f_reclaim.value.ui64++;
4030 #endif
4031 freed = 0;
4032 mutex_enter(&rpfreelist_lock);
4033 rp = rpfreelist;
4034 if (rp != NULL) {
4035 do {
4036 if (nfs_free_data_reclaim(rp))
4037 freed = 1;
4038 } while ((rp = rp->r_freef) != rpfreelist);
4039 }
4040 mutex_exit(&rpfreelist_lock);
4041 return (freed);
4042 }
4043
4044 static int
4045 nfs_active_reclaim(void)
4046 {
4047 int freed;
4048 int index;
4049 rnode_t *rp;
4050
4051 #ifdef DEBUG
4052 clstat_debug.a_reclaim.value.ui64++;
4053 #endif
4054 freed = 0;
4055 for (index = 0; index < rtablesize; index++) {
4056 rw_enter(&rtable[index].r_lock, RW_READER);
4057 for (rp = rtable[index].r_hashf;
4058 rp != (rnode_t *)(&rtable[index]);
4059 rp = rp->r_hashf) {
4060 if (nfs_active_data_reclaim(rp))
4061 freed = 1;
4062 }
4063 rw_exit(&rtable[index].r_lock);
4064 }
4065 return (freed);
4066 }
4067
4068 static int
4069 nfs_rnode_reclaim(void)
4070 {
4071 int freed;
4072 rnode_t *rp;
4073 vnode_t *vp;
4074
4075 #ifdef DEBUG
4076 clstat_debug.r_reclaim.value.ui64++;
4077 #endif
4078 freed = 0;
4079 mutex_enter(&rpfreelist_lock);
4080 while ((rp = rpfreelist) != NULL) {
4081 rp_rmfree(rp);
4082 mutex_exit(&rpfreelist_lock);
4083 if (rp->r_flags & RHASHED) {
4084 vp = RTOV(rp);
4085 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4086 mutex_enter(&vp->v_lock);
4087 if (vp->v_count > 1) {
4088 VN_RELE_LOCKED(vp);
4089 mutex_exit(&vp->v_lock);
4090 rw_exit(&rp->r_hashq->r_lock);
4091 mutex_enter(&rpfreelist_lock);
4092 continue;
4093 }
4094 mutex_exit(&vp->v_lock);
4095 rp_rmhash_locked(rp);
4096 rw_exit(&rp->r_hashq->r_lock);
4097 }
4098 /*
4099 * This call to rp_addfree will end up destroying the
4100 * rnode, but in a safe way with the appropriate set
4101 * of checks done.
4102 */
4103 rp_addfree(rp, CRED());
4104 mutex_enter(&rpfreelist_lock);
4105 }
4106 mutex_exit(&rpfreelist_lock);
4107 return (freed);
4108 }
4109
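/*
 * Kmem cache reclaim callback for the rnode cache, invoked when the
 * system is low on memory.  It tries progressively more aggressive
 * strategies: free cached data hanging off rnodes on the freelist,
 * then off active rnodes, and finally reclaim the rnodes themselves.
 */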
4110 /*ARGSUSED*/
4111 static void
4112 nfs_reclaim(void *cdrarg)
4113 {
4114
4115 #ifdef DEBUG
4116 clstat_debug.reclaim.value.ui64++;
4117 #endif
4118 if (nfs_free_reclaim())
4119 return;
4120
4121 if (nfs_active_reclaim())
4122 return;
4123
4124 (void) nfs_rnode_reclaim();
4125 }
4126
4127 /*
4128 * NFS client failover support
4129 *
4130 * Routines to copy filehandles
4131 */
4132 void
4133 nfscopyfh(caddr_t fhp, vnode_t *vp)
4134 {
4135 fhandle_t *dest = (fhandle_t *)fhp;
4136
4137 if (dest != NULL)
4138 *dest = *VTOFH(vp);
4139 }
4140
4141 void
4142 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4143 {
4144 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4145
4146 if (dest != NULL)
4147 *dest = *VTOFH3(vp);
4148 }
4149
4150 /*
4151 * NFS client failover support
4152 *
4153 * failover_safe() will test various conditions to ensure that
4154 * failover is permitted for this vnode. It will be denied
4155 * if:
4156 * 1) the operation in progress does not support failover (NULL fi)
4157 * 2) there are no available replicas (NULL mi_servers->sv_next)
4158 * 3) any locks are outstanding on this file
4159 */
4160 static int
4161 failover_safe(failinfo_t *fi)
4162 {
4163
4164 /*
4165 * Does this op permit failover?
4166 */
4167 if (fi == NULL || fi->vp == NULL)
4168 return (0);
4169
4170 /*
4171 * Are there any alternates to failover to?
4172 */
4173 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4174 return (0);
4175
4176 /*
4177 * Disable check; we've forced local locking
4178 *
4179 * if (flk_has_remote_locks(fi->vp))
4180 * return (0);
4181 */
4182
4183 /*
4184 * If we have no partial path, we can't do anything
4185 */
4186 if (VTOR(fi->vp)->r_path == NULL)
4187 return (0);
4188
4189 return (1);
4190 }
4191
4192 #include <sys/thread.h>
4193
4194 /*
4195 * NFS client failover support
4196 *
4197 * failover_newserver() will start a search for a new server,
4198 * preferably by starting an async thread to do the work. If
4199 * someone is already doing this (recognizable by MI_BINDINPROG
4200 * being set), it will simply return and the calling thread
4201 * will queue on the mi_failover_cv condition variable.
4202 */
4203 static void
4204 failover_newserver(mntinfo_t *mi)
4205 {
4206 /*
4207 * Check if someone else is doing this already
4208 */
4209 mutex_enter(&mi->mi_lock);
4210 if (mi->mi_flags & MI_BINDINPROG) {
4211 mutex_exit(&mi->mi_lock);
4212 return;
4213 }
4214 mi->mi_flags |= MI_BINDINPROG;
4215
4216 /*
4217 * Need to hold the vfs struct so that it can't be released
4218 * while the failover thread is selecting a new server.
4219 */
4220 VFS_HOLD(mi->mi_vfsp);
4221
4222 /*
4223 * Start a thread to do the real searching.
4224 */
4225 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4226
4227 mutex_exit(&mi->mi_lock);
4228 }
4229
4230 /*
4231 * NFS client failover support
4232 *
4233 * failover_thread() will find a new server to replace the one
4234 * currently in use, wake up other threads waiting on this mount
4235 * point, and die. It will start at the head of the server list
4236 * and poll servers until it finds one with an NFS server which is
4237 * registered and responds to a NULL procedure ping.
4238 *
4239  * XXX failover_thread is unsafe under the present model defined
4240  * for cpr to suspend the system.  Specifically, the over-the-wire
4241  * calls made by the thread are not cpr-safe.  The thread needs to
4242  * be reevaluated in case of future updates to the cpr suspend
4243  * model.
4244 */
4245 static void
4246 failover_thread(mntinfo_t *mi)
4247 {
4248 servinfo_t *svp = NULL;
4249 CLIENT *cl;
4250 enum clnt_stat status;
4251 struct timeval tv;
4252 int error;
4253 int oncethru = 0;
4254 callb_cpr_t cprinfo;
4255 rnode_t *rp;
4256 int index;
4257 char *srvnames;
4258 size_t srvnames_len;
4259 struct nfs_clnt *nfscl = NULL;
4260 zoneid_t zoneid = getzoneid();
4261
4262 #ifdef DEBUG
4263 /*
4264 * This is currently only needed to access counters which exist on
4265 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4266 * on non-DEBUG kernels.
4267 */
4268 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4269 ASSERT(nfscl != NULL);
4270 #endif
4271
4272 	/*
4273 	 * It's safe to piggyback on mi_lock since the failover_newserver()
4274 	 * code guarantees that there will be only one failover thread
4275 	 * per mntinfo at any given time.
4276 	 */
4277 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4278 "failover_thread");
4279
4280 mutex_enter(&mi->mi_lock);
4281 while (mi->mi_readers) {
4282 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4283 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4284 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4285 }
4286 mutex_exit(&mi->mi_lock);
4287
4288 tv.tv_sec = 2;
4289 tv.tv_usec = 0;
4290
4291 	/*
4292 	 * Ping the NULL NFS procedure of every server in the
4293 	 * list until one responds.  We always start at the head
4294 	 * of the list and, on the first pass, skip the server
4295 	 * that is current, since it is the one that caused us a problem.
4296 	 */
4297 while (svp == NULL) {
4298 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4299 if (!oncethru && svp == mi->mi_curr_serv)
4300 continue;
4301
4302 /*
4303 * If the file system was forcibly umounted
4304 * while trying to do a failover, then just
4305 * give up on the failover. It won't matter
4306 * what the server is.
4307 */
4308 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4309 svp = NULL;
4310 goto done;
4311 }
4312
4313 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4314 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4315 if (error)
4316 continue;
4317
4318 if (!(mi->mi_flags & MI_INT))
4319 cl->cl_nosignal = TRUE;
4320 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4321 xdr_void, NULL, tv);
4322 if (!(mi->mi_flags & MI_INT))
4323 cl->cl_nosignal = FALSE;
4324 AUTH_DESTROY(cl->cl_auth);
4325 CLNT_DESTROY(cl);
4326 if (status == RPC_SUCCESS) {
4327 if (svp == mi->mi_curr_serv) {
4328 #ifdef DEBUG
4329 zcmn_err(zoneid, CE_NOTE,
4330 "NFS%d: failing over: selecting original server %s",
4331 mi->mi_vers, svp->sv_hostname);
4332 #else
4333 zcmn_err(zoneid, CE_NOTE,
4334 "NFS: failing over: selecting original server %s",
4335 svp->sv_hostname);
4336 #endif
4337 } else {
4338 #ifdef DEBUG
4339 zcmn_err(zoneid, CE_NOTE,
4340 "NFS%d: failing over from %s to %s",
4341 mi->mi_vers,
4342 mi->mi_curr_serv->sv_hostname,
4343 svp->sv_hostname);
4344 #else
4345 zcmn_err(zoneid, CE_NOTE,
4346 "NFS: failing over from %s to %s",
4347 mi->mi_curr_serv->sv_hostname,
4348 svp->sv_hostname);
4349 #endif
4350 }
4351 break;
4352 }
4353 }
4354
4355 if (svp == NULL) {
4356 if (!oncethru) {
4357 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4358 #ifdef DEBUG
4359 zprintf(zoneid,
4360 "NFS%d servers %s not responding "
4361 "still trying\n", mi->mi_vers, srvnames);
4362 #else
4363 zprintf(zoneid, "NFS servers %s not responding "
4364 "still trying\n", srvnames);
4365 #endif
4366 oncethru = 1;
4367 }
4368 mutex_enter(&mi->mi_lock);
4369 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4370 mutex_exit(&mi->mi_lock);
4371 delay(hz);
4372 mutex_enter(&mi->mi_lock);
4373 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4374 mutex_exit(&mi->mi_lock);
4375 }
4376 }
4377
4378 if (oncethru) {
4379 #ifdef DEBUG
4380 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4381 #else
4382 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4383 #endif
4384 }
4385
4386 if (svp != mi->mi_curr_serv) {
4387 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4388 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4389 rw_enter(&rtable[index].r_lock, RW_WRITER);
4390 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4391 mi->mi_vfsp);
4392 if (rp != NULL) {
4393 if (rp->r_flags & RHASHED)
4394 rp_rmhash_locked(rp);
4395 rw_exit(&rtable[index].r_lock);
4396 rp->r_server = svp;
4397 rp->r_fh = svp->sv_fhandle;
4398 (void) nfs_free_data_reclaim(rp);
4399 index = rtablehash(&rp->r_fh);
4400 rp->r_hashq = &rtable[index];
4401 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4402 vn_exists(RTOV(rp));
4403 rp_addhash(rp);
4404 rw_exit(&rp->r_hashq->r_lock);
4405 VN_RELE(RTOV(rp));
4406 } else
4407 rw_exit(&rtable[index].r_lock);
4408 }
4409
4410 done:
4411 if (oncethru)
4412 kmem_free(srvnames, srvnames_len);
4413 mutex_enter(&mi->mi_lock);
4414 mi->mi_flags &= ~MI_BINDINPROG;
4415 if (svp != NULL) {
4416 mi->mi_curr_serv = svp;
4417 mi->mi_failover++;
4418 #ifdef DEBUG
4419 nfscl->nfscl_stat.failover.value.ui64++;
4420 #endif
4421 }
4422 cv_broadcast(&mi->mi_failover_cv);
4423 CALLB_CPR_EXIT(&cprinfo);
4424 VFS_RELE(mi->mi_vfsp);
4425 zthread_exit();
4426 /* NOTREACHED */
4427 }
4428
4429 /*
4430 * NFS client failover support
4431 *
4432 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4433 * is cleared, meaning that failover is complete. Called with
4434 * mi_lock mutex held.
4435 */
4436 static int
4437 failover_wait(mntinfo_t *mi)
4438 {
4439 k_sigset_t smask;
4440
4441 /*
4442 * If someone else is hunting for a living server,
4443 * sleep until it's done. After our sleep, we may
4444 * be bound to the right server and get off cheaply.
4445 */
4446 while (mi->mi_flags & MI_BINDINPROG) {
4447 /*
4448 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4449 * and SIGTERM. (Preserving the existing masks).
4450 * Mask out SIGINT if mount option nointr is specified.
4451 */
4452 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4453 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4454 /*
4455 * restore original signal mask
4456 */
4457 sigunintr(&smask);
4458 return (EINTR);
4459 }
4460 /*
4461 * restore original signal mask
4462 */
4463 sigunintr(&smask);
4464 }
4465 return (0);
4466 }
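/*
 * A hedged sketch of how failover_newserver() and failover_wait() are
 * expected to cooperate in a caller (failover_newserver() takes mi_lock
 * itself, while failover_wait() must be called with it held; the error
 * handling shown is simplified):
 *
 *	failover_newserver(mi);		starts the search, or is a no-op
 *					if one is already in progress
 *	mutex_enter(&mi->mi_lock);
 *	if (failover_wait(mi) == EINTR) {
 *		mutex_exit(&mi->mi_lock);
 *		return (EINTR);
 *	}
 *	mutex_exit(&mi->mi_lock);
 */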
4467
4468 /*
4469 * NFS client failover support
4470 *
4471 * failover_remap() will do a partial pathname lookup and find the
4472 * desired vnode on the current server. The interim vnode will be
4473 * discarded after we pilfer the new filehandle.
4474 *
4475 * Side effects:
4476 * - This routine will also update the filehandle in the args structure
4477 * pointed to by the fi->fhp pointer if it is non-NULL.
4478 */
4479
4480 static int
4481 failover_remap(failinfo_t *fi)
4482 {
4483 vnode_t *vp, *nvp, *rootvp;
4484 rnode_t *rp, *nrp;
4485 mntinfo_t *mi;
4486 int error;
4487 #ifdef DEBUG
4488 struct nfs_clnt *nfscl;
4489
4490 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4491 ASSERT(nfscl != NULL);
4492 #endif
4493 /*
4494 * Sanity check
4495 */
4496 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4497 return (EINVAL);
4498 vp = fi->vp;
4499 rp = VTOR(vp);
4500 mi = VTOMI(vp);
4501
4502 if (!(vp->v_flag & VROOT)) {
4503 /*
4504 * Given the root fh, use the path stored in
4505 * the rnode to find the fh for the new server.
4506 */
4507 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4508 if (error)
4509 return (error);
4510
4511 error = failover_lookup(rp->r_path, rootvp,
4512 fi->lookupproc, fi->xattrdirproc, &nvp);
4513
4514 VN_RELE(rootvp);
4515
4516 if (error)
4517 return (error);
4518
4519 /*
4520 * If we found the same rnode, we're done now
4521 */
4522 if (nvp == vp) {
4523 			/*
4524 			 * We failed over, but the new server may physically
4525 			 * be the same machine or may share the same disk
4526 			 * subsystem.  In that case the filehandle for a given
4527 			 * file path does not change, so the filehandle lookup
4528 			 * will locate the same rnode as the existing one.
4529 			 * All we might need to do is update r_server with
4530 			 * the current servinfo.
4531 			 */
4532 if (!VALID_FH(fi)) {
4533 rp->r_server = mi->mi_curr_serv;
4534 }
4535 VN_RELE(nvp);
4536 return (0);
4537 }
4538
4539 /*
4540 * Try to make it so that no one else will find this
4541 * vnode because it is just a temporary to hold the
4542 * new file handle until that file handle can be
4543 * copied to the original vnode/rnode.
4544 */
4545 nrp = VTOR(nvp);
4546 mutex_enter(&mi->mi_remap_lock);
4547 		/*
4548 		 * Some other thread could have raced in here and already
4549 		 * done the remap for this particular rnode.  Check whether
4550 		 * rp->r_server already matches mi->mi_curr_serv and return
4551 		 * if it does.
4552 		 */
4553 if (VALID_FH(fi)) {
4554 mutex_exit(&mi->mi_remap_lock);
4555 VN_RELE(nvp);
4556 return (0);
4557 }
4558
4559 if (nrp->r_flags & RHASHED)
4560 rp_rmhash(nrp);
4561
4562 		/*
4563 		 * As a heuristic check on the validity of the new
4564 		 * file, check that its size and type match those
4565 		 * we remember from the old version.
4566 		 */
4567 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4568 mutex_exit(&mi->mi_remap_lock);
4569 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4570 "NFS replicas %s and %s: file %s not same.",
4571 rp->r_server->sv_hostname,
4572 nrp->r_server->sv_hostname, rp->r_path);
4573 VN_RELE(nvp);
4574 return (EINVAL);
4575 }
4576
4577 		/*
4578 		 * Snarf the filehandle from the new rnode, then
4579 		 * release it, updating the hash queues for the
4580 		 * old rnode along the way.
4581 		 */
4582 if (rp->r_flags & RHASHED)
4583 rp_rmhash(rp);
4584 rp->r_server = mi->mi_curr_serv;
4585 rp->r_fh = nrp->r_fh;
4586 rp->r_hashq = nrp->r_hashq;
4587 /*
4588 * Copy the attributes from the new rnode to the old
4589 * rnode. This will help to reduce unnecessary page
4590 * cache flushes.
4591 */
4592 rp->r_attr = nrp->r_attr;
4593 rp->r_attrtime = nrp->r_attrtime;
4594 rp->r_mtime = nrp->r_mtime;
4595 (void) nfs_free_data_reclaim(rp);
4596 nfs_setswaplike(vp, &rp->r_attr);
4597 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4598 rp_addhash(rp);
4599 rw_exit(&rp->r_hashq->r_lock);
4600 mutex_exit(&mi->mi_remap_lock);
4601 VN_RELE(nvp);
4602 }
4603
4604 /*
4605 * Update successful failover remap count
4606 */
4607 mutex_enter(&mi->mi_lock);
4608 mi->mi_remap++;
4609 mutex_exit(&mi->mi_lock);
4610 #ifdef DEBUG
4611 nfscl->nfscl_stat.remap.value.ui64++;
4612 #endif
4613
4614 /*
4615 * If we have a copied filehandle to update, do it now.
4616 */
4617 if (fi->fhp != NULL && fi->copyproc != NULL)
4618 (*fi->copyproc)(fi->fhp, vp);
4619
4620 return (0);
4621 }
4622
4623 /*
4624 * NFS client failover support
4625 *
4626  * We want a simple pathname lookup routine to parse the pieces
4627  * of the path in rp->r_path.  We know that the path was created
4628  * as rnodes were made, so we only have to deal with paths that
4629  * look like:
4630  *	dir1/dir2/dir3/file
4631  * Any evidence of anything like "..", symlinks, or ENOTDIR is a
4632  * hard error, because it means something in this filesystem is
4633  * different from the one we came from, or has changed under us
4634  * in some way.  If that is true, we want the failure.
4635  *
4636  * Extended attributes: if the filesystem is mounted with extended
4637  * attributes enabled (-o xattr), the attribute directory will be
4638  * represented in r_path as the magic name XATTR_RPATH.  So if we
4639  * see that name in the pathname, it must be because this node is
4640  * an extended attribute; therefore, look it up that way.
4641 */
4642 static int
4643 failover_lookup(char *path, vnode_t *root,
4644 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4645 vnode_t *, cred_t *, int),
4646 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4647 vnode_t **new)
4648 {
4649 vnode_t *dvp, *nvp;
4650 int error = EINVAL;
4651 char *s, *p, *tmppath;
4652 size_t len;
4653 mntinfo_t *mi;
4654 bool_t xattr;
4655
4656 /* Make local copy of path */
4657 len = strlen(path) + 1;
4658 tmppath = kmem_alloc(len, KM_SLEEP);
4659 (void) strcpy(tmppath, path);
4660 s = tmppath;
4661
4662 dvp = root;
4663 VN_HOLD(dvp);
4664 mi = VTOMI(root);
4665 xattr = mi->mi_flags & MI_EXTATTR;
4666
4667 do {
4668 p = strchr(s, '/');
4669 if (p != NULL)
4670 *p = '\0';
4671 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4672 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4673 RFSCALL_SOFT);
4674 } else {
4675 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4676 CRED(), RFSCALL_SOFT);
4677 }
4678 if (p != NULL)
4679 *p++ = '/';
4680 if (error) {
4681 VN_RELE(dvp);
4682 kmem_free(tmppath, len);
4683 return (error);
4684 }
4685 s = p;
4686 VN_RELE(dvp);
4687 dvp = nvp;
4688 } while (p != NULL);
4689
4690 if (nvp != NULL && new != NULL)
4691 *new = nvp;
4692 kmem_free(tmppath, len);
4693 return (0);
4694 }
4695
4696 /*
4697 * NFS client failover support
4698 *
4699 * sv_free() frees the malloc'd portion of a "servinfo_t".
4700 */
4701 void
4702 sv_free(servinfo_t *svp)
4703 {
4704 servinfo_t *next;
4705 struct knetconfig *knconf;
4706
4707 while (svp != NULL) {
4708 next = svp->sv_next;
4709 if (svp->sv_secdata)
4710 sec_clnt_freeinfo(svp->sv_secdata);
4711 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4712 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4713 knconf = svp->sv_knconf;
4714 if (knconf != NULL) {
4715 if (knconf->knc_protofmly != NULL)
4716 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4717 if (knconf->knc_proto != NULL)
4718 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4719 kmem_free(knconf, sizeof (*knconf));
4720 }
4721 knconf = svp->sv_origknconf;
4722 if (knconf != NULL) {
4723 if (knconf->knc_protofmly != NULL)
4724 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4725 if (knconf->knc_proto != NULL)
4726 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4727 kmem_free(knconf, sizeof (*knconf));
4728 }
4729 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4730 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4731 mutex_destroy(&svp->sv_lock);
4732 kmem_free(svp, sizeof (*svp));
4733 svp = next;
4734 }
4735 }
4736
4737 /*
4738  * This can return non-zero only if intr != 0.
4739 */
4740 int
4741 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4742 {
4743
4744 mutex_enter(&l->lock);
4745
4746 /*
4747 	 * If this is a nested enter, then allow it.  There
4748 	 * must be as many exits as there are enters.
4749 */
4750 if (l->owner == curthread) {
4751 /* lock is held for writing by current thread */
4752 ASSERT(rw == RW_READER || rw == RW_WRITER);
4753 l->count--;
4754 } else if (rw == RW_READER) {
4755 /*
4756 		 * While there is a writer active or writers waiting,
4757 		 * wait for them to finish up and move on.  Then
4758 		 * increment the count to indicate that a reader is
4759 		 * active.
4760 */
4761 while (l->count < 0 || l->waiters > 0) {
4762 if (intr) {
4763 klwp_t *lwp = ttolwp(curthread);
4764
4765 if (lwp != NULL)
4766 lwp->lwp_nostop++;
4767 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4768 if (lwp != NULL)
4769 lwp->lwp_nostop--;
4770 mutex_exit(&l->lock);
4771 return (EINTR);
4772 }
4773 if (lwp != NULL)
4774 lwp->lwp_nostop--;
4775 } else
4776 cv_wait(&l->cv_rd, &l->lock);
4777 }
4778 ASSERT(l->count < INT_MAX);
4779 #ifdef DEBUG
4780 if ((l->count % 10000) == 9999)
4781 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4782 			    "rwlock @ %p\n", l->count, (void *)l);
4783 #endif
4784 l->count++;
4785 } else {
4786 ASSERT(rw == RW_WRITER);
4787 /*
4788 		 * While there are readers active or a writer
4789 		 * active, wait for all of the readers to finish
4790 		 * or for the writer to finish.  Then set the
4791 		 * owner field to curthread and decrement the
4792 		 * count to indicate that a writer is
4793 		 * active.
4794 */
4795 while (l->count != 0) {
4796 l->waiters++;
4797 if (intr) {
4798 klwp_t *lwp = ttolwp(curthread);
4799
4800 if (lwp != NULL)
4801 lwp->lwp_nostop++;
4802 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4803 if (lwp != NULL)
4804 lwp->lwp_nostop--;
4805 l->waiters--;
4806 /*
4807 * If there are readers active and no
4808 * writers waiting then wake up all of
4809 * the waiting readers (if any).
4810 */
4811 if (l->count > 0 && l->waiters == 0)
4812 cv_broadcast(&l->cv_rd);
4813 mutex_exit(&l->lock);
4814 return (EINTR);
4815 }
4816 if (lwp != NULL)
4817 lwp->lwp_nostop--;
4818 } else
4819 cv_wait(&l->cv, &l->lock);
4820 l->waiters--;
4821 }
4822 ASSERT(l->owner == NULL);
4823 l->owner = curthread;
4824 l->count--;
4825 }
4826
4827 mutex_exit(&l->lock);
4828
4829 return (0);
4830 }
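/*
 * Illustrative usage, as a sketch: acquire the rnode's rwlock for reading,
 * allowing signals to interrupt the wait when the mount is interruptible.
 * INTR() is assumed to be the usual helper that derives the interruptible
 * flag from the mount options.
 *
 *	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, INTR(vp)))
 *		return (EINTR);
 *	... read-side work on the rnode ...
 *	nfs_rw_exit(&rp->r_rwlock);
 */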
4831
4832 /*
4833 * If the lock is available, obtain it and return non-zero. If there is
4834 * already a conflicting lock, return 0 immediately.
4835 */
4836
4837 int
4838 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4839 {
4840 mutex_enter(&l->lock);
4841
4842 /*
4843 	 * If this is a nested enter, then allow it.  There
4844 	 * must be as many exits as there are enters.
4845 */
4846 if (l->owner == curthread) {
4847 /* lock is held for writing by current thread */
4848 ASSERT(rw == RW_READER || rw == RW_WRITER);
4849 l->count--;
4850 } else if (rw == RW_READER) {
4851 /*
4852 * If there is a writer active or writers waiting, deny the
4853 * lock. Otherwise, bump the count of readers.
4854 */
4855 if (l->count < 0 || l->waiters > 0) {
4856 mutex_exit(&l->lock);
4857 return (0);
4858 }
4859 l->count++;
4860 } else {
4861 ASSERT(rw == RW_WRITER);
4862 /*
4863 * If there are readers active or a writer active, deny the
4864 * lock. Otherwise, set the owner field to curthread and
4865 * decrement count to indicate that a writer is active.
4866 */
4867 if (l->count != 0) {
4868 mutex_exit(&l->lock);
4869 return (0);
4870 }
4871 ASSERT(l->owner == NULL);
4872 l->owner = curthread;
4873 l->count--;
4874 }
4875
4876 mutex_exit(&l->lock);
4877
4878 return (1);
4879 }
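/*
 * Non-blocking variant of the pattern above (a sketch): try for the write
 * lock and let the caller decide what to do if it is busy; the EAGAIN
 * response shown here is only one possible reaction.
 *
 *	if (!nfs_rw_tryenter(&rp->r_rwlock, RW_WRITER))
 *		return (EAGAIN);	lock was busy; caller backs off
 *	... exclusive work on the rnode ...
 *	nfs_rw_exit(&rp->r_rwlock);
 */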
4880
4881 void
4882 nfs_rw_exit(nfs_rwlock_t *l)
4883 {
4884
4885 mutex_enter(&l->lock);
4886
4887 if (l->owner != NULL) {
4888 ASSERT(l->owner == curthread);
4889
4890 /*
4891 * To release a writer lock increment count to indicate that
4892 * there is one less writer active. If this was the last of
4893 * possibly nested writer locks, then clear the owner field as
4894 * well to indicate that there is no writer active.
4895 */
4896 ASSERT(l->count < 0);
4897 l->count++;
4898 if (l->count == 0) {
4899 l->owner = NULL;
4900
4901 /*
4902 * If there are no writers waiting then wakeup all of
4903 * the waiting readers (if any).
4904 */
4905 if (l->waiters == 0)
4906 cv_broadcast(&l->cv_rd);
4907 }
4908 } else {
4909 /*
4910 * To release a reader lock just decrement count to indicate
4911 * that there is one less reader active.
4912 */
4913 ASSERT(l->count > 0);
4914 l->count--;
4915 }
4916
4917 	/*
4918 	 * If there is neither a reader nor a writer active and there
4919 	 * is a writer waiting, we need to wake it up.
4920 	 */
4921 if (l->count == 0 && l->waiters > 0)
4922 cv_signal(&l->cv);
4923 mutex_exit(&l->lock);
4924 }
4925
4926 int
4927 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4928 {
4929
4930 if (rw == RW_READER)
4931 return (l->count > 0);
4932 ASSERT(rw == RW_WRITER);
4933 return (l->count < 0);
4934 }
4935
4936 /* ARGSUSED */
4937 void
4938 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4939 {
4940
4941 l->count = 0;
4942 l->waiters = 0;
4943 l->owner = NULL;
4944 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4945 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4946 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4947 }
4948
4949 void
4950 nfs_rw_destroy(nfs_rwlock_t *l)
4951 {
4952
4953 mutex_destroy(&l->lock);
4954 cv_destroy(&l->cv);
4955 cv_destroy(&l->cv_rd);
4956 }
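/*
 * Lifecycle sketch (illustrative): an nfs_rwlock_t is initialized once
 * when its containing structure is created and destroyed when that
 * structure is torn down.
 *
 *	nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
 *	...
 *	nfs_rw_destroy(&rp->r_rwlock);
 */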
4957
4958 int
4959 nfs3_rddir_compar(const void *x, const void *y)
4960 {
4961 rddir_cache *a = (rddir_cache *)x;
4962 rddir_cache *b = (rddir_cache *)y;
4963
4964 if (a->nfs3_cookie == b->nfs3_cookie) {
4965 if (a->buflen == b->buflen)
4966 return (0);
4967 if (a->buflen < b->buflen)
4968 return (-1);
4969 return (1);
4970 }
4971
4972 if (a->nfs3_cookie < b->nfs3_cookie)
4973 return (-1);
4974
4975 return (1);
4976 }
4977
4978 int
4979 nfs_rddir_compar(const void *x, const void *y)
4980 {
4981 rddir_cache *a = (rddir_cache *)x;
4982 rddir_cache *b = (rddir_cache *)y;
4983
4984 if (a->nfs_cookie == b->nfs_cookie) {
4985 if (a->buflen == b->buflen)
4986 return (0);
4987 if (a->buflen < b->buflen)
4988 return (-1);
4989 return (1);
4990 }
4991
4992 if (a->nfs_cookie < b->nfs_cookie)
4993 return (-1);
4994
4995 return (1);
4996 }
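/*
 * These comparators order readdir cache entries by cookie, with the buffer
 * length as a tiebreaker, and are intended for an AVL tree of rddir_cache
 * entries hanging off the rnode.  A sketch of the tree setup (the r_dir
 * and tree field names are assumptions used for illustration):
 *
 *	avl_create(&rp->r_dir, nfs_rddir_compar, sizeof (rddir_cache),
 *	    offsetof(rddir_cache, tree));
 */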
4997
4998 static char *
4999 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5000 {
5001 servinfo_t *s;
5002 char *srvnames;
5003 char *namep;
5004 size_t length;
5005
5006 /*
5007 * Calculate the length of the string required to hold all
5008 * of the server names plus either a comma or a null
5009 * character following each individual one.
5010 */
5011 length = 0;
5012 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5013 length += s->sv_hostnamelen;
5014
5015 srvnames = kmem_alloc(length, KM_SLEEP);
5016
5017 namep = srvnames;
5018 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5019 (void) strcpy(namep, s->sv_hostname);
5020 namep += s->sv_hostnamelen - 1;
5021 *namep++ = ',';
5022 }
5023 *--namep = '\0';
5024
5025 *len = length;
5026
5027 return (srvnames);
5028 }
5029
5030 /*
5031 * These two functions are temporary and designed for the upgrade-workaround
5032 * only. They cannot be used for general zone-crossing NFS client support, and
5033 * will be removed shortly.
5034 *
5035 * When the workaround is enabled, all NFS traffic is forced into the global
5036 * zone. These functions are called when the code needs to refer to the state
5037 * of the underlying network connection. They're not called when the function
5038 * needs to refer to the state of the process that invoked the system call.
5039 * (E.g., when checking whether the zone is shutting down during the mount()
5040 * call.)
5041 */
5042
5043 struct zone *
5044 nfs_zone(void)
5045 {
5046 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5047 }
5048
5049 zoneid_t
5050 nfs_zoneid(void)
5051 {
5052 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5053 }
5054
5055 /*
5056 * nfs_mount_label_policy:
5057 * Determine whether the mount is allowed according to MAC check,
5058 * by comparing (where appropriate) label of the remote server
5059 * against the label of the zone being mounted into.
5060 *
5061 * Returns:
5062 * 0 : access allowed
5063 * -1 : read-only access allowed (i.e., read-down)
5064 * >0 : error code, such as EACCES
5065 */
5066 int
5067 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5068 struct knetconfig *knconf, cred_t *cr)
5069 {
5070 int addr_type;
5071 void *ipaddr;
5072 bslabel_t *server_sl, *mntlabel;
5073 zone_t *mntzone = NULL;
5074 ts_label_t *zlabel;
5075 tsol_tpc_t *tp;
5076 ts_label_t *tsl = NULL;
5077 int retv;
5078
5079 /*
5080 * Get the zone's label. Each zone on a labeled system has a label.
5081 */
5082 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5083 zlabel = mntzone->zone_slabel;
5084 ASSERT(zlabel != NULL);
5085 label_hold(zlabel);
5086
5087 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5088 addr_type = IPV4_VERSION;
5089 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5090 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5091 addr_type = IPV6_VERSION;
5092 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5093 } else {
5094 retv = 0;
5095 goto out;
5096 }
5097
5098 retv = EACCES; /* assume the worst */
5099
5100 /*
5101 * Next, get the assigned label of the remote server.
5102 */
5103 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5104 if (tp == NULL)
5105 goto out; /* error getting host entry */
5106
5107 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5108 goto rel_tpc; /* invalid domain */
5109 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5110 (tp->tpc_tp.host_type != UNLABELED))
5111 goto rel_tpc; /* invalid hosttype */
5112
5113 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5114 tsl = getflabel_cipso(vfsp);
5115 if (tsl == NULL)
5116 goto rel_tpc; /* error getting server lbl */
5117
5118 server_sl = label2bslabel(tsl);
5119 } else { /* UNLABELED */
5120 server_sl = &tp->tpc_tp.tp_def_label;
5121 }
5122
5123 mntlabel = label2bslabel(zlabel);
5124
5125 /*
5126 * Now compare labels to complete the MAC check. If the labels
5127 * are equal or if the requestor is in the global zone and has
5128 * NET_MAC_AWARE, then allow read-write access. (Except for
5129 * mounts into the global zone itself; restrict these to
5130 * read-only.)
5131 *
5132 * If the requestor is in some other zone, but their label
5133 * dominates the server, then allow read-down.
5134 *
5135 * Otherwise, access is denied.
5136 */
5137 if (blequal(mntlabel, server_sl) ||
5138 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5139 getpflags(NET_MAC_AWARE, cr) != 0)) {
5140 if ((mntzone == global_zone) ||
5141 !blequal(mntlabel, server_sl))
5142 retv = -1; /* read-only */
5143 else
5144 retv = 0; /* access OK */
5145 } else if (bldominates(mntlabel, server_sl)) {
5146 retv = -1; /* read-only */
5147 } else {
5148 retv = EACCES;
5149 }
5150
5151 if (tsl != NULL)
5152 label_rele(tsl);
5153
5154 rel_tpc:
5155 TPC_RELE(tp);
5156 out:
5157 if (mntzone)
5158 zone_rele(mntzone);
5159 label_rele(zlabel);
5160 return (retv);
5161 }
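/*
 * Illustrative handling of the tri-state return value in a mount path
 * (a sketch; the surrounding mount code is simplified, and the read-only
 * handling shown is an assumption about how a caller reacts to read-down
 * access):
 *
 *	error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *	    svp->sv_knconf, cr);
 *	if (error > 0)
 *		return (error);		MAC check denied the mount
 *	if (error == -1)
 *		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 */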
5162
5163 boolean_t
5164 nfs_has_ctty(void)
5165 {
5166 boolean_t rv;
5167 mutex_enter(&curproc->p_splock);
5168 rv = (curproc->p_sessp->s_vp != NULL);
5169 mutex_exit(&curproc->p_splock);
5170 return (rv);
5171 }
5172
5173 /*
5174  * Look in the xattr directory to see if it has any generic user attributes.
5175 */
5176 int
5177 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5178 {
5179 struct uio uio;
5180 struct iovec iov;
5181 char *dbuf;
5182 struct dirent64 *dp;
5183 size_t dlen = 8 * 1024;
5184 size_t dbuflen;
5185 int eof = 0;
5186 int error;
5187
5188 *valp = 0;
5189 dbuf = kmem_alloc(dlen, KM_SLEEP);
5190 uio.uio_iov = &iov;
5191 uio.uio_iovcnt = 1;
5192 uio.uio_segflg = UIO_SYSSPACE;
5193 uio.uio_fmode = 0;
5194 uio.uio_extflg = UIO_COPY_CACHED;
5195 uio.uio_loffset = 0;
5196 uio.uio_resid = dlen;
5197 iov.iov_base = dbuf;
5198 iov.iov_len = dlen;
5199 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5200 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5201 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5202
5203 dbuflen = dlen - uio.uio_resid;
5204
5205 if (error || dbuflen == 0) {
5206 kmem_free(dbuf, dlen);
5207 return (error);
5208 }
5209
5210 dp = (dirent64_t *)dbuf;
5211
5212 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5213 if (strcmp(dp->d_name, ".") == 0 ||
5214 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5215 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5216 VIEW_READONLY) == 0) {
5217 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5218 continue;
5219 }
5220
5221 *valp = 1;
5222 break;
5223 }
5224 kmem_free(dbuf, dlen);
5225 return (0);
5226 }
5227
5228 /*
5229  * Return non-zero if vp is an empty directory that is used as a ZFS mount
5230  * point.  The NFSv2 and NFSv3 servers should not allow writes to such
5231  * directories.
5232 */
5233 int
5234 protect_zfs_mntpt(vnode_t *vp)
5235 {
5236 int error;
5237 vfs_t *vfsp;
5238 struct uio uio;
5239 struct iovec iov;
5240 int eof;
5241 size_t len = 8 * 1024;
5242 char *buf;
5243
5244 if (vp->v_type != VDIR || vn_ismntpt(vp) == 0)
5245 return (0);
5246
5247 error = vn_vfsrlock_wait(vp);
5248 if (error != 0)
5249 return (error);
5250
5251 /*
5252 * We protect ZFS mount points only
5253 */
5254 if ((vfsp = vn_mountedvfs(vp)) == NULL ||
5255 strncmp(vfssw[vfsp->vfs_fstype].vsw_name, "zfs", 3) != 0) {
5256 vn_vfsunlock(vp);
5257 return (0);
5258 }
5259
5260 vn_vfsunlock(vp);
5261
5262 buf = kmem_alloc(len, KM_SLEEP);
5263
5264 uio.uio_iov = &iov;
5265 uio.uio_iovcnt = 1;
5266 uio.uio_segflg = UIO_SYSSPACE;
5267 uio.uio_fmode = 0;
5268 uio.uio_extflg = UIO_COPY_CACHED;
5269 uio.uio_loffset = 0;
5270 uio.uio_llimit = MAXOFFSET_T;
5271
5272 eof = 0;
5273
5274 do {
5275 size_t rlen;
5276 dirent64_t *dp;
5277
5278 uio.uio_resid = len;
5279 iov.iov_base = buf;
5280 iov.iov_len = len;
5281
5282 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5283 error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
5284 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5285
5286 if (error != 0)
5287 break;
5288
5289 error = EBUSY;
5290
5291 rlen = len - uio.uio_resid;
5292 if (rlen == 0)
5293 break;
5294
5295 for (dp = (dirent64_t *)buf;
5296 (intptr_t)dp < (intptr_t)buf + rlen;
5297 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
5298 if (strcmp(dp->d_name, ".") != 0 &&
5299 strcmp(dp->d_name, "..") != 0) {
5300 error = 0;
5301 break;
5302 }
5303 }
5304 } while (eof == 0 && error != 0);
5305
5306 kmem_free(buf, len);
5307
5308 return (error);
5309 }
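/*
 * Illustrative use (a sketch): a server-side modify path would call this
 * before granting a write-style operation on a directory, propagating or
 * mapping the error (typically EBUSY) into whatever status the protocol
 * expects.
 *
 *	error = protect_zfs_mntpt(dvp);
 *	if (error != 0)
 *		return (error);
 */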