NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
--- old/usr/src/uts/common/fs/nfs/nfs_subr.c
+++ new/usr/src/uts/common/fs/nfs/nfs_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 +
21 22 /*
22 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 24 * Use is subject to license terms.
24 25 */
25 26
26 27 /*
27 - * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
28 + * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 29 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
29 30 */
30 31
31 32 #include <sys/param.h>
32 33 #include <sys/types.h>
33 34 #include <sys/systm.h>
34 35 #include <sys/cred.h>
35 36 #include <sys/proc.h>
36 37 #include <sys/user.h>
37 38 #include <sys/time.h>
38 39 #include <sys/buf.h>
39 40 #include <sys/vfs.h>
40 41 #include <sys/vnode.h>
41 42 #include <sys/socket.h>
42 43 #include <sys/uio.h>
43 44 #include <sys/tiuser.h>
44 45 #include <sys/swap.h>
45 46 #include <sys/errno.h>
46 47 #include <sys/debug.h>
47 48 #include <sys/kmem.h>
48 49 #include <sys/kstat.h>
49 50 #include <sys/cmn_err.h>
50 51 #include <sys/vtrace.h>
51 52 #include <sys/session.h>
52 53 #include <sys/dnlc.h>
53 54 #include <sys/bitmap.h>
54 55 #include <sys/acl.h>
55 56 #include <sys/ddi.h>
56 57 #include <sys/pathname.h>
57 58 #include <sys/flock.h>
58 59 #include <sys/dirent.h>
59 60 #include <sys/flock.h>
60 61 #include <sys/callb.h>
61 62 #include <sys/atomic.h>
62 63 #include <sys/list.h>
63 64 #include <sys/tsol/tnet.h>
64 65 #include <sys/priv.h>
65 66 #include <sys/sdt.h>
66 67 #include <sys/attr.h>
67 68
68 69 #include <inet/ip6.h>
69 70
70 71 #include <rpc/types.h>
71 72 #include <rpc/xdr.h>
72 73 #include <rpc/auth.h>
73 74 #include <rpc/clnt.h>
74 75
75 76 #include <nfs/nfs.h>
76 77 #include <nfs/nfs4.h>
77 78 #include <nfs/nfs_clnt.h>
78 79 #include <nfs/rnode.h>
79 80 #include <nfs/nfs_acl.h>
80 81
81 82 #include <sys/tsol/label.h>
82 83
83 84 /*
84 85 * The hash queues for the access to active and cached rnodes
85 86 * are organized as doubly linked lists. A reader/writer lock
86 87 * for each hash bucket is used to control access and to synchronize
87 88 * lookups, additions, and deletions from the hash queue.
88 89 *
89 90 * The rnode freelist is organized as a doubly linked list with
90 91 * a head pointer. Additions and deletions are synchronized via
91 92 * a single mutex.
92 93 *
93 94 * In order to add an rnode to the free list, it must be hashed into
94 95 * a hash queue and the exclusive lock to the hash queue be held.
95 96 * If an rnode is not hashed into a hash queue, then it is destroyed
96 97 * because it represents no valuable information that can be reused
97 98 * about the file. The exclusive lock to the hash queue must be
98 99 * held in order to prevent a lookup in the hash queue from finding
99 100 * the rnode and using it and assuming that the rnode is not on the
100 101 * freelist. The lookup in the hash queue will have the hash queue
101 102 * locked, either exclusive or shared.
102 103 *
103 104 * The vnode reference count for each rnode is not allowed to drop
104 105 * below 1. This prevents external entities, such as the VM
105 106 * subsystem, from acquiring references to vnodes already on the
106 107 * freelist and then trying to place them back on the freelist
107 108 * when their reference is released. This means that when an
108 109 * rnode is looked up in the hash queues, then either the rnode
109 110 * is removed from the freelist and that reference is transferred to
110 111 * the new reference or the vnode reference count must be incremented
111 112 * accordingly. The mutex for the freelist must be held in order to
112 113 * accurately test to see if the rnode is on the freelist or not.
113 114 * The hash queue lock might be held shared and it is possible that
114 115 * two different threads may race to remove the rnode from the
115 116 * freelist. This race can be resolved by holding the mutex for the
116 117 * freelist. Please note that the mutex for the freelist does not
117 118 * need to be held if the rnode is not on the freelist. It cannot be
118 119 * placed on the freelist due to the requirement that the thread
119 120 * putting the rnode on the freelist must hold the exclusive lock
120 121 * to the hash queue and the thread doing the lookup in the hash
121 122 * queue is holding either a shared or exclusive lock to the hash
122 123 * queue.
123 124 *
124 125 * The lock ordering is:
125 126 *
126 127 * hash bucket lock -> vnode lock
127 128 * hash bucket lock -> freelist lock
128 129 */
129 130 static rhashq_t *rtable;
130 131
131 132 static kmutex_t rpfreelist_lock;
132 133 static rnode_t *rpfreelist = NULL;
133 134 static long rnew = 0;
134 -long nrnode = 0;
135 +volatile long nrnode = 0;
135 136
136 137 static int rtablesize;
137 138 static int rtablemask;
138 139
139 140 static int hashlen = 4;
140 141
141 142 static struct kmem_cache *rnode_cache;
142 143
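To make the locking protocol described above concrete, here is a minimal sketch; it is an illustration only, not code from this file, and the field and helper names (r_lock in rhashq_t, r_freef, rp_rmfree(), RTOV()) follow the usual rnode conventions but should be treated as assumptions:

static void
example_claim_rnode(rhashq_t *hq, rnode_t *rp)
{
	rw_enter(&hq->r_lock, RW_READER);	/* hash bucket lock first */
	mutex_enter(&rpfreelist_lock);		/* then the freelist mutex */
	if (rp->r_freef != NULL) {
		/* On the freelist: reuse the freelist's reference. */
		rp_rmfree(rp);
	} else {
		/* Not on the freelist: take an additional vnode hold. */
		VN_HOLD(RTOV(rp));
	}
	mutex_exit(&rpfreelist_lock);
	rw_exit(&hq->r_lock);
}

The hash bucket lock is acquired before the freelist mutex, matching the lock ordering listed above.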
143 144 /*
144 145 * Mutex to protect the following variables:
145 146 * nfs_major
146 147 * nfs_minor
147 148 */
148 149 kmutex_t nfs_minor_lock;
149 150 int nfs_major;
150 151 int nfs_minor;
151 152
152 -/* Do we allow preepoch (negative) time values otw? */
153 -bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
153 +/*
154 + * Do we allow preepoch (negative) time values otw?
155 + * default: do not allow preepoch
156 + */
157 +volatile bool_t nfs_allow_preepoch_time = FALSE;
154 158
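The volatile qualifiers added to tunables throughout this change (see "Tunables needs volatile keyword" above) keep the compiler honest about external modification: a variable that is only ever patched from outside the module, at run time with mdb/kmdb or at boot via /etc/system, looks constant to the optimizer, which may then cache it in a register or fold in its initializer so the patched value is never observed. A minimal sketch of that failure mode, using a hypothetical tunable:

/* Hypothetical tunable, for illustration only; not part of this file. */
volatile int example_tunable = 0;

static void
example_wait_for_tunable(void)
{
	/*
	 * The volatile qualifier forces a fresh load of example_tunable
	 * on every iteration, so a run-time patch is observed promptly.
	 * Without it the compiler may hoist the load out of the loop and
	 * spin on a stale register copy forever.
	 */
	while (example_tunable == 0)
		delay(hz);
}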
155 159 /*
156 160 * Access cache
157 161 */
158 162 static acache_hash_t *acache;
159 -static long nacache; /* used strictly to size the number of hash queues */
163 +volatile long nacache; /* used strictly to size the number of hash queues */
160 164
161 165 static int acachesize;
162 166 static int acachemask;
163 167 static struct kmem_cache *acache_cache;
164 168
165 169 /*
166 170 * Client side utilities
167 171 */
168 172
169 173 /*
170 174 * client side statistics
171 175 */
172 176 static const struct clstat clstat_tmpl = {
173 177 { "calls", KSTAT_DATA_UINT64 },
174 178 { "badcalls", KSTAT_DATA_UINT64 },
175 179 { "clgets", KSTAT_DATA_UINT64 },
176 180 { "cltoomany", KSTAT_DATA_UINT64 },
177 181 #ifdef DEBUG
178 182 { "clalloc", KSTAT_DATA_UINT64 },
179 183 { "noresponse", KSTAT_DATA_UINT64 },
180 184 { "failover", KSTAT_DATA_UINT64 },
181 185 { "remap", KSTAT_DATA_UINT64 },
182 186 #endif
183 187 };
184 188
185 189 /*
186 190 * The following are statistics that describe behavior of the system as a whole
187 191 * and don't correspond to any one particular zone.
188 192 */
189 193 #ifdef DEBUG
190 194 static struct clstat_debug {
191 195 kstat_named_t nrnode; /* number of allocated rnodes */
192 196 kstat_named_t access; /* size of access cache */
193 197 kstat_named_t dirent; /* size of readdir cache */
194 198 kstat_named_t dirents; /* size of readdir buf cache */
195 199 kstat_named_t reclaim; /* number of reclaims */
196 200 kstat_named_t clreclaim; /* number of cl reclaims */
197 201 kstat_named_t f_reclaim; /* number of free reclaims */
198 202 kstat_named_t a_reclaim; /* number of active reclaims */
199 203 kstat_named_t r_reclaim; /* number of rnode reclaims */
200 204 kstat_named_t rpath; /* bytes used to store rpaths */
201 205 } clstat_debug = {
202 206 { "nrnode", KSTAT_DATA_UINT64 },
203 207 { "access", KSTAT_DATA_UINT64 },
204 208 { "dirent", KSTAT_DATA_UINT64 },
205 209 { "dirents", KSTAT_DATA_UINT64 },
206 210 { "reclaim", KSTAT_DATA_UINT64 },
207 211 { "clreclaim", KSTAT_DATA_UINT64 },
208 212 { "f_reclaim", KSTAT_DATA_UINT64 },
209 213 { "a_reclaim", KSTAT_DATA_UINT64 },
210 214 { "r_reclaim", KSTAT_DATA_UINT64 },
211 215 { "r_path", KSTAT_DATA_UINT64 },
212 216 };
213 217 #endif /* DEBUG */
214 218
215 219 /*
216 220 * We keep a global list of per-zone client data, so we can clean up all zones
217 221 * if we get low on memory.
218 222 */
219 223 static list_t nfs_clnt_list;
220 224 static kmutex_t nfs_clnt_list_lock;
221 225 static zone_key_t nfsclnt_zone_key;
222 226
223 227 static struct kmem_cache *chtab_cache;
224 228
225 229 /*
226 230 * Some servers do not properly update the attributes of the
227 231 * directory when changes are made. To allow interoperability
228 232 * with these broken servers, the nfs_disable_rddir_cache
229 233 * parameter must be set in /etc/system
230 234 */
231 -int nfs_disable_rddir_cache = 0;
235 +volatile int nfs_disable_rddir_cache = 0;
232 236
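For reference, such a setting normally uses the standard set module:variable syntax, for example a line like set nfs:nfs_disable_rddir_cache = 1 in /etc/system; the nfs module prefix is the conventional one for this code but is an assumption here, since the module name is not shown in this hunk.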
233 237 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 238 struct chtab **);
235 239 void clfree(CLIENT *, struct chtab *);
236 240 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
237 241 struct chtab **, struct nfs_clnt *);
238 242 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
239 243 struct chtab **, struct nfs_clnt *);
240 244 static void clreclaim(void *);
241 245 static int nfs_feedback(int, int, mntinfo_t *);
242 246 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 247 caddr_t, cred_t *, int *, enum clnt_stat *, int,
244 248 failinfo_t *);
245 249 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
246 250 caddr_t, cred_t *, int *, int, failinfo_t *);
247 251 static void rinactive(rnode_t *, cred_t *);
248 252 static int rtablehash(nfs_fhandle *);
249 253 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
250 254 struct vnodeops *,
251 255 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
252 256 cred_t *),
253 257 int (*)(const void *, const void *), int *, cred_t *,
254 258 char *, char *);
255 259 static void rp_rmfree(rnode_t *);
256 260 static void rp_addhash(rnode_t *);
257 261 static void rp_rmhash_locked(rnode_t *);
258 262 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
259 263 static void destroy_rnode(rnode_t *);
260 264 static void rddir_cache_free(rddir_cache *);
261 265 static int nfs_free_data_reclaim(rnode_t *);
262 266 static int nfs_active_data_reclaim(rnode_t *);
263 267 static int nfs_free_reclaim(void);
264 268 static int nfs_active_reclaim(void);
265 269 static int nfs_rnode_reclaim(void);
266 270 static void nfs_reclaim(void *);
267 271 static int failover_safe(failinfo_t *);
268 272 static void failover_newserver(mntinfo_t *mi);
269 273 static void failover_thread(mntinfo_t *mi);
270 274 static int failover_wait(mntinfo_t *);
271 275 static int failover_remap(failinfo_t *);
272 276 static int failover_lookup(char *, vnode_t *,
273 277 int (*)(vnode_t *, char *, vnode_t **,
274 278 struct pathname *, int, vnode_t *, cred_t *, int),
275 279 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
276 280 vnode_t **);
277 281 static void nfs_free_r_path(rnode_t *);
278 282 static void nfs_set_vroot(vnode_t *);
279 283 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
280 284
281 285 /*
282 286 * from rpcsec module (common/rpcsec)
283 287 */
284 288 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
285 289 extern void sec_clnt_freeh(AUTH *);
286 290 extern void sec_clnt_freeinfo(struct sec_data *);
287 291
288 292 /*
289 293 * used in mount policy
290 294 */
291 295 extern ts_label_t *getflabel_cipso(vfs_t *);
292 296
293 297 /*
294 298 * EIO or EINTR are not recoverable errors.
295 299 */
296 300 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
297 301
298 302 #ifdef DEBUG
299 303 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
300 304 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
301 305 #else
302 306 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
303 307 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
304 308 #endif
305 309 /*
306 310 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
307 311 */
308 312 static int
309 313 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
310 314 struct chtab **chp, struct nfs_clnt *nfscl)
311 315 {
312 316 struct chhead *ch, *newch;
313 317 struct chhead **plistp;
314 318 struct chtab *cp;
315 319 int error;
316 320 k_sigset_t smask;
317 321
318 322 if (newcl == NULL || chp == NULL || ci == NULL)
319 323 return (EINVAL);
320 324
321 325 *newcl = NULL;
322 326 *chp = NULL;
323 327
324 328 /*
325 329 * Find an unused handle or create one
326 330 */
327 331 newch = NULL;
328 332 nfscl->nfscl_stat.clgets.value.ui64++;
329 333 top:
330 334 /*
331 335 * Find the correct entry in the cache to check for free
332 336 * client handles. The search is based on the RPC program
333 337 * number, program version number, dev_t for the transport
334 338 * device, and the protocol family.
335 339 */
336 340 mutex_enter(&nfscl->nfscl_chtable_lock);
337 341 plistp = &nfscl->nfscl_chtable;
338 342 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
339 343 if (ch->ch_prog == ci->cl_prog &&
340 344 ch->ch_vers == ci->cl_vers &&
341 345 ch->ch_dev == svp->sv_knconf->knc_rdev &&
342 346 (strcmp(ch->ch_protofmly,
343 347 svp->sv_knconf->knc_protofmly) == 0))
344 348 break;
345 349 plistp = &ch->ch_next;
346 350 }
347 351
348 352 /*
349 353 * If we didn't find a cache entry for this quadruple, then
350 354 * create one. If we don't have one already preallocated,
351 355 * then drop the cache lock, create one, and then start over.
352 356 * If we did have a preallocated entry, then just add it to
353 357 * the front of the list.
354 358 */
355 359 if (ch == NULL) {
356 360 if (newch == NULL) {
357 361 mutex_exit(&nfscl->nfscl_chtable_lock);
358 362 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
359 363 newch->ch_timesused = 0;
360 364 newch->ch_prog = ci->cl_prog;
361 365 newch->ch_vers = ci->cl_vers;
362 366 newch->ch_dev = svp->sv_knconf->knc_rdev;
363 367 newch->ch_protofmly = kmem_alloc(
364 368 strlen(svp->sv_knconf->knc_protofmly) + 1,
365 369 KM_SLEEP);
366 370 (void) strcpy(newch->ch_protofmly,
367 371 svp->sv_knconf->knc_protofmly);
368 372 newch->ch_list = NULL;
369 373 goto top;
370 374 }
371 375 ch = newch;
372 376 newch = NULL;
373 377 ch->ch_next = nfscl->nfscl_chtable;
374 378 nfscl->nfscl_chtable = ch;
375 379 /*
376 380 * We found a cache entry, but if it isn't on the front of the
377 381 * list, then move it to the front of the list to try to take
378 382 * advantage of locality of operations.
379 383 */
380 384 } else if (ch != nfscl->nfscl_chtable) {
381 385 *plistp = ch->ch_next;
382 386 ch->ch_next = nfscl->nfscl_chtable;
383 387 nfscl->nfscl_chtable = ch;
384 388 }
385 389
386 390 /*
387 391 * If there was a free client handle cached, then remove it
388 392 * from the list, init it, and use it.
389 393 */
390 394 if (ch->ch_list != NULL) {
391 395 cp = ch->ch_list;
392 396 ch->ch_list = cp->ch_list;
393 397 mutex_exit(&nfscl->nfscl_chtable_lock);
394 398 if (newch != NULL) {
395 399 kmem_free(newch->ch_protofmly,
396 400 strlen(newch->ch_protofmly) + 1);
397 401 kmem_free(newch, sizeof (*newch));
398 402 }
399 403 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
400 404 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
401 405 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
402 406 &cp->ch_client->cl_auth);
403 407 if (error || cp->ch_client->cl_auth == NULL) {
404 408 CLNT_DESTROY(cp->ch_client);
405 409 kmem_cache_free(chtab_cache, cp);
406 410 return ((error != 0) ? error : EINTR);
407 411 }
408 412 ch->ch_timesused++;
409 413 *newcl = cp->ch_client;
410 414 *chp = cp;
411 415 return (0);
412 416 }
413 417
414 418 /*
415 419 * There weren't any free client handles which fit, so allocate
416 420 * a new one and use that.
417 421 */
418 422 #ifdef DEBUG
419 423 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
420 424 #endif
421 425 mutex_exit(&nfscl->nfscl_chtable_lock);
422 426
423 427 nfscl->nfscl_stat.cltoomany.value.ui64++;
424 428 if (newch != NULL) {
425 429 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
426 430 kmem_free(newch, sizeof (*newch));
427 431 }
428 432
429 433 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
430 434 cp->ch_head = ch;
431 435
432 436 sigintr(&smask, (int)ci->cl_flags & MI_INT);
433 437 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
434 438 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
435 439 sigunintr(&smask);
436 440
437 441 if (error != 0) {
438 442 kmem_cache_free(chtab_cache, cp);
439 443 #ifdef DEBUG
440 444 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
441 445 #endif
442 446 /*
443 447 * Warning is unnecessary if error is EINTR.
444 448 */
445 449 if (error != EINTR) {
446 450 nfs_cmn_err(error, CE_WARN,
447 451 "clget: couldn't create handle: %m\n");
448 452 }
449 453 return (error);
450 454 }
451 455 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
452 456 auth_destroy(cp->ch_client->cl_auth);
453 457 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
454 458 &cp->ch_client->cl_auth);
455 459 if (error || cp->ch_client->cl_auth == NULL) {
456 460 CLNT_DESTROY(cp->ch_client);
457 461 kmem_cache_free(chtab_cache, cp);
458 462 #ifdef DEBUG
459 463 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
460 464 #endif
461 465 return ((error != 0) ? error : EINTR);
462 466 }
463 467 ch->ch_timesused++;
464 468 *newcl = cp->ch_client;
465 469 ASSERT(cp->ch_client->cl_nosignal == FALSE);
466 470 *chp = cp;
467 471 return (0);
468 472 }
469 473
470 474 int
471 475 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
472 476 struct chtab **chp)
473 477 {
474 478 struct nfs_clnt *nfscl;
475 479
476 480 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
477 481 ASSERT(nfscl != NULL);
478 482
479 483 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
480 484 }
481 485
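For context, callers pair clget() with clfree(). A minimal usage sketch follows; it is illustrative only, with the clinfo_t setup mirroring what nfs_clget() does further below and every value taken from the mount's mntinfo_t:

static int
example_use_handle_cache(mntinfo_t *mi, servinfo_t *svp, cred_t *cr)
{
	CLIENT *cl;
	struct chtab *ch;
	clinfo_t ci;
	int error;

	ci.cl_prog = mi->mi_prog;	/* RPC program, version, sizes */
	ci.cl_vers = mi->mi_vers;
	ci.cl_readsize = mi->mi_tsize;
	ci.cl_retrans = mi->mi_retrans;
	ci.cl_flags = mi->mi_flags;

	error = clget(&ci, svp, cr, &cl, &ch);
	if (error != 0)
		return (error);

	/* ... issue the RPC via CLNT_CALL(cl, ...) ... */

	clfree(cl, ch);		/* return the handle to the per-zone cache */
	return (0);
}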
482 486 static int
483 487 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
484 488 struct chtab **chp, struct nfs_clnt *nfscl)
485 489 {
486 490 clinfo_t ci;
487 491 int error;
488 492
489 493 /*
490 494 * Set read buffer size to rsize
491 495 * and add room for RPC headers.
492 496 */
493 497 ci.cl_readsize = mi->mi_tsize;
494 498 if (ci.cl_readsize != 0)
495 499 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
496 500
497 501 /*
498 502 * If soft mount and server is down just try once.
499 503 * meaning: do not retransmit.
500 504 */
501 505 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
502 506 ci.cl_retrans = 0;
503 507 else
504 508 ci.cl_retrans = mi->mi_retrans;
505 509
506 510 ci.cl_prog = NFS_ACL_PROGRAM;
507 511 ci.cl_vers = mi->mi_vers;
508 512 ci.cl_flags = mi->mi_flags;
509 513
510 514 /*
511 515 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
512 516 * security flavor, the client tries to establish a security context
513 517 * by contacting the server. If the connection is timed out or reset,
514 518 * e.g. server reboot, we will try again.
515 519 */
516 520 do {
517 521 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
518 522
519 523 if (error == 0)
520 524 break;
521 525
522 526 /*
523 527 * For forced unmount or zone shutdown, bail out, no retry.
524 528 */
525 529 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
526 530 error = EIO;
527 531 break;
528 532 }
529 533
530 534 /* do not retry for softmount */
531 535 if (!(mi->mi_flags & MI_HARD))
532 536 break;
533 537
534 538 /* let the caller deal with the failover case */
535 539 if (FAILOVER_MOUNT(mi))
536 540 break;
537 541
538 542 } while (error == ETIMEDOUT || error == ECONNRESET);
539 543
540 544 return (error);
541 545 }
542 546
543 547 static int
544 548 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
545 549 struct chtab **chp, struct nfs_clnt *nfscl)
546 550 {
547 551 clinfo_t ci;
548 552 int error;
549 553
550 554 /*
551 555 * Set read buffer size to rsize
552 556 * and add room for RPC headers.
553 557 */
554 558 ci.cl_readsize = mi->mi_tsize;
555 559 if (ci.cl_readsize != 0)
556 560 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
557 561
558 562 /*
559 563 * If soft mount and server is down just try once.
560 564 * meaning: do not retransmit.
561 565 */
562 566 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
563 567 ci.cl_retrans = 0;
564 568 else
565 569 ci.cl_retrans = mi->mi_retrans;
566 570
567 571 ci.cl_prog = mi->mi_prog;
568 572 ci.cl_vers = mi->mi_vers;
569 573 ci.cl_flags = mi->mi_flags;
570 574
571 575 /*
572 576 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
573 577 * security flavor, the client tries to establish a security context
574 578 * by contacting the server. If the connection is timed out or reset,
575 579 * e.g. server reboot, we will try again.
576 580 */
577 581 do {
578 582 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
579 583
580 584 if (error == 0)
581 585 break;
582 586
583 587 /*
584 588 * For forced unmount or zone shutdown, bail out, no retry.
585 589 */
586 590 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
587 591 error = EIO;
588 592 break;
589 593 }
590 594
591 595 /* do not retry for softmount */
592 596 if (!(mi->mi_flags & MI_HARD))
593 597 break;
594 598
595 599 /* let the caller deal with the failover case */
596 600 if (FAILOVER_MOUNT(mi))
597 601 break;
598 602
599 603 } while (error == ETIMEDOUT || error == ECONNRESET);
600 604
601 605 return (error);
602 606 }
603 607
604 608 static void
605 609 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
606 610 {
607 611 if (cl->cl_auth != NULL) {
608 612 sec_clnt_freeh(cl->cl_auth);
609 613 cl->cl_auth = NULL;
610 614 }
611 615
612 616 /*
613 617 * Timestamp this cache entry so that we know when it was last
614 618 * used.
615 619 */
616 620 cp->ch_freed = gethrestime_sec();
617 621
618 622 /*
619 623 * Add the free client handle to the front of the list.
620 624 * This way, the list will be sorted in youngest to oldest
621 625 * order.
622 626 */
623 627 mutex_enter(&nfscl->nfscl_chtable_lock);
624 628 cp->ch_list = cp->ch_head->ch_list;
625 629 cp->ch_head->ch_list = cp;
626 630 mutex_exit(&nfscl->nfscl_chtable_lock);
627 631 }
628 632
629 633 void
630 634 clfree(CLIENT *cl, struct chtab *cp)
631 635 {
632 636 struct nfs_clnt *nfscl;
633 637
634 638 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
635 639 ASSERT(nfscl != NULL);
636 640
637 641 clfree_impl(cl, cp, nfscl);
638 642 }
639 643
640 644 #define CL_HOLDTIME 60 /* time to hold client handles */
641 645
642 646 static void
643 647 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
644 648 {
645 649 struct chhead *ch;
646 650 struct chtab *cp; /* list of objects that can be reclaimed */
647 651 struct chtab *cpe;
648 652 struct chtab *cpl;
649 653 struct chtab **cpp;
650 654 #ifdef DEBUG
651 655 int n = 0;
652 656 #endif
653 657
654 658 /*
655 659 * Need to reclaim some memory, so step through the cache
656 660 * looking through the lists for entries which can be freed.
657 661 */
658 662 cp = NULL;
659 663
660 664 mutex_enter(&nfscl->nfscl_chtable_lock);
661 665
662 666 /*
663 667 * Here we step through each non-NULL quadruple and start to
664 668 * construct the reclaim list pointed to by cp. Note that
665 669 * cp will contain all eligible chtab entries. When this traversal
666 670 * completes, chtab entries from the last quadruple will be at the
667 671 * front of cp and entries from previously inspected quadruples have
668 672 * been appended to the rear of cp.
669 673 */
670 674 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
671 675 if (ch->ch_list == NULL)
672 676 continue;
673 677 /*
674 678 * Search each list for entries older than
675 679 * cl_holdtime seconds. The lists are maintained
676 680 * in youngest to oldest order so that when the
677 681 * first entry is found which is old enough, then
678 682 * all of the rest of the entries on the list will
679 683 * be old enough as well.
680 684 */
681 685 cpl = ch->ch_list;
682 686 cpp = &ch->ch_list;
683 687 while (cpl != NULL &&
684 688 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
685 689 cpp = &cpl->ch_list;
686 690 cpl = cpl->ch_list;
687 691 }
688 692 if (cpl != NULL) {
689 693 *cpp = NULL;
690 694 if (cp != NULL) {
691 695 cpe = cpl;
692 696 while (cpe->ch_list != NULL)
693 697 cpe = cpe->ch_list;
694 698 cpe->ch_list = cp;
695 699 }
696 700 cp = cpl;
697 701 }
698 702 }
699 703
700 704 mutex_exit(&nfscl->nfscl_chtable_lock);
701 705
702 706 /*
703 707 * If cp is empty, then there is nothing to reclaim here.
704 708 */
705 709 if (cp == NULL)
706 710 return;
707 711
708 712 /*
709 713 * Step through the list of entries to free, destroying each client
710 714 * handle and kmem_free'ing the memory for each entry.
711 715 */
712 716 while (cp != NULL) {
713 717 #ifdef DEBUG
714 718 n++;
715 719 #endif
716 720 CLNT_DESTROY(cp->ch_client);
717 721 cpl = cp->ch_list;
718 722 kmem_cache_free(chtab_cache, cp);
719 723 cp = cpl;
720 724 }
721 725
722 726 #ifdef DEBUG
723 727 /*
724 728 * Update clalloc so that nfsstat shows the current number
725 729 * of allocated client handles.
726 730 */
727 731 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
728 732 #endif
729 733 }
730 734
731 735 /* ARGSUSED */
732 736 static void
733 737 clreclaim(void *all)
734 738 {
735 739 struct nfs_clnt *nfscl;
736 740
737 741 #ifdef DEBUG
738 742 clstat_debug.clreclaim.value.ui64++;
739 743 #endif
740 744 /*
741 745 * The system is low on memory; go through and try to reclaim some from
742 746 * every zone on the system.
743 747 */
744 748 mutex_enter(&nfs_clnt_list_lock);
745 749 nfscl = list_head(&nfs_clnt_list);
746 750 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
747 751 clreclaim_zone(nfscl, CL_HOLDTIME);
748 752 mutex_exit(&nfs_clnt_list_lock);
749 753 }
750 754
751 755 /*
752 756 * Minimum time-out values indexed by call type
753 757 * These units are in "eights" of a second to avoid multiplies
754 758 */
755 759 static unsigned int minimum_timeo[] = {
756 760 6, 7, 10
757 761 };
758 762
759 763 /*
760 764 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
761 765 */
762 766 #define MAXTIMO (20*hz)
763 767 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
764 768 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
765 769
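A worked example of the timeout arithmetic above, assuming hz = 100: a minimum_timeo entry of 6 means six eighths of a second, so the (value * hz) >> 3 conversion used with CLNT_SETTIMERS below yields 75 ticks (0.75 s), and MAXTIMO is 20 * hz = 2000 ticks. A retransmit timeout that starts at, say, 125 ticks then doubles through backoff() until it pins at MAXTIMO (sketch only, values hypothetical):

static void
example_backoff_progression(void)
{
	int timeo = 125;	/* hypothetical starting timeout, in ticks */

	timeo = backoff(timeo);	/* 250 */
	timeo = backoff(timeo);	/* 500 */
	timeo = backoff(timeo);	/* 1000 */
	timeo = backoff(timeo);	/* 2000 == MAXTIMO when hz == 100 */
	timeo = backoff(timeo);	/* stays pinned at 2000 */
}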
766 770 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
767 771 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
768 772 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
769 773
770 774 /*
771 775 * Function called when rfscall notices that we have been
772 776 * re-transmitting, or when we get a response without retransmissions.
773 777 * Return 1 if the transfer size was adjusted down - 0 if no change.
774 778 */
775 779 static int
776 780 nfs_feedback(int flag, int which, mntinfo_t *mi)
777 781 {
778 782 int kind;
779 783 int r = 0;
780 784
781 785 mutex_enter(&mi->mi_lock);
782 786 if (flag == FEEDBACK_REXMIT1) {
783 787 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
784 788 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
785 789 goto done;
786 790 if (mi->mi_curread > MIN_NFS_TSIZE) {
787 791 mi->mi_curread /= 2;
788 792 if (mi->mi_curread < MIN_NFS_TSIZE)
789 793 mi->mi_curread = MIN_NFS_TSIZE;
790 794 r = 1;
791 795 }
792 796
793 797 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
794 798 mi->mi_curwrite /= 2;
795 799 if (mi->mi_curwrite < MIN_NFS_TSIZE)
796 800 mi->mi_curwrite = MIN_NFS_TSIZE;
797 801 r = 1;
798 802 }
799 803 } else if (flag == FEEDBACK_OK) {
800 804 kind = mi->mi_timer_type[which];
801 805 if (kind == 0 ||
802 806 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
803 807 goto done;
804 808 if (kind == 1) {
805 809 if (mi->mi_curread >= mi->mi_tsize)
806 810 goto done;
807 811 mi->mi_curread += MIN_NFS_TSIZE;
808 812 if (mi->mi_curread > mi->mi_tsize/2)
809 813 mi->mi_curread = mi->mi_tsize;
810 814 } else if (kind == 2) {
811 815 if (mi->mi_curwrite >= mi->mi_stsize)
812 816 goto done;
813 817 mi->mi_curwrite += MIN_NFS_TSIZE;
814 818 if (mi->mi_curwrite > mi->mi_stsize/2)
815 819 mi->mi_curwrite = mi->mi_stsize;
816 820 }
817 821 }
818 822 done:
819 823 mutex_exit(&mi->mi_lock);
820 824 return (r);
821 825 }
822 826
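As a concrete illustration of the feedback above (numbers hypothetical): with MIN_NFS_TSIZE at 512 bytes and a current read size of 32768, each FEEDBACK_REXMIT1 event halves mi_curread (16384, 8192, ... down to the 512 floor, and is skipped entirely when the current retransmit timer is nonzero and already under REDUCE_NFS_TIME), while FEEDBACK_OK events grow it back in 512-byte steps and snap it straight to mi_tsize once it exceeds mi_tsize / 2.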
823 827 #ifdef DEBUG
824 828 static int rfs2call_hits = 0;
825 829 static int rfs2call_misses = 0;
826 830 #endif
827 831
828 832 int
829 833 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
830 834 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
831 835 enum nfsstat *statusp, int flags, failinfo_t *fi)
832 836 {
833 837 int rpcerror;
834 838 enum clnt_stat rpc_status;
835 839
836 840 ASSERT(statusp != NULL);
837 841
838 842 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
839 843 cr, douprintf, &rpc_status, flags, fi);
840 844 if (!rpcerror) {
841 845 /*
842 846 * See crnetadjust() for comments.
843 847 */
844 848 if (*statusp == NFSERR_ACCES &&
845 849 (cr = crnetadjust(cr)) != NULL) {
846 850 #ifdef DEBUG
847 851 rfs2call_hits++;
848 852 #endif
849 853 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
850 854 resp, cr, douprintf, NULL, flags, fi);
851 855 crfree(cr);
852 856 #ifdef DEBUG
853 857 if (*statusp == NFSERR_ACCES)
854 858 rfs2call_misses++;
855 859 #endif
856 860 }
857 861 } else if (rpc_status == RPC_PROCUNAVAIL) {
858 862 *statusp = NFSERR_OPNOTSUPP;
859 863 rpcerror = 0;
860 864 }
861 865
862 866 return (rpcerror);
863 867 }
864 868
865 869 #define NFS3_JUKEBOX_DELAY 10 * hz
866 870
867 -static clock_t nfs3_jukebox_delay = 0;
871 +volatile clock_t nfs3_jukebox_delay = 0;
868 872
869 873 #ifdef DEBUG
870 874 static int rfs3call_hits = 0;
871 875 static int rfs3call_misses = 0;
872 876 #endif
873 877
874 878 int
875 879 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
876 880 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
877 881 nfsstat3 *statusp, int flags, failinfo_t *fi)
878 882 {
879 883 int rpcerror;
880 884 int user_informed;
881 885
882 886 user_informed = 0;
883 887 do {
884 888 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
885 889 cr, douprintf, NULL, flags, fi);
886 890 if (!rpcerror) {
887 891 cred_t *crr;
888 892 if (*statusp == NFS3ERR_JUKEBOX) {
889 893 if (ttoproc(curthread) == &p0) {
890 894 rpcerror = EAGAIN;
891 895 break;
892 896 }
893 897 if (!user_informed) {
894 898 user_informed = 1;
895 899 uprintf(
896 900 "file temporarily unavailable on the server, retrying...\n");
897 901 }
898 902 delay(nfs3_jukebox_delay);
899 903 }
900 904 /*
901 905 * See crnetadjust() for comments.
902 906 */
903 907 else if (*statusp == NFS3ERR_ACCES &&
904 908 (crr = crnetadjust(cr)) != NULL) {
905 909 #ifdef DEBUG
906 910 rfs3call_hits++;
907 911 #endif
908 912 rpcerror = rfscall(mi, which, xdrargs, argsp,
909 913 xdrres, resp, crr, douprintf,
910 914 NULL, flags, fi);
911 915
912 916 crfree(crr);
913 917 #ifdef DEBUG
914 918 if (*statusp == NFS3ERR_ACCES)
915 919 rfs3call_misses++;
916 920 #endif
917 921 }
918 922 }
919 923 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
920 924
921 925 return (rpcerror);
922 926 }
923 927
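In the jukebox path above, the client keeps retrying with a pause of nfs3_jukebox_delay ticks between attempts, except for threads belonging to p0, which get EAGAIN instead of sleeping. NFS3_JUKEBOX_DELAY is 10 * hz, i.e. ten seconds' worth of ticks; the tunable itself is declared 0 here and is presumably initialized from that macro elsewhere in the module, outside this hunk.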
924 928 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
925 929 #define INC_READERS(mi) { \
926 930 mi->mi_readers++; \
927 931 }
928 932 #define DEC_READERS(mi) { \
929 933 mi->mi_readers--; \
930 934 if (mi->mi_readers == 0) \
931 935 cv_broadcast(&mi->mi_failover_cv); \
932 936 }
933 937
934 938 static int
935 939 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
936 940 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
937 941 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
938 942 {
939 943 CLIENT *client;
940 944 struct chtab *ch;
941 945 cred_t *cr = icr;
942 946 enum clnt_stat status;
943 947 struct rpc_err rpcerr, rpcerr_tmp;
944 948 struct timeval wait;
945 949 int timeo; /* in units of hz */
946 950 int my_rsize, my_wsize;
947 951 bool_t tryagain;
948 952 bool_t cred_cloned = FALSE;
949 953 k_sigset_t smask;
950 954 servinfo_t *svp;
951 955 struct nfs_clnt *nfscl;
952 956 zoneid_t zoneid = getzoneid();
953 957 char *msg;
954 958 #ifdef DEBUG
955 959 char *bufp;
956 960 #endif
957 961
958 962
959 963 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
960 964 "rfscall_start:which %d mi %p", which, mi);
961 965
962 966 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
963 967 ASSERT(nfscl != NULL);
964 968
965 969 nfscl->nfscl_stat.calls.value.ui64++;
966 970 mi->mi_reqs[which].value.ui64++;
967 971
968 972 rpcerr.re_status = RPC_SUCCESS;
969 973
970 974 /*
971 975 * In case of forced unmount or zone shutdown, return EIO.
972 976 */
973 977
974 978 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
975 979 rpcerr.re_status = RPC_FAILED;
976 980 rpcerr.re_errno = EIO;
977 981 return (rpcerr.re_errno);
978 982 }
979 983
980 984 /*
981 985 * Remember the transfer sizes in case
982 986 * nfs_feedback changes them underneath us.
983 987 */
984 988 my_rsize = mi->mi_curread;
985 989 my_wsize = mi->mi_curwrite;
986 990
987 991 /*
988 992 * NFS client failover support
989 993 *
990 994 * If this rnode is not in sync with the current server (VALID_FH),
991 995 * we'd like to do a remap to get in sync. We can be interrupted
992 996 * in failover_remap(), and if so we'll bail. Otherwise, we'll
993 997 * use the best info we have to try the RPC. Part of that is
994 998 * unconditionally updating the filehandle copy kept for V3.
995 999 *
996 1000 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
997 1001 * rw_enter(); we're trying to keep the current server from being
998 1002 * changed on us until we're done with the remapping and have a
999 1003 * matching client handle. We don't want to send a filehandle
1000 1004 * to the wrong host.
1001 1005 */
1002 1006 failoverretry:
1003 1007 if (FAILOVER_MOUNT(mi)) {
1004 1008 mutex_enter(&mi->mi_lock);
1005 1009 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1006 1010 if (failover_wait(mi)) {
1007 1011 mutex_exit(&mi->mi_lock);
1008 1012 return (EINTR);
1009 1013 }
1010 1014 }
1011 1015 INC_READERS(mi);
1012 1016 mutex_exit(&mi->mi_lock);
1013 1017 if (fi) {
1014 1018 if (!VALID_FH(fi) &&
1015 1019 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1016 1020 int remaperr;
1017 1021
1018 1022 svp = mi->mi_curr_serv;
1019 1023 remaperr = failover_remap(fi);
1020 1024 if (remaperr != 0) {
1021 1025 #ifdef DEBUG
1022 1026 if (remaperr != EINTR)
1023 1027 nfs_cmn_err(remaperr, CE_WARN,
1024 1028 "rfscall couldn't failover: %m");
1025 1029 #endif
1026 1030 mutex_enter(&mi->mi_lock);
1027 1031 DEC_READERS(mi);
1028 1032 mutex_exit(&mi->mi_lock);
1029 1033 /*
1030 1034 * If failover_remap returns ETIMEDOUT
1031 1035 * and the filesystem is hard mounted
1032 1036 * we have to retry the call with a new
1033 1037 * server.
1034 1038 */
1035 1039 if ((mi->mi_flags & MI_HARD) &&
1036 1040 IS_RECOVERABLE_ERROR(remaperr)) {
1037 1041 if (svp == mi->mi_curr_serv)
1038 1042 failover_newserver(mi);
1039 1043 rpcerr.re_status = RPC_SUCCESS;
1040 1044 goto failoverretry;
1041 1045 }
1042 1046 rpcerr.re_errno = remaperr;
1043 1047 return (remaperr);
1044 1048 }
1045 1049 }
1046 1050 if (fi->fhp && fi->copyproc)
1047 1051 (*fi->copyproc)(fi->fhp, fi->vp);
1048 1052 }
1049 1053 }
1050 1054
1051 1055 /* For TSOL, use a new cred which has net_mac_aware flag */
1052 1056 if (!cred_cloned && is_system_labeled()) {
1053 1057 cred_cloned = TRUE;
1054 1058 cr = crdup(icr);
1055 1059 (void) setpflags(NET_MAC_AWARE, 1, cr);
1056 1060 }
1057 1061
1058 1062 /*
1059 1063 * clget() calls clnt_tli_kinit() which clears the xid, so we
1060 1064 * are guaranteed to reprocess the retry as a new request.
1061 1065 */
1062 1066 svp = mi->mi_curr_serv;
1063 1067 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1064 1068
1065 1069 if (FAILOVER_MOUNT(mi)) {
1066 1070 mutex_enter(&mi->mi_lock);
1067 1071 DEC_READERS(mi);
1068 1072 mutex_exit(&mi->mi_lock);
1069 1073
1070 1074 if ((rpcerr.re_errno == ETIMEDOUT ||
1071 1075 rpcerr.re_errno == ECONNRESET) &&
1072 1076 failover_safe(fi)) {
1073 1077 if (svp == mi->mi_curr_serv)
1074 1078 failover_newserver(mi);
1075 1079 goto failoverretry;
1076 1080 }
1077 1081 }
1078 1082 if (rpcerr.re_errno != 0)
1079 1083 return (rpcerr.re_errno);
1080 1084
1081 1085 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1082 1086 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1083 1087 timeo = (mi->mi_timeo * hz) / 10;
1084 1088 } else {
1085 1089 mutex_enter(&mi->mi_lock);
1086 1090 timeo = CLNT_SETTIMERS(client,
1087 1091 &(mi->mi_timers[mi->mi_timer_type[which]]),
1088 1092 &(mi->mi_timers[NFS_CALLTYPES]),
1089 1093 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1090 1094 (void (*)())NULL, (caddr_t)mi, 0);
1091 1095 mutex_exit(&mi->mi_lock);
1092 1096 }
1093 1097
1094 1098 /*
1095 1099 * If hard mounted fs, retry call forever unless hard error occurs.
1096 1100 */
1097 1101 do {
1098 1102 tryagain = FALSE;
1099 1103
1100 1104 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1101 1105 status = RPC_FAILED;
1102 1106 rpcerr.re_status = RPC_FAILED;
1103 1107 rpcerr.re_errno = EIO;
1104 1108 break;
1105 1109 }
1106 1110
1107 1111 TICK_TO_TIMEVAL(timeo, &wait);
1108 1112
1109 1113 /*
1110 1114 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1111 1115 * and SIGTERM. (Preserving the existing masks).
1112 1116 * Mask out SIGINT if mount option nointr is specified.
1113 1117 */
1114 1118 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1115 1119 if (!(mi->mi_flags & MI_INT))
1116 1120 client->cl_nosignal = TRUE;
1117 1121
1118 1122 /*
1119 1123 * If there is a current signal, then don't bother
1120 1124 * even trying to send out the request because we
1121 1125 * won't be able to block waiting for the response.
1122 1126 * Simply assume RPC_INTR and get on with it.
1123 1127 */
1124 1128 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1125 1129 status = RPC_INTR;
1126 1130 else {
1127 1131 status = CLNT_CALL(client, which, xdrargs, argsp,
1128 1132 xdrres, resp, wait);
1129 1133 }
1130 1134
1131 1135 if (!(mi->mi_flags & MI_INT))
1132 1136 client->cl_nosignal = FALSE;
1133 1137 /*
1134 1138 * restore original signal mask
1135 1139 */
1136 1140 sigunintr(&smask);
1137 1141
1138 1142 switch (status) {
1139 1143 case RPC_SUCCESS:
1140 1144 if ((mi->mi_flags & MI_DYNAMIC) &&
1141 1145 mi->mi_timer_type[which] != 0 &&
1142 1146 (mi->mi_curread != my_rsize ||
1143 1147 mi->mi_curwrite != my_wsize))
1144 1148 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1145 1149 break;
1146 1150
1147 1151 case RPC_INTR:
1148 1152 /*
1149 1153 * There is no way to recover from this error,
1150 1154 * even if mount option nointr is specified.
1151 1155 * SIGKILL, for example, cannot be blocked.
1152 1156 */
1153 1157 rpcerr.re_status = RPC_INTR;
1154 1158 rpcerr.re_errno = EINTR;
1155 1159 break;
1156 1160
1157 1161 case RPC_UDERROR:
1158 1162 /*
1159 1163 * If the NFS server is local (vold) and
1160 1164 * it goes away then we get RPC_UDERROR.
1161 1165 * This is a retryable error, so we would
1162 1166 * loop, so check to see if the specific
1163 1167 * error was ECONNRESET, indicating that
1164 1168 * target did not exist at all. If so,
1165 1169 * return with RPC_PROGUNAVAIL and
1166 1170 * ECONNRESET to indicate why.
1167 1171 */
1168 1172 CLNT_GETERR(client, &rpcerr);
1169 1173 if (rpcerr.re_errno == ECONNRESET) {
1170 1174 rpcerr.re_status = RPC_PROGUNAVAIL;
1171 1175 rpcerr.re_errno = ECONNRESET;
1172 1176 break;
1173 1177 }
1174 1178 /*FALLTHROUGH*/
1175 1179
1176 1180 default: /* probably RPC_TIMEDOUT */
1177 1181 if (IS_UNRECOVERABLE_RPC(status))
1178 1182 break;
1179 1183
1180 1184 /*
1181 1185 * increment server not responding count
1182 1186 */
1183 1187 mutex_enter(&mi->mi_lock);
1184 1188 mi->mi_noresponse++;
1185 1189 mutex_exit(&mi->mi_lock);
1186 1190 #ifdef DEBUG
1187 1191 nfscl->nfscl_stat.noresponse.value.ui64++;
1188 1192 #endif
1189 1193
1190 1194 if (!(mi->mi_flags & MI_HARD)) {
1191 1195 if (!(mi->mi_flags & MI_SEMISOFT) ||
1192 1196 (mi->mi_ss_call_type[which] == 0))
1193 1197 break;
1194 1198 }
1195 1199
1196 1200 /*
1197 1201 * The call is in progress (over COTS).
1198 1202 * Try the CLNT_CALL again, but don't
1199 1203 * print a noisy error message.
1200 1204 */
1201 1205 if (status == RPC_INPROGRESS) {
1202 1206 tryagain = TRUE;
1203 1207 break;
1204 1208 }
1205 1209
1206 1210 if (flags & RFSCALL_SOFT)
1207 1211 break;
1208 1212
1209 1213 /*
1210 1214 * On zone shutdown, just move on.
1211 1215 */
1212 1216 if (zone_status_get(curproc->p_zone) >=
1213 1217 ZONE_IS_SHUTTING_DOWN) {
1214 1218 rpcerr.re_status = RPC_FAILED;
1215 1219 rpcerr.re_errno = EIO;
1216 1220 break;
1217 1221 }
1218 1222
1219 1223 /*
1220 1224 * NFS client failover support
1221 1225 *
1222 1226 * If the current server just failed us, we'll
1223 1227 * start the process of finding a new server.
1224 1228 * After that, we can just retry.
1225 1229 */
1226 1230 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1227 1231 if (svp == mi->mi_curr_serv)
1228 1232 failover_newserver(mi);
1229 1233 clfree_impl(client, ch, nfscl);
1230 1234 goto failoverretry;
1231 1235 }
1232 1236
1233 1237 tryagain = TRUE;
1234 1238 timeo = backoff(timeo);
1235 1239
1236 1240 CLNT_GETERR(client, &rpcerr_tmp);
1237 1241 if ((status == RPC_CANTSEND) &&
1238 1242 (rpcerr_tmp.re_errno == ENOBUFS))
1239 1243 msg = SRV_QFULL_MSG;
1240 1244 else
1241 1245 msg = SRV_NOTRESP_MSG;
1242 1246
1243 1247 mutex_enter(&mi->mi_lock);
1244 1248 if (!(mi->mi_flags & MI_PRINTED)) {
1245 1249 mi->mi_flags |= MI_PRINTED;
1246 1250 mutex_exit(&mi->mi_lock);
1247 1251 #ifdef DEBUG
1248 1252 zprintf(zoneid, msg, mi->mi_vers,
1249 1253 svp->sv_hostname);
1250 1254 #else
1251 1255 zprintf(zoneid, msg, svp->sv_hostname);
1252 1256 #endif
1253 1257 } else
1254 1258 mutex_exit(&mi->mi_lock);
1255 1259 if (*douprintf && nfs_has_ctty()) {
1256 1260 *douprintf = 0;
1257 1261 if (!(mi->mi_flags & MI_NOPRINT))
1258 1262 #ifdef DEBUG
1259 1263 uprintf(msg, mi->mi_vers,
1260 1264 svp->sv_hostname);
1261 1265 #else
1262 1266 uprintf(msg, svp->sv_hostname);
1263 1267 #endif
1264 1268 }
1265 1269
1266 1270 /*
1267 1271 * If doing dynamic adjustment of transfer
1268 1272 * size and if it's a read or write call
1269 1273 * and if the transfer size changed while
1270 1274 * retransmitting or if the feedback routine
1271 1275 * changed the transfer size,
1272 1276 * then exit rfscall so that the transfer
1273 1277 * size can be adjusted at the vnops level.
1274 1278 */
1275 1279 if ((mi->mi_flags & MI_DYNAMIC) &&
1276 1280 mi->mi_timer_type[which] != 0 &&
1277 1281 (mi->mi_curread != my_rsize ||
1278 1282 mi->mi_curwrite != my_wsize ||
1279 1283 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1280 1284 /*
1281 1285 * On read or write calls, return
1282 1286 * back to the vnode ops level if
1283 1287 * the transfer size changed.
1284 1288 */
1285 1289 clfree_impl(client, ch, nfscl);
1286 1290 if (cred_cloned)
1287 1291 crfree(cr);
1288 1292 return (ENFS_TRYAGAIN);
1289 1293 }
1290 1294 }
1291 1295 } while (tryagain);
1292 1296
1293 1297 if (status != RPC_SUCCESS) {
1294 1298 /*
1295 1299 * Let soft mounts use the timed out message.
1296 1300 */
1297 1301 if (status == RPC_INPROGRESS)
1298 1302 status = RPC_TIMEDOUT;
1299 1303 nfscl->nfscl_stat.badcalls.value.ui64++;
1300 1304 if (status != RPC_INTR) {
1301 1305 mutex_enter(&mi->mi_lock);
1302 1306 mi->mi_flags |= MI_DOWN;
1303 1307 mutex_exit(&mi->mi_lock);
1304 1308 CLNT_GETERR(client, &rpcerr);
1305 1309 #ifdef DEBUG
1306 1310 bufp = clnt_sperror(client, svp->sv_hostname);
1307 1311 zprintf(zoneid, "NFS%d %s failed for %s\n",
1308 1312 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1309 1313 if (nfs_has_ctty()) {
1310 1314 if (!(mi->mi_flags & MI_NOPRINT)) {
1311 1315 uprintf("NFS%d %s failed for %s\n",
1312 1316 mi->mi_vers, mi->mi_rfsnames[which],
1313 1317 bufp);
1314 1318 }
1315 1319 }
1316 1320 kmem_free(bufp, MAXPATHLEN);
1317 1321 #else
1318 1322 zprintf(zoneid,
1319 1323 "NFS %s failed for server %s: error %d (%s)\n",
1320 1324 mi->mi_rfsnames[which], svp->sv_hostname,
1321 1325 status, clnt_sperrno(status));
1322 1326 if (nfs_has_ctty()) {
1323 1327 if (!(mi->mi_flags & MI_NOPRINT)) {
1324 1328 uprintf(
1325 1329 "NFS %s failed for server %s: error %d (%s)\n",
1326 1330 mi->mi_rfsnames[which],
1327 1331 svp->sv_hostname, status,
1328 1332 clnt_sperrno(status));
1329 1333 }
1330 1334 }
1331 1335 #endif
1332 1336 /*
1333 1337 * when CLNT_CALL() fails with RPC_AUTHERROR,
1334 1338 * re_errno is set appropriately depending on
1335 1339 * the authentication error
1336 1340 */
1337 1341 if (status == RPC_VERSMISMATCH ||
1338 1342 status == RPC_PROGVERSMISMATCH)
1339 1343 rpcerr.re_errno = EIO;
1340 1344 }
1341 1345 } else {
1342 1346 /*
1343 1347 * Test the value of mi_down and mi_printed without
1344 1348 * holding the mi_lock mutex. If they are both zero,
1345 1349 * then it is okay to skip the down and printed
1346 1350 * processing. This saves on a mutex_enter and
1347 1351 * mutex_exit pair for a normal, successful RPC.
1348 1352 * This was just complete overhead.
1349 1353 */
1350 1354 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1351 1355 mutex_enter(&mi->mi_lock);
1352 1356 mi->mi_flags &= ~MI_DOWN;
1353 1357 if (mi->mi_flags & MI_PRINTED) {
1354 1358 mi->mi_flags &= ~MI_PRINTED;
1355 1359 mutex_exit(&mi->mi_lock);
1356 1360 #ifdef DEBUG
1357 1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 1362 zprintf(zoneid, "NFS%d server %s ok\n",
1359 1363 mi->mi_vers, svp->sv_hostname);
1360 1364 #else
1361 1365 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 1366 zprintf(zoneid, "NFS server %s ok\n",
1363 1367 svp->sv_hostname);
1364 1368 #endif
1365 1369 } else
1366 1370 mutex_exit(&mi->mi_lock);
1367 1371 }
1368 1372
1369 1373 if (*douprintf == 0) {
1370 1374 if (!(mi->mi_flags & MI_NOPRINT))
1371 1375 #ifdef DEBUG
1372 1376 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1373 1377 uprintf("NFS%d server %s ok\n",
1374 1378 mi->mi_vers, svp->sv_hostname);
1375 1379 #else
1376 1380 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1377 1381 uprintf("NFS server %s ok\n", svp->sv_hostname);
1378 1382 #endif
1379 1383 *douprintf = 1;
1380 1384 }
1381 1385 }
1382 1386
1383 1387 clfree_impl(client, ch, nfscl);
1384 1388 if (cred_cloned)
1385 1389 crfree(cr);
1386 1390
1387 1391 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1388 1392
1389 1393 if (rpc_status != NULL)
1390 1394 *rpc_status = rpcerr.re_status;
1391 1395
1392 1396 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1393 1397 rpcerr.re_errno);
1394 1398
1395 1399 return (rpcerr.re_errno);
1396 1400 }
1397 1401
1398 1402 #ifdef DEBUG
1399 1403 static int acl2call_hits = 0;
1400 1404 static int acl2call_misses = 0;
1401 1405 #endif
1402 1406
1403 1407 int
1404 1408 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1405 1409 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1406 1410 enum nfsstat *statusp, int flags, failinfo_t *fi)
1407 1411 {
1408 1412 int rpcerror;
1409 1413
1410 1414 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1411 1415 cr, douprintf, flags, fi);
1412 1416 if (!rpcerror) {
1413 1417 /*
1414 1418 * See comments with crnetadjust().
1415 1419 */
1416 1420 if (*statusp == NFSERR_ACCES &&
1417 1421 (cr = crnetadjust(cr)) != NULL) {
1418 1422 #ifdef DEBUG
1419 1423 acl2call_hits++;
1420 1424 #endif
1421 1425 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1422 1426 resp, cr, douprintf, flags, fi);
1423 1427 crfree(cr);
1424 1428 #ifdef DEBUG
1425 1429 if (*statusp == NFSERR_ACCES)
1426 1430 acl2call_misses++;
1427 1431 #endif
1428 1432 }
1429 1433 }
1430 1434
1431 1435 return (rpcerror);
1432 1436 }
1433 1437
1434 1438 #ifdef DEBUG
1435 1439 static int acl3call_hits = 0;
1436 1440 static int acl3call_misses = 0;
1437 1441 #endif
1438 1442
1439 1443 int
1440 1444 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1441 1445 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1442 1446 nfsstat3 *statusp, int flags, failinfo_t *fi)
1443 1447 {
1444 1448 int rpcerror;
1445 1449 int user_informed;
1446 1450
1447 1451 user_informed = 0;
1448 1452
1449 1453 do {
1450 1454 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1451 1455 cr, douprintf, flags, fi);
1452 1456 if (!rpcerror) {
1453 1457 cred_t *crr;
1454 1458 if (*statusp == NFS3ERR_JUKEBOX) {
1455 1459 if (!user_informed) {
1456 1460 user_informed = 1;
1457 1461 uprintf(
1458 1462 "file temporarily unavailable on the server, retrying...\n");
1459 1463 }
1460 1464 delay(nfs3_jukebox_delay);
1461 1465 }
1462 1466 /*
1463 1467 * See crnetadjust() for comments.
1464 1468 */
1465 1469 else if (*statusp == NFS3ERR_ACCES &&
1466 1470 (crr = crnetadjust(cr)) != NULL) {
1467 1471 #ifdef DEBUG
1468 1472 acl3call_hits++;
1469 1473 #endif
1470 1474 rpcerror = aclcall(mi, which, xdrargs, argsp,
1471 1475 xdrres, resp, crr, douprintf, flags, fi);
1472 1476
1473 1477 crfree(crr);
1474 1478 #ifdef DEBUG
1475 1479 if (*statusp == NFS3ERR_ACCES)
1476 1480 acl3call_misses++;
1477 1481 #endif
1478 1482 }
1479 1483 }
1480 1484 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1481 1485
1482 1486 return (rpcerror);
1483 1487 }
1484 1488
1485 1489 static int
1486 1490 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1487 1491 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1488 1492 int flags, failinfo_t *fi)
1489 1493 {
1490 1494 CLIENT *client;
1491 1495 struct chtab *ch;
1492 1496 cred_t *cr = icr;
1493 1497 bool_t cred_cloned = FALSE;
1494 1498 enum clnt_stat status;
1495 1499 struct rpc_err rpcerr;
1496 1500 struct timeval wait;
1497 1501 int timeo; /* in units of hz */
1498 1502 #if 0 /* notyet */
1499 1503 int my_rsize, my_wsize;
1500 1504 #endif
1501 1505 bool_t tryagain;
1502 1506 k_sigset_t smask;
1503 1507 servinfo_t *svp;
1504 1508 struct nfs_clnt *nfscl;
1505 1509 zoneid_t zoneid = getzoneid();
1506 1510 #ifdef DEBUG
1507 1511 char *bufp;
1508 1512 #endif
1509 1513
1510 1514 #if 0 /* notyet */
1511 1515 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1512 1516 "rfscall_start:which %d mi %p", which, mi);
1513 1517 #endif
1514 1518
1515 1519 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1516 1520 ASSERT(nfscl != NULL);
1517 1521
1518 1522 nfscl->nfscl_stat.calls.value.ui64++;
1519 1523 mi->mi_aclreqs[which].value.ui64++;
1520 1524
1521 1525 rpcerr.re_status = RPC_SUCCESS;
1522 1526
1523 1527 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1524 1528 rpcerr.re_status = RPC_FAILED;
1525 1529 rpcerr.re_errno = EIO;
1526 1530 return (rpcerr.re_errno);
1527 1531 }
1528 1532
1529 1533 #if 0 /* notyet */
1530 1534 /*
1531 1535 * Remember the transfer sizes in case
1532 1536 * nfs_feedback changes them underneath us.
1533 1537 */
1534 1538 my_rsize = mi->mi_curread;
1535 1539 my_wsize = mi->mi_curwrite;
1536 1540 #endif
1537 1541
1538 1542 /*
1539 1543 * NFS client failover support
1540 1544 *
1541 1545 * If this rnode is not in sync with the current server (VALID_FH),
1542 1546 * we'd like to do a remap to get in sync. We can be interrupted
1543 1547 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1544 1548 * use the best info we have to try the RPC. Part of that is
1545 1549 * unconditionally updating the filehandle copy kept for V3.
1546 1550 *
1547 1551 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1548 1552 * rw_enter(); we're trying to keep the current server from being
1549 1553 * changed on us until we're done with the remapping and have a
1550 1554 * matching client handle. We don't want to send a filehandle
1551 1555 * to the wrong host.
1552 1556 */
1553 1557 failoverretry:
1554 1558 if (FAILOVER_MOUNT(mi)) {
1555 1559 mutex_enter(&mi->mi_lock);
1556 1560 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1557 1561 if (failover_wait(mi)) {
1558 1562 mutex_exit(&mi->mi_lock);
1559 1563 return (EINTR);
1560 1564 }
1561 1565 }
1562 1566 INC_READERS(mi);
1563 1567 mutex_exit(&mi->mi_lock);
1564 1568 if (fi) {
1565 1569 if (!VALID_FH(fi) &&
1566 1570 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1567 1571 int remaperr;
1568 1572
1569 1573 svp = mi->mi_curr_serv;
1570 1574 remaperr = failover_remap(fi);
1571 1575 if (remaperr != 0) {
1572 1576 #ifdef DEBUG
1573 1577 if (remaperr != EINTR)
1574 1578 nfs_cmn_err(remaperr, CE_WARN,
1575 1579 "aclcall couldn't failover: %m");
1576 1580 #endif
1577 1581 mutex_enter(&mi->mi_lock);
1578 1582 DEC_READERS(mi);
1579 1583 mutex_exit(&mi->mi_lock);
1580 1584
1581 1585 /*
1582 1586 * If failover_remap returns ETIMEDOUT
1583 1587 * and the filesystem is hard mounted
1584 1588 * we have to retry the call with a new
1585 1589 * server.
1586 1590 */
1587 1591 if ((mi->mi_flags & MI_HARD) &&
1588 1592 IS_RECOVERABLE_ERROR(remaperr)) {
1589 1593 if (svp == mi->mi_curr_serv)
1590 1594 failover_newserver(mi);
1591 1595 rpcerr.re_status = RPC_SUCCESS;
1592 1596 goto failoverretry;
1593 1597 }
1594 1598 return (remaperr);
1595 1599 }
1596 1600 }
1597 1601 if (fi->fhp && fi->copyproc)
1598 1602 (*fi->copyproc)(fi->fhp, fi->vp);
1599 1603 }
1600 1604 }
1601 1605
1602 1606 /* For TSOL, use a new cred which has net_mac_aware flag */
1603 1607 if (!cred_cloned && is_system_labeled()) {
1604 1608 cred_cloned = TRUE;
1605 1609 cr = crdup(icr);
1606 1610 (void) setpflags(NET_MAC_AWARE, 1, cr);
1607 1611 }
1608 1612
1609 1613 /*
1610 1614 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1611 1615 * are guaranteed to reprocess the retry as a new request.
1612 1616 */
1613 1617 svp = mi->mi_curr_serv;
1614 1618 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1615 1619 if (FAILOVER_MOUNT(mi)) {
1616 1620 mutex_enter(&mi->mi_lock);
1617 1621 DEC_READERS(mi);
1618 1622 mutex_exit(&mi->mi_lock);
1619 1623
1620 1624 if ((rpcerr.re_errno == ETIMEDOUT ||
1621 1625 rpcerr.re_errno == ECONNRESET) &&
1622 1626 failover_safe(fi)) {
1623 1627 if (svp == mi->mi_curr_serv)
1624 1628 failover_newserver(mi);
1625 1629 goto failoverretry;
1626 1630 }
1627 1631 }
1628 1632 if (rpcerr.re_errno != 0) {
1629 1633 if (cred_cloned)
1630 1634 crfree(cr);
1631 1635 return (rpcerr.re_errno);
1632 1636 }
1633 1637
1634 1638 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1635 1639 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1636 1640 timeo = (mi->mi_timeo * hz) / 10;
1637 1641 } else {
1638 1642 mutex_enter(&mi->mi_lock);
1639 1643 timeo = CLNT_SETTIMERS(client,
1640 1644 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1641 1645 &(mi->mi_timers[NFS_CALLTYPES]),
1642 1646 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1643 1647 (void (*)()) 0, (caddr_t)mi, 0);
1644 1648 mutex_exit(&mi->mi_lock);
1645 1649 }
1646 1650
1647 1651 /*
1648 1652 * If hard mounted fs, retry call forever unless hard error occurs.
1649 1653 */
1650 1654 do {
1651 1655 tryagain = FALSE;
1652 1656
1653 1657 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1654 1658 status = RPC_FAILED;
1655 1659 rpcerr.re_status = RPC_FAILED;
1656 1660 rpcerr.re_errno = EIO;
1657 1661 break;
1658 1662 }
1659 1663
1660 1664 TICK_TO_TIMEVAL(timeo, &wait);
1661 1665
1662 1666 /*
1663 1667 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1664 1668 * and SIGTERM. (Preserving the existing masks).
1665 1669 * Mask out SIGINT if mount option nointr is specified.
1666 1670 */
1667 1671 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1668 1672 if (!(mi->mi_flags & MI_INT))
1669 1673 client->cl_nosignal = TRUE;
1670 1674
1671 1675 /*
1672 1676 * If there is a current signal, then don't bother
1673 1677 * even trying to send out the request because we
1674 1678 * won't be able to block waiting for the response.
1675 1679 * Simply assume RPC_INTR and get on with it.
1676 1680 */
1677 1681 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1678 1682 status = RPC_INTR;
1679 1683 else {
1680 1684 status = CLNT_CALL(client, which, xdrargs, argsp,
1681 1685 xdrres, resp, wait);
1682 1686 }
1683 1687
1684 1688 if (!(mi->mi_flags & MI_INT))
1685 1689 client->cl_nosignal = FALSE;
1686 1690 /*
1687 1691 * restore original signal mask
1688 1692 */
1689 1693 sigunintr(&smask);
1690 1694
1691 1695 switch (status) {
1692 1696 case RPC_SUCCESS:
1693 1697 #if 0 /* notyet */
1694 1698 if ((mi->mi_flags & MI_DYNAMIC) &&
1695 1699 mi->mi_timer_type[which] != 0 &&
1696 1700 (mi->mi_curread != my_rsize ||
1697 1701 mi->mi_curwrite != my_wsize))
1698 1702 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1699 1703 #endif
1700 1704 break;
1701 1705
1702 1706 /*
1703 1707 * Unfortunately, there are servers in the world which
1704 1708 * are not coded correctly. They are not prepared to
1705 1709 * handle RPC requests to the NFS port which are not
1706 1710 * NFS requests. Thus, they may try to process the
1707 1711 * NFS_ACL request as if it were an NFS request. This
1708 1712 * does not work. Generally, an error will be generated
1709 1713 * on the client because it will not be able to decode
1710 1714 * the response from the server. However, it seems
1711 1715 * possible that the server may not be able to decode
1712 1716 * the arguments. Thus, the criteria for deciding
1713 1717 * whether the server supports NFS_ACL or not is whether
1714 1718 * the following RPC errors are returned from CLNT_CALL.
1715 1719 */
1716 1720 case RPC_CANTDECODERES:
1717 1721 case RPC_PROGUNAVAIL:
1718 1722 case RPC_CANTDECODEARGS:
1719 1723 case RPC_PROGVERSMISMATCH:
1720 1724 mutex_enter(&mi->mi_lock);
1721 1725 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1722 1726 mutex_exit(&mi->mi_lock);
1723 1727 break;
1724 1728
1725 1729 /*
1726 1730 * If the server supports NFS_ACL but not the new ops
1727 1731 * for extended attributes, make sure we don't retry.
1728 1732 */
1729 1733 case RPC_PROCUNAVAIL:
1730 1734 mutex_enter(&mi->mi_lock);
1731 1735 mi->mi_flags &= ~MI_EXTATTR;
1732 1736 mutex_exit(&mi->mi_lock);
1733 1737 break;
1734 1738
1735 1739 case RPC_INTR:
1736 1740 /*
1737 1741 * There is no way to recover from this error,
1738 1742 * even if mount option nointr is specified.
1739 1743 * SIGKILL, for example, cannot be blocked.
1740 1744 */
1741 1745 rpcerr.re_status = RPC_INTR;
1742 1746 rpcerr.re_errno = EINTR;
1743 1747 break;
1744 1748
1745 1749 case RPC_UDERROR:
1746 1750 /*
1747 1751 * If the NFS server is local (vold) and
1748 1752 * it goes away then we get RPC_UDERROR.
1749 1753 			 * This is a retryable error and we would
1750 1754 			 * otherwise loop, so check whether the specific
1751 1755 			 * error was ECONNRESET, indicating that the
1752 1756 			 * target did not exist at all.  If so,
1753 1757 * return with RPC_PROGUNAVAIL and
1754 1758 * ECONNRESET to indicate why.
1755 1759 */
1756 1760 CLNT_GETERR(client, &rpcerr);
1757 1761 if (rpcerr.re_errno == ECONNRESET) {
1758 1762 rpcerr.re_status = RPC_PROGUNAVAIL;
1759 1763 rpcerr.re_errno = ECONNRESET;
1760 1764 break;
1761 1765 }
1762 1766 /*FALLTHROUGH*/
1763 1767
1764 1768 default: /* probably RPC_TIMEDOUT */
1765 1769 if (IS_UNRECOVERABLE_RPC(status))
1766 1770 break;
1767 1771
1768 1772 /*
1769 1773 * increment server not responding count
1770 1774 */
1771 1775 mutex_enter(&mi->mi_lock);
1772 1776 mi->mi_noresponse++;
1773 1777 mutex_exit(&mi->mi_lock);
1774 1778 #ifdef DEBUG
1775 1779 nfscl->nfscl_stat.noresponse.value.ui64++;
1776 1780 #endif
1777 1781
1778 1782 if (!(mi->mi_flags & MI_HARD)) {
1779 1783 if (!(mi->mi_flags & MI_SEMISOFT) ||
1780 1784 (mi->mi_acl_ss_call_type[which] == 0))
1781 1785 break;
1782 1786 }
1783 1787
1784 1788 /*
1785 1789 * The call is in progress (over COTS).
1786 1790 * Try the CLNT_CALL again, but don't
1787 1791 * print a noisy error message.
1788 1792 */
1789 1793 if (status == RPC_INPROGRESS) {
1790 1794 tryagain = TRUE;
1791 1795 break;
1792 1796 }
1793 1797
1794 1798 if (flags & RFSCALL_SOFT)
1795 1799 break;
1796 1800
1797 1801 /*
1798 1802 * On zone shutdown, just move on.
1799 1803 */
1800 1804 if (zone_status_get(curproc->p_zone) >=
1801 1805 ZONE_IS_SHUTTING_DOWN) {
1802 1806 rpcerr.re_status = RPC_FAILED;
1803 1807 rpcerr.re_errno = EIO;
1804 1808 break;
1805 1809 }
1806 1810
1807 1811 /*
1808 1812 * NFS client failover support
1809 1813 *
1810 1814 * If the current server just failed us, we'll
1811 1815 * start the process of finding a new server.
1812 1816 * After that, we can just retry.
1813 1817 */
1814 1818 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1815 1819 if (svp == mi->mi_curr_serv)
1816 1820 failover_newserver(mi);
1817 1821 clfree_impl(client, ch, nfscl);
1818 1822 goto failoverretry;
1819 1823 }
1820 1824
1821 1825 tryagain = TRUE;
1822 1826 timeo = backoff(timeo);
1823 1827 mutex_enter(&mi->mi_lock);
1824 1828 if (!(mi->mi_flags & MI_PRINTED)) {
1825 1829 mi->mi_flags |= MI_PRINTED;
1826 1830 mutex_exit(&mi->mi_lock);
1827 1831 #ifdef DEBUG
1828 1832 zprintf(zoneid,
1829 1833 "NFS_ACL%d server %s not responding still trying\n",
1830 1834 mi->mi_vers, svp->sv_hostname);
1831 1835 #else
1832 1836 zprintf(zoneid,
1833 1837 "NFS server %s not responding still trying\n",
1834 1838 svp->sv_hostname);
1835 1839 #endif
1836 1840 } else
1837 1841 mutex_exit(&mi->mi_lock);
1838 1842 if (*douprintf && nfs_has_ctty()) {
1839 1843 *douprintf = 0;
1840 1844 if (!(mi->mi_flags & MI_NOPRINT))
1841 1845 #ifdef DEBUG
1842 1846 uprintf(
1843 1847 "NFS_ACL%d server %s not responding still trying\n",
1844 1848 mi->mi_vers, svp->sv_hostname);
1845 1849 #else
1846 1850 uprintf(
1847 1851 "NFS server %s not responding still trying\n",
1848 1852 svp->sv_hostname);
1849 1853 #endif
1850 1854 }
1851 1855
1852 1856 #if 0 /* notyet */
1853 1857 /*
1854 1858 * If doing dynamic adjustment of transfer
1855 1859 * size and if it's a read or write call
1856 1860 * and if the transfer size changed while
1857 1861 * retransmitting or if the feedback routine
1858 1862 * changed the transfer size,
1859 1863 * then exit rfscall so that the transfer
1860 1864 * size can be adjusted at the vnops level.
1861 1865 */
1862 1866 if ((mi->mi_flags & MI_DYNAMIC) &&
1863 1867 mi->mi_acl_timer_type[which] != 0 &&
1864 1868 (mi->mi_curread != my_rsize ||
1865 1869 mi->mi_curwrite != my_wsize ||
1866 1870 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1867 1871 /*
1868 1872 * On read or write calls, return
1869 1873 * back to the vnode ops level if
1870 1874 * the transfer size changed.
1871 1875 */
1872 1876 clfree_impl(client, ch, nfscl);
1873 1877 if (cred_cloned)
1874 1878 crfree(cr);
1875 1879 return (ENFS_TRYAGAIN);
1876 1880 }
1877 1881 #endif
1878 1882 }
1879 1883 } while (tryagain);
1880 1884
1881 1885 if (status != RPC_SUCCESS) {
1882 1886 /*
1883 1887 * Let soft mounts use the timed out message.
1884 1888 */
1885 1889 if (status == RPC_INPROGRESS)
1886 1890 status = RPC_TIMEDOUT;
1887 1891 nfscl->nfscl_stat.badcalls.value.ui64++;
1888 1892 if (status == RPC_CANTDECODERES ||
1889 1893 status == RPC_PROGUNAVAIL ||
1890 1894 status == RPC_PROCUNAVAIL ||
1891 1895 status == RPC_CANTDECODEARGS ||
1892 1896 status == RPC_PROGVERSMISMATCH)
1893 1897 CLNT_GETERR(client, &rpcerr);
1894 1898 else if (status != RPC_INTR) {
1895 1899 mutex_enter(&mi->mi_lock);
1896 1900 mi->mi_flags |= MI_DOWN;
1897 1901 mutex_exit(&mi->mi_lock);
1898 1902 CLNT_GETERR(client, &rpcerr);
1899 1903 #ifdef DEBUG
1900 1904 bufp = clnt_sperror(client, svp->sv_hostname);
1901 1905 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1902 1906 mi->mi_vers, mi->mi_aclnames[which], bufp);
1903 1907 if (nfs_has_ctty()) {
1904 1908 if (!(mi->mi_flags & MI_NOPRINT)) {
1905 1909 uprintf("NFS_ACL%d %s failed for %s\n",
1906 1910 mi->mi_vers, mi->mi_aclnames[which],
1907 1911 bufp);
1908 1912 }
1909 1913 }
1910 1914 kmem_free(bufp, MAXPATHLEN);
1911 1915 #else
1912 1916 zprintf(zoneid,
1913 1917 "NFS %s failed for server %s: error %d (%s)\n",
1914 1918 mi->mi_aclnames[which], svp->sv_hostname,
1915 1919 status, clnt_sperrno(status));
1916 1920 if (nfs_has_ctty()) {
1917 1921 if (!(mi->mi_flags & MI_NOPRINT))
1918 1922 uprintf(
1919 1923 "NFS %s failed for server %s: error %d (%s)\n",
1920 1924 mi->mi_aclnames[which],
1921 1925 svp->sv_hostname, status,
1922 1926 clnt_sperrno(status));
1923 1927 }
1924 1928 #endif
1925 1929 /*
1926 1930 * when CLNT_CALL() fails with RPC_AUTHERROR,
1927 1931 * re_errno is set appropriately depending on
1928 1932 * the authentication error
1929 1933 */
1930 1934 if (status == RPC_VERSMISMATCH ||
1931 1935 status == RPC_PROGVERSMISMATCH)
1932 1936 rpcerr.re_errno = EIO;
1933 1937 }
1934 1938 } else {
1935 1939 /*
1936 1940 * Test the value of mi_down and mi_printed without
1937 1941 * holding the mi_lock mutex. If they are both zero,
1938 1942 * then it is okay to skip the down and printed
1939 1943 * processing. This saves on a mutex_enter and
1940 1944 * mutex_exit pair for a normal, successful RPC.
1941 1945 * This was just complete overhead.
1942 1946 */
1943 1947 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1944 1948 mutex_enter(&mi->mi_lock);
1945 1949 mi->mi_flags &= ~MI_DOWN;
1946 1950 if (mi->mi_flags & MI_PRINTED) {
1947 1951 mi->mi_flags &= ~MI_PRINTED;
1948 1952 mutex_exit(&mi->mi_lock);
1949 1953 #ifdef DEBUG
1950 1954 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1951 1955 mi->mi_vers, svp->sv_hostname);
1952 1956 #else
1953 1957 zprintf(zoneid, "NFS server %s ok\n",
1954 1958 svp->sv_hostname);
1955 1959 #endif
1956 1960 } else
1957 1961 mutex_exit(&mi->mi_lock);
1958 1962 }
1959 1963
1960 1964 if (*douprintf == 0) {
1961 1965 if (!(mi->mi_flags & MI_NOPRINT))
1962 1966 #ifdef DEBUG
1963 1967 uprintf("NFS_ACL%d server %s ok\n",
1964 1968 mi->mi_vers, svp->sv_hostname);
1965 1969 #else
1966 1970 uprintf("NFS server %s ok\n", svp->sv_hostname);
1967 1971 #endif
1968 1972 *douprintf = 1;
1969 1973 }
1970 1974 }
1971 1975
1972 1976 clfree_impl(client, ch, nfscl);
1973 1977 if (cred_cloned)
1974 1978 crfree(cr);
1975 1979
1976 1980 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1977 1981
1978 1982 #if 0 /* notyet */
1979 1983 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1980 1984 rpcerr.re_errno);
1981 1985 #endif
1982 1986
1983 1987 return (rpcerr.re_errno);
1984 1988 }
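
The retry path above stretches the RPC timeout through backoff() before looping. backoff() itself is not shown in this section; the sketch below is only an illustration of the capped-doubling scheme such a helper typically implements, and SKETCH_MAXTIMEO is a hypothetical cap, not an actual kernel tunable.

/*
 * Minimal sketch of a capped-doubling backoff; the real backoff() used
 * by the retry loop above is defined elsewhere in the NFS client.
 * SKETCH_MAXTIMEO is a hypothetical cap, not an actual kernel tunable.
 */
#define	SKETCH_MAXTIMEO	600		/* assumed cap, in clock ticks */

static int
sketch_backoff(int timeo)
{
	int next = timeo << 1;		/* double the previous timeout */

	return (next > SKETCH_MAXTIMEO ? SKETCH_MAXTIMEO : next);
}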
1985 1989
1986 1990 int
1987 1991 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1988 1992 {
1989 1993 uint_t mask = vap->va_mask;
1990 1994
1991 1995 if (!(mask & AT_MODE))
1992 1996 sa->sa_mode = (uint32_t)-1;
1993 1997 else
1994 1998 sa->sa_mode = vap->va_mode;
1995 1999 if (!(mask & AT_UID))
1996 2000 sa->sa_uid = (uint32_t)-1;
1997 2001 else
1998 2002 sa->sa_uid = (uint32_t)vap->va_uid;
1999 2003 if (!(mask & AT_GID))
2000 2004 sa->sa_gid = (uint32_t)-1;
2001 2005 else
2002 2006 sa->sa_gid = (uint32_t)vap->va_gid;
2003 2007 if (!(mask & AT_SIZE))
2004 2008 sa->sa_size = (uint32_t)-1;
2005 2009 else
2006 2010 sa->sa_size = (uint32_t)vap->va_size;
2007 2011 if (!(mask & AT_ATIME))
2008 2012 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2009 2013 else {
2010 2014 /* check time validity */
2011 2015 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2012 2016 return (EOVERFLOW);
2013 2017 }
2014 2018 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2015 2019 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2016 2020 }
2017 2021 if (!(mask & AT_MTIME))
2018 2022 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2019 2023 else {
2020 2024 /* check time validity */
2021 2025 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2022 2026 return (EOVERFLOW);
2023 2027 }
2024 2028 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2025 2029 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2026 2030 }
2027 2031 return (0);
2028 2032 }
2029 2033
2030 2034 int
2031 2035 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2032 2036 {
2033 2037 uint_t mask = vap->va_mask;
2034 2038
2035 2039 if (!(mask & AT_MODE))
2036 2040 sa->mode.set_it = FALSE;
2037 2041 else {
2038 2042 sa->mode.set_it = TRUE;
2039 2043 sa->mode.mode = (mode3)vap->va_mode;
2040 2044 }
2041 2045 if (!(mask & AT_UID))
2042 2046 sa->uid.set_it = FALSE;
2043 2047 else {
2044 2048 sa->uid.set_it = TRUE;
2045 2049 sa->uid.uid = (uid3)vap->va_uid;
2046 2050 }
2047 2051 if (!(mask & AT_GID))
2048 2052 sa->gid.set_it = FALSE;
2049 2053 else {
2050 2054 sa->gid.set_it = TRUE;
2051 2055 sa->gid.gid = (gid3)vap->va_gid;
2052 2056 }
2053 2057 if (!(mask & AT_SIZE))
2054 2058 sa->size.set_it = FALSE;
2055 2059 else {
2056 2060 sa->size.set_it = TRUE;
2057 2061 sa->size.size = (size3)vap->va_size;
2058 2062 }
2059 2063 if (!(mask & AT_ATIME))
2060 2064 sa->atime.set_it = DONT_CHANGE;
2061 2065 else {
2062 2066 /* check time validity */
2063 2067 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2064 2068 return (EOVERFLOW);
2065 2069 }
2066 2070 sa->atime.set_it = SET_TO_CLIENT_TIME;
2067 2071 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2068 2072 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2069 2073 }
2070 2074 if (!(mask & AT_MTIME))
2071 2075 sa->mtime.set_it = DONT_CHANGE;
2072 2076 else {
2073 2077 /* check time validity */
2074 2078 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2075 2079 return (EOVERFLOW);
2076 2080 }
2077 2081 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2078 2082 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2079 2083 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2080 2084 }
2081 2085 return (0);
2082 2086 }
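
vattr_to_sattr3() only marks the attributes whose AT_* bits are present in va_mask, so a caller changes exactly one attribute by building a vattr with a single bit set. A hedged sketch of such a caller follows; the function itself is illustrative and not part of this file.

/*
 * Illustrative only: build a sattr3 that changes just the file size.
 * Because only AT_SIZE is set in va_mask, vattr_to_sattr3() leaves
 * every other field marked FALSE / DONT_CHANGE.
 */
static int
sketch_truncate_sattr3(sattr3 *sa, u_offset_t len)
{
	struct vattr va;

	va.va_mask = AT_SIZE;
	va.va_size = len;

	return (vattr_to_sattr3(&va, sa));
}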
2083 2087
2084 2088 void
2085 2089 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2086 2090 {
2087 2091
2088 2092 da->da_fhandle = VTOFH(dvp);
2089 2093 da->da_name = nm;
2090 2094 da->da_flags = 0;
2091 2095 }
2092 2096
2093 2097 void
2094 2098 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2095 2099 {
2096 2100
2097 2101 da->dirp = VTOFH3(dvp);
2098 2102 da->name = nm;
2099 2103 }
2100 2104
2101 2105 int
2102 2106 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2103 2107 {
2104 2108 int error;
2105 2109 rnode_t *rp;
2106 2110 struct vattr va;
2107 2111
2108 2112 va.va_mask = AT_MODE | AT_GID;
2109 2113 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2110 2114 if (error)
2111 2115 return (error);
2112 2116
2113 2117 /*
2114 2118 * To determine the expected group-id of the created file:
2115 2119 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2116 2120 * GRPID option, and the directory's set-gid bit is clear,
2117 2121 * then use the process's gid.
2118 2122 * 2) Otherwise, set the group-id to the gid of the parent directory.
2119 2123 */
2120 2124 rp = VTOR(dvp);
2121 2125 mutex_enter(&rp->r_statelock);
2122 2126 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2123 2127 *gidp = crgetgid(cr);
2124 2128 else
2125 2129 *gidp = va.va_gid;
2126 2130 mutex_exit(&rp->r_statelock);
2127 2131 return (0);
2128 2132 }
2129 2133
2130 2134 int
2131 2135 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2132 2136 {
2133 2137 int error;
2134 2138 struct vattr va;
2135 2139
2136 2140 va.va_mask = AT_MODE;
2137 2141 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2138 2142 if (error)
2139 2143 return (error);
2140 2144
2141 2145 /*
2142 2146 * Modify the expected mode (om) so that the set-gid bit matches
2143 2147 * that of the parent directory (dvp).
2144 2148 */
2145 2149 if (va.va_mode & VSGID)
2146 2150 *omp |= VSGID;
2147 2151 else
2148 2152 *omp &= ~VSGID;
2149 2153 return (0);
2150 2154 }
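
setdirgid() and setdirmode() are normally used together by a create path to pre-compute the gid and set-gid bit a new object is expected to end up with. A minimal sketch of that pairing is shown below; only the two helpers are from this file, the caller is hypothetical.

/*
 * Illustrative caller: seed the expected gid and mode of an object
 * about to be created under dvp.  Hypothetical, for illustration only.
 */
static int
sketch_expected_attrs(vnode_t *dvp, cred_t *cr, vattr_t *va)
{
	int error;

	/* gid: parent's gid with GRPID or a set-gid parent, else crgetgid() */
	error = setdirgid(dvp, &va->va_gid, cr);
	if (error)
		return (error);

	/* mode: make the set-gid bit track the parent directory */
	return (setdirmode(dvp, &va->va_mode, cr));
}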
2151 2155
2152 2156 void
2153 2157 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2154 2158 {
2155 2159
2156 2160 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2157 2161 if (!(vp->v_flag & VSWAPLIKE)) {
2158 2162 mutex_enter(&vp->v_lock);
2159 2163 vp->v_flag |= VSWAPLIKE;
2160 2164 mutex_exit(&vp->v_lock);
2161 2165 }
2162 2166 } else {
2163 2167 if (vp->v_flag & VSWAPLIKE) {
2164 2168 mutex_enter(&vp->v_lock);
2165 2169 vp->v_flag &= ~VSWAPLIKE;
2166 2170 mutex_exit(&vp->v_lock);
2167 2171 }
2168 2172 }
2169 2173 }
2170 2174
2171 2175 /*
2172 2176 * Free the resources associated with an rnode.
2173 2177 */
2174 2178 static void
2175 2179 rinactive(rnode_t *rp, cred_t *cr)
2176 2180 {
2177 2181 vnode_t *vp;
2178 2182 cred_t *cred;
2179 2183 char *contents;
2180 2184 int size;
2181 2185 vsecattr_t *vsp;
2182 2186 int error;
2183 2187 nfs3_pathconf_info *info;
2184 2188
2185 2189 /*
2186 2190 * Before freeing anything, wait until all asynchronous
2187 2191 * activity is done on this rnode. This will allow all
2188 2192 * asynchronous read ahead and write behind i/o's to
2189 2193 * finish.
2190 2194 */
2191 2195 mutex_enter(&rp->r_statelock);
2192 2196 while (rp->r_count > 0)
2193 2197 cv_wait(&rp->r_cv, &rp->r_statelock);
2194 2198 mutex_exit(&rp->r_statelock);
2195 2199
2196 2200 /*
2197 2201 * Flush and invalidate all pages associated with the vnode.
2198 2202 */
2199 2203 vp = RTOV(rp);
2200 2204 if (vn_has_cached_data(vp)) {
2201 2205 ASSERT(vp->v_type != VCHR);
2202 2206 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2203 2207 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2204 2208 if (error && (error == ENOSPC || error == EDQUOT)) {
2205 2209 mutex_enter(&rp->r_statelock);
2206 2210 if (!rp->r_error)
2207 2211 rp->r_error = error;
2208 2212 mutex_exit(&rp->r_statelock);
2209 2213 }
2210 2214 }
2211 2215 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2212 2216 }
2213 2217
2214 2218 /*
2215 2219 * Free any held credentials and caches which may be associated
2216 2220 * with this rnode.
2217 2221 */
2218 2222 mutex_enter(&rp->r_statelock);
2219 2223 cred = rp->r_cred;
2220 2224 rp->r_cred = NULL;
2221 2225 contents = rp->r_symlink.contents;
2222 2226 size = rp->r_symlink.size;
2223 2227 rp->r_symlink.contents = NULL;
2224 2228 vsp = rp->r_secattr;
2225 2229 rp->r_secattr = NULL;
2226 2230 info = rp->r_pathconf;
2227 2231 rp->r_pathconf = NULL;
2228 2232 mutex_exit(&rp->r_statelock);
2229 2233
2230 2234 /*
2231 2235 * Free the held credential.
2232 2236 */
2233 2237 if (cred != NULL)
2234 2238 crfree(cred);
2235 2239
2236 2240 /*
2237 2241 * Free the access cache entries.
2238 2242 */
2239 2243 (void) nfs_access_purge_rp(rp);
2240 2244
2241 2245 /*
2242 2246 * Free the readdir cache entries.
2243 2247 */
2244 2248 if (HAVE_RDDIR_CACHE(rp))
2245 2249 nfs_purge_rddir_cache(vp);
2246 2250
2247 2251 /*
2248 2252 * Free the symbolic link cache.
2249 2253 */
2250 2254 if (contents != NULL) {
2251 2255
2252 2256 kmem_free((void *)contents, size);
2253 2257 }
2254 2258
2255 2259 /*
2256 2260 * Free any cached ACL.
2257 2261 */
2258 2262 if (vsp != NULL)
2259 2263 nfs_acl_free(vsp);
2260 2264
2261 2265 /*
2262 2266 * Free any cached pathconf information.
2263 2267 */
2264 2268 if (info != NULL)
2265 2269 kmem_free(info, sizeof (*info));
2266 2270 }
2267 2271
2268 2272 /*
2269 2273 * Return a vnode for the given NFS Version 2 file handle.
2270 2274 * If no rnode exists for this fhandle, create one and put it
2271 2275 * into the hash queues. If the rnode for this fhandle
2272 2276 * already exists, return it.
2273 2277 *
2274 2278 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2275 2279 */
2276 2280 vnode_t *
2277 2281 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2278 2282 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2279 2283 {
2280 2284 int newnode;
2281 2285 int index;
2282 2286 vnode_t *vp;
2283 2287 nfs_fhandle nfh;
2284 2288 vattr_t va;
2285 2289
2286 2290 nfh.fh_len = NFS_FHSIZE;
2287 2291 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2288 2292
2289 2293 index = rtablehash(&nfh);
2290 2294 rw_enter(&rtable[index].r_lock, RW_READER);
2291 2295
2292 2296 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2293 2297 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2294 2298
2295 2299 if (attr != NULL) {
2296 2300 if (!newnode) {
2297 2301 rw_exit(&rtable[index].r_lock);
2298 2302 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2299 2303 } else {
2300 2304 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2301 2305 vp->v_type = VBAD;
2302 2306 else
2303 2307 vp->v_type = n2v_type(attr);
2304 2308 /*
2305 2309 * A translation here seems to be necessary
2306 2310 * because this function can be called
2307 2311 * with `attr' that has come from the wire,
2308 2312 * and been operated on by vattr_to_nattr().
2309 2313 			 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2310 2314 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2311 2315 * ->makenfsnode().
2312 2316 */
2313 2317 if ((attr->na_rdev & 0xffff0000) == 0)
2314 2318 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2315 2319 else
2316 2320 vp->v_rdev = expldev(n2v_rdev(attr));
2317 2321 nfs_attrcache(vp, attr, t);
2318 2322 rw_exit(&rtable[index].r_lock);
2319 2323 }
2320 2324 } else {
2321 2325 if (newnode) {
2322 2326 PURGE_ATTRCACHE(vp);
2323 2327 }
2324 2328 rw_exit(&rtable[index].r_lock);
2325 2329 }
2326 2330
2327 2331 return (vp);
2328 2332 }
2329 2333
2330 2334 /*
2331 2335 * Return a vnode for the given NFS Version 3 file handle.
2332 2336 * If no rnode exists for this fhandle, create one and put it
2333 2337 * into the hash queues. If the rnode for this fhandle
2334 2338 * already exists, return it.
2335 2339 *
2336 2340 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2337 2341 */
2338 2342 vnode_t *
2339 2343 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2340 2344 cred_t *cr, char *dnm, char *nm)
2341 2345 {
2342 2346 int newnode;
2343 2347 int index;
2344 2348 vnode_t *vp;
2345 2349
2346 2350 index = rtablehash((nfs_fhandle *)fh);
2347 2351 rw_enter(&rtable[index].r_lock, RW_READER);
2348 2352
2349 2353 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2350 2354 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2351 2355 dnm, nm);
2352 2356
2353 2357 if (vap == NULL) {
2354 2358 if (newnode) {
2355 2359 PURGE_ATTRCACHE(vp);
2356 2360 }
2357 2361 rw_exit(&rtable[index].r_lock);
2358 2362 return (vp);
2359 2363 }
2360 2364
2361 2365 if (!newnode) {
2362 2366 rw_exit(&rtable[index].r_lock);
2363 2367 nfs_attr_cache(vp, vap, t, cr);
2364 2368 } else {
2365 2369 rnode_t *rp = VTOR(vp);
2366 2370
2367 2371 vp->v_type = vap->va_type;
2368 2372 vp->v_rdev = vap->va_rdev;
2369 2373
2370 2374 mutex_enter(&rp->r_statelock);
2371 2375 if (rp->r_mtime <= t)
2372 2376 nfs_attrcache_va(vp, vap);
2373 2377 mutex_exit(&rp->r_statelock);
2374 2378 rw_exit(&rtable[index].r_lock);
2375 2379 }
2376 2380
2377 2381 return (vp);
2378 2382 }
2379 2383
2380 2384 vnode_t *
2381 2385 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2382 2386 cred_t *cr, char *dnm, char *nm)
2383 2387 {
2384 2388 int newnode;
2385 2389 int index;
2386 2390 vnode_t *vp;
2387 2391 vattr_t va;
2388 2392
2389 2393 index = rtablehash((nfs_fhandle *)fh);
2390 2394 rw_enter(&rtable[index].r_lock, RW_READER);
2391 2395
2392 2396 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2393 2397 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2394 2398 dnm, nm);
2395 2399
2396 2400 if (attr == NULL) {
2397 2401 if (newnode) {
2398 2402 PURGE_ATTRCACHE(vp);
2399 2403 }
2400 2404 rw_exit(&rtable[index].r_lock);
2401 2405 return (vp);
2402 2406 }
2403 2407
2404 2408 if (!newnode) {
2405 2409 rw_exit(&rtable[index].r_lock);
2406 2410 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2407 2411 } else {
2408 2412 if (attr->type < NF3REG || attr->type > NF3FIFO)
2409 2413 vp->v_type = VBAD;
2410 2414 else
2411 2415 vp->v_type = nf3_to_vt[attr->type];
2412 2416 vp->v_rdev = makedevice(attr->rdev.specdata1,
2413 2417 attr->rdev.specdata2);
2414 2418 nfs3_attrcache(vp, attr, t);
2415 2419 rw_exit(&rtable[index].r_lock);
2416 2420 }
2417 2421
2418 2422 return (vp);
2419 2423 }
2420 2424
2421 2425 /*
2422 2426 * Read this comment before making changes to rtablehash()!
2423 2427 * This is a hash function in which seemingly obvious and harmless
2424 2428  * changes can cause escalations costing millions of dollars!
2425 2429 * Know what you are doing.
2426 2430 *
2427 2431 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2428 2432 * algorithm is currently detailed here:
2429 2433 *
2430 2434 * http://burtleburtle.net/bob/hash/doobs.html
2431 2435 *
2432 2436 * Of course, the above link may not be valid by the time you are reading
2433 2437 * this, but suffice it to say that the one-at-a-time algorithm works well in
2434 2438 * almost all cases. If you are changing the algorithm be sure to verify that
2435 2439 * the hash algorithm still provides even distribution in all cases and with
2436 2440 * any server returning filehandles in whatever order (sequential or random).
2437 2441 */
2438 2442 static int
2439 2443 rtablehash(nfs_fhandle *fh)
2440 2444 {
2441 2445 ulong_t hash, len, i;
2442 2446 char *key;
2443 2447
2444 2448 key = fh->fh_buf;
2445 2449 len = (ulong_t)fh->fh_len;
2446 2450 for (hash = 0, i = 0; i < len; i++) {
2447 2451 hash += key[i];
2448 2452 hash += (hash << 10);
2449 2453 hash ^= (hash >> 6);
2450 2454 }
2451 2455 hash += (hash << 3);
2452 2456 hash ^= (hash >> 11);
2453 2457 hash += (hash << 15);
2454 2458 return (hash & rtablemask);
2455 2459 }
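
The warning above is worth taking literally: lookup cost in the rnode hash queues tracks the quality of this function directly. One low-risk way to evaluate a proposed change is to replay the one-at-a-time loop in user space over synthetic filehandles and inspect the bucket distribution. The harness below is a sketch of that idea only; the bucket count, filehandle shape, and iteration count are arbitrary choices, not kernel values.

/* Illustrative user-space harness; not kernel code. */
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	256		/* arbitrary power of two */

static unsigned long
oaat_hash(const char *key, size_t len)
{
	unsigned long hash = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += (unsigned char)key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & (NBUCKETS - 1));
}

int
main(void)
{
	int counts[NBUCKETS] = { 0 };
	char fh[32];
	int i;

	/* Sequential "filehandles": the ordering the comment warns about. */
	for (i = 0; i < 100000; i++) {
		memset(fh, 0, sizeof (fh));
		memcpy(fh, &i, sizeof (i));
		counts[oaat_hash(fh, sizeof (fh))]++;
	}
	for (i = 0; i < NBUCKETS; i++)
		printf("%d %d\n", i, counts[i]);
	return (0);
}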
2456 2460
2457 2461 static vnode_t *
2458 2462 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2459 2463 struct vnodeops *vops,
2460 2464 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2461 2465 int (*compar)(const void *, const void *),
2462 2466 int *newnode, cred_t *cr, char *dnm, char *nm)
2463 2467 {
2464 2468 rnode_t *rp;
2465 2469 rnode_t *trp;
2466 2470 vnode_t *vp;
2467 2471 mntinfo_t *mi;
2468 2472
2469 2473 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2470 2474
2471 2475 mi = VFTOMI(vfsp);
2472 2476 start:
2473 2477 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2474 2478 vp = RTOV(rp);
2475 2479 nfs_set_vroot(vp);
2476 2480 *newnode = 0;
2477 2481 return (vp);
2478 2482 }
2479 2483 rw_exit(&rhtp->r_lock);
2480 2484
2481 2485 mutex_enter(&rpfreelist_lock);
2482 2486 if (rpfreelist != NULL && rnew >= nrnode) {
2483 2487 rp = rpfreelist;
2484 2488 rp_rmfree(rp);
2485 2489 mutex_exit(&rpfreelist_lock);
2486 2490
2487 2491 vp = RTOV(rp);
2488 2492
2489 2493 if (rp->r_flags & RHASHED) {
2490 2494 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2491 2495 mutex_enter(&vp->v_lock);
2492 2496 if (vp->v_count > 1) {
2493 2497 VN_RELE_LOCKED(vp);
2494 2498 mutex_exit(&vp->v_lock);
2495 2499 rw_exit(&rp->r_hashq->r_lock);
2496 2500 rw_enter(&rhtp->r_lock, RW_READER);
2497 2501 goto start;
2498 2502 }
2499 2503 mutex_exit(&vp->v_lock);
2500 2504 rp_rmhash_locked(rp);
2501 2505 rw_exit(&rp->r_hashq->r_lock);
2502 2506 }
2503 2507
2504 2508 rinactive(rp, cr);
2505 2509
2506 2510 mutex_enter(&vp->v_lock);
2507 2511 if (vp->v_count > 1) {
2508 2512 VN_RELE_LOCKED(vp);
2509 2513 mutex_exit(&vp->v_lock);
2510 2514 rw_enter(&rhtp->r_lock, RW_READER);
2511 2515 goto start;
2512 2516 }
2513 2517 mutex_exit(&vp->v_lock);
2514 2518 vn_invalid(vp);
2515 2519 /*
2516 2520 * destroy old locks before bzero'ing and
2517 2521 * recreating the locks below.
2518 2522 */
2519 2523 nfs_rw_destroy(&rp->r_rwlock);
2520 2524 nfs_rw_destroy(&rp->r_lkserlock);
2521 2525 mutex_destroy(&rp->r_statelock);
2522 2526 cv_destroy(&rp->r_cv);
2523 2527 cv_destroy(&rp->r_commit.c_cv);
2524 2528 nfs_free_r_path(rp);
2525 2529 avl_destroy(&rp->r_dir);
2526 2530 /*
2527 2531 		 * Make sure that if the rnode is recycled then
2528 2532 * VFS count is decremented properly before
2529 2533 * reuse.
2530 2534 */
2531 2535 VFS_RELE(vp->v_vfsp);
2532 2536 vn_reinit(vp);
2533 2537 } else {
2534 2538 vnode_t *new_vp;
2535 2539
2536 2540 mutex_exit(&rpfreelist_lock);
2537 2541
2538 2542 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2539 2543 new_vp = vn_alloc(KM_SLEEP);
2540 2544
2541 2545 atomic_inc_ulong((ulong_t *)&rnew);
2542 2546 #ifdef DEBUG
2543 2547 clstat_debug.nrnode.value.ui64++;
2544 2548 #endif
2545 2549 vp = new_vp;
2546 2550 }
2547 2551
2548 2552 bzero(rp, sizeof (*rp));
2549 2553 rp->r_vnode = vp;
2550 2554 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2551 2555 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2552 2556 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2553 2557 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2554 2558 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2555 2559 rp->r_fh.fh_len = fh->fh_len;
2556 2560 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2557 2561 rp->r_server = mi->mi_curr_serv;
2558 2562 if (FAILOVER_MOUNT(mi)) {
2559 2563 /*
2560 2564 * If replicated servers, stash pathnames
2561 2565 */
2562 2566 if (dnm != NULL && nm != NULL) {
2563 2567 char *s, *p;
2564 2568 uint_t len;
2565 2569
2566 2570 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2567 2571 rp->r_path = kmem_alloc(len, KM_SLEEP);
2568 2572 #ifdef DEBUG
2569 2573 clstat_debug.rpath.value.ui64 += len;
2570 2574 #endif
2571 2575 s = rp->r_path;
2572 2576 for (p = dnm; *p; p++)
2573 2577 *s++ = *p;
2574 2578 *s++ = '/';
2575 2579 for (p = nm; *p; p++)
2576 2580 *s++ = *p;
2577 2581 *s = '\0';
2578 2582 } else {
2579 2583 /* special case for root */
2580 2584 rp->r_path = kmem_alloc(2, KM_SLEEP);
2581 2585 #ifdef DEBUG
2582 2586 clstat_debug.rpath.value.ui64 += 2;
2583 2587 #endif
2584 2588 *rp->r_path = '.';
2585 2589 *(rp->r_path + 1) = '\0';
2586 2590 }
2587 2591 }
2588 2592 VFS_HOLD(vfsp);
2589 2593 rp->r_putapage = putapage;
2590 2594 rp->r_hashq = rhtp;
2591 2595 rp->r_flags = RREADDIRPLUS;
2592 2596 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2593 2597 offsetof(rddir_cache, tree));
2594 2598 vn_setops(vp, vops);
2595 2599 vp->v_data = (caddr_t)rp;
2596 2600 vp->v_vfsp = vfsp;
2597 2601 vp->v_type = VNON;
2598 2602 vp->v_flag |= VMODSORT;
2599 2603 nfs_set_vroot(vp);
2600 2604
2601 2605 /*
2602 2606 * There is a race condition if someone else
2603 2607 * alloc's the rnode while no locks are held, so we
2604 2608 * check again and recover if found.
2605 2609 */
2606 2610 rw_enter(&rhtp->r_lock, RW_WRITER);
2607 2611 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2608 2612 vp = RTOV(trp);
2609 2613 nfs_set_vroot(vp);
2610 2614 *newnode = 0;
2611 2615 rw_exit(&rhtp->r_lock);
2612 2616 rp_addfree(rp, cr);
2613 2617 rw_enter(&rhtp->r_lock, RW_READER);
2614 2618 return (vp);
2615 2619 }
2616 2620 rp_addhash(rp);
2617 2621 *newnode = 1;
2618 2622 return (vp);
2619 2623 }
2620 2624
2621 2625 /*
2622 2626 * Callback function to check if the page should be marked as
2623 2627 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2624 2628 */
2625 2629 int
2626 2630 nfs_setmod_check(page_t *pp)
2627 2631 {
2628 2632 if (pp->p_fsdata != C_NOCOMMIT) {
2629 2633 pp->p_fsdata = C_NOCOMMIT;
2630 2634 return (1);
2631 2635 }
2632 2636 return (0);
2633 2637 }
2634 2638
2635 2639 static void
2636 2640 nfs_set_vroot(vnode_t *vp)
2637 2641 {
2638 2642 rnode_t *rp;
2639 2643 nfs_fhandle *rootfh;
2640 2644
2641 2645 rp = VTOR(vp);
2642 2646 rootfh = &rp->r_server->sv_fhandle;
2643 2647 if (rootfh->fh_len == rp->r_fh.fh_len &&
2644 2648 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2645 2649 if (!(vp->v_flag & VROOT)) {
2646 2650 mutex_enter(&vp->v_lock);
2647 2651 vp->v_flag |= VROOT;
2648 2652 mutex_exit(&vp->v_lock);
2649 2653 }
2650 2654 }
2651 2655 }
2652 2656
2653 2657 static void
2654 2658 nfs_free_r_path(rnode_t *rp)
2655 2659 {
2656 2660 char *path;
2657 2661 size_t len;
2658 2662
2659 2663 path = rp->r_path;
2660 2664 if (path) {
2661 2665 rp->r_path = NULL;
2662 2666 len = strlen(path) + 1;
2663 2667 kmem_free(path, len);
2664 2668 #ifdef DEBUG
2665 2669 clstat_debug.rpath.value.ui64 -= len;
2666 2670 #endif
2667 2671 }
2668 2672 }
2669 2673
2670 2674 /*
2671 2675 * Put an rnode on the free list.
2672 2676 *
2673 2677 * Rnodes which were allocated above and beyond the normal limit
2674 2678 * are immediately freed.
2675 2679 */
2676 2680 void
2677 2681 rp_addfree(rnode_t *rp, cred_t *cr)
2678 2682 {
2679 2683 vnode_t *vp;
2680 2684 struct vfs *vfsp;
2681 2685
2682 2686 vp = RTOV(rp);
2683 2687 ASSERT(vp->v_count >= 1);
2684 2688 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2685 2689
2686 2690 /*
2687 2691 * If we have too many rnodes allocated and there are no
2688 2692 * references to this rnode, or if the rnode is no longer
2689 2693 * accessible by it does not reside in the hash queues,
2690 2694 	 * accessible because it does not reside in the hash queues,
2691 2695 * then just free it instead of putting it on the rnode
2692 2696 * freelist.
2693 2697 */
2694 2698 vfsp = vp->v_vfsp;
2695 2699 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2696 2700 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2697 2701 if (rp->r_flags & RHASHED) {
2698 2702 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2699 2703 mutex_enter(&vp->v_lock);
2700 2704 if (vp->v_count > 1) {
2701 2705 VN_RELE_LOCKED(vp);
2702 2706 mutex_exit(&vp->v_lock);
2703 2707 rw_exit(&rp->r_hashq->r_lock);
2704 2708 return;
2705 2709 }
2706 2710 mutex_exit(&vp->v_lock);
2707 2711 rp_rmhash_locked(rp);
2708 2712 rw_exit(&rp->r_hashq->r_lock);
2709 2713 }
2710 2714
2711 2715 rinactive(rp, cr);
2712 2716
2713 2717 /*
2714 2718 * Recheck the vnode reference count. We need to
2715 2719 * make sure that another reference has not been
2716 2720 * acquired while we were not holding v_lock. The
2717 2721 * rnode is not in the rnode hash queues, so the
2718 2722 * only way for a reference to have been acquired
2719 2723 * is for a VOP_PUTPAGE because the rnode was marked
2720 2724 * with RDIRTY or for a modified page. This
2721 2725 * reference may have been acquired before our call
2722 2726 * to rinactive. The i/o may have been completed,
2723 2727 * thus allowing rinactive to complete, but the
2724 2728 * reference to the vnode may not have been released
2725 2729 * yet. In any case, the rnode can not be destroyed
2726 2730 * until the other references to this vnode have been
2727 2731 * released. The other references will take care of
2728 2732 * either destroying the rnode or placing it on the
2729 2733 * rnode freelist. If there are no other references,
2730 2734 * then the rnode may be safely destroyed.
2731 2735 */
2732 2736 mutex_enter(&vp->v_lock);
2733 2737 if (vp->v_count > 1) {
2734 2738 VN_RELE_LOCKED(vp);
2735 2739 mutex_exit(&vp->v_lock);
2736 2740 return;
2737 2741 }
2738 2742 mutex_exit(&vp->v_lock);
2739 2743
2740 2744 destroy_rnode(rp);
2741 2745 return;
2742 2746 }
2743 2747
2744 2748 /*
2745 2749 * Lock the hash queue and then recheck the reference count
2746 2750 * to ensure that no other threads have acquired a reference
2747 2751 * to indicate that the rnode should not be placed on the
2748 2752 * freelist. If another reference has been acquired, then
2749 2753 * just release this one and let the other thread complete
2750 2754 * the processing of adding this rnode to the freelist.
2751 2755 */
2752 2756 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2753 2757
2754 2758 mutex_enter(&vp->v_lock);
2755 2759 if (vp->v_count > 1) {
2756 2760 VN_RELE_LOCKED(vp);
2757 2761 mutex_exit(&vp->v_lock);
2758 2762 rw_exit(&rp->r_hashq->r_lock);
2759 2763 return;
2760 2764 }
2761 2765 mutex_exit(&vp->v_lock);
2762 2766
2763 2767 /*
2764 2768 * If there is no cached data or metadata for this file, then
2765 2769 * put the rnode on the front of the freelist so that it will
2766 2770 * be reused before other rnodes which may have cached data or
2767 2771 * metadata associated with them.
2768 2772 */
2769 2773 mutex_enter(&rpfreelist_lock);
2770 2774 if (rpfreelist == NULL) {
2771 2775 rp->r_freef = rp;
2772 2776 rp->r_freeb = rp;
2773 2777 rpfreelist = rp;
2774 2778 } else {
2775 2779 rp->r_freef = rpfreelist;
2776 2780 rp->r_freeb = rpfreelist->r_freeb;
2777 2781 rpfreelist->r_freeb->r_freef = rp;
2778 2782 rpfreelist->r_freeb = rp;
2779 2783 if (!vn_has_cached_data(vp) &&
2780 2784 !HAVE_RDDIR_CACHE(rp) &&
2781 2785 rp->r_symlink.contents == NULL &&
2782 2786 rp->r_secattr == NULL &&
2783 2787 rp->r_pathconf == NULL)
2784 2788 rpfreelist = rp;
2785 2789 }
2786 2790 mutex_exit(&rpfreelist_lock);
2787 2791
2788 2792 rw_exit(&rp->r_hashq->r_lock);
2789 2793 }
2790 2794
2791 2795 /*
2792 2796 * Remove an rnode from the free list.
2793 2797 *
2794 2798 * The caller must be holding rpfreelist_lock and the rnode
2795 2799 * must be on the freelist.
2796 2800 */
2797 2801 static void
2798 2802 rp_rmfree(rnode_t *rp)
2799 2803 {
2800 2804
2801 2805 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2802 2806 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2803 2807
2804 2808 if (rp == rpfreelist) {
2805 2809 rpfreelist = rp->r_freef;
2806 2810 if (rp == rpfreelist)
2807 2811 rpfreelist = NULL;
2808 2812 }
2809 2813
2810 2814 rp->r_freeb->r_freef = rp->r_freef;
2811 2815 rp->r_freef->r_freeb = rp->r_freeb;
2812 2816
2813 2817 rp->r_freef = rp->r_freeb = NULL;
2814 2818 }
2815 2819
2816 2820 /*
2817 2821 * Put a rnode in the hash table.
2818 2822 *
2819 2823 * The caller must be holding the exclusive hash queue lock.
2820 2824 */
2821 2825 static void
2822 2826 rp_addhash(rnode_t *rp)
2823 2827 {
2824 2828
2825 2829 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2826 2830 ASSERT(!(rp->r_flags & RHASHED));
2827 2831
2828 2832 rp->r_hashf = rp->r_hashq->r_hashf;
2829 2833 rp->r_hashq->r_hashf = rp;
2830 2834 rp->r_hashb = (rnode_t *)rp->r_hashq;
2831 2835 rp->r_hashf->r_hashb = rp;
2832 2836
2833 2837 mutex_enter(&rp->r_statelock);
2834 2838 rp->r_flags |= RHASHED;
2835 2839 mutex_exit(&rp->r_statelock);
2836 2840 }
2837 2841
2838 2842 /*
2839 2843 * Remove a rnode from the hash table.
2840 2844 *
2841 2845 * The caller must be holding the hash queue lock.
2842 2846 */
2843 2847 static void
2844 2848 rp_rmhash_locked(rnode_t *rp)
2845 2849 {
2846 2850
2847 2851 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2848 2852 ASSERT(rp->r_flags & RHASHED);
2849 2853
2850 2854 rp->r_hashb->r_hashf = rp->r_hashf;
2851 2855 rp->r_hashf->r_hashb = rp->r_hashb;
2852 2856
2853 2857 mutex_enter(&rp->r_statelock);
2854 2858 rp->r_flags &= ~RHASHED;
2855 2859 mutex_exit(&rp->r_statelock);
2856 2860 }
2857 2861
2858 2862 /*
2859 2863 * Remove a rnode from the hash table.
2860 2864 *
2861 2865 * The caller must not be holding the hash queue lock.
2862 2866 */
2863 2867 void
2864 2868 rp_rmhash(rnode_t *rp)
2865 2869 {
2866 2870
2867 2871 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2868 2872 rp_rmhash_locked(rp);
2869 2873 rw_exit(&rp->r_hashq->r_lock);
2870 2874 }
2871 2875
2872 2876 /*
2873 2877 * Lookup a rnode by fhandle.
2874 2878 *
2875 2879 * The caller must be holding the hash queue lock, either shared or exclusive.
2876 2880 */
2877 2881 static rnode_t *
2878 2882 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2879 2883 {
2880 2884 rnode_t *rp;
2881 2885 vnode_t *vp;
2882 2886
2883 2887 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2884 2888
2885 2889 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2886 2890 vp = RTOV(rp);
2887 2891 if (vp->v_vfsp == vfsp &&
2888 2892 rp->r_fh.fh_len == fh->fh_len &&
2889 2893 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2890 2894 /*
2891 2895 * remove rnode from free list, if necessary.
2892 2896 */
2893 2897 if (rp->r_freef != NULL) {
2894 2898 mutex_enter(&rpfreelist_lock);
2895 2899 /*
2896 2900 * If the rnode is on the freelist,
2897 2901 * then remove it and use that reference
2898 2902 				 * as the new reference.  Otherwise, we
2899 2903 				 * need to increment the reference count.
2900 2904 */
2901 2905 if (rp->r_freef != NULL) {
2902 2906 rp_rmfree(rp);
2903 2907 mutex_exit(&rpfreelist_lock);
2904 2908 } else {
2905 2909 mutex_exit(&rpfreelist_lock);
2906 2910 VN_HOLD(vp);
2907 2911 }
2908 2912 } else
2909 2913 VN_HOLD(vp);
2910 2914 return (rp);
2911 2915 }
2912 2916 }
2913 2917 return (NULL);
2914 2918 }
2915 2919
2916 2920 /*
2917 2921  * Return 1 if there is an active vnode belonging to this vfs in the
2918 2922 * rtable cache.
2919 2923 *
2920 2924 * Several of these checks are done without holding the usual
2921 2925 * locks. This is safe because destroy_rtable(), rp_addfree(),
2922 2926 * etc. will redo the necessary checks before actually destroying
2923 2927 * any rnodes.
2924 2928 */
2925 2929 int
2926 2930 check_rtable(struct vfs *vfsp)
2927 2931 {
2928 2932 int index;
2929 2933 rnode_t *rp;
2930 2934 vnode_t *vp;
2931 2935
2932 2936 for (index = 0; index < rtablesize; index++) {
2933 2937 rw_enter(&rtable[index].r_lock, RW_READER);
2934 2938 for (rp = rtable[index].r_hashf;
2935 2939 rp != (rnode_t *)(&rtable[index]);
2936 2940 rp = rp->r_hashf) {
2937 2941 vp = RTOV(rp);
2938 2942 if (vp->v_vfsp == vfsp) {
2939 2943 if (rp->r_freef == NULL ||
2940 2944 (vn_has_cached_data(vp) &&
2941 2945 (rp->r_flags & RDIRTY)) ||
2942 2946 rp->r_count > 0) {
2943 2947 rw_exit(&rtable[index].r_lock);
2944 2948 return (1);
2945 2949 }
2946 2950 }
2947 2951 }
2948 2952 rw_exit(&rtable[index].r_lock);
2949 2953 }
2950 2954 return (0);
2951 2955 }
2952 2956
2953 2957 /*
2954 2958 * Destroy inactive vnodes from the hash queues which belong to this
2955 2959 * vfs. It is essential that we destroy all inactive vnodes during a
2956 2960 * forced unmount as well as during a normal unmount.
2957 2961 */
2958 2962 void
2959 2963 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2960 2964 {
2961 2965 int index;
2962 2966 rnode_t *rp;
2963 2967 rnode_t *rlist;
2964 2968 rnode_t *r_hashf;
2965 2969 vnode_t *vp;
2966 2970
2967 2971 rlist = NULL;
2968 2972
2969 2973 for (index = 0; index < rtablesize; index++) {
2970 2974 rw_enter(&rtable[index].r_lock, RW_WRITER);
2971 2975 for (rp = rtable[index].r_hashf;
2972 2976 rp != (rnode_t *)(&rtable[index]);
2973 2977 rp = r_hashf) {
2974 2978 /* save the hash pointer before destroying */
2975 2979 r_hashf = rp->r_hashf;
2976 2980 vp = RTOV(rp);
2977 2981 if (vp->v_vfsp == vfsp) {
2978 2982 mutex_enter(&rpfreelist_lock);
2979 2983 if (rp->r_freef != NULL) {
2980 2984 rp_rmfree(rp);
2981 2985 mutex_exit(&rpfreelist_lock);
2982 2986 rp_rmhash_locked(rp);
2983 2987 rp->r_hashf = rlist;
2984 2988 rlist = rp;
2985 2989 } else
2986 2990 mutex_exit(&rpfreelist_lock);
2987 2991 }
2988 2992 }
2989 2993 rw_exit(&rtable[index].r_lock);
2990 2994 }
2991 2995
2992 2996 for (rp = rlist; rp != NULL; rp = rlist) {
2993 2997 rlist = rp->r_hashf;
2994 2998 /*
2995 2999 * This call to rp_addfree will end up destroying the
2996 3000 * rnode, but in a safe way with the appropriate set
2997 3001 * of checks done.
2998 3002 */
2999 3003 rp_addfree(rp, cr);
3000 3004 }
3001 3005
3002 3006 }
3003 3007
3004 3008 /*
3005 3009 * This routine destroys all the resources associated with the rnode
3006 3010 * and then the rnode itself.
3007 3011 */
3008 3012 static void
3009 3013 destroy_rnode(rnode_t *rp)
3010 3014 {
3011 3015 vnode_t *vp;
3012 3016 vfs_t *vfsp;
3013 3017
3014 3018 vp = RTOV(rp);
3015 3019 vfsp = vp->v_vfsp;
3016 3020
3017 3021 ASSERT(vp->v_count == 1);
3018 3022 ASSERT(rp->r_count == 0);
3019 3023 ASSERT(rp->r_lmpl == NULL);
3020 3024 ASSERT(rp->r_mapcnt == 0);
3021 3025 ASSERT(!(rp->r_flags & RHASHED));
3022 3026 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3023 3027 atomic_dec_ulong((ulong_t *)&rnew);
3024 3028 #ifdef DEBUG
3025 3029 clstat_debug.nrnode.value.ui64--;
3026 3030 #endif
3027 3031 nfs_rw_destroy(&rp->r_rwlock);
3028 3032 nfs_rw_destroy(&rp->r_lkserlock);
3029 3033 mutex_destroy(&rp->r_statelock);
3030 3034 cv_destroy(&rp->r_cv);
3031 3035 cv_destroy(&rp->r_commit.c_cv);
3032 3036 if (rp->r_flags & RDELMAPLIST)
3033 3037 list_destroy(&rp->r_indelmap);
3034 3038 nfs_free_r_path(rp);
3035 3039 avl_destroy(&rp->r_dir);
3036 3040 vn_invalid(vp);
3037 3041 vn_free(vp);
3038 3042 kmem_cache_free(rnode_cache, rp);
3039 3043 VFS_RELE(vfsp);
3040 3044 }
3041 3045
3042 3046 /*
3043 3047 * Flush all vnodes in this (or every) vfs.
3044 3048 * Used by nfs_sync and by nfs_unmount.
3045 3049 */
3046 3050 void
3047 3051 rflush(struct vfs *vfsp, cred_t *cr)
3048 3052 {
3049 3053 int index;
3050 3054 rnode_t *rp;
3051 3055 vnode_t *vp, **vplist;
3052 3056 long num, cnt;
3053 3057
3054 3058 /*
3055 3059 * Check to see whether there is anything to do.
3056 3060 */
3057 3061 num = rnew;
3058 3062 if (num == 0)
3059 3063 return;
3060 3064
3061 3065 /*
3062 3066 * Allocate a slot for all currently active rnodes on the
3063 3067 * supposition that they all may need flushing.
3064 3068 */
3065 3069 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3066 3070 cnt = 0;
3067 3071
3068 3072 /*
3069 3073 * Walk the hash queues looking for rnodes with page
3070 3074 * lists associated with them. Make a list of these
3071 3075 * files.
3072 3076 */
3073 3077 for (index = 0; index < rtablesize; index++) {
3074 3078 rw_enter(&rtable[index].r_lock, RW_READER);
3075 3079 for (rp = rtable[index].r_hashf;
3076 3080 rp != (rnode_t *)(&rtable[index]);
3077 3081 rp = rp->r_hashf) {
3078 3082 vp = RTOV(rp);
3079 3083 /*
3080 3084 * Don't bother sync'ing a vp if it
3081 3085 			 * is part of a virtual swap device or
3082 3086 			 * if the VFS is read-only
3083 3087 */
3084 3088 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3085 3089 continue;
3086 3090 /*
3087 3091 			 * If we are flushing all mounted file systems or
3088 3092 			 * the vnode belongs to this vfs, and it has pages
3089 3093 * and is marked as either dirty or mmap'd,
3090 3094 * hold and add this vnode to the list of
3091 3095 * vnodes to flush.
3092 3096 */
3093 3097 if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3094 3098 vn_has_cached_data(vp) &&
3095 3099 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3096 3100 VN_HOLD(vp);
3097 3101 vplist[cnt++] = vp;
3098 3102 if (cnt == num) {
3099 3103 rw_exit(&rtable[index].r_lock);
3100 3104 goto toomany;
3101 3105 }
3102 3106 }
3103 3107 }
3104 3108 rw_exit(&rtable[index].r_lock);
3105 3109 }
3106 3110 toomany:
3107 3111
3108 3112 /*
3109 3113 * Flush and release all of the files on the list.
3110 3114 */
3111 3115 while (cnt-- > 0) {
3112 3116 vp = vplist[cnt];
3113 3117 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3114 3118 VN_RELE(vp);
3115 3119 }
3116 3120
3117 3121 /*
3118 3122 * Free the space allocated to hold the list.
3119 3123 */
3120 3124 kmem_free(vplist, num * sizeof (*vplist));
3121 3125 }
3122 3126
3123 3127 /*
3124 3128 * This probably needs to be larger than or equal to
3125 3129 * log2(sizeof (struct rnode)) due to the way that rnodes are
3126 3130 * allocated.
3127 3131 */
3128 3132 #define ACACHE_SHIFT_BITS 9
3129 3133
3130 3134 static int
3131 3135 acachehash(rnode_t *rp, cred_t *cr)
3132 3136 {
3133 3137
3134 3138 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3135 3139 acachemask);
3136 3140 }
3137 3141
3138 3142 #ifdef DEBUG
3139 3143 static long nfs_access_cache_hits = 0;
3140 3144 static long nfs_access_cache_misses = 0;
3141 3145 #endif
3142 3146
3143 3147 nfs_access_type_t
3144 3148 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3145 3149 {
3146 3150 vnode_t *vp;
3147 3151 acache_t *ap;
3148 3152 acache_hash_t *hp;
3149 3153 nfs_access_type_t all;
3150 3154
3151 3155 vp = RTOV(rp);
3152 3156 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3153 3157 return (NFS_ACCESS_UNKNOWN);
3154 3158
3155 3159 if (rp->r_acache != NULL) {
3156 3160 hp = &acache[acachehash(rp, cr)];
3157 3161 rw_enter(&hp->lock, RW_READER);
3158 3162 ap = hp->next;
3159 3163 while (ap != (acache_t *)hp) {
3160 3164 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3161 3165 if ((ap->known & acc) == acc) {
3162 3166 #ifdef DEBUG
3163 3167 nfs_access_cache_hits++;
3164 3168 #endif
3165 3169 if ((ap->allowed & acc) == acc)
3166 3170 all = NFS_ACCESS_ALLOWED;
3167 3171 else
3168 3172 all = NFS_ACCESS_DENIED;
3169 3173 } else {
3170 3174 #ifdef DEBUG
3171 3175 nfs_access_cache_misses++;
3172 3176 #endif
3173 3177 all = NFS_ACCESS_UNKNOWN;
3174 3178 }
3175 3179 rw_exit(&hp->lock);
3176 3180 return (all);
3177 3181 }
3178 3182 ap = ap->next;
3179 3183 }
3180 3184 rw_exit(&hp->lock);
3181 3185 }
3182 3186
3183 3187 #ifdef DEBUG
3184 3188 nfs_access_cache_misses++;
3185 3189 #endif
3186 3190 return (NFS_ACCESS_UNKNOWN);
3187 3191 }
3188 3192
3189 3193 void
3190 3194 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3191 3195 {
3192 3196 acache_t *ap;
3193 3197 acache_t *nap;
3194 3198 acache_hash_t *hp;
3195 3199
3196 3200 hp = &acache[acachehash(rp, cr)];
3197 3201
3198 3202 /*
3199 3203 	 * Allocate now, assuming that an allocation will usually be
3200 3204 * required. This allows the allocation to happen without
3201 3205 * holding the hash bucket locked.
3202 3206 */
3203 3207 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3204 3208 if (nap != NULL) {
3205 3209 nap->known = acc;
3206 3210 nap->allowed = resacc;
3207 3211 nap->rnode = rp;
3208 3212 crhold(cr);
3209 3213 nap->cred = cr;
3210 3214 nap->hashq = hp;
3211 3215 }
3212 3216
3213 3217 rw_enter(&hp->lock, RW_WRITER);
3214 3218
3215 3219 if (rp->r_acache != NULL) {
3216 3220 ap = hp->next;
3217 3221 while (ap != (acache_t *)hp) {
3218 3222 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3219 3223 ap->known |= acc;
3220 3224 ap->allowed &= ~acc;
3221 3225 ap->allowed |= resacc;
3222 3226 rw_exit(&hp->lock);
3223 3227 if (nap != NULL) {
3224 3228 crfree(nap->cred);
3225 3229 kmem_cache_free(acache_cache, nap);
3226 3230 }
3227 3231 return;
3228 3232 }
3229 3233 ap = ap->next;
3230 3234 }
3231 3235 }
3232 3236
3233 3237 if (nap != NULL) {
3234 3238 #ifdef DEBUG
3235 3239 clstat_debug.access.value.ui64++;
3236 3240 #endif
3237 3241 nap->next = hp->next;
3238 3242 hp->next = nap;
3239 3243 nap->next->prev = nap;
3240 3244 nap->prev = (acache_t *)hp;
3241 3245
3242 3246 mutex_enter(&rp->r_statelock);
3243 3247 nap->list = rp->r_acache;
3244 3248 rp->r_acache = nap;
3245 3249 mutex_exit(&rp->r_statelock);
3246 3250 }
3247 3251
3248 3252 rw_exit(&hp->lock);
3249 3253 }
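
nfs_access_check() and nfs_access_cache() are meant to bracket the over-the-wire ACCESS request: consult the cache first, go over the wire only on NFS_ACCESS_UNKNOWN, then record what the server answered. A hedged sketch of that pattern follows; the over-the-wire helper name is a hypothetical stand-in, not a function defined in this file.

/*
 * Illustrative sketch of the check/fill pattern; sketch_access_otw() is
 * a hypothetical stand-in for the real over-the-wire ACCESS call.
 */
static int
sketch_access(rnode_t *rp, uint32_t acc, cred_t *cr)
{
	uint32_t resacc;
	int error;

	switch (nfs_access_check(rp, acc, cr)) {
	case NFS_ACCESS_ALLOWED:
		return (0);
	case NFS_ACCESS_DENIED:
		return (EACCES);
	default:				/* NFS_ACCESS_UNKNOWN */
		break;
	}

	error = sketch_access_otw(rp, acc, cr, &resacc);	/* hypothetical */
	if (error == 0) {
		nfs_access_cache(rp, acc, resacc, cr);
		if ((resacc & acc) != acc)
			error = EACCES;
	}
	return (error);
}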
3250 3254
3251 3255 int
3252 3256 nfs_access_purge_rp(rnode_t *rp)
3253 3257 {
3254 3258 acache_t *ap;
3255 3259 acache_t *tmpap;
3256 3260 acache_t *rplist;
3257 3261
3258 3262 /*
3259 3263 * If there aren't any cached entries, then there is nothing
3260 3264 * to free.
3261 3265 */
3262 3266 if (rp->r_acache == NULL)
3263 3267 return (0);
3264 3268
3265 3269 mutex_enter(&rp->r_statelock);
3266 3270 rplist = rp->r_acache;
3267 3271 rp->r_acache = NULL;
3268 3272 mutex_exit(&rp->r_statelock);
3269 3273
3270 3274 /*
3271 3275 	 * Loop through each entry in the list pointed to by the
3272 3276 * rnode. Remove each of these entries from the hash
3273 3277 * queue that it is on and remove it from the list in
3274 3278 * the rnode.
3275 3279 */
3276 3280 for (ap = rplist; ap != NULL; ap = tmpap) {
3277 3281 rw_enter(&ap->hashq->lock, RW_WRITER);
3278 3282 ap->prev->next = ap->next;
3279 3283 ap->next->prev = ap->prev;
3280 3284 rw_exit(&ap->hashq->lock);
3281 3285
3282 3286 tmpap = ap->list;
3283 3287 crfree(ap->cred);
3284 3288 kmem_cache_free(acache_cache, ap);
3285 3289 #ifdef DEBUG
3286 3290 clstat_debug.access.value.ui64--;
3287 3291 #endif
3288 3292 }
3289 3293
3290 3294 return (1);
3291 3295 }
3292 3296
3293 3297 static const char prefix[] = ".nfs";
3294 3298
3295 3299 static kmutex_t newnum_lock;
3296 3300
3297 3301 int
3298 3302 newnum(void)
3299 3303 {
3300 3304 static uint_t newnum = 0;
3301 3305 uint_t id;
3302 3306
3303 3307 mutex_enter(&newnum_lock);
3304 3308 if (newnum == 0)
3305 3309 newnum = gethrestime_sec() & 0xffff;
3306 3310 id = newnum++;
3307 3311 mutex_exit(&newnum_lock);
3308 3312 return (id);
3309 3313 }
3310 3314
3311 3315 char *
3312 3316 newname(void)
3313 3317 {
3314 3318 char *news;
3315 3319 char *s;
3316 3320 const char *p;
3317 3321 uint_t id;
3318 3322
3319 3323 id = newnum();
3320 3324 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3321 3325 s = news;
3322 3326 p = prefix;
3323 3327 while (*p != '\0')
3324 3328 *s++ = *p++;
3325 3329 while (id != 0) {
3326 3330 *s++ = "0123456789ABCDEF"[id & 0x0f];
3327 3331 id >>= 4;
3328 3332 }
3329 3333 *s = '\0';
3330 3334 return (news);
3331 3335 }
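
One subtlety in newname(): the id's hex digits are emitted least-significant nibble first, so an id of 0x1234 produces ".nfs4321" rather than ".nfs1234". The names only need to be distinct, so the order is harmless; the user-space snippet below simply reproduces the loop to make the behaviour concrete.

/* Illustrative user-space reproduction of the digit loop in newname(). */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned int id = 0x1234;	/* example id */
	char buf[32];
	char *s;

	(void) strcpy(buf, ".nfs");
	s = buf + strlen(buf);
	while (id != 0) {
		*s++ = "0123456789ABCDEF"[id & 0x0f];
		id >>= 4;
	}
	*s = '\0';
	(void) printf("%s\n", buf);	/* prints ".nfs4321" */
	return (0);
}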
3332 3336
3333 3337 /*
3334 3338 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3335 3339 * framework.
3336 3340 */
3337 3341 static int
3338 3342 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3339 3343 {
3340 3344 ksp->ks_snaptime = gethrtime();
3341 3345 if (rw == KSTAT_WRITE) {
3342 3346 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3343 3347 #ifdef DEBUG
3344 3348 /*
3345 3349 * Currently only the global zone can write to kstats, but we
3346 3350 * add the check just for paranoia.
3347 3351 */
3348 3352 if (INGLOBALZONE(curproc))
3349 3353 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3350 3354 sizeof (clstat_debug));
3351 3355 #endif
3352 3356 } else {
3353 3357 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3354 3358 #ifdef DEBUG
3355 3359 /*
3356 3360 * If we're displaying the "global" debug kstat values, we
3357 3361 * display them as-is to all zones since in fact they apply to
3358 3362 * the system as a whole.
3359 3363 */
3360 3364 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3361 3365 sizeof (clstat_debug));
3362 3366 #endif
3363 3367 }
3364 3368 return (0);
3365 3369 }
3366 3370
3367 3371 static void *
3368 3372 clinit_zone(zoneid_t zoneid)
3369 3373 {
3370 3374 kstat_t *nfs_client_kstat;
3371 3375 struct nfs_clnt *nfscl;
3372 3376 uint_t ndata;
3373 3377
3374 3378 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3375 3379 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3376 3380 nfscl->nfscl_chtable = NULL;
3377 3381 nfscl->nfscl_zoneid = zoneid;
3378 3382
3379 3383 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3380 3384 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3381 3385 #ifdef DEBUG
3382 3386 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3383 3387 #endif
3384 3388 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3385 3389 "misc", KSTAT_TYPE_NAMED, ndata,
3386 3390 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3387 3391 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3388 3392 nfs_client_kstat->ks_snapshot = cl_snapshot;
3389 3393 kstat_install(nfs_client_kstat);
3390 3394 }
3391 3395 mutex_enter(&nfs_clnt_list_lock);
3392 3396 list_insert_head(&nfs_clnt_list, nfscl);
3393 3397 mutex_exit(&nfs_clnt_list_lock);
3394 3398 return (nfscl);
3395 3399 }
3396 3400
3397 3401 /*ARGSUSED*/
3398 3402 static void
3399 3403 clfini_zone(zoneid_t zoneid, void *arg)
3400 3404 {
3401 3405 struct nfs_clnt *nfscl = arg;
3402 3406 chhead_t *chp, *next;
3403 3407
3404 3408 if (nfscl == NULL)
3405 3409 return;
3406 3410 mutex_enter(&nfs_clnt_list_lock);
3407 3411 list_remove(&nfs_clnt_list, nfscl);
3408 3412 mutex_exit(&nfs_clnt_list_lock);
3409 3413 clreclaim_zone(nfscl, 0);
3410 3414 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3411 3415 ASSERT(chp->ch_list == NULL);
3412 3416 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3413 3417 next = chp->ch_next;
3414 3418 kmem_free(chp, sizeof (*chp));
3415 3419 }
3416 3420 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3417 3421 mutex_destroy(&nfscl->nfscl_chtable_lock);
3418 3422 kmem_free(nfscl, sizeof (*nfscl));
3419 3423 }
3420 3424
3421 3425 /*
3422 3426 * Called by endpnt_destructor to make sure the client handles are
3423 3427 * cleaned up before the RPC endpoints. This becomes a no-op if
3424 3428 * clfini_zone (above) is called first. This function is needed
3425 3429 * (rather than relying on clfini_zone to clean up) because the ZSD
3426 3430 * callbacks have no ordering mechanism, so we have no way to ensure
3427 3431 * that clfini_zone is called before endpnt_destructor.
3428 3432 */
3429 3433 void
3430 3434 clcleanup_zone(zoneid_t zoneid)
3431 3435 {
3432 3436 struct nfs_clnt *nfscl;
3433 3437
3434 3438 mutex_enter(&nfs_clnt_list_lock);
3435 3439 nfscl = list_head(&nfs_clnt_list);
3436 3440 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3437 3441 if (nfscl->nfscl_zoneid == zoneid) {
3438 3442 clreclaim_zone(nfscl, 0);
3439 3443 break;
3440 3444 }
3441 3445 }
3442 3446 mutex_exit(&nfs_clnt_list_lock);
3443 3447 }
3444 3448
3445 3449 int
3446 3450 nfs_subrinit(void)
3447 3451 {
3448 3452 int i;
3449 3453 ulong_t nrnode_max;
3450 3454
3451 3455 /*
3452 3456 * Allocate and initialize the rnode hash queues
3453 3457 */
3454 3458 if (nrnode <= 0)
3455 3459 nrnode = ncsize;
3456 3460 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3457 3461 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3458 3462 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3459 3463 "!setting nrnode to max value of %ld", nrnode_max);
3460 3464 nrnode = nrnode_max;
3461 3465 }
3462 3466
3463 3467 rtablesize = 1 << highbit(nrnode / hashlen);
3464 3468 rtablemask = rtablesize - 1;
3465 3469 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3466 3470 for (i = 0; i < rtablesize; i++) {
3467 3471 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3468 3472 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3469 3473 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3470 3474 }
3471 3475 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3472 3476 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3473 3477
3474 3478 /*
3475 3479 * Allocate and initialize the access cache
3476 3480 */
3477 3481
3478 3482 /*
3479 3483 	 * Initial guess is one access cache entry per rnode.  If
3480 3484 	 * nacache is set to a non-zero value, it is used instead as
3481 3485 	 * the guess at the number of access cache entries.
3482 3486 */
3483 3487 if (nacache > 0)
3484 3488 acachesize = 1 << highbit(nacache / hashlen);
3485 3489 else
3486 3490 acachesize = rtablesize;
3487 3491 acachemask = acachesize - 1;
3488 3492 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3489 3493 for (i = 0; i < acachesize; i++) {
3490 3494 acache[i].next = (acache_t *)&acache[i];
3491 3495 acache[i].prev = (acache_t *)&acache[i];
3492 3496 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3493 3497 }
3494 3498 acache_cache = kmem_cache_create("nfs_access_cache",
3495 3499 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3496 3500 /*
3497 3501 * Allocate and initialize the client handle cache
3498 3502 */
3499 3503 chtab_cache = kmem_cache_create("client_handle_cache",
3500 3504 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3501 3505 /*
3502 3506 * Initialize the list of per-zone client handles (and associated data).
3503 3507 * This needs to be done before we call zone_key_create().
3504 3508 */
3505 3509 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3506 3510 offsetof(struct nfs_clnt, nfscl_node));
3507 3511 /*
3508 3512 * Initialize the zone_key for per-zone client handle lists.
3509 3513 */
3510 3514 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3511 3515 /*
3512 3516 * Initialize the various mutexes and reader/writer locks
3513 3517 */
3514 3518 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3515 3519 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3516 3520 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3517 3521
3518 3522 /*
3519 3523 * Assign unique major number for all nfs mounts
3520 3524 */
3521 3525 if ((nfs_major = getudev()) == -1) {
3522 3526 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3523 3527 "nfs: init: can't get unique device number");
3524 3528 nfs_major = 0;
3525 3529 }
3526 3530 nfs_minor = 0;
3527 3531
3528 3532 if (nfs3_jukebox_delay == 0)
3529 3533 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3530 3534
3531 3535 return (0);
3532 3536 }
3533 3537
3534 3538 void
3535 3539 nfs_subrfini(void)
3536 3540 {
3537 3541 int i;
3538 3542
3539 3543 /*
3540 3544 * Deallocate the rnode hash queues
3541 3545 */
3542 3546 kmem_cache_destroy(rnode_cache);
3543 3547
3544 3548 for (i = 0; i < rtablesize; i++)
3545 3549 rw_destroy(&rtable[i].r_lock);
3546 3550 kmem_free(rtable, rtablesize * sizeof (*rtable));
3547 3551
3548 3552 /*
3549 3553 	 * Deallocate the access cache
3550 3554 */
3551 3555 kmem_cache_destroy(acache_cache);
3552 3556
3553 3557 for (i = 0; i < acachesize; i++)
3554 3558 rw_destroy(&acache[i].lock);
3555 3559 kmem_free(acache, acachesize * sizeof (*acache));
3556 3560
3557 3561 /*
3558 3562 * Deallocate the client handle cache
3559 3563 */
3560 3564 kmem_cache_destroy(chtab_cache);
3561 3565
3562 3566 /*
3563 3567 * Destroy the various mutexes and reader/writer locks
3564 3568 */
3565 3569 mutex_destroy(&rpfreelist_lock);
3566 3570 mutex_destroy(&newnum_lock);
3567 3571 mutex_destroy(&nfs_minor_lock);
3568 3572 (void) zone_key_delete(nfsclnt_zone_key);
3569 3573 }
3570 3574
3571 3575 enum nfsstat
3572 3576 puterrno(int error)
3573 3577 {
3574 3578
3575 3579 switch (error) {
3576 3580 case EOPNOTSUPP:
3577 3581 return (NFSERR_OPNOTSUPP);
3578 3582 case ENAMETOOLONG:
3579 3583 return (NFSERR_NAMETOOLONG);
3580 3584 case ENOTEMPTY:
3581 3585 return (NFSERR_NOTEMPTY);
3582 3586 case EDQUOT:
3583 3587 return (NFSERR_DQUOT);
3584 3588 case ESTALE:
3585 3589 return (NFSERR_STALE);
3586 3590 case EREMOTE:
3587 3591 return (NFSERR_REMOTE);
3588 3592 case ENOSYS:
3589 3593 return (NFSERR_OPNOTSUPP);
3590 3594 case EOVERFLOW:
3591 3595 return (NFSERR_INVAL);
3592 3596 default:
3593 3597 return ((enum nfsstat)error);
3594 3598 }
3595 3599 /* NOTREACHED */
3596 3600 }
3597 3601
3598 3602 int
3599 3603 geterrno(enum nfsstat status)
3600 3604 {
3601 3605
3602 3606 switch (status) {
3603 3607 case NFSERR_OPNOTSUPP:
3604 3608 return (EOPNOTSUPP);
3605 3609 case NFSERR_NAMETOOLONG:
3606 3610 return (ENAMETOOLONG);
3607 3611 case NFSERR_NOTEMPTY:
3608 3612 return (ENOTEMPTY);
3609 3613 case NFSERR_DQUOT:
3610 3614 return (EDQUOT);
3611 3615 case NFSERR_STALE:
3612 3616 return (ESTALE);
3613 3617 case NFSERR_REMOTE:
3614 3618 return (EREMOTE);
3615 3619 case NFSERR_WFLUSH:
3616 3620 return (EIO);
3617 3621 default:
3618 3622 return ((int)status);
3619 3623 }
3620 3624 /* NOTREACHED */
3621 3625 }
3622 3626
3623 3627 enum nfsstat3
3624 3628 puterrno3(int error)
3625 3629 {
3626 3630
3627 3631 #ifdef DEBUG
3628 3632 switch (error) {
3629 3633 case 0:
3630 3634 return (NFS3_OK);
3631 3635 case EPERM:
3632 3636 return (NFS3ERR_PERM);
3633 3637 case ENOENT:
3634 3638 return (NFS3ERR_NOENT);
3635 3639 case EIO:
3636 3640 return (NFS3ERR_IO);
3637 3641 case ENXIO:
3638 3642 return (NFS3ERR_NXIO);
3639 3643 case EACCES:
3640 3644 return (NFS3ERR_ACCES);
3641 3645 case EEXIST:
3642 3646 return (NFS3ERR_EXIST);
3643 3647 case EXDEV:
3644 3648 return (NFS3ERR_XDEV);
3645 3649 case ENODEV:
3646 3650 return (NFS3ERR_NODEV);
3647 3651 case ENOTDIR:
3648 3652 return (NFS3ERR_NOTDIR);
3649 3653 case EISDIR:
3650 3654 return (NFS3ERR_ISDIR);
3651 3655 case EINVAL:
3652 3656 return (NFS3ERR_INVAL);
3653 3657 case EFBIG:
3654 3658 return (NFS3ERR_FBIG);
3655 3659 case ENOSPC:
3656 3660 return (NFS3ERR_NOSPC);
3657 3661 case EROFS:
3658 3662 return (NFS3ERR_ROFS);
3659 3663 case EMLINK:
3660 3664 return (NFS3ERR_MLINK);
3661 3665 case ENAMETOOLONG:
3662 3666 return (NFS3ERR_NAMETOOLONG);
3663 3667 case ENOTEMPTY:
3664 3668 return (NFS3ERR_NOTEMPTY);
3665 3669 case EDQUOT:
3666 3670 return (NFS3ERR_DQUOT);
3667 3671 case ESTALE:
3668 3672 return (NFS3ERR_STALE);
3669 3673 case EREMOTE:
3670 3674 return (NFS3ERR_REMOTE);
3671 3675 case ENOSYS:
3672 3676 case EOPNOTSUPP:
3673 3677 return (NFS3ERR_NOTSUPP);
3674 3678 case EOVERFLOW:
3675 3679 return (NFS3ERR_INVAL);
3676 3680 default:
3677 3681 zcmn_err(getzoneid(), CE_WARN,
3678 3682 "puterrno3: got error %d", error);
3679 3683 return ((enum nfsstat3)error);
3680 3684 }
3681 3685 #else
3682 3686 switch (error) {
3683 3687 case ENAMETOOLONG:
3684 3688 return (NFS3ERR_NAMETOOLONG);
3685 3689 case ENOTEMPTY:
3686 3690 return (NFS3ERR_NOTEMPTY);
3687 3691 case EDQUOT:
3688 3692 return (NFS3ERR_DQUOT);
3689 3693 case ESTALE:
3690 3694 return (NFS3ERR_STALE);
3691 3695 case ENOSYS:
3692 3696 case EOPNOTSUPP:
3693 3697 return (NFS3ERR_NOTSUPP);
3694 3698 case EREMOTE:
3695 3699 return (NFS3ERR_REMOTE);
3696 3700 case EOVERFLOW:
3697 3701 return (NFS3ERR_INVAL);
3698 3702 default:
3699 3703 return ((enum nfsstat3)error);
3700 3704 }
3701 3705 #endif
3702 3706 }
3703 3707
3704 3708 int
3705 3709 geterrno3(enum nfsstat3 status)
3706 3710 {
3707 3711
3708 3712 #ifdef DEBUG
3709 3713 switch (status) {
3710 3714 case NFS3_OK:
3711 3715 return (0);
3712 3716 case NFS3ERR_PERM:
3713 3717 return (EPERM);
3714 3718 case NFS3ERR_NOENT:
3715 3719 return (ENOENT);
3716 3720 case NFS3ERR_IO:
3717 3721 return (EIO);
3718 3722 case NFS3ERR_NXIO:
3719 3723 return (ENXIO);
3720 3724 case NFS3ERR_ACCES:
3721 3725 return (EACCES);
3722 3726 case NFS3ERR_EXIST:
3723 3727 return (EEXIST);
3724 3728 case NFS3ERR_XDEV:
3725 3729 return (EXDEV);
3726 3730 case NFS3ERR_NODEV:
3727 3731 return (ENODEV);
3728 3732 case NFS3ERR_NOTDIR:
3729 3733 return (ENOTDIR);
3730 3734 case NFS3ERR_ISDIR:
3731 3735 return (EISDIR);
3732 3736 case NFS3ERR_INVAL:
3733 3737 return (EINVAL);
3734 3738 case NFS3ERR_FBIG:
3735 3739 return (EFBIG);
3736 3740 case NFS3ERR_NOSPC:
3737 3741 return (ENOSPC);
3738 3742 case NFS3ERR_ROFS:
3739 3743 return (EROFS);
3740 3744 case NFS3ERR_MLINK:
3741 3745 return (EMLINK);
3742 3746 case NFS3ERR_NAMETOOLONG:
3743 3747 return (ENAMETOOLONG);
3744 3748 case NFS3ERR_NOTEMPTY:
3745 3749 return (ENOTEMPTY);
3746 3750 case NFS3ERR_DQUOT:
3747 3751 return (EDQUOT);
3748 3752 case NFS3ERR_STALE:
3749 3753 return (ESTALE);
3750 3754 case NFS3ERR_REMOTE:
3751 3755 return (EREMOTE);
3752 3756 case NFS3ERR_BADHANDLE:
3753 3757 return (ESTALE);
3754 3758 case NFS3ERR_NOT_SYNC:
3755 3759 return (EINVAL);
3756 3760 case NFS3ERR_BAD_COOKIE:
3757 3761 return (ENOENT);
3758 3762 case NFS3ERR_NOTSUPP:
3759 3763 return (EOPNOTSUPP);
3760 3764 case NFS3ERR_TOOSMALL:
3761 3765 return (EINVAL);
3762 3766 case NFS3ERR_SERVERFAULT:
3763 3767 return (EIO);
3764 3768 case NFS3ERR_BADTYPE:
3765 3769 return (EINVAL);
3766 3770 case NFS3ERR_JUKEBOX:
3767 3771 return (ENXIO);
3768 3772 default:
3769 3773 zcmn_err(getzoneid(), CE_WARN,
3770 3774 "geterrno3: got status %d", status);
3771 3775 return ((int)status);
3772 3776 }
3773 3777 #else
3774 3778 switch (status) {
3775 3779 case NFS3ERR_NAMETOOLONG:
3776 3780 return (ENAMETOOLONG);
3777 3781 case NFS3ERR_NOTEMPTY:
3778 3782 return (ENOTEMPTY);
3779 3783 case NFS3ERR_DQUOT:
3780 3784 return (EDQUOT);
3781 3785 case NFS3ERR_STALE:
3782 3786 case NFS3ERR_BADHANDLE:
3783 3787 return (ESTALE);
3784 3788 case NFS3ERR_NOTSUPP:
3785 3789 return (EOPNOTSUPP);
3786 3790 case NFS3ERR_REMOTE:
3787 3791 return (EREMOTE);
3788 3792 case NFS3ERR_NOT_SYNC:
3789 3793 case NFS3ERR_TOOSMALL:
3790 3794 case NFS3ERR_BADTYPE:
3791 3795 return (EINVAL);
3792 3796 case NFS3ERR_BAD_COOKIE:
3793 3797 return (ENOENT);
3794 3798 case NFS3ERR_SERVERFAULT:
3795 3799 return (EIO);
3796 3800 case NFS3ERR_JUKEBOX:
3797 3801 return (ENXIO);
3798 3802 default:
3799 3803 return ((int)status);
3800 3804 }
3801 3805 #endif
3802 3806 }
3803 3807
3804 3808 rddir_cache *
3805 3809 rddir_cache_alloc(int flags)
3806 3810 {
3807 3811 rddir_cache *rc;
3808 3812
3809 3813 rc = kmem_alloc(sizeof (*rc), flags);
3810 3814 if (rc != NULL) {
3811 3815 rc->entries = NULL;
3812 3816 rc->flags = RDDIR;
3813 3817 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3814 3818 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3815 3819 rc->count = 1;
3816 3820 #ifdef DEBUG
3817 3821 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3818 3822 #endif
3819 3823 }
3820 3824 return (rc);
3821 3825 }
3822 3826
3823 3827 static void
3824 3828 rddir_cache_free(rddir_cache *rc)
3825 3829 {
3826 3830
3827 3831 #ifdef DEBUG
3828 3832 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3829 3833 #endif
3830 3834 if (rc->entries != NULL) {
3831 3835 #ifdef DEBUG
3832 3836 rddir_cache_buf_free(rc->entries, rc->buflen);
3833 3837 #else
3834 3838 kmem_free(rc->entries, rc->buflen);
3835 3839 #endif
3836 3840 }
3837 3841 cv_destroy(&rc->cv);
3838 3842 mutex_destroy(&rc->lock);
3839 3843 kmem_free(rc, sizeof (*rc));
3840 3844 }
3841 3845
3842 3846 void
3843 3847 rddir_cache_hold(rddir_cache *rc)
3844 3848 {
3845 3849
3846 3850 mutex_enter(&rc->lock);
3847 3851 rc->count++;
3848 3852 mutex_exit(&rc->lock);
3849 3853 }
3850 3854
3851 3855 void
3852 3856 rddir_cache_rele(rddir_cache *rc)
3853 3857 {
3854 3858
3855 3859 mutex_enter(&rc->lock);
3856 3860 ASSERT(rc->count > 0);
3857 3861 if (--rc->count == 0) {
3858 3862 mutex_exit(&rc->lock);
3859 3863 rddir_cache_free(rc);
3860 3864 } else
3861 3865 mutex_exit(&rc->lock);
3862 3866 }
3863 3867
3864 3868 #ifdef DEBUG
3865 3869 char *
3866 3870 rddir_cache_buf_alloc(size_t size, int flags)
3867 3871 {
3868 3872 char *rc;
3869 3873
3870 3874 rc = kmem_alloc(size, flags);
3871 3875 if (rc != NULL)
3872 3876 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3873 3877 return (rc);
3874 3878 }
3875 3879
3876 3880 void
3877 3881 rddir_cache_buf_free(void *addr, size_t size)
3878 3882 {
3879 3883
3880 3884 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3881 3885 kmem_free(addr, size);
3882 3886 }
3883 3887 #endif
3884 3888
3885 3889 static int
3886 3890 nfs_free_data_reclaim(rnode_t *rp)
3887 3891 {
3888 3892 char *contents;
3889 3893 int size;
3890 3894 vsecattr_t *vsp;
3891 3895 nfs3_pathconf_info *info;
3892 3896 int freed;
3893 3897 cred_t *cred;
3894 3898
3895 3899 /*
3896 3900 * Free any held credentials and caches which
3897 3901 * may be associated with this rnode.
3898 3902 */
3899 3903 mutex_enter(&rp->r_statelock);
3900 3904 cred = rp->r_cred;
3901 3905 rp->r_cred = NULL;
3902 3906 contents = rp->r_symlink.contents;
3903 3907 size = rp->r_symlink.size;
3904 3908 rp->r_symlink.contents = NULL;
3905 3909 vsp = rp->r_secattr;
3906 3910 rp->r_secattr = NULL;
3907 3911 info = rp->r_pathconf;
3908 3912 rp->r_pathconf = NULL;
3909 3913 mutex_exit(&rp->r_statelock);
3910 3914
3911 3915 if (cred != NULL)
3912 3916 crfree(cred);
3913 3917
3914 3918 /*
3915 3919 * Free the access cache entries.
3916 3920 */
3917 3921 freed = nfs_access_purge_rp(rp);
3918 3922
3919 3923 if (!HAVE_RDDIR_CACHE(rp) &&
3920 3924 contents == NULL &&
3921 3925 vsp == NULL &&
3922 3926 info == NULL)
3923 3927 return (freed);
3924 3928
3925 3929 /*
3926 3930 * Free the readdir cache entries
3927 3931 */
3928 3932 if (HAVE_RDDIR_CACHE(rp))
3929 3933 nfs_purge_rddir_cache(RTOV(rp));
3930 3934
3931 3935 /*
3932 3936 * Free the symbolic link cache.
3933 3937 */
3934 3938 if (contents != NULL) {
3935 3939
3936 3940 kmem_free((void *)contents, size);
3937 3941 }
3938 3942
3939 3943 /*
3940 3944 * Free any cached ACL.
3941 3945 */
3942 3946 if (vsp != NULL)
3943 3947 nfs_acl_free(vsp);
3944 3948
3945 3949 /*
3946 3950 * Free any cached pathconf information.
3947 3951 */
3948 3952 if (info != NULL)
3949 3953 kmem_free(info, sizeof (*info));
3950 3954
3951 3955 return (1);
3952 3956 }
3953 3957
3954 3958 static int
3955 3959 nfs_active_data_reclaim(rnode_t *rp)
3956 3960 {
3957 3961 char *contents;
3958 3962 int size;
3959 3963 vsecattr_t *vsp;
3960 3964 nfs3_pathconf_info *info;
3961 3965 int freed;
3962 3966
3963 3967 /*
3964 3968 * Free any held credentials and caches which
3965 3969 * may be associated with this rnode.
3966 3970 */
3967 3971 if (!mutex_tryenter(&rp->r_statelock))
3968 3972 return (0);
3969 3973 contents = rp->r_symlink.contents;
3970 3974 size = rp->r_symlink.size;
3971 3975 rp->r_symlink.contents = NULL;
3972 3976 vsp = rp->r_secattr;
3973 3977 rp->r_secattr = NULL;
3974 3978 info = rp->r_pathconf;
3975 3979 rp->r_pathconf = NULL;
3976 3980 mutex_exit(&rp->r_statelock);
3977 3981
3978 3982 /*
3979 3983 * Free the access cache entries.
3980 3984 */
3981 3985 freed = nfs_access_purge_rp(rp);
3982 3986
3983 3987 if (!HAVE_RDDIR_CACHE(rp) &&
3984 3988 contents == NULL &&
3985 3989 vsp == NULL &&
3986 3990 info == NULL)
3987 3991 return (freed);
3988 3992
3989 3993 /*
3990 3994 * Free the readdir cache entries
3991 3995 */
3992 3996 if (HAVE_RDDIR_CACHE(rp))
3993 3997 nfs_purge_rddir_cache(RTOV(rp));
3994 3998
3995 3999 /*
3996 4000 * Free the symbolic link cache.
3997 4001 */
3998 4002 if (contents != NULL) {
3999 4003
4000 4004 kmem_free((void *)contents, size);
4001 4005 }
4002 4006
4003 4007 /*
4004 4008 * Free any cached ACL.
4005 4009 */
4006 4010 if (vsp != NULL)
4007 4011 nfs_acl_free(vsp);
4008 4012
4009 4013 /*
4010 4014 * Free any cached pathconf information.
4011 4015 */
4012 4016 if (info != NULL)
4013 4017 kmem_free(info, sizeof (*info));
4014 4018
4015 4019 return (1);
4016 4020 }
4017 4021
4018 4022 static int
4019 4023 nfs_free_reclaim(void)
4020 4024 {
4021 4025 int freed;
4022 4026 rnode_t *rp;
4023 4027
4024 4028 #ifdef DEBUG
4025 4029 clstat_debug.f_reclaim.value.ui64++;
4026 4030 #endif
4027 4031 freed = 0;
4028 4032 mutex_enter(&rpfreelist_lock);
4029 4033 rp = rpfreelist;
4030 4034 if (rp != NULL) {
4031 4035 do {
4032 4036 if (nfs_free_data_reclaim(rp))
4033 4037 freed = 1;
4034 4038 } while ((rp = rp->r_freef) != rpfreelist);
4035 4039 }
4036 4040 mutex_exit(&rpfreelist_lock);
4037 4041 return (freed);
4038 4042 }
4039 4043
4040 4044 static int
4041 4045 nfs_active_reclaim(void)
4042 4046 {
4043 4047 int freed;
4044 4048 int index;
4045 4049 rnode_t *rp;
4046 4050
4047 4051 #ifdef DEBUG
4048 4052 clstat_debug.a_reclaim.value.ui64++;
4049 4053 #endif
4050 4054 freed = 0;
4051 4055 for (index = 0; index < rtablesize; index++) {
4052 4056 rw_enter(&rtable[index].r_lock, RW_READER);
4053 4057 for (rp = rtable[index].r_hashf;
4054 4058 rp != (rnode_t *)(&rtable[index]);
4055 4059 rp = rp->r_hashf) {
4056 4060 if (nfs_active_data_reclaim(rp))
4057 4061 freed = 1;
4058 4062 }
4059 4063 rw_exit(&rtable[index].r_lock);
4060 4064 }
4061 4065 return (freed);
4062 4066 }
4063 4067
4064 4068 static int
4065 4069 nfs_rnode_reclaim(void)
4066 4070 {
4067 4071 int freed;
4068 4072 rnode_t *rp;
4069 4073 vnode_t *vp;
4070 4074
4071 4075 #ifdef DEBUG
4072 4076 clstat_debug.r_reclaim.value.ui64++;
4073 4077 #endif
4074 4078 freed = 0;
4075 4079 mutex_enter(&rpfreelist_lock);
4076 4080 while ((rp = rpfreelist) != NULL) {
4077 4081 rp_rmfree(rp);
4078 4082 mutex_exit(&rpfreelist_lock);
4079 4083 if (rp->r_flags & RHASHED) {
4080 4084 vp = RTOV(rp);
4081 4085 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4082 4086 mutex_enter(&vp->v_lock);
4083 4087 if (vp->v_count > 1) {
4084 4088 VN_RELE_LOCKED(vp);
4085 4089 mutex_exit(&vp->v_lock);
4086 4090 rw_exit(&rp->r_hashq->r_lock);
4087 4091 mutex_enter(&rpfreelist_lock);
4088 4092 continue;
4089 4093 }
4090 4094 mutex_exit(&vp->v_lock);
4091 4095 rp_rmhash_locked(rp);
4092 4096 rw_exit(&rp->r_hashq->r_lock);
4093 4097 }
4094 4098 /*
4095 4099 * This call to rp_addfree will end up destroying the
4096 4100 * rnode, but in a safe way with the appropriate set
4097 4101 * of checks done.
4098 4102 */
4099 4103 rp_addfree(rp, CRED());
4100 4104 mutex_enter(&rpfreelist_lock);
4101 4105 }
4102 4106 mutex_exit(&rpfreelist_lock);
4103 4107 return (freed);
4104 4108 }
4105 4109
4106 4110 /*ARGSUSED*/
4107 4111 static void
4108 4112 nfs_reclaim(void *cdrarg)
4109 4113 {
4110 4114
4111 4115 #ifdef DEBUG
4112 4116 clstat_debug.reclaim.value.ui64++;
4113 4117 #endif
4114 4118 if (nfs_free_reclaim())
4115 4119 return;
4116 4120
4117 4121 if (nfs_active_reclaim())
4118 4122 return;
4119 4123
4120 4124 (void) nfs_rnode_reclaim();
4121 4125 }
4122 4126
4123 4127 /*
4124 4128 * NFS client failover support
4125 4129 *
4126 4130 * Routines to copy filehandles
4127 4131 */
4128 4132 void
4129 4133 nfscopyfh(caddr_t fhp, vnode_t *vp)
4130 4134 {
4131 4135 fhandle_t *dest = (fhandle_t *)fhp;
4132 4136
4133 4137 if (dest != NULL)
4134 4138 *dest = *VTOFH(vp);
4135 4139 }
4136 4140
4137 4141 void
4138 4142 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4139 4143 {
4140 4144 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4141 4145
4142 4146 if (dest != NULL)
4143 4147 *dest = *VTOFH3(vp);
4144 4148 }
4145 4149
4146 4150 /*
4147 4151 * NFS client failover support
4148 4152 *
4149 4153 * failover_safe() will test various conditions to ensure that
4150 4154 * failover is permitted for this vnode. It will be denied
4151 4155 * if:
4152 4156 * 1) the operation in progress does not support failover (NULL fi)
4153 4157 * 2) there are no available replicas (NULL mi_servers->sv_next)
4154 4158 * 3) any locks are outstanding on this file
4155 4159 */
4156 4160 static int
4157 4161 failover_safe(failinfo_t *fi)
4158 4162 {
4159 4163
4160 4164 /*
4161 4165 * Does this op permit failover?
4162 4166 */
4163 4167 if (fi == NULL || fi->vp == NULL)
4164 4168 return (0);
4165 4169
4166 4170 /*
4167 4171 * Are there any alternates to failover to?
4168 4172 */
4169 4173 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4170 4174 return (0);
4171 4175
4172 4176 /*
4173 4177 * Disable check; we've forced local locking
4174 4178 *
4175 4179 * if (flk_has_remote_locks(fi->vp))
4176 4180 * return (0);
4177 4181 */
4178 4182
4179 4183 /*
4180 4184 * If we have no partial path, we can't do anything
4181 4185 */
4182 4186 if (VTOR(fi->vp)->r_path == NULL)
4183 4187 return (0);
4184 4188
4185 4189 return (1);
4186 4190 }
4187 4191
4188 4192 #include <sys/thread.h>
4189 4193
4190 4194 /*
4191 4195 * NFS client failover support
4192 4196 *
4193 4197 * failover_newserver() will start a search for a new server,
4194 4198 * preferably by starting an async thread to do the work. If
4195 4199 * someone is already doing this (recognizable by MI_BINDINPROG
4196 4200 * being set), it will simply return and the calling thread
4197 4201 * will queue on the mi_failover_cv condition variable.
4198 4202 */
4199 4203 static void
4200 4204 failover_newserver(mntinfo_t *mi)
4201 4205 {
4202 4206 /*
4203 4207 * Check if someone else is doing this already
4204 4208 */
4205 4209 mutex_enter(&mi->mi_lock);
4206 4210 if (mi->mi_flags & MI_BINDINPROG) {
4207 4211 mutex_exit(&mi->mi_lock);
4208 4212 return;
4209 4213 }
4210 4214 mi->mi_flags |= MI_BINDINPROG;
4211 4215
4212 4216 /*
4213 4217 * Need to hold the vfs struct so that it can't be released
4214 4218 * while the failover thread is selecting a new server.
4215 4219 */
4216 4220 VFS_HOLD(mi->mi_vfsp);
4217 4221
4218 4222 /*
4219 4223 * Start a thread to do the real searching.
4220 4224 */
4221 4225 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4222 4226
4223 4227 mutex_exit(&mi->mi_lock);
4224 4228 }
4225 4229
4226 4230 /*
4227 4231 * NFS client failover support
4228 4232 *
4229 4233 * failover_thread() will find a new server to replace the one
4230 4234 * currently in use, wake up other threads waiting on this mount
4231 4235 * point, and die. It will start at the head of the server list
4232 4236 * and poll servers until it finds one with an NFS server which is
4233 4237 * registered and responds to a NULL procedure ping.
4234 4238 *
4235 4239 * XXX failover_thread is unsafe within the scope of the
4236 4240 * present model defined for cpr to suspend the system.
4237 4241 * Specifically, over-the-wire calls made by the thread
4238 4242 * are unsafe. The thread needs to be reevaluated in case of
4239 4243 * future updates to the cpr suspend model.
4240 4244 */
4241 4245 static void
4242 4246 failover_thread(mntinfo_t *mi)
4243 4247 {
4244 4248 servinfo_t *svp = NULL;
4245 4249 CLIENT *cl;
4246 4250 enum clnt_stat status;
4247 4251 struct timeval tv;
4248 4252 int error;
4249 4253 int oncethru = 0;
4250 4254 callb_cpr_t cprinfo;
4251 4255 rnode_t *rp;
4252 4256 int index;
4253 4257 char *srvnames;
4254 4258 size_t srvnames_len;
4255 4259 struct nfs_clnt *nfscl = NULL;
4256 4260 zoneid_t zoneid = getzoneid();
4257 4261
4258 4262 #ifdef DEBUG
4259 4263 /*
4260 4264 * This is currently only needed to access counters which exist on
4261 4265 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4262 4266 * on non-DEBUG kernels.
4263 4267 */
4264 4268 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4265 4269 ASSERT(nfscl != NULL);
4266 4270 #endif
4267 4271
4268 4272 /*
4269 4273 	 * It's safe to piggyback on the mi_lock since failover_newserver()
4270 4274 * code guarantees that there will be only one failover thread
4271 4275 * per mountinfo at any instance.
4272 4276 */
4273 4277 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4274 4278 "failover_thread");
4275 4279
4276 4280 mutex_enter(&mi->mi_lock);
4277 4281 while (mi->mi_readers) {
4278 4282 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4279 4283 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4280 4284 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4281 4285 }
4282 4286 mutex_exit(&mi->mi_lock);
4283 4287
4284 4288 tv.tv_sec = 2;
4285 4289 tv.tv_usec = 0;
4286 4290
4287 4291 /*
4288 4292 * Ping the null NFS procedure of every server in
4289 4293 * the list until one responds. We always start
4290 4294 * at the head of the list and always skip the one
4291 4295 * that is current, since it's caused us a problem.
4292 4296 */
4293 4297 while (svp == NULL) {
4294 4298 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4295 4299 if (!oncethru && svp == mi->mi_curr_serv)
4296 4300 continue;
4297 4301
4298 4302 /*
4299 4303 * If the file system was forcibly umounted
4300 4304 * while trying to do a failover, then just
4301 4305 * give up on the failover. It won't matter
4302 4306 * what the server is.
4303 4307 */
4304 4308 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4305 4309 svp = NULL;
4306 4310 goto done;
4307 4311 }
4308 4312
4309 4313 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4310 4314 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4311 4315 if (error)
4312 4316 continue;
4313 4317
4314 4318 if (!(mi->mi_flags & MI_INT))
4315 4319 cl->cl_nosignal = TRUE;
4316 4320 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4317 4321 xdr_void, NULL, tv);
4318 4322 if (!(mi->mi_flags & MI_INT))
4319 4323 cl->cl_nosignal = FALSE;
4320 4324 AUTH_DESTROY(cl->cl_auth);
4321 4325 CLNT_DESTROY(cl);
4322 4326 if (status == RPC_SUCCESS) {
4323 4327 if (svp == mi->mi_curr_serv) {
4324 4328 #ifdef DEBUG
4325 4329 zcmn_err(zoneid, CE_NOTE,
4326 4330 "NFS%d: failing over: selecting original server %s",
4327 4331 mi->mi_vers, svp->sv_hostname);
4328 4332 #else
4329 4333 zcmn_err(zoneid, CE_NOTE,
4330 4334 "NFS: failing over: selecting original server %s",
4331 4335 svp->sv_hostname);
4332 4336 #endif
4333 4337 } else {
4334 4338 #ifdef DEBUG
4335 4339 zcmn_err(zoneid, CE_NOTE,
4336 4340 "NFS%d: failing over from %s to %s",
4337 4341 mi->mi_vers,
4338 4342 mi->mi_curr_serv->sv_hostname,
4339 4343 svp->sv_hostname);
4340 4344 #else
4341 4345 zcmn_err(zoneid, CE_NOTE,
4342 4346 "NFS: failing over from %s to %s",
4343 4347 mi->mi_curr_serv->sv_hostname,
4344 4348 svp->sv_hostname);
4345 4349 #endif
4346 4350 }
4347 4351 break;
4348 4352 }
4349 4353 }
4350 4354
4351 4355 if (svp == NULL) {
4352 4356 if (!oncethru) {
4353 4357 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4354 4358 #ifdef DEBUG
4355 4359 zprintf(zoneid,
4356 4360 "NFS%d servers %s not responding "
4357 4361 "still trying\n", mi->mi_vers, srvnames);
4358 4362 #else
4359 4363 zprintf(zoneid, "NFS servers %s not responding "
4360 4364 "still trying\n", srvnames);
4361 4365 #endif
4362 4366 oncethru = 1;
4363 4367 }
4364 4368 mutex_enter(&mi->mi_lock);
4365 4369 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4366 4370 mutex_exit(&mi->mi_lock);
4367 4371 delay(hz);
4368 4372 mutex_enter(&mi->mi_lock);
4369 4373 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4370 4374 mutex_exit(&mi->mi_lock);
4371 4375 }
4372 4376 }
4373 4377
4374 4378 if (oncethru) {
4375 4379 #ifdef DEBUG
4376 4380 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4377 4381 #else
4378 4382 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4379 4383 #endif
4380 4384 }
4381 4385
4382 4386 if (svp != mi->mi_curr_serv) {
4383 4387 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4384 4388 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4385 4389 rw_enter(&rtable[index].r_lock, RW_WRITER);
4386 4390 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4387 4391 mi->mi_vfsp);
4388 4392 if (rp != NULL) {
4389 4393 if (rp->r_flags & RHASHED)
4390 4394 rp_rmhash_locked(rp);
4391 4395 rw_exit(&rtable[index].r_lock);
4392 4396 rp->r_server = svp;
4393 4397 rp->r_fh = svp->sv_fhandle;
4394 4398 (void) nfs_free_data_reclaim(rp);
4395 4399 index = rtablehash(&rp->r_fh);
4396 4400 rp->r_hashq = &rtable[index];
4397 4401 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4398 4402 vn_exists(RTOV(rp));
4399 4403 rp_addhash(rp);
4400 4404 rw_exit(&rp->r_hashq->r_lock);
4401 4405 VN_RELE(RTOV(rp));
4402 4406 } else
4403 4407 rw_exit(&rtable[index].r_lock);
4404 4408 }
4405 4409
4406 4410 done:
4407 4411 if (oncethru)
4408 4412 kmem_free(srvnames, srvnames_len);
4409 4413 mutex_enter(&mi->mi_lock);
4410 4414 mi->mi_flags &= ~MI_BINDINPROG;
4411 4415 if (svp != NULL) {
4412 4416 mi->mi_curr_serv = svp;
4413 4417 mi->mi_failover++;
4414 4418 #ifdef DEBUG
4415 4419 nfscl->nfscl_stat.failover.value.ui64++;
4416 4420 #endif
4417 4421 }
4418 4422 cv_broadcast(&mi->mi_failover_cv);
4419 4423 CALLB_CPR_EXIT(&cprinfo);
4420 4424 VFS_RELE(mi->mi_vfsp);
4421 4425 zthread_exit();
4422 4426 /* NOTREACHED */
4423 4427 }
4424 4428
4425 4429 /*
4426 4430 * NFS client failover support
4427 4431 *
4428 4432 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4429 4433 * is cleared, meaning that failover is complete. Called with
4430 4434 * mi_lock mutex held.
4431 4435 */
4432 4436 static int
4433 4437 failover_wait(mntinfo_t *mi)
4434 4438 {
4435 4439 k_sigset_t smask;
4436 4440
4437 4441 /*
4438 4442 * If someone else is hunting for a living server,
4439 4443 * sleep until it's done. After our sleep, we may
4440 4444 * be bound to the right server and get off cheaply.
4441 4445 */
4442 4446 while (mi->mi_flags & MI_BINDINPROG) {
4443 4447 /*
4444 4448 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4445 4449 * and SIGTERM. (Preserving the existing masks).
4446 4450 * Mask out SIGINT if mount option nointr is specified.
4447 4451 */
4448 4452 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4449 4453 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4450 4454 /*
4451 4455 * restore original signal mask
4452 4456 */
4453 4457 sigunintr(&smask);
4454 4458 return (EINTR);
4455 4459 }
4456 4460 /*
4457 4461 * restore original signal mask
4458 4462 */
4459 4463 sigunintr(&smask);
4460 4464 }
4461 4465 return (0);
4462 4466 }
4463 4467
4464 4468 /*
4465 4469 * NFS client failover support
4466 4470 *
4467 4471 * failover_remap() will do a partial pathname lookup and find the
4468 4472 * desired vnode on the current server. The interim vnode will be
4469 4473 * discarded after we pilfer the new filehandle.
4470 4474 *
4471 4475 * Side effects:
4472 4476 * - This routine will also update the filehandle in the args structure
4473 4477 * pointed to by the fi->fhp pointer if it is non-NULL.
4474 4478 */
4475 4479
4476 4480 static int
4477 4481 failover_remap(failinfo_t *fi)
4478 4482 {
4479 4483 vnode_t *vp, *nvp, *rootvp;
4480 4484 rnode_t *rp, *nrp;
4481 4485 mntinfo_t *mi;
4482 4486 int error;
4483 4487 #ifdef DEBUG
4484 4488 struct nfs_clnt *nfscl;
4485 4489
4486 4490 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4487 4491 ASSERT(nfscl != NULL);
4488 4492 #endif
4489 4493 /*
4490 4494 * Sanity check
4491 4495 */
4492 4496 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4493 4497 return (EINVAL);
4494 4498 vp = fi->vp;
4495 4499 rp = VTOR(vp);
4496 4500 mi = VTOMI(vp);
4497 4501
4498 4502 if (!(vp->v_flag & VROOT)) {
4499 4503 /*
4500 4504 * Given the root fh, use the path stored in
4501 4505 * the rnode to find the fh for the new server.
4502 4506 */
4503 4507 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4504 4508 if (error)
4505 4509 return (error);
4506 4510
4507 4511 error = failover_lookup(rp->r_path, rootvp,
4508 4512 fi->lookupproc, fi->xattrdirproc, &nvp);
4509 4513
4510 4514 VN_RELE(rootvp);
4511 4515
4512 4516 if (error)
4513 4517 return (error);
4514 4518
4515 4519 /*
4516 4520 * If we found the same rnode, we're done now
4517 4521 */
4518 4522 if (nvp == vp) {
4519 4523 /*
4520 4524 			 * Failover may have selected a server that is physically
4521 4525 			 * the same OR shares the same disk subsystem.  In this case
4522 4526 			 * the file handle for a particular file path is not going
4523 4527 			 * to change, given that the same filehandle lookup will
4524 4528 			 * always locate the same rnode as the existing one.
4525 4529 			 * All we might need to do is to update the r_server
4526 4530 			 * with the current servinfo.
4527 4531 */
4528 4532 if (!VALID_FH(fi)) {
4529 4533 rp->r_server = mi->mi_curr_serv;
4530 4534 }
4531 4535 VN_RELE(nvp);
4532 4536 return (0);
4533 4537 }
4534 4538
4535 4539 /*
4536 4540 * Try to make it so that no one else will find this
4537 4541 * vnode because it is just a temporary to hold the
4538 4542 * new file handle until that file handle can be
4539 4543 * copied to the original vnode/rnode.
4540 4544 */
4541 4545 nrp = VTOR(nvp);
4542 4546 mutex_enter(&mi->mi_remap_lock);
4543 4547 /*
4544 4548 * Some other thread could have raced in here and could
4545 4549 		 * Some other thread could have raced in here and already
4546 4550 		 * done the remap for this particular rnode before this
4547 4551 		 * thread got here.  Check rp->r_server against
4548 4552 		 * mi->mi_curr_serv and return if they are the same.
4549 4553 if (VALID_FH(fi)) {
4550 4554 mutex_exit(&mi->mi_remap_lock);
4551 4555 VN_RELE(nvp);
4552 4556 return (0);
4553 4557 }
4554 4558
4555 4559 if (nrp->r_flags & RHASHED)
4556 4560 rp_rmhash(nrp);
4557 4561
4558 4562 /*
4559 4563 * As a heuristic check on the validity of the new
4560 4564 		 * file, check that the size and type match what
4561 4565 		 * we remember from the old version.
4562 4566 */
4563 4567 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4564 4568 mutex_exit(&mi->mi_remap_lock);
4565 4569 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4566 4570 "NFS replicas %s and %s: file %s not same.",
4567 4571 rp->r_server->sv_hostname,
4568 4572 nrp->r_server->sv_hostname, rp->r_path);
4569 4573 VN_RELE(nvp);
4570 4574 return (EINVAL);
4571 4575 }
4572 4576
4573 4577 /*
4574 4578 * snarf the filehandle from the new rnode
4575 4579 * then release it, again while updating the
4576 4580 * hash queues for the rnode.
4577 4581 */
4578 4582 if (rp->r_flags & RHASHED)
4579 4583 rp_rmhash(rp);
4580 4584 rp->r_server = mi->mi_curr_serv;
4581 4585 rp->r_fh = nrp->r_fh;
4582 4586 rp->r_hashq = nrp->r_hashq;
4583 4587 /*
4584 4588 * Copy the attributes from the new rnode to the old
4585 4589 * rnode. This will help to reduce unnecessary page
4586 4590 * cache flushes.
4587 4591 */
4588 4592 rp->r_attr = nrp->r_attr;
4589 4593 rp->r_attrtime = nrp->r_attrtime;
4590 4594 rp->r_mtime = nrp->r_mtime;
4591 4595 (void) nfs_free_data_reclaim(rp);
4592 4596 nfs_setswaplike(vp, &rp->r_attr);
4593 4597 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4594 4598 rp_addhash(rp);
4595 4599 rw_exit(&rp->r_hashq->r_lock);
4596 4600 mutex_exit(&mi->mi_remap_lock);
4597 4601 VN_RELE(nvp);
4598 4602 }
4599 4603
4600 4604 /*
4601 4605 * Update successful failover remap count
4602 4606 */
4603 4607 mutex_enter(&mi->mi_lock);
4604 4608 mi->mi_remap++;
4605 4609 mutex_exit(&mi->mi_lock);
4606 4610 #ifdef DEBUG
4607 4611 nfscl->nfscl_stat.remap.value.ui64++;
4608 4612 #endif
4609 4613
4610 4614 /*
4611 4615 * If we have a copied filehandle to update, do it now.
4612 4616 */
4613 4617 if (fi->fhp != NULL && fi->copyproc != NULL)
4614 4618 (*fi->copyproc)(fi->fhp, vp);
4615 4619
4616 4620 return (0);
4617 4621 }
4618 4622
4619 4623 /*
4620 4624 * NFS client failover support
4621 4625 *
4622 4626 * We want a simple pathname lookup routine to parse the pieces
4623 4627 * of path in rp->r_path. We know that the path was a created
4624 4628 	 * of path in rp->r_path. We know that the path was created
4625 4629 * paths that look like:
4626 4630 * dir1/dir2/dir3/file
4627 4631 * Any evidence of anything like .., symlinks, and ENOTDIR
4628 4632 	 * Any evidence of anything like .., symlinks, or ENOTDIR
4629 4633 	 * is a hard error, because it means something in this filesystem
4630 4634 * us in some way. If this is true, we want the failure.
4631 4635 *
4632 4636 * Extended attributes: if the filesystem is mounted with extended
4633 4637 * attributes enabled (-o xattr), the attribute directory will be
4634 4638 * represented in the r_path as the magic name XATTR_RPATH. So if
4635 4639 	 * we see that name in the pathname, it must be because this node
4636 4640 * is an extended attribute. Therefore, look it up that way.
4637 4641 */
4638 4642 static int
4639 4643 failover_lookup(char *path, vnode_t *root,
4640 4644 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4641 4645 vnode_t *, cred_t *, int),
4642 4646 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4643 4647 vnode_t **new)
4644 4648 {
4645 4649 vnode_t *dvp, *nvp;
4646 4650 int error = EINVAL;
4647 4651 char *s, *p, *tmppath;
4648 4652 size_t len;
4649 4653 mntinfo_t *mi;
4650 4654 bool_t xattr;
4651 4655
4652 4656 /* Make local copy of path */
4653 4657 len = strlen(path) + 1;
4654 4658 tmppath = kmem_alloc(len, KM_SLEEP);
4655 4659 (void) strcpy(tmppath, path);
4656 4660 s = tmppath;
4657 4661
4658 4662 dvp = root;
4659 4663 VN_HOLD(dvp);
4660 4664 mi = VTOMI(root);
4661 4665 xattr = mi->mi_flags & MI_EXTATTR;
4662 4666
4663 4667 do {
4664 4668 p = strchr(s, '/');
4665 4669 if (p != NULL)
4666 4670 *p = '\0';
4667 4671 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4668 4672 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4669 4673 RFSCALL_SOFT);
4670 4674 } else {
4671 4675 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4672 4676 CRED(), RFSCALL_SOFT);
4673 4677 }
4674 4678 if (p != NULL)
4675 4679 *p++ = '/';
4676 4680 if (error) {
4677 4681 VN_RELE(dvp);
4678 4682 kmem_free(tmppath, len);
4679 4683 return (error);
4680 4684 }
4681 4685 s = p;
4682 4686 VN_RELE(dvp);
4683 4687 dvp = nvp;
4684 4688 } while (p != NULL);
4685 4689
4686 4690 if (nvp != NULL && new != NULL)
4687 4691 *new = nvp;
4688 4692 kmem_free(tmppath, len);
4689 4693 return (0);
4690 4694 }
4691 4695
4692 4696 /*
4693 4697 * NFS client failover support
4694 4698 *
4695 4699 * sv_free() frees the malloc'd portion of a "servinfo_t".
4696 4700 */
4697 4701 void
4698 4702 sv_free(servinfo_t *svp)
4699 4703 {
4700 4704 servinfo_t *next;
4701 4705 struct knetconfig *knconf;
4702 4706
4703 4707 while (svp != NULL) {
4704 4708 next = svp->sv_next;
4705 4709 if (svp->sv_secdata)
4706 4710 sec_clnt_freeinfo(svp->sv_secdata);
4707 4711 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4708 4712 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4709 4713 knconf = svp->sv_knconf;
4710 4714 if (knconf != NULL) {
4711 4715 if (knconf->knc_protofmly != NULL)
4712 4716 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4713 4717 if (knconf->knc_proto != NULL)
4714 4718 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4715 4719 kmem_free(knconf, sizeof (*knconf));
4716 4720 }
4717 4721 knconf = svp->sv_origknconf;
4718 4722 if (knconf != NULL) {
4719 4723 if (knconf->knc_protofmly != NULL)
4720 4724 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4721 4725 if (knconf->knc_proto != NULL)
4722 4726 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4723 4727 kmem_free(knconf, sizeof (*knconf));
4724 4728 }
4725 4729 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4726 4730 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4727 4731 mutex_destroy(&svp->sv_lock);
4728 4732 kmem_free(svp, sizeof (*svp));
4729 4733 svp = next;
4730 4734 }
4731 4735 }
4732 4736
4733 4737 /*
4734 4738 * Only can return non-zero if intr != 0.
4735 4739 */
4736 4740 int
4737 4741 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4738 4742 {
4739 4743
4740 4744 mutex_enter(&l->lock);
4741 4745
4742 4746 /*
4743 4747 * If this is a nested enter, then allow it. There
4744 4748 	 * must be as many exits as there were enters.
4745 4749 */
4746 4750 if (l->owner == curthread) {
4747 4751 /* lock is held for writing by current thread */
4748 4752 ASSERT(rw == RW_READER || rw == RW_WRITER);
4749 4753 l->count--;
4750 4754 } else if (rw == RW_READER) {
4751 4755 /*
4752 4756 * While there is a writer active or writers waiting,
4753 4757 * then wait for them to finish up and move on. Then,
4754 4758 * increment the count to indicate that a reader is
4755 4759 * active.
4756 4760 */
4757 4761 while (l->count < 0 || l->waiters > 0) {
4758 4762 if (intr) {
4759 4763 klwp_t *lwp = ttolwp(curthread);
4760 4764
4761 4765 if (lwp != NULL)
4762 4766 lwp->lwp_nostop++;
4763 4767 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4764 4768 if (lwp != NULL)
4765 4769 lwp->lwp_nostop--;
4766 4770 mutex_exit(&l->lock);
4767 4771 return (EINTR);
4768 4772 }
4769 4773 if (lwp != NULL)
4770 4774 lwp->lwp_nostop--;
4771 4775 } else
4772 4776 cv_wait(&l->cv_rd, &l->lock);
4773 4777 }
4774 4778 ASSERT(l->count < INT_MAX);
4775 4779 #ifdef DEBUG
4776 4780 if ((l->count % 10000) == 9999)
4777 4781 			cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4778 4782 "rwlock @ %p\n", l->count, (void *)&l);
4779 4783 #endif
4780 4784 l->count++;
4781 4785 } else {
4782 4786 ASSERT(rw == RW_WRITER);
4783 4787 /*
4784 4788 * While there are readers active or a writer
4785 4789 * active, then wait for all of the readers
4786 4790 * to finish or for the writer to finish.
4787 4791 * Then, set the owner field to curthread and
4788 4792 * decrement count to indicate that a writer
4789 4793 * is active.
4790 4794 */
4791 4795 while (l->count != 0) {
4792 4796 l->waiters++;
4793 4797 if (intr) {
4794 4798 klwp_t *lwp = ttolwp(curthread);
4795 4799
4796 4800 if (lwp != NULL)
4797 4801 lwp->lwp_nostop++;
4798 4802 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4799 4803 if (lwp != NULL)
4800 4804 lwp->lwp_nostop--;
4801 4805 l->waiters--;
4802 4806 /*
4803 4807 * If there are readers active and no
4804 4808 * writers waiting then wake up all of
4805 4809 * the waiting readers (if any).
4806 4810 */
4807 4811 if (l->count > 0 && l->waiters == 0)
4808 4812 cv_broadcast(&l->cv_rd);
4809 4813 mutex_exit(&l->lock);
4810 4814 return (EINTR);
4811 4815 }
4812 4816 if (lwp != NULL)
4813 4817 lwp->lwp_nostop--;
4814 4818 } else
4815 4819 cv_wait(&l->cv, &l->lock);
4816 4820 l->waiters--;
4817 4821 }
4818 4822 ASSERT(l->owner == NULL);
4819 4823 l->owner = curthread;
4820 4824 l->count--;
4821 4825 }
4822 4826
4823 4827 mutex_exit(&l->lock);
4824 4828
4825 4829 return (0);
4826 4830 }
4827 4831
4828 4832 /*
4829 4833 * If the lock is available, obtain it and return non-zero. If there is
4830 4834 * already a conflicting lock, return 0 immediately.
4831 4835 */
4832 4836
4833 4837 int
4834 4838 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4835 4839 {
4836 4840 mutex_enter(&l->lock);
4837 4841
4838 4842 /*
4839 4843 * If this is a nested enter, then allow it. There
4841 4845 	 * must be as many exits as there were enters.
4841 4845 */
4842 4846 if (l->owner == curthread) {
4843 4847 /* lock is held for writing by current thread */
4844 4848 ASSERT(rw == RW_READER || rw == RW_WRITER);
4845 4849 l->count--;
4846 4850 } else if (rw == RW_READER) {
4847 4851 /*
4848 4852 * If there is a writer active or writers waiting, deny the
4849 4853 * lock. Otherwise, bump the count of readers.
4850 4854 */
4851 4855 if (l->count < 0 || l->waiters > 0) {
4852 4856 mutex_exit(&l->lock);
4853 4857 return (0);
4854 4858 }
4855 4859 l->count++;
4856 4860 } else {
4857 4861 ASSERT(rw == RW_WRITER);
4858 4862 /*
4859 4863 * If there are readers active or a writer active, deny the
4860 4864 * lock. Otherwise, set the owner field to curthread and
4861 4865 * decrement count to indicate that a writer is active.
4862 4866 */
4863 4867 if (l->count != 0) {
4864 4868 mutex_exit(&l->lock);
4865 4869 return (0);
4866 4870 }
4867 4871 ASSERT(l->owner == NULL);
4868 4872 l->owner = curthread;
4869 4873 l->count--;
4870 4874 }
4871 4875
4872 4876 mutex_exit(&l->lock);
4873 4877
4874 4878 return (1);
4875 4879 }
4876 4880
4877 4881 void
4878 4882 nfs_rw_exit(nfs_rwlock_t *l)
4879 4883 {
4880 4884
4881 4885 mutex_enter(&l->lock);
4882 4886
4883 4887 if (l->owner != NULL) {
4884 4888 ASSERT(l->owner == curthread);
4885 4889
4886 4890 /*
4887 4891 * To release a writer lock increment count to indicate that
4888 4892 * there is one less writer active. If this was the last of
4889 4893 * possibly nested writer locks, then clear the owner field as
4890 4894 * well to indicate that there is no writer active.
4891 4895 */
4892 4896 ASSERT(l->count < 0);
4893 4897 l->count++;
4894 4898 if (l->count == 0) {
4895 4899 l->owner = NULL;
4896 4900
4897 4901 /*
4898 4902 * If there are no writers waiting then wakeup all of
4899 4903 * the waiting readers (if any).
4900 4904 */
4901 4905 if (l->waiters == 0)
4902 4906 cv_broadcast(&l->cv_rd);
4903 4907 }
4904 4908 } else {
4905 4909 /*
4906 4910 * To release a reader lock just decrement count to indicate
4907 4911 * that there is one less reader active.
4908 4912 */
4909 4913 ASSERT(l->count > 0);
4910 4914 l->count--;
4911 4915 }
4912 4916
4913 4917 /*
4914 4918 	 * If there is no reader or writer active and there is a
4915 4919 	 * writer waiting, we need to wake it up.
4916 4920 */
4917 4921 if (l->count == 0 && l->waiters > 0)
4918 4922 cv_signal(&l->cv);
4919 4923 mutex_exit(&l->lock);
4920 4924 }
4921 4925
4922 4926 int
4923 4927 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4924 4928 {
4925 4929
4926 4930 if (rw == RW_READER)
4927 4931 return (l->count > 0);
4928 4932 ASSERT(rw == RW_WRITER);
4929 4933 return (l->count < 0);
4930 4934 }
4931 4935
4932 4936 /* ARGSUSED */
4933 4937 void
4934 4938 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4935 4939 {
4936 4940
4937 4941 l->count = 0;
4938 4942 l->waiters = 0;
4939 4943 l->owner = NULL;
4940 4944 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4941 4945 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4942 4946 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4943 4947 }
4944 4948
4945 4949 void
4946 4950 nfs_rw_destroy(nfs_rwlock_t *l)
4947 4951 {
4948 4952
4949 4953 mutex_destroy(&l->lock);
4950 4954 cv_destroy(&l->cv);
4951 4955 cv_destroy(&l->cv_rd);
4952 4956 }
4953 4957
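/*
 * Editor's sketch (not part of this webrev): how the nfs_rwlock_t
 * primitives above fit together.  A positive count means readers are
 * active, a negative count means a writer is active, and nested enters
 * by the owning writer are allowed.  The function name and the "intr"
 * argument below are illustrative only.
 */
static int
example_nfs_rwlock_usage(nfs_rwlock_t *l, int intr)
{
	int error;

	nfs_rw_init(l, NULL, RW_DEFAULT, NULL);

	/* A reader waits while a writer is active or writers are waiting. */
	error = nfs_rw_enter_sig(l, RW_READER, intr);
	if (error != 0) {
		/* EINTR is possible only when intr != 0. */
		nfs_rw_destroy(l);
		return (error);
	}
	ASSERT(nfs_rw_lock_held(l, RW_READER));
	nfs_rw_exit(l);

	/* A writer can also probe for the lock without sleeping. */
	if (nfs_rw_tryenter(l, RW_WRITER) != 0)
		nfs_rw_exit(l);

	nfs_rw_destroy(l);
	return (0);
}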
4954 4958 int
4955 4959 nfs3_rddir_compar(const void *x, const void *y)
4956 4960 {
4957 4961 rddir_cache *a = (rddir_cache *)x;
4958 4962 rddir_cache *b = (rddir_cache *)y;
4959 4963
4960 4964 if (a->nfs3_cookie == b->nfs3_cookie) {
4961 4965 if (a->buflen == b->buflen)
4962 4966 return (0);
4963 4967 if (a->buflen < b->buflen)
4964 4968 return (-1);
4965 4969 return (1);
4966 4970 }
4967 4971
4968 4972 if (a->nfs3_cookie < b->nfs3_cookie)
4969 4973 return (-1);
4970 4974
4971 4975 return (1);
4972 4976 }
4973 4977
4974 4978 int
4975 4979 nfs_rddir_compar(const void *x, const void *y)
4976 4980 {
4977 4981 rddir_cache *a = (rddir_cache *)x;
4978 4982 rddir_cache *b = (rddir_cache *)y;
4979 4983
4980 4984 if (a->nfs_cookie == b->nfs_cookie) {
4981 4985 if (a->buflen == b->buflen)
4982 4986 return (0);
4983 4987 if (a->buflen < b->buflen)
4984 4988 return (-1);
4985 4989 return (1);
4986 4990 }
4987 4991
4988 4992 if (a->nfs_cookie < b->nfs_cookie)
4989 4993 return (-1);
4990 4994
4991 4995 return (1);
4992 4996 }
4993 4997
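/*
 * Editor's sketch (not part of this webrev): the comparators above are
 * shaped for use with avl_create(), ordering rddir_cache entries first
 * by cookie and then by buffer length.  The tree argument and the name
 * of the avl_node_t linkage member ("tree") are assumptions made for
 * illustration only.
 */
static rddir_cache *
example_rddir_find(avl_tree_t *dir_cache, u_longlong_t cookie, size_t buflen)
{
	rddir_cache key;
	avl_index_t where;

	avl_create(dir_cache, nfs3_rddir_compar, sizeof (rddir_cache),
	    offsetof(rddir_cache, tree));	/* "tree" member is assumed */

	key.nfs3_cookie = cookie;
	key.buflen = buflen;

	/* Entries with the same (cookie, buflen) pair compare equal. */
	return (avl_find(dir_cache, &key, &where));
}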
4994 4998 static char *
4995 4999 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4996 5000 {
4997 5001 servinfo_t *s;
4998 5002 char *srvnames;
4999 5003 char *namep;
5000 5004 size_t length;
5001 5005
5002 5006 /*
5003 5007 * Calculate the length of the string required to hold all
5004 5008 * of the server names plus either a comma or a null
5005 5009 * character following each individual one.
5006 5010 */
5007 5011 length = 0;
5008 5012 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5009 5013 length += s->sv_hostnamelen;
5010 5014
5011 5015 srvnames = kmem_alloc(length, KM_SLEEP);
5012 5016
5013 5017 namep = srvnames;
5014 5018 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5015 5019 (void) strcpy(namep, s->sv_hostname);
5016 5020 namep += s->sv_hostnamelen - 1;
5017 5021 *namep++ = ',';
5018 5022 }
5019 5023 *--namep = '\0';
5020 5024
5021 5025 *len = length;
5022 5026
5023 5027 return (srvnames);
5024 5028 }
5025 5029
5026 5030 /*
5027 5031 * These two functions are temporary and designed for the upgrade-workaround
5028 5032 * only. They cannot be used for general zone-crossing NFS client support, and
5029 5033 * will be removed shortly.
5030 5034 *
5031 5035 * When the workaround is enabled, all NFS traffic is forced into the global
5032 5036 * zone. These functions are called when the code needs to refer to the state
5033 5037 * of the underlying network connection. They're not called when the function
5034 5038 * needs to refer to the state of the process that invoked the system call.
5035 5039 * (E.g., when checking whether the zone is shutting down during the mount()
5036 5040 * call.)
5037 5041 */
5038 5042
5039 5043 struct zone *
5040 5044 nfs_zone(void)
5041 5045 {
5042 5046 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5043 5047 }
5044 5048
5045 5049 zoneid_t
5046 5050 nfs_zoneid(void)
5047 5051 {
5048 5052 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5049 5053 }
5050 5054
5051 5055 /*
5052 5056 * nfs_mount_label_policy:
5053 5057  *	Determine whether the mount is allowed according to the MAC check,
5054 5058  *	by comparing (where appropriate) the label of the remote server
5055 5059  *	against the label of the zone being mounted into.
5056 5060 *
5057 5061 * Returns:
5058 5062 * 0 : access allowed
5059 5063 * -1 : read-only access allowed (i.e., read-down)
5060 5064 * >0 : error code, such as EACCES
5061 5065 */
5062 5066 int
5063 5067 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5064 5068 struct knetconfig *knconf, cred_t *cr)
5065 5069 {
5066 5070 int addr_type;
5067 5071 void *ipaddr;
5068 5072 bslabel_t *server_sl, *mntlabel;
5069 5073 zone_t *mntzone = NULL;
5070 5074 ts_label_t *zlabel;
5071 5075 tsol_tpc_t *tp;
5072 5076 ts_label_t *tsl = NULL;
5073 5077 int retv;
5074 5078
5075 5079 /*
5076 5080 * Get the zone's label. Each zone on a labeled system has a label.
5077 5081 */
5078 5082 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5079 5083 zlabel = mntzone->zone_slabel;
5080 5084 ASSERT(zlabel != NULL);
5081 5085 label_hold(zlabel);
5082 5086
5083 5087 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5084 5088 addr_type = IPV4_VERSION;
5085 5089 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5086 5090 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5087 5091 addr_type = IPV6_VERSION;
5088 5092 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5089 5093 } else {
5090 5094 retv = 0;
5091 5095 goto out;
5092 5096 }
5093 5097
5094 5098 retv = EACCES; /* assume the worst */
5095 5099
5096 5100 /*
5097 5101 * Next, get the assigned label of the remote server.
5098 5102 */
5099 5103 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5100 5104 if (tp == NULL)
5101 5105 goto out; /* error getting host entry */
5102 5106
5103 5107 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5104 5108 goto rel_tpc; /* invalid domain */
5105 5109 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5106 5110 (tp->tpc_tp.host_type != UNLABELED))
5107 5111 goto rel_tpc; /* invalid hosttype */
5108 5112
5109 5113 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5110 5114 tsl = getflabel_cipso(vfsp);
5111 5115 if (tsl == NULL)
5112 5116 goto rel_tpc; /* error getting server lbl */
5113 5117
5114 5118 server_sl = label2bslabel(tsl);
5115 5119 } else { /* UNLABELED */
5116 5120 server_sl = &tp->tpc_tp.tp_def_label;
5117 5121 }
5118 5122
5119 5123 mntlabel = label2bslabel(zlabel);
5120 5124
5121 5125 /*
5122 5126 * Now compare labels to complete the MAC check. If the labels
5123 5127 * are equal or if the requestor is in the global zone and has
5124 5128 * NET_MAC_AWARE, then allow read-write access. (Except for
5125 5129 * mounts into the global zone itself; restrict these to
5126 5130 * read-only.)
5127 5131 *
5128 5132 * If the requestor is in some other zone, but their label
5129 5133 * dominates the server, then allow read-down.
5130 5134 *
5131 5135 * Otherwise, access is denied.
5132 5136 */
5133 5137 if (blequal(mntlabel, server_sl) ||
5134 5138 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5135 5139 getpflags(NET_MAC_AWARE, cr) != 0)) {
5136 5140 if ((mntzone == global_zone) ||
5137 5141 !blequal(mntlabel, server_sl))
5138 5142 retv = -1; /* read-only */
5139 5143 else
5140 5144 retv = 0; /* access OK */
5141 5145 } else if (bldominates(mntlabel, server_sl)) {
5142 5146 retv = -1; /* read-only */
5143 5147 } else {
5144 5148 retv = EACCES;
5145 5149 }
5146 5150
5147 5151 if (tsl != NULL)
5148 5152 label_rele(tsl);
5149 5153
5150 5154 rel_tpc:
5151 5155 TPC_RELE(tp);
5152 5156 out:
5153 5157 if (mntzone)
5154 5158 zone_rele(mntzone);
5155 5159 label_rele(zlabel);
5156 5160 return (retv);
5157 5161 }
5158 5162
5159 5163 boolean_t
5160 5164 nfs_has_ctty(void)
5161 5165 {
5162 5166 boolean_t rv;
5163 5167 mutex_enter(&curproc->p_splock);
5164 5168 rv = (curproc->p_sessp->s_vp != NULL);
5165 5169 mutex_exit(&curproc->p_splock);
5166 5170 return (rv);
5167 5171 }
5168 5172
5169 5173 /*
5170 5174 * See if xattr directory to see if it has any generic user attributes
5171 5175 */
5172 5176 int
5173 5177 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5174 5178 {
5175 5179 struct uio uio;
5176 5180 struct iovec iov;
5177 5181 char *dbuf;
5178 5182 struct dirent64 *dp;
5179 5183 size_t dlen = 8 * 1024;
5180 5184 size_t dbuflen;
5181 5185 int eof = 0;
5182 5186 int error;
5183 5187
5184 5188 *valp = 0;
5185 5189 dbuf = kmem_alloc(dlen, KM_SLEEP);
5186 5190 uio.uio_iov = &iov;
5187 5191 uio.uio_iovcnt = 1;
5188 5192 uio.uio_segflg = UIO_SYSSPACE;
5189 5193 uio.uio_fmode = 0;
5190 5194 uio.uio_extflg = UIO_COPY_CACHED;
5191 5195 uio.uio_loffset = 0;
5192 5196 uio.uio_resid = dlen;
5193 5197 iov.iov_base = dbuf;
5194 5198 iov.iov_len = dlen;
5195 5199 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5196 5200 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5197 5201 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5198 5202
5199 5203 dbuflen = dlen - uio.uio_resid;
5200 5204
5201 5205 if (error || dbuflen == 0) {
5202 5206 kmem_free(dbuf, dlen);
5203 5207 return (error);
5204 5208 }
5205 5209
5206 5210 dp = (dirent64_t *)dbuf;
5207 5211
5208 5212 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5209 5213 if (strcmp(dp->d_name, ".") == 0 ||
5210 5214 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5211 5215 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5212 5216 VIEW_READONLY) == 0) {
5213 5217 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5214 5218 continue;
5215 5219 }
5216 5220
5217 5221 *valp = 1;
5218 5222 break;
5219 5223 }
5220 5224 kmem_free(dbuf, dlen);
5221 5225 return (0);
5226 +}
5227 +
5228 +/*
5229 + * Return non-zero in case the vp is an empty directory used as a ZFS mount
5230 + * point.  The NFSv2 and NFSv3 servers should not allow writes to such
5231 + * directories.
5232 + */
5233 +int
5234 +protect_zfs_mntpt(vnode_t *vp)
5235 +{
5236 + int error;
5237 + vfs_t *vfsp;
5238 + struct uio uio;
5239 + struct iovec iov;
5240 + int eof;
5241 + size_t len = 8 * 1024;
5242 + char *buf;
5243 +
5244 + if (vp->v_type != VDIR || vn_ismntpt(vp) == 0)
5245 + return (0);
5246 +
5247 + error = vn_vfsrlock_wait(vp);
5248 + if (error != 0)
5249 + return (error);
5250 +
5251 + /*
5252 + * We protect ZFS mount points only
5253 + */
5254 + if ((vfsp = vn_mountedvfs(vp)) == NULL ||
5255 + strncmp(vfssw[vfsp->vfs_fstype].vsw_name, "zfs", 3) != 0) {
5256 + vn_vfsunlock(vp);
5257 + return (0);
5258 + }
5259 +
5260 + vn_vfsunlock(vp);
5261 +
5262 + buf = kmem_alloc(len, KM_SLEEP);
5263 +
5264 + uio.uio_iov = &iov;
5265 + uio.uio_iovcnt = 1;
5266 + uio.uio_segflg = UIO_SYSSPACE;
5267 + uio.uio_fmode = 0;
5268 + uio.uio_extflg = UIO_COPY_CACHED;
5269 + uio.uio_loffset = 0;
5270 + uio.uio_llimit = MAXOFFSET_T;
5271 +
5272 + eof = 0;
5273 +
5274 + do {
5275 + size_t rlen;
5276 + dirent64_t *dp;
5277 +
5278 + uio.uio_resid = len;
5279 + iov.iov_base = buf;
5280 + iov.iov_len = len;
5281 +
5282 + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5283 + error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
5284 + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5285 +
5286 + if (error != 0)
5287 + break;
5288 +
5289 + error = EBUSY;
5290 +
5291 + rlen = len - uio.uio_resid;
5292 + if (rlen == 0)
5293 + break;
5294 +
5295 + for (dp = (dirent64_t *)buf;
5296 + (intptr_t)dp < (intptr_t)buf + rlen;
5297 + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
5298 + if (strcmp(dp->d_name, ".") != 0 &&
5299 + strcmp(dp->d_name, "..") != 0) {
5300 + error = 0;
5301 + break;
5302 + }
5303 + }
5304 + } while (eof == 0 && error != 0);
5305 +
5306 + kmem_free(buf, len);
5307 +
5308 + return (error);
5222 5309 }
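/*
 * Editor's sketch (not part of this webrev): one way a directory-modifying
 * NFS server operation could consult protect_zfs_mntpt() before acting on
 * a directory vnode.  The caller below is hypothetical; the real call
 * sites are outside this file.
 */
static int
example_check_dir_writable(vnode_t *dvp)
{
	int error;

	/*
	 * Non-zero (typically EBUSY) means dvp is an empty directory
	 * covered by a mounted ZFS file system, so the modification
	 * should be refused instead of landing in the covered directory.
	 */
	error = protect_zfs_mntpt(dvp);
	if (error != 0)
		return (error);

	/* Safe to proceed with the directory-modifying operation. */
	return (0);
}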