re #13613 rb4516 Tunables needs volatile keyword
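The diff below changes the declaration of the nfs_async_timeout tunable from "extern int" to "extern volatile int", so the compiler re-reads the variable on every use instead of working from a cached copy when the value is patched at run time. A minimal user-space sketch of that pattern follows; the NFS_ASYNC_TIMEOUT default value and the main() wrapper are illustrative only and are not taken from this file:

#include <stdio.h>

#define	NFS_ASYNC_TIMEOUT	6000	/* illustrative default, in clock ticks */

/*
 * Stand-in for a run-time tunable such as nfs_async_timeout: the volatile
 * qualifier forces every access to reload the current value, so a patch
 * applied while the code is running (e.g. with a kernel debugger) takes
 * effect instead of being optimized away.
 */
volatile int nfs_async_timeout = -1;

int
main(void)
{
	/* Dynamic initialization, as done in nfs4_async_common_start(). */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	printf("nfs_async_timeout = %d\n", nfs_async_timeout);
	return (0);
}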
--- old/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 * Copyright (c) 2017 by Delphix. All rights reserved.
24 25 */
25 26
26 27 /*
27 28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 29 * All Rights Reserved
29 30 */
30 31
31 32 #include <sys/param.h>
32 33 #include <sys/types.h>
33 34 #include <sys/systm.h>
34 35 #include <sys/thread.h>
35 36 #include <sys/t_lock.h>
36 37 #include <sys/time.h>
37 38 #include <sys/vnode.h>
38 39 #include <sys/vfs.h>
39 40 #include <sys/errno.h>
40 41 #include <sys/buf.h>
41 42 #include <sys/stat.h>
42 43 #include <sys/cred.h>
43 44 #include <sys/kmem.h>
44 45 #include <sys/debug.h>
45 46 #include <sys/dnlc.h>
46 47 #include <sys/vmsystm.h>
47 48 #include <sys/flock.h>
48 49 #include <sys/share.h>
49 50 #include <sys/cmn_err.h>
50 51 #include <sys/tiuser.h>
51 52 #include <sys/sysmacros.h>
52 53 #include <sys/callb.h>
53 54 #include <sys/acl.h>
54 55 #include <sys/kstat.h>
55 56 #include <sys/signal.h>
56 57 #include <sys/disp.h>
57 58 #include <sys/atomic.h>
58 59 #include <sys/list.h>
59 60 #include <sys/sdt.h>
60 61
61 62 #include <rpc/types.h>
62 63 #include <rpc/xdr.h>
63 64 #include <rpc/auth.h>
64 65 #include <rpc/clnt.h>
65 66
66 67 #include <nfs/nfs.h>
67 68 #include <nfs/nfs_clnt.h>
68 69 #include <nfs/nfs_acl.h>
69 70
70 71 #include <nfs/nfs4.h>
71 72 #include <nfs/rnode4.h>
72 73 #include <nfs/nfs4_clnt.h>
73 74
74 75 #include <vm/hat.h>
75 76 #include <vm/as.h>
76 77 #include <vm/page.h>
77 78 #include <vm/pvn.h>
78 79 #include <vm/seg.h>
79 80 #include <vm/seg_map.h>
80 81 #include <vm/seg_vn.h>
81 82
82 83 #include <sys/ddi.h>
83 84
84 85 /*
85 86 * Arguments to page-flush thread.
86 87 */
87 88 typedef struct {
88 89 vnode_t *vp;
89 90 cred_t *cr;
90 91 } pgflush_t;
91 92
92 93 #ifdef DEBUG
93 94 int nfs4_client_lease_debug;
94 95 int nfs4_sharedfh_debug;
95 96 int nfs4_fname_debug;
96 97
97 98 /* temporary: panic if v_type is inconsistent with r_attr va_type */
98 99 int nfs4_vtype_debug;
99 100
100 101 uint_t nfs4_tsd_key;
101 102 #endif
102 103
103 104 static time_t nfs4_client_resumed = 0;
104 105 static callb_id_t cid = 0;
105 106
106 107 static int nfs4renew(nfs4_server_t *);
107 108 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 109 static void nfs4_pgflush_thread(pgflush_t *);
109 110
110 111 static boolean_t nfs4_client_cpr_callb(void *, int);
111 112
112 113 struct mi4_globals {
113 114 kmutex_t mig_lock; /* lock protecting mig_list */
114 115 list_t mig_list; /* list of NFS v4 mounts in zone */
115 116 boolean_t mig_destructor_called;
116 117 };
117 118
118 119 static zone_key_t mi4_list_key;
119 120
120 121 /*
121 122 * Attributes caching:
122 123 *
123 124 * Attributes are cached in the rnode in struct vattr form.
124 125 * There is a time associated with the cached attributes (r_time_attr_inval)
125 126 * which tells whether the attributes are valid. The time is initialized
126 127 * to the difference between current time and the modify time of the vnode
127 128 * when new attributes are cached. This allows the attributes for
128 129 * files that have changed recently to be timed out sooner than for files
129 130 * that have not changed for a long time. There are minimum and maximum
130 131 * timeout values that can be set per mount point.
131 132 */
132 133
133 134 /*
134 135 * If a cache purge is in progress, wait for it to finish.
135 136 *
136 137 * The current thread must not be in the middle of an
137 138 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
138 139 * between this thread, a recovery thread, and the page flush thread.
139 140 */
140 141 int
141 142 nfs4_waitfor_purge_complete(vnode_t *vp)
142 143 {
143 144 rnode4_t *rp;
144 145 k_sigset_t smask;
145 146
146 147 rp = VTOR4(vp);
147 148 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 149 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 150 mutex_enter(&rp->r_statelock);
150 151 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 152 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 153 ((rp->r_flags & R4PGFLUSH) &&
153 154 rp->r_pgflush != curthread)) {
154 155 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 156 sigunintr(&smask);
156 157 mutex_exit(&rp->r_statelock);
157 158 return (EINTR);
158 159 }
159 160 }
160 161 sigunintr(&smask);
161 162 mutex_exit(&rp->r_statelock);
162 163 }
163 164 return (0);
164 165 }
165 166
166 167 /*
167 168 * Validate caches by checking cached attributes. If they have timed out,
168 169 * then get new attributes from the server. As a side effect, cache
169 170 * invalidation is done if the attributes have changed.
170 171 *
171 172 * If the attributes have not timed out and if there is a cache
172 173 * invalidation being done by some other thread, then wait until that
173 174 * thread has completed the cache invalidation.
174 175 */
175 176 int
176 177 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 178 {
178 179 int error;
179 180 nfs4_ga_res_t gar;
180 181
181 182 if (ATTRCACHE4_VALID(vp)) {
182 183 error = nfs4_waitfor_purge_complete(vp);
183 184 if (error)
184 185 return (error);
185 186 return (0);
186 187 }
187 188
188 189 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 190 }
190 191
191 192 /*
192 193 * Fill in attribute from the cache.
193 194 * If valid, then return 0 to indicate that no error occurred,
194 195 * otherwise return 1 to indicate that an error occurred.
195 196 */
196 197 static int
197 198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 199 {
199 200 rnode4_t *rp;
200 201
201 202 rp = VTOR4(vp);
202 203 mutex_enter(&rp->r_statelock);
203 204 mutex_enter(&rp->r_statev4_lock);
204 205 if (ATTRCACHE4_VALID(vp)) {
205 206 mutex_exit(&rp->r_statev4_lock);
206 207 /*
207 208 * Cached attributes are valid
208 209 */
209 210 *vap = rp->r_attr;
210 211 mutex_exit(&rp->r_statelock);
211 212 return (0);
212 213 }
213 214 mutex_exit(&rp->r_statev4_lock);
214 215 mutex_exit(&rp->r_statelock);
215 216 return (1);
216 217 }
217 218
218 219
219 220 /*
220 221 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 222 * call is synchronous because all the pages were invalidated by the
222 223 * nfs4_invalidate_pages() call.
223 224 */
224 225 void
225 226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 227 {
227 228 struct rnode4 *rp = VTOR4(vp);
228 229
229 230 /* Ensure that the ..._end_op() call has been done */
230 231 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 232
232 233 if (errno != ESTALE)
233 234 return;
234 235
235 236 mutex_enter(&rp->r_statelock);
236 237 rp->r_flags |= R4STALE;
237 238 if (!rp->r_error)
238 239 rp->r_error = errno;
239 240 mutex_exit(&rp->r_statelock);
240 241 if (nfs4_has_pages(vp))
241 242 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 243 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 244 }
244 245
245 246 /*
246 247 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 248 * page purge is done asynchronously.
248 249 */
249 250 void
250 251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 252 {
252 253 rnode4_t *rp;
253 254 char *contents;
254 255 vnode_t *xattr;
255 256 int size;
256 257 int pgflush; /* are we the page flush thread? */
257 258
258 259 /*
259 260 * Purge the DNLC for any entries which refer to this file.
260 261 */
261 262 if (vp->v_count > 1 &&
262 263 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 264 dnlc_purge_vp(vp);
264 265
265 266 /*
266 267 * Clear any readdir state bits and purge the readlink response cache.
267 268 */
268 269 rp = VTOR4(vp);
269 270 mutex_enter(&rp->r_statelock);
270 271 rp->r_flags &= ~R4LOOKUP;
271 272 contents = rp->r_symlink.contents;
272 273 size = rp->r_symlink.size;
273 274 rp->r_symlink.contents = NULL;
274 275
275 276 xattr = rp->r_xattr_dir;
276 277 rp->r_xattr_dir = NULL;
277 278
278 279 /*
279 280 * Purge pathconf cache too.
280 281 */
281 282 rp->r_pathconf.pc4_xattr_valid = 0;
282 283 rp->r_pathconf.pc4_cache_valid = 0;
283 284
284 285 pgflush = (curthread == rp->r_pgflush);
285 286 mutex_exit(&rp->r_statelock);
286 287
287 288 if (contents != NULL) {
288 289
289 290 kmem_free((void *)contents, size);
290 291 }
291 292
292 293 if (xattr != NULL)
293 294 VN_RELE(xattr);
294 295
295 296 /*
296 297 * Flush the page cache. If the current thread is the page flush
297 298 * thread, don't initiate a new page flush. There's no need for
298 299 * it, and doing it correctly is hard.
299 300 */
300 301 if (nfs4_has_pages(vp) && !pgflush) {
301 302 if (!asyncpg) {
302 303 (void) nfs4_waitfor_purge_complete(vp);
303 304 nfs4_flush_pages(vp, cr);
304 305 } else {
305 306 pgflush_t *args;
306 307
307 308 /*
308 309 * We don't hold r_statelock while creating the
309 310 * thread, in case the call blocks. So we use a
310 311 * flag to indicate that a page flush thread is
311 312 * active.
312 313 */
313 314 mutex_enter(&rp->r_statelock);
314 315 if (rp->r_flags & R4PGFLUSH) {
315 316 mutex_exit(&rp->r_statelock);
316 317 } else {
317 318 rp->r_flags |= R4PGFLUSH;
318 319 mutex_exit(&rp->r_statelock);
319 320
320 321 args = kmem_alloc(sizeof (pgflush_t),
321 322 KM_SLEEP);
322 323 args->vp = vp;
323 324 VN_HOLD(args->vp);
324 325 args->cr = cr;
325 326 crhold(args->cr);
326 327 (void) zthread_create(NULL, 0,
327 328 nfs4_pgflush_thread, args, 0,
328 329 minclsyspri);
329 330 }
330 331 }
331 332 }
332 333
333 334 /*
334 335 * Flush the readdir response cache.
335 336 */
336 337 nfs4_purge_rddir_cache(vp);
337 338 }
338 339
339 340 /*
340 341 * Invalidate all pages for the given file, after writing back the dirty
341 342 * ones.
342 343 */
343 344
344 345 void
345 346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 347 {
347 348 int error;
348 349 rnode4_t *rp = VTOR4(vp);
349 350
350 351 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 352 if (error == ENOSPC || error == EDQUOT) {
352 353 mutex_enter(&rp->r_statelock);
353 354 if (!rp->r_error)
354 355 rp->r_error = error;
355 356 mutex_exit(&rp->r_statelock);
356 357 }
357 358 }
358 359
359 360 /*
360 361 * Page flush thread.
361 362 */
362 363
363 364 static void
364 365 nfs4_pgflush_thread(pgflush_t *args)
365 366 {
366 367 rnode4_t *rp = VTOR4(args->vp);
367 368
368 369 /* remember which thread we are, so we don't deadlock ourselves */
369 370 mutex_enter(&rp->r_statelock);
370 371 ASSERT(rp->r_pgflush == NULL);
371 372 rp->r_pgflush = curthread;
372 373 mutex_exit(&rp->r_statelock);
373 374
374 375 nfs4_flush_pages(args->vp, args->cr);
375 376
376 377 mutex_enter(&rp->r_statelock);
377 378 rp->r_pgflush = NULL;
378 379 rp->r_flags &= ~R4PGFLUSH;
379 380 cv_broadcast(&rp->r_cv);
380 381 mutex_exit(&rp->r_statelock);
381 382
382 383 VN_RELE(args->vp);
383 384 crfree(args->cr);
384 385 kmem_free(args, sizeof (pgflush_t));
385 386 zthread_exit();
386 387 }
387 388
388 389 /*
389 390 * Purge the readdir cache of all entries which are not currently
390 391 * being filled.
391 392 */
392 393 void
393 394 nfs4_purge_rddir_cache(vnode_t *vp)
394 395 {
395 396 rnode4_t *rp;
396 397
397 398 rp = VTOR4(vp);
398 399
399 400 mutex_enter(&rp->r_statelock);
400 401 rp->r_direof = NULL;
401 402 rp->r_flags &= ~R4LOOKUP;
402 403 rp->r_flags |= R4READDIRWATTR;
403 404 rddir4_cache_purge(rp);
404 405 mutex_exit(&rp->r_statelock);
405 406 }
406 407
407 408 /*
408 409 * Set attributes cache for given vnode using virtual attributes. There is
409 410 * no cache validation, but if the attributes are deemed to be stale, they
410 411 * are ignored. This corresponds to nfs3_attrcache().
411 412 *
412 413 * Set the timeout value on the attribute cache and fill it
413 414 * with the passed in attributes.
414 415 */
415 416 void
416 417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 418 {
418 419 rnode4_t *rp = VTOR4(vp);
419 420
420 421 mutex_enter(&rp->r_statelock);
421 422 if (rp->r_time_attr_saved <= t)
422 423 nfs4_attrcache_va(vp, garp, FALSE);
423 424 mutex_exit(&rp->r_statelock);
424 425 }
425 426
426 427 /*
427 428 * Use the passed in virtual attributes to check to see whether the
428 429 * data and metadata caches are valid, cache the new attributes, and
429 430 * then do the cache invalidation if required.
430 431 *
431 432 * The cache validation and caching of the new attributes is done
432 433 * atomically via the use of the mutex, r_statelock. If required,
433 434 * the cache invalidation is done atomically w.r.t. the cache
434 435 * validation and caching of the attributes via the pseudo lock,
435 436 * r_serial.
436 437 *
437 438 * This routine is used to do cache validation and attributes caching
438 439 * for operations with a single set of post operation attributes.
439 440 */
440 441
441 442 void
442 443 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 444 hrtime_t t, cred_t *cr, int async,
444 445 change_info4 *cinfo)
445 446 {
446 447 rnode4_t *rp;
447 448 int mtime_changed = 0;
448 449 int ctime_changed = 0;
449 450 vsecattr_t *vsp;
450 451 int was_serial, set_time_cache_inval, recov;
451 452 vattr_t *vap = &garp->n4g_va;
452 453 mntinfo4_t *mi = VTOMI4(vp);
453 454 len_t preattr_rsize;
454 455 boolean_t writemodify_set = B_FALSE;
455 456 boolean_t cachepurge_set = B_FALSE;
456 457
457 458 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 459
459 460 /* Is curthread the recovery thread? */
460 461 mutex_enter(&mi->mi_lock);
461 462 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 463 mutex_exit(&mi->mi_lock);
463 464
464 465 rp = VTOR4(vp);
465 466 mutex_enter(&rp->r_statelock);
466 467 was_serial = (rp->r_serial == curthread);
467 468 if (rp->r_serial && !was_serial) {
468 469 klwp_t *lwp = ttolwp(curthread);
469 470
470 471 /*
471 472 * If we're the recovery thread, then purge current attrs
472 473 * and bail out to avoid potential deadlock between another
473 474 * thread caching attrs (r_serial thread), recov thread,
474 475 * and an async writer thread.
475 476 */
476 477 if (recov) {
477 478 PURGE_ATTRCACHE4_LOCKED(rp);
478 479 mutex_exit(&rp->r_statelock);
479 480 return;
480 481 }
481 482
482 483 if (lwp != NULL)
483 484 lwp->lwp_nostop++;
484 485 while (rp->r_serial != NULL) {
485 486 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 487 mutex_exit(&rp->r_statelock);
487 488 if (lwp != NULL)
488 489 lwp->lwp_nostop--;
489 490 return;
490 491 }
491 492 }
492 493 if (lwp != NULL)
493 494 lwp->lwp_nostop--;
494 495 }
495 496
496 497 /*
497 498 * If there is a page flush thread, the current thread needs to
498 499 * bail out, to prevent a possible deadlock between the current
499 500 * thread (which might be in a start_op/end_op region), the
500 501 * recovery thread, and the page flush thread. Expire the
501 502 * attribute cache, so that any attributes the current thread was
502 503 * going to set are not lost.
503 504 */
504 505 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 506 PURGE_ATTRCACHE4_LOCKED(rp);
506 507 mutex_exit(&rp->r_statelock);
507 508 return;
508 509 }
509 510
510 511 if (rp->r_time_attr_saved > t) {
511 512 /*
512 513 * Attributes have been cached since these attributes were
513 514 * probably made. If there is an inconsistency in what is
514 515 * cached, mark them invalid. If not, don't act on them.
515 516 */
516 517 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 518 PURGE_ATTRCACHE4_LOCKED(rp);
518 519 mutex_exit(&rp->r_statelock);
519 520 return;
520 521 }
521 522 set_time_cache_inval = 0;
522 523 if (cinfo) {
523 524 /*
524 525 * Only directory modifying callers pass non-NULL cinfo.
525 526 */
526 527 ASSERT(vp->v_type == VDIR);
527 528 /*
528 529 * If the cache timeout either doesn't exist or hasn't expired,
529 530 * and dir didn't change on server before dirmod op
530 531 * and dir didn't change after dirmod op but before getattr
531 532 * then there's a chance that the client's cached data for
532 533 * this object is current (not stale). No immediate cache
533 534 * flush is required.
534 535 *
535 536 */
536 537 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 538 cinfo->before == rp->r_change &&
538 539 (garp->n4g_change_valid &&
539 540 cinfo->after == garp->n4g_change)) {
540 541
541 542 /*
542 543 * If atomic isn't set, then the before/after info
543 544 * cannot be blindly trusted. For this case, we tell
544 545 * nfs4_attrcache_va to cache the attrs but also
545 546 * establish an absolute maximum cache timeout. When
546 547 * the timeout is reached, caches will be flushed.
547 548 */
548 549 if (! cinfo->atomic)
549 550 set_time_cache_inval = 1;
550 551 } else {
551 552
552 553 /*
553 554 * We're not sure exactly what changed, but we know
554 555 * what to do. flush all caches for dir. remove the
555 556 * attr timeout.
556 557 *
557 558 * a) timeout expired. flush all caches.
558 559 * b) r_change != cinfo.before. flush all caches.
559 560 * c) r_change == cinfo.before, but cinfo.after !=
560 561 * post-op getattr(change). flush all caches.
561 562 * d) post-op getattr(change) not provided by server.
562 563 * flush all caches.
563 564 */
564 565 mtime_changed = 1;
565 566 ctime_changed = 1;
566 567 rp->r_time_cache_inval = 0;
567 568 }
568 569 } else {
569 570 /*
570 571 * Write thread after writing data to file on remote server,
571 572 * will always set R4WRITEMODIFIED to indicate that file on
572 573 * remote server was modified with a WRITE operation and would
573 574 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 575 * is set, then do not check for mtime and ctime change.
575 576 */
576 577 if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 578 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 579 mtime_changed = 1;
579 580
580 581 if (rp->r_attr.va_ctime.tv_sec !=
581 582 vap->va_ctime.tv_sec ||
582 583 rp->r_attr.va_ctime.tv_nsec !=
583 584 vap->va_ctime.tv_nsec)
584 585 ctime_changed = 1;
585 586
586 587 /*
587 588 * If the change attribute was not provided by server
588 589 * or it differs, then flush all caches.
589 590 */
590 591 if (!garp->n4g_change_valid ||
591 592 rp->r_change != garp->n4g_change) {
592 593 mtime_changed = 1;
593 594 ctime_changed = 1;
594 595 }
595 596 } else {
596 597 writemodify_set = B_TRUE;
597 598 }
598 599 }
599 600
600 601 preattr_rsize = rp->r_size;
601 602
602 603 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
603 604
604 605 /*
605 606 * If we have updated filesize in nfs4_attrcache_va, as soon as we
606 607 * drop statelock we will be in transition of purging all
607 608 * our caches and updating them. It is possible for another
608 609 * thread to pick up this new file size and read in zeroed data.
609 610 * Stall other threads until the cache purge is complete.
610 611 */
611 612 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
612 613 /*
613 614 * If R4WRITEMODIFIED was set and we have updated the file
614 615 * size, Server's returned file size need not necessarily
615 616 * be because of this Client's WRITE. We need to purge
616 617 * all caches.
617 618 */
618 619 if (writemodify_set)
619 620 mtime_changed = 1;
620 621
621 622 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
622 623 rp->r_flags |= R4INCACHEPURGE;
623 624 cachepurge_set = B_TRUE;
624 625 }
625 626 }
626 627
627 628 if (!mtime_changed && !ctime_changed) {
628 629 mutex_exit(&rp->r_statelock);
629 630 return;
630 631 }
631 632
632 633 rp->r_serial = curthread;
633 634
634 635 mutex_exit(&rp->r_statelock);
635 636
636 637 /*
637 638 * If we're the recov thread, then force async nfs4_purge_caches
638 639 * to avoid potential deadlock.
639 640 */
640 641 if (mtime_changed)
641 642 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
642 643
643 644 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
644 645 mutex_enter(&rp->r_statelock);
645 646 rp->r_flags &= ~R4INCACHEPURGE;
646 647 cv_broadcast(&rp->r_cv);
647 648 mutex_exit(&rp->r_statelock);
648 649 cachepurge_set = B_FALSE;
649 650 }
650 651
651 652 if (ctime_changed) {
652 653 (void) nfs4_access_purge_rp(rp);
653 654 if (rp->r_secattr != NULL) {
654 655 mutex_enter(&rp->r_statelock);
655 656 vsp = rp->r_secattr;
656 657 rp->r_secattr = NULL;
657 658 mutex_exit(&rp->r_statelock);
658 659 if (vsp != NULL)
659 660 nfs4_acl_free_cache(vsp);
660 661 }
661 662 }
662 663
663 664 if (!was_serial) {
664 665 mutex_enter(&rp->r_statelock);
665 666 rp->r_serial = NULL;
666 667 cv_broadcast(&rp->r_cv);
667 668 mutex_exit(&rp->r_statelock);
668 669 }
669 670 }
670 671
671 672 /*
672 673 * Set attributes cache for given vnode using virtual attributes.
673 674 *
674 675 * Set the timeout value on the attribute cache and fill it
675 676 * with the passed in attributes.
676 677 *
677 678 * The caller must be holding r_statelock.
678 679 */
679 680 static void
680 681 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
681 682 {
682 683 rnode4_t *rp;
683 684 mntinfo4_t *mi;
684 685 hrtime_t delta;
685 686 hrtime_t now;
686 687 vattr_t *vap = &garp->n4g_va;
687 688
688 689 rp = VTOR4(vp);
689 690
690 691 ASSERT(MUTEX_HELD(&rp->r_statelock));
691 692 ASSERT(vap->va_mask == AT_ALL);
692 693
693 694 /* Switch to master before checking v_flag */
694 695 if (IS_SHADOW(vp, rp))
695 696 vp = RTOV4(rp);
696 697
697 698 now = gethrtime();
698 699
699 700 mi = VTOMI4(vp);
700 701
701 702 /*
702 703 * Only establish a new cache timeout (if requested). Never
703 704 * extend a timeout. Never clear a timeout. Clearing a timeout
704 705 * is done by nfs4_update_dircaches (ancestor in our call chain)
705 706 */
706 707 if (set_cache_timeout && ! rp->r_time_cache_inval)
707 708 rp->r_time_cache_inval = now + mi->mi_acdirmax;
708 709
709 710 /*
710 711 * Delta is the number of nanoseconds that we will
711 712 * cache the attributes of the file. It is based on
712 713 * the number of nanoseconds since the last time that
713 714 * we detected a change. The assumption is that files
714 715 * that changed recently are likely to change again.
715 716 * There is a minimum and a maximum for regular files
716 717 * and for directories which is enforced though.
717 718 *
718 719 * Using the time since last change was detected
719 720 * eliminates direct comparison or calculation
720 721 * using mixed client and server times. NFS does
721 722 * not make any assumptions regarding the client
722 723 * and server clocks being synchronized.
723 724 */
724 725 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
725 726 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
726 727 vap->va_size != rp->r_attr.va_size) {
727 728 rp->r_time_attr_saved = now;
728 729 }
729 730
730 731 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
731 732 delta = 0;
732 733 else {
733 734 delta = now - rp->r_time_attr_saved;
734 735 if (vp->v_type == VDIR) {
735 736 if (delta < mi->mi_acdirmin)
736 737 delta = mi->mi_acdirmin;
737 738 else if (delta > mi->mi_acdirmax)
738 739 delta = mi->mi_acdirmax;
739 740 } else {
740 741 if (delta < mi->mi_acregmin)
741 742 delta = mi->mi_acregmin;
742 743 else if (delta > mi->mi_acregmax)
743 744 delta = mi->mi_acregmax;
744 745 }
745 746 }
746 747 rp->r_time_attr_inval = now + delta;
747 748
748 749 rp->r_attr = *vap;
749 750 if (garp->n4g_change_valid)
750 751 rp->r_change = garp->n4g_change;
751 752
752 753 /*
753 754 * The attributes that were returned may be valid and can
754 755 * be used, but they may not be allowed to be cached.
755 756 * Reset the timers to cause immediate invalidation and
756 757 * clear r_change so no VERIFY operations will succeed
757 758 */
758 759 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
759 760 rp->r_time_attr_inval = now;
760 761 rp->r_time_attr_saved = now;
761 762 rp->r_change = 0;
762 763 }
763 764
764 765 /*
765 766 * If mounted_on_fileid returned AND the object is a stub,
766 767 * then set object's va_nodeid to the mounted over fid
767 768 * returned by server.
768 769 *
769 770 * If mounted_on_fileid not provided/supported, then
770 771 * just set it to 0 for now. Eventually it would be
771 772 * better to set it to a hashed version of FH. This
772 773 * would probably be good enough to provide a unique
773 774 * fid/d_ino within a dir.
774 775 *
775 776 * We don't need to carry mounted_on_fileid in the
776 777 * rnode as long as the client never requests fileid
777 778 * without also requesting mounted_on_fileid. For
778 779 * now, it stays.
779 780 */
780 781 if (garp->n4g_mon_fid_valid) {
781 782 rp->r_mntd_fid = garp->n4g_mon_fid;
782 783
783 784 if (RP_ISSTUB(rp))
784 785 rp->r_attr.va_nodeid = rp->r_mntd_fid;
785 786 }
786 787
787 788 /*
788 789 * Check to see if there are valid pathconf bits to
789 790 * cache in the rnode.
790 791 */
791 792 if (garp->n4g_ext_res) {
792 793 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
793 794 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
794 795 } else {
795 796 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
796 797 rp->r_pathconf.pc4_xattr_valid = TRUE;
797 798 rp->r_pathconf.pc4_xattr_exists =
798 799 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
799 800 }
800 801 }
801 802 }
802 803 /*
803 804 * Update the size of the file if there is no cached data or if
804 805 * the cached data is clean and there is no data being written
805 806 * out.
806 807 */
807 808 if (rp->r_size != vap->va_size &&
808 809 (!vn_has_cached_data(vp) ||
809 810 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
810 811 rp->r_size = vap->va_size;
811 812 }
812 813 nfs_setswaplike(vp, vap);
813 814 rp->r_flags &= ~R4WRITEMODIFIED;
814 815 }
815 816
816 817 /*
817 818 * Get attributes over-the-wire and update attributes cache
818 819 * if no error occurred in the over-the-wire operation.
819 820 * Return 0 if successful, otherwise error.
820 821 */
821 822 int
822 823 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
823 824 {
824 825 mntinfo4_t *mi = VTOMI4(vp);
825 826 hrtime_t t;
826 827 nfs4_recov_state_t recov_state;
827 828 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
828 829
829 830 recov_state.rs_flags = 0;
830 831 recov_state.rs_num_retry_despite_err = 0;
831 832
832 833 /* Save the original mount point security flavor */
833 834 (void) save_mnt_secinfo(mi->mi_curr_serv);
834 835
835 836 recov_retry:
836 837
837 838 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
838 839 &recov_state, NULL))) {
839 840 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
840 841 return (e.error);
841 842 }
842 843
843 844 t = gethrtime();
844 845
845 846 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
846 847
847 848 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
848 849 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
849 850 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
850 851 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
851 852 &recov_state, 1);
852 853 goto recov_retry;
853 854 }
854 855 }
855 856
856 857 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
857 858
858 859 if (!e.error) {
859 860 if (e.stat == NFS4_OK) {
860 861 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
861 862 } else {
862 863 e.error = geterrno4(e.stat);
863 864
864 865 nfs4_purge_stale_fh(e.error, vp, cr);
865 866 }
866 867 }
867 868
868 869 /*
869 870 * If getattr a node that is a stub for a crossed
870 871 * mount point, keep the original secinfo flavor for
871 872 * the current file system, not the crossed one.
872 873 */
873 874 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
874 875
875 876 return (e.error);
876 877 }
877 878
878 879 /*
879 880 * Generate a compound to get attributes over-the-wire.
880 881 */
881 882 void
882 883 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
883 884 nfs4_error_t *ep, cred_t *cr, int get_acl)
884 885 {
885 886 COMPOUND4args_clnt args;
886 887 COMPOUND4res_clnt res;
887 888 int doqueue;
888 889 rnode4_t *rp = VTOR4(vp);
889 890 nfs_argop4 argop[2];
890 891
891 892 args.ctag = TAG_GETATTR;
892 893
893 894 args.array_len = 2;
894 895 args.array = argop;
895 896
896 897 /* putfh */
897 898 argop[0].argop = OP_CPUTFH;
898 899 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
899 900
900 901 /* getattr */
901 902 /*
902 903 * Unlike nfs version 2 and 3, where getattr returns all the
903 904 * attributes, nfs version 4 returns only the ones explicitly
904 905 * asked for. This creates problems, as some system functions
905 906 * (e.g. cache check) require certain attributes and if the
906 907 * cached node lacks some attributes such as uid/gid, it can
907 908 * affect system utilities (e.g. "ls") that rely on the information
908 909 * to be there. This can lead to anything from system crashes to
909 910 * corrupted information processed by user apps.
910 911 * So to ensure that all bases are covered, request at least
911 912 * the AT_ALL attribute mask.
912 913 */
913 914 argop[1].argop = OP_GETATTR;
914 915 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
915 916 if (get_acl)
916 917 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
917 918 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
918 919
919 920 doqueue = 1;
920 921
921 922 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
922 923
923 924 if (ep->error)
924 925 return;
925 926
926 927 if (res.status != NFS4_OK) {
927 928 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 929 return;
929 930 }
930 931
931 932 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
932 933
933 934 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
934 935 }
935 936
936 937 /*
937 938 * Return either cached or remote attributes. If get remote attr
938 939 * use them to check and invalidate caches, then cache the new attributes.
939 940 */
940 941 int
941 942 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
942 943 {
943 944 int error;
944 945 rnode4_t *rp;
945 946 nfs4_ga_res_t gar;
946 947
947 948 ASSERT(nfs4_consistent_type(vp));
948 949
949 950 /*
950 951 * If we've got cached attributes, we're done, otherwise go
951 952 * to the server to get attributes, which will update the cache
952 953 * in the process. Either way, use the cached attributes for
953 954 * the caller's vattr_t.
954 955 *
955 956 * Note that we ignore the gar set by the OTW call: the attr caching
956 957 * code may make adjustments when storing to the rnode, and we want
957 958 * to see those changes here.
958 959 */
959 960 rp = VTOR4(vp);
960 961 error = 0;
961 962 mutex_enter(&rp->r_statelock);
962 963 if (!ATTRCACHE4_VALID(vp)) {
963 964 mutex_exit(&rp->r_statelock);
964 965 error = nfs4_getattr_otw(vp, &gar, cr, 0);
965 966 mutex_enter(&rp->r_statelock);
966 967 }
967 968
968 969 if (!error)
969 970 *vap = rp->r_attr;
970 971
971 972 /* Return the client's view of file size */
972 973 vap->va_size = rp->r_size;
973 974
974 975 mutex_exit(&rp->r_statelock);
975 976
976 977 ASSERT(nfs4_consistent_type(vp));
977 978
978 979 return (error);
979 980 }
980 981
981 982 int
982 983 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
983 984 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
984 985 {
985 986 COMPOUND4args_clnt args;
986 987 COMPOUND4res_clnt res;
987 988 int doqueue;
988 989 nfs_argop4 argop[2];
989 990 mntinfo4_t *mi = VTOMI4(vp);
990 991 bool_t needrecov = FALSE;
991 992 nfs4_recov_state_t recov_state;
992 993 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
993 994 nfs4_ga_ext_res_t *gerp;
994 995
995 996 recov_state.rs_flags = 0;
996 997 recov_state.rs_num_retry_despite_err = 0;
997 998
998 999 recov_retry:
999 1000 args.ctag = tag_type;
1000 1001
1001 1002 args.array_len = 2;
1002 1003 args.array = argop;
1003 1004
1004 1005 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1005 1006 if (e.error)
1006 1007 return (e.error);
1007 1008
1008 1009 /* putfh */
1009 1010 argop[0].argop = OP_CPUTFH;
1010 1011 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1011 1012
1012 1013 /* getattr */
1013 1014 argop[1].argop = OP_GETATTR;
1014 1015 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1015 1016 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1016 1017
1017 1018 doqueue = 1;
1018 1019
1019 1020 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1020 1021 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1021 1022 rnode4info(VTOR4(vp))));
1022 1023
1023 1024 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1024 1025
1025 1026 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1026 1027 if (!needrecov && e.error) {
1027 1028 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 1029 needrecov);
1029 1030 return (e.error);
1030 1031 }
1031 1032
1032 1033 if (needrecov) {
1033 1034 bool_t abort;
1034 1035
1035 1036 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1036 1037 "nfs4_attr_otw: initiating recovery\n"));
1037 1038
1038 1039 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1039 1040 NULL, OP_GETATTR, NULL, NULL, NULL);
1040 1041 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1041 1042 needrecov);
1042 1043 if (!e.error) {
1043 1044 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1044 1045 e.error = geterrno4(res.status);
1045 1046 }
1046 1047 if (abort == FALSE)
1047 1048 goto recov_retry;
1048 1049 return (e.error);
1049 1050 }
1050 1051
1051 1052 if (res.status) {
1052 1053 e.error = geterrno4(res.status);
1053 1054 } else {
1054 1055 gerp = garp->n4g_ext_res;
1055 1056 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1056 1057 garp, sizeof (nfs4_ga_res_t));
1057 1058 garp->n4g_ext_res = gerp;
1058 1059 if (garp->n4g_ext_res &&
1059 1060 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1060 1061 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1061 1062 ga_res.n4g_ext_res,
1062 1063 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1063 1064 }
1064 1065 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1065 1066 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1066 1067 needrecov);
1067 1068 return (e.error);
1068 1069 }
1069 1070
1070 1071 /*
1071 1072 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1072 1073 * for the demand-based allocation of async threads per-mount. The
1073 1074 * nfs_async_timeout is the amount of time a thread will live after it
1074 1075 * becomes idle, unless new I/O requests are received before the thread
1075 1076 * dies. See nfs4_async_putpage and nfs4_async_start.
1076 1077 */
1077 1078
1078 1079 static void nfs4_async_start(struct vfs *);
1079 1080 static void nfs4_async_pgops_start(struct vfs *);
1080 1081 static void nfs4_async_common_start(struct vfs *, int);
1081 1082
1082 1083 static void
1083 1084 free_async_args4(struct nfs4_async_reqs *args)
1084 1085 {
1085 1086 rnode4_t *rp;
1086 1087
1087 1088 if (args->a_io != NFS4_INACTIVE) {
1088 1089 rp = VTOR4(args->a_vp);
1089 1090 mutex_enter(&rp->r_statelock);
1090 1091 rp->r_count--;
1091 1092 if (args->a_io == NFS4_PUTAPAGE ||
1092 1093 args->a_io == NFS4_PAGEIO)
1093 1094 rp->r_awcount--;
1094 1095 cv_broadcast(&rp->r_cv);
1095 1096 mutex_exit(&rp->r_statelock);
1096 1097 VN_RELE(args->a_vp);
1097 1098 }
1098 1099 crfree(args->a_cred);
1099 1100 kmem_free(args, sizeof (*args));
1100 1101 }
1101 1102
1102 1103 /*
1103 1104 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1104 1105 * pageout(), running in the global zone, have legitimate reasons to do
1105 1106 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1106 1107 * use of a per-mount "asynchronous requests manager thread" which is
1107 1108 * signaled by the various asynchronous work routines when there is
1108 1109 * asynchronous work to be done. It is responsible for creating new
1109 1110 * worker threads if necessary, and notifying existing worker threads
1110 1111 * that there is work to be done.
1111 1112 *
1112 1113 * In other words, it will "take the specifications from the customers and
1113 1114 * give them to the engineers."
1114 1115 *
1115 1116 * Worker threads die off of their own accord if they are no longer
1116 1117 * needed.
1117 1118 *
1118 1119 * This thread is killed when the zone is going away or the filesystem
1119 1120 * is being unmounted.
1120 1121 */
1121 1122 void
1122 1123 nfs4_async_manager(vfs_t *vfsp)
1123 1124 {
1124 1125 callb_cpr_t cprinfo;
1125 1126 mntinfo4_t *mi;
1126 1127 uint_t max_threads;
1127 1128
1128 1129 mi = VFTOMI4(vfsp);
1129 1130
1130 1131 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1131 1132 "nfs4_async_manager");
1132 1133
1133 1134 mutex_enter(&mi->mi_async_lock);
1134 1135 /*
1135 1136 * We want to stash the max number of threads that this mount was
1136 1137 * allowed so we can use it later when the variable is set to zero as
1137 1138 * part of the zone/mount going away.
1138 1139 *
1139 1140 * We want to be able to create at least one thread to handle
1140 1141 * asynchronous inactive calls.
1141 1142 */
1142 1143 max_threads = MAX(mi->mi_max_threads, 1);
1143 1144 /*
1144 1145 * We don't want to wait for mi_max_threads to go to zero, since that
1145 1146 * happens as part of a failed unmount, but this thread should only
1146 1147 * exit when the mount is really going away.
1147 1148 *
1148 1149 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1149 1150 * attempted: the various _async_*() functions know to do things
1150 1151 * inline if mi_max_threads == 0. Henceforth we just drain out the
1151 1152 * outstanding requests.
1152 1153 *
1153 1154 * Note that we still create zthreads even if we notice the zone is
1154 1155 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1155 1156 * shutdown sequence to take slightly longer in some cases, but
1156 1157 * doesn't violate the protocol, as all threads will exit as soon as
1157 1158 * they're done processing the remaining requests.
1158 1159 */
1159 1160 for (;;) {
1160 1161 while (mi->mi_async_req_count > 0) {
1161 1162 /*
1162 1163 * Paranoia: If the mount started out having
1163 1164 * (mi->mi_max_threads == 0), and the value was
1164 1165 * later changed (via a debugger or somesuch),
1165 1166 * we could be confused since we will think we
1166 1167 * can't create any threads, and the calling
1167 1168 * code (which looks at the current value of
1168 1169 * mi->mi_max_threads, now non-zero) thinks we
1169 1170 * can.
1170 1171 *
1171 1172 * So, because we're paranoid, we create threads
1172 1173 * up to the maximum of the original and the
1173 1174 * current value. This means that future
1174 1175 * (debugger-induced) alterations of
1175 1176 * mi->mi_max_threads are ignored for our
1176 1177 * purposes, but who told them they could change
1177 1178 * random values on a live kernel anyhow?
1178 1179 */
1179 1180 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1180 1181 MAX(mi->mi_max_threads, max_threads)) {
1181 1182 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1182 1183 mutex_exit(&mi->mi_async_lock);
1183 1184 MI4_HOLD(mi);
1184 1185 VFS_HOLD(vfsp); /* hold for new thread */
1185 1186 (void) zthread_create(NULL, 0, nfs4_async_start,
1186 1187 vfsp, 0, minclsyspri);
1187 1188 mutex_enter(&mi->mi_async_lock);
1188 1189 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1189 1190 NUM_ASYNC_PGOPS_THREADS) {
1190 1191 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1191 1192 mutex_exit(&mi->mi_async_lock);
1192 1193 MI4_HOLD(mi);
1193 1194 VFS_HOLD(vfsp); /* hold for new thread */
1194 1195 (void) zthread_create(NULL, 0,
1195 1196 nfs4_async_pgops_start, vfsp, 0,
1196 1197 minclsyspri);
1197 1198 mutex_enter(&mi->mi_async_lock);
1198 1199 }
1199 1200 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1200 1201 ASSERT(mi->mi_async_req_count != 0);
1201 1202 mi->mi_async_req_count--;
1202 1203 }
1203 1204
1204 1205 mutex_enter(&mi->mi_lock);
1205 1206 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1206 1207 mutex_exit(&mi->mi_lock);
1207 1208 break;
1208 1209 }
1209 1210 mutex_exit(&mi->mi_lock);
1210 1211
1211 1212 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1212 1213 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1213 1214 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1214 1215 }
1215 1216
1216 1217 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1217 1218 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1218 1219 /*
1219 1220 * Let everyone know we're done.
1220 1221 */
1221 1222 mi->mi_manager_thread = NULL;
1222 1223 /*
1223 1224 * Wake up the inactive thread.
1224 1225 */
1225 1226 cv_broadcast(&mi->mi_inact_req_cv);
1226 1227 /*
1227 1228 * Wake up anyone sitting in nfs4_async_manager_stop()
1228 1229 */
1229 1230 cv_broadcast(&mi->mi_async_cv);
1230 1231 /*
1231 1232 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1232 1233 * since CALLB_CPR_EXIT is actually responsible for releasing
1233 1234 * 'mi_async_lock'.
1234 1235 */
1235 1236 CALLB_CPR_EXIT(&cprinfo);
1236 1237 VFS_RELE(vfsp); /* release thread's hold */
1237 1238 MI4_RELE(mi);
1238 1239 zthread_exit();
1239 1240 }
1240 1241
1241 1242 /*
1242 1243 * Signal (and wait for) the async manager thread to clean up and go away.
1243 1244 */
1244 1245 void
1245 1246 nfs4_async_manager_stop(vfs_t *vfsp)
1246 1247 {
1247 1248 mntinfo4_t *mi = VFTOMI4(vfsp);
1248 1249
1249 1250 mutex_enter(&mi->mi_async_lock);
1250 1251 mutex_enter(&mi->mi_lock);
1251 1252 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1252 1253 mutex_exit(&mi->mi_lock);
1253 1254 cv_broadcast(&mi->mi_async_reqs_cv);
1254 1255 /*
1255 1256 * Wait for the async manager thread to die.
1256 1257 */
1257 1258 while (mi->mi_manager_thread != NULL)
1258 1259 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1259 1260 mutex_exit(&mi->mi_async_lock);
1260 1261 }
1261 1262
1262 1263 int
1263 1264 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1264 1265 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1265 1266 u_offset_t, caddr_t, struct seg *, cred_t *))
1266 1267 {
1267 1268 rnode4_t *rp;
1268 1269 mntinfo4_t *mi;
1269 1270 struct nfs4_async_reqs *args;
1270 1271
1271 1272 rp = VTOR4(vp);
1272 1273 ASSERT(rp->r_freef == NULL);
1273 1274
1274 1275 mi = VTOMI4(vp);
1275 1276
1276 1277 /*
1277 1278 * If addr falls in a different segment, don't bother doing readahead.
1278 1279 */
1279 1280 if (addr >= seg->s_base + seg->s_size)
1280 1281 return (-1);
1281 1282
1282 1283 /*
1283 1284 * If we can't allocate a request structure, punt on the readahead.
1284 1285 */
1285 1286 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1286 1287 return (-1);
1287 1288
1288 1289 /*
1289 1290 * If a lock operation is pending, don't initiate any new
1290 1291 * readaheads. Otherwise, bump r_count to indicate the new
1291 1292 * asynchronous I/O.
1292 1293 */
1293 1294 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1294 1295 kmem_free(args, sizeof (*args));
1295 1296 return (-1);
1296 1297 }
1297 1298 mutex_enter(&rp->r_statelock);
1298 1299 rp->r_count++;
1299 1300 mutex_exit(&rp->r_statelock);
1300 1301 nfs_rw_exit(&rp->r_lkserlock);
1301 1302
1302 1303 args->a_next = NULL;
1303 1304 #ifdef DEBUG
1304 1305 args->a_queuer = curthread;
1305 1306 #endif
1306 1307 VN_HOLD(vp);
1307 1308 args->a_vp = vp;
1308 1309 ASSERT(cr != NULL);
1309 1310 crhold(cr);
1310 1311 args->a_cred = cr;
1311 1312 args->a_io = NFS4_READ_AHEAD;
1312 1313 args->a_nfs4_readahead = readahead;
1313 1314 args->a_nfs4_blkoff = blkoff;
1314 1315 args->a_nfs4_seg = seg;
1315 1316 args->a_nfs4_addr = addr;
1316 1317
1317 1318 mutex_enter(&mi->mi_async_lock);
1318 1319
1319 1320 /*
1320 1321 * If asyncio has been disabled, don't bother readahead.
1321 1322 */
1322 1323 if (mi->mi_max_threads == 0) {
1323 1324 mutex_exit(&mi->mi_async_lock);
1324 1325 goto noasync;
1325 1326 }
1326 1327
1327 1328 /*
1328 1329 * Link request structure into the async list and
1329 1330 * wakeup async thread to do the i/o.
1330 1331 */
1331 1332 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1332 1333 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1333 1334 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1334 1335 } else {
1335 1336 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1336 1337 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1337 1338 }
1338 1339
1339 1340 if (mi->mi_io_kstats) {
1340 1341 mutex_enter(&mi->mi_lock);
1341 1342 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1342 1343 mutex_exit(&mi->mi_lock);
1343 1344 }
1344 1345
1345 1346 mi->mi_async_req_count++;
1346 1347 ASSERT(mi->mi_async_req_count != 0);
1347 1348 cv_signal(&mi->mi_async_reqs_cv);
1348 1349 mutex_exit(&mi->mi_async_lock);
1349 1350 return (0);
1350 1351
1351 1352 noasync:
1352 1353 mutex_enter(&rp->r_statelock);
1353 1354 rp->r_count--;
1354 1355 cv_broadcast(&rp->r_cv);
1355 1356 mutex_exit(&rp->r_statelock);
1356 1357 VN_RELE(vp);
1357 1358 crfree(cr);
1358 1359 kmem_free(args, sizeof (*args));
1359 1360 return (-1);
1360 1361 }
1361 1362
1362 1363 static void
1363 1364 nfs4_async_start(struct vfs *vfsp)
1364 1365 {
1365 1366 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1366 1367 }
1367 1368
1368 1369 static void
1369 1370 nfs4_async_pgops_start(struct vfs *vfsp)
1370 1371 {
1371 1372 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1372 1373 }
1373 1374
1374 1375 /*
1375 1376 * The async queues for each mounted file system are arranged as a
1376 1377 * set of queues, one for each async i/o type. Requests are taken
1377 1378 * from the queues in a round-robin fashion. A number of consecutive
1378 1379 * requests are taken from each queue before moving on to the next
1379 1380 * queue. This functionality may allow the NFS Version 2 server to do
1380 1381 * write clustering, even if the client is mixing writes and reads
1381 1382 * because it will take multiple write requests from the queue
1382 1383 * before processing any of the other async i/o types.
1383 1384 *
1384 1385 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1385 1386 * model defined by cpr to suspend the system. Specifically over the
1386 1387 * wire calls are cpr-unsafe. The thread should be reevaluated in
1387 1388 * case of future updates to the cpr model.
1388 1389 */
1389 1390 static void
1390 1391 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1391 1392 {
1392 1393 struct nfs4_async_reqs *args;
1393 1394 mntinfo4_t *mi = VFTOMI4(vfsp);
1394 1395 clock_t time_left = 1;
1395 1396 callb_cpr_t cprinfo;
1396 1397 int i;
1397 - extern int nfs_async_timeout;
1398 + extern volatile int nfs_async_timeout;
1398 1399 int async_types;
1399 1400 kcondvar_t *async_work_cv;
1400 1401
1401 1402 if (async_queue == NFS4_ASYNC_QUEUE) {
1402 1403 async_types = NFS4_ASYNC_TYPES;
1403 1404 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1404 1405 } else {
1405 1406 async_types = NFS4_ASYNC_PGOPS_TYPES;
1406 1407 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1407 1408 }
1408 1409
1409 1410 /*
1410 1411 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1411 1412 * built in an implementation independent manner.
1412 1413 */
1413 1414 if (nfs_async_timeout == -1)
1414 1415 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1415 1416
1416 1417 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1417 1418
1418 1419 mutex_enter(&mi->mi_async_lock);
1419 1420 for (;;) {
1420 1421 /*
1421 1422 * Find the next queue containing an entry. We start
1422 1423 * at the current queue pointer and then round robin
1423 1424 * through all of them until we either find a non-empty
1424 1425 * queue or have looked through all of them.
1425 1426 */
1426 1427 for (i = 0; i < async_types; i++) {
1427 1428 args = *mi->mi_async_curr[async_queue];
1428 1429 if (args != NULL)
1429 1430 break;
1430 1431 mi->mi_async_curr[async_queue]++;
1431 1432 if (mi->mi_async_curr[async_queue] ==
1432 1433 &mi->mi_async_reqs[async_types]) {
1433 1434 mi->mi_async_curr[async_queue] =
1434 1435 &mi->mi_async_reqs[0];
1435 1436 }
1436 1437 }
1437 1438 /*
1438 1439 * If we didn't find an entry, then block until woken up
1439 1440 * again and then look through the queues again.
1440 1441 */
1441 1442 if (args == NULL) {
1442 1443 /*
1443 1444 * Exiting is considered to be safe for CPR as well
1444 1445 */
1445 1446 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1446 1447
1447 1448 /*
1448 1449 * Wakeup thread waiting to unmount the file
1449 1450 * system only if all async threads are inactive.
1450 1451 *
1451 1452 * If we've timed-out and there's nothing to do,
1452 1453 * then get rid of this thread.
1453 1454 */
1454 1455 if (mi->mi_max_threads == 0 || time_left <= 0) {
1455 1456 --mi->mi_threads[async_queue];
1456 1457
1457 1458 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1458 1459 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1459 1460 cv_signal(&mi->mi_async_cv);
1460 1461 CALLB_CPR_EXIT(&cprinfo);
1461 1462 VFS_RELE(vfsp); /* release thread's hold */
1462 1463 MI4_RELE(mi);
1463 1464 zthread_exit();
1464 1465 /* NOTREACHED */
1465 1466 }
1466 1467 time_left = cv_reltimedwait(async_work_cv,
1467 1468 &mi->mi_async_lock, nfs_async_timeout,
1468 1469 TR_CLOCK_TICK);
1469 1470
1470 1471 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1471 1472
1472 1473 continue;
1473 1474 } else {
1474 1475 time_left = 1;
1475 1476 }
1476 1477
1477 1478 /*
1478 1479 * Remove the request from the async queue and then
1479 1480 * update the current async request queue pointer. If
1480 1481 * the current queue is empty or we have removed enough
1481 1482 * consecutive entries from it, then reset the counter
1482 1483 * for this queue and then move the current pointer to
1483 1484 * the next queue.
1484 1485 */
1485 1486 *mi->mi_async_curr[async_queue] = args->a_next;
1486 1487 if (*mi->mi_async_curr[async_queue] == NULL ||
1487 1488 --mi->mi_async_clusters[args->a_io] == 0) {
1488 1489 mi->mi_async_clusters[args->a_io] =
1489 1490 mi->mi_async_init_clusters;
1490 1491 mi->mi_async_curr[async_queue]++;
1491 1492 if (mi->mi_async_curr[async_queue] ==
1492 1493 &mi->mi_async_reqs[async_types]) {
1493 1494 mi->mi_async_curr[async_queue] =
1494 1495 &mi->mi_async_reqs[0];
1495 1496 }
1496 1497 }
1497 1498
1498 1499 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1499 1500 mutex_enter(&mi->mi_lock);
1500 1501 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1501 1502 mutex_exit(&mi->mi_lock);
1502 1503 }
1503 1504
1504 1505 mutex_exit(&mi->mi_async_lock);
1505 1506
1506 1507 /*
1507 1508 * Obtain arguments from the async request structure.
1508 1509 */
1509 1510 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1510 1511 (*args->a_nfs4_readahead)(args->a_vp,
1511 1512 args->a_nfs4_blkoff, args->a_nfs4_addr,
1512 1513 args->a_nfs4_seg, args->a_cred);
1513 1514 } else if (args->a_io == NFS4_PUTAPAGE) {
1514 1515 (void) (*args->a_nfs4_putapage)(args->a_vp,
1515 1516 args->a_nfs4_pp, args->a_nfs4_off,
1516 1517 args->a_nfs4_len, args->a_nfs4_flags,
1517 1518 args->a_cred);
1518 1519 } else if (args->a_io == NFS4_PAGEIO) {
1519 1520 (void) (*args->a_nfs4_pageio)(args->a_vp,
1520 1521 args->a_nfs4_pp, args->a_nfs4_off,
1521 1522 args->a_nfs4_len, args->a_nfs4_flags,
1522 1523 args->a_cred);
1523 1524 } else if (args->a_io == NFS4_READDIR) {
1524 1525 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1525 1526 args->a_nfs4_rdc, args->a_cred));
1526 1527 } else if (args->a_io == NFS4_COMMIT) {
1527 1528 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1528 1529 args->a_nfs4_offset, args->a_nfs4_count,
1529 1530 args->a_cred);
1530 1531 } else if (args->a_io == NFS4_INACTIVE) {
1531 1532 nfs4_inactive_otw(args->a_vp, args->a_cred);
1532 1533 }
1533 1534
1534 1535 /*
1535 1536 * Now, release the vnode and free the credentials
1536 1537 * structure.
1537 1538 */
1538 1539 free_async_args4(args);
1539 1540 /*
1540 1541 * Reacquire the mutex because it will be needed above.
1541 1542 */
1542 1543 mutex_enter(&mi->mi_async_lock);
1543 1544 }
1544 1545 }
1545 1546
1546 1547 /*
1547 1548 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1548 1549 * part of VOP_INACTIVE.
1549 1550 */
1550 1551
1551 1552 void
1552 1553 nfs4_inactive_thread(mntinfo4_t *mi)
1553 1554 {
1554 1555 struct nfs4_async_reqs *args;
1555 1556 callb_cpr_t cprinfo;
1556 1557 vfs_t *vfsp = mi->mi_vfsp;
1557 1558
1558 1559 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1559 1560 "nfs4_inactive_thread");
1560 1561
1561 1562 for (;;) {
1562 1563 mutex_enter(&mi->mi_async_lock);
1563 1564 args = mi->mi_async_reqs[NFS4_INACTIVE];
1564 1565 if (args == NULL) {
1565 1566 mutex_enter(&mi->mi_lock);
1566 1567 /*
1567 1568 * We don't want to exit until the async manager is done
1568 1569 * with its work; hence the check for mi_manager_thread
1569 1570 * being NULL.
1570 1571 *
1571 1572 * The async manager thread will cv_broadcast() on
1572 1573 * mi_inact_req_cv when it's done, at which point we'll
1573 1574 * wake up and exit.
1574 1575 */
1575 1576 if (mi->mi_manager_thread == NULL)
1576 1577 goto die;
1577 1578 mi->mi_flags |= MI4_INACTIVE_IDLE;
1578 1579 mutex_exit(&mi->mi_lock);
1579 1580 cv_signal(&mi->mi_async_cv);
1580 1581 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1581 1582 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1582 1583 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1583 1584 mutex_exit(&mi->mi_async_lock);
1584 1585 } else {
1585 1586 mutex_enter(&mi->mi_lock);
1586 1587 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1587 1588 mutex_exit(&mi->mi_lock);
1588 1589 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1589 1590 mutex_exit(&mi->mi_async_lock);
1590 1591 nfs4_inactive_otw(args->a_vp, args->a_cred);
1591 1592 crfree(args->a_cred);
1592 1593 kmem_free(args, sizeof (*args));
1593 1594 }
1594 1595 }
1595 1596 die:
1596 1597 mutex_exit(&mi->mi_lock);
1597 1598 mi->mi_inactive_thread = NULL;
1598 1599 cv_signal(&mi->mi_async_cv);
1599 1600
1600 1601 /*
1601 1602 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1602 1603 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1603 1604 */
1604 1605 CALLB_CPR_EXIT(&cprinfo);
1605 1606
1606 1607 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1607 1608 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1608 1609
1609 1610 MI4_RELE(mi);
1610 1611 zthread_exit();
1611 1612 /* NOTREACHED */
1612 1613 }
1613 1614
1614 1615 /*
1615 1616 * nfs_async_stop:
1616 1617 * Wait for all outstanding putpage operations and the inactive thread to
1617 1618 * complete; nfs4_async_stop_sig() without interruptibility.
1618 1619 */
1619 1620 void
1620 1621 nfs4_async_stop(struct vfs *vfsp)
1621 1622 {
1622 1623 mntinfo4_t *mi = VFTOMI4(vfsp);
1623 1624
1624 1625 /*
1625 1626 * Wait for all outstanding async operations to complete and for
1626 1627 * worker threads to exit.
1627 1628 */
1628 1629 mutex_enter(&mi->mi_async_lock);
1629 1630 mi->mi_max_threads = 0;
1630 1631 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1631 1632 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1632 1633 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1633 1634 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1634 1635
1635 1636 /*
1636 1637 * Wait for the inactive thread to finish doing what it's doing. It
1637 1638 * won't exit until the last reference to the vfs_t goes away.
1638 1639 */
1639 1640 if (mi->mi_inactive_thread != NULL) {
1640 1641 mutex_enter(&mi->mi_lock);
1641 1642 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1642 1643 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1643 1644 mutex_exit(&mi->mi_lock);
1644 1645 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1645 1646 mutex_enter(&mi->mi_lock);
1646 1647 }
1647 1648 mutex_exit(&mi->mi_lock);
1648 1649 }
1649 1650 mutex_exit(&mi->mi_async_lock);
1650 1651 }
1651 1652
1652 1653 /*
1653 1654  * nfs4_async_stop_sig:
1654 1655 * Wait for all outstanding putpage operations and the inactive thread to
1655 1656 * complete. If a signal is delivered we will abort and return non-zero;
1656 1657 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1657 1658 * need to make it interruptible.
1658 1659 */
1659 1660 int
1660 1661 nfs4_async_stop_sig(struct vfs *vfsp)
1661 1662 {
1662 1663 mntinfo4_t *mi = VFTOMI4(vfsp);
1663 1664 ushort_t omax;
1664 1665 bool_t intr = FALSE;
1665 1666
1666 1667 /*
1667 1668 * Wait for all outstanding putpage operations to complete and for
1668 1669 * worker threads to exit.
1669 1670 */
1670 1671 mutex_enter(&mi->mi_async_lock);
1671 1672 omax = mi->mi_max_threads;
1672 1673 mi->mi_max_threads = 0;
1673 1674 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1674 1675 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1675 1676 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1676 1677 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1677 1678 intr = TRUE;
1678 1679 goto interrupted;
1679 1680 }
1680 1681 }
1681 1682
1682 1683 /*
1683 1684 * Wait for the inactive thread to finish doing what it's doing. It
1684 1685  * won't exit until the last reference to the vfs_t goes away.
1685 1686 */
1686 1687 if (mi->mi_inactive_thread != NULL) {
1687 1688 mutex_enter(&mi->mi_lock);
1688 1689 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1689 1690 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1690 1691 mutex_exit(&mi->mi_lock);
1691 1692 if (!cv_wait_sig(&mi->mi_async_cv,
1692 1693 &mi->mi_async_lock)) {
1693 1694 intr = TRUE;
1694 1695 goto interrupted;
1695 1696 }
1696 1697 mutex_enter(&mi->mi_lock);
1697 1698 }
1698 1699 mutex_exit(&mi->mi_lock);
1699 1700 }
1700 1701 interrupted:
1701 1702 if (intr)
1702 1703 mi->mi_max_threads = omax;
1703 1704 mutex_exit(&mi->mi_async_lock);
1704 1705
1705 1706 return (intr);
1706 1707 }
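/*
 * Illustrative caller sketch (hypothetical, not part of this change): a
 * non-forced unmount would typically abort when the wait above is
 * interrupted, e.g.
 *
 *	if (nfs4_async_stop_sig(vfsp))
 *		return (EINTR);
 */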
1707 1708
1708 1709 int
1709 1710 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1710 1711 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1711 1712 u_offset_t, size_t, int, cred_t *))
1712 1713 {
1713 1714 rnode4_t *rp;
1714 1715 mntinfo4_t *mi;
1715 1716 struct nfs4_async_reqs *args;
1716 1717
1717 1718 ASSERT(flags & B_ASYNC);
1718 1719 ASSERT(vp->v_vfsp != NULL);
1719 1720
1720 1721 rp = VTOR4(vp);
1721 1722 ASSERT(rp->r_count > 0);
1722 1723
1723 1724 mi = VTOMI4(vp);
1724 1725
1725 1726 /*
1726 1727 * If we can't allocate a request structure, do the putpage
1727 1728 * operation synchronously in this thread's context.
1728 1729 */
1729 1730 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1730 1731 goto noasync;
1731 1732
1732 1733 args->a_next = NULL;
1733 1734 #ifdef DEBUG
1734 1735 args->a_queuer = curthread;
1735 1736 #endif
1736 1737 VN_HOLD(vp);
1737 1738 args->a_vp = vp;
1738 1739 ASSERT(cr != NULL);
1739 1740 crhold(cr);
1740 1741 args->a_cred = cr;
1741 1742 args->a_io = NFS4_PUTAPAGE;
1742 1743 args->a_nfs4_putapage = putapage;
1743 1744 args->a_nfs4_pp = pp;
1744 1745 args->a_nfs4_off = off;
1745 1746 args->a_nfs4_len = (uint_t)len;
1746 1747 args->a_nfs4_flags = flags;
1747 1748
1748 1749 mutex_enter(&mi->mi_async_lock);
1749 1750
1750 1751 /*
1751 1752 * If asyncio has been disabled, then make a synchronous request.
1752 1753 	 * This check is done a second time in case async io was disabled
1753 1754 * while this thread was blocked waiting for memory pressure to
1754 1755 * reduce or for the queue to drain.
1755 1756 */
1756 1757 if (mi->mi_max_threads == 0) {
1757 1758 mutex_exit(&mi->mi_async_lock);
1758 1759
1759 1760 VN_RELE(vp);
1760 1761 crfree(cr);
1761 1762 kmem_free(args, sizeof (*args));
1762 1763 goto noasync;
1763 1764 }
1764 1765
1765 1766 /*
1766 1767 * Link request structure into the async list and
1767 1768 * wakeup async thread to do the i/o.
1768 1769 */
1769 1770 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1770 1771 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1771 1772 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1772 1773 } else {
1773 1774 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1774 1775 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1775 1776 }
1776 1777
1777 1778 mutex_enter(&rp->r_statelock);
1778 1779 rp->r_count++;
1779 1780 rp->r_awcount++;
1780 1781 mutex_exit(&rp->r_statelock);
1781 1782
1782 1783 if (mi->mi_io_kstats) {
1783 1784 mutex_enter(&mi->mi_lock);
1784 1785 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1785 1786 mutex_exit(&mi->mi_lock);
1786 1787 }
1787 1788
1788 1789 mi->mi_async_req_count++;
1789 1790 ASSERT(mi->mi_async_req_count != 0);
1790 1791 cv_signal(&mi->mi_async_reqs_cv);
1791 1792 mutex_exit(&mi->mi_async_lock);
1792 1793 return (0);
1793 1794
1794 1795 noasync:
1795 1796
1796 1797 if (curproc == proc_pageout || curproc == proc_fsflush) {
1797 1798 /*
1798 1799 		 * If we get here in the context of pageout/fsflush, or we
1799 1800 		 * have run out of memory, or we're attempting to unmount,
1800 1801 		 * we refuse to do a sync write, because this may hang
1801 1802 		 * pageout/fsflush and the machine. In this case,
1802 1803 * we just re-mark the page as dirty and punt on the page.
1803 1804 *
1804 1805 * Make sure B_FORCE isn't set. We can re-mark the
1805 1806 * pages as dirty and unlock the pages in one swoop by
1806 1807 * passing in B_ERROR to pvn_write_done(). However,
1807 1808 * we should make sure B_FORCE isn't set - we don't
1808 1809 * want the page tossed before it gets written out.
1809 1810 */
1810 1811 if (flags & B_FORCE)
1811 1812 flags &= ~(B_INVAL | B_FORCE);
1812 1813 pvn_write_done(pp, flags | B_ERROR);
1813 1814 return (0);
1814 1815 }
1815 1816
1816 1817 if (nfs_zone() != mi->mi_zone) {
1817 1818 /*
1818 1819 * So this was a cross-zone sync putpage.
1819 1820 *
1820 1821 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1821 1822 * as dirty and unlock them.
1822 1823 *
1823 1824 * We don't want to clear B_FORCE here as the caller presumably
1824 1825 * knows what they're doing if they set it.
1825 1826 */
1826 1827 pvn_write_done(pp, flags | B_ERROR);
1827 1828 return (EPERM);
1828 1829 }
1829 1830 return ((*putapage)(vp, pp, off, len, flags, cr));
1830 1831 }
1831 1832
1832 1833 int
1833 1834 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1834 1835 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1835 1836 size_t, int, cred_t *))
1836 1837 {
1837 1838 rnode4_t *rp;
1838 1839 mntinfo4_t *mi;
1839 1840 struct nfs4_async_reqs *args;
1840 1841
1841 1842 ASSERT(flags & B_ASYNC);
1842 1843 ASSERT(vp->v_vfsp != NULL);
1843 1844
1844 1845 rp = VTOR4(vp);
1845 1846 ASSERT(rp->r_count > 0);
1846 1847
1847 1848 mi = VTOMI4(vp);
1848 1849
1849 1850 /*
1850 1851 * If we can't allocate a request structure, do the pageio
1851 1852 * request synchronously in this thread's context.
1852 1853 */
1853 1854 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1854 1855 goto noasync;
1855 1856
1856 1857 args->a_next = NULL;
1857 1858 #ifdef DEBUG
1858 1859 args->a_queuer = curthread;
1859 1860 #endif
1860 1861 VN_HOLD(vp);
1861 1862 args->a_vp = vp;
1862 1863 ASSERT(cr != NULL);
1863 1864 crhold(cr);
1864 1865 args->a_cred = cr;
1865 1866 args->a_io = NFS4_PAGEIO;
1866 1867 args->a_nfs4_pageio = pageio;
1867 1868 args->a_nfs4_pp = pp;
1868 1869 args->a_nfs4_off = io_off;
1869 1870 args->a_nfs4_len = (uint_t)io_len;
1870 1871 args->a_nfs4_flags = flags;
1871 1872
1872 1873 mutex_enter(&mi->mi_async_lock);
1873 1874
1874 1875 /*
1875 1876 * If asyncio has been disabled, then make a synchronous request.
1876 1877 	 * This check is done a second time in case async io was disabled
1877 1878 * while this thread was blocked waiting for memory pressure to
1878 1879 * reduce or for the queue to drain.
1879 1880 */
1880 1881 if (mi->mi_max_threads == 0) {
1881 1882 mutex_exit(&mi->mi_async_lock);
1882 1883
1883 1884 VN_RELE(vp);
1884 1885 crfree(cr);
1885 1886 kmem_free(args, sizeof (*args));
1886 1887 goto noasync;
1887 1888 }
1888 1889
1889 1890 /*
1890 1891 * Link request structure into the async list and
1891 1892 * wakeup async thread to do the i/o.
1892 1893 */
1893 1894 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1894 1895 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1895 1896 mi->mi_async_tail[NFS4_PAGEIO] = args;
1896 1897 } else {
1897 1898 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1898 1899 mi->mi_async_tail[NFS4_PAGEIO] = args;
1899 1900 }
1900 1901
1901 1902 mutex_enter(&rp->r_statelock);
1902 1903 rp->r_count++;
1903 1904 rp->r_awcount++;
1904 1905 mutex_exit(&rp->r_statelock);
1905 1906
1906 1907 if (mi->mi_io_kstats) {
1907 1908 mutex_enter(&mi->mi_lock);
1908 1909 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1909 1910 mutex_exit(&mi->mi_lock);
1910 1911 }
1911 1912
1912 1913 mi->mi_async_req_count++;
1913 1914 ASSERT(mi->mi_async_req_count != 0);
1914 1915 cv_signal(&mi->mi_async_reqs_cv);
1915 1916 mutex_exit(&mi->mi_async_lock);
1916 1917 return (0);
1917 1918
1918 1919 noasync:
1919 1920 /*
1920 1921 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1921 1922 * the page list), for writes we do it synchronously, except for
1922 1923 * proc_pageout/proc_fsflush as described below.
1923 1924 */
1924 1925 if (flags & B_READ) {
1925 1926 pvn_read_done(pp, flags | B_ERROR);
1926 1927 return (0);
1927 1928 }
1928 1929
1929 1930 if (curproc == proc_pageout || curproc == proc_fsflush) {
1930 1931 /*
1931 1932 		 * If we get here in the context of pageout/fsflush,
1932 1933 * we refuse to do a sync write, because this may hang
1933 1934 * pageout/fsflush (and the machine). In this case, we just
1934 1935 * re-mark the page as dirty and punt on the page.
1935 1936 *
1936 1937 * Make sure B_FORCE isn't set. We can re-mark the
1937 1938 * pages as dirty and unlock the pages in one swoop by
1938 1939 * passing in B_ERROR to pvn_write_done(). However,
1939 1940 * we should make sure B_FORCE isn't set - we don't
1940 1941 * want the page tossed before it gets written out.
1941 1942 */
1942 1943 if (flags & B_FORCE)
1943 1944 flags &= ~(B_INVAL | B_FORCE);
1944 1945 pvn_write_done(pp, flags | B_ERROR);
1945 1946 return (0);
1946 1947 }
1947 1948
1948 1949 if (nfs_zone() != mi->mi_zone) {
1949 1950 /*
1950 1951 * So this was a cross-zone sync pageio. We pass in B_ERROR
1951 1952 * to pvn_write_done() to re-mark the pages as dirty and unlock
1952 1953 * them.
1953 1954 *
1954 1955 * We don't want to clear B_FORCE here as the caller presumably
1955 1956 * knows what they're doing if they set it.
1956 1957 */
1957 1958 pvn_write_done(pp, flags | B_ERROR);
1958 1959 return (EPERM);
1959 1960 }
1960 1961 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1961 1962 }
1962 1963
1963 1964 void
1964 1965 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1965 1966 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1966 1967 {
1967 1968 rnode4_t *rp;
1968 1969 mntinfo4_t *mi;
1969 1970 struct nfs4_async_reqs *args;
1970 1971
1971 1972 rp = VTOR4(vp);
1972 1973 ASSERT(rp->r_freef == NULL);
1973 1974
1974 1975 mi = VTOMI4(vp);
1975 1976
1976 1977 /*
1977 1978 * If we can't allocate a request structure, skip the readdir.
1978 1979 */
1979 1980 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1980 1981 goto noasync;
1981 1982
1982 1983 args->a_next = NULL;
1983 1984 #ifdef DEBUG
1984 1985 args->a_queuer = curthread;
1985 1986 #endif
1986 1987 VN_HOLD(vp);
1987 1988 args->a_vp = vp;
1988 1989 ASSERT(cr != NULL);
1989 1990 crhold(cr);
1990 1991 args->a_cred = cr;
1991 1992 args->a_io = NFS4_READDIR;
1992 1993 args->a_nfs4_readdir = readdir;
1993 1994 args->a_nfs4_rdc = rdc;
1994 1995
1995 1996 mutex_enter(&mi->mi_async_lock);
1996 1997
1997 1998 /*
1998 1999 * If asyncio has been disabled, then skip this request
1999 2000 */
2000 2001 if (mi->mi_max_threads == 0) {
2001 2002 mutex_exit(&mi->mi_async_lock);
2002 2003
2003 2004 VN_RELE(vp);
2004 2005 crfree(cr);
2005 2006 kmem_free(args, sizeof (*args));
2006 2007 goto noasync;
2007 2008 }
2008 2009
2009 2010 /*
2010 2011 * Link request structure into the async list and
2011 2012 * wakeup async thread to do the i/o.
2012 2013 */
2013 2014 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2014 2015 mi->mi_async_reqs[NFS4_READDIR] = args;
2015 2016 mi->mi_async_tail[NFS4_READDIR] = args;
2016 2017 } else {
2017 2018 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2018 2019 mi->mi_async_tail[NFS4_READDIR] = args;
2019 2020 }
2020 2021
2021 2022 mutex_enter(&rp->r_statelock);
2022 2023 rp->r_count++;
2023 2024 mutex_exit(&rp->r_statelock);
2024 2025
2025 2026 if (mi->mi_io_kstats) {
2026 2027 mutex_enter(&mi->mi_lock);
2027 2028 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2028 2029 mutex_exit(&mi->mi_lock);
2029 2030 }
2030 2031
2031 2032 mi->mi_async_req_count++;
2032 2033 ASSERT(mi->mi_async_req_count != 0);
2033 2034 cv_signal(&mi->mi_async_reqs_cv);
2034 2035 mutex_exit(&mi->mi_async_lock);
2035 2036 return;
2036 2037
2037 2038 noasync:
2038 2039 mutex_enter(&rp->r_statelock);
2039 2040 rdc->entries = NULL;
2040 2041 /*
2041 2042 * Indicate that no one is trying to fill this entry and
2042 2043 * it still needs to be filled.
2043 2044 */
2044 2045 rdc->flags &= ~RDDIR;
2045 2046 rdc->flags |= RDDIRREQ;
2046 2047 rddir4_cache_rele(rp, rdc);
2047 2048 mutex_exit(&rp->r_statelock);
2048 2049 }
2049 2050
2050 2051 void
2051 2052 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2052 2053 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2053 2054 cred_t *))
2054 2055 {
2055 2056 rnode4_t *rp;
2056 2057 mntinfo4_t *mi;
2057 2058 struct nfs4_async_reqs *args;
2058 2059 page_t *pp;
2059 2060
2060 2061 rp = VTOR4(vp);
2061 2062 mi = VTOMI4(vp);
2062 2063
2063 2064 /*
2064 2065 * If we can't allocate a request structure, do the commit
2065 2066 * operation synchronously in this thread's context.
2066 2067 */
2067 2068 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2068 2069 goto noasync;
2069 2070
2070 2071 args->a_next = NULL;
2071 2072 #ifdef DEBUG
2072 2073 args->a_queuer = curthread;
2073 2074 #endif
2074 2075 VN_HOLD(vp);
2075 2076 args->a_vp = vp;
2076 2077 ASSERT(cr != NULL);
2077 2078 crhold(cr);
2078 2079 args->a_cred = cr;
2079 2080 args->a_io = NFS4_COMMIT;
2080 2081 args->a_nfs4_commit = commit;
2081 2082 args->a_nfs4_plist = plist;
2082 2083 args->a_nfs4_offset = offset;
2083 2084 args->a_nfs4_count = count;
2084 2085
2085 2086 mutex_enter(&mi->mi_async_lock);
2086 2087
2087 2088 /*
2088 2089 * If asyncio has been disabled, then make a synchronous request.
2089 2090 	 * This check is done a second time in case async io was disabled
2090 2091 * while this thread was blocked waiting for memory pressure to
2091 2092 * reduce or for the queue to drain.
2092 2093 */
2093 2094 if (mi->mi_max_threads == 0) {
2094 2095 mutex_exit(&mi->mi_async_lock);
2095 2096
2096 2097 VN_RELE(vp);
2097 2098 crfree(cr);
2098 2099 kmem_free(args, sizeof (*args));
2099 2100 goto noasync;
2100 2101 }
2101 2102
2102 2103 /*
2103 2104 * Link request structure into the async list and
2104 2105 * wakeup async thread to do the i/o.
2105 2106 */
2106 2107 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2107 2108 mi->mi_async_reqs[NFS4_COMMIT] = args;
2108 2109 mi->mi_async_tail[NFS4_COMMIT] = args;
2109 2110 } else {
2110 2111 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2111 2112 mi->mi_async_tail[NFS4_COMMIT] = args;
2112 2113 }
2113 2114
2114 2115 mutex_enter(&rp->r_statelock);
2115 2116 rp->r_count++;
2116 2117 mutex_exit(&rp->r_statelock);
2117 2118
2118 2119 if (mi->mi_io_kstats) {
2119 2120 mutex_enter(&mi->mi_lock);
2120 2121 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2121 2122 mutex_exit(&mi->mi_lock);
2122 2123 }
2123 2124
2124 2125 mi->mi_async_req_count++;
2125 2126 ASSERT(mi->mi_async_req_count != 0);
2126 2127 cv_signal(&mi->mi_async_reqs_cv);
2127 2128 mutex_exit(&mi->mi_async_lock);
2128 2129 return;
2129 2130
2130 2131 noasync:
2131 2132 if (curproc == proc_pageout || curproc == proc_fsflush ||
2132 2133 nfs_zone() != mi->mi_zone) {
2133 2134 while (plist != NULL) {
2134 2135 pp = plist;
2135 2136 page_sub(&plist, pp);
2136 2137 pp->p_fsdata = C_COMMIT;
2137 2138 page_unlock(pp);
2138 2139 }
2139 2140 return;
2140 2141 }
2141 2142 (*commit)(vp, plist, offset, count, cr);
2142 2143 }
2143 2144
2144 2145 /*
2145 2146 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2146 2147 * reference to the vnode is handed over to the thread; the caller should
2147 2148 * no longer refer to the vnode.
2148 2149 *
2149 2150 * Unlike most of the async routines, this handoff is needed for
2150 2151 * correctness reasons, not just performance. So doing operations in the
2151 2152 * context of the current thread is not an option.
2152 2153 */
2153 2154 void
2154 2155 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2155 2156 {
2156 2157 mntinfo4_t *mi;
2157 2158 struct nfs4_async_reqs *args;
2158 2159 boolean_t signal_inactive_thread = B_FALSE;
2159 2160
2160 2161 mi = VTOMI4(vp);
2161 2162
2162 2163 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2163 2164 args->a_next = NULL;
2164 2165 #ifdef DEBUG
2165 2166 args->a_queuer = curthread;
2166 2167 #endif
2167 2168 args->a_vp = vp;
2168 2169 ASSERT(cr != NULL);
2169 2170 crhold(cr);
2170 2171 args->a_cred = cr;
2171 2172 args->a_io = NFS4_INACTIVE;
2172 2173
2173 2174 /*
2174 2175 * Note that we don't check mi->mi_max_threads here, since we
2175 2176 * *need* to get rid of this vnode regardless of whether someone
2176 2177 * set nfs4_max_threads to zero in /etc/system.
2177 2178 *
2178 2179 * The manager thread knows about this and is willing to create
2179 2180 * at least one thread to accommodate us.
2180 2181 */
2181 2182 mutex_enter(&mi->mi_async_lock);
2182 2183 if (mi->mi_inactive_thread == NULL) {
2183 2184 rnode4_t *rp;
2184 2185 vnode_t *unldvp = NULL;
2185 2186 char *unlname;
2186 2187 cred_t *unlcred;
2187 2188
2188 2189 mutex_exit(&mi->mi_async_lock);
2189 2190 /*
2190 2191 * We just need to free up the memory associated with the
2191 2192 * vnode, which can be safely done from within the current
2192 2193 * context.
2193 2194 */
2194 2195 crfree(cr); /* drop our reference */
2195 2196 kmem_free(args, sizeof (*args));
2196 2197 rp = VTOR4(vp);
2197 2198 mutex_enter(&rp->r_statelock);
2198 2199 if (rp->r_unldvp != NULL) {
2199 2200 unldvp = rp->r_unldvp;
2200 2201 rp->r_unldvp = NULL;
2201 2202 unlname = rp->r_unlname;
2202 2203 rp->r_unlname = NULL;
2203 2204 unlcred = rp->r_unlcred;
2204 2205 rp->r_unlcred = NULL;
2205 2206 }
2206 2207 mutex_exit(&rp->r_statelock);
2207 2208 /*
2208 2209 * No need to explicitly throw away any cached pages. The
2209 2210 * eventual r4inactive() will attempt a synchronous
2210 2211 * VOP_PUTPAGE() which will immediately fail since the request
2211 2212 * is coming from the wrong zone, and then will proceed to call
2212 2213 * nfs4_invalidate_pages() which will clean things up for us.
2213 2214 *
2214 2215 * Throw away the delegation here so rp4_addfree()'s attempt to
2215 2216 * return any existing delegations becomes a no-op.
2216 2217 */
2217 2218 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2218 2219 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2219 2220 FALSE);
2220 2221 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2221 2222 nfs_rw_exit(&mi->mi_recovlock);
2222 2223 }
2223 2224 nfs4_clear_open_streams(rp);
2224 2225
2225 2226 rp4_addfree(rp, cr);
2226 2227 if (unldvp != NULL) {
2227 2228 kmem_free(unlname, MAXNAMELEN);
2228 2229 VN_RELE(unldvp);
2229 2230 crfree(unlcred);
2230 2231 }
2231 2232 return;
2232 2233 }
2233 2234
2234 2235 if (mi->mi_manager_thread == NULL) {
2235 2236 /*
2236 2237 * We want to talk to the inactive thread.
2237 2238 */
2238 2239 signal_inactive_thread = B_TRUE;
2239 2240 }
2240 2241
2241 2242 /*
2242 2243 * Enqueue the vnode and wake up either the special thread (empty
2243 2244 * list) or an async thread.
2244 2245 */
2245 2246 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2246 2247 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2247 2248 mi->mi_async_tail[NFS4_INACTIVE] = args;
2248 2249 signal_inactive_thread = B_TRUE;
2249 2250 } else {
2250 2251 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2251 2252 mi->mi_async_tail[NFS4_INACTIVE] = args;
2252 2253 }
2253 2254 if (signal_inactive_thread) {
2254 2255 cv_signal(&mi->mi_inact_req_cv);
2255 2256 } else {
2256 2257 mi->mi_async_req_count++;
2257 2258 ASSERT(mi->mi_async_req_count != 0);
2258 2259 cv_signal(&mi->mi_async_reqs_cv);
2259 2260 }
2260 2261
2261 2262 mutex_exit(&mi->mi_async_lock);
2262 2263 }
2263 2264
2264 2265 int
2265 2266 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2266 2267 {
2267 2268 int pagecreate;
2268 2269 int n;
2269 2270 int saved_n;
2270 2271 caddr_t saved_base;
2271 2272 u_offset_t offset;
2272 2273 int error;
2273 2274 int sm_error;
2274 2275 vnode_t *vp = RTOV(rp);
2275 2276
2276 2277 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2277 2278 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2278 2279 if (!vpm_enable) {
2279 2280 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2280 2281 }
2281 2282
2282 2283 /*
2283 2284 * Move bytes in at most PAGESIZE chunks. We must avoid
2284 2285 * spanning pages in uiomove() because page faults may cause
2285 2286 * the cache to be invalidated out from under us. The r_size is not
2286 2287 * updated until after the uiomove. If we push the last page of a
2287 2288 * file before r_size is correct, we will lose the data written past
2288 2289 * the current (and invalid) r_size.
2289 2290 */
2290 2291 do {
2291 2292 offset = uio->uio_loffset;
2292 2293 pagecreate = 0;
2293 2294
2294 2295 /*
2295 2296 * n is the number of bytes required to satisfy the request
2296 2297 * or the number of bytes to fill out the page.
2297 2298 */
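		/*
		 * Illustrative example (hypothetical values): with PAGESIZE
		 * 0x1000, offset 0x1234 and tcount 0x2000, n becomes
		 * MIN(0x1000 - 0x234, 0x2000) = 0xdcc, i.e. just enough to
		 * reach the next page boundary.
		 */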
2298 2299 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2299 2300
2300 2301 /*
2301 2302 * Check to see if we can skip reading in the page
2302 2303 * and just allocate the memory. We can do this
2303 2304 * if we are going to rewrite the entire mapping
2304 2305 * or if we are going to write to or beyond the current
2305 2306 * end of file from the beginning of the mapping.
2306 2307 *
2307 2308 * The read of r_size is now protected by r_statelock.
2308 2309 */
2309 2310 mutex_enter(&rp->r_statelock);
2310 2311 /*
2311 2312 * When pgcreated is nonzero the caller has already done
2312 2313 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2313 2314 * segkpm this means we already have at least one page
2314 2315 * created and mapped at base.
2315 2316 */
2316 2317 pagecreate = pgcreated ||
2317 2318 ((offset & PAGEOFFSET) == 0 &&
2318 2319 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2319 2320
2320 2321 mutex_exit(&rp->r_statelock);
2321 2322
2322 2323 if (!vpm_enable && pagecreate) {
2323 2324 /*
2324 2325 * The last argument tells segmap_pagecreate() to
2325 2326 * always lock the page, as opposed to sometimes
2326 2327 * returning with the page locked. This way we avoid a
2327 2328 * fault on the ensuing uiomove(), but also
2328 2329 * more importantly (to fix bug 1094402) we can
2329 2330 * call segmap_fault() to unlock the page in all
2330 2331 * cases. An alternative would be to modify
2331 2332 * segmap_pagecreate() to tell us when it is
2332 2333 * locking a page, but that's a fairly major
2333 2334 * interface change.
2334 2335 */
2335 2336 if (pgcreated == 0)
2336 2337 (void) segmap_pagecreate(segkmap, base,
2337 2338 (uint_t)n, 1);
2338 2339 saved_base = base;
2339 2340 saved_n = n;
2340 2341 }
2341 2342
2342 2343 /*
2343 2344 		 * The number of bytes of data in the last page cannot
2344 2345 		 * be accurately determined while the page is being
2345 2346 		 * uiomove'd to and the size of the file is being updated.
2346 2347 * Thus, inform threads which need to know accurately
2347 2348 * how much data is in the last page of the file. They
2348 2349 * will not do the i/o immediately, but will arrange for
2349 2350 * the i/o to happen later when this modify operation
2350 2351 * will have finished.
2351 2352 */
2352 2353 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2353 2354 mutex_enter(&rp->r_statelock);
2354 2355 rp->r_flags |= R4MODINPROGRESS;
2355 2356 rp->r_modaddr = (offset & MAXBMASK);
2356 2357 mutex_exit(&rp->r_statelock);
2357 2358
2358 2359 if (vpm_enable) {
2359 2360 /*
2360 2361 * Copy data. If new pages are created, part of
2361 2362 			 * the page that is not written will be initialized
2362 2363 * with zeros.
2363 2364 */
2364 2365 error = vpm_data_copy(vp, offset, n, uio,
2365 2366 !pagecreate, NULL, 0, S_WRITE);
2366 2367 } else {
2367 2368 error = uiomove(base, n, UIO_WRITE, uio);
2368 2369 }
2369 2370
2370 2371 /*
2371 2372 * r_size is the maximum number of
2372 2373 * bytes known to be in the file.
2373 2374 * Make sure it is at least as high as the
2374 2375 * first unwritten byte pointed to by uio_loffset.
2375 2376 */
2376 2377 mutex_enter(&rp->r_statelock);
2377 2378 if (rp->r_size < uio->uio_loffset)
2378 2379 rp->r_size = uio->uio_loffset;
2379 2380 rp->r_flags &= ~R4MODINPROGRESS;
2380 2381 rp->r_flags |= R4DIRTY;
2381 2382 mutex_exit(&rp->r_statelock);
2382 2383
2383 2384 /* n = # of bytes written */
2384 2385 n = (int)(uio->uio_loffset - offset);
2385 2386
2386 2387 if (!vpm_enable) {
2387 2388 base += n;
2388 2389 }
2389 2390
2390 2391 tcount -= n;
2391 2392 /*
2392 2393 * If we created pages w/o initializing them completely,
2393 2394 * we need to zero the part that wasn't set up.
2394 2395 		 * This happens in most EOF write cases and if
2395 2396 * we had some sort of error during the uiomove.
2396 2397 */
2397 2398 if (!vpm_enable && pagecreate) {
2398 2399 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2399 2400 (void) kzero(base, PAGESIZE - n);
2400 2401
2401 2402 if (pgcreated) {
2402 2403 /*
2403 2404 * Caller is responsible for this page,
2404 2405 * it was not created in this loop.
2405 2406 */
2406 2407 pgcreated = 0;
2407 2408 } else {
2408 2409 /*
2409 2410 * For bug 1094402: segmap_pagecreate locks
2410 2411 * page. Unlock it. This also unlocks the
2411 2412 * pages allocated by page_create_va() in
2412 2413 * segmap_pagecreate().
2413 2414 */
2414 2415 sm_error = segmap_fault(kas.a_hat, segkmap,
2415 2416 saved_base, saved_n,
2416 2417 F_SOFTUNLOCK, S_WRITE);
2417 2418 if (error == 0)
2418 2419 error = sm_error;
2419 2420 }
2420 2421 }
2421 2422 } while (tcount > 0 && error == 0);
2422 2423
2423 2424 return (error);
2424 2425 }
2425 2426
2426 2427 int
2427 2428 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2428 2429 {
2429 2430 rnode4_t *rp;
2430 2431 page_t *pp;
2431 2432 u_offset_t eoff;
2432 2433 u_offset_t io_off;
2433 2434 size_t io_len;
2434 2435 int error;
2435 2436 int rdirty;
2436 2437 int err;
2437 2438
2438 2439 rp = VTOR4(vp);
2439 2440 ASSERT(rp->r_count > 0);
2440 2441
2441 2442 if (!nfs4_has_pages(vp))
2442 2443 return (0);
2443 2444
2444 2445 ASSERT(vp->v_type != VCHR);
2445 2446
2446 2447 /*
2447 2448 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2448 2449 * writes. B_FORCE is set to force the VM system to actually
2449 2450 * invalidate the pages, even if the i/o failed. The pages
2450 2451 * need to get invalidated because they can't be written out
2451 2452 * because there isn't any space left on either the server's
2452 2453 * file system or in the user's disk quota. The B_FREE bit
2453 2454 * is cleared to avoid confusion as to whether this is a
2454 2455 * request to place the page on the freelist or to destroy
2455 2456 * it.
2456 2457 */
2457 2458 if ((rp->r_flags & R4OUTOFSPACE) ||
2458 2459 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2459 2460 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2460 2461
2461 2462 if (len == 0) {
2462 2463 /*
2463 2464 * If doing a full file synchronous operation, then clear
2464 2465 * the R4DIRTY bit. If a page gets dirtied while the flush
2465 2466 * is happening, then R4DIRTY will get set again. The
2466 2467 * R4DIRTY bit must get cleared before the flush so that
2467 2468 * we don't lose this information.
2468 2469 *
2469 2470 * If there are no full file async write operations
2470 2471 		 * pending and the R4DIRTY bit is set, clear it.
2471 2472 */
2472 2473 if (off == (u_offset_t)0 &&
2473 2474 !(flags & B_ASYNC) &&
2474 2475 (rp->r_flags & R4DIRTY)) {
2475 2476 mutex_enter(&rp->r_statelock);
2476 2477 rdirty = (rp->r_flags & R4DIRTY);
2477 2478 rp->r_flags &= ~R4DIRTY;
2478 2479 mutex_exit(&rp->r_statelock);
2479 2480 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2480 2481 mutex_enter(&rp->r_statelock);
2481 2482 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2482 2483 rdirty = (rp->r_flags & R4DIRTY);
2483 2484 rp->r_flags &= ~R4DIRTY;
2484 2485 }
2485 2486 mutex_exit(&rp->r_statelock);
2486 2487 } else
2487 2488 rdirty = 0;
2488 2489
2489 2490 /*
2490 2491 * Search the entire vp list for pages >= off, and flush
2491 2492 * the dirty pages.
2492 2493 */
2493 2494 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2494 2495 flags, cr);
2495 2496
2496 2497 /*
2497 2498 * If an error occurred and the file was marked as dirty
2498 2499 * before and we aren't forcibly invalidating pages, then
2499 2500 * reset the R4DIRTY flag.
2500 2501 */
2501 2502 if (error && rdirty &&
2502 2503 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2503 2504 mutex_enter(&rp->r_statelock);
2504 2505 rp->r_flags |= R4DIRTY;
2505 2506 mutex_exit(&rp->r_statelock);
2506 2507 }
2507 2508 } else {
2508 2509 /*
2509 2510 * Do a range from [off...off + len) looking for pages
2510 2511 * to deal with.
2511 2512 */
2512 2513 error = 0;
2513 2514 io_len = 0;
2514 2515 eoff = off + len;
2515 2516 mutex_enter(&rp->r_statelock);
2516 2517 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2517 2518 io_off += io_len) {
2518 2519 mutex_exit(&rp->r_statelock);
2519 2520 /*
2520 2521 			 * If we are not invalidating, synchronously
2521 2522 			 * freeing, or writing pages, use the routine
2522 2523 			 * page_lookup_nowait() to prevent reclaiming
2523 2524 			 * them from the free list.
2524 2525 */
2525 2526 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2526 2527 pp = page_lookup(vp, io_off,
2527 2528 (flags & (B_INVAL | B_FREE)) ?
2528 2529 SE_EXCL : SE_SHARED);
2529 2530 } else {
2530 2531 pp = page_lookup_nowait(vp, io_off,
2531 2532 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2532 2533 }
2533 2534
2534 2535 if (pp == NULL || !pvn_getdirty(pp, flags))
2535 2536 io_len = PAGESIZE;
2536 2537 else {
2537 2538 err = (*rp->r_putapage)(vp, pp, &io_off,
2538 2539 &io_len, flags, cr);
2539 2540 if (!error)
2540 2541 error = err;
2541 2542 /*
2542 2543 * "io_off" and "io_len" are returned as
2543 2544 * the range of pages we actually wrote.
2544 2545 * This allows us to skip ahead more quickly
2545 2546 * since several pages may've been dealt
2546 2547 * with by this iteration of the loop.
2547 2548 */
2548 2549 }
2549 2550 mutex_enter(&rp->r_statelock);
2550 2551 }
2551 2552 mutex_exit(&rp->r_statelock);
2552 2553 }
2553 2554
2554 2555 return (error);
2555 2556 }
2556 2557
2557 2558 void
2558 2559 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2559 2560 {
2560 2561 rnode4_t *rp;
2561 2562
2562 2563 rp = VTOR4(vp);
2563 2564 if (IS_SHADOW(vp, rp))
2564 2565 vp = RTOV4(rp);
2565 2566 mutex_enter(&rp->r_statelock);
2566 2567 while (rp->r_flags & R4TRUNCATE)
2567 2568 cv_wait(&rp->r_cv, &rp->r_statelock);
2568 2569 rp->r_flags |= R4TRUNCATE;
2569 2570 if (off == (u_offset_t)0) {
2570 2571 rp->r_flags &= ~R4DIRTY;
2571 2572 if (!(rp->r_flags & R4STALE))
2572 2573 rp->r_error = 0;
2573 2574 }
2574 2575 rp->r_truncaddr = off;
2575 2576 mutex_exit(&rp->r_statelock);
2576 2577 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2577 2578 B_INVAL | B_TRUNC, cr);
2578 2579 mutex_enter(&rp->r_statelock);
2579 2580 rp->r_flags &= ~R4TRUNCATE;
2580 2581 cv_broadcast(&rp->r_cv);
2581 2582 mutex_exit(&rp->r_statelock);
2582 2583 }
2583 2584
2584 2585 static int
2585 2586 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2586 2587 {
2587 2588 mntinfo4_t *mi;
2588 2589 struct mntinfo_kstat *mik;
2589 2590 vfs_t *vfsp;
2590 2591
2591 2592 /* this is a read-only kstat. Bail out on a write */
2592 2593 if (rw == KSTAT_WRITE)
2593 2594 return (EACCES);
2594 2595
2595 2596
2596 2597 /*
2597 2598 * We don't want to wait here as kstat_chain_lock could be held by
2598 2599 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2599 2600 * and thus could lead to a deadlock.
2600 2601 */
2601 2602 vfsp = (struct vfs *)ksp->ks_private;
2602 2603
2603 2604 mi = VFTOMI4(vfsp);
2604 2605 mik = (struct mntinfo_kstat *)ksp->ks_data;
2605 2606
2606 2607 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2607 2608
2608 2609 mik->mik_vers = (uint32_t)mi->mi_vers;
2609 2610 mik->mik_flags = mi->mi_flags;
2610 2611 /*
2611 2612 * The sv_secdata holds the flavor the client specifies.
2612 2613 * If the client uses default and a security negotiation
2613 2614 * occurs, sv_currsec will point to the current flavor
2614 2615 * selected from the server flavor list.
2615 2616 * sv_currsec is NULL if no security negotiation takes place.
2616 2617 */
2617 2618 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2618 2619 mi->mi_curr_serv->sv_currsec->secmod :
2619 2620 mi->mi_curr_serv->sv_secdata->secmod;
2620 2621 mik->mik_curread = (uint32_t)mi->mi_curread;
2621 2622 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2622 2623 mik->mik_retrans = mi->mi_retrans;
2623 2624 mik->mik_timeo = mi->mi_timeo;
2624 2625 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2625 2626 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2626 2627 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2627 2628 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2628 2629 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2629 2630 mik->mik_failover = (uint32_t)mi->mi_failover;
2630 2631 mik->mik_remap = (uint32_t)mi->mi_remap;
2631 2632
2632 2633 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2633 2634
2634 2635 return (0);
2635 2636 }
2636 2637
2637 2638 void
2638 2639 nfs4_mnt_kstat_init(struct vfs *vfsp)
2639 2640 {
2640 2641 mntinfo4_t *mi = VFTOMI4(vfsp);
2641 2642
2642 2643 /*
2643 2644 * PSARC 2001/697 Contract Private Interface
2644 2645 * All nfs kstats are under SunMC contract
2645 2646 * Please refer to the PSARC listed above and contact
2646 2647 * SunMC before making any changes!
2647 2648 *
2648 2649 * Changes must be reviewed by Solaris File Sharing
2649 2650 * Changes must be communicated to contract-2001-697@sun.com
2650 2651 *
2651 2652 */
2652 2653
2653 2654 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2654 2655 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2655 2656 if (mi->mi_io_kstats) {
2656 2657 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2657 2658 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2658 2659 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2659 2660 kstat_install(mi->mi_io_kstats);
2660 2661 }
2661 2662
2662 2663 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2663 2664 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2664 2665 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2665 2666 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2666 2667 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2667 2668 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2668 2669 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2669 2670 kstat_install(mi->mi_ro_kstats);
2670 2671 }
2671 2672
2672 2673 nfs4_mnt_recov_kstat_init(vfsp);
2673 2674 }
2674 2675
2675 2676 void
2676 2677 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2677 2678 {
2678 2679 mntinfo4_t *mi;
2679 2680 clock_t now = ddi_get_lbolt();
2680 2681
2681 2682 mi = VTOMI4(vp);
2682 2683 /*
2683 2684 * In case of forced unmount, do not print any messages
2684 2685 * since it can flood the console with error messages.
2685 2686 */
2686 2687 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2687 2688 return;
2688 2689
2689 2690 /*
2690 2691 	 * If the mount point is dead and not recoverable, do not
2691 2692 * print error messages that can flood the console.
2692 2693 */
2693 2694 if (mi->mi_flags & MI4_RECOV_FAIL)
2694 2695 return;
2695 2696
2696 2697 /*
2697 2698 * No use in flooding the console with ENOSPC
2698 2699 * messages from the same file system.
2699 2700 */
2700 2701 if ((error != ENOSPC && error != EDQUOT) ||
2701 2702 now - mi->mi_printftime > 0) {
2702 2703 zoneid_t zoneid = mi->mi_zone->zone_id;
2703 2704
2704 2705 #ifdef DEBUG
2705 2706 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2706 2707 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2707 2708 #else
2708 2709 nfs_perror(error, "NFS write error on host %s: %m.\n",
2709 2710 VTOR4(vp)->r_server->sv_hostname, NULL);
2710 2711 #endif
2711 2712 if (error == ENOSPC || error == EDQUOT) {
2712 2713 zcmn_err(zoneid, CE_CONT,
2713 2714 "^File: userid=%d, groupid=%d\n",
2714 2715 crgetuid(cr), crgetgid(cr));
2715 2716 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2716 2717 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2717 2718 zcmn_err(zoneid, CE_CONT,
2718 2719 "^User: userid=%d, groupid=%d\n",
2719 2720 crgetuid(curthread->t_cred),
2720 2721 crgetgid(curthread->t_cred));
2721 2722 }
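			/*
			 * Illustrative example (hypothetical values): with a
			 * write error interval of 5 seconds and hz of 100,
			 * further ENOSPC/EDQUOT messages from this filesystem
			 * are suppressed for the next 500 ticks.
			 */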
2722 2723 mi->mi_printftime = now +
2723 2724 nfs_write_error_interval * hz;
2724 2725 }
2725 2726 sfh4_printfhandle(VTOR4(vp)->r_fh);
2726 2727 #ifdef DEBUG
2727 2728 if (error == EACCES) {
2728 2729 zcmn_err(zoneid, CE_CONT,
2729 2730 "nfs_bio: cred is%s kcred\n",
2730 2731 cr == kcred ? "" : " not");
2731 2732 }
2732 2733 #endif
2733 2734 }
2734 2735 }
2735 2736
2736 2737 /*
2737 2738 * Return non-zero if the given file can be safely memory mapped. Locks
2738 2739 * are safe if whole-file (length and offset are both zero).
2739 2740 */
2740 2741
2741 2742 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
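/*
 * For example (illustrative only): a whole-file lock (l_start == 0,
 * l_len == 0) is considered safe, whereas a byte-range lock such as
 * (l_start == 100, l_len == 10) is not.
 */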
2742 2743
2743 2744 static int
2744 2745 nfs4_safemap(const vnode_t *vp)
2745 2746 {
2746 2747 locklist_t *llp, *next_llp;
2747 2748 int safe = 1;
2748 2749 rnode4_t *rp = VTOR4(vp);
2749 2750
2750 2751 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2751 2752
2752 2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2753 2754 "vp = %p", (void *)vp));
2754 2755
2755 2756 /*
2756 2757 * Review all the locks for the vnode, both ones that have been
2757 2758 * acquired and ones that are pending. We assume that
2758 2759 * flk_active_locks_for_vp() has merged any locks that can be
2759 2760 * merged (so that if a process has the entire file locked, it is
2760 2761 * represented as a single lock).
2761 2762 *
2762 2763 * Note that we can't bail out of the loop if we find a non-safe
2763 2764 * lock, because we have to free all the elements in the llp list.
2764 2765 * We might be able to speed up this code slightly by not looking
2765 2766 * at each lock's l_start and l_len fields once we've found a
2766 2767 * non-safe lock.
2767 2768 */
2768 2769
2769 2770 llp = flk_active_locks_for_vp(vp);
2770 2771 while (llp) {
2771 2772 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2772 2773 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2773 2774 llp->ll_flock.l_start, llp->ll_flock.l_len));
2774 2775 if (!SAFE_LOCK(llp->ll_flock)) {
2775 2776 safe = 0;
2776 2777 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2777 2778 "nfs4_safemap: unsafe active lock (%" PRId64
2778 2779 ", %" PRId64 ")", llp->ll_flock.l_start,
2779 2780 llp->ll_flock.l_len));
2780 2781 }
2781 2782 next_llp = llp->ll_next;
2782 2783 VN_RELE(llp->ll_vp);
2783 2784 kmem_free(llp, sizeof (*llp));
2784 2785 llp = next_llp;
2785 2786 }
2786 2787
2787 2788 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2788 2789 safe ? "safe" : "unsafe"));
2789 2790 return (safe);
2790 2791 }
2791 2792
2792 2793 /*
2793 2794 * Return whether there is a lost LOCK or LOCKU queued up for the given
2794 2795 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2795 2796 */
2796 2797
2797 2798 bool_t
2798 2799 nfs4_map_lost_lock_conflict(vnode_t *vp)
2799 2800 {
2800 2801 bool_t conflict = FALSE;
2801 2802 nfs4_lost_rqst_t *lrp;
2802 2803 mntinfo4_t *mi = VTOMI4(vp);
2803 2804
2804 2805 mutex_enter(&mi->mi_lock);
2805 2806 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2806 2807 lrp = list_next(&mi->mi_lost_state, lrp)) {
2807 2808 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2808 2809 continue;
2809 2810 ASSERT(lrp->lr_vp != NULL);
2810 2811 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2811 2812 continue; /* different file */
2812 2813 if (!SAFE_LOCK(*lrp->lr_flk)) {
2813 2814 conflict = TRUE;
2814 2815 break;
2815 2816 }
2816 2817 }
2817 2818
2818 2819 mutex_exit(&mi->mi_lock);
2819 2820 return (conflict);
2820 2821 }
2821 2822
2822 2823 /*
2823 2824 * nfs_lockcompletion:
2824 2825 *
2825 2826 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2826 2827  * as non-cacheable (set the VNOCACHE bit).
2827 2828 */
2828 2829
2829 2830 void
2830 2831 nfs4_lockcompletion(vnode_t *vp, int cmd)
2831 2832 {
2832 2833 rnode4_t *rp = VTOR4(vp);
2833 2834
2834 2835 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2835 2836 ASSERT(!IS_SHADOW(vp, rp));
2836 2837
2837 2838 if (cmd == F_SETLK || cmd == F_SETLKW) {
2838 2839
2839 2840 if (!nfs4_safemap(vp)) {
2840 2841 mutex_enter(&vp->v_lock);
2841 2842 vp->v_flag |= VNOCACHE;
2842 2843 mutex_exit(&vp->v_lock);
2843 2844 } else {
2844 2845 mutex_enter(&vp->v_lock);
2845 2846 vp->v_flag &= ~VNOCACHE;
2846 2847 mutex_exit(&vp->v_lock);
2847 2848 }
2848 2849 }
2849 2850 /*
2850 2851 * The cached attributes of the file are stale after acquiring
2851 2852 * the lock on the file. They were updated when the file was
2852 2853 * opened, but not updated when the lock was acquired. Therefore the
2853 2854 * cached attributes are invalidated after the lock is obtained.
2854 2855 */
2855 2856 PURGE_ATTRCACHE4(vp);
2856 2857 }
2857 2858
2858 2859 /* ARGSUSED */
2859 2860 static void *
2860 2861 nfs4_mi_init(zoneid_t zoneid)
2861 2862 {
2862 2863 struct mi4_globals *mig;
2863 2864
2864 2865 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2865 2866 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2866 2867 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2867 2868 offsetof(mntinfo4_t, mi_zone_node));
2868 2869 mig->mig_destructor_called = B_FALSE;
2869 2870 return (mig);
2870 2871 }
2871 2872
2872 2873 /*
2873 2874 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2874 2875 * state and killing off threads.
2875 2876 */
2876 2877 /* ARGSUSED */
2877 2878 static void
2878 2879 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2879 2880 {
2880 2881 struct mi4_globals *mig = data;
2881 2882 mntinfo4_t *mi;
2882 2883 nfs4_server_t *np;
2883 2884
2884 2885 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885 2886 "nfs4_mi_shutdown zone %d\n", zoneid));
2886 2887 ASSERT(mig != NULL);
2887 2888 for (;;) {
2888 2889 mutex_enter(&mig->mig_lock);
2889 2890 mi = list_head(&mig->mig_list);
2890 2891 if (mi == NULL) {
2891 2892 mutex_exit(&mig->mig_lock);
2892 2893 break;
2893 2894 }
2894 2895
2895 2896 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2896 2897 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2897 2898 /*
2898 2899 * purge the DNLC for this filesystem
2899 2900 */
2900 2901 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2901 2902 /*
2902 2903 * Tell existing async worker threads to exit.
2903 2904 */
2904 2905 mutex_enter(&mi->mi_async_lock);
2905 2906 mi->mi_max_threads = 0;
2906 2907 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2907 2908 /*
2908 2909 * Set the appropriate flags, signal and wait for both the
2909 2910 * async manager and the inactive thread to exit when they're
2910 2911 * done with their current work.
2911 2912 */
2912 2913 mutex_enter(&mi->mi_lock);
2913 2914 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2914 2915 mutex_exit(&mi->mi_lock);
2915 2916 mutex_exit(&mi->mi_async_lock);
2916 2917 if (mi->mi_manager_thread) {
2917 2918 nfs4_async_manager_stop(mi->mi_vfsp);
2918 2919 }
2919 2920 if (mi->mi_inactive_thread) {
2920 2921 mutex_enter(&mi->mi_async_lock);
2921 2922 cv_signal(&mi->mi_inact_req_cv);
2922 2923 /*
2923 2924 * Wait for the inactive thread to exit.
2924 2925 */
2925 2926 while (mi->mi_inactive_thread != NULL) {
2926 2927 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2927 2928 }
2928 2929 mutex_exit(&mi->mi_async_lock);
2929 2930 }
2930 2931 /*
2931 2932 * Wait for the recovery thread to complete, that is, it will
2932 2933 * signal when it is done using the "mi" structure and about
2933 2934 * to exit
2934 2935 */
2935 2936 mutex_enter(&mi->mi_lock);
2936 2937 while (mi->mi_in_recovery > 0)
2937 2938 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2938 2939 mutex_exit(&mi->mi_lock);
2939 2940 /*
2940 2941 * We're done when every mi has been done or the list is empty.
2941 2942 * This one is done, remove it from the list.
2942 2943 */
2943 2944 list_remove(&mig->mig_list, mi);
2944 2945 mutex_exit(&mig->mig_lock);
2945 2946 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2946 2947
2947 2948 		 * Release the holds on the vfs and mi that were taken in
2948 2949 		 * nfs4_mi_zonelist_add() to prevent a race with zone shutdown.
2949 2950 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2950 2951 */
2951 2952 VFS_RELE(mi->mi_vfsp);
2952 2953 MI4_RELE(mi);
2953 2954 }
2954 2955 /*
2955 2956 * Tell each renew thread in the zone to exit
2956 2957 */
2957 2958 mutex_enter(&nfs4_server_lst_lock);
2958 2959 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2959 2960 mutex_enter(&np->s_lock);
2960 2961 if (np->zoneid == zoneid) {
2961 2962 /*
2962 2963 * We add another hold onto the nfs4_server_t
2963 2964 			 * because this will make sure that the nfs4_server_t
2964 2965 * stays around until nfs4_callback_fini_zone destroys
2965 2966 * the zone. This way, the renew thread can
2966 2967 * unconditionally release its holds on the
2967 2968 * nfs4_server_t.
2968 2969 */
2969 2970 np->s_refcnt++;
2970 2971 nfs4_mark_srv_dead(np);
2971 2972 }
2972 2973 mutex_exit(&np->s_lock);
2973 2974 }
2974 2975 mutex_exit(&nfs4_server_lst_lock);
2975 2976 }
2976 2977
2977 2978 static void
2978 2979 nfs4_mi_free_globals(struct mi4_globals *mig)
2979 2980 {
2980 2981 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2981 2982 mutex_destroy(&mig->mig_lock);
2982 2983 kmem_free(mig, sizeof (*mig));
2983 2984 }
2984 2985
2985 2986 /* ARGSUSED */
2986 2987 static void
2987 2988 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2988 2989 {
2989 2990 struct mi4_globals *mig = data;
2990 2991
2991 2992 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2992 2993 "nfs4_mi_destroy zone %d\n", zoneid));
2993 2994 ASSERT(mig != NULL);
2994 2995 mutex_enter(&mig->mig_lock);
2995 2996 if (list_head(&mig->mig_list) != NULL) {
2996 2997 /* Still waiting for VFS_FREEVFS() */
2997 2998 mig->mig_destructor_called = B_TRUE;
2998 2999 mutex_exit(&mig->mig_lock);
2999 3000 return;
3000 3001 }
3001 3002 nfs4_mi_free_globals(mig);
3002 3003 }
3003 3004
3004 3005 /*
3005 3006 * Add an NFS mount to the per-zone list of NFS mounts.
3006 3007 */
3007 3008 void
3008 3009 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3009 3010 {
3010 3011 struct mi4_globals *mig;
3011 3012
3012 3013 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3013 3014 mutex_enter(&mig->mig_lock);
3014 3015 list_insert_head(&mig->mig_list, mi);
3015 3016 /*
3016 3017 	 * Holds are added to eliminate a race with zone shutdown; they
3017 3018 	 * will be released in nfs4_mi_shutdown().
3018 3019 */
3019 3020 MI4_HOLD(mi);
3020 3021 VFS_HOLD(mi->mi_vfsp);
3021 3022 mutex_exit(&mig->mig_lock);
3022 3023 }
3023 3024
3024 3025 /*
3025 3026 * Remove an NFS mount from the per-zone list of NFS mounts.
3026 3027 */
3027 3028 int
3028 3029 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3029 3030 {
3030 3031 struct mi4_globals *mig;
3031 3032 int ret = 0;
3032 3033
3033 3034 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3034 3035 mutex_enter(&mig->mig_lock);
3035 3036 mutex_enter(&mi->mi_lock);
3036 3037 /* if this mi is marked dead, then the zone already released it */
3037 3038 if (!(mi->mi_flags & MI4_DEAD)) {
3038 3039 list_remove(&mig->mig_list, mi);
3039 3040 mutex_exit(&mi->mi_lock);
3040 3041
3041 3042 /* release the holds put on in zonelist_add(). */
3042 3043 VFS_RELE(mi->mi_vfsp);
3043 3044 MI4_RELE(mi);
3044 3045 ret = 1;
3045 3046 } else {
3046 3047 mutex_exit(&mi->mi_lock);
3047 3048 }
3048 3049
3049 3050 /*
3050 3051 * We can be called asynchronously by VFS_FREEVFS() after the zone
3051 3052 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3052 3053 * mi globals.
3053 3054 */
3054 3055 if (list_head(&mig->mig_list) == NULL &&
3055 3056 mig->mig_destructor_called == B_TRUE) {
3056 3057 nfs4_mi_free_globals(mig);
3057 3058 return (ret);
3058 3059 }
3059 3060 mutex_exit(&mig->mig_lock);
3060 3061 return (ret);
3061 3062 }
3062 3063
3063 3064 void
3064 3065 nfs_free_mi4(mntinfo4_t *mi)
3065 3066 {
3066 3067 nfs4_open_owner_t *foop;
3067 3068 nfs4_oo_hash_bucket_t *bucketp;
3068 3069 nfs4_debug_msg_t *msgp;
3069 3070 int i;
3070 3071 servinfo4_t *svp;
3071 3072
3072 3073 /*
3073 3074 * Code introduced here should be carefully evaluated to make
3074 3075 * sure none of the freed resources are accessed either directly
3075 3076 	 * or indirectly after freeing them. For example: introducing
3076 3077 	 * calls to NFS4_DEBUG that use a mntinfo4_t structure member
3077 3078 	 * after the members have been freed, or other routines calling
3078 3079 	 * back into NFS that access a freed mntinfo4_t structure member.
3079 3080 */
3080 3081 mutex_enter(&mi->mi_lock);
3081 3082 ASSERT(mi->mi_recovthread == NULL);
3082 3083 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3083 3084 mutex_exit(&mi->mi_lock);
3084 3085 mutex_enter(&mi->mi_async_lock);
3085 3086 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3086 3087 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3087 3088 ASSERT(mi->mi_manager_thread == NULL);
3088 3089 mutex_exit(&mi->mi_async_lock);
3089 3090 if (mi->mi_io_kstats) {
3090 3091 kstat_delete(mi->mi_io_kstats);
3091 3092 mi->mi_io_kstats = NULL;
3092 3093 }
3093 3094 if (mi->mi_ro_kstats) {
3094 3095 kstat_delete(mi->mi_ro_kstats);
3095 3096 mi->mi_ro_kstats = NULL;
3096 3097 }
3097 3098 if (mi->mi_recov_ksp) {
3098 3099 kstat_delete(mi->mi_recov_ksp);
3099 3100 mi->mi_recov_ksp = NULL;
3100 3101 }
3101 3102 mutex_enter(&mi->mi_msg_list_lock);
3102 3103 while (msgp = list_head(&mi->mi_msg_list)) {
3103 3104 list_remove(&mi->mi_msg_list, msgp);
3104 3105 nfs4_free_msg(msgp);
3105 3106 }
3106 3107 mutex_exit(&mi->mi_msg_list_lock);
3107 3108 list_destroy(&mi->mi_msg_list);
3108 3109 if (mi->mi_fname != NULL)
3109 3110 fn_rele(&mi->mi_fname);
3110 3111 if (mi->mi_rootfh != NULL)
3111 3112 sfh4_rele(&mi->mi_rootfh);
3112 3113 if (mi->mi_srvparentfh != NULL)
3113 3114 sfh4_rele(&mi->mi_srvparentfh);
3114 3115 svp = mi->mi_servers;
3115 3116 sv4_free(svp);
3116 3117 mutex_destroy(&mi->mi_lock);
3117 3118 mutex_destroy(&mi->mi_async_lock);
3118 3119 mutex_destroy(&mi->mi_msg_list_lock);
3119 3120 nfs_rw_destroy(&mi->mi_recovlock);
3120 3121 nfs_rw_destroy(&mi->mi_rename_lock);
3121 3122 nfs_rw_destroy(&mi->mi_fh_lock);
3122 3123 cv_destroy(&mi->mi_failover_cv);
3123 3124 cv_destroy(&mi->mi_async_reqs_cv);
3124 3125 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3125 3126 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3126 3127 cv_destroy(&mi->mi_async_cv);
3127 3128 cv_destroy(&mi->mi_inact_req_cv);
3128 3129 /*
3129 3130 * Destroy the oo hash lists and mutexes for the cred hash table.
3130 3131 */
3131 3132 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3132 3133 bucketp = &(mi->mi_oo_list[i]);
3133 3134 /* Destroy any remaining open owners on the list */
3134 3135 foop = list_head(&bucketp->b_oo_hash_list);
3135 3136 while (foop != NULL) {
3136 3137 list_remove(&bucketp->b_oo_hash_list, foop);
3137 3138 nfs4_destroy_open_owner(foop);
3138 3139 foop = list_head(&bucketp->b_oo_hash_list);
3139 3140 }
3140 3141 list_destroy(&bucketp->b_oo_hash_list);
3141 3142 mutex_destroy(&bucketp->b_lock);
3142 3143 }
3143 3144 /*
3144 3145 * Empty and destroy the freed open owner list.
3145 3146 */
3146 3147 foop = list_head(&mi->mi_foo_list);
3147 3148 while (foop != NULL) {
3148 3149 list_remove(&mi->mi_foo_list, foop);
3149 3150 nfs4_destroy_open_owner(foop);
3150 3151 foop = list_head(&mi->mi_foo_list);
3151 3152 }
3152 3153 list_destroy(&mi->mi_foo_list);
3153 3154 list_destroy(&mi->mi_bseqid_list);
3154 3155 list_destroy(&mi->mi_lost_state);
3155 3156 avl_destroy(&mi->mi_filehandles);
3156 3157 kmem_free(mi, sizeof (*mi));
3157 3158 }
3158 3159 void
3159 3160 mi_hold(mntinfo4_t *mi)
3160 3161 {
3161 3162 atomic_inc_32(&mi->mi_count);
3162 3163 ASSERT(mi->mi_count != 0);
3163 3164 }
3164 3165
3165 3166 void
3166 3167 mi_rele(mntinfo4_t *mi)
3167 3168 {
3168 3169 ASSERT(mi->mi_count != 0);
3169 3170 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3170 3171 nfs_free_mi4(mi);
3171 3172 }
3172 3173 }
3173 3174
3174 3175 vnode_t nfs4_xattr_notsupp_vnode;
3175 3176
3176 3177 void
3177 3178 nfs4_clnt_init(void)
3178 3179 {
3179 3180 nfs4_vnops_init();
3180 3181 (void) nfs4_rnode_init();
3181 3182 (void) nfs4_shadow_init();
3182 3183 (void) nfs4_acache_init();
3183 3184 (void) nfs4_subr_init();
3184 3185 nfs4_acl_init();
3185 3186 nfs_idmap_init();
3186 3187 nfs4_callback_init();
3187 3188 nfs4_secinfo_init();
3188 3189 #ifdef DEBUG
3189 3190 tsd_create(&nfs4_tsd_key, NULL);
3190 3191 #endif
3191 3192
3192 3193 /*
3193 3194 * Add a CPR callback so that we can update client
3194 3195 * lease after a suspend and resume.
3195 3196 */
3196 3197 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3197 3198
3198 3199 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3199 3200 nfs4_mi_destroy);
3200 3201
3201 3202 /*
3202 3203 * Initialize the reference count of the notsupp xattr cache vnode to 1
3203 3204 * so that it never goes away (VOP_INACTIVE isn't called on it).
3204 3205 */
3205 3206 vn_reinit(&nfs4_xattr_notsupp_vnode);
3206 3207 }
3207 3208
3208 3209 void
3209 3210 nfs4_clnt_fini(void)
3210 3211 {
3211 3212 (void) zone_key_delete(mi4_list_key);
3212 3213 nfs4_vnops_fini();
3213 3214 (void) nfs4_rnode_fini();
3214 3215 (void) nfs4_shadow_fini();
3215 3216 (void) nfs4_acache_fini();
3216 3217 (void) nfs4_subr_fini();
3217 3218 nfs_idmap_fini();
3218 3219 nfs4_callback_fini();
3219 3220 nfs4_secinfo_fini();
3220 3221 #ifdef DEBUG
3221 3222 tsd_destroy(&nfs4_tsd_key);
3222 3223 #endif
3223 3224 if (cid)
3224 3225 (void) callb_delete(cid);
3225 3226 }
3226 3227
3227 3228 /*ARGSUSED*/
3228 3229 static boolean_t
3229 3230 nfs4_client_cpr_callb(void *arg, int code)
3230 3231 {
3231 3232 /*
3232 3233 * We get called for Suspend and Resume events.
3233 3234 * For the suspend case we simply don't care!
3234 3235 */
3235 3236 if (code == CB_CODE_CPR_CHKPT) {
3236 3237 return (B_TRUE);
3237 3238 }
3238 3239
3239 3240 /*
3240 3241 * When we get to here we are in the process of
3241 3242 * resuming the system from a previous suspend.
3242 3243 */
3243 3244 nfs4_client_resumed = gethrestime_sec();
3244 3245 return (B_TRUE);
3245 3246 }
3246 3247
3247 3248 void
3248 3249 nfs4_renew_lease_thread(nfs4_server_t *sp)
3249 3250 {
3250 3251 int error = 0;
3251 3252 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3252 3253 clock_t tick_delay = 0;
3253 3254 clock_t time_left = 0;
3254 3255 callb_cpr_t cpr_info;
3255 3256 kmutex_t cpr_lock;
3256 3257
3257 3258 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3258 3259 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3259 3260 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3260 3261 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3261 3262
3262 3263 mutex_enter(&sp->s_lock);
3263 3264 /* sp->s_lease_time is set via a GETATTR */
3264 3265 sp->last_renewal_time = gethrestime_sec();
3265 3266 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3266 3267 ASSERT(sp->s_refcnt >= 1);
3267 3268
3268 3269 for (;;) {
3269 3270 if (!sp->state_ref_count ||
3270 3271 sp->lease_valid != NFS4_LEASE_VALID) {
3271 3272
3272 3273 kip_secs = MAX((sp->s_lease_time >> 1) -
3273 3274 (3 * sp->propagation_delay.tv_sec), 1);
3274 3275
3275 3276 tick_delay = SEC_TO_TICK(kip_secs);
3276 3277
3277 3278 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3278 3279 "nfs4_renew_lease_thread: no renew : thread "
3279 3280 "wait %ld secs", kip_secs));
3280 3281
3281 3282 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3282 3283 "nfs4_renew_lease_thread: no renew : "
3283 3284 "state_ref_count %d, lease_valid %d",
3284 3285 sp->state_ref_count, sp->lease_valid));
3285 3286
3286 3287 mutex_enter(&cpr_lock);
3287 3288 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3288 3289 mutex_exit(&cpr_lock);
3289 3290 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3290 3291 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3291 3292 mutex_enter(&cpr_lock);
3292 3293 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3293 3294 mutex_exit(&cpr_lock);
3294 3295
3295 3296 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3296 3297 "nfs4_renew_lease_thread: no renew: "
3297 3298 "time left %ld", time_left));
3298 3299
3299 3300 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3300 3301 goto die;
3301 3302 continue;
3302 3303 }
3303 3304
3304 3305 tmp_last_renewal_time = sp->last_renewal_time;
3305 3306
3306 3307 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3307 3308 (3 * sp->propagation_delay.tv_sec);
3308 3309
3309 3310 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3310 3311 "nfs4_renew_lease_thread: tmp_time %ld, "
3311 3312 "sp->last_renewal_time %ld", tmp_time,
3312 3313 sp->last_renewal_time));
3313 3314
3314 3315 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3315 3316
3316 3317 tick_delay = SEC_TO_TICK(kip_secs);
3317 3318
3318 3319 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3319 3320 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3320 3321 "secs", kip_secs));
3321 3322
3322 3323 mutex_enter(&cpr_lock);
3323 3324 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3324 3325 mutex_exit(&cpr_lock);
3325 3326 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3326 3327 tick_delay, TR_CLOCK_TICK);
3327 3328 mutex_enter(&cpr_lock);
3328 3329 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3329 3330 mutex_exit(&cpr_lock);
3330 3331
3331 3332 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3332 3333 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3333 3334 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3334 3335 "tmp_last_renewal_time %ld", time_left,
3335 3336 sp->last_renewal_time, nfs4_client_resumed,
3336 3337 tmp_last_renewal_time));
3337 3338
3338 3339 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3339 3340 goto die;
3340 3341
3341 3342 if (tmp_last_renewal_time == sp->last_renewal_time ||
3342 3343 (nfs4_client_resumed != 0 &&
3343 3344 nfs4_client_resumed > sp->last_renewal_time)) {
3344 3345 /*
3345 3346 			 * Issue a RENEW op, since the lease hasn't been
3346 3347 			 * renewed while we slept.
3347 3348 */
3348 3349 tmp_now_time = gethrestime_sec();
3349 3350 error = nfs4renew(sp);
3350 3351 /*
3351 3352 * Need to re-acquire sp's lock, nfs4renew()
3352 3353 		 * relinquishes it.
3353 3354 */
3354 3355 mutex_enter(&sp->s_lock);
3355 3356
3356 3357 /*
3357 3358 * See if someone changed s_thread_exit while we gave
3358 3359 * up s_lock.
3359 3360 */
3360 3361 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3361 3362 goto die;
3362 3363
3363 3364 if (!error) {
3364 3365 /*
3365 3366 * check to see if we implicitly renewed while
3366 3367 				 * we waited for a reply to our RENEW call.
3367 3368 */
3368 3369 if (tmp_last_renewal_time ==
3369 3370 sp->last_renewal_time) {
3370 3371 /* no implicit renew came */
3371 3372 sp->last_renewal_time = tmp_now_time;
3372 3373 } else {
3373 3374 NFS4_DEBUG(nfs4_client_lease_debug,
3374 3375 (CE_NOTE, "renew_thread: did "
3375 3376 "implicit renewal before reply "
3376 3377 "from server for RENEW"));
3377 3378 }
3378 3379 } else {
3379 3380 /* figure out error */
3380 3381 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3381 3382 "renew_thread: nfs4renew returned error"
3382 3383 " %d", error));
3383 3384 }
3384 3385
3385 3386 }
3386 3387 }
3387 3388
3388 3389 die:
3389 3390 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3390 3391 "nfs4_renew_lease_thread: thread exiting"));
3391 3392
3392 3393 while (sp->s_otw_call_count != 0) {
3393 3394 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3394 3395 "nfs4_renew_lease_thread: waiting for outstanding "
3395 3396 "otw calls to finish for sp 0x%p, current "
3396 3397 "s_otw_call_count %d", (void *)sp,
3397 3398 sp->s_otw_call_count));
3398 3399 mutex_enter(&cpr_lock);
3399 3400 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3400 3401 mutex_exit(&cpr_lock);
3401 3402 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3402 3403 mutex_enter(&cpr_lock);
3403 3404 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3404 3405 mutex_exit(&cpr_lock);
3405 3406 }
3406 3407 mutex_exit(&sp->s_lock);
3407 3408
3408 3409 nfs4_server_rele(sp); /* free the thread's reference */
3409 3410 nfs4_server_rele(sp); /* free the list's reference */
3410 3411 sp = NULL;
3411 3412
3412 3413 done:
3413 3414 mutex_enter(&cpr_lock);
3414 3415 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3415 3416 mutex_destroy(&cpr_lock);
3416 3417
3417 3418 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3418 3419 "nfs4_renew_lease_thread: renew thread exit officially"));
3419 3420
3420 3421 zthread_exit();
3421 3422 /* NOT REACHED */
3422 3423 }
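/*
 * Worked example of the sleep interval computed above (the 60 second
 * lease and 1 second propagation delay are assumed values, chosen only
 * to illustrate the arithmetic):
 *
 *	s_lease_time = 60, propagation_delay.tv_sec = 1
 *	tmp_time = now - last_renewal_time + 3 * 1 = 3	(just renewed)
 *	kip_secs = MAX((60 >> 1) - 3, 1) = 27
 *
 * so the thread wakes up roughly half a lease period after the last
 * renewal, minus three propagation-delay allowances, and never sleeps
 * for less than one second.
 */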
3423 3424
3424 3425 /*
3425 3426 * Send out a RENEW op to the server.
3426 3427 * Assumes sp is locked down.
3427 3428 */
3428 3429 static int
3429 3430 nfs4renew(nfs4_server_t *sp)
3430 3431 {
3431 3432 COMPOUND4args_clnt args;
3432 3433 COMPOUND4res_clnt res;
3433 3434 nfs_argop4 argop[1];
3434 3435 int doqueue = 1;
3435 3436 int rpc_error;
3436 3437 cred_t *cr;
3437 3438 mntinfo4_t *mi;
3438 3439 timespec_t prop_time, after_time;
3439 3440 int needrecov = FALSE;
3440 3441 nfs4_recov_state_t recov_state;
3441 3442 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3442 3443
3443 3444 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3444 3445
3445 3446 recov_state.rs_flags = 0;
3446 3447 recov_state.rs_num_retry_despite_err = 0;
3447 3448
3448 3449 recov_retry:
3449 3450 	mi = sp->mntinfo4_list;
3450 3451 	ASSERT(mi != NULL);
3451 3452 	VFS_HOLD(mi->mi_vfsp);
3452 3453 	mutex_exit(&sp->s_lock);
3453 3454
3454 3455 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3455 3456 if (e.error) {
3456 3457 VFS_RELE(mi->mi_vfsp);
3457 3458 return (e.error);
3458 3459 }
3459 3460
3460 3461 /* Check to see if we're dealing with a marked-dead sp */
3461 3462 mutex_enter(&sp->s_lock);
3462 3463 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3463 3464 mutex_exit(&sp->s_lock);
3464 3465 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3465 3466 VFS_RELE(mi->mi_vfsp);
3466 3467 return (0);
3467 3468 }
3468 3469
3469 3470 /* Make sure mi hasn't changed on us */
3470 3471 if (mi != sp->mntinfo4_list) {
3471 3472 /* Must drop sp's lock to avoid a recursive mutex enter */
3472 3473 mutex_exit(&sp->s_lock);
3473 3474 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3474 3475 VFS_RELE(mi->mi_vfsp);
3475 3476 mutex_enter(&sp->s_lock);
3476 3477 goto recov_retry;
3477 3478 }
3478 3479 mutex_exit(&sp->s_lock);
3479 3480
3480 3481 args.ctag = TAG_RENEW;
3481 3482
3482 3483 args.array_len = 1;
3483 3484 args.array = argop;
3484 3485
3485 3486 argop[0].argop = OP_RENEW;
3486 3487
3487 3488 mutex_enter(&sp->s_lock);
3488 3489 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3489 3490 cr = sp->s_cred;
3490 3491 crhold(cr);
3491 3492 mutex_exit(&sp->s_lock);
3492 3493
3493 3494 ASSERT(cr != NULL);
3494 3495
3495 3496 /* used to figure out RTT for sp */
3496 3497 gethrestime(&prop_time);
3497 3498
3498 3499 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3499 3500 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3500 3501 (void*)sp));
3501 3502 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3502 3503 prop_time.tv_sec, prop_time.tv_nsec));
3503 3504
3504 3505 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3505 3506 mntinfo4_t *, mi);
3506 3507
3507 3508 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3508 3509 crfree(cr);
3509 3510
3510 3511 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3511 3512 mntinfo4_t *, mi);
3512 3513
3513 3514 gethrestime(&after_time);
3514 3515
3515 3516 mutex_enter(&sp->s_lock);
3516 3517 sp->propagation_delay.tv_sec =
3517 3518 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3518 3519 mutex_exit(&sp->s_lock);
3519 3520
3520 3521 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3521 3522 after_time.tv_sec, after_time.tv_nsec));
3522 3523
3523 3524 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3524 3525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3525 3526 nfs4_delegreturn_all(sp);
3526 3527 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3527 3528 VFS_RELE(mi->mi_vfsp);
3528 3529 /*
3529 3530 * If the server returns CB_PATH_DOWN, it has renewed
3530 3531 * the lease and informed us that the callback path is
3531 3532 * down. Since the lease is renewed, just return 0 and
3532 3533 * let the renew thread proceed as normal.
3533 3534 */
3534 3535 return (0);
3535 3536 }
3536 3537
3537 3538 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3538 3539 if (!needrecov && e.error) {
3539 3540 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3540 3541 VFS_RELE(mi->mi_vfsp);
3541 3542 return (e.error);
3542 3543 }
3543 3544
3544 3545 rpc_error = e.error;
3545 3546
3546 3547 if (needrecov) {
3547 3548 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3548 3549 "nfs4renew: initiating recovery\n"));
3549 3550
3550 3551 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3551 3552 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3552 3553 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3553 3554 VFS_RELE(mi->mi_vfsp);
3554 3555 if (!e.error)
3555 3556 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3556 3557 mutex_enter(&sp->s_lock);
3557 3558 goto recov_retry;
3558 3559 }
3559 3560 /* fall through for res.status case */
3560 3561 }
3561 3562
3562 3563 if (res.status) {
3563 3564 if (res.status == NFS4ERR_LEASE_MOVED) {
3564 3565 /*EMPTY*/
3565 3566 /*
3566 3567 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3567 3568 * to renew the lease on that server
3568 3569 */
3569 3570 }
3570 3571 e.error = geterrno4(res.status);
3571 3572 }
3572 3573
3573 3574 if (!rpc_error)
3574 3575 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3575 3576
3576 3577 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3577 3578
3578 3579 VFS_RELE(mi->mi_vfsp);
3579 3580
3580 3581 return (e.error);
3581 3582 }
3582 3583
3583 3584 void
3584 3585 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3585 3586 {
3586 3587 nfs4_server_t *sp;
3587 3588
3588 3589 /* this locks down sp if it is found */
3589 3590 sp = find_nfs4_server(mi);
3590 3591
3591 3592 if (sp != NULL) {
3592 3593 nfs4_inc_state_ref_count_nolock(sp, mi);
3593 3594 mutex_exit(&sp->s_lock);
3594 3595 nfs4_server_rele(sp);
3595 3596 }
3596 3597 }
3597 3598
3598 3599 /*
3599 3600  * Bump the number of OPEN files (i.e., those with state) so we know if this
3600 3601 * nfs4_server has any state to maintain a lease for or not.
3601 3602 *
3602 3603 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3603 3604 */
3604 3605 void
3605 3606 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3606 3607 {
3607 3608 ASSERT(mutex_owned(&sp->s_lock));
3608 3609
3609 3610 sp->state_ref_count++;
3610 3611 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3611 3612 "nfs4_inc_state_ref_count: state_ref_count now %d",
3612 3613 sp->state_ref_count));
3613 3614
3614 3615 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3615 3616 sp->lease_valid = NFS4_LEASE_VALID;
3616 3617
3617 3618 /*
3618 3619 * If this call caused the lease to be marked valid and/or
3619 3620 	 * took the state_ref_count from 0 to 1, then start the clock
3620 3621 	 * on lease renewal.
3621 3622 */
3622 3623 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3623 3624 sp->last_renewal_time = gethrestime_sec();
3624 3625
3625 3626 /* update the number of open files for mi */
3626 3627 mi->mi_open_files++;
3627 3628 }
3628 3629
3629 3630 void
3630 3631 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3631 3632 {
3632 3633 nfs4_server_t *sp;
3633 3634
3634 3635 /* this locks down sp if it is found */
3635 3636 sp = find_nfs4_server_all(mi, 1);
3636 3637
3637 3638 if (sp != NULL) {
3638 3639 nfs4_dec_state_ref_count_nolock(sp, mi);
3639 3640 mutex_exit(&sp->s_lock);
3640 3641 nfs4_server_rele(sp);
3641 3642 }
3642 3643 }
3643 3644
3644 3645 /*
3645 3646  * Decrement the number of OPEN files (i.e., those with state) so we know if
3646 3647 * this nfs4_server has any state to maintain a lease for or not.
3647 3648 */
3648 3649 void
3649 3650 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3650 3651 {
3651 3652 ASSERT(mutex_owned(&sp->s_lock));
3652 3653 ASSERT(sp->state_ref_count != 0);
3653 3654 sp->state_ref_count--;
3654 3655
3655 3656 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3656 3657 "nfs4_dec_state_ref_count: state ref count now %d",
3657 3658 sp->state_ref_count));
3658 3659
3659 3660 mi->mi_open_files--;
3660 3661 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3661 3662 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3662 3663 mi->mi_open_files, mi->mi_flags));
3663 3664
3664 3665 /* We don't have to hold the mi_lock to test mi_flags */
3665 3666 if (mi->mi_open_files == 0 &&
3666 3667 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3667 3668 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3668 3669 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3669 3670 "we have closed the last open file", (void*)mi));
3670 3671 nfs4_remove_mi_from_server(mi, sp);
3671 3672 }
3672 3673 }
3673 3674
3674 3675 bool_t
3675 3676 inlease(nfs4_server_t *sp)
3676 3677 {
3677 3678 bool_t result;
3678 3679
3679 3680 ASSERT(mutex_owned(&sp->s_lock));
3680 3681
3681 3682 if (sp->lease_valid == NFS4_LEASE_VALID &&
3682 3683 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3683 3684 result = TRUE;
3684 3685 else
3685 3686 result = FALSE;
3686 3687
3687 3688 return (result);
3688 3689 }
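/*
 * Example (assumed numbers, for illustration only): with a valid
 * lease, last_renewal_time = 1000 and s_lease_time = 60, inlease()
 * returns TRUE while gethrestime_sec() is below 1060 and FALSE once
 * the lease interval has elapsed.
 */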
3689 3690
3690 3691
3691 3692 /*
3692 3693 * Return non-zero if the given nfs4_server_t is going through recovery.
3693 3694 */
3694 3695
3695 3696 int
3696 3697 nfs4_server_in_recovery(nfs4_server_t *sp)
3697 3698 {
3698 3699 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3699 3700 }
3700 3701
3701 3702 /*
3702 3703 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3703 3704 * first is less than, equal to, or greater than the second.
3704 3705 */
3705 3706
3706 3707 int
3707 3708 sfh4cmp(const void *p1, const void *p2)
3708 3709 {
3709 3710 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3710 3711 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3711 3712
3712 3713 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3713 3714 }
3714 3715
3715 3716 /*
3716 3717 * Create a table for shared filehandle objects.
3717 3718 */
3718 3719
3719 3720 void
3720 3721 sfh4_createtab(avl_tree_t *tab)
3721 3722 {
3722 3723 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3723 3724 offsetof(nfs4_sharedfh_t, sfh_tree));
3724 3725 }
3725 3726
3726 3727 /*
3727 3728 * Return a shared filehandle object for the given filehandle. The caller
3728 3729 * is responsible for eventually calling sfh4_rele().
3729 3730 */
3730 3731
3731 3732 nfs4_sharedfh_t *
3732 3733 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3733 3734 {
3734 3735 nfs4_sharedfh_t *sfh, *nsfh;
3735 3736 avl_index_t where;
3736 3737 nfs4_sharedfh_t skey;
3737 3738
3738 3739 if (!key) {
3739 3740 skey.sfh_fh = *fh;
3740 3741 key = &skey;
3741 3742 }
3742 3743
3743 3744 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3744 3745 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3745 3746 /*
3746 3747 * We allocate the largest possible filehandle size because it's
3747 3748 * not that big, and it saves us from possibly having to resize the
3748 3749 * buffer later.
3749 3750 */
3750 3751 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3751 3752 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3752 3753 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3753 3754 nsfh->sfh_refcnt = 1;
3754 3755 nsfh->sfh_flags = SFH4_IN_TREE;
3755 3756 nsfh->sfh_mi = mi;
3756 3757 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3757 3758 (void *)nsfh));
3758 3759
3759 3760 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3760 3761 sfh = avl_find(&mi->mi_filehandles, key, &where);
3761 3762 if (sfh != NULL) {
3762 3763 mutex_enter(&sfh->sfh_lock);
3763 3764 sfh->sfh_refcnt++;
3764 3765 mutex_exit(&sfh->sfh_lock);
3765 3766 nfs_rw_exit(&mi->mi_fh_lock);
3766 3767 /* free our speculative allocs */
3767 3768 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3768 3769 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3769 3770 return (sfh);
3770 3771 }
3771 3772
3772 3773 avl_insert(&mi->mi_filehandles, nsfh, where);
3773 3774 nfs_rw_exit(&mi->mi_fh_lock);
3774 3775
3775 3776 return (nsfh);
3776 3777 }
3777 3778
3778 3779 /*
3779 3780 * Return a shared filehandle object for the given filehandle. The caller
3780 3781 * is responsible for eventually calling sfh4_rele().
3781 3782 */
3782 3783
3783 3784 nfs4_sharedfh_t *
3784 3785 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3785 3786 {
3786 3787 nfs4_sharedfh_t *sfh;
3787 3788 nfs4_sharedfh_t key;
3788 3789
3789 3790 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3790 3791
3791 3792 #ifdef DEBUG
3792 3793 if (nfs4_sharedfh_debug) {
3793 3794 nfs4_fhandle_t fhandle;
3794 3795
3795 3796 fhandle.fh_len = fh->nfs_fh4_len;
3796 3797 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3797 3798 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3798 3799 nfs4_printfhandle(&fhandle);
3799 3800 }
3800 3801 #endif
3801 3802
3802 3803 /*
3803 3804 * If there's already an object for the given filehandle, bump the
3804 3805 * reference count and return it. Otherwise, create a new object
3805 3806 * and add it to the AVL tree.
3806 3807 */
3807 3808
3808 3809 key.sfh_fh = *fh;
3809 3810
3810 3811 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3811 3812 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3812 3813 if (sfh != NULL) {
3813 3814 mutex_enter(&sfh->sfh_lock);
3814 3815 sfh->sfh_refcnt++;
3815 3816 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3816 3817 "sfh4_get: found existing %p, new refcnt=%d",
3817 3818 (void *)sfh, sfh->sfh_refcnt));
3818 3819 mutex_exit(&sfh->sfh_lock);
3819 3820 nfs_rw_exit(&mi->mi_fh_lock);
3820 3821 return (sfh);
3821 3822 }
3822 3823 nfs_rw_exit(&mi->mi_fh_lock);
3823 3824
3824 3825 return (sfh4_put(fh, mi, &key));
3825 3826 }
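/*
 * Usage sketch for the sfh4_get()/sfh4_rele() contract (a hypothetical
 * caller; "fh" and "mi" are assumed to be a filehandle and mount the
 * caller already holds):
 *
 *	nfs4_sharedfh_t *sfh;
 *
 *	sfh = sfh4_get(&fh, mi);	(returns with a reference held)
 *	... use sfh, e.g. sfh->sfh_fh ...
 *	sfh4_rele(&sfh);		(drops the reference, NULLs sfh)
 */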
3826 3827
3827 3828 /*
3828 3829 * Get a reference to the given shared filehandle object.
3829 3830 */
3830 3831
3831 3832 void
3832 3833 sfh4_hold(nfs4_sharedfh_t *sfh)
3833 3834 {
3834 3835 ASSERT(sfh->sfh_refcnt > 0);
3835 3836
3836 3837 mutex_enter(&sfh->sfh_lock);
3837 3838 sfh->sfh_refcnt++;
3838 3839 NFS4_DEBUG(nfs4_sharedfh_debug,
3839 3840 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3840 3841 (void *)sfh, sfh->sfh_refcnt));
3841 3842 mutex_exit(&sfh->sfh_lock);
3842 3843 }
3843 3844
3844 3845 /*
3845 3846 * Release a reference to the given shared filehandle object and null out
3846 3847 * the given pointer.
3847 3848 */
3848 3849
3849 3850 void
3850 3851 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3851 3852 {
3852 3853 mntinfo4_t *mi;
3853 3854 nfs4_sharedfh_t *sfh = *sfhpp;
3854 3855
3855 3856 ASSERT(sfh->sfh_refcnt > 0);
3856 3857
3857 3858 mutex_enter(&sfh->sfh_lock);
3858 3859 if (sfh->sfh_refcnt > 1) {
3859 3860 sfh->sfh_refcnt--;
3860 3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3861 3862 "sfh4_rele %p, new refcnt=%d",
3862 3863 (void *)sfh, sfh->sfh_refcnt));
3863 3864 mutex_exit(&sfh->sfh_lock);
3864 3865 goto finish;
3865 3866 }
3866 3867 mutex_exit(&sfh->sfh_lock);
3867 3868
3868 3869 /*
3869 3870 * Possibly the last reference, so get the lock for the table in
3870 3871 * case it's time to remove the object from the table.
3871 3872 */
3872 3873 mi = sfh->sfh_mi;
3873 3874 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3874 3875 mutex_enter(&sfh->sfh_lock);
3875 3876 sfh->sfh_refcnt--;
3876 3877 if (sfh->sfh_refcnt > 0) {
3877 3878 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3878 3879 "sfh4_rele %p, new refcnt=%d",
3879 3880 (void *)sfh, sfh->sfh_refcnt));
3880 3881 mutex_exit(&sfh->sfh_lock);
3881 3882 nfs_rw_exit(&mi->mi_fh_lock);
3882 3883 goto finish;
3883 3884 }
3884 3885
3885 3886 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3886 3887 "sfh4_rele %p, last ref", (void *)sfh));
3887 3888 if (sfh->sfh_flags & SFH4_IN_TREE) {
3888 3889 avl_remove(&mi->mi_filehandles, sfh);
3889 3890 sfh->sfh_flags &= ~SFH4_IN_TREE;
3890 3891 }
3891 3892 mutex_exit(&sfh->sfh_lock);
3892 3893 nfs_rw_exit(&mi->mi_fh_lock);
3893 3894 mutex_destroy(&sfh->sfh_lock);
3894 3895 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3895 3896 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3896 3897
3897 3898 finish:
3898 3899 *sfhpp = NULL;
3899 3900 }
3900 3901
3901 3902 /*
3902 3903 * Update the filehandle for the given shared filehandle object.
3903 3904 */
3904 3905
3905 3906 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3906 3907
3907 3908 void
3908 3909 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3909 3910 {
3910 3911 mntinfo4_t *mi = sfh->sfh_mi;
3911 3912 nfs4_sharedfh_t *dupsfh;
3912 3913 avl_index_t where;
3913 3914 nfs4_sharedfh_t key;
3914 3915
3915 3916 #ifdef DEBUG
3916 3917 mutex_enter(&sfh->sfh_lock);
3917 3918 ASSERT(sfh->sfh_refcnt > 0);
3918 3919 mutex_exit(&sfh->sfh_lock);
3919 3920 #endif
3920 3921 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3921 3922
3922 3923 /*
3923 3924 * The basic plan is to remove the shared filehandle object from
3924 3925 * the table, update it to have the new filehandle, then reinsert
3925 3926 * it.
3926 3927 */
3927 3928
3928 3929 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3929 3930 mutex_enter(&sfh->sfh_lock);
3930 3931 if (sfh->sfh_flags & SFH4_IN_TREE) {
3931 3932 avl_remove(&mi->mi_filehandles, sfh);
3932 3933 sfh->sfh_flags &= ~SFH4_IN_TREE;
3933 3934 }
3934 3935 mutex_exit(&sfh->sfh_lock);
3935 3936 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3936 3937 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3937 3938 sfh->sfh_fh.nfs_fh4_len);
3938 3939
3939 3940 /*
3940 3941 * XXX If there is already a shared filehandle object with the new
3941 3942 * filehandle, we're in trouble, because the rnode code assumes
3942 3943 * that there is only one shared filehandle object for a given
3943 3944 * filehandle. So issue a warning (for read-write mounts only)
3944 3945 * and don't try to re-insert the given object into the table.
3945 3946 * Hopefully the given object will quickly go away and everyone
3946 3947 * will use the new object.
3947 3948 */
3948 3949 key.sfh_fh = *newfh;
3949 3950 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3950 3951 if (dupsfh != NULL) {
3951 3952 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3952 3953 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3953 3954 "duplicate filehandle detected");
3954 3955 sfh4_printfhandle(dupsfh);
3955 3956 }
3956 3957 } else {
3957 3958 avl_insert(&mi->mi_filehandles, sfh, where);
3958 3959 mutex_enter(&sfh->sfh_lock);
3959 3960 sfh->sfh_flags |= SFH4_IN_TREE;
3960 3961 mutex_exit(&sfh->sfh_lock);
3961 3962 }
3962 3963 nfs_rw_exit(&mi->mi_fh_lock);
3963 3964 }
3964 3965
3965 3966 /*
3966 3967 * Copy out the current filehandle for the given shared filehandle object.
3967 3968 */
3968 3969
3969 3970 void
3970 3971 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3971 3972 {
3972 3973 mntinfo4_t *mi = sfh->sfh_mi;
3973 3974
3974 3975 ASSERT(sfh->sfh_refcnt > 0);
3975 3976
3976 3977 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3977 3978 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3978 3979 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3979 3980 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3980 3981 nfs_rw_exit(&mi->mi_fh_lock);
3981 3982 }
3982 3983
3983 3984 /*
3984 3985 * Print out the filehandle for the given shared filehandle object.
3985 3986 */
3986 3987
3987 3988 void
3988 3989 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3989 3990 {
3990 3991 nfs4_fhandle_t fhandle;
3991 3992
3992 3993 sfh4_copyval(sfh, &fhandle);
3993 3994 nfs4_printfhandle(&fhandle);
3994 3995 }
3995 3996
3996 3997 /*
3997 3998 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
3998 3999 * if they're the same, +1 if the first is "greater" than the second. The
3999 4000 * caller (or whoever's calling the AVL package) is responsible for
4000 4001 * handling locking issues.
4001 4002 */
4002 4003
4003 4004 static int
4004 4005 fncmp(const void *p1, const void *p2)
4005 4006 {
4006 4007 const nfs4_fname_t *f1 = p1;
4007 4008 const nfs4_fname_t *f2 = p2;
4008 4009 int res;
4009 4010
4010 4011 res = strcmp(f1->fn_name, f2->fn_name);
4011 4012 /*
4012 4013 * The AVL package wants +/-1, not arbitrary positive or negative
4013 4014 * integers.
4014 4015 */
4015 4016 if (res > 0)
4016 4017 res = 1;
4017 4018 else if (res < 0)
4018 4019 res = -1;
4019 4020 return (res);
4020 4021 }
4021 4022
4022 4023 /*
4023 4024 * Get or create an fname with the given name, as a child of the given
4024 4025 * fname. The caller is responsible for eventually releasing the reference
4025 4026 * (fn_rele()). parent may be NULL.
4026 4027 */
4027 4028
4028 4029 nfs4_fname_t *
4029 4030 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4030 4031 {
4031 4032 nfs4_fname_t key;
4032 4033 nfs4_fname_t *fnp;
4033 4034 avl_index_t where;
4034 4035
4035 4036 key.fn_name = name;
4036 4037
4037 4038 /*
4038 4039 * If there's already an fname registered with the given name, bump
4039 4040 * its reference count and return it. Otherwise, create a new one
4040 4041 * and add it to the parent's AVL tree.
4041 4042 *
4042 4043 * fname entries we are looking for should match both name
4043 4044 * and sfh stored in the fname.
4044 4045 */
4045 4046 again:
4046 4047 if (parent != NULL) {
4047 4048 mutex_enter(&parent->fn_lock);
4048 4049 fnp = avl_find(&parent->fn_children, &key, &where);
4049 4050 if (fnp != NULL) {
4050 4051 /*
4051 4052 			 * This hold on fnp is released below,
4052 4053 			 * in case this is not the fnp we want.
4053 4054 */
4054 4055 fn_hold(fnp);
4055 4056
4056 4057 if (fnp->fn_sfh == sfh) {
4057 4058 /*
4058 4059 				 * We have found our entry; return it
4059 4060 				 * with the hold taken above.
4060 4061 */
4061 4062 mutex_exit(&parent->fn_lock);
4062 4063 return (fnp);
4063 4064 }
4064 4065
4065 4066 /*
4066 4067 * We have found an entry that has a mismatching
4067 4068 * fn_sfh. This could be a stale entry due to
4068 4069 * server side rename. We will remove this entry
4069 4070 * and make sure no such entries exist.
4070 4071 */
4071 4072 mutex_exit(&parent->fn_lock);
4072 4073 mutex_enter(&fnp->fn_lock);
4073 4074 if (fnp->fn_parent == parent) {
4074 4075 /*
4075 4076 * Remove ourselves from parent's
4076 4077 * fn_children tree.
4077 4078 */
4078 4079 mutex_enter(&parent->fn_lock);
4079 4080 avl_remove(&parent->fn_children, fnp);
4080 4081 mutex_exit(&parent->fn_lock);
4081 4082 fn_rele(&fnp->fn_parent);
4082 4083 }
4083 4084 mutex_exit(&fnp->fn_lock);
4084 4085 fn_rele(&fnp);
4085 4086 goto again;
4086 4087 }
4087 4088 }
4088 4089
4089 4090 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4090 4091 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4091 4092 fnp->fn_parent = parent;
4092 4093 if (parent != NULL)
4093 4094 fn_hold(parent);
4094 4095 fnp->fn_len = strlen(name);
4095 4096 ASSERT(fnp->fn_len < MAXNAMELEN);
4096 4097 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4097 4098 (void) strcpy(fnp->fn_name, name);
4098 4099 fnp->fn_refcnt = 1;
4099 4100
4100 4101 /*
4101 4102 * This hold on sfh is later released
4102 4103 * when we do the final fn_rele() on this fname.
4103 4104 */
4104 4105 sfh4_hold(sfh);
4105 4106 fnp->fn_sfh = sfh;
4106 4107
4107 4108 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4108 4109 offsetof(nfs4_fname_t, fn_tree));
4109 4110 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4110 4111 "fn_get %p:%s, a new nfs4_fname_t!",
4111 4112 (void *)fnp, fnp->fn_name));
4112 4113 if (parent != NULL) {
4113 4114 avl_insert(&parent->fn_children, fnp, where);
4114 4115 mutex_exit(&parent->fn_lock);
4115 4116 }
4116 4117
4117 4118 return (fnp);
4118 4119 }
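/*
 * Usage sketch (hypothetical caller; "parentfn", "sfh" and the name
 * "foo" are illustrative and assumed to be held/valid already):
 *
 *	nfs4_fname_t *nm;
 *
 *	nm = fn_get(parentfn, "foo", sfh);	(one reference for caller)
 *	... associate nm with the object being named ...
 *	fn_rele(&nm);				(when that association ends)
 */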
4119 4120
4120 4121 void
4121 4122 fn_hold(nfs4_fname_t *fnp)
4122 4123 {
4123 4124 atomic_inc_32(&fnp->fn_refcnt);
4124 4125 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4125 4126 "fn_hold %p:%s, new refcnt=%d",
4126 4127 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4127 4128 }
4128 4129
4129 4130 /*
4130 4131 * Decrement the reference count of the given fname, and destroy it if its
4131 4132 * reference count goes to zero. Nulls out the given pointer.
4132 4133 */
4133 4134
4134 4135 void
4135 4136 fn_rele(nfs4_fname_t **fnpp)
4136 4137 {
4137 4138 nfs4_fname_t *parent;
4138 4139 uint32_t newref;
4139 4140 nfs4_fname_t *fnp;
4140 4141
4141 4142 recur:
4142 4143 fnp = *fnpp;
4143 4144 *fnpp = NULL;
4144 4145
4145 4146 mutex_enter(&fnp->fn_lock);
4146 4147 parent = fnp->fn_parent;
4147 4148 if (parent != NULL)
4148 4149 mutex_enter(&parent->fn_lock); /* prevent new references */
4149 4150 newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4150 4151 if (newref > 0) {
4151 4152 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 4153 "fn_rele %p:%s, new refcnt=%d",
4153 4154 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4154 4155 if (parent != NULL)
4155 4156 mutex_exit(&parent->fn_lock);
4156 4157 mutex_exit(&fnp->fn_lock);
4157 4158 return;
4158 4159 }
4159 4160
4160 4161 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4161 4162 "fn_rele %p:%s, last reference, deleting...",
4162 4163 (void *)fnp, fnp->fn_name));
4163 4164 if (parent != NULL) {
4164 4165 avl_remove(&parent->fn_children, fnp);
4165 4166 mutex_exit(&parent->fn_lock);
4166 4167 }
4167 4168 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4168 4169 sfh4_rele(&fnp->fn_sfh);
4169 4170 mutex_destroy(&fnp->fn_lock);
4170 4171 avl_destroy(&fnp->fn_children);
4171 4172 kmem_free(fnp, sizeof (nfs4_fname_t));
4172 4173 /*
4173 4174 	 * Recursively fn_rele the parent.
4174 4175 * Use goto instead of a recursive call to avoid stack overflow.
4175 4176 */
4176 4177 if (parent != NULL) {
4177 4178 fnpp = &parent;
4178 4179 goto recur;
4179 4180 }
4180 4181 }
4181 4182
4182 4183 /*
4183 4184 * Returns the single component name of the given fname, in a MAXNAMELEN
4184 4185 * string buffer, which the caller is responsible for freeing. Note that
4185 4186 * the name may become invalid as a result of fn_move().
4186 4187 */
4187 4188
4188 4189 char *
4189 4190 fn_name(nfs4_fname_t *fnp)
4190 4191 {
4191 4192 char *name;
4192 4193
4193 4194 ASSERT(fnp->fn_len < MAXNAMELEN);
4194 4195 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4195 4196 mutex_enter(&fnp->fn_lock);
4196 4197 (void) strcpy(name, fnp->fn_name);
4197 4198 mutex_exit(&fnp->fn_lock);
4198 4199
4199 4200 return (name);
4200 4201 }
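/*
 * Usage sketch (hypothetical caller): the returned buffer is always
 * MAXNAMELEN bytes regardless of the name's actual length, so it must
 * be freed with that size:
 *
 *	char *nm = fn_name(fnp);
 *	... use nm ...
 *	kmem_free(nm, MAXNAMELEN);
 */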
4201 4202
4202 4203
4203 4204 /*
4204 4205 * fn_path_realloc
4205 4206 *
4206 4207 * This function, used only by fn_path, constructs
4207 4208  * a new string which looks like "prepend" + "/" + "current",
4208 4209  * by allocating a new string and freeing the old one.
4209 4210 */
4210 4211 static void
4211 4212 fn_path_realloc(char **curses, char *prepend)
4212 4213 {
4213 4214 int len, curlen = 0;
4214 4215 char *news;
4215 4216
4216 4217 if (*curses == NULL) {
4217 4218 /*
4218 4219 * Prime the pump, allocate just the
4219 4220 * space for prepend and return that.
4220 4221 */
4221 4222 len = strlen(prepend) + 1;
4222 4223 news = kmem_alloc(len, KM_SLEEP);
4223 4224 (void) strncpy(news, prepend, len);
4224 4225 } else {
4225 4226 /*
4226 4227 * Allocate the space for a new string
4227 4228 * +1 +1 is for the "/" and the NULL
4228 4229 * byte at the end of it all.
4229 4230 */
4230 4231 curlen = strlen(*curses);
4231 4232 len = curlen + strlen(prepend) + 1 + 1;
4232 4233 news = kmem_alloc(len, KM_SLEEP);
4233 4234 (void) strncpy(news, prepend, len);
4234 4235 (void) strcat(news, "/");
4235 4236 (void) strcat(news, *curses);
4236 4237 kmem_free(*curses, curlen + 1);
4237 4238 }
4238 4239 *curses = news;
4239 4240 }
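/*
 * Example of how repeated calls build a path from the leaf upward
 * (component names are illustrative):
 *
 *	path = NULL;
 *	fn_path_realloc(&path, "c");	path is now "c"
 *	fn_path_realloc(&path, "b");	path is now "b/c"
 *	fn_path_realloc(&path, "a");	path is now "a/b/c"
 *
 * Each call allocates a fresh buffer large enough for the combined
 * string and frees the previous one.
 */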
4240 4241
4241 4242 /*
4242 4243 * Returns the path name (starting from the fs root) for the given fname.
4243 4244 * The caller is responsible for freeing. Note that the path may be or
4244 4245 * become invalid as a result of fn_move().
4245 4246 */
4246 4247
4247 4248 char *
4248 4249 fn_path(nfs4_fname_t *fnp)
4249 4250 {
4250 4251 char *path;
4251 4252 nfs4_fname_t *nextfnp;
4252 4253
4253 4254 if (fnp == NULL)
4254 4255 return (NULL);
4255 4256
4256 4257 path = NULL;
4257 4258
4258 4259 /* walk up the tree constructing the pathname. */
4259 4260
4260 4261 fn_hold(fnp); /* adjust for later rele */
4261 4262 do {
4262 4263 mutex_enter(&fnp->fn_lock);
4263 4264 /*
4264 4265 * Add fn_name in front of the current path
4265 4266 */
4266 4267 fn_path_realloc(&path, fnp->fn_name);
4267 4268 nextfnp = fnp->fn_parent;
4268 4269 if (nextfnp != NULL)
4269 4270 fn_hold(nextfnp);
4270 4271 mutex_exit(&fnp->fn_lock);
4271 4272 fn_rele(&fnp);
4272 4273 fnp = nextfnp;
4273 4274 } while (fnp != NULL);
4274 4275
4275 4276 return (path);
4276 4277 }
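/*
 * Usage sketch (hypothetical caller): fn_path_realloc() sizes the
 * buffer to hold exactly the string plus its terminating NUL, so the
 * matching free is
 *
 *	char *path = fn_path(fnp);
 *
 *	if (path != NULL) {
 *		... use path ...
 *		kmem_free(path, strlen(path) + 1);
 *	}
 */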
4277 4278
4278 4279 /*
4279 4280 * Return a reference to the parent of the given fname, which the caller is
4280 4281 * responsible for eventually releasing.
4281 4282 */
4282 4283
4283 4284 nfs4_fname_t *
4284 4285 fn_parent(nfs4_fname_t *fnp)
4285 4286 {
4286 4287 nfs4_fname_t *parent;
4287 4288
4288 4289 mutex_enter(&fnp->fn_lock);
4289 4290 parent = fnp->fn_parent;
4290 4291 if (parent != NULL)
4291 4292 fn_hold(parent);
4292 4293 mutex_exit(&fnp->fn_lock);
4293 4294
4294 4295 return (parent);
4295 4296 }
4296 4297
4297 4298 /*
4298 4299 * Update fnp so that its parent is newparent and its name is newname.
4299 4300 */
4300 4301
4301 4302 void
4302 4303 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4303 4304 {
4304 4305 nfs4_fname_t *parent, *tmpfnp;
4305 4306 ssize_t newlen;
4306 4307 nfs4_fname_t key;
4307 4308 avl_index_t where;
4308 4309
4309 4310 /*
4310 4311 * This assert exists to catch the client trying to rename
4311 4312 * a dir to be a child of itself. This happened at a recent
4312 4313 * bakeoff against a 3rd party (broken) server which allowed
4313 4314 * the rename to succeed. If it trips it means that:
4314 4315 * a) the code in nfs4rename that detects this case is broken
4315 4316 * b) the server is broken (since it allowed the bogus rename)
4316 4317 *
4317 4318 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4318 4319 * panic below from: mutex_enter(&newparent->fn_lock);
4319 4320 */
4320 4321 ASSERT(fnp != newparent);
4321 4322
4322 4323 /*
4323 4324 * Remove fnp from its current parent, change its name, then add it
4324 4325 * to newparent. It might happen that fnp was replaced by another
4325 4326 * nfs4_fname_t with the same fn_name in parent->fn_children.
4326 4327 * In such case, fnp->fn_parent is NULL and we skip the removal
4327 4328 * of fnp from its current parent.
4328 4329 */
4329 4330 mutex_enter(&fnp->fn_lock);
4330 4331 parent = fnp->fn_parent;
4331 4332 if (parent != NULL) {
4332 4333 mutex_enter(&parent->fn_lock);
4333 4334 avl_remove(&parent->fn_children, fnp);
4334 4335 mutex_exit(&parent->fn_lock);
4335 4336 fn_rele(&fnp->fn_parent);
4336 4337 }
4337 4338
4338 4339 newlen = strlen(newname);
4339 4340 if (newlen != fnp->fn_len) {
4340 4341 ASSERT(newlen < MAXNAMELEN);
4341 4342 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4342 4343 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4343 4344 fnp->fn_len = newlen;
4344 4345 }
4345 4346 (void) strcpy(fnp->fn_name, newname);
4346 4347
4347 4348 again:
4348 4349 mutex_enter(&newparent->fn_lock);
4349 4350 key.fn_name = fnp->fn_name;
4350 4351 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4351 4352 if (tmpfnp != NULL) {
4352 4353 /*
4353 4354 * This could be due to a file that was unlinked while
4354 4355 * open, or perhaps the rnode is in the free list. Remove
4355 4356 * it from newparent and let it go away on its own. The
4356 4357 * contorted code is to deal with lock order issues and
4357 4358 * race conditions.
4358 4359 */
4359 4360 fn_hold(tmpfnp);
4360 4361 mutex_exit(&newparent->fn_lock);
4361 4362 mutex_enter(&tmpfnp->fn_lock);
4362 4363 if (tmpfnp->fn_parent == newparent) {
4363 4364 mutex_enter(&newparent->fn_lock);
4364 4365 avl_remove(&newparent->fn_children, tmpfnp);
4365 4366 mutex_exit(&newparent->fn_lock);
4366 4367 fn_rele(&tmpfnp->fn_parent);
4367 4368 }
4368 4369 mutex_exit(&tmpfnp->fn_lock);
4369 4370 fn_rele(&tmpfnp);
4370 4371 goto again;
4371 4372 }
4372 4373 fnp->fn_parent = newparent;
4373 4374 fn_hold(newparent);
4374 4375 avl_insert(&newparent->fn_children, fnp, where);
4375 4376 mutex_exit(&newparent->fn_lock);
4376 4377 mutex_exit(&fnp->fn_lock);
4377 4378 }
4378 4379
4379 4380 #ifdef DEBUG
4380 4381 /*
4381 4382 * Return non-zero if the type information makes sense for the given vnode.
4382 4383 * Otherwise panic.
4383 4384 */
4384 4385 int
4385 4386 nfs4_consistent_type(vnode_t *vp)
4386 4387 {
4387 4388 rnode4_t *rp = VTOR4(vp);
4388 4389
4389 4390 if (nfs4_vtype_debug && vp->v_type != VNON &&
4390 4391 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4391 4392 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4392 4393 "rnode attr type=%d", (void *)vp, vp->v_type,
4393 4394 rp->r_attr.va_type);
4394 4395 }
4395 4396
4396 4397 return (1);
4397 4398 }
4398 4399 #endif /* DEBUG */
|
↓ open down ↓ |
2991 lines elided |
↑ open up ↑ |