8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()
Reviewed by: Arne Jansen <arne@die-jansens.de>
Reviewed by: Vitaliy Gusev <gusev.vitaliy@icloud.com>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
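The substantive change is in nfs4_attr_cache(). Previously, a thread that found another thread holding the r_serial pseudo lock would cv_wait_sig() on r_cv until that thread finished caching attributes (with lwp_nostop bookkeeping and a special case for the recovery thread). That wait is what could close the cycle between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages(). With this change the thread no longer waits: it purges its cached attributes and returns, so they are simply refetched on the next access. A minimal sketch of the new behavior, paraphrased from the nfs4_attr_cache() hunk below (declarations and the rest of the function elided, comments reworded):

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial != NULL && !was_serial) {
		/*
		 * Another thread is caching attributes for this rnode.
		 * Instead of sleeping on r_cv for it to finish (the old
		 * behavior, which could deadlock against a delegreturn or
		 * page-flush thread), discard our attributes and bail out.
		 */
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}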
--- old/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 24 */
25 25
26 26 /*
27 - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 28 * All Rights Reserved
29 29 */
30 30
31 31 #include <sys/param.h>
32 32 #include <sys/types.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/thread.h>
35 35 #include <sys/t_lock.h>
36 36 #include <sys/time.h>
37 37 #include <sys/vnode.h>
38 38 #include <sys/vfs.h>
39 39 #include <sys/errno.h>
40 40 #include <sys/buf.h>
41 41 #include <sys/stat.h>
42 42 #include <sys/cred.h>
43 43 #include <sys/kmem.h>
44 44 #include <sys/debug.h>
45 45 #include <sys/dnlc.h>
46 46 #include <sys/vmsystm.h>
47 47 #include <sys/flock.h>
48 48 #include <sys/share.h>
49 49 #include <sys/cmn_err.h>
50 50 #include <sys/tiuser.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/callb.h>
53 53 #include <sys/acl.h>
54 54 #include <sys/kstat.h>
55 55 #include <sys/signal.h>
56 56 #include <sys/disp.h>
57 57 #include <sys/atomic.h>
58 58 #include <sys/list.h>
59 59 #include <sys/sdt.h>
60 60
61 61 #include <rpc/types.h>
62 62 #include <rpc/xdr.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/clnt.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/nfs_clnt.h>
68 68 #include <nfs/nfs_acl.h>
69 69
70 70 #include <nfs/nfs4.h>
71 71 #include <nfs/rnode4.h>
72 72 #include <nfs/nfs4_clnt.h>
73 73
74 74 #include <vm/hat.h>
75 75 #include <vm/as.h>
76 76 #include <vm/page.h>
77 77 #include <vm/pvn.h>
78 78 #include <vm/seg.h>
79 79 #include <vm/seg_map.h>
80 80 #include <vm/seg_vn.h>
81 81
82 82 #include <sys/ddi.h>
83 83
84 84 /*
85 85 * Arguments to page-flush thread.
86 86 */
87 87 typedef struct {
88 88 vnode_t *vp;
89 89 cred_t *cr;
90 90 } pgflush_t;
91 91
92 92 #ifdef DEBUG
93 93 int nfs4_client_lease_debug;
94 94 int nfs4_sharedfh_debug;
95 95 int nfs4_fname_debug;
96 96
97 97 /* temporary: panic if v_type is inconsistent with r_attr va_type */
98 98 int nfs4_vtype_debug;
99 99
100 100 uint_t nfs4_tsd_key;
101 101 #endif
102 102
103 103 static time_t nfs4_client_resumed = 0;
104 104 static callb_id_t cid = 0;
105 105
106 106 static int nfs4renew(nfs4_server_t *);
107 107 static void nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
108 108 static void nfs4_pgflush_thread(pgflush_t *);
109 109
110 110 static boolean_t nfs4_client_cpr_callb(void *, int);
111 111
112 112 struct mi4_globals {
113 113 kmutex_t mig_lock; /* lock protecting mig_list */
114 114 list_t mig_list; /* list of NFS v4 mounts in zone */
115 115 boolean_t mig_destructor_called;
116 116 };
117 117
118 118 static zone_key_t mi4_list_key;
119 119
120 120 /*
121 121 * Attributes caching:
122 122 *
123 123 * Attributes are cached in the rnode in struct vattr form.
124 124 * There is a time associated with the cached attributes (r_time_attr_inval)
125 125 * which tells whether the attributes are valid. The time is initialized
126 126 * to the difference between current time and the modify time of the vnode
127 127 * when new attributes are cached. This allows the attributes for
128 128 * files that have changed recently to be timed out sooner than for files
129 129 * that have not changed for a long time. There are minimum and maximum
130 130 * timeout values that can be set per mount point.
131 131 */
132 132
133 133 /*
134 134 * If a cache purge is in progress, wait for it to finish.
135 135 *
136 136 * The current thread must not be in the middle of an
137 137 * nfs4_start_op/nfs4_end_op region. Otherwise, there could be a deadlock
138 138 * between this thread, a recovery thread, and the page flush thread.
139 139 */
140 140 int
141 141 nfs4_waitfor_purge_complete(vnode_t *vp)
142 142 {
143 143 rnode4_t *rp;
144 144 k_sigset_t smask;
145 145
146 146 rp = VTOR4(vp);
147 147 if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 148 ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 149 mutex_enter(&rp->r_statelock);
150 150 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 151 while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 152 ((rp->r_flags & R4PGFLUSH) &&
153 153 rp->r_pgflush != curthread)) {
154 154 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 155 sigunintr(&smask);
156 156 mutex_exit(&rp->r_statelock);
157 157 return (EINTR);
158 158 }
159 159 }
160 160 sigunintr(&smask);
161 161 mutex_exit(&rp->r_statelock);
162 162 }
163 163 return (0);
164 164 }
165 165
166 166 /*
167 167 * Validate caches by checking cached attributes. If they have timed out,
168 168 * then get new attributes from the server. As a side effect, cache
169 169 * invalidation is done if the attributes have changed.
170 170 *
171 171 * If the attributes have not timed out and if there is a cache
172 172 * invalidation being done by some other thread, then wait until that
173 173 * thread has completed the cache invalidation.
174 174 */
175 175 int
176 176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 177 {
178 178 int error;
179 179 nfs4_ga_res_t gar;
180 180
181 181 if (ATTRCACHE4_VALID(vp)) {
182 182 error = nfs4_waitfor_purge_complete(vp);
183 183 if (error)
184 184 return (error);
185 185 return (0);
186 186 }
187 187
188 188 return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 189 }
190 190
191 191 /*
192 192 * Fill in attribute from the cache.
193 193 * If valid, then return 0 to indicate that no error occurred,
194 194 * otherwise return 1 to indicate that an error occurred.
195 195 */
196 196 static int
197 197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 198 {
199 199 rnode4_t *rp;
200 200
201 201 rp = VTOR4(vp);
202 202 mutex_enter(&rp->r_statelock);
203 203 mutex_enter(&rp->r_statev4_lock);
204 204 if (ATTRCACHE4_VALID(vp)) {
205 205 mutex_exit(&rp->r_statev4_lock);
206 206 /*
207 207 * Cached attributes are valid
208 208 */
209 209 *vap = rp->r_attr;
210 210 mutex_exit(&rp->r_statelock);
211 211 return (0);
212 212 }
213 213 mutex_exit(&rp->r_statev4_lock);
214 214 mutex_exit(&rp->r_statelock);
215 215 return (1);
216 216 }
217 217
218 218
219 219 /*
220 220 * If returned error is ESTALE flush all caches. The nfs4_purge_caches()
221 221 * call is synchronous because all the pages were invalidated by the
222 222 * nfs4_invalidate_pages() call.
223 223 */
224 224 void
225 225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 226 {
227 227 struct rnode4 *rp = VTOR4(vp);
228 228
229 229 /* Ensure that the ..._end_op() call has been done */
230 230 ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 231
232 232 if (errno != ESTALE)
233 233 return;
234 234
235 235 mutex_enter(&rp->r_statelock);
236 236 rp->r_flags |= R4STALE;
237 237 if (!rp->r_error)
238 238 rp->r_error = errno;
239 239 mutex_exit(&rp->r_statelock);
240 240 if (nfs4_has_pages(vp))
241 241 nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 242 nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 243 }
244 244
245 245 /*
246 246 * Purge all of the various NFS `data' caches. If "asyncpg" is TRUE, the
247 247 * page purge is done asynchronously.
248 248 */
249 249 void
250 250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 251 {
252 252 rnode4_t *rp;
253 253 char *contents;
254 254 vnode_t *xattr;
255 255 int size;
256 256 int pgflush; /* are we the page flush thread? */
257 257
258 258 /*
259 259 * Purge the DNLC for any entries which refer to this file.
260 260 */
261 261 if (vp->v_count > 1 &&
262 262 (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 263 dnlc_purge_vp(vp);
264 264
265 265 /*
266 266 * Clear any readdir state bits and purge the readlink response cache.
267 267 */
268 268 rp = VTOR4(vp);
269 269 mutex_enter(&rp->r_statelock);
270 270 rp->r_flags &= ~R4LOOKUP;
271 271 contents = rp->r_symlink.contents;
272 272 size = rp->r_symlink.size;
273 273 rp->r_symlink.contents = NULL;
274 274
275 275 xattr = rp->r_xattr_dir;
276 276 rp->r_xattr_dir = NULL;
277 277
278 278 /*
279 279 * Purge pathconf cache too.
280 280 */
281 281 rp->r_pathconf.pc4_xattr_valid = 0;
282 282 rp->r_pathconf.pc4_cache_valid = 0;
283 283
284 284 pgflush = (curthread == rp->r_pgflush);
285 285 mutex_exit(&rp->r_statelock);
286 286
287 287 if (contents != NULL) {
288 288
289 289 kmem_free((void *)contents, size);
290 290 }
291 291
292 292 if (xattr != NULL)
293 293 VN_RELE(xattr);
294 294
295 295 /*
296 296 * Flush the page cache. If the current thread is the page flush
297 297 * thread, don't initiate a new page flush. There's no need for
298 298 * it, and doing it correctly is hard.
299 299 */
300 300 if (nfs4_has_pages(vp) && !pgflush) {
301 301 if (!asyncpg) {
302 302 (void) nfs4_waitfor_purge_complete(vp);
303 303 nfs4_flush_pages(vp, cr);
304 304 } else {
305 305 pgflush_t *args;
306 306
307 307 /*
308 308 * We don't hold r_statelock while creating the
309 309 * thread, in case the call blocks. So we use a
310 310 * flag to indicate that a page flush thread is
311 311 * active.
312 312 */
313 313 mutex_enter(&rp->r_statelock);
314 314 if (rp->r_flags & R4PGFLUSH) {
315 315 mutex_exit(&rp->r_statelock);
316 316 } else {
317 317 rp->r_flags |= R4PGFLUSH;
318 318 mutex_exit(&rp->r_statelock);
319 319
320 320 args = kmem_alloc(sizeof (pgflush_t),
321 321 KM_SLEEP);
322 322 args->vp = vp;
323 323 VN_HOLD(args->vp);
324 324 args->cr = cr;
325 325 crhold(args->cr);
326 326 (void) zthread_create(NULL, 0,
327 327 nfs4_pgflush_thread, args, 0,
328 328 minclsyspri);
329 329 }
330 330 }
331 331 }
332 332
333 333 /*
334 334 * Flush the readdir response cache.
335 335 */
336 336 nfs4_purge_rddir_cache(vp);
337 337 }
338 338
339 339 /*
340 340 * Invalidate all pages for the given file, after writing back the dirty
341 341 * ones.
342 342 */
343 343
344 344 void
345 345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 346 {
347 347 int error;
348 348 rnode4_t *rp = VTOR4(vp);
349 349
350 350 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 351 if (error == ENOSPC || error == EDQUOT) {
352 352 mutex_enter(&rp->r_statelock);
353 353 if (!rp->r_error)
354 354 rp->r_error = error;
355 355 mutex_exit(&rp->r_statelock);
356 356 }
357 357 }
358 358
359 359 /*
360 360 * Page flush thread.
361 361 */
362 362
363 363 static void
364 364 nfs4_pgflush_thread(pgflush_t *args)
365 365 {
366 366 rnode4_t *rp = VTOR4(args->vp);
367 367
368 368 /* remember which thread we are, so we don't deadlock ourselves */
369 369 mutex_enter(&rp->r_statelock);
370 370 ASSERT(rp->r_pgflush == NULL);
371 371 rp->r_pgflush = curthread;
372 372 mutex_exit(&rp->r_statelock);
373 373
374 374 nfs4_flush_pages(args->vp, args->cr);
375 375
376 376 mutex_enter(&rp->r_statelock);
377 377 rp->r_pgflush = NULL;
378 378 rp->r_flags &= ~R4PGFLUSH;
379 379 cv_broadcast(&rp->r_cv);
380 380 mutex_exit(&rp->r_statelock);
381 381
382 382 VN_RELE(args->vp);
383 383 crfree(args->cr);
384 384 kmem_free(args, sizeof (pgflush_t));
385 385 zthread_exit();
386 386 }
387 387
388 388 /*
389 389 * Purge the readdir cache of all entries which are not currently
390 390 * being filled.
391 391 */
392 392 void
393 393 nfs4_purge_rddir_cache(vnode_t *vp)
394 394 {
395 395 rnode4_t *rp;
396 396
397 397 rp = VTOR4(vp);
398 398
399 399 mutex_enter(&rp->r_statelock);
400 400 rp->r_direof = NULL;
401 401 rp->r_flags &= ~R4LOOKUP;
402 402 rp->r_flags |= R4READDIRWATTR;
403 403 rddir4_cache_purge(rp);
404 404 mutex_exit(&rp->r_statelock);
405 405 }
406 406
407 407 /*
408 408 * Set attributes cache for given vnode using virtual attributes. There is
409 409 * no cache validation, but if the attributes are deemed to be stale, they
410 410 * are ignored. This corresponds to nfs3_attrcache().
411 411 *
412 412 * Set the timeout value on the attribute cache and fill it
413 413 * with the passed in attributes.
414 414 */
415 415 void
416 416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 417 {
418 418 rnode4_t *rp = VTOR4(vp);
419 419
420 420 mutex_enter(&rp->r_statelock);
421 421 if (rp->r_time_attr_saved <= t)
422 422 nfs4_attrcache_va(vp, garp, FALSE);
423 423 mutex_exit(&rp->r_statelock);
424 424 }
425 425
426 426 /*
427 427 * Use the passed in virtual attributes to check to see whether the
428 428 * data and metadata caches are valid, cache the new attributes, and
429 429 * then do the cache invalidation if required.
430 430 *
431 431 * The cache validation and caching of the new attributes is done
432 432 * atomically via the use of the mutex, r_statelock. If required,
433 433 * the cache invalidation is done atomically w.r.t. the cache
434 434 * validation and caching of the attributes via the pseudo lock,
435 435 * r_serial.
436 436 *
437 437 * This routine is used to do cache validation and attributes caching
438 438 * for operations with a single set of post operation attributes.
439 439 */
440 440
441 441 void
442 442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443 443 hrtime_t t, cred_t *cr, int async,
444 444 change_info4 *cinfo)
445 445 {
446 446 rnode4_t *rp;
447 447 int mtime_changed = 0;
448 448 int ctime_changed = 0;
449 449 vsecattr_t *vsp;
450 450 int was_serial, set_time_cache_inval, recov;
451 451 vattr_t *vap = &garp->n4g_va;
452 452 mntinfo4_t *mi = VTOMI4(vp);
453 453 len_t preattr_rsize;
454 454 boolean_t writemodify_set = B_FALSE;
455 455 boolean_t cachepurge_set = B_FALSE;
456 456
457 457 ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 458
459 459 /* Is curthread the recovery thread? */
460 460 mutex_enter(&mi->mi_lock);
461 461 recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 462 mutex_exit(&mi->mi_lock);
463 463
464 464 rp = VTOR4(vp);
465 465 mutex_enter(&rp->r_statelock);
466 466 was_serial = (rp->r_serial == curthread);
467 - if (rp->r_serial && !was_serial) {
468 - klwp_t *lwp = ttolwp(curthread);
469 -
467 + if (rp->r_serial != NULL && !was_serial) {
470 468 /*
471 - * If we're the recovery thread, then purge current attrs
472 - * and bail out to avoid potential deadlock between another
473 - * thread caching attrs (r_serial thread), recov thread,
474 - * and an async writer thread.
469 + * Purge current attrs and bail out to avoid potential deadlock
470 + * between another thread caching attrs (r_serial thread), this
471 + * thread, and a thread trying to read or write pages.
475 472 */
476 - if (recov) {
477 - PURGE_ATTRCACHE4_LOCKED(rp);
478 - mutex_exit(&rp->r_statelock);
479 - return;
480 - }
481 -
482 - if (lwp != NULL)
483 - lwp->lwp_nostop++;
484 - while (rp->r_serial != NULL) {
485 - if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 - mutex_exit(&rp->r_statelock);
487 - if (lwp != NULL)
488 - lwp->lwp_nostop--;
489 - return;
490 - }
491 - }
492 - if (lwp != NULL)
493 - lwp->lwp_nostop--;
473 + PURGE_ATTRCACHE4_LOCKED(rp);
474 + mutex_exit(&rp->r_statelock);
475 + return;
494 476 }
495 477
496 478 /*
497 479 * If there is a page flush thread, the current thread needs to
498 480 * bail out, to prevent a possible deadlock between the current
499 481 * thread (which might be in a start_op/end_op region), the
500 482 * recovery thread, and the page flush thread. Expire the
501 483 * attribute cache, so that any attributes the current thread was
502 484 * going to set are not lost.
503 485 */
504 486 if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 487 PURGE_ATTRCACHE4_LOCKED(rp);
506 488 mutex_exit(&rp->r_statelock);
507 489 return;
508 490 }
509 491
510 492 if (rp->r_time_attr_saved > t) {
511 493 /*
512 494 * Attributes have been cached since these attributes were
513 495 * probably made. If there is an inconsistency in what is
514 496 * cached, mark them invalid. If not, don't act on them.
515 497 */
516 498 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 499 PURGE_ATTRCACHE4_LOCKED(rp);
518 500 mutex_exit(&rp->r_statelock);
519 501 return;
520 502 }
521 503 set_time_cache_inval = 0;
522 504 if (cinfo) {
523 505 /*
524 506 * Only directory modifying callers pass non-NULL cinfo.
525 507 */
526 508 ASSERT(vp->v_type == VDIR);
527 509 /*
528 510 * If the cache timeout either doesn't exist or hasn't expired,
529 511 	 * and dir didn't change on server before dirmod op
530 512 * and dir didn't change after dirmod op but before getattr
531 513 * then there's a chance that the client's cached data for
532 514 * this object is current (not stale). No immediate cache
533 515 * flush is required.
534 516 *
535 517 */
536 518 if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 519 cinfo->before == rp->r_change &&
538 520 (garp->n4g_change_valid &&
539 521 cinfo->after == garp->n4g_change)) {
540 522
541 523 /*
542 524 * If atomic isn't set, then the before/after info
543 525 * cannot be blindly trusted. For this case, we tell
544 526 * nfs4_attrcache_va to cache the attrs but also
545 527 * establish an absolute maximum cache timeout. When
546 528 * the timeout is reached, caches will be flushed.
547 529 */
548 530 if (! cinfo->atomic)
549 531 set_time_cache_inval = 1;
550 532 } else {
551 533
552 534 /*
553 535 * We're not sure exactly what changed, but we know
554 536 * what to do. flush all caches for dir. remove the
555 537 * attr timeout.
556 538 *
557 539 * a) timeout expired. flush all caches.
558 540 * b) r_change != cinfo.before. flush all caches.
559 541 * c) r_change == cinfo.before, but cinfo.after !=
560 542 * post-op getattr(change). flush all caches.
561 543 * d) post-op getattr(change) not provided by server.
562 544 * flush all caches.
563 545 */
564 546 mtime_changed = 1;
565 547 ctime_changed = 1;
566 548 rp->r_time_cache_inval = 0;
567 549 }
568 550 } else {
569 551 /*
570 552 * Write thread after writing data to file on remote server,
571 553 * will always set R4WRITEMODIFIED to indicate that file on
572 554 * remote server was modified with a WRITE operation and would
573 555 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 556 * is set, then do not check for mtime and ctime change.
575 557 */
576 558 if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 559 if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 560 mtime_changed = 1;
579 561
580 562 if (rp->r_attr.va_ctime.tv_sec !=
581 563 vap->va_ctime.tv_sec ||
582 564 rp->r_attr.va_ctime.tv_nsec !=
583 565 vap->va_ctime.tv_nsec)
584 566 ctime_changed = 1;
585 567
586 568 /*
587 569 * If the change attribute was not provided by server
588 570 * or it differs, then flush all caches.
589 571 */
590 572 if (!garp->n4g_change_valid ||
591 573 rp->r_change != garp->n4g_change) {
592 574 mtime_changed = 1;
593 575 ctime_changed = 1;
594 576 }
595 577 } else {
596 578 writemodify_set = B_TRUE;
597 579 }
598 580 }
599 581
600 582 preattr_rsize = rp->r_size;
601 583
602 584 nfs4_attrcache_va(vp, garp, set_time_cache_inval);
603 585
604 586 /*
605 587 * If we have updated filesize in nfs4_attrcache_va, as soon as we
606 588 * drop statelock we will be in transition of purging all
607 589 * our caches and updating them. It is possible for another
608 590 * thread to pick this new file size and read in zeroed data.
609 591 * stall other threads till cache purge is complete.
610 592 */
611 593 if ((!cinfo) && (rp->r_size != preattr_rsize)) {
612 594 /*
613 595 * If R4WRITEMODIFIED was set and we have updated the file
614 596 * size, Server's returned file size need not necessarily
615 597 * be because of this Client's WRITE. We need to purge
616 598 * all caches.
617 599 */
618 600 if (writemodify_set)
619 601 mtime_changed = 1;
620 602
621 603 if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
622 604 rp->r_flags |= R4INCACHEPURGE;
623 605 cachepurge_set = B_TRUE;
624 606 }
625 607 }
626 608
627 609 if (!mtime_changed && !ctime_changed) {
628 610 mutex_exit(&rp->r_statelock);
629 611 return;
630 612 }
631 613
632 614 rp->r_serial = curthread;
633 615
634 616 mutex_exit(&rp->r_statelock);
635 617
636 618 /*
637 619 * If we're the recov thread, then force async nfs4_purge_caches
638 620 * to avoid potential deadlock.
639 621 */
640 622 if (mtime_changed)
641 623 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
642 624
643 625 if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
644 626 mutex_enter(&rp->r_statelock);
645 627 rp->r_flags &= ~R4INCACHEPURGE;
646 628 cv_broadcast(&rp->r_cv);
647 629 mutex_exit(&rp->r_statelock);
648 630 cachepurge_set = B_FALSE;
649 631 }
650 632
651 633 if (ctime_changed) {
652 634 (void) nfs4_access_purge_rp(rp);
653 635 if (rp->r_secattr != NULL) {
654 636 mutex_enter(&rp->r_statelock);
655 637 vsp = rp->r_secattr;
656 638 rp->r_secattr = NULL;
657 639 mutex_exit(&rp->r_statelock);
658 640 if (vsp != NULL)
659 641 nfs4_acl_free_cache(vsp);
660 642 }
661 643 }
662 644
663 645 if (!was_serial) {
664 646 mutex_enter(&rp->r_statelock);
665 647 rp->r_serial = NULL;
666 648 cv_broadcast(&rp->r_cv);
667 649 mutex_exit(&rp->r_statelock);
668 650 }
669 651 }
670 652
671 653 /*
672 654 * Set attributes cache for given vnode using virtual attributes.
673 655 *
674 656 * Set the timeout value on the attribute cache and fill it
675 657 * with the passed in attributes.
676 658 *
677 659 * The caller must be holding r_statelock.
678 660 */
679 661 static void
680 662 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
681 663 {
682 664 rnode4_t *rp;
683 665 mntinfo4_t *mi;
684 666 hrtime_t delta;
685 667 hrtime_t now;
686 668 vattr_t *vap = &garp->n4g_va;
687 669
688 670 rp = VTOR4(vp);
689 671
690 672 ASSERT(MUTEX_HELD(&rp->r_statelock));
691 673 ASSERT(vap->va_mask == AT_ALL);
692 674
693 675 /* Switch to master before checking v_flag */
694 676 if (IS_SHADOW(vp, rp))
695 677 vp = RTOV4(rp);
696 678
697 679 now = gethrtime();
698 680
699 681 mi = VTOMI4(vp);
700 682
701 683 /*
702 684 * Only establish a new cache timeout (if requested). Never
703 685 * extend a timeout. Never clear a timeout. Clearing a timeout
704 686 * is done by nfs4_update_dircaches (ancestor in our call chain)
705 687 */
706 688 if (set_cache_timeout && ! rp->r_time_cache_inval)
707 689 rp->r_time_cache_inval = now + mi->mi_acdirmax;
708 690
709 691 /*
710 692 * Delta is the number of nanoseconds that we will
711 693 * cache the attributes of the file. It is based on
712 694 * the number of nanoseconds since the last time that
713 695 * we detected a change. The assumption is that files
714 696 * that changed recently are likely to change again.
715 697 	 * There are minimum and maximum values, though, which are
716 698 	 * enforced for regular files and for directories.
717 699 *
718 700 * Using the time since last change was detected
719 701 * eliminates direct comparison or calculation
720 702 * using mixed client and server times. NFS does
721 703 * not make any assumptions regarding the client
722 704 * and server clocks being synchronized.
723 705 */
724 706 if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
725 707 vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
726 708 vap->va_size != rp->r_attr.va_size) {
727 709 rp->r_time_attr_saved = now;
728 710 }
729 711
730 712 if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
731 713 delta = 0;
732 714 else {
733 715 delta = now - rp->r_time_attr_saved;
734 716 if (vp->v_type == VDIR) {
735 717 if (delta < mi->mi_acdirmin)
736 718 delta = mi->mi_acdirmin;
737 719 else if (delta > mi->mi_acdirmax)
738 720 delta = mi->mi_acdirmax;
739 721 } else {
740 722 if (delta < mi->mi_acregmin)
741 723 delta = mi->mi_acregmin;
742 724 else if (delta > mi->mi_acregmax)
743 725 delta = mi->mi_acregmax;
744 726 }
745 727 }
746 728 rp->r_time_attr_inval = now + delta;
747 729
748 730 rp->r_attr = *vap;
749 731 if (garp->n4g_change_valid)
750 732 rp->r_change = garp->n4g_change;
751 733
752 734 /*
753 735 * The attributes that were returned may be valid and can
754 736 * be used, but they may not be allowed to be cached.
755 737 * Reset the timers to cause immediate invalidation and
756 738 	 * clear r_change so no VERIFY operations will succeed
757 739 */
758 740 if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
759 741 rp->r_time_attr_inval = now;
760 742 rp->r_time_attr_saved = now;
761 743 rp->r_change = 0;
762 744 }
763 745
764 746 /*
765 747 * If mounted_on_fileid returned AND the object is a stub,
766 748 * then set object's va_nodeid to the mounted over fid
767 749 * returned by server.
768 750 *
769 751 * If mounted_on_fileid not provided/supported, then
770 752 * just set it to 0 for now. Eventually it would be
771 753 * better to set it to a hashed version of FH. This
772 754 * would probably be good enough to provide a unique
773 755 * fid/d_ino within a dir.
774 756 *
775 757 * We don't need to carry mounted_on_fileid in the
776 758 * rnode as long as the client never requests fileid
777 759 * without also requesting mounted_on_fileid. For
778 760 * now, it stays.
779 761 */
780 762 if (garp->n4g_mon_fid_valid) {
781 763 rp->r_mntd_fid = garp->n4g_mon_fid;
782 764
783 765 if (RP_ISSTUB(rp))
784 766 rp->r_attr.va_nodeid = rp->r_mntd_fid;
785 767 }
786 768
787 769 /*
788 770 * Check to see if there are valid pathconf bits to
789 771 * cache in the rnode.
790 772 */
791 773 if (garp->n4g_ext_res) {
792 774 if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
793 775 rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
794 776 } else {
795 777 if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
796 778 rp->r_pathconf.pc4_xattr_valid = TRUE;
797 779 rp->r_pathconf.pc4_xattr_exists =
798 780 garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
799 781 }
800 782 }
801 783 }
802 784 /*
803 785 * Update the size of the file if there is no cached data or if
804 786 * the cached data is clean and there is no data being written
805 787 * out.
806 788 */
807 789 if (rp->r_size != vap->va_size &&
808 790 (!vn_has_cached_data(vp) ||
809 791 (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
810 792 rp->r_size = vap->va_size;
811 793 }
812 794 nfs_setswaplike(vp, vap);
813 795 rp->r_flags &= ~R4WRITEMODIFIED;
814 796 }
815 797
816 798 /*
817 799 * Get attributes over-the-wire and update attributes cache
818 800 * if no error occurred in the over-the-wire operation.
819 801 * Return 0 if successful, otherwise error.
820 802 */
821 803 int
822 804 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
823 805 {
824 806 mntinfo4_t *mi = VTOMI4(vp);
825 807 hrtime_t t;
826 808 nfs4_recov_state_t recov_state;
827 809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
828 810
829 811 recov_state.rs_flags = 0;
830 812 recov_state.rs_num_retry_despite_err = 0;
831 813
832 814 /* Save the original mount point security flavor */
833 815 (void) save_mnt_secinfo(mi->mi_curr_serv);
834 816
835 817 recov_retry:
836 818
837 819 if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
838 820 &recov_state, NULL))) {
839 821 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
840 822 return (e.error);
841 823 }
842 824
843 825 t = gethrtime();
844 826
845 827 nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
846 828
847 829 if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
848 830 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
849 831 NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE) {
850 832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
851 833 &recov_state, 1);
852 834 goto recov_retry;
853 835 }
854 836 }
855 837
856 838 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
857 839
858 840 if (!e.error) {
859 841 if (e.stat == NFS4_OK) {
860 842 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
861 843 } else {
862 844 e.error = geterrno4(e.stat);
863 845
864 846 nfs4_purge_stale_fh(e.error, vp, cr);
865 847 }
866 848 }
867 849
868 850 /*
869 851 * If getattr a node that is a stub for a crossed
870 852 * mount point, keep the original secinfo flavor for
871 853 * the current file system, not the crossed one.
872 854 */
873 855 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
874 856
875 857 return (e.error);
876 858 }
877 859
878 860 /*
879 861 * Generate a compound to get attributes over-the-wire.
880 862 */
881 863 void
882 864 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
883 865 nfs4_error_t *ep, cred_t *cr, int get_acl)
884 866 {
885 867 COMPOUND4args_clnt args;
886 868 COMPOUND4res_clnt res;
887 869 int doqueue;
888 870 rnode4_t *rp = VTOR4(vp);
889 871 nfs_argop4 argop[2];
890 872
891 873 args.ctag = TAG_GETATTR;
892 874
893 875 args.array_len = 2;
894 876 args.array = argop;
895 877
896 878 /* putfh */
897 879 argop[0].argop = OP_CPUTFH;
898 880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
899 881
900 882 /* getattr */
901 883 /*
902 884 * Unlike nfs version 2 and 3, where getattr returns all the
903 885 * attributes, nfs version 4 returns only the ones explicitly
904 886 * asked for. This creates problems, as some system functions
905 887 * (e.g. cache check) require certain attributes and if the
906 888 * cached node lacks some attributes such as uid/gid, it can
907 889 * affect system utilities (e.g. "ls") that rely on the information
908 890 * to be there. This can lead to anything from system crashes to
909 891 * corrupted information processed by user apps.
910 892 * So to ensure that all bases are covered, request at least
911 893 * the AT_ALL attribute mask.
912 894 */
913 895 argop[1].argop = OP_GETATTR;
914 896 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
915 897 if (get_acl)
916 898 argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
917 899 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
918 900
919 901 doqueue = 1;
920 902
921 903 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
922 904
923 905 if (ep->error)
924 906 return;
925 907
926 908 if (res.status != NFS4_OK) {
927 909 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
928 910 return;
929 911 }
930 912
931 913 *garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
932 914
933 915 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
934 916 }
935 917
936 918 /*
937 919 * Return either cached or remote attributes. If get remote attr
938 920 * use them to check and invalidate caches, then cache the new attributes.
939 921 */
940 922 int
941 923 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
942 924 {
943 925 int error;
944 926 rnode4_t *rp;
945 927 nfs4_ga_res_t gar;
946 928
947 929 ASSERT(nfs4_consistent_type(vp));
948 930
949 931 /*
950 932 * If we've got cached attributes, we're done, otherwise go
951 933 * to the server to get attributes, which will update the cache
952 934 * in the process. Either way, use the cached attributes for
953 935 * the caller's vattr_t.
954 936 *
955 937 * Note that we ignore the gar set by the OTW call: the attr caching
956 938 * code may make adjustments when storing to the rnode, and we want
957 939 * to see those changes here.
958 940 */
959 941 rp = VTOR4(vp);
960 942 error = 0;
961 943 mutex_enter(&rp->r_statelock);
962 944 if (!ATTRCACHE4_VALID(vp)) {
963 945 mutex_exit(&rp->r_statelock);
964 946 error = nfs4_getattr_otw(vp, &gar, cr, 0);
965 947 mutex_enter(&rp->r_statelock);
966 948 }
967 949
968 950 if (!error)
969 951 *vap = rp->r_attr;
970 952
971 953 /* Return the client's view of file size */
972 954 vap->va_size = rp->r_size;
973 955
974 956 mutex_exit(&rp->r_statelock);
975 957
976 958 ASSERT(nfs4_consistent_type(vp));
977 959
978 960 return (error);
979 961 }
980 962
981 963 int
982 964 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
983 965 nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
984 966 {
985 967 COMPOUND4args_clnt args;
986 968 COMPOUND4res_clnt res;
987 969 int doqueue;
988 970 nfs_argop4 argop[2];
989 971 mntinfo4_t *mi = VTOMI4(vp);
990 972 bool_t needrecov = FALSE;
991 973 nfs4_recov_state_t recov_state;
992 974 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
993 975 nfs4_ga_ext_res_t *gerp;
994 976
995 977 recov_state.rs_flags = 0;
996 978 recov_state.rs_num_retry_despite_err = 0;
997 979
998 980 recov_retry:
999 981 args.ctag = tag_type;
1000 982
1001 983 args.array_len = 2;
1002 984 args.array = argop;
1003 985
1004 986 e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
1005 987 if (e.error)
1006 988 return (e.error);
1007 989
1008 990 /* putfh */
1009 991 argop[0].argop = OP_CPUTFH;
1010 992 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1011 993
1012 994 /* getattr */
1013 995 argop[1].argop = OP_GETATTR;
1014 996 argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1015 997 argop[1].nfs_argop4_u.opgetattr.mi = mi;
1016 998
1017 999 doqueue = 1;
1018 1000
1019 1001 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1020 1002 "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1021 1003 rnode4info(VTOR4(vp))));
1022 1004
1023 1005 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1024 1006
1025 1007 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1026 1008 if (!needrecov && e.error) {
1027 1009 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1028 1010 needrecov);
1029 1011 return (e.error);
1030 1012 }
1031 1013
1032 1014 if (needrecov) {
1033 1015 bool_t abort;
1034 1016
1035 1017 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1036 1018 "nfs4_attr_otw: initiating recovery\n"));
1037 1019
1038 1020 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1039 1021 NULL, OP_GETATTR, NULL, NULL, NULL);
1040 1022 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1041 1023 needrecov);
1042 1024 if (!e.error) {
1043 1025 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1044 1026 e.error = geterrno4(res.status);
1045 1027 }
1046 1028 if (abort == FALSE)
1047 1029 goto recov_retry;
1048 1030 return (e.error);
1049 1031 }
1050 1032
1051 1033 if (res.status) {
1052 1034 e.error = geterrno4(res.status);
1053 1035 } else {
1054 1036 gerp = garp->n4g_ext_res;
1055 1037 bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1056 1038 garp, sizeof (nfs4_ga_res_t));
1057 1039 garp->n4g_ext_res = gerp;
1058 1040 if (garp->n4g_ext_res &&
1059 1041 res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1060 1042 bcopy(res.array[1].nfs_resop4_u.opgetattr.
1061 1043 ga_res.n4g_ext_res,
1062 1044 garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1063 1045 }
1064 1046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1065 1047 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1066 1048 needrecov);
1067 1049 return (e.error);
1068 1050 }
1069 1051
1070 1052 /*
1071 1053 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1072 1054 * for the demand-based allocation of async threads per-mount. The
1073 1055 * nfs_async_timeout is the amount of time a thread will live after it
1074 1056 * becomes idle, unless new I/O requests are received before the thread
1075 1057 * dies. See nfs4_async_putpage and nfs4_async_start.
1076 1058 */
1077 1059
1078 1060 static void nfs4_async_start(struct vfs *);
1079 1061 static void nfs4_async_pgops_start(struct vfs *);
1080 1062 static void nfs4_async_common_start(struct vfs *, int);
1081 1063
1082 1064 static void
1083 1065 free_async_args4(struct nfs4_async_reqs *args)
1084 1066 {
1085 1067 rnode4_t *rp;
1086 1068
1087 1069 if (args->a_io != NFS4_INACTIVE) {
1088 1070 rp = VTOR4(args->a_vp);
1089 1071 mutex_enter(&rp->r_statelock);
1090 1072 rp->r_count--;
1091 1073 if (args->a_io == NFS4_PUTAPAGE ||
1092 1074 args->a_io == NFS4_PAGEIO)
1093 1075 rp->r_awcount--;
1094 1076 cv_broadcast(&rp->r_cv);
1095 1077 mutex_exit(&rp->r_statelock);
1096 1078 VN_RELE(args->a_vp);
1097 1079 }
1098 1080 crfree(args->a_cred);
1099 1081 kmem_free(args, sizeof (*args));
1100 1082 }
1101 1083
1102 1084 /*
1103 1085 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1104 1086 * pageout(), running in the global zone, have legitimate reasons to do
1105 1087 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1106 1088 * use of a per-mount "asynchronous requests manager thread" which is
1107 1089 * signaled by the various asynchronous work routines when there is
1108 1090 * asynchronous work to be done. It is responsible for creating new
1109 1091 * worker threads if necessary, and notifying existing worker threads
1110 1092 * that there is work to be done.
1111 1093 *
1112 1094 * In other words, it will "take the specifications from the customers and
1113 1095 * give them to the engineers."
1114 1096 *
1115 1097 * Worker threads die off of their own accord if they are no longer
1116 1098 * needed.
1117 1099 *
1118 1100 * This thread is killed when the zone is going away or the filesystem
1119 1101 * is being unmounted.
1120 1102 */
1121 1103 void
1122 1104 nfs4_async_manager(vfs_t *vfsp)
1123 1105 {
1124 1106 callb_cpr_t cprinfo;
1125 1107 mntinfo4_t *mi;
1126 1108 uint_t max_threads;
1127 1109
1128 1110 mi = VFTOMI4(vfsp);
1129 1111
1130 1112 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1131 1113 "nfs4_async_manager");
1132 1114
1133 1115 mutex_enter(&mi->mi_async_lock);
1134 1116 /*
1135 1117 * We want to stash the max number of threads that this mount was
1136 1118 * allowed so we can use it later when the variable is set to zero as
1137 1119 * part of the zone/mount going away.
1138 1120 *
1139 1121 * We want to be able to create at least one thread to handle
1140 1122 * asynchronous inactive calls.
1141 1123 */
1142 1124 max_threads = MAX(mi->mi_max_threads, 1);
1143 1125 /*
1144 1126 * We don't want to wait for mi_max_threads to go to zero, since that
1145 1127 * happens as part of a failed unmount, but this thread should only
1146 1128 * exit when the mount is really going away.
1147 1129 *
1148 1130 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1149 1131 * attempted: the various _async_*() functions know to do things
1150 1132 * inline if mi_max_threads == 0. Henceforth we just drain out the
1151 1133 * outstanding requests.
1152 1134 *
1153 1135 * Note that we still create zthreads even if we notice the zone is
1154 1136 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1155 1137 * shutdown sequence to take slightly longer in some cases, but
1156 1138 * doesn't violate the protocol, as all threads will exit as soon as
1157 1139 * they're done processing the remaining requests.
1158 1140 */
1159 1141 for (;;) {
1160 1142 while (mi->mi_async_req_count > 0) {
1161 1143 /*
1162 1144 * Paranoia: If the mount started out having
1163 1145 * (mi->mi_max_threads == 0), and the value was
1164 1146 * later changed (via a debugger or somesuch),
1165 1147 * we could be confused since we will think we
1166 1148 * can't create any threads, and the calling
1167 1149 * code (which looks at the current value of
1168 1150 * mi->mi_max_threads, now non-zero) thinks we
1169 1151 * can.
1170 1152 *
1171 1153 * So, because we're paranoid, we create threads
1172 1154 * up to the maximum of the original and the
1173 1155 * current value. This means that future
1174 1156 * (debugger-induced) alterations of
1175 1157 * mi->mi_max_threads are ignored for our
1176 1158 * purposes, but who told them they could change
1177 1159 * random values on a live kernel anyhow?
1178 1160 */
1179 1161 if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1180 1162 MAX(mi->mi_max_threads, max_threads)) {
1181 1163 mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1182 1164 mutex_exit(&mi->mi_async_lock);
1183 1165 MI4_HOLD(mi);
1184 1166 VFS_HOLD(vfsp); /* hold for new thread */
1185 1167 (void) zthread_create(NULL, 0, nfs4_async_start,
1186 1168 vfsp, 0, minclsyspri);
1187 1169 mutex_enter(&mi->mi_async_lock);
1188 1170 } else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1189 1171 NUM_ASYNC_PGOPS_THREADS) {
1190 1172 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1191 1173 mutex_exit(&mi->mi_async_lock);
1192 1174 MI4_HOLD(mi);
1193 1175 VFS_HOLD(vfsp); /* hold for new thread */
1194 1176 (void) zthread_create(NULL, 0,
1195 1177 nfs4_async_pgops_start, vfsp, 0,
1196 1178 minclsyspri);
1197 1179 mutex_enter(&mi->mi_async_lock);
1198 1180 }
1199 1181 NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1200 1182 ASSERT(mi->mi_async_req_count != 0);
1201 1183 mi->mi_async_req_count--;
1202 1184 }
1203 1185
1204 1186 mutex_enter(&mi->mi_lock);
1205 1187 if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1206 1188 mutex_exit(&mi->mi_lock);
1207 1189 break;
1208 1190 }
1209 1191 mutex_exit(&mi->mi_lock);
1210 1192
1211 1193 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1212 1194 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1213 1195 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1214 1196 }
1215 1197
1216 1198 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1217 1199 "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1218 1200 /*
1219 1201 * Let everyone know we're done.
1220 1202 */
1221 1203 mi->mi_manager_thread = NULL;
1222 1204 /*
1223 1205 * Wake up the inactive thread.
1224 1206 */
1225 1207 cv_broadcast(&mi->mi_inact_req_cv);
1226 1208 /*
1227 1209 * Wake up anyone sitting in nfs4_async_manager_stop()
1228 1210 */
1229 1211 cv_broadcast(&mi->mi_async_cv);
1230 1212 /*
1231 1213 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1232 1214 * since CALLB_CPR_EXIT is actually responsible for releasing
1233 1215 * 'mi_async_lock'.
1234 1216 */
1235 1217 CALLB_CPR_EXIT(&cprinfo);
1236 1218 VFS_RELE(vfsp); /* release thread's hold */
1237 1219 MI4_RELE(mi);
1238 1220 zthread_exit();
1239 1221 }
1240 1222
1241 1223 /*
1242 1224 * Signal (and wait for) the async manager thread to clean up and go away.
1243 1225 */
1244 1226 void
1245 1227 nfs4_async_manager_stop(vfs_t *vfsp)
1246 1228 {
1247 1229 mntinfo4_t *mi = VFTOMI4(vfsp);
1248 1230
1249 1231 mutex_enter(&mi->mi_async_lock);
1250 1232 mutex_enter(&mi->mi_lock);
1251 1233 mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1252 1234 mutex_exit(&mi->mi_lock);
1253 1235 cv_broadcast(&mi->mi_async_reqs_cv);
1254 1236 /*
1255 1237 * Wait for the async manager thread to die.
1256 1238 */
1257 1239 while (mi->mi_manager_thread != NULL)
1258 1240 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1259 1241 mutex_exit(&mi->mi_async_lock);
1260 1242 }
1261 1243
1262 1244 int
1263 1245 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1264 1246 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1265 1247 u_offset_t, caddr_t, struct seg *, cred_t *))
1266 1248 {
1267 1249 rnode4_t *rp;
1268 1250 mntinfo4_t *mi;
1269 1251 struct nfs4_async_reqs *args;
1270 1252
1271 1253 rp = VTOR4(vp);
1272 1254 ASSERT(rp->r_freef == NULL);
1273 1255
1274 1256 mi = VTOMI4(vp);
1275 1257
1276 1258 /*
1277 1259 * If addr falls in a different segment, don't bother doing readahead.
1278 1260 */
1279 1261 if (addr >= seg->s_base + seg->s_size)
1280 1262 return (-1);
1281 1263
1282 1264 /*
1283 1265 * If we can't allocate a request structure, punt on the readahead.
1284 1266 */
1285 1267 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1286 1268 return (-1);
1287 1269
1288 1270 /*
1289 1271 * If a lock operation is pending, don't initiate any new
1290 1272 * readaheads. Otherwise, bump r_count to indicate the new
1291 1273 * asynchronous I/O.
1292 1274 */
1293 1275 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1294 1276 kmem_free(args, sizeof (*args));
1295 1277 return (-1);
1296 1278 }
1297 1279 mutex_enter(&rp->r_statelock);
1298 1280 rp->r_count++;
1299 1281 mutex_exit(&rp->r_statelock);
1300 1282 nfs_rw_exit(&rp->r_lkserlock);
1301 1283
1302 1284 args->a_next = NULL;
1303 1285 #ifdef DEBUG
1304 1286 args->a_queuer = curthread;
1305 1287 #endif
1306 1288 VN_HOLD(vp);
1307 1289 args->a_vp = vp;
1308 1290 ASSERT(cr != NULL);
1309 1291 crhold(cr);
1310 1292 args->a_cred = cr;
1311 1293 args->a_io = NFS4_READ_AHEAD;
1312 1294 args->a_nfs4_readahead = readahead;
1313 1295 args->a_nfs4_blkoff = blkoff;
1314 1296 args->a_nfs4_seg = seg;
1315 1297 args->a_nfs4_addr = addr;
1316 1298
1317 1299 mutex_enter(&mi->mi_async_lock);
1318 1300
1319 1301 /*
1320 1302 * If asyncio has been disabled, don't bother readahead.
1321 1303 */
1322 1304 if (mi->mi_max_threads == 0) {
1323 1305 mutex_exit(&mi->mi_async_lock);
1324 1306 goto noasync;
1325 1307 }
1326 1308
1327 1309 /*
1328 1310 * Link request structure into the async list and
1329 1311 * wakeup async thread to do the i/o.
1330 1312 */
1331 1313 if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1332 1314 mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1333 1315 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1334 1316 } else {
1335 1317 mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1336 1318 mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1337 1319 }
1338 1320
1339 1321 if (mi->mi_io_kstats) {
1340 1322 mutex_enter(&mi->mi_lock);
1341 1323 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1342 1324 mutex_exit(&mi->mi_lock);
1343 1325 }
1344 1326
1345 1327 mi->mi_async_req_count++;
1346 1328 ASSERT(mi->mi_async_req_count != 0);
1347 1329 cv_signal(&mi->mi_async_reqs_cv);
1348 1330 mutex_exit(&mi->mi_async_lock);
1349 1331 return (0);
1350 1332
1351 1333 noasync:
1352 1334 mutex_enter(&rp->r_statelock);
1353 1335 rp->r_count--;
1354 1336 cv_broadcast(&rp->r_cv);
1355 1337 mutex_exit(&rp->r_statelock);
1356 1338 VN_RELE(vp);
1357 1339 crfree(cr);
1358 1340 kmem_free(args, sizeof (*args));
1359 1341 return (-1);
1360 1342 }
1361 1343
1362 1344 static void
1363 1345 nfs4_async_start(struct vfs *vfsp)
1364 1346 {
1365 1347 nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
1366 1348 }
1367 1349
1368 1350 static void
1369 1351 nfs4_async_pgops_start(struct vfs *vfsp)
1370 1352 {
1371 1353 nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
1372 1354 }
1373 1355
1374 1356 /*
1375 1357 * The async queues for each mounted file system are arranged as a
1376 1358 * set of queues, one for each async i/o type. Requests are taken
1377 1359 * from the queues in a round-robin fashion. A number of consecutive
1378 1360 * requests are taken from each queue before moving on to the next
1379 1361 * queue. This functionality may allow the NFS Version 2 server to do
1380 1362 * write clustering, even if the client is mixing writes and reads
1381 1363 * because it will take multiple write requests from the queue
1382 1364 * before processing any of the other async i/o types.
1383 1365 *
1384 1366 * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1385 1367 * model defined by cpr to suspend the system. Specifically over the
1386 1368 * wire calls are cpr-unsafe. The thread should be reevaluated in
1387 1369 * case of future updates to the cpr model.
1388 1370 */
1389 1371 static void
1390 1372 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1391 1373 {
1392 1374 struct nfs4_async_reqs *args;
1393 1375 mntinfo4_t *mi = VFTOMI4(vfsp);
1394 1376 clock_t time_left = 1;
1395 1377 callb_cpr_t cprinfo;
1396 1378 int i;
1397 1379 extern int nfs_async_timeout;
1398 1380 int async_types;
1399 1381 kcondvar_t *async_work_cv;
1400 1382
1401 1383 if (async_queue == NFS4_ASYNC_QUEUE) {
1402 1384 async_types = NFS4_ASYNC_TYPES;
1403 1385 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1404 1386 } else {
1405 1387 async_types = NFS4_ASYNC_PGOPS_TYPES;
1406 1388 async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1407 1389 }
1408 1390
1409 1391 /*
1410 1392 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1411 1393 * built in an implementation independent manner.
1412 1394 */
1413 1395 if (nfs_async_timeout == -1)
1414 1396 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1415 1397
1416 1398 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1417 1399
1418 1400 mutex_enter(&mi->mi_async_lock);
1419 1401 for (;;) {
1420 1402 /*
1421 1403 * Find the next queue containing an entry. We start
1422 1404 * at the current queue pointer and then round robin
1423 1405 * through all of them until we either find a non-empty
1424 1406 * queue or have looked through all of them.
1425 1407 */
1426 1408 for (i = 0; i < async_types; i++) {
1427 1409 args = *mi->mi_async_curr[async_queue];
1428 1410 if (args != NULL)
1429 1411 break;
1430 1412 mi->mi_async_curr[async_queue]++;
1431 1413 if (mi->mi_async_curr[async_queue] ==
1432 1414 &mi->mi_async_reqs[async_types]) {
1433 1415 mi->mi_async_curr[async_queue] =
1434 1416 &mi->mi_async_reqs[0];
1435 1417 }
1436 1418 }
1437 1419 /*
1438 1420 		 * If we didn't find an entry, then block until woken up
1439 1421 * again and then look through the queues again.
1440 1422 */
1441 1423 if (args == NULL) {
1442 1424 /*
1443 1425 * Exiting is considered to be safe for CPR as well
1444 1426 */
1445 1427 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1446 1428
1447 1429 /*
1448 1430 * Wakeup thread waiting to unmount the file
1449 1431 * system only if all async threads are inactive.
1450 1432 *
1451 1433 * If we've timed-out and there's nothing to do,
1452 1434 * then get rid of this thread.
1453 1435 */
1454 1436 if (mi->mi_max_threads == 0 || time_left <= 0) {
1455 1437 --mi->mi_threads[async_queue];
1456 1438
1457 1439 if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1458 1440 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1459 1441 cv_signal(&mi->mi_async_cv);
1460 1442 CALLB_CPR_EXIT(&cprinfo);
1461 1443 VFS_RELE(vfsp); /* release thread's hold */
1462 1444 MI4_RELE(mi);
1463 1445 zthread_exit();
1464 1446 /* NOTREACHED */
1465 1447 }
1466 1448 time_left = cv_reltimedwait(async_work_cv,
1467 1449 &mi->mi_async_lock, nfs_async_timeout,
1468 1450 TR_CLOCK_TICK);
1469 1451
1470 1452 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1471 1453
1472 1454 continue;
1473 1455 } else {
1474 1456 time_left = 1;
1475 1457 }
1476 1458
1477 1459 /*
1478 1460 * Remove the request from the async queue and then
1479 1461 * update the current async request queue pointer. If
1480 1462 * the current queue is empty or we have removed enough
1481 1463 * consecutive entries from it, then reset the counter
1482 1464 * for this queue and then move the current pointer to
1483 1465 * the next queue.
1484 1466 */
1485 1467 *mi->mi_async_curr[async_queue] = args->a_next;
1486 1468 if (*mi->mi_async_curr[async_queue] == NULL ||
1487 1469 --mi->mi_async_clusters[args->a_io] == 0) {
1488 1470 mi->mi_async_clusters[args->a_io] =
1489 1471 mi->mi_async_init_clusters;
1490 1472 mi->mi_async_curr[async_queue]++;
1491 1473 if (mi->mi_async_curr[async_queue] ==
1492 1474 &mi->mi_async_reqs[async_types]) {
1493 1475 mi->mi_async_curr[async_queue] =
1494 1476 &mi->mi_async_reqs[0];
1495 1477 }
1496 1478 }
1497 1479
1498 1480 if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1499 1481 mutex_enter(&mi->mi_lock);
1500 1482 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1501 1483 mutex_exit(&mi->mi_lock);
1502 1484 }
1503 1485
1504 1486 mutex_exit(&mi->mi_async_lock);
1505 1487
1506 1488 /*
1507 1489 * Obtain arguments from the async request structure.
1508 1490 */
1509 1491 if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1510 1492 (*args->a_nfs4_readahead)(args->a_vp,
1511 1493 args->a_nfs4_blkoff, args->a_nfs4_addr,
1512 1494 args->a_nfs4_seg, args->a_cred);
1513 1495 } else if (args->a_io == NFS4_PUTAPAGE) {
1514 1496 (void) (*args->a_nfs4_putapage)(args->a_vp,
1515 1497 args->a_nfs4_pp, args->a_nfs4_off,
1516 1498 args->a_nfs4_len, args->a_nfs4_flags,
1517 1499 args->a_cred);
1518 1500 } else if (args->a_io == NFS4_PAGEIO) {
1519 1501 (void) (*args->a_nfs4_pageio)(args->a_vp,
1520 1502 args->a_nfs4_pp, args->a_nfs4_off,
1521 1503 args->a_nfs4_len, args->a_nfs4_flags,
1522 1504 args->a_cred);
1523 1505 } else if (args->a_io == NFS4_READDIR) {
1524 1506 (void) ((*args->a_nfs4_readdir)(args->a_vp,
1525 1507 args->a_nfs4_rdc, args->a_cred));
1526 1508 } else if (args->a_io == NFS4_COMMIT) {
1527 1509 (*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1528 1510 args->a_nfs4_offset, args->a_nfs4_count,
1529 1511 args->a_cred);
1530 1512 } else if (args->a_io == NFS4_INACTIVE) {
1531 1513 nfs4_inactive_otw(args->a_vp, args->a_cred);
1532 1514 }
1533 1515
1534 1516 /*
1535 1517 * Now, release the vnode and free the credentials
1536 1518 * structure.
1537 1519 */
1538 1520 free_async_args4(args);
1539 1521 /*
1540 1522 * Reacquire the mutex because it will be needed above.
1541 1523 */
1542 1524 mutex_enter(&mi->mi_async_lock);
1543 1525 }
1544 1526 }
1545 1527
1546 1528 /*
1547 1529 * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1548 1530 * part of VOP_INACTIVE.
1549 1531 */
1550 1532
1551 1533 void
1552 1534 nfs4_inactive_thread(mntinfo4_t *mi)
1553 1535 {
1554 1536 struct nfs4_async_reqs *args;
1555 1537 callb_cpr_t cprinfo;
1556 1538 vfs_t *vfsp = mi->mi_vfsp;
1557 1539
1558 1540 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1559 1541 "nfs4_inactive_thread");
1560 1542
1561 1543 for (;;) {
1562 1544 mutex_enter(&mi->mi_async_lock);
1563 1545 args = mi->mi_async_reqs[NFS4_INACTIVE];
1564 1546 if (args == NULL) {
1565 1547 mutex_enter(&mi->mi_lock);
1566 1548 /*
1567 1549 * We don't want to exit until the async manager is done
1568 1550 * with its work; hence the check for mi_manager_thread
1569 1551 * being NULL.
1570 1552 *
1571 1553 * The async manager thread will cv_broadcast() on
1572 1554 * mi_inact_req_cv when it's done, at which point we'll
1573 1555 * wake up and exit.
1574 1556 */
1575 1557 if (mi->mi_manager_thread == NULL)
1576 1558 goto die;
1577 1559 mi->mi_flags |= MI4_INACTIVE_IDLE;
1578 1560 mutex_exit(&mi->mi_lock);
1579 1561 cv_signal(&mi->mi_async_cv);
1580 1562 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1581 1563 cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1582 1564 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1583 1565 mutex_exit(&mi->mi_async_lock);
1584 1566 } else {
1585 1567 mutex_enter(&mi->mi_lock);
1586 1568 mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1587 1569 mutex_exit(&mi->mi_lock);
1588 1570 mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1589 1571 mutex_exit(&mi->mi_async_lock);
1590 1572 nfs4_inactive_otw(args->a_vp, args->a_cred);
1591 1573 crfree(args->a_cred);
1592 1574 kmem_free(args, sizeof (*args));
1593 1575 }
1594 1576 }
1595 1577 die:
1596 1578 mutex_exit(&mi->mi_lock);
1597 1579 mi->mi_inactive_thread = NULL;
1598 1580 cv_signal(&mi->mi_async_cv);
1599 1581
1600 1582 /*
1601 1583 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1602 1584 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1603 1585 */
1604 1586 CALLB_CPR_EXIT(&cprinfo);
1605 1587
1606 1588 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1607 1589 "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1608 1590
1609 1591 MI4_RELE(mi);
1610 1592 zthread_exit();
1611 1593 /* NOTREACHED */
1612 1594 }
1613 1595
1614 1596 /*
1615 1597 * nfs_async_stop:
1616 1598 * Wait for all outstanding putpage operations and the inactive thread to
1617 1599 * complete; nfs4_async_stop_sig() without interruptibility.
1618 1600 */
1619 1601 void
1620 1602 nfs4_async_stop(struct vfs *vfsp)
1621 1603 {
1622 1604 mntinfo4_t *mi = VFTOMI4(vfsp);
1623 1605
1624 1606 /*
1625 1607 * Wait for all outstanding async operations to complete and for
1626 1608 * worker threads to exit.
1627 1609 */
1628 1610 mutex_enter(&mi->mi_async_lock);
1629 1611 mi->mi_max_threads = 0;
1630 1612 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1631 1613 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1632 1614 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1633 1615 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1634 1616
1635 1617 /*
1636 1618 * Wait for the inactive thread to finish doing what it's doing. It
1637 1619 * won't exit until the last reference to the vfs_t goes away.
1638 1620 */
1639 1621 if (mi->mi_inactive_thread != NULL) {
1640 1622 mutex_enter(&mi->mi_lock);
1641 1623 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1642 1624 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1643 1625 mutex_exit(&mi->mi_lock);
1644 1626 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1645 1627 mutex_enter(&mi->mi_lock);
1646 1628 }
1647 1629 mutex_exit(&mi->mi_lock);
1648 1630 }
1649 1631 mutex_exit(&mi->mi_async_lock);
1650 1632 }
1651 1633
1652 1634 /*
1653 1635 * nfs4_async_stop_sig:
1654 1636 * Wait for all outstanding putpage operations and the inactive thread to
1655 1637 * complete. If a signal is delivered we will abort and return non-zero;
1656 1638 * otherwise return 0. Since this routine is called from nfs4_unmount, we
1657 1639 * need to make it interruptible.
1658 1640 */
1659 1641 int
1660 1642 nfs4_async_stop_sig(struct vfs *vfsp)
1661 1643 {
1662 1644 mntinfo4_t *mi = VFTOMI4(vfsp);
1663 1645 ushort_t omax;
1664 1646 bool_t intr = FALSE;
1665 1647
1666 1648 /*
1667 1649 * Wait for all outstanding putpage operations to complete and for
1668 1650 * worker threads to exit.
1669 1651 */
1670 1652 mutex_enter(&mi->mi_async_lock);
1671 1653 omax = mi->mi_max_threads;
1672 1654 mi->mi_max_threads = 0;
1673 1655 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1674 1656 while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1675 1657 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1676 1658 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1677 1659 intr = TRUE;
1678 1660 goto interrupted;
1679 1661 }
1680 1662 }
1681 1663
1682 1664 /*
1683 1665 * Wait for the inactive thread to finish doing what it's doing. It
1684 1666 * won't exit until the last reference to the vfs_t goes away.
1685 1667 */
1686 1668 if (mi->mi_inactive_thread != NULL) {
1687 1669 mutex_enter(&mi->mi_lock);
1688 1670 while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1689 1671 (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1690 1672 mutex_exit(&mi->mi_lock);
1691 1673 if (!cv_wait_sig(&mi->mi_async_cv,
1692 1674 &mi->mi_async_lock)) {
1693 1675 intr = TRUE;
1694 1676 goto interrupted;
1695 1677 }
1696 1678 mutex_enter(&mi->mi_lock);
1697 1679 }
1698 1680 mutex_exit(&mi->mi_lock);
1699 1681 }
1700 1682 interrupted:
1701 1683 if (intr)
1702 1684 mi->mi_max_threads = omax;
1703 1685 mutex_exit(&mi->mi_async_lock);
1704 1686
1705 1687 return (intr);
1706 1688 }
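
/*
 * Illustrative sketch only (not part of this change): how an unmount
 * path might use nfs4_async_stop_sig() above.  The helper name and the
 * EINTR mapping are assumptions for illustration.
 */
static int
example_umount_quiesce(struct vfs *vfsp)
{
	/* Interruptible wait for async workers and the inactive thread. */
	if (nfs4_async_stop_sig(vfsp))
		return (EINTR);		/* caller may abort the unmount */
	return (0);
}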
1707 1689
1708 1690 int
1709 1691 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1710 1692 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1711 1693 u_offset_t, size_t, int, cred_t *))
1712 1694 {
1713 1695 rnode4_t *rp;
1714 1696 mntinfo4_t *mi;
1715 1697 struct nfs4_async_reqs *args;
1716 1698
1717 1699 ASSERT(flags & B_ASYNC);
1718 1700 ASSERT(vp->v_vfsp != NULL);
1719 1701
1720 1702 rp = VTOR4(vp);
1721 1703 ASSERT(rp->r_count > 0);
1722 1704
1723 1705 mi = VTOMI4(vp);
1724 1706
1725 1707 /*
1726 1708 * If we can't allocate a request structure, do the putpage
1727 1709 * operation synchronously in this thread's context.
1728 1710 */
1729 1711 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1730 1712 goto noasync;
1731 1713
1732 1714 args->a_next = NULL;
1733 1715 #ifdef DEBUG
1734 1716 args->a_queuer = curthread;
1735 1717 #endif
1736 1718 VN_HOLD(vp);
1737 1719 args->a_vp = vp;
1738 1720 ASSERT(cr != NULL);
1739 1721 crhold(cr);
1740 1722 args->a_cred = cr;
1741 1723 args->a_io = NFS4_PUTAPAGE;
1742 1724 args->a_nfs4_putapage = putapage;
1743 1725 args->a_nfs4_pp = pp;
1744 1726 args->a_nfs4_off = off;
1745 1727 args->a_nfs4_len = (uint_t)len;
1746 1728 args->a_nfs4_flags = flags;
1747 1729
1748 1730 mutex_enter(&mi->mi_async_lock);
1749 1731
1750 1732 /*
1751 1733 * If asyncio has been disabled, then make a synchronous request.
1752 1734 * This check is done a second time in case async io was disabled
1753 1735 * while this thread was blocked waiting for memory pressure to
1754 1736 * reduce or for the queue to drain.
1755 1737 */
1756 1738 if (mi->mi_max_threads == 0) {
1757 1739 mutex_exit(&mi->mi_async_lock);
1758 1740
1759 1741 VN_RELE(vp);
1760 1742 crfree(cr);
1761 1743 kmem_free(args, sizeof (*args));
1762 1744 goto noasync;
1763 1745 }
1764 1746
1765 1747 /*
1766 1748 * Link request structure into the async list and
1767 1749 * wakeup async thread to do the i/o.
1768 1750 */
1769 1751 if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1770 1752 mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1771 1753 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1772 1754 } else {
1773 1755 mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1774 1756 mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1775 1757 }
1776 1758
1777 1759 mutex_enter(&rp->r_statelock);
1778 1760 rp->r_count++;
1779 1761 rp->r_awcount++;
1780 1762 mutex_exit(&rp->r_statelock);
1781 1763
1782 1764 if (mi->mi_io_kstats) {
1783 1765 mutex_enter(&mi->mi_lock);
1784 1766 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1785 1767 mutex_exit(&mi->mi_lock);
1786 1768 }
1787 1769
1788 1770 mi->mi_async_req_count++;
1789 1771 ASSERT(mi->mi_async_req_count != 0);
1790 1772 cv_signal(&mi->mi_async_reqs_cv);
1791 1773 mutex_exit(&mi->mi_async_lock);
1792 1774 return (0);
1793 1775
1794 1776 noasync:
1795 1777
1796 1778 if (curproc == proc_pageout || curproc == proc_fsflush) {
1797 1779 /*
1798 1780 * If we get here in the context of the pageout/fsflush,
1799 1781 * or we have run out of memory, or we're attempting to
1800 1782 * unmount, we refuse to do a sync write, because this may
1801 1783 * hang pageout/fsflush and the machine. In this case,
1802 1784 * we just re-mark the page as dirty and punt on the page.
1803 1785 *
1804 1786 * Make sure B_FORCE isn't set. We can re-mark the
1805 1787 * pages as dirty and unlock the pages in one swoop by
1806 1788 * passing in B_ERROR to pvn_write_done(). However,
1807 1789 * we should make sure B_FORCE isn't set - we don't
1808 1790 * want the page tossed before it gets written out.
1809 1791 */
1810 1792 if (flags & B_FORCE)
1811 1793 flags &= ~(B_INVAL | B_FORCE);
1812 1794 pvn_write_done(pp, flags | B_ERROR);
1813 1795 return (0);
1814 1796 }
1815 1797
1816 1798 if (nfs_zone() != mi->mi_zone) {
1817 1799 /*
1818 1800 * So this was a cross-zone sync putpage.
1819 1801 *
1820 1802 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1821 1803 * as dirty and unlock them.
1822 1804 *
1823 1805 * We don't want to clear B_FORCE here as the caller presumably
1824 1806 * knows what they're doing if they set it.
1825 1807 */
1826 1808 pvn_write_done(pp, flags | B_ERROR);
1827 1809 return (EPERM);
1828 1810 }
1829 1811 return ((*putapage)(vp, pp, off, len, flags, cr));
1830 1812 }
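
/*
 * Illustrative sketch only: the tail-insert queueing pattern shared by
 * the async request routines above ('q' would be NFS4_PUTAPAGE,
 * NFS4_PAGEIO, etc.).  The helper name is hypothetical.
 */
static void
example_async_enqueue(mntinfo4_t *mi, int q, struct nfs4_async_reqs *args)
{
	ASSERT(MUTEX_HELD(&mi->mi_async_lock));

	if (mi->mi_async_reqs[q] == NULL) {
		/* empty queue: head and tail both point at the new request */
		mi->mi_async_reqs[q] = args;
		mi->mi_async_tail[q] = args;
	} else {
		/* append to the tail */
		mi->mi_async_tail[q]->a_next = args;
		mi->mi_async_tail[q] = args;
	}
	mi->mi_async_req_count++;
	cv_signal(&mi->mi_async_reqs_cv);
}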
1831 1813
1832 1814 int
1833 1815 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1834 1816 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1835 1817 size_t, int, cred_t *))
1836 1818 {
1837 1819 rnode4_t *rp;
1838 1820 mntinfo4_t *mi;
1839 1821 struct nfs4_async_reqs *args;
1840 1822
1841 1823 ASSERT(flags & B_ASYNC);
1842 1824 ASSERT(vp->v_vfsp != NULL);
1843 1825
1844 1826 rp = VTOR4(vp);
1845 1827 ASSERT(rp->r_count > 0);
1846 1828
1847 1829 mi = VTOMI4(vp);
1848 1830
1849 1831 /*
1850 1832 * If we can't allocate a request structure, do the pageio
1851 1833 * request synchronously in this thread's context.
1852 1834 */
1853 1835 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1854 1836 goto noasync;
1855 1837
1856 1838 args->a_next = NULL;
1857 1839 #ifdef DEBUG
1858 1840 args->a_queuer = curthread;
1859 1841 #endif
1860 1842 VN_HOLD(vp);
1861 1843 args->a_vp = vp;
1862 1844 ASSERT(cr != NULL);
1863 1845 crhold(cr);
1864 1846 args->a_cred = cr;
1865 1847 args->a_io = NFS4_PAGEIO;
1866 1848 args->a_nfs4_pageio = pageio;
1867 1849 args->a_nfs4_pp = pp;
1868 1850 args->a_nfs4_off = io_off;
1869 1851 args->a_nfs4_len = (uint_t)io_len;
1870 1852 args->a_nfs4_flags = flags;
1871 1853
1872 1854 mutex_enter(&mi->mi_async_lock);
1873 1855
1874 1856 /*
1875 1857 * If asyncio has been disabled, then make a synchronous request.
1876 1858 * This check is done a second time in case async io was disabled
1877 1859 * while this thread was blocked waiting for memory pressure to
1878 1860 * reduce or for the queue to drain.
1879 1861 */
1880 1862 if (mi->mi_max_threads == 0) {
1881 1863 mutex_exit(&mi->mi_async_lock);
1882 1864
1883 1865 VN_RELE(vp);
1884 1866 crfree(cr);
1885 1867 kmem_free(args, sizeof (*args));
1886 1868 goto noasync;
1887 1869 }
1888 1870
1889 1871 /*
1890 1872 * Link request structure into the async list and
1891 1873 * wakeup async thread to do the i/o.
1892 1874 */
1893 1875 if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1894 1876 mi->mi_async_reqs[NFS4_PAGEIO] = args;
1895 1877 mi->mi_async_tail[NFS4_PAGEIO] = args;
1896 1878 } else {
1897 1879 mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1898 1880 mi->mi_async_tail[NFS4_PAGEIO] = args;
1899 1881 }
1900 1882
1901 1883 mutex_enter(&rp->r_statelock);
1902 1884 rp->r_count++;
1903 1885 rp->r_awcount++;
1904 1886 mutex_exit(&rp->r_statelock);
1905 1887
1906 1888 if (mi->mi_io_kstats) {
1907 1889 mutex_enter(&mi->mi_lock);
1908 1890 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1909 1891 mutex_exit(&mi->mi_lock);
1910 1892 }
1911 1893
1912 1894 mi->mi_async_req_count++;
1913 1895 ASSERT(mi->mi_async_req_count != 0);
1914 1896 cv_signal(&mi->mi_async_reqs_cv);
1915 1897 mutex_exit(&mi->mi_async_lock);
1916 1898 return (0);
1917 1899
1918 1900 noasync:
1919 1901 /*
1920 1902 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1921 1903 * the page list), for writes we do it synchronously, except for
1922 1904 * proc_pageout/proc_fsflush as described below.
1923 1905 */
1924 1906 if (flags & B_READ) {
1925 1907 pvn_read_done(pp, flags | B_ERROR);
1926 1908 return (0);
1927 1909 }
1928 1910
1929 1911 if (curproc == proc_pageout || curproc == proc_fsflush) {
1930 1912 /*
1931 1913 * If we get here in the context of the pageout/fsflush,
1932 1914 * we refuse to do a sync write, because this may hang
1933 1915 * pageout/fsflush (and the machine). In this case, we just
1934 1916 * re-mark the page as dirty and punt on the page.
1935 1917 *
1936 1918 * Make sure B_FORCE isn't set. We can re-mark the
1937 1919 * pages as dirty and unlock the pages in one swoop by
1938 1920 * passing in B_ERROR to pvn_write_done(). However,
1939 1921 * we should make sure B_FORCE isn't set - we don't
1940 1922 * want the page tossed before it gets written out.
1941 1923 */
1942 1924 if (flags & B_FORCE)
1943 1925 flags &= ~(B_INVAL | B_FORCE);
1944 1926 pvn_write_done(pp, flags | B_ERROR);
1945 1927 return (0);
1946 1928 }
1947 1929
1948 1930 if (nfs_zone() != mi->mi_zone) {
1949 1931 /*
1950 1932 * So this was a cross-zone sync pageio. We pass in B_ERROR
1951 1933 * to pvn_write_done() to re-mark the pages as dirty and unlock
1952 1934 * them.
1953 1935 *
1954 1936 * We don't want to clear B_FORCE here as the caller presumably
1955 1937 * knows what they're doing if they set it.
1956 1938 */
1957 1939 pvn_write_done(pp, flags | B_ERROR);
1958 1940 return (EPERM);
1959 1941 }
1960 1942 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1961 1943 }
1962 1944
1963 1945 void
1964 1946 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1965 1947 int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1966 1948 {
1967 1949 rnode4_t *rp;
1968 1950 mntinfo4_t *mi;
1969 1951 struct nfs4_async_reqs *args;
1970 1952
1971 1953 rp = VTOR4(vp);
1972 1954 ASSERT(rp->r_freef == NULL);
1973 1955
1974 1956 mi = VTOMI4(vp);
1975 1957
1976 1958 /*
1977 1959 * If we can't allocate a request structure, skip the readdir.
1978 1960 */
1979 1961 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1980 1962 goto noasync;
1981 1963
1982 1964 args->a_next = NULL;
1983 1965 #ifdef DEBUG
1984 1966 args->a_queuer = curthread;
1985 1967 #endif
1986 1968 VN_HOLD(vp);
1987 1969 args->a_vp = vp;
1988 1970 ASSERT(cr != NULL);
1989 1971 crhold(cr);
1990 1972 args->a_cred = cr;
1991 1973 args->a_io = NFS4_READDIR;
1992 1974 args->a_nfs4_readdir = readdir;
1993 1975 args->a_nfs4_rdc = rdc;
1994 1976
1995 1977 mutex_enter(&mi->mi_async_lock);
1996 1978
1997 1979 /*
1998 1980 * If asyncio has been disabled, then skip this request
1999 1981 */
2000 1982 if (mi->mi_max_threads == 0) {
2001 1983 mutex_exit(&mi->mi_async_lock);
2002 1984
2003 1985 VN_RELE(vp);
2004 1986 crfree(cr);
2005 1987 kmem_free(args, sizeof (*args));
2006 1988 goto noasync;
2007 1989 }
2008 1990
2009 1991 /*
2010 1992 * Link request structure into the async list and
2011 1993 * wakeup async thread to do the i/o.
2012 1994 */
2013 1995 if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2014 1996 mi->mi_async_reqs[NFS4_READDIR] = args;
2015 1997 mi->mi_async_tail[NFS4_READDIR] = args;
2016 1998 } else {
2017 1999 mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2018 2000 mi->mi_async_tail[NFS4_READDIR] = args;
2019 2001 }
2020 2002
2021 2003 mutex_enter(&rp->r_statelock);
2022 2004 rp->r_count++;
2023 2005 mutex_exit(&rp->r_statelock);
2024 2006
2025 2007 if (mi->mi_io_kstats) {
2026 2008 mutex_enter(&mi->mi_lock);
2027 2009 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2028 2010 mutex_exit(&mi->mi_lock);
2029 2011 }
2030 2012
2031 2013 mi->mi_async_req_count++;
2032 2014 ASSERT(mi->mi_async_req_count != 0);
2033 2015 cv_signal(&mi->mi_async_reqs_cv);
2034 2016 mutex_exit(&mi->mi_async_lock);
2035 2017 return;
2036 2018
2037 2019 noasync:
2038 2020 mutex_enter(&rp->r_statelock);
2039 2021 rdc->entries = NULL;
2040 2022 /*
2041 2023 * Indicate that no one is trying to fill this entry and
2042 2024 * it still needs to be filled.
2043 2025 */
2044 2026 rdc->flags &= ~RDDIR;
2045 2027 rdc->flags |= RDDIRREQ;
2046 2028 rddir4_cache_rele(rp, rdc);
2047 2029 mutex_exit(&rp->r_statelock);
2048 2030 }
2049 2031
2050 2032 void
2051 2033 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2052 2034 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2053 2035 cred_t *))
2054 2036 {
2055 2037 rnode4_t *rp;
2056 2038 mntinfo4_t *mi;
2057 2039 struct nfs4_async_reqs *args;
2058 2040 page_t *pp;
2059 2041
2060 2042 rp = VTOR4(vp);
2061 2043 mi = VTOMI4(vp);
2062 2044
2063 2045 /*
2064 2046 * If we can't allocate a request structure, do the commit
2065 2047 * operation synchronously in this thread's context.
2066 2048 */
2067 2049 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2068 2050 goto noasync;
2069 2051
2070 2052 args->a_next = NULL;
2071 2053 #ifdef DEBUG
2072 2054 args->a_queuer = curthread;
2073 2055 #endif
2074 2056 VN_HOLD(vp);
2075 2057 args->a_vp = vp;
2076 2058 ASSERT(cr != NULL);
2077 2059 crhold(cr);
2078 2060 args->a_cred = cr;
2079 2061 args->a_io = NFS4_COMMIT;
2080 2062 args->a_nfs4_commit = commit;
2081 2063 args->a_nfs4_plist = plist;
2082 2064 args->a_nfs4_offset = offset;
2083 2065 args->a_nfs4_count = count;
2084 2066
2085 2067 mutex_enter(&mi->mi_async_lock);
2086 2068
2087 2069 /*
2088 2070 * If asyncio has been disabled, then make a synchronous request.
2089 2071 * This check is done a second time in case async io was disabled
2090 2072 * while this thread was blocked waiting for memory pressure to
2091 2073 * reduce or for the queue to drain.
2092 2074 */
2093 2075 if (mi->mi_max_threads == 0) {
2094 2076 mutex_exit(&mi->mi_async_lock);
2095 2077
2096 2078 VN_RELE(vp);
2097 2079 crfree(cr);
2098 2080 kmem_free(args, sizeof (*args));
2099 2081 goto noasync;
2100 2082 }
2101 2083
2102 2084 /*
2103 2085 * Link request structure into the async list and
2104 2086 * wakeup async thread to do the i/o.
2105 2087 */
2106 2088 if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2107 2089 mi->mi_async_reqs[NFS4_COMMIT] = args;
2108 2090 mi->mi_async_tail[NFS4_COMMIT] = args;
2109 2091 } else {
2110 2092 mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2111 2093 mi->mi_async_tail[NFS4_COMMIT] = args;
2112 2094 }
2113 2095
2114 2096 mutex_enter(&rp->r_statelock);
2115 2097 rp->r_count++;
2116 2098 mutex_exit(&rp->r_statelock);
2117 2099
2118 2100 if (mi->mi_io_kstats) {
2119 2101 mutex_enter(&mi->mi_lock);
2120 2102 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2121 2103 mutex_exit(&mi->mi_lock);
2122 2104 }
2123 2105
2124 2106 mi->mi_async_req_count++;
2125 2107 ASSERT(mi->mi_async_req_count != 0);
2126 2108 cv_signal(&mi->mi_async_reqs_cv);
2127 2109 mutex_exit(&mi->mi_async_lock);
2128 2110 return;
2129 2111
2130 2112 noasync:
2131 2113 if (curproc == proc_pageout || curproc == proc_fsflush ||
2132 2114 nfs_zone() != mi->mi_zone) {
2133 2115 while (plist != NULL) {
2134 2116 pp = plist;
2135 2117 page_sub(&plist, pp);
2136 2118 pp->p_fsdata = C_COMMIT;
2137 2119 page_unlock(pp);
2138 2120 }
2139 2121 return;
2140 2122 }
2141 2123 (*commit)(vp, plist, offset, count, cr);
2142 2124 }
2143 2125
2144 2126 /*
2145 2127 * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread. The
2146 2128 * reference to the vnode is handed over to the thread; the caller should
2147 2129 * no longer refer to the vnode.
2148 2130 *
2149 2131 * Unlike most of the async routines, this handoff is needed for
2150 2132 * correctness reasons, not just performance. So doing operations in the
2151 2133 * context of the current thread is not an option.
2152 2134 */
2153 2135 void
2154 2136 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2155 2137 {
2156 2138 mntinfo4_t *mi;
2157 2139 struct nfs4_async_reqs *args;
2158 2140 boolean_t signal_inactive_thread = B_FALSE;
2159 2141
2160 2142 mi = VTOMI4(vp);
2161 2143
2162 2144 args = kmem_alloc(sizeof (*args), KM_SLEEP);
2163 2145 args->a_next = NULL;
2164 2146 #ifdef DEBUG
2165 2147 args->a_queuer = curthread;
2166 2148 #endif
2167 2149 args->a_vp = vp;
2168 2150 ASSERT(cr != NULL);
2169 2151 crhold(cr);
2170 2152 args->a_cred = cr;
2171 2153 args->a_io = NFS4_INACTIVE;
2172 2154
2173 2155 /*
2174 2156 * Note that we don't check mi->mi_max_threads here, since we
2175 2157 * *need* to get rid of this vnode regardless of whether someone
2176 2158 * set nfs4_max_threads to zero in /etc/system.
2177 2159 *
2178 2160 * The manager thread knows about this and is willing to create
2179 2161 * at least one thread to accommodate us.
2180 2162 */
2181 2163 mutex_enter(&mi->mi_async_lock);
2182 2164 if (mi->mi_inactive_thread == NULL) {
2183 2165 rnode4_t *rp;
2184 2166 vnode_t *unldvp = NULL;
2185 2167 char *unlname;
2186 2168 cred_t *unlcred;
2187 2169
2188 2170 mutex_exit(&mi->mi_async_lock);
2189 2171 /*
2190 2172 * We just need to free up the memory associated with the
2191 2173 * vnode, which can be safely done from within the current
2192 2174 * context.
2193 2175 */
2194 2176 crfree(cr); /* drop our reference */
2195 2177 kmem_free(args, sizeof (*args));
2196 2178 rp = VTOR4(vp);
2197 2179 mutex_enter(&rp->r_statelock);
2198 2180 if (rp->r_unldvp != NULL) {
2199 2181 unldvp = rp->r_unldvp;
2200 2182 rp->r_unldvp = NULL;
2201 2183 unlname = rp->r_unlname;
2202 2184 rp->r_unlname = NULL;
2203 2185 unlcred = rp->r_unlcred;
2204 2186 rp->r_unlcred = NULL;
2205 2187 }
2206 2188 mutex_exit(&rp->r_statelock);
2207 2189 /*
2208 2190 * No need to explicitly throw away any cached pages. The
2209 2191 * eventual r4inactive() will attempt a synchronous
2210 2192 * VOP_PUTPAGE() which will immediately fail since the request
2211 2193 * is coming from the wrong zone, and then will proceed to call
2212 2194 * nfs4_invalidate_pages() which will clean things up for us.
2213 2195 *
2214 2196 * Throw away the delegation here so rp4_addfree()'s attempt to
2215 2197 * return any existing delegations becomes a no-op.
2216 2198 */
2217 2199 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2218 2200 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2219 2201 FALSE);
2220 2202 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2221 2203 nfs_rw_exit(&mi->mi_recovlock);
2222 2204 }
2223 2205 nfs4_clear_open_streams(rp);
2224 2206
2225 2207 rp4_addfree(rp, cr);
2226 2208 if (unldvp != NULL) {
2227 2209 kmem_free(unlname, MAXNAMELEN);
2228 2210 VN_RELE(unldvp);
2229 2211 crfree(unlcred);
2230 2212 }
2231 2213 return;
2232 2214 }
2233 2215
2234 2216 if (mi->mi_manager_thread == NULL) {
2235 2217 /*
2236 2218 * We want to talk to the inactive thread.
2237 2219 */
2238 2220 signal_inactive_thread = B_TRUE;
2239 2221 }
2240 2222
2241 2223 /*
2242 2224 * Enqueue the vnode and wake up either the special thread (empty
2243 2225 * list) or an async thread.
2244 2226 */
2245 2227 if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2246 2228 mi->mi_async_reqs[NFS4_INACTIVE] = args;
2247 2229 mi->mi_async_tail[NFS4_INACTIVE] = args;
2248 2230 signal_inactive_thread = B_TRUE;
2249 2231 } else {
2250 2232 mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2251 2233 mi->mi_async_tail[NFS4_INACTIVE] = args;
2252 2234 }
2253 2235 if (signal_inactive_thread) {
2254 2236 cv_signal(&mi->mi_inact_req_cv);
2255 2237 } else {
2256 2238 mi->mi_async_req_count++;
2257 2239 ASSERT(mi->mi_async_req_count != 0);
2258 2240 cv_signal(&mi->mi_async_reqs_cv);
2259 2241 }
2260 2242
2261 2243 mutex_exit(&mi->mi_async_lock);
2262 2244 }
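
/*
 * Illustrative sketch only: the handoff described above.  The caller's
 * reference to 'vp' is consumed by nfs4_async_inactive(), so the vnode
 * must not be touched afterwards.  The helper name is hypothetical.
 */
static void
example_inactive_handoff(vnode_t *vp, cred_t *cr)
{
	nfs4_async_inactive(vp, cr);
	/* vp must not be dereferenced here; the inactive thread owns it */
}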
2263 2245
2264 2246 int
2265 2247 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2266 2248 {
2267 2249 int pagecreate;
2268 2250 int n;
2269 2251 int saved_n;
2270 2252 caddr_t saved_base;
2271 2253 u_offset_t offset;
2272 2254 int error;
2273 2255 int sm_error;
2274 2256 vnode_t *vp = RTOV(rp);
2275 2257
2276 2258 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2277 2259 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2278 2260 if (!vpm_enable) {
2279 2261 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2280 2262 }
2281 2263
2282 2264 /*
2283 2265 * Move bytes in at most PAGESIZE chunks. We must avoid
2284 2266 * spanning pages in uiomove() because page faults may cause
2285 2267 * the cache to be invalidated out from under us. The r_size is not
2286 2268 * updated until after the uiomove. If we push the last page of a
2287 2269 * file before r_size is correct, we will lose the data written past
2288 2270 * the current (and invalid) r_size.
2289 2271 */
2290 2272 do {
2291 2273 offset = uio->uio_loffset;
2292 2274 pagecreate = 0;
2293 2275
2294 2276 /*
2295 2277 * n is the number of bytes required to satisfy the request
2296 2278 * or the number of bytes to fill out the page.
2297 2279 */
2298 2280 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2299 2281
2300 2282 /*
2301 2283 * Check to see if we can skip reading in the page
2302 2284 * and just allocate the memory. We can do this
2303 2285 * if we are going to rewrite the entire mapping
2304 2286 * or if we are going to write to or beyond the current
2305 2287 * end of file from the beginning of the mapping.
2306 2288 *
2307 2289 * The read of r_size is now protected by r_statelock.
2308 2290 */
2309 2291 mutex_enter(&rp->r_statelock);
2310 2292 /*
2311 2293 * When pgcreated is nonzero the caller has already done
2312 2294 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2313 2295 * segkpm this means we already have at least one page
2314 2296 * created and mapped at base.
2315 2297 */
2316 2298 pagecreate = pgcreated ||
2317 2299 ((offset & PAGEOFFSET) == 0 &&
2318 2300 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2319 2301
2320 2302 mutex_exit(&rp->r_statelock);
2321 2303
2322 2304 if (!vpm_enable && pagecreate) {
2323 2305 /*
2324 2306 * The last argument tells segmap_pagecreate() to
2325 2307 * always lock the page, as opposed to sometimes
2326 2308 * returning with the page locked. This way we avoid a
2327 2309 * fault on the ensuing uiomove(), but also
2328 2310 * more importantly (to fix bug 1094402) we can
2329 2311 * call segmap_fault() to unlock the page in all
2330 2312 * cases. An alternative would be to modify
2331 2313 * segmap_pagecreate() to tell us when it is
2332 2314 * locking a page, but that's a fairly major
2333 2315 * interface change.
2334 2316 */
2335 2317 if (pgcreated == 0)
2336 2318 (void) segmap_pagecreate(segkmap, base,
2337 2319 (uint_t)n, 1);
2338 2320 saved_base = base;
2339 2321 saved_n = n;
2340 2322 }
2341 2323
2342 2324 /*
2343 2325 * The number of bytes of data in the last page cannot
2344 2326 * be determined accurately while the page is being
2345 2327 * uiomove'd to and the size of the file is being updated.
2346 2328 * Thus, inform threads which need to know accurately
2347 2329 * how much data is in the last page of the file. They
2348 2330 * will not do the i/o immediately, but will arrange for
2349 2331 * the i/o to happen later when this modify operation
2350 2332 * has finished.
2351 2333 */
2352 2334 ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2353 2335 mutex_enter(&rp->r_statelock);
2354 2336 rp->r_flags |= R4MODINPROGRESS;
2355 2337 rp->r_modaddr = (offset & MAXBMASK);
2356 2338 mutex_exit(&rp->r_statelock);
2357 2339
2358 2340 if (vpm_enable) {
2359 2341 /*
2360 2342 * Copy data. If new pages are created, part of
2361 2343 * the page that is not written will be initialized
2362 2344 * with zeros.
2363 2345 */
2364 2346 error = vpm_data_copy(vp, offset, n, uio,
2365 2347 !pagecreate, NULL, 0, S_WRITE);
2366 2348 } else {
2367 2349 error = uiomove(base, n, UIO_WRITE, uio);
2368 2350 }
2369 2351
2370 2352 /*
2371 2353 * r_size is the maximum number of
2372 2354 * bytes known to be in the file.
2373 2355 * Make sure it is at least as high as the
2374 2356 * first unwritten byte pointed to by uio_loffset.
2375 2357 */
2376 2358 mutex_enter(&rp->r_statelock);
2377 2359 if (rp->r_size < uio->uio_loffset)
2378 2360 rp->r_size = uio->uio_loffset;
2379 2361 rp->r_flags &= ~R4MODINPROGRESS;
2380 2362 rp->r_flags |= R4DIRTY;
2381 2363 mutex_exit(&rp->r_statelock);
2382 2364
2383 2365 /* n = # of bytes written */
2384 2366 n = (int)(uio->uio_loffset - offset);
2385 2367
2386 2368 if (!vpm_enable) {
2387 2369 base += n;
2388 2370 }
2389 2371
2390 2372 tcount -= n;
2391 2373 /*
2392 2374 * If we created pages w/o initializing them completely,
2393 2375 * we need to zero the part that wasn't set up.
2394 2376 * This happens in most EOF write cases and if
2395 2377 * we had some sort of error during the uiomove.
2396 2378 */
2397 2379 if (!vpm_enable && pagecreate) {
2398 2380 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2399 2381 (void) kzero(base, PAGESIZE - n);
2400 2382
2401 2383 if (pgcreated) {
2402 2384 /*
2403 2385 * Caller is responsible for this page,
2404 2386 * it was not created in this loop.
2405 2387 */
2406 2388 pgcreated = 0;
2407 2389 } else {
2408 2390 /*
2409 2391 * For bug 1094402: segmap_pagecreate locks
2410 2392 * page. Unlock it. This also unlocks the
2411 2393 * pages allocated by page_create_va() in
2412 2394 * segmap_pagecreate().
2413 2395 */
2414 2396 sm_error = segmap_fault(kas.a_hat, segkmap,
2415 2397 saved_base, saved_n,
2416 2398 F_SOFTUNLOCK, S_WRITE);
2417 2399 if (error == 0)
2418 2400 error = sm_error;
2419 2401 }
2420 2402 }
2421 2403 } while (tcount > 0 && error == 0);
2422 2404
2423 2405 return (error);
2424 2406 }
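
/*
 * Illustrative sketch only: the per-iteration chunk size computed in
 * writerp4() above never spans a page boundary.  For example, with
 * PAGESIZE 0x1000, offset 0x1f80 and tcount 0x200 yield a chunk of 0x80.
 * The helper name is hypothetical.
 */
static int
example_write_chunk(u_offset_t offset, int tcount)
{
	return ((int)MIN(PAGESIZE - (offset & PAGEOFFSET), tcount));
}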
2425 2407
2426 2408 int
2427 2409 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2428 2410 {
2429 2411 rnode4_t *rp;
2430 2412 page_t *pp;
2431 2413 u_offset_t eoff;
2432 2414 u_offset_t io_off;
2433 2415 size_t io_len;
2434 2416 int error;
2435 2417 int rdirty;
2436 2418 int err;
2437 2419
2438 2420 rp = VTOR4(vp);
2439 2421 ASSERT(rp->r_count > 0);
2440 2422
2441 2423 if (!nfs4_has_pages(vp))
2442 2424 return (0);
2443 2425
2444 2426 ASSERT(vp->v_type != VCHR);
2445 2427
2446 2428 /*
2447 2429 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2448 2430 * writes. B_FORCE is set to force the VM system to actually
2449 2431 * invalidate the pages, even if the i/o failed. The pages
2450 2432 * need to get invalidated because they can't be written out
2451 2433 * because there isn't any space left on either the server's
2452 2434 * file system or in the user's disk quota. The B_FREE bit
2453 2435 * is cleared to avoid confusion as to whether this is a
2454 2436 * request to place the page on the freelist or to destroy
2455 2437 * it.
2456 2438 */
2457 2439 if ((rp->r_flags & R4OUTOFSPACE) ||
2458 2440 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2459 2441 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2460 2442
2461 2443 if (len == 0) {
2462 2444 /*
2463 2445 * If doing a full file synchronous operation, then clear
2464 2446 * the R4DIRTY bit. If a page gets dirtied while the flush
2465 2447 * is happening, then R4DIRTY will get set again. The
2466 2448 * R4DIRTY bit must get cleared before the flush so that
2467 2449 * we don't lose this information.
2468 2450 *
2469 2451 * If there are no full file async write operations
2470 2452 * pending and the R4DIRTY bit is set, clear it.
2471 2453 */
2472 2454 if (off == (u_offset_t)0 &&
2473 2455 !(flags & B_ASYNC) &&
2474 2456 (rp->r_flags & R4DIRTY)) {
2475 2457 mutex_enter(&rp->r_statelock);
2476 2458 rdirty = (rp->r_flags & R4DIRTY);
2477 2459 rp->r_flags &= ~R4DIRTY;
2478 2460 mutex_exit(&rp->r_statelock);
2479 2461 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2480 2462 mutex_enter(&rp->r_statelock);
2481 2463 if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2482 2464 rdirty = (rp->r_flags & R4DIRTY);
2483 2465 rp->r_flags &= ~R4DIRTY;
2484 2466 }
2485 2467 mutex_exit(&rp->r_statelock);
2486 2468 } else
2487 2469 rdirty = 0;
2488 2470
2489 2471 /*
2490 2472 * Search the entire vp list for pages >= off, and flush
2491 2473 * the dirty pages.
2492 2474 */
2493 2475 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2494 2476 flags, cr);
2495 2477
2496 2478 /*
2497 2479 * If an error occurred and the file was marked as dirty
2498 2480 * before and we aren't forcibly invalidating pages, then
2499 2481 * reset the R4DIRTY flag.
2500 2482 */
2501 2483 if (error && rdirty &&
2502 2484 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2503 2485 mutex_enter(&rp->r_statelock);
2504 2486 rp->r_flags |= R4DIRTY;
2505 2487 mutex_exit(&rp->r_statelock);
2506 2488 }
2507 2489 } else {
2508 2490 /*
2509 2491 * Do a range from [off...off + len) looking for pages
2510 2492 * to deal with.
2511 2493 */
2512 2494 error = 0;
2513 2495 io_len = 0;
2514 2496 eoff = off + len;
2515 2497 mutex_enter(&rp->r_statelock);
2516 2498 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2517 2499 io_off += io_len) {
2518 2500 mutex_exit(&rp->r_statelock);
2519 2501 /*
2520 2502 * If we are not invalidating, synchronously
2521 2503 * freeing or writing pages use the routine
2522 2504 * page_lookup_nowait() to prevent reclaiming
2523 2505 * them from the free list.
2524 2506 */
2525 2507 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2526 2508 pp = page_lookup(vp, io_off,
2527 2509 (flags & (B_INVAL | B_FREE)) ?
2528 2510 SE_EXCL : SE_SHARED);
2529 2511 } else {
2530 2512 pp = page_lookup_nowait(vp, io_off,
2531 2513 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2532 2514 }
2533 2515
2534 2516 if (pp == NULL || !pvn_getdirty(pp, flags))
2535 2517 io_len = PAGESIZE;
2536 2518 else {
2537 2519 err = (*rp->r_putapage)(vp, pp, &io_off,
2538 2520 &io_len, flags, cr);
2539 2521 if (!error)
2540 2522 error = err;
2541 2523 /*
2542 2524 * "io_off" and "io_len" are returned as
2543 2525 * the range of pages we actually wrote.
2544 2526 * This allows us to skip ahead more quickly
2545 2527 * since several pages may've been dealt
2546 2528 * with by this iteration of the loop.
2547 2529 */
2548 2530 }
2549 2531 mutex_enter(&rp->r_statelock);
2550 2532 }
2551 2533 mutex_exit(&rp->r_statelock);
2552 2534 }
2553 2535
2554 2536 return (error);
2555 2537 }
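
/*
 * Illustrative sketch only: a full-file, synchronous flush through the
 * routine above.  off == 0 and len == 0 select the whole file, and a
 * flags value of 0 (no B_ASYNC, no B_INVAL) requests a plain write-back.
 * The helper name is hypothetical.
 */
static int
example_flush_whole_file(vnode_t *vp, cred_t *cr)
{
	return (nfs4_putpages(vp, (u_offset_t)0, 0, 0, cr));
}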
2556 2538
2557 2539 void
2558 2540 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2559 2541 {
2560 2542 rnode4_t *rp;
2561 2543
2562 2544 rp = VTOR4(vp);
2563 2545 if (IS_SHADOW(vp, rp))
2564 2546 vp = RTOV4(rp);
2565 2547 mutex_enter(&rp->r_statelock);
2566 2548 while (rp->r_flags & R4TRUNCATE)
2567 2549 cv_wait(&rp->r_cv, &rp->r_statelock);
2568 2550 rp->r_flags |= R4TRUNCATE;
2569 2551 if (off == (u_offset_t)0) {
2570 2552 rp->r_flags &= ~R4DIRTY;
2571 2553 if (!(rp->r_flags & R4STALE))
2572 2554 rp->r_error = 0;
2573 2555 }
2574 2556 rp->r_truncaddr = off;
2575 2557 mutex_exit(&rp->r_statelock);
2576 2558 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2577 2559 B_INVAL | B_TRUNC, cr);
2578 2560 mutex_enter(&rp->r_statelock);
2579 2561 rp->r_flags &= ~R4TRUNCATE;
2580 2562 cv_broadcast(&rp->r_cv);
2581 2563 mutex_exit(&rp->r_statelock);
2582 2564 }
2583 2565
2584 2566 static int
2585 2567 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2586 2568 {
2587 2569 mntinfo4_t *mi;
2588 2570 struct mntinfo_kstat *mik;
2589 2571 vfs_t *vfsp;
2590 2572
2591 2573 /* this is a read-only kstat. Bail out on a write */
2592 2574 if (rw == KSTAT_WRITE)
2593 2575 return (EACCES);
2594 2576
2595 2577
2596 2578 /*
2597 2579 * We don't want to wait here as kstat_chain_lock could be held by
2598 2580 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2599 2581 * and thus could lead to a deadlock.
2600 2582 */
2601 2583 vfsp = (struct vfs *)ksp->ks_private;
2602 2584
2603 2585 mi = VFTOMI4(vfsp);
2604 2586 mik = (struct mntinfo_kstat *)ksp->ks_data;
2605 2587
2606 2588 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2607 2589
2608 2590 mik->mik_vers = (uint32_t)mi->mi_vers;
2609 2591 mik->mik_flags = mi->mi_flags;
2610 2592 /*
2611 2593 * The sv_secdata holds the flavor the client specifies.
2612 2594 * If the client uses default and a security negotiation
2613 2595 * occurs, sv_currsec will point to the current flavor
2614 2596 * selected from the server flavor list.
2615 2597 * sv_currsec is NULL if no security negotiation takes place.
2616 2598 */
2617 2599 mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2618 2600 mi->mi_curr_serv->sv_currsec->secmod :
2619 2601 mi->mi_curr_serv->sv_secdata->secmod;
2620 2602 mik->mik_curread = (uint32_t)mi->mi_curread;
2621 2603 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2622 2604 mik->mik_retrans = mi->mi_retrans;
2623 2605 mik->mik_timeo = mi->mi_timeo;
2624 2606 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2625 2607 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2626 2608 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2627 2609 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2628 2610 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2629 2611 mik->mik_failover = (uint32_t)mi->mi_failover;
2630 2612 mik->mik_remap = (uint32_t)mi->mi_remap;
2631 2613
2632 2614 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2633 2615
2634 2616 return (0);
2635 2617 }
2636 2618
2637 2619 void
2638 2620 nfs4_mnt_kstat_init(struct vfs *vfsp)
2639 2621 {
2640 2622 mntinfo4_t *mi = VFTOMI4(vfsp);
2641 2623
2642 2624 /*
2643 2625 * PSARC 2001/697 Contract Private Interface
2644 2626 * All nfs kstats are under SunMC contract
2645 2627 * Please refer to the PSARC listed above and contact
2646 2628 * SunMC before making any changes!
2647 2629 *
2648 2630 * Changes must be reviewed by Solaris File Sharing
2649 2631 * Changes must be communicated to contract-2001-697@sun.com
2650 2632 *
2651 2633 */
2652 2634
2653 2635 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2654 2636 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2655 2637 if (mi->mi_io_kstats) {
2656 2638 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2657 2639 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2658 2640 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2659 2641 kstat_install(mi->mi_io_kstats);
2660 2642 }
2661 2643
2662 2644 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2663 2645 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2664 2646 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2665 2647 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2666 2648 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2667 2649 mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2668 2650 mi->mi_ro_kstats->ks_private = (void *)vfsp;
2669 2651 kstat_install(mi->mi_ro_kstats);
2670 2652 }
2671 2653
2672 2654 nfs4_mnt_recov_kstat_init(vfsp);
2673 2655 }
2674 2656
2675 2657 void
2676 2658 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2677 2659 {
2678 2660 mntinfo4_t *mi;
2679 2661 clock_t now = ddi_get_lbolt();
2680 2662
2681 2663 mi = VTOMI4(vp);
2682 2664 /*
2683 2665 * In case of forced unmount, do not print any messages
2684 2666 * since it can flood the console with error messages.
2685 2667 */
2686 2668 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2687 2669 return;
2688 2670
2689 2671 /*
2690 2672 * If the mount point is dead and not recoverable, do not
2691 2673 * print error messages that can flood the console.
2692 2674 */
2693 2675 if (mi->mi_flags & MI4_RECOV_FAIL)
2694 2676 return;
2695 2677
2696 2678 /*
2697 2679 * No use in flooding the console with ENOSPC
2698 2680 * messages from the same file system.
2699 2681 */
2700 2682 if ((error != ENOSPC && error != EDQUOT) ||
2701 2683 now - mi->mi_printftime > 0) {
2702 2684 zoneid_t zoneid = mi->mi_zone->zone_id;
2703 2685
2704 2686 #ifdef DEBUG
2705 2687 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2706 2688 mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2707 2689 #else
2708 2690 nfs_perror(error, "NFS write error on host %s: %m.\n",
2709 2691 VTOR4(vp)->r_server->sv_hostname, NULL);
2710 2692 #endif
2711 2693 if (error == ENOSPC || error == EDQUOT) {
2712 2694 zcmn_err(zoneid, CE_CONT,
2713 2695 "^File: userid=%d, groupid=%d\n",
2714 2696 crgetuid(cr), crgetgid(cr));
2715 2697 if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2716 2698 crgetgid(curthread->t_cred) != crgetgid(cr)) {
2717 2699 zcmn_err(zoneid, CE_CONT,
2718 2700 "^User: userid=%d, groupid=%d\n",
2719 2701 crgetuid(curthread->t_cred),
2720 2702 crgetgid(curthread->t_cred));
2721 2703 }
2722 2704 mi->mi_printftime = now +
2723 2705 nfs_write_error_interval * hz;
2724 2706 }
2725 2707 sfh4_printfhandle(VTOR4(vp)->r_fh);
2726 2708 #ifdef DEBUG
2727 2709 if (error == EACCES) {
2728 2710 zcmn_err(zoneid, CE_CONT,
2729 2711 "nfs_bio: cred is%s kcred\n",
2730 2712 cr == kcred ? "" : " not");
2731 2713 }
2732 2714 #endif
2733 2715 }
2734 2716 }
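
/*
 * Illustrative sketch only of the throttling used above: ENOSPC/EDQUOT
 * messages are suppressed until mi_printftime (set to "now" plus
 * nfs_write_error_interval seconds) has passed.  The helper name is
 * hypothetical.
 */
static boolean_t
example_should_log(mntinfo4_t *mi, int error, clock_t now)
{
	if ((error == ENOSPC || error == EDQUOT) &&
	    now - mi->mi_printftime <= 0)
		return (B_FALSE);	/* still inside the quiet interval */
	return (B_TRUE);
}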
2735 2717
2736 2718 /*
2737 2719 * Return non-zero if the given file can be safely memory mapped. Locks
2738 2720 * are safe if whole-file (length and offset are both zero).
2739 2721 */
2740 2722
2741 2723 #define SAFE_LOCK(flk) ((flk).l_start == 0 && (flk).l_len == 0)
2742 2724
2743 2725 static int
2744 2726 nfs4_safemap(const vnode_t *vp)
2745 2727 {
2746 2728 locklist_t *llp, *next_llp;
2747 2729 int safe = 1;
2748 2730 rnode4_t *rp = VTOR4(vp);
2749 2731
2750 2732 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2751 2733
2752 2734 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2753 2735 "vp = %p", (void *)vp));
2754 2736
2755 2737 /*
2756 2738 * Review all the locks for the vnode, both ones that have been
2757 2739 * acquired and ones that are pending. We assume that
2758 2740 * flk_active_locks_for_vp() has merged any locks that can be
2759 2741 * merged (so that if a process has the entire file locked, it is
2760 2742 * represented as a single lock).
2761 2743 *
2762 2744 * Note that we can't bail out of the loop if we find a non-safe
2763 2745 * lock, because we have to free all the elements in the llp list.
2764 2746 * We might be able to speed up this code slightly by not looking
2765 2747 * at each lock's l_start and l_len fields once we've found a
2766 2748 * non-safe lock.
2767 2749 */
2768 2750
2769 2751 llp = flk_active_locks_for_vp(vp);
2770 2752 while (llp) {
2771 2753 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2772 2754 "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2773 2755 llp->ll_flock.l_start, llp->ll_flock.l_len));
2774 2756 if (!SAFE_LOCK(llp->ll_flock)) {
2775 2757 safe = 0;
2776 2758 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2777 2759 "nfs4_safemap: unsafe active lock (%" PRId64
2778 2760 ", %" PRId64 ")", llp->ll_flock.l_start,
2779 2761 llp->ll_flock.l_len));
2780 2762 }
2781 2763 next_llp = llp->ll_next;
2782 2764 VN_RELE(llp->ll_vp);
2783 2765 kmem_free(llp, sizeof (*llp));
2784 2766 llp = next_llp;
2785 2767 }
2786 2768
2787 2769 NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2788 2770 safe ? "safe" : "unsafe"));
2789 2771 return (safe);
2790 2772 }
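
/*
 * Illustrative sketch only: what SAFE_LOCK() above accepts.  A whole-file
 * lock (l_start == 0, l_len == 0) is safe for mapping; any sub-range lock
 * is not.  The helper name is hypothetical.
 */
static void
example_safe_lock_check(void)
{
	struct flock64 whole_file = { 0 };	/* l_start = 0, l_len = 0 */
	struct flock64 sub_range = { 0 };

	sub_range.l_start = 0;
	sub_range.l_len = PAGESIZE;

	ASSERT(SAFE_LOCK(whole_file));
	ASSERT(!SAFE_LOCK(sub_range));
}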
2791 2773
2792 2774 /*
2793 2775 * Return whether there is a lost LOCK or LOCKU queued up for the given
2794 2776 * file that would make an mmap request unsafe. cf. nfs4_safemap().
2795 2777 */
2796 2778
2797 2779 bool_t
2798 2780 nfs4_map_lost_lock_conflict(vnode_t *vp)
2799 2781 {
2800 2782 bool_t conflict = FALSE;
2801 2783 nfs4_lost_rqst_t *lrp;
2802 2784 mntinfo4_t *mi = VTOMI4(vp);
2803 2785
2804 2786 mutex_enter(&mi->mi_lock);
2805 2787 for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2806 2788 lrp = list_next(&mi->mi_lost_state, lrp)) {
2807 2789 if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2808 2790 continue;
2809 2791 ASSERT(lrp->lr_vp != NULL);
2810 2792 if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2811 2793 continue; /* different file */
2812 2794 if (!SAFE_LOCK(*lrp->lr_flk)) {
2813 2795 conflict = TRUE;
2814 2796 break;
2815 2797 }
2816 2798 }
2817 2799
2818 2800 mutex_exit(&mi->mi_lock);
2819 2801 return (conflict);
2820 2802 }
2821 2803
2822 2804 /*
2823 2805 * nfs_lockcompletion:
2824 2806 *
2825 2807 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2826 2808 * as non cachable (set VNOCACHE bit).
2827 2809 */
2828 2810
2829 2811 void
2830 2812 nfs4_lockcompletion(vnode_t *vp, int cmd)
2831 2813 {
2832 2814 rnode4_t *rp = VTOR4(vp);
2833 2815
2834 2816 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2835 2817 ASSERT(!IS_SHADOW(vp, rp));
2836 2818
2837 2819 if (cmd == F_SETLK || cmd == F_SETLKW) {
2838 2820
2839 2821 if (!nfs4_safemap(vp)) {
2840 2822 mutex_enter(&vp->v_lock);
2841 2823 vp->v_flag |= VNOCACHE;
2842 2824 mutex_exit(&vp->v_lock);
2843 2825 } else {
2844 2826 mutex_enter(&vp->v_lock);
2845 2827 vp->v_flag &= ~VNOCACHE;
2846 2828 mutex_exit(&vp->v_lock);
2847 2829 }
2848 2830 }
2849 2831 /*
2850 2832 * The cached attributes of the file are stale after acquiring
2851 2833 * the lock on the file. They were updated when the file was
2852 2834 * opened, but not updated when the lock was acquired. Therefore the
2853 2835 * cached attributes are invalidated after the lock is obtained.
2854 2836 */
2855 2837 PURGE_ATTRCACHE4(vp);
2856 2838 }
2857 2839
2858 2840 /* ARGSUSED */
2859 2841 static void *
2860 2842 nfs4_mi_init(zoneid_t zoneid)
2861 2843 {
2862 2844 struct mi4_globals *mig;
2863 2845
2864 2846 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2865 2847 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2866 2848 list_create(&mig->mig_list, sizeof (mntinfo4_t),
2867 2849 offsetof(mntinfo4_t, mi_zone_node));
2868 2850 mig->mig_destructor_called = B_FALSE;
2869 2851 return (mig);
2870 2852 }
2871 2853
2872 2854 /*
2873 2855 * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2874 2856 * state and killing off threads.
2875 2857 */
2876 2858 /* ARGSUSED */
2877 2859 static void
2878 2860 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2879 2861 {
2880 2862 struct mi4_globals *mig = data;
2881 2863 mntinfo4_t *mi;
2882 2864 nfs4_server_t *np;
2883 2865
2884 2866 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2885 2867 "nfs4_mi_shutdown zone %d\n", zoneid));
2886 2868 ASSERT(mig != NULL);
2887 2869 for (;;) {
2888 2870 mutex_enter(&mig->mig_lock);
2889 2871 mi = list_head(&mig->mig_list);
2890 2872 if (mi == NULL) {
2891 2873 mutex_exit(&mig->mig_lock);
2892 2874 break;
2893 2875 }
2894 2876
2895 2877 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2896 2878 "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2897 2879 /*
2898 2880 * purge the DNLC for this filesystem
2899 2881 */
2900 2882 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2901 2883 /*
2902 2884 * Tell existing async worker threads to exit.
2903 2885 */
2904 2886 mutex_enter(&mi->mi_async_lock);
2905 2887 mi->mi_max_threads = 0;
2906 2888 NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2907 2889 /*
2908 2890 * Set the appropriate flags, signal and wait for both the
2909 2891 * async manager and the inactive thread to exit when they're
2910 2892 * done with their current work.
2911 2893 */
2912 2894 mutex_enter(&mi->mi_lock);
2913 2895 mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2914 2896 mutex_exit(&mi->mi_lock);
2915 2897 mutex_exit(&mi->mi_async_lock);
2916 2898 if (mi->mi_manager_thread) {
2917 2899 nfs4_async_manager_stop(mi->mi_vfsp);
2918 2900 }
2919 2901 if (mi->mi_inactive_thread) {
2920 2902 mutex_enter(&mi->mi_async_lock);
2921 2903 cv_signal(&mi->mi_inact_req_cv);
2922 2904 /*
2923 2905 * Wait for the inactive thread to exit.
2924 2906 */
2925 2907 while (mi->mi_inactive_thread != NULL) {
2926 2908 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2927 2909 }
2928 2910 mutex_exit(&mi->mi_async_lock);
2929 2911 }
2930 2912 /*
2931 2913 * Wait for the recovery thread to complete, that is, it will
2932 2914 * signal when it is done using the "mi" structure and about
2933 2915 * to exit
2934 2916 */
2935 2917 mutex_enter(&mi->mi_lock);
2936 2918 while (mi->mi_in_recovery > 0)
2937 2919 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2938 2920 mutex_exit(&mi->mi_lock);
2939 2921 /*
2940 2922 * We're done when every mi has been done or the list is empty.
2941 2923 * This one is done, remove it from the list.
2942 2924 */
2943 2925 list_remove(&mig->mig_list, mi);
2944 2926 mutex_exit(&mig->mig_lock);
2945 2927 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2946 2928
2947 2929 /*
2948 2930 * Release the hold on the vfs and the mi taken to prevent a race
2949 2931 * with zone shutdown. This releases the hold in nfs4_mi_zonelist_add.
2950 2932 */
2951 2933 VFS_RELE(mi->mi_vfsp);
2952 2934 MI4_RELE(mi);
2953 2935 }
2954 2936 /*
2955 2937 * Tell each renew thread in the zone to exit
2956 2938 */
2957 2939 mutex_enter(&nfs4_server_lst_lock);
2958 2940 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2959 2941 mutex_enter(&np->s_lock);
2960 2942 if (np->zoneid == zoneid) {
2961 2943 /*
2962 2944 * We add another hold onto the nfs4_server_t
2963 2945 * because this will make sure that the nfs4_server_t
2964 2946 * stays around until nfs4_callback_fini_zone destroys
2965 2947 * the zone. This way, the renew thread can
2966 2948 * unconditionally release its holds on the
2967 2949 * nfs4_server_t.
2968 2950 */
2969 2951 np->s_refcnt++;
2970 2952 nfs4_mark_srv_dead(np);
2971 2953 }
2972 2954 mutex_exit(&np->s_lock);
2973 2955 }
2974 2956 mutex_exit(&nfs4_server_lst_lock);
2975 2957 }
2976 2958
2977 2959 static void
2978 2960 nfs4_mi_free_globals(struct mi4_globals *mig)
2979 2961 {
2980 2962 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2981 2963 mutex_destroy(&mig->mig_lock);
2982 2964 kmem_free(mig, sizeof (*mig));
2983 2965 }
2984 2966
2985 2967 /* ARGSUSED */
2986 2968 static void
2987 2969 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2988 2970 {
2989 2971 struct mi4_globals *mig = data;
2990 2972
2991 2973 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2992 2974 "nfs4_mi_destroy zone %d\n", zoneid));
2993 2975 ASSERT(mig != NULL);
2994 2976 mutex_enter(&mig->mig_lock);
2995 2977 if (list_head(&mig->mig_list) != NULL) {
2996 2978 /* Still waiting for VFS_FREEVFS() */
2997 2979 mig->mig_destructor_called = B_TRUE;
2998 2980 mutex_exit(&mig->mig_lock);
2999 2981 return;
3000 2982 }
3001 2983 nfs4_mi_free_globals(mig);
3002 2984 }
3003 2985
3004 2986 /*
3005 2987 * Add an NFS mount to the per-zone list of NFS mounts.
3006 2988 */
3007 2989 void
3008 2990 nfs4_mi_zonelist_add(mntinfo4_t *mi)
3009 2991 {
3010 2992 struct mi4_globals *mig;
3011 2993
3012 2994 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3013 2995 mutex_enter(&mig->mig_lock);
3014 2996 list_insert_head(&mig->mig_list, mi);
3015 2997 /*
3016 2998 * hold added to eliminate race with zone shutdown - this will be
3017 2999 * released in nfs4_mi_shutdown
3018 3000 */
3019 3001 MI4_HOLD(mi);
3020 3002 VFS_HOLD(mi->mi_vfsp);
3021 3003 mutex_exit(&mig->mig_lock);
3022 3004 }
3023 3005
3024 3006 /*
3025 3007 * Remove an NFS mount from the per-zone list of NFS mounts.
3026 3008 */
3027 3009 int
3028 3010 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3029 3011 {
3030 3012 struct mi4_globals *mig;
3031 3013 int ret = 0;
3032 3014
3033 3015 mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3034 3016 mutex_enter(&mig->mig_lock);
3035 3017 mutex_enter(&mi->mi_lock);
3036 3018 /* if this mi is marked dead, then the zone already released it */
3037 3019 if (!(mi->mi_flags & MI4_DEAD)) {
3038 3020 list_remove(&mig->mig_list, mi);
3039 3021 mutex_exit(&mi->mi_lock);
3040 3022
3041 3023 /* release the holds put on in zonelist_add(). */
3042 3024 VFS_RELE(mi->mi_vfsp);
3043 3025 MI4_RELE(mi);
3044 3026 ret = 1;
3045 3027 } else {
3046 3028 mutex_exit(&mi->mi_lock);
3047 3029 }
3048 3030
3049 3031 /*
3050 3032 * We can be called asynchronously by VFS_FREEVFS() after the zone
3051 3033 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3052 3034 * mi globals.
3053 3035 */
3054 3036 if (list_head(&mig->mig_list) == NULL &&
3055 3037 mig->mig_destructor_called == B_TRUE) {
3056 3038 nfs4_mi_free_globals(mig);
3057 3039 return (ret);
3058 3040 }
3059 3041 mutex_exit(&mig->mig_lock);
3060 3042 return (ret);
3061 3043 }
3062 3044
3063 3045 void
3064 3046 nfs_free_mi4(mntinfo4_t *mi)
3065 3047 {
3066 3048 nfs4_open_owner_t *foop;
3067 3049 nfs4_oo_hash_bucket_t *bucketp;
3068 3050 nfs4_debug_msg_t *msgp;
3069 3051 int i;
3070 - servinfo4_t *svp;
3052 + servinfo4_t *svp;
3071 3053
3072 3054 /*
3073 3055 * Code introduced here should be carefully evaluated to make
3074 3056 * sure none of the freed resources are accessed either directly
3075 3057 * or indirectly after freeing them. For example: introducing calls to
3076 3058 * NFS4_DEBUG that use mntinfo4_t structure members after they have been
3077 3059 * freed, or other routines that call back into NFS and access freed
3078 3060 * mntinfo4_t structure members.
3079 3061 */
3080 3062 mutex_enter(&mi->mi_lock);
3081 3063 ASSERT(mi->mi_recovthread == NULL);
3082 3064 ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3083 3065 mutex_exit(&mi->mi_lock);
3084 3066 mutex_enter(&mi->mi_async_lock);
3085 3067 ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3086 3068 mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3087 3069 ASSERT(mi->mi_manager_thread == NULL);
3088 3070 mutex_exit(&mi->mi_async_lock);
3089 3071 if (mi->mi_io_kstats) {
3090 3072 kstat_delete(mi->mi_io_kstats);
3091 3073 mi->mi_io_kstats = NULL;
3092 3074 }
3093 3075 if (mi->mi_ro_kstats) {
3094 3076 kstat_delete(mi->mi_ro_kstats);
3095 3077 mi->mi_ro_kstats = NULL;
3096 3078 }
3097 3079 if (mi->mi_recov_ksp) {
3098 3080 kstat_delete(mi->mi_recov_ksp);
3099 3081 mi->mi_recov_ksp = NULL;
3100 3082 }
3101 3083 mutex_enter(&mi->mi_msg_list_lock);
3102 3084 while (msgp = list_head(&mi->mi_msg_list)) {
3103 3085 list_remove(&mi->mi_msg_list, msgp);
3104 3086 nfs4_free_msg(msgp);
3105 3087 }
3106 3088 mutex_exit(&mi->mi_msg_list_lock);
3107 3089 list_destroy(&mi->mi_msg_list);
3108 3090 if (mi->mi_fname != NULL)
3109 3091 fn_rele(&mi->mi_fname);
3110 3092 if (mi->mi_rootfh != NULL)
3111 3093 sfh4_rele(&mi->mi_rootfh);
3112 3094 if (mi->mi_srvparentfh != NULL)
3113 3095 sfh4_rele(&mi->mi_srvparentfh);
3114 3096 svp = mi->mi_servers;
3115 3097 sv4_free(svp);
3116 3098 mutex_destroy(&mi->mi_lock);
3117 3099 mutex_destroy(&mi->mi_async_lock);
3118 3100 mutex_destroy(&mi->mi_msg_list_lock);
3119 3101 mutex_destroy(&mi->mi_rnodes_lock);
3120 3102 nfs_rw_destroy(&mi->mi_recovlock);
3121 3103 nfs_rw_destroy(&mi->mi_rename_lock);
3122 3104 nfs_rw_destroy(&mi->mi_fh_lock);
3123 3105 cv_destroy(&mi->mi_failover_cv);
3124 3106 cv_destroy(&mi->mi_async_reqs_cv);
3125 3107 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3126 3108 cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3127 3109 cv_destroy(&mi->mi_async_cv);
3128 3110 cv_destroy(&mi->mi_inact_req_cv);
3129 3111 /*
3130 3112 * Destroy the oo hash lists and mutexes for the cred hash table.
3131 3113 */
3132 3114 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3133 3115 bucketp = &(mi->mi_oo_list[i]);
3134 3116 /* Destroy any remaining open owners on the list */
3135 3117 foop = list_head(&bucketp->b_oo_hash_list);
3136 3118 while (foop != NULL) {
3137 3119 list_remove(&bucketp->b_oo_hash_list, foop);
3138 3120 nfs4_destroy_open_owner(foop);
3139 3121 foop = list_head(&bucketp->b_oo_hash_list);
3140 3122 }
3141 3123 list_destroy(&bucketp->b_oo_hash_list);
3142 3124 mutex_destroy(&bucketp->b_lock);
3143 3125 }
3144 3126 /*
3145 3127 * Empty and destroy the freed open owner list.
3146 3128 */
3147 3129 foop = list_head(&mi->mi_foo_list);
3148 3130 while (foop != NULL) {
3149 3131 list_remove(&mi->mi_foo_list, foop);
3150 3132 nfs4_destroy_open_owner(foop);
3151 3133 foop = list_head(&mi->mi_foo_list);
3152 3134 }
3153 3135 list_destroy(&mi->mi_foo_list);
3154 3136 list_destroy(&mi->mi_bseqid_list);
3155 3137 list_destroy(&mi->mi_lost_state);
3156 3138 list_destroy(&mi->mi_rnodes);
3157 3139 avl_destroy(&mi->mi_filehandles);
3158 3140 kmem_free(mi, sizeof (*mi));
3159 3141 }
3160 3142 void
3161 3143 mi_hold(mntinfo4_t *mi)
3162 3144 {
3163 3145 atomic_inc_32(&mi->mi_count);
3164 3146 ASSERT(mi->mi_count != 0);
3165 3147 }
3166 3148
3167 3149 void
3168 3150 mi_rele(mntinfo4_t *mi)
3169 3151 {
3170 3152 ASSERT(mi->mi_count != 0);
3171 3153 if (atomic_dec_32_nv(&mi->mi_count) == 0) {
3172 3154 nfs_free_mi4(mi);
3173 3155 }
3174 3156 }
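
/*
 * Illustrative sketch only: the hold discipline enforced by mi_hold()/
 * mi_rele() above (normally used through the MI4_HOLD()/MI4_RELE()
 * macros).  A thread that keeps a mntinfo4_t pointer takes a hold first;
 * the final release frees the structure via nfs_free_mi4().  The helper
 * name is hypothetical.
 */
static void
example_mi_reference(mntinfo4_t *mi)
{
	MI4_HOLD(mi);
	/* ... mi may be dereferenced safely here ... */
	MI4_RELE(mi);
}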
3175 3157
3176 3158 vnode_t nfs4_xattr_notsupp_vnode;
3177 3159
3178 3160 void
3179 3161 nfs4_clnt_init(void)
3180 3162 {
3181 3163 nfs4_vnops_init();
3182 3164 (void) nfs4_rnode_init();
3183 3165 (void) nfs4_shadow_init();
3184 3166 (void) nfs4_acache_init();
3185 3167 (void) nfs4_subr_init();
3186 3168 nfs4_acl_init();
3187 3169 nfs_idmap_init();
3188 3170 nfs4_callback_init();
3189 3171 nfs4_secinfo_init();
3190 3172 #ifdef DEBUG
3191 3173 tsd_create(&nfs4_tsd_key, NULL);
3192 3174 #endif
3193 3175
3194 3176 /*
3195 3177 * Add a CPR callback so that we can update client
3196 3178 * lease after a suspend and resume.
3197 3179 */
3198 3180 cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3199 3181
3200 3182 zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3201 3183 nfs4_mi_destroy);
3202 3184
3203 3185 /*
3204 3186 * Initialize the reference count of the notsupp xattr cache vnode to 1
3205 3187 * so that it never goes away (VOP_INACTIVE isn't called on it).
3206 3188 */
3207 3189 vn_reinit(&nfs4_xattr_notsupp_vnode);
3208 3190 }
3209 3191
3210 3192 void
3211 3193 nfs4_clnt_fini(void)
3212 3194 {
3213 3195 (void) zone_key_delete(mi4_list_key);
3214 3196 nfs4_vnops_fini();
3215 3197 (void) nfs4_rnode_fini();
3216 3198 (void) nfs4_shadow_fini();
3217 3199 (void) nfs4_acache_fini();
3218 3200 (void) nfs4_subr_fini();
3219 3201 nfs_idmap_fini();
3220 3202 nfs4_callback_fini();
3221 3203 nfs4_secinfo_fini();
3222 3204 #ifdef DEBUG
3223 3205 tsd_destroy(&nfs4_tsd_key);
3224 3206 #endif
3225 3207 if (cid)
3226 3208 (void) callb_delete(cid);
3227 3209 }
3228 3210
3229 3211 /*ARGSUSED*/
3230 3212 static boolean_t
3231 3213 nfs4_client_cpr_callb(void *arg, int code)
3232 3214 {
3233 3215 /*
3234 3216 * We get called for Suspend and Resume events.
3235 3217 * For the suspend case we simply don't care!
3236 3218 */
3237 3219 if (code == CB_CODE_CPR_CHKPT) {
3238 3220 return (B_TRUE);
3239 3221 }
3240 3222
3241 3223 /*
3242 3224 * When we get to here we are in the process of
3243 3225 * resuming the system from a previous suspend.
3244 3226 */
3245 3227 nfs4_client_resumed = gethrestime_sec();
3246 3228 return (B_TRUE);
3247 3229 }
3248 3230
3249 3231 void
3250 3232 nfs4_renew_lease_thread(nfs4_server_t *sp)
3251 3233 {
3252 3234 int error = 0;
3253 3235 time_t tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3254 3236 clock_t tick_delay = 0;
3255 3237 clock_t time_left = 0;
3256 3238 callb_cpr_t cpr_info;
3257 3239 kmutex_t cpr_lock;
3258 3240
3259 3241 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3260 3242 "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3261 3243 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3262 3244 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3263 3245
3264 3246 mutex_enter(&sp->s_lock);
3265 3247 /* sp->s_lease_time is set via a GETATTR */
3266 3248 sp->last_renewal_time = gethrestime_sec();
3267 3249 sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3268 3250 ASSERT(sp->s_refcnt >= 1);
3269 3251
3270 3252 for (;;) {
3271 3253 if (!sp->state_ref_count ||
3272 3254 sp->lease_valid != NFS4_LEASE_VALID) {
3273 3255
3274 3256 kip_secs = MAX((sp->s_lease_time >> 1) -
3275 3257 (3 * sp->propagation_delay.tv_sec), 1);
3276 3258
3277 3259 tick_delay = SEC_TO_TICK(kip_secs);
3278 3260
3279 3261 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3280 3262 "nfs4_renew_lease_thread: no renew : thread "
3281 3263 "wait %ld secs", kip_secs));
3282 3264
3283 3265 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3284 3266 "nfs4_renew_lease_thread: no renew : "
3285 3267 "state_ref_count %d, lease_valid %d",
3286 3268 sp->state_ref_count, sp->lease_valid));
3287 3269
3288 3270 mutex_enter(&cpr_lock);
3289 3271 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3290 3272 mutex_exit(&cpr_lock);
3291 3273 time_left = cv_reltimedwait(&sp->cv_thread_exit,
3292 3274 &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3293 3275 mutex_enter(&cpr_lock);
3294 3276 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3295 3277 mutex_exit(&cpr_lock);
3296 3278
3297 3279 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3298 3280 "nfs4_renew_lease_thread: no renew: "
3299 3281 "time left %ld", time_left));
3300 3282
3301 3283 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3302 3284 goto die;
3303 3285 continue;
3304 3286 }
3305 3287
3306 3288 tmp_last_renewal_time = sp->last_renewal_time;
3307 3289
3308 3290 tmp_time = gethrestime_sec() - sp->last_renewal_time +
3309 3291 (3 * sp->propagation_delay.tv_sec);
3310 3292
3311 3293 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3312 3294 "nfs4_renew_lease_thread: tmp_time %ld, "
3313 3295 "sp->last_renewal_time %ld", tmp_time,
3314 3296 sp->last_renewal_time));
3315 3297
3316 3298 kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3317 3299
3318 3300 tick_delay = SEC_TO_TICK(kip_secs);
3319 3301
3320 3302 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3321 3303 "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3322 3304 "secs", kip_secs));
3323 3305
3324 3306 mutex_enter(&cpr_lock);
3325 3307 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3326 3308 mutex_exit(&cpr_lock);
3327 3309 time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3328 3310 tick_delay, TR_CLOCK_TICK);
3329 3311 mutex_enter(&cpr_lock);
3330 3312 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3331 3313 mutex_exit(&cpr_lock);
3332 3314
3333 3315 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3334 3316 "nfs4_renew_lease_thread: valid lease: time left %ld :"
3335 3317 "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3336 3318 "tmp_last_renewal_time %ld", time_left,
3337 3319 sp->last_renewal_time, nfs4_client_resumed,
3338 3320 tmp_last_renewal_time));
3339 3321
3340 3322 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3341 3323 goto die;
3342 3324
3343 3325 if (tmp_last_renewal_time == sp->last_renewal_time ||
3344 3326 (nfs4_client_resumed != 0 &&
3345 3327 nfs4_client_resumed > sp->last_renewal_time)) {
3346 3328 /*
3347 3329 * Issue RENEW op since we haven't renewed the lease
3348 3330 * since we slept.
3349 3331 */
3350 3332 tmp_now_time = gethrestime_sec();
3351 3333 error = nfs4renew(sp);
3352 3334 /*
3353 3335 * Need to re-acquire sp's lock, nfs4renew()
3354 3336 			 * relinquishes it.
3355 3337 */
3356 3338 mutex_enter(&sp->s_lock);
3357 3339
3358 3340 /*
3359 3341 * See if someone changed s_thread_exit while we gave
3360 3342 * up s_lock.
3361 3343 */
3362 3344 if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3363 3345 goto die;
3364 3346
3365 3347 if (!error) {
3366 3348 /*
3367 3349 * check to see if we implicitly renewed while
3368 3350 * we waited for a reply for our RENEW call.
3369 3351 */
3370 3352 if (tmp_last_renewal_time ==
3371 3353 sp->last_renewal_time) {
3372 3354 /* no implicit renew came */
3373 3355 sp->last_renewal_time = tmp_now_time;
3374 3356 } else {
3375 3357 NFS4_DEBUG(nfs4_client_lease_debug,
3376 3358 (CE_NOTE, "renew_thread: did "
3377 3359 "implicit renewal before reply "
3378 3360 "from server for RENEW"));
3379 3361 }
3380 3362 } else {
3381 3363 /* figure out error */
3382 3364 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3383 3365 "renew_thread: nfs4renew returned error"
3384 3366 " %d", error));
3385 3367 }
3386 3368
3387 3369 }
3388 3370 }
3389 3371
3390 3372 die:
3391 3373 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3392 3374 "nfs4_renew_lease_thread: thread exiting"));
3393 3375
3394 3376 while (sp->s_otw_call_count != 0) {
3395 3377 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3396 3378 "nfs4_renew_lease_thread: waiting for outstanding "
3397 3379 "otw calls to finish for sp 0x%p, current "
3398 3380 "s_otw_call_count %d", (void *)sp,
3399 3381 sp->s_otw_call_count));
3400 3382 mutex_enter(&cpr_lock);
3401 3383 CALLB_CPR_SAFE_BEGIN(&cpr_info);
3402 3384 mutex_exit(&cpr_lock);
3403 3385 cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3404 3386 mutex_enter(&cpr_lock);
3405 3387 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3406 3388 mutex_exit(&cpr_lock);
3407 3389 }
3408 3390 mutex_exit(&sp->s_lock);
3409 3391
3410 3392 nfs4_server_rele(sp); /* free the thread's reference */
3411 3393 nfs4_server_rele(sp); /* free the list's reference */
3412 3394 sp = NULL;
3413 3395
3414 3396 done:
3415 3397 mutex_enter(&cpr_lock);
3416 3398 CALLB_CPR_EXIT(&cpr_info); /* drops cpr_lock */
3417 3399 mutex_destroy(&cpr_lock);
3418 3400
3419 3401 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3420 3402 "nfs4_renew_lease_thread: renew thread exit officially"));
3421 3403
3422 3404 zthread_exit();
3423 3405 /* NOT REACHED */
3424 3406 }
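
The sleep interval the renew thread picks above is half the server's lease time, minus the time already elapsed since the last renewal and three times the measured RENEW propagation delay, clamped to a minimum of one second. The standalone sketch below reproduces that arithmetic with assumed values (a 90-second lease, a 1-second propagation delay, and a renewal that just happened are illustrative only, not values taken from this change):

    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))

    int main(void)
    {
        long s_lease_time = 90;   /* assumed lease duration granted by the server, seconds */
        long prop_delay = 1;      /* assumed propagation_delay.tv_sec, seconds */
        long elapsed = 0;         /* assumed: the lease was just renewed */

        /* same expression the renew thread uses to choose its sleep time */
        long kip_secs = MAX((s_lease_time >> 1) - (elapsed + 3 * prop_delay), 1);

        printf("renew thread sleeps for %ld seconds\n", kip_secs);   /* prints 42 */
        return 0;
    }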
3425 3407
3426 3408 /*
3427 3409 * Send out a RENEW op to the server.
3428 3410 * Assumes sp is locked down.
3429 3411 */
3430 3412 static int
3431 3413 nfs4renew(nfs4_server_t *sp)
3432 3414 {
3433 3415 COMPOUND4args_clnt args;
3434 3416 COMPOUND4res_clnt res;
3435 3417 nfs_argop4 argop[1];
3436 3418 int doqueue = 1;
3437 3419 int rpc_error;
3438 3420 cred_t *cr;
3439 3421 mntinfo4_t *mi;
3440 3422 timespec_t prop_time, after_time;
3441 3423 int needrecov = FALSE;
3442 3424 nfs4_recov_state_t recov_state;
3443 3425 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3444 3426
3445 3427 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3446 3428
3447 3429 recov_state.rs_flags = 0;
3448 3430 recov_state.rs_num_retry_despite_err = 0;
3449 3431
3450 3432 recov_retry:
3451 3433 mi = sp->mntinfo4_list;
3452 3434 VFS_HOLD(mi->mi_vfsp);
3453 3435 mutex_exit(&sp->s_lock);
3454 3436 ASSERT(mi != NULL);
3455 3437
3456 3438 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3457 3439 if (e.error) {
3458 3440 VFS_RELE(mi->mi_vfsp);
3459 3441 return (e.error);
3460 3442 }
3461 3443
3462 3444 /* Check to see if we're dealing with a marked-dead sp */
3463 3445 mutex_enter(&sp->s_lock);
3464 3446 if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3465 3447 mutex_exit(&sp->s_lock);
3466 3448 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3467 3449 VFS_RELE(mi->mi_vfsp);
3468 3450 return (0);
3469 3451 }
3470 3452
3471 3453 /* Make sure mi hasn't changed on us */
3472 3454 if (mi != sp->mntinfo4_list) {
3473 3455 /* Must drop sp's lock to avoid a recursive mutex enter */
3474 3456 mutex_exit(&sp->s_lock);
3475 3457 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3476 3458 VFS_RELE(mi->mi_vfsp);
3477 3459 mutex_enter(&sp->s_lock);
3478 3460 goto recov_retry;
3479 3461 }
3480 3462 mutex_exit(&sp->s_lock);
3481 3463
3482 3464 args.ctag = TAG_RENEW;
3483 3465
3484 3466 args.array_len = 1;
3485 3467 args.array = argop;
3486 3468
3487 3469 argop[0].argop = OP_RENEW;
3488 3470
3489 3471 mutex_enter(&sp->s_lock);
3490 3472 argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3491 3473 cr = sp->s_cred;
3492 3474 crhold(cr);
3493 3475 mutex_exit(&sp->s_lock);
3494 3476
3495 3477 ASSERT(cr != NULL);
3496 3478
3497 3479 /* used to figure out RTT for sp */
3498 3480 gethrestime(&prop_time);
3499 3481
3500 3482 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3501 3483 "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3502 3484 (void*)sp));
3503 3485 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3504 3486 prop_time.tv_sec, prop_time.tv_nsec));
3505 3487
3506 3488 DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3507 3489 mntinfo4_t *, mi);
3508 3490
3509 3491 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3510 3492 crfree(cr);
3511 3493
3512 3494 DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3513 3495 mntinfo4_t *, mi);
3514 3496
3515 3497 gethrestime(&after_time);
3516 3498
3517 3499 mutex_enter(&sp->s_lock);
3518 3500 sp->propagation_delay.tv_sec =
3519 3501 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3520 3502 mutex_exit(&sp->s_lock);
3521 3503
3522 3504 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3523 3505 after_time.tv_sec, after_time.tv_nsec));
3524 3506
3525 3507 if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3526 3508 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3527 3509 nfs4_delegreturn_all(sp);
3528 3510 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3529 3511 VFS_RELE(mi->mi_vfsp);
3530 3512 /*
3531 3513 * If the server returns CB_PATH_DOWN, it has renewed
3532 3514 * the lease and informed us that the callback path is
3533 3515 * down. Since the lease is renewed, just return 0 and
3534 3516 * let the renew thread proceed as normal.
3535 3517 */
3536 3518 return (0);
3537 3519 }
3538 3520
3539 3521 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3540 3522 if (!needrecov && e.error) {
3541 3523 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3542 3524 VFS_RELE(mi->mi_vfsp);
3543 3525 return (e.error);
3544 3526 }
3545 3527
3546 3528 rpc_error = e.error;
3547 3529
3548 3530 if (needrecov) {
3549 3531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3550 3532 "nfs4renew: initiating recovery\n"));
3551 3533
3552 3534 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3553 3535 OP_RENEW, NULL, NULL, NULL) == FALSE) {
3554 3536 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3555 3537 VFS_RELE(mi->mi_vfsp);
3556 3538 if (!e.error)
3557 3539 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3558 3540 mutex_enter(&sp->s_lock);
3559 3541 goto recov_retry;
3560 3542 }
3561 3543 /* fall through for res.status case */
3562 3544 }
3563 3545
3564 3546 if (res.status) {
3565 3547 if (res.status == NFS4ERR_LEASE_MOVED) {
3566 3548 /*EMPTY*/
3567 3549 /*
3568 3550 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3569 3551 * to renew the lease on that server
3570 3552 */
3571 3553 }
3572 3554 e.error = geterrno4(res.status);
3573 3555 }
3574 3556
3575 3557 if (!rpc_error)
3576 3558 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3577 3559
3578 3560 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3579 3561
3580 3562 VFS_RELE(mi->mi_vfsp);
3581 3563
3582 3564 return (e.error);
3583 3565 }

3584 3566
3585 3567 void
3586 3568 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3587 3569 {
3588 3570 nfs4_server_t *sp;
3589 3571
3590 3572 /* this locks down sp if it is found */
3591 3573 sp = find_nfs4_server(mi);
3592 3574
3593 3575 if (sp != NULL) {
3594 3576 nfs4_inc_state_ref_count_nolock(sp, mi);
3595 3577 mutex_exit(&sp->s_lock);
3596 3578 nfs4_server_rele(sp);
3597 3579 }
3598 3580 }
3599 3581
3600 3582 /*
3601 3583  * Bump the number of OPEN files (i.e., those with state) so we know if this
3602 3584 * nfs4_server has any state to maintain a lease for or not.
3603 3585 *
3604 3586 * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3605 3587 */
3606 3588 void
3607 3589 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3608 3590 {
3609 3591 ASSERT(mutex_owned(&sp->s_lock));
3610 3592
3611 3593 sp->state_ref_count++;
3612 3594 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3613 3595 "nfs4_inc_state_ref_count: state_ref_count now %d",
3614 3596 sp->state_ref_count));
3615 3597
3616 3598 if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3617 3599 sp->lease_valid = NFS4_LEASE_VALID;
3618 3600
3619 3601 /*
3620 3602 * If this call caused the lease to be marked valid and/or
3621 3603 	 * took the state_ref_count from 0 to 1, then start the clock
3622 3604 	 * on lease renewal.
3623 3605 */
3624 3606 if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3625 3607 sp->last_renewal_time = gethrestime_sec();
3626 3608
3627 3609 /* update the number of open files for mi */
3628 3610 mi->mi_open_files++;
3629 3611 }
3630 3612
3631 3613 void
3632 3614 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3633 3615 {
3634 3616 nfs4_server_t *sp;
3635 3617
3636 3618 /* this locks down sp if it is found */
3637 3619 sp = find_nfs4_server_all(mi, 1);
3638 3620
3639 3621 if (sp != NULL) {
3640 3622 nfs4_dec_state_ref_count_nolock(sp, mi);
3641 3623 mutex_exit(&sp->s_lock);
3642 3624 nfs4_server_rele(sp);
3643 3625 }
3644 3626 }
3645 3627
3646 3628 /*
3647 3629  * Decrement the number of OPEN files (i.e., those with state) so we know if
3648 3630 * this nfs4_server has any state to maintain a lease for or not.
3649 3631 */
3650 3632 void
3651 3633 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3652 3634 {
3653 3635 ASSERT(mutex_owned(&sp->s_lock));
3654 3636 ASSERT(sp->state_ref_count != 0);
3655 3637 sp->state_ref_count--;
3656 3638
3657 3639 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3658 3640 "nfs4_dec_state_ref_count: state ref count now %d",
3659 3641 sp->state_ref_count));
3660 3642
3661 3643 mi->mi_open_files--;
3662 3644 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3663 3645 "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3664 3646 mi->mi_open_files, mi->mi_flags));
3665 3647
3666 3648 /* We don't have to hold the mi_lock to test mi_flags */
3667 3649 if (mi->mi_open_files == 0 &&
3668 3650 (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3669 3651 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3670 3652 "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3671 3653 "we have closed the last open file", (void*)mi));
3672 3654 nfs4_remove_mi_from_server(mi, sp);
3673 3655 }
3674 3656 }
3675 3657
3676 3658 bool_t
3677 3659 inlease(nfs4_server_t *sp)
3678 3660 {
3679 3661 bool_t result;
3680 3662
3681 3663 ASSERT(mutex_owned(&sp->s_lock));
3682 3664
3683 3665 if (sp->lease_valid == NFS4_LEASE_VALID &&
3684 3666 gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3685 3667 result = TRUE;
3686 3668 else
3687 3669 result = FALSE;
3688 3670
3689 3671 return (result);
3690 3672 }
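
inlease() is the client's local view of lease validity: given a lease already marked NFS4_LEASE_VALID, the current time must still fall before last_renewal_time plus the server's lease duration. A small sketch of that comparison with assumed timestamps (the 30-second age and 90-second lease are illustrative):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        time_t now = time(NULL);
        time_t last_renewal_time = now - 30;   /* assumed: last renewed 30 seconds ago */
        long s_lease_time = 90;                /* assumed lease duration, seconds */

        /* same comparison inlease() makes */
        if (now < last_renewal_time + s_lease_time)
            printf("still within the lease\n");
        else
            printf("lease has expired locally\n");
        return 0;
    }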
3691 3673
3692 3674
3693 3675 /*
3694 3676 * Return non-zero if the given nfs4_server_t is going through recovery.
3695 3677 */
3696 3678
3697 3679 int
3698 3680 nfs4_server_in_recovery(nfs4_server_t *sp)
3699 3681 {
3700 3682 return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3701 3683 }
3702 3684
3703 3685 /*
3704 3686 * Compare two shared filehandle objects. Returns -1, 0, or +1, if the
3705 3687 * first is less than, equal to, or greater than the second.
3706 3688 */
3707 3689
3708 3690 int
3709 3691 sfh4cmp(const void *p1, const void *p2)
3710 3692 {
3711 3693 const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3712 3694 const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3713 3695
3714 3696 return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3715 3697 }
3716 3698
3717 3699 /*
3718 3700 * Create a table for shared filehandle objects.
3719 3701 */
3720 3702
3721 3703 void
3722 3704 sfh4_createtab(avl_tree_t *tab)
3723 3705 {
3724 3706 avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3725 3707 offsetof(nfs4_sharedfh_t, sfh_tree));
3726 3708 }
3727 3709
3728 3710 /*
3729 3711 * Return a shared filehandle object for the given filehandle. The caller
3730 3712 * is responsible for eventually calling sfh4_rele().
3731 3713 */
3732 3714
3733 3715 nfs4_sharedfh_t *
3734 3716 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3735 3717 {
3736 3718 nfs4_sharedfh_t *sfh, *nsfh;
3737 3719 avl_index_t where;
3738 3720 nfs4_sharedfh_t skey;
3739 3721
3740 3722 if (!key) {
3741 3723 skey.sfh_fh = *fh;
3742 3724 key = &skey;
3743 3725 }
3744 3726
3745 3727 nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3746 3728 nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3747 3729 /*
3748 3730 * We allocate the largest possible filehandle size because it's
3749 3731 * not that big, and it saves us from possibly having to resize the
3750 3732 * buffer later.
3751 3733 */
3752 3734 nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3753 3735 bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3754 3736 mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3755 3737 nsfh->sfh_refcnt = 1;
3756 3738 nsfh->sfh_flags = SFH4_IN_TREE;
3757 3739 nsfh->sfh_mi = mi;
3758 3740 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3759 3741 (void *)nsfh));
3760 3742
3761 3743 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3762 3744 sfh = avl_find(&mi->mi_filehandles, key, &where);
3763 3745 if (sfh != NULL) {
3764 3746 mutex_enter(&sfh->sfh_lock);
3765 3747 sfh->sfh_refcnt++;
3766 3748 mutex_exit(&sfh->sfh_lock);
3767 3749 nfs_rw_exit(&mi->mi_fh_lock);
3768 3750 /* free our speculative allocs */
3769 3751 kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3770 3752 kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3771 3753 return (sfh);
3772 3754 }
3773 3755
3774 3756 avl_insert(&mi->mi_filehandles, nsfh, where);
3775 3757 nfs_rw_exit(&mi->mi_fh_lock);
3776 3758
3777 3759 return (nsfh);
3778 3760 }
3779 3761
3780 3762 /*
3781 3763 * Return a shared filehandle object for the given filehandle. The caller
3782 3764 * is responsible for eventually calling sfh4_rele().
3783 3765 */
3784 3766
3785 3767 nfs4_sharedfh_t *
3786 3768 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3787 3769 {
3788 3770 nfs4_sharedfh_t *sfh;
3789 3771 nfs4_sharedfh_t key;
3790 3772
3791 3773 ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3792 3774
3793 3775 #ifdef DEBUG
3794 3776 if (nfs4_sharedfh_debug) {
3795 3777 nfs4_fhandle_t fhandle;
3796 3778
3797 3779 fhandle.fh_len = fh->nfs_fh4_len;
3798 3780 bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3799 3781 zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3800 3782 nfs4_printfhandle(&fhandle);
3801 3783 }
3802 3784 #endif
3803 3785
3804 3786 /*
3805 3787 * If there's already an object for the given filehandle, bump the
3806 3788 * reference count and return it. Otherwise, create a new object
3807 3789 * and add it to the AVL tree.
3808 3790 */
3809 3791
3810 3792 key.sfh_fh = *fh;
3811 3793
3812 3794 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3813 3795 sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3814 3796 if (sfh != NULL) {
3815 3797 mutex_enter(&sfh->sfh_lock);
3816 3798 sfh->sfh_refcnt++;
3817 3799 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3818 3800 "sfh4_get: found existing %p, new refcnt=%d",
3819 3801 (void *)sfh, sfh->sfh_refcnt));
3820 3802 mutex_exit(&sfh->sfh_lock);
3821 3803 nfs_rw_exit(&mi->mi_fh_lock);
3822 3804 return (sfh);
3823 3805 }
3824 3806 nfs_rw_exit(&mi->mi_fh_lock);
3825 3807
3826 3808 return (sfh4_put(fh, mi, &key));
3827 3809 }
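
As the comments above describe, sfh4_get() either finds an existing shared filehandle object and bumps its reference count, or falls through to sfh4_put() to insert a new one; every successful call must eventually be balanced by sfh4_rele(). A minimal caller-side sketch follows, assuming a filehandle fh and a mount mi that already exist in the caller (both hypothetical); it is only meaningful inside the kernel and is not a runnable program.

    /* Illustrative fragment only; fh (nfs_fh4) and mi are assumed to be set up by the caller. */
    nfs4_sharedfh_t *sfh;

    sfh = sfh4_get(&fh, mi);    /* find-or-create; returns with one reference held */
    /* ... use sfh->sfh_fh while the reference is held ... */
    sfh4_rele(&sfh);            /* drop the reference; sfh is NULLed out for us */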
3828 3810
3829 3811 /*
3830 3812 * Get a reference to the given shared filehandle object.
3831 3813 */
3832 3814
3833 3815 void
3834 3816 sfh4_hold(nfs4_sharedfh_t *sfh)
3835 3817 {
3836 3818 ASSERT(sfh->sfh_refcnt > 0);
3837 3819
3838 3820 mutex_enter(&sfh->sfh_lock);
3839 3821 sfh->sfh_refcnt++;
3840 3822 NFS4_DEBUG(nfs4_sharedfh_debug,
3841 3823 (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3842 3824 (void *)sfh, sfh->sfh_refcnt));
3843 3825 mutex_exit(&sfh->sfh_lock);
3844 3826 }
3845 3827
3846 3828 /*
3847 3829 * Release a reference to the given shared filehandle object and null out
3848 3830 * the given pointer.
3849 3831 */
3850 3832
3851 3833 void
3852 3834 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3853 3835 {
3854 3836 mntinfo4_t *mi;
3855 3837 nfs4_sharedfh_t *sfh = *sfhpp;
3856 3838
3857 3839 ASSERT(sfh->sfh_refcnt > 0);
3858 3840
3859 3841 mutex_enter(&sfh->sfh_lock);
3860 3842 if (sfh->sfh_refcnt > 1) {
3861 3843 sfh->sfh_refcnt--;
3862 3844 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3863 3845 "sfh4_rele %p, new refcnt=%d",
3864 3846 (void *)sfh, sfh->sfh_refcnt));
3865 3847 mutex_exit(&sfh->sfh_lock);
3866 3848 goto finish;
3867 3849 }
3868 3850 mutex_exit(&sfh->sfh_lock);
3869 3851
3870 3852 /*
3871 3853 * Possibly the last reference, so get the lock for the table in
3872 3854 * case it's time to remove the object from the table.
3873 3855 */
3874 3856 mi = sfh->sfh_mi;
3875 3857 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3876 3858 mutex_enter(&sfh->sfh_lock);
3877 3859 sfh->sfh_refcnt--;
3878 3860 if (sfh->sfh_refcnt > 0) {
3879 3861 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3880 3862 "sfh4_rele %p, new refcnt=%d",
3881 3863 (void *)sfh, sfh->sfh_refcnt));
3882 3864 mutex_exit(&sfh->sfh_lock);
3883 3865 nfs_rw_exit(&mi->mi_fh_lock);
3884 3866 goto finish;
3885 3867 }
3886 3868
3887 3869 NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3888 3870 "sfh4_rele %p, last ref", (void *)sfh));
3889 3871 if (sfh->sfh_flags & SFH4_IN_TREE) {
3890 3872 avl_remove(&mi->mi_filehandles, sfh);
3891 3873 sfh->sfh_flags &= ~SFH4_IN_TREE;
3892 3874 }
3893 3875 mutex_exit(&sfh->sfh_lock);
3894 3876 nfs_rw_exit(&mi->mi_fh_lock);
3895 3877 mutex_destroy(&sfh->sfh_lock);
3896 3878 kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3897 3879 kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3898 3880
3899 3881 finish:
3900 3882 *sfhpp = NULL;
3901 3883 }
3902 3884
3903 3885 /*
3904 3886 * Update the filehandle for the given shared filehandle object.
3905 3887 */
3906 3888
3907 3889 int nfs4_warn_dupfh = 0; /* if set, always warn about dup fhs below */
3908 3890
3909 3891 void
3910 3892 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3911 3893 {
3912 3894 mntinfo4_t *mi = sfh->sfh_mi;
3913 3895 nfs4_sharedfh_t *dupsfh;
3914 3896 avl_index_t where;
3915 3897 nfs4_sharedfh_t key;
3916 3898
3917 3899 #ifdef DEBUG
3918 3900 mutex_enter(&sfh->sfh_lock);
3919 3901 ASSERT(sfh->sfh_refcnt > 0);
3920 3902 mutex_exit(&sfh->sfh_lock);
3921 3903 #endif
3922 3904 ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3923 3905
3924 3906 /*
3925 3907 * The basic plan is to remove the shared filehandle object from
3926 3908 * the table, update it to have the new filehandle, then reinsert
3927 3909 * it.
3928 3910 */
3929 3911
3930 3912 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3931 3913 mutex_enter(&sfh->sfh_lock);
3932 3914 if (sfh->sfh_flags & SFH4_IN_TREE) {
3933 3915 avl_remove(&mi->mi_filehandles, sfh);
3934 3916 sfh->sfh_flags &= ~SFH4_IN_TREE;
3935 3917 }
3936 3918 mutex_exit(&sfh->sfh_lock);
3937 3919 sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3938 3920 bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3939 3921 sfh->sfh_fh.nfs_fh4_len);
3940 3922
3941 3923 /*
3942 3924 * XXX If there is already a shared filehandle object with the new
3943 3925 * filehandle, we're in trouble, because the rnode code assumes
3944 3926 * that there is only one shared filehandle object for a given
3945 3927 * filehandle. So issue a warning (for read-write mounts only)
3946 3928 * and don't try to re-insert the given object into the table.
3947 3929 * Hopefully the given object will quickly go away and everyone
3948 3930 * will use the new object.
3949 3931 */
3950 3932 key.sfh_fh = *newfh;
3951 3933 dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3952 3934 if (dupsfh != NULL) {
3953 3935 if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3954 3936 zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3955 3937 "duplicate filehandle detected");
3956 3938 sfh4_printfhandle(dupsfh);
3957 3939 }
3958 3940 } else {
3959 3941 avl_insert(&mi->mi_filehandles, sfh, where);
3960 3942 mutex_enter(&sfh->sfh_lock);
3961 3943 sfh->sfh_flags |= SFH4_IN_TREE;
3962 3944 mutex_exit(&sfh->sfh_lock);
3963 3945 }
3964 3946 nfs_rw_exit(&mi->mi_fh_lock);
3965 3947 }
3966 3948
3967 3949 /*
3968 3950 * Copy out the current filehandle for the given shared filehandle object.
3969 3951 */
3970 3952
3971 3953 void
3972 3954 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3973 3955 {
3974 3956 mntinfo4_t *mi = sfh->sfh_mi;
3975 3957
3976 3958 ASSERT(sfh->sfh_refcnt > 0);
3977 3959
3978 3960 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3979 3961 fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3980 3962 ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3981 3963 bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3982 3964 nfs_rw_exit(&mi->mi_fh_lock);
3983 3965 }
3984 3966
3985 3967 /*
3986 3968 * Print out the filehandle for the given shared filehandle object.
3987 3969 */
3988 3970
3989 3971 void
3990 3972 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3991 3973 {
3992 3974 nfs4_fhandle_t fhandle;
3993 3975
3994 3976 sfh4_copyval(sfh, &fhandle);
3995 3977 nfs4_printfhandle(&fhandle);
3996 3978 }
3997 3979
3998 3980 /*
3999 3981 * Compare 2 fnames. Returns -1 if the first is "less" than the second, 0
4000 3982 * if they're the same, +1 if the first is "greater" than the second. The
4001 3983 * caller (or whoever's calling the AVL package) is responsible for
4002 3984 * handling locking issues.
4003 3985 */
4004 3986
4005 3987 static int
4006 3988 fncmp(const void *p1, const void *p2)
4007 3989 {
4008 3990 const nfs4_fname_t *f1 = p1;
4009 3991 const nfs4_fname_t *f2 = p2;
4010 3992 int res;
4011 3993
4012 3994 res = strcmp(f1->fn_name, f2->fn_name);
4013 3995 /*
4014 3996 * The AVL package wants +/-1, not arbitrary positive or negative
4015 3997 * integers.
4016 3998 */
4017 3999 if (res > 0)
4018 4000 res = 1;
4019 4001 else if (res < 0)
4020 4002 res = -1;
4021 4003 return (res);
4022 4004 }
4023 4005
4024 4006 /*
4025 4007 * Get or create an fname with the given name, as a child of the given
4026 4008 * fname. The caller is responsible for eventually releasing the reference
4027 4009 * (fn_rele()). parent may be NULL.
4028 4010 */
4029 4011
4030 4012 nfs4_fname_t *
4031 4013 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4032 4014 {
4033 4015 nfs4_fname_t key;
4034 4016 nfs4_fname_t *fnp;
4035 4017 avl_index_t where;
4036 4018
4037 4019 key.fn_name = name;
4038 4020
4039 4021 /*
4040 4022 * If there's already an fname registered with the given name, bump
4041 4023 * its reference count and return it. Otherwise, create a new one
4042 4024 * and add it to the parent's AVL tree.
4043 4025 *
4044 4026 * fname entries we are looking for should match both name
4045 4027 * and sfh stored in the fname.
4046 4028 */
4047 4029 again:
4048 4030 if (parent != NULL) {
4049 4031 mutex_enter(&parent->fn_lock);
4050 4032 fnp = avl_find(&parent->fn_children, &key, &where);
4051 4033 if (fnp != NULL) {
4052 4034 /*
4053 4035 			 * This hold on fnp is released below if this
4054 4036 			 * turns out not to be the fnp we want.
4055 4037 */
4056 4038 fn_hold(fnp);
4057 4039
4058 4040 if (fnp->fn_sfh == sfh) {
4059 4041 /*
4060 4042 				 * We have found our entry;
4061 4043 				 * return it with the hold taken above.
4062 4044 */
4063 4045 mutex_exit(&parent->fn_lock);
4064 4046 return (fnp);
4065 4047 }
4066 4048
4067 4049 /*
4068 4050 * We have found an entry that has a mismatching
4069 4051 * fn_sfh. This could be a stale entry due to
4070 4052 * server side rename. We will remove this entry
4071 4053 * and make sure no such entries exist.
4072 4054 */
4073 4055 mutex_exit(&parent->fn_lock);
4074 4056 mutex_enter(&fnp->fn_lock);
4075 4057 if (fnp->fn_parent == parent) {
4076 4058 /*
4077 4059 * Remove ourselves from parent's
4078 4060 * fn_children tree.
4079 4061 */
4080 4062 mutex_enter(&parent->fn_lock);
4081 4063 avl_remove(&parent->fn_children, fnp);
4082 4064 mutex_exit(&parent->fn_lock);
4083 4065 fn_rele(&fnp->fn_parent);
4084 4066 }
4085 4067 mutex_exit(&fnp->fn_lock);
4086 4068 fn_rele(&fnp);
4087 4069 goto again;
4088 4070 }
4089 4071 }
4090 4072
4091 4073 fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4092 4074 mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4093 4075 fnp->fn_parent = parent;
4094 4076 if (parent != NULL)
4095 4077 fn_hold(parent);
4096 4078 fnp->fn_len = strlen(name);
4097 4079 ASSERT(fnp->fn_len < MAXNAMELEN);
4098 4080 fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4099 4081 (void) strcpy(fnp->fn_name, name);
4100 4082 fnp->fn_refcnt = 1;
4101 4083
4102 4084 /*
4103 4085 * This hold on sfh is later released
4104 4086 * when we do the final fn_rele() on this fname.
4105 4087 */
4106 4088 sfh4_hold(sfh);
4107 4089 fnp->fn_sfh = sfh;
4108 4090
4109 4091 avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4110 4092 offsetof(nfs4_fname_t, fn_tree));
4111 4093 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4112 4094 "fn_get %p:%s, a new nfs4_fname_t!",
4113 4095 (void *)fnp, fnp->fn_name));
4114 4096 if (parent != NULL) {
4115 4097 avl_insert(&parent->fn_children, fnp, where);
4116 4098 mutex_exit(&parent->fn_lock);
4117 4099 }
4118 4100
4119 4101 return (fnp);
4120 4102 }
4121 4103
4122 4104 void
4123 4105 fn_hold(nfs4_fname_t *fnp)
4124 4106 {
4125 4107 atomic_inc_32(&fnp->fn_refcnt);
4126 4108 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4127 4109 "fn_hold %p:%s, new refcnt=%d",
4128 4110 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4129 4111 }
4130 4112
4131 4113 /*
4132 4114 * Decrement the reference count of the given fname, and destroy it if its
4133 4115 * reference count goes to zero. Nulls out the given pointer.
4134 4116 */
4135 4117
4136 4118 void
4137 4119 fn_rele(nfs4_fname_t **fnpp)
4138 4120 {
4139 4121 nfs4_fname_t *parent;
4140 4122 uint32_t newref;
4141 4123 nfs4_fname_t *fnp;
4142 4124
4143 4125 recur:
4144 4126 fnp = *fnpp;
4145 4127 *fnpp = NULL;
4146 4128
4147 4129 mutex_enter(&fnp->fn_lock);
4148 4130 parent = fnp->fn_parent;
4149 4131 if (parent != NULL)
4150 4132 mutex_enter(&parent->fn_lock); /* prevent new references */
4151 4133 newref = atomic_dec_32_nv(&fnp->fn_refcnt);
4152 4134 if (newref > 0) {
4153 4135 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4154 4136 "fn_rele %p:%s, new refcnt=%d",
4155 4137 (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4156 4138 if (parent != NULL)
4157 4139 mutex_exit(&parent->fn_lock);
4158 4140 mutex_exit(&fnp->fn_lock);
4159 4141 return;
4160 4142 }
4161 4143
4162 4144 NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4163 4145 "fn_rele %p:%s, last reference, deleting...",
4164 4146 (void *)fnp, fnp->fn_name));
4165 4147 if (parent != NULL) {
4166 4148 avl_remove(&parent->fn_children, fnp);
4167 4149 mutex_exit(&parent->fn_lock);
4168 4150 }
4169 4151 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4170 4152 sfh4_rele(&fnp->fn_sfh);
4171 4153 mutex_destroy(&fnp->fn_lock);
4172 4154 avl_destroy(&fnp->fn_children);
4173 4155 kmem_free(fnp, sizeof (nfs4_fname_t));
4174 4156 /*
4175 4157 	 * Recursively fn_rele the parent.
4176 4158 * Use goto instead of a recursive call to avoid stack overflow.
4177 4159 */
4178 4160 if (parent != NULL) {
4179 4161 fnpp = &parent;
4180 4162 goto recur;
4181 4163 }
4182 4164 }
4183 4165
4184 4166 /*
4185 4167 * Returns the single component name of the given fname, in a MAXNAMELEN
4186 4168 * string buffer, which the caller is responsible for freeing. Note that
4187 4169 * the name may become invalid as a result of fn_move().
4188 4170 */
4189 4171
4190 4172 char *
4191 4173 fn_name(nfs4_fname_t *fnp)
4192 4174 {
4193 4175 char *name;
4194 4176
4195 4177 ASSERT(fnp->fn_len < MAXNAMELEN);
4196 4178 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4197 4179 mutex_enter(&fnp->fn_lock);
4198 4180 (void) strcpy(name, fnp->fn_name);
4199 4181 mutex_exit(&fnp->fn_lock);
4200 4182
4201 4183 return (name);
4202 4184 }
4203 4185
4204 4186
4205 4187 /*
4206 4188 * fn_path_realloc
4207 4189 *
4208 4190 * This function, used only by fn_path, constructs
4209 4191  * a new string which looks like "prepend" + "/" + "current",
4210 4192  * by allocating a new string and freeing the old one.
4211 4193 */
4212 4194 static void
4213 4195 fn_path_realloc(char **curses, char *prepend)
4214 4196 {
4215 4197 int len, curlen = 0;
4216 4198 char *news;
4217 4199
4218 4200 if (*curses == NULL) {
4219 4201 /*
4220 4202 * Prime the pump, allocate just the
4221 4203 * space for prepend and return that.
4222 4204 */
4223 4205 len = strlen(prepend) + 1;
4224 4206 news = kmem_alloc(len, KM_SLEEP);
4225 4207 (void) strncpy(news, prepend, len);
4226 4208 } else {
4227 4209 /*
4228 4210 * Allocate the space for a new string
4229 4211 * +1 +1 is for the "/" and the NULL
4230 4212 * byte at the end of it all.
4231 4213 */
4232 4214 curlen = strlen(*curses);
4233 4215 len = curlen + strlen(prepend) + 1 + 1;
4234 4216 news = kmem_alloc(len, KM_SLEEP);
4235 4217 (void) strncpy(news, prepend, len);
4236 4218 (void) strcat(news, "/");
4237 4219 (void) strcat(news, *curses);
4238 4220 kmem_free(*curses, curlen + 1);
4239 4221 }
4240 4222 *curses = news;
4241 4223 }
4242 4224
4243 4225 /*
4244 4226 * Returns the path name (starting from the fs root) for the given fname.
4245 4227 * The caller is responsible for freeing. Note that the path may be or
4246 4228 * become invalid as a result of fn_move().
4247 4229 */
4248 4230
4249 4231 char *
4250 4232 fn_path(nfs4_fname_t *fnp)
4251 4233 {
4252 4234 char *path;
4253 4235 nfs4_fname_t *nextfnp;
4254 4236
4255 4237 if (fnp == NULL)
4256 4238 return (NULL);
4257 4239
4258 4240 path = NULL;
4259 4241
4260 4242 /* walk up the tree constructing the pathname. */
4261 4243
4262 4244 fn_hold(fnp); /* adjust for later rele */
4263 4245 do {
4264 4246 mutex_enter(&fnp->fn_lock);
4265 4247 /*
4266 4248 * Add fn_name in front of the current path
4267 4249 */
4268 4250 fn_path_realloc(&path, fnp->fn_name);
4269 4251 nextfnp = fnp->fn_parent;
4270 4252 if (nextfnp != NULL)
4271 4253 fn_hold(nextfnp);
4272 4254 mutex_exit(&fnp->fn_lock);
4273 4255 fn_rele(&fnp);
4274 4256 fnp = nextfnp;
4275 4257 } while (fnp != NULL);
4276 4258
4277 4259 return (path);
4278 4260 }
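
fn_get(), fn_path() and fn_rele() together maintain the client's name cache: each fname holds a reference on its parent, and fn_path() walks that chain upward, prepending one component per step. A caller-side sketch, assuming an existing root fname rootfn and shared filehandles dirsfh/filesfh (all hypothetical); like the sfh4 fragment above, it is only meaningful inside the kernel.

    /* Illustrative fragment only; rootfn, dirsfh and filesfh are assumed to exist. */
    nfs4_fname_t *dirfn, *filefn;
    char *path;

    dirfn = fn_get(rootfn, "dir", dirsfh);     /* get-or-create "dir" under the root */
    filefn = fn_get(dirfn, "file", filesfh);   /* get-or-create "file" under "dir" */

    path = fn_path(filefn);                    /* e.g. "<root>/dir/file", built bottom-up */
    /* ... */
    kmem_free(path, strlen(path) + 1);         /* fn_path() allocates; the caller frees */

    fn_rele(&filefn);                          /* drop our references when done */
    fn_rele(&dirfn);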
4279 4261
4280 4262 /*
4281 4263 * Return a reference to the parent of the given fname, which the caller is
4282 4264 * responsible for eventually releasing.
4283 4265 */
4284 4266
4285 4267 nfs4_fname_t *
4286 4268 fn_parent(nfs4_fname_t *fnp)
4287 4269 {
4288 4270 nfs4_fname_t *parent;
4289 4271
4290 4272 mutex_enter(&fnp->fn_lock);
4291 4273 parent = fnp->fn_parent;
4292 4274 if (parent != NULL)
4293 4275 fn_hold(parent);
4294 4276 mutex_exit(&fnp->fn_lock);
4295 4277
4296 4278 return (parent);
4297 4279 }
4298 4280
4299 4281 /*
4300 4282 * Update fnp so that its parent is newparent and its name is newname.
4301 4283 */
4302 4284
4303 4285 void
4304 4286 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4305 4287 {
4306 4288 nfs4_fname_t *parent, *tmpfnp;
4307 4289 ssize_t newlen;
4308 4290 nfs4_fname_t key;
4309 4291 avl_index_t where;
4310 4292
4311 4293 /*
4312 4294 * This assert exists to catch the client trying to rename
4313 4295 * a dir to be a child of itself. This happened at a recent
4314 4296 * bakeoff against a 3rd party (broken) server which allowed
4315 4297 * the rename to succeed. If it trips it means that:
4316 4298 * a) the code in nfs4rename that detects this case is broken
4317 4299 * b) the server is broken (since it allowed the bogus rename)
4318 4300 *
4319 4301 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4320 4302 * panic below from: mutex_enter(&newparent->fn_lock);
4321 4303 */
4322 4304 ASSERT(fnp != newparent);
4323 4305
4324 4306 /*
4325 4307 * Remove fnp from its current parent, change its name, then add it
4326 4308 * to newparent. It might happen that fnp was replaced by another
4327 4309 * nfs4_fname_t with the same fn_name in parent->fn_children.
4328 4310 	 * In such a case, fnp->fn_parent is NULL and we skip the removal
4329 4311 * of fnp from its current parent.
4330 4312 */
4331 4313 mutex_enter(&fnp->fn_lock);
4332 4314 parent = fnp->fn_parent;
4333 4315 if (parent != NULL) {
4334 4316 mutex_enter(&parent->fn_lock);
4335 4317 avl_remove(&parent->fn_children, fnp);
4336 4318 mutex_exit(&parent->fn_lock);
4337 4319 fn_rele(&fnp->fn_parent);
4338 4320 }
4339 4321
4340 4322 newlen = strlen(newname);
4341 4323 if (newlen != fnp->fn_len) {
4342 4324 ASSERT(newlen < MAXNAMELEN);
4343 4325 kmem_free(fnp->fn_name, fnp->fn_len + 1);
4344 4326 fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4345 4327 fnp->fn_len = newlen;
4346 4328 }
4347 4329 (void) strcpy(fnp->fn_name, newname);
4348 4330
4349 4331 again:
4350 4332 mutex_enter(&newparent->fn_lock);
4351 4333 key.fn_name = fnp->fn_name;
4352 4334 tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4353 4335 if (tmpfnp != NULL) {
4354 4336 /*
4355 4337 * This could be due to a file that was unlinked while
4356 4338 * open, or perhaps the rnode is in the free list. Remove
4357 4339 * it from newparent and let it go away on its own. The
4358 4340 * contorted code is to deal with lock order issues and
4359 4341 * race conditions.
4360 4342 */
4361 4343 fn_hold(tmpfnp);
4362 4344 mutex_exit(&newparent->fn_lock);
4363 4345 mutex_enter(&tmpfnp->fn_lock);
4364 4346 if (tmpfnp->fn_parent == newparent) {
4365 4347 mutex_enter(&newparent->fn_lock);
4366 4348 avl_remove(&newparent->fn_children, tmpfnp);
4367 4349 mutex_exit(&newparent->fn_lock);
4368 4350 fn_rele(&tmpfnp->fn_parent);
4369 4351 }
4370 4352 mutex_exit(&tmpfnp->fn_lock);
4371 4353 fn_rele(&tmpfnp);
4372 4354 goto again;
4373 4355 }
4374 4356 fnp->fn_parent = newparent;
4375 4357 fn_hold(newparent);
4376 4358 avl_insert(&newparent->fn_children, fnp, where);
4377 4359 mutex_exit(&newparent->fn_lock);
4378 4360 mutex_exit(&fnp->fn_lock);
4379 4361 }
4380 4362
4381 4363 #ifdef DEBUG
4382 4364 /*
4383 4365 * Return non-zero if the type information makes sense for the given vnode.
4384 4366 * Otherwise panic.
4385 4367 */
4386 4368 int
4387 4369 nfs4_consistent_type(vnode_t *vp)
4388 4370 {
4389 4371 rnode4_t *rp = VTOR4(vp);
4390 4372
4391 4373 if (nfs4_vtype_debug && vp->v_type != VNON &&
4392 4374 rp->r_attr.va_type != VNON && vp->v_type != rp->r_attr.va_type) {
4393 4375 cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
4394 4376 "rnode attr type=%d", (void *)vp, vp->v_type,
4395 4377 rp->r_attr.va_type);
4396 4378 }
4397 4379
4398 4380 return (1);
4399 4381 }
4400 4382 #endif /* DEBUG */
|
↓ open down ↓ |
1320 lines elided |
↑ open up ↑ |