NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables need volatile keyword
--- old/usr/src/uts/common/fs/nfs/nfs_client.c
+++ new/usr/src/uts/common/fs/nfs/nfs_client.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
(10 lines elided)
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 +
21 22 /*
22 23 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 - *
24 + */
25 +
26 +/*
24 27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
25 28 * All rights reserved.
26 29 */
27 30
31 +/*
32 + * Copyright 2018 Nexenta Systems, Inc.
33 + */
34 +
28 35 #include <sys/param.h>
29 36 #include <sys/types.h>
30 37 #include <sys/systm.h>
31 38 #include <sys/thread.h>
32 39 #include <sys/t_lock.h>
33 40 #include <sys/time.h>
34 41 #include <sys/vnode.h>
35 42 #include <sys/vfs.h>
36 43 #include <sys/errno.h>
37 44 #include <sys/buf.h>
38 45 #include <sys/stat.h>
39 46 #include <sys/cred.h>
40 47 #include <sys/kmem.h>
41 48 #include <sys/debug.h>
42 49 #include <sys/dnlc.h>
43 50 #include <sys/vmsystm.h>
44 51 #include <sys/flock.h>
45 52 #include <sys/share.h>
46 53 #include <sys/cmn_err.h>
47 54 #include <sys/tiuser.h>
48 55 #include <sys/sysmacros.h>
49 56 #include <sys/callb.h>
50 57 #include <sys/acl.h>
51 58 #include <sys/kstat.h>
52 59 #include <sys/signal.h>
(15 lines elided)
53 60 #include <sys/list.h>
54 61 #include <sys/zone.h>
55 62
56 63 #include <rpc/types.h>
57 64 #include <rpc/xdr.h>
58 65 #include <rpc/auth.h>
59 66 #include <rpc/clnt.h>
60 67
61 68 #include <nfs/nfs.h>
62 69 #include <nfs/nfs_clnt.h>
70 +#include <nfs/nfs_cmd.h>
63 71
64 72 #include <nfs/rnode.h>
65 73 #include <nfs/nfs_acl.h>
66 74 #include <nfs/lm.h>
67 75
68 76 #include <vm/hat.h>
69 77 #include <vm/as.h>
70 78 #include <vm/page.h>
71 79 #include <vm/pvn.h>
72 80 #include <vm/seg.h>
73 81 #include <vm/seg_map.h>
74 82 #include <vm/seg_vn.h>
75 83
76 84 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
77 85 cred_t *);
78 86 static int nfs_getattr_cache(vnode_t *, struct vattr *);
79 87 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
80 88
81 89 struct mi_globals {
82 90 kmutex_t mig_lock; /* lock protecting mig_list */
83 91 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
84 92 boolean_t mig_destructor_called;
85 93 };
86 94
87 95 static zone_key_t mi_list_key;
88 96
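
The mi_globals structure and mi_list_key above follow the usual zone-specific-data pattern: a key is registered together with per-zone create/shutdown/destroy callbacks, and each zone's private copy is looked up through that key. Below is a minimal sketch of the pattern with hypothetical names; the real callbacks are registered elsewhere and are not shown in this excerpt, and the mi_zone_node field name is an assumption.

    static zone_key_t example_key;                  /* hypothetical key */

    /* Create callback: allocates one zone's private mount-list state. */
    static void *
    example_zone_init(zoneid_t zoneid)
    {
            struct mi_globals *mig;

            mig = kmem_zalloc(sizeof (*mig), KM_SLEEP);
            mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
            list_create(&mig->mig_list, sizeof (mntinfo_t),
                offsetof(mntinfo_t, mi_zone_node));  /* field name assumed */
            return (mig);
    }

    /* Registration (once, at module init) and per-zone lookup. */
    static void
    example_register(void)
    {
            zone_key_create(&example_key, example_zone_init, NULL, NULL);
    }

    static struct mi_globals *
    example_lookup(void)
    {
            return (zone_getspecific(example_key, nfs_zone()));
    }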
89 97 /* Debugging flag for PC file shares. */
90 98 extern int share_debug;
91 99
92 100 /*
93 101 * Attributes caching:
94 102 *
95 103 * Attributes are cached in the rnode in struct vattr form.
96 104 * There is a time associated with the cached attributes (r_attrtime)
97 105 * which tells whether the attributes are valid. The time is initialized
98 106 * to the difference between current time and the modify time of the vnode
99 107 * when new attributes are cached. This allows the attributes for
100 108 * files that have changed recently to be timed out sooner than for files
101 109 * that have not changed for a long time. There are minimum and maximum
102 110 * timeout values that can be set per mount point.
103 111 */
104 112
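As a concrete illustration of the scheme described above: if a regular file's modify time last changed 10 seconds before the new attributes were fetched, and the mount uses acregmin = 3 s and acregmax = 60 s (illustrative values, not taken from this file), the attributes are trusted for 10 seconds; a file that has not changed for an hour is clamped to the 60 second maximum. The clamp itself, as implemented in nfs_attrcache_va() further down, is essentially:

    delta = now - rp->r_mtime;          /* ns since last detected change */
    if (delta < mi->mi_acregmin)
            delta = mi->mi_acregmin;
    else if (delta > mi->mi_acregmax)
            delta = mi->mi_acregmax;
    rp->r_attrtime = now + delta;       /* attributes valid until this time */
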
105 113 int
106 114 nfs_waitfor_purge_complete(vnode_t *vp)
107 115 {
108 116 rnode_t *rp;
109 117 k_sigset_t smask;
110 118
111 119 rp = VTOR(vp);
112 120 if (rp->r_serial != NULL && rp->r_serial != curthread) {
113 121 mutex_enter(&rp->r_statelock);
114 122 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
115 123 while (rp->r_serial != NULL) {
116 124 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
117 125 sigunintr(&smask);
118 126 mutex_exit(&rp->r_statelock);
119 127 return (EINTR);
120 128 }
121 129 }
122 130 sigunintr(&smask);
123 131 mutex_exit(&rp->r_statelock);
124 132 }
125 133 return (0);
126 134 }
127 135
128 136 /*
129 137 * Validate caches by checking cached attributes. If the cached
130 138 * attributes have timed out, then get new attributes from the server.
131 139 * As a side effect, this will do cache invalidation if the attributes
132 140 * have changed.
133 141 *
134 142 * If the attributes have not timed out and if there is a cache
135 143 * invalidation being done by some other thread, then wait until that
136 144 * thread has completed the cache invalidation.
137 145 */
138 146 int
139 147 nfs_validate_caches(vnode_t *vp, cred_t *cr)
140 148 {
141 149 int error;
142 150 struct vattr va;
143 151
144 152 if (ATTRCACHE_VALID(vp)) {
145 153 error = nfs_waitfor_purge_complete(vp);
146 154 if (error)
147 155 return (error);
148 156 return (0);
149 157 }
150 158
151 159 va.va_mask = AT_ALL;
152 160 return (nfs_getattr_otw(vp, &va, cr));
153 161 }
154 162
155 163 /*
156 164 * Validate caches by checking cached attributes. If the cached
157 165 * attributes have timed out, then get new attributes from the server.
158 166 * As a side effect, this will do cache invalidation if the attributes
159 167 * have changed.
160 168 *
161 169 * If the attributes have not timed out and if there is a cache
162 170 * invalidation being done by some other thread, then wait until that
163 171 * thread has completed the cache invalidation.
164 172 */
165 173 int
166 174 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
167 175 {
168 176 int error;
169 177 struct vattr va;
170 178
171 179 if (ATTRCACHE_VALID(vp)) {
172 180 error = nfs_waitfor_purge_complete(vp);
173 181 if (error)
174 182 return (error);
175 183 return (0);
176 184 }
177 185
178 186 va.va_mask = AT_ALL;
179 187 return (nfs3_getattr_otw(vp, &va, cr));
180 188 }
181 189
182 190 /*
183 191 * Purge all of the various NFS `data' caches.
184 192 */
185 193 void
186 194 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
187 195 {
188 196 rnode_t *rp;
189 197 char *contents;
190 198 int size;
191 199 int error;
192 200
193 201 /*
194 202 * Purge the DNLC for any entries which refer to this file.
195 203 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
196 204 */
197 205 rp = VTOR(vp);
198 206 mutex_enter(&rp->r_statelock);
199 207 if (vp->v_count > 1 &&
200 208 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
201 209 !(rp->r_flags & RINDNLCPURGE)) {
202 210 /*
203 211 * Set the RINDNLCPURGE flag to prevent recursive entry
204 212 * into dnlc_purge_vp()
205 213 */
206 214 if (vp->v_type == VDIR)
207 215 rp->r_flags |= RINDNLCPURGE;
208 216 mutex_exit(&rp->r_statelock);
209 217 dnlc_purge_vp(vp);
210 218 mutex_enter(&rp->r_statelock);
211 219 if (rp->r_flags & RINDNLCPURGE)
212 220 rp->r_flags &= ~RINDNLCPURGE;
213 221 }
214 222
215 223 /*
216 224 * Clear any readdir state bits and purge the readlink response cache.
217 225 */
218 226 contents = rp->r_symlink.contents;
219 227 size = rp->r_symlink.size;
220 228 rp->r_symlink.contents = NULL;
221 229 mutex_exit(&rp->r_statelock);
222 230
223 231 if (contents != NULL) {
224 232
225 233 kmem_free((void *)contents, size);
226 234 }
227 235
228 236 /*
229 237 * Flush the page cache.
230 238 */
231 239 if (vn_has_cached_data(vp)) {
232 240 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
233 241 if (error && (error == ENOSPC || error == EDQUOT)) {
234 242 mutex_enter(&rp->r_statelock);
235 243 if (!rp->r_error)
236 244 rp->r_error = error;
237 245 mutex_exit(&rp->r_statelock);
238 246 }
239 247 }
240 248
241 249 /*
242 250 * Flush the readdir response cache.
243 251 */
244 252 if (HAVE_RDDIR_CACHE(rp))
245 253 nfs_purge_rddir_cache(vp);
246 254 }
247 255
248 256 /*
249 257 * Purge the readdir cache of all entries
250 258 */
251 259 void
252 260 nfs_purge_rddir_cache(vnode_t *vp)
253 261 {
254 262 rnode_t *rp;
255 263 rddir_cache *rdc;
256 264 rddir_cache *nrdc;
257 265
258 266 rp = VTOR(vp);
259 267 top:
260 268 mutex_enter(&rp->r_statelock);
261 269 rp->r_direof = NULL;
262 270 rp->r_flags &= ~RLOOKUP;
263 271 rp->r_flags |= RREADDIRPLUS;
264 272 rdc = avl_first(&rp->r_dir);
265 273 while (rdc != NULL) {
266 274 nrdc = AVL_NEXT(&rp->r_dir, rdc);
267 275 avl_remove(&rp->r_dir, rdc);
268 276 rddir_cache_rele(rdc);
269 277 rdc = nrdc;
270 278 }
271 279 mutex_exit(&rp->r_statelock);
272 280 }
273 281
274 282 /*
275 283 * Do a cache check based on the post-operation attributes.
276 284 * Then make them the new cached attributes. If no attributes
277 285 * were returned, then mark the attributes as timed out.
278 286 */
279 287 void
280 288 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
281 289 {
282 290 vattr_t attr;
283 291
284 292 if (!poap->attributes) {
285 293 PURGE_ATTRCACHE(vp);
286 294 return;
287 295 }
288 296 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
289 297 }
290 298
291 299 /*
292 300 * Same as above, but using a vattr
293 301 */
294 302 void
295 303 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
296 304 cred_t *cr)
297 305 {
298 306 if (!poap->attributes) {
299 307 PURGE_ATTRCACHE(vp);
300 308 return;
301 309 }
302 310 nfs_attr_cache(vp, poap->fres.vap, t, cr);
303 311 }
304 312
305 313 /*
306 314 * Do a cache check based on the weak cache consistency attributes.
307 315 * These consist of a small set of pre-operation attributes and the
308 316 * full set of post-operation attributes.
309 317 *
310 318 * If we are given the pre-operation attributes, then use them to
311 319 * check the validity of the various caches. Then, if we got the
312 320 * post-operation attributes, make them the new cached attributes.
313 321 * If we didn't get the post-operation attributes, then mark the
314 322 * attribute cache as timed out so that the next reference will
315 323 * cause a GETATTR to the server to refresh with the current
316 324 * attributes.
317 325 *
318 326 * Otherwise, if we didn't get the pre-operation attributes, but
319 327 * we did get the post-operation attributes, then use these
320 328 * attributes to check the validity of the various caches. This
321 329 * will probably cause a flush of the caches because if the
322 330 * operation succeeded, the attributes of the object were changed
323 331 * in some way from the old post-operation attributes. This
324 332 * should be okay because it is the safe thing to do. After
325 333 * checking the data caches, then we make these the new cached
326 334 * attributes.
327 335 *
328 336 * Otherwise, we didn't get either the pre- or post-operation
329 337 * attributes. Simply mark the attribute cache as timed out so
330 338 * the next reference will cause a GETATTR to the server to
331 339 * refresh with the current attributes.
332 340 *
333 341 * If an error occurred trying to convert the over the wire
334 342 * attributes to a vattr, then simply mark the attribute cache as
335 343 * timed out.
336 344 */
337 345 void
338 346 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
339 347 {
340 348 vattr_t bva;
341 349 vattr_t ava;
342 350
343 351 if (wccp->after.attributes) {
344 352 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
345 353 PURGE_ATTRCACHE(vp);
346 354 return;
347 355 }
348 356 if (wccp->before.attributes) {
349 357 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
350 358 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
351 359 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
352 360 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
353 361 bva.va_size = wccp->before.attr.size;
354 362 nfs3_attr_cache(vp, &bva, &ava, t, cr);
355 363 } else
356 364 nfs_attr_cache(vp, &ava, t, cr);
357 365 } else {
358 366 PURGE_ATTRCACHE(vp);
359 367 }
360 368 }
361 369
362 370 /*
363 371 * Set attributes cache for given vnode using nfsattr.
364 372 *
365 373 * This routine does not do cache validation with the attributes.
366 374 *
367 375 * If an error occurred trying to convert the over the wire
368 376 * attributes to a vattr, then simply mark the attribute cache as
369 377 * timed out.
370 378 */
371 379 void
372 380 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
373 381 {
374 382 rnode_t *rp;
375 383 struct vattr va;
376 384
377 385 if (!nattr_to_vattr(vp, na, &va)) {
378 386 rp = VTOR(vp);
379 387 mutex_enter(&rp->r_statelock);
380 388 if (rp->r_mtime <= t)
381 389 nfs_attrcache_va(vp, &va);
382 390 mutex_exit(&rp->r_statelock);
383 391 } else {
384 392 PURGE_ATTRCACHE(vp);
385 393 }
386 394 }
387 395
388 396 /*
389 397 * Set attributes cache for given vnode using fattr3.
390 398 *
391 399 * This routine does not do cache validation with the attributes.
392 400 *
393 401 * If an error occurred trying to convert the over the wire
394 402 * attributes to a vattr, then simply mark the attribute cache as
395 403 * timed out.
396 404 */
397 405 void
398 406 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
399 407 {
400 408 rnode_t *rp;
401 409 struct vattr va;
402 410
403 411 if (!fattr3_to_vattr(vp, na, &va)) {
404 412 rp = VTOR(vp);
405 413 mutex_enter(&rp->r_statelock);
406 414 if (rp->r_mtime <= t)
407 415 nfs_attrcache_va(vp, &va);
408 416 mutex_exit(&rp->r_statelock);
409 417 } else {
410 418 PURGE_ATTRCACHE(vp);
411 419 }
412 420 }
413 421
414 422 /*
415 423 * Do a cache check based on attributes returned over the wire. The
416 424 * new attributes are cached.
417 425 *
418 426 * If an error occurred trying to convert the over the wire attributes
419 427 * to a vattr, then just return that error.
420 428 *
421 429 * As a side effect, the vattr argument is filled in with the converted
422 430 * attributes.
423 431 */
424 432 int
425 433 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
426 434 cred_t *cr)
427 435 {
428 436 int error;
429 437
430 438 error = nattr_to_vattr(vp, na, vap);
431 439 if (error)
432 440 return (error);
433 441 nfs_attr_cache(vp, vap, t, cr);
434 442 return (0);
435 443 }
436 444
437 445 /*
438 446 * Do a cache check based on attributes returned over the wire. The
439 447 * new attributes are cached.
440 448 *
441 449 * If an error occurred trying to convert the over the wire attributes
442 450 * to a vattr, then just return that error.
443 451 *
444 452 * As a side effect, the vattr argument is filled in with the converted
445 453 * attributes.
446 454 */
447 455 int
448 456 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
449 457 {
450 458 int error;
451 459
452 460 error = fattr3_to_vattr(vp, na, vap);
453 461 if (error)
454 462 return (error);
455 463 nfs_attr_cache(vp, vap, t, cr);
456 464 return (0);
457 465 }
458 466
459 467 /*
460 468 * Use the passed in virtual attributes to check to see whether the
461 469 * data and metadata caches are valid, cache the new attributes, and
462 470 * then do the cache invalidation if required.
463 471 *
464 472 * The cache validation and caching of the new attributes is done
465 473 * atomically via the use of the mutex, r_statelock. If required,
466 474 * the cache invalidation is done atomically w.r.t. the cache
467 475 * validation and caching of the attributes via the pseudo lock,
468 476 * r_serial.
469 477 *
470 478 * This routine is used to do cache validation and attributes caching
471 479 * for operations with a single set of post operation attributes.
472 480 */
473 481 void
474 482 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
475 483 {
476 484 rnode_t *rp;
477 485 int mtime_changed = 0;
478 486 int ctime_changed = 0;
479 487 vsecattr_t *vsp;
480 488 int was_serial;
481 489 len_t preattr_rsize;
482 490 boolean_t writeattr_set = B_FALSE;
483 491 boolean_t cachepurge_set = B_FALSE;
484 492
485 493 rp = VTOR(vp);
486 494
487 495 mutex_enter(&rp->r_statelock);
488 496
489 497 if (rp->r_serial != curthread) {
490 498 klwp_t *lwp = ttolwp(curthread);
491 499
492 500 was_serial = 0;
493 501 if (lwp != NULL)
494 502 lwp->lwp_nostop++;
495 503 while (rp->r_serial != NULL) {
496 504 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
497 505 mutex_exit(&rp->r_statelock);
498 506 if (lwp != NULL)
499 507 lwp->lwp_nostop--;
500 508 return;
501 509 }
502 510 }
503 511 if (lwp != NULL)
504 512 lwp->lwp_nostop--;
505 513 } else
506 514 was_serial = 1;
507 515
508 516 if (rp->r_mtime > t) {
509 517 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
510 518 PURGE_ATTRCACHE_LOCKED(rp);
511 519 mutex_exit(&rp->r_statelock);
512 520 return;
513 521 }
514 522
515 523 /*
516 524 * Write thread after writing data to file on remote server,
517 525 * will always set RWRITEATTR to indicate that file on remote
518 526 * server was modified with a WRITE operation and would have
519 527 * marked attribute cache as timed out. If RWRITEATTR
520 528 * is set, then do not check for mtime and ctime change.
521 529 */
522 530 if (!(rp->r_flags & RWRITEATTR)) {
523 531 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
524 532 mtime_changed = 1;
525 533
526 534 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
527 535 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
528 536 ctime_changed = 1;
529 537 } else {
530 538 writeattr_set = B_TRUE;
531 539 }
532 540
533 541 preattr_rsize = rp->r_size;
534 542
535 543 nfs_attrcache_va(vp, vap);
536 544
537 545 /*
538 546 * If we have updated filesize in nfs_attrcache_va, as soon as we
539 547 * drop statelock we will be in transition of purging all
540 548 * our caches and updating them. It is possible for another
541 549 * thread to pick this new file size and read in zeroed data.
542 550 * Stall other threads till cache purge is complete.
543 551 */
544 552 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
545 553 /*
546 554 * If RWRITEATTR was set and we have updated the file
547 555 * size, Server's returned file size need not necessarily
548 556 * be because of this Client's WRITE. We need to purge
549 557 * all caches.
550 558 */
551 559 if (writeattr_set)
552 560 mtime_changed = 1;
553 561
554 562 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
555 563 rp->r_flags |= RINCACHEPURGE;
556 564 cachepurge_set = B_TRUE;
557 565 }
558 566 }
559 567
560 568 if (!mtime_changed && !ctime_changed) {
561 569 mutex_exit(&rp->r_statelock);
562 570 return;
563 571 }
564 572
565 573 rp->r_serial = curthread;
566 574
567 575 mutex_exit(&rp->r_statelock);
568 576
569 577 if (mtime_changed)
570 578 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
571 579
572 580 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
573 581 mutex_enter(&rp->r_statelock);
574 582 rp->r_flags &= ~RINCACHEPURGE;
575 583 cv_broadcast(&rp->r_cv);
576 584 mutex_exit(&rp->r_statelock);
577 585 cachepurge_set = B_FALSE;
578 586 }
579 587
580 588 if (ctime_changed) {
581 589 (void) nfs_access_purge_rp(rp);
582 590 if (rp->r_secattr != NULL) {
583 591 mutex_enter(&rp->r_statelock);
584 592 vsp = rp->r_secattr;
585 593 rp->r_secattr = NULL;
586 594 mutex_exit(&rp->r_statelock);
587 595 if (vsp != NULL)
588 596 nfs_acl_free(vsp);
589 597 }
590 598 }
591 599
592 600 if (!was_serial) {
593 601 mutex_enter(&rp->r_statelock);
594 602 rp->r_serial = NULL;
595 603 cv_broadcast(&rp->r_cv);
596 604 mutex_exit(&rp->r_statelock);
597 605 }
598 606 }
599 607
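The r_serial "pseudo lock" used by nfs_attr_cache() above is just an owner pointer guarded by r_statelock: the invalidating thread publishes itself as owner, drops the mutex for the duration of the (potentially blocking) cache purge, and other threads wait on r_cv until the owner clears the field. Stripped of the signal handling and lwp_nostop bookkeeping, the pattern is roughly the following sketch (not code from this file):

    /* Acquire: wait until nobody else is serializing, then claim it. */
    mutex_enter(&rp->r_statelock);
    while (rp->r_serial != NULL)
            cv_wait(&rp->r_cv, &rp->r_statelock);
    rp->r_serial = curthread;
    mutex_exit(&rp->r_statelock);

    /* ... long-running cache purge, done without holding r_statelock ... */

    /* Release: clear the owner and wake any waiters. */
    mutex_enter(&rp->r_statelock);
    rp->r_serial = NULL;
    cv_broadcast(&rp->r_cv);
    mutex_exit(&rp->r_statelock);
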
600 608 /*
601 609 * Use the passed in "before" virtual attributes to check to see
602 610 * whether the data and metadata caches are valid, cache the "after"
603 611 * new attributes, and then do the cache invalidation if required.
604 612 *
605 613 * The cache validation and caching of the new attributes is done
606 614 * atomically via the use of the mutex, r_statelock. If required,
607 615 * the cache invalidation is done atomically w.r.t. the cache
608 616 * validation and caching of the attributes via the pseudo lock,
609 617 * r_serial.
610 618 *
611 619 * This routine is used to do cache validation and attributes caching
612 620 * for operations with both pre operation attributes and post operation
613 621 * attributes.
614 622 */
615 623 static void
616 624 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
617 625 cred_t *cr)
618 626 {
619 627 rnode_t *rp;
620 628 int mtime_changed = 0;
621 629 int ctime_changed = 0;
622 630 vsecattr_t *vsp;
623 631 int was_serial;
624 632 len_t preattr_rsize;
625 633 boolean_t writeattr_set = B_FALSE;
626 634 boolean_t cachepurge_set = B_FALSE;
627 635
628 636 rp = VTOR(vp);
629 637
630 638 mutex_enter(&rp->r_statelock);
631 639
632 640 if (rp->r_serial != curthread) {
633 641 klwp_t *lwp = ttolwp(curthread);
634 642
635 643 was_serial = 0;
636 644 if (lwp != NULL)
637 645 lwp->lwp_nostop++;
638 646 while (rp->r_serial != NULL) {
639 647 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
640 648 mutex_exit(&rp->r_statelock);
641 649 if (lwp != NULL)
642 650 lwp->lwp_nostop--;
643 651 return;
644 652 }
645 653 }
646 654 if (lwp != NULL)
647 655 lwp->lwp_nostop--;
648 656 } else
649 657 was_serial = 1;
650 658
651 659 if (rp->r_mtime > t) {
652 660 if (!CACHE_VALID(rp, avap->va_mtime, avap->va_size))
653 661 PURGE_ATTRCACHE_LOCKED(rp);
654 662 mutex_exit(&rp->r_statelock);
655 663 return;
656 664 }
657 665
658 666 /*
659 667 * Write thread after writing data to file on remote server,
660 668 * will always set RWRITEATTR to indicate that file on remote
661 669 * server was modified with a WRITE operation and would have
662 670 * marked attribute cache as timed out. If RWRITEATTR
663 671 * is set, then do not check for mtime and ctime change.
664 672 */
665 673 if (!(rp->r_flags & RWRITEATTR)) {
666 674 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
667 675 mtime_changed = 1;
668 676
669 677 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
670 678 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
671 679 ctime_changed = 1;
672 680 } else {
673 681 writeattr_set = B_TRUE;
674 682 }
675 683
676 684 preattr_rsize = rp->r_size;
677 685
678 686 nfs_attrcache_va(vp, avap);
679 687
680 688 /*
681 689 * If we have updated filesize in nfs_attrcache_va, as soon as we
682 690 * drop statelock we will be in transition of purging all
683 691 * our caches and updating them. It is possible for another
684 692 * thread to pick this new file size and read in zeroed data.
685 693 * Stall other threads till cache purge is complete.
686 694 */
687 695 if ((vp->v_type == VREG) && (rp->r_size != preattr_rsize)) {
688 696 /*
689 697 * If RWRITEATTR was set and we have updated the file
690 698 * size, Server's returned file size need not necessarily
691 699 * be because of this Client's WRITE. We need to purge
692 700 * all caches.
693 701 */
694 702 if (writeattr_set)
695 703 mtime_changed = 1;
696 704
697 705 if (mtime_changed && !(rp->r_flags & RINCACHEPURGE)) {
698 706 rp->r_flags |= RINCACHEPURGE;
699 707 cachepurge_set = B_TRUE;
700 708 }
701 709 }
702 710
703 711 if (!mtime_changed && !ctime_changed) {
704 712 mutex_exit(&rp->r_statelock);
705 713 return;
706 714 }
707 715
708 716 rp->r_serial = curthread;
709 717
710 718 mutex_exit(&rp->r_statelock);
711 719
712 720 if (mtime_changed)
713 721 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
714 722
715 723 if ((rp->r_flags & RINCACHEPURGE) && cachepurge_set) {
716 724 mutex_enter(&rp->r_statelock);
717 725 rp->r_flags &= ~RINCACHEPURGE;
718 726 cv_broadcast(&rp->r_cv);
719 727 mutex_exit(&rp->r_statelock);
720 728 cachepurge_set = B_FALSE;
721 729 }
722 730
723 731 if (ctime_changed) {
724 732 (void) nfs_access_purge_rp(rp);
725 733 if (rp->r_secattr != NULL) {
726 734 mutex_enter(&rp->r_statelock);
727 735 vsp = rp->r_secattr;
728 736 rp->r_secattr = NULL;
729 737 mutex_exit(&rp->r_statelock);
730 738 if (vsp != NULL)
731 739 nfs_acl_free(vsp);
732 740 }
733 741 }
734 742
735 743 if (!was_serial) {
736 744 mutex_enter(&rp->r_statelock);
737 745 rp->r_serial = NULL;
738 746 cv_broadcast(&rp->r_cv);
739 747 mutex_exit(&rp->r_statelock);
740 748 }
741 749 }
742 750
743 751 /*
744 752 * Set attributes cache for given vnode using virtual attributes.
745 753 *
746 754 * Set the timeout value on the attribute cache and fill it
747 755 * with the passed in attributes.
748 756 *
749 757 * The caller must be holding r_statelock.
750 758 */
751 759 void
752 760 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
753 761 {
754 762 rnode_t *rp;
755 763 mntinfo_t *mi;
756 764 hrtime_t delta;
757 765 hrtime_t now;
758 766
759 767 rp = VTOR(vp);
760 768
761 769 ASSERT(MUTEX_HELD(&rp->r_statelock));
762 770
763 771 now = gethrtime();
764 772
765 773 mi = VTOMI(vp);
766 774
767 775 /*
768 776 * Delta is the number of nanoseconds that we will
769 777 * cache the attributes of the file. It is based on
770 778 * the number of nanoseconds since the last time that
771 779 * we detected a change. The assumption is that files
772 780 * that changed recently are likely to change again.
773 781 * There is a minimum and a maximum for regular files
774 782 * and for directories which are enforced, though.
775 783 *
776 784 * Using the time since last change was detected
777 785 * eliminates direct comparison or calculation
778 786 * using mixed client and server times. NFS does
779 787 * not make any assumptions regarding the client
780 788 * and server clocks being synchronized.
781 789 */
782 790 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
783 791 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
784 792 va->va_size != rp->r_attr.va_size)
785 793 rp->r_mtime = now;
786 794
787 795 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
788 796 delta = 0;
789 797 else {
790 798 delta = now - rp->r_mtime;
791 799 if (vp->v_type == VDIR) {
792 800 if (delta < mi->mi_acdirmin)
793 801 delta = mi->mi_acdirmin;
794 802 else if (delta > mi->mi_acdirmax)
795 803 delta = mi->mi_acdirmax;
796 804 } else {
797 805 if (delta < mi->mi_acregmin)
798 806 delta = mi->mi_acregmin;
799 807 else if (delta > mi->mi_acregmax)
800 808 delta = mi->mi_acregmax;
801 809 }
802 810 }
803 811 rp->r_attrtime = now + delta;
804 812 rp->r_attr = *va;
805 813 /*
806 814 * Update the size of the file if there is no cached data or if
807 815 * the cached data is clean and there is no data being written
808 816 * out.
809 817 */
810 818 if (rp->r_size != va->va_size &&
811 819 (!vn_has_cached_data(vp) ||
812 820 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
813 821 rp->r_size = va->va_size;
814 822 nfs_setswaplike(vp, va);
815 823 rp->r_flags &= ~RWRITEATTR;
816 824 }
817 825
818 826 /*
819 827 * Fill in attribute from the cache.
820 828 * If valid, then return 0 to indicate that no error occurred,
821 829 * otherwise return 1 to indicate that an error occurred.
822 830 */
823 831 static int
824 832 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
825 833 {
826 834 rnode_t *rp;
827 835 uint_t mask = vap->va_mask;
828 836
829 837 rp = VTOR(vp);
830 838 mutex_enter(&rp->r_statelock);
831 839 if (ATTRCACHE_VALID(vp)) {
832 840 /*
833 841 * Cached attributes are valid
834 842 */
835 843 *vap = rp->r_attr;
836 844 /*
837 845 * Set the caller's va_mask to the set of attributes
838 846 * that were requested ANDed with the attributes that
839 847 * are available. If attributes were requested that
840 848 * are not available, those bits must be turned off
841 849 * in the caller's va_mask.
842 850 */
843 851 vap->va_mask &= mask;
844 852 mutex_exit(&rp->r_statelock);
845 853 return (0);
846 854 }
847 855 mutex_exit(&rp->r_statelock);
848 856 return (1);
849 857 }
850 858
851 859 /*
852 860 * Get attributes over-the-wire and update attributes cache
853 861 * if no error occurred in the over-the-wire operation.
854 862 * Return 0 if successful, otherwise error.
855 863 */
856 864 int
857 865 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
858 866 {
859 867 int error;
860 868 struct nfsattrstat ns;
861 869 int douprintf;
862 870 mntinfo_t *mi;
863 871 failinfo_t fi;
864 872 hrtime_t t;
865 873
866 874 mi = VTOMI(vp);
867 875 fi.vp = vp;
868 876 fi.fhp = NULL; /* no need to update, filehandle not copied */
869 877 fi.copyproc = nfscopyfh;
870 878 fi.lookupproc = nfslookup;
871 879 fi.xattrdirproc = acl_getxattrdir2;
872 880
873 881 if (mi->mi_flags & MI_ACL) {
874 882 error = acl_getattr2_otw(vp, vap, cr);
875 883 if (mi->mi_flags & MI_ACL)
876 884 return (error);
877 885 }
878 886
879 887 douprintf = 1;
880 888
881 889 t = gethrtime();
882 890
883 891 error = rfs2call(mi, RFS_GETATTR,
884 892 xdr_fhandle, (caddr_t)VTOFH(vp),
885 893 xdr_attrstat, (caddr_t)&ns, cr,
886 894 &douprintf, &ns.ns_status, 0, &fi);
887 895
888 896 if (!error) {
889 897 error = geterrno(ns.ns_status);
890 898 if (!error)
891 899 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
892 900 else {
893 901 PURGE_STALE_FH(error, vp, cr);
894 902 }
895 903 }
896 904
897 905 return (error);
898 906 }
899 907
900 908 /*
901 909 * Return either cached or remote attributes. If get remote attr
902 910 * use them to check and invalidate caches, then cache the new attributes.
903 911 */
904 912 int
905 913 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
906 914 {
907 915 int error;
908 916 rnode_t *rp;
909 917
910 918 /*
911 919 * If we've got cached attributes, we're done, otherwise go
912 920 * to the server to get attributes, which will update the cache
913 921 * in the process.
914 922 */
915 923 error = nfs_getattr_cache(vp, vap);
916 924 if (error)
917 925 error = nfs_getattr_otw(vp, vap, cr);
918 926
919 927 /* Return the client's view of file size */
920 928 rp = VTOR(vp);
921 929 mutex_enter(&rp->r_statelock);
922 930 vap->va_size = rp->r_size;
923 931 mutex_exit(&rp->r_statelock);
924 932
925 933 return (error);
926 934 }
927 935
928 936 /*
929 937 * Get attributes over-the-wire and update attributes cache
930 938 * if no error occurred in the over-the-wire operation.
931 939 * Return 0 if successful, otherwise error.
932 940 */
933 941 int
934 942 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
935 943 {
936 944 int error;
937 945 GETATTR3args args;
938 946 GETATTR3vres res;
939 947 int douprintf;
940 948 failinfo_t fi;
941 949 hrtime_t t;
942 950
943 951 args.object = *VTOFH3(vp);
944 952 fi.vp = vp;
945 953 fi.fhp = (caddr_t)&args.object;
946 954 fi.copyproc = nfs3copyfh;
947 955 fi.lookupproc = nfs3lookup;
948 956 fi.xattrdirproc = acl_getxattrdir3;
949 957 res.fres.vp = vp;
950 958 res.fres.vap = vap;
951 959
952 960 douprintf = 1;
953 961
954 962 t = gethrtime();
955 963
956 964 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
957 965 xdr_nfs_fh3, (caddr_t)&args,
958 966 xdr_GETATTR3vres, (caddr_t)&res, cr,
959 967 &douprintf, &res.status, 0, &fi);
960 968
961 969 if (error)
962 970 return (error);
963 971
964 972 error = geterrno3(res.status);
965 973 if (error) {
966 974 PURGE_STALE_FH(error, vp, cr);
967 975 return (error);
968 976 }
969 977
970 978 /*
971 979 * Catch status codes that indicate fattr3 to vattr translation failure
972 980 */
973 981 if (res.fres.status)
974 982 return (res.fres.status);
975 983
976 984 nfs_attr_cache(vp, vap, t, cr);
977 985 return (0);
978 986 }
979 987
980 988 /*
981 989 * Return either cached or remote attributes. If get remote attr
982 990 * use them to check and invalidate caches, then cache the new attributes.
983 991 */
984 992 int
985 993 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
986 994 {
987 995 int error;
988 996 rnode_t *rp;
989 997
990 998 /*
991 999 * If we've got cached attributes, we're done, otherwise go
992 1000 * to the server to get attributes, which will update the cache
993 1001 * in the process.
994 1002 */
995 1003 error = nfs_getattr_cache(vp, vap);
996 1004 if (error)
997 1005 error = nfs3_getattr_otw(vp, vap, cr);
998 1006
999 1007 /* Return the client's view of file size */
1000 1008 rp = VTOR(vp);
1001 1009 mutex_enter(&rp->r_statelock);
1002 1010 vap->va_size = rp->r_size;
1003 1011 mutex_exit(&rp->r_statelock);
1004 1012
1005 1013 return (error);
1006 1014 }
1007 1015
1008 1016 vtype_t nf_to_vt[] = {
1009 1017 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
1010 1018 };
1011 1019 /*
1012 1020 * Convert NFS Version 2 over the network attributes to the local
1013 1021 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1014 1022 * network representation and the local representation is done here.
1015 1023 * Returns 0 for success, error if failed due to overflow.
1016 1024 */
1017 1025 int
1018 1026 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
1019 1027 {
1020 1028 /* overflow in time attributes? */
1021 1029 #ifndef _LP64
1022 1030 if (!NFS2_FATTR_TIME_OK(na))
1023 1031 return (EOVERFLOW);
1024 1032 #endif
1025 1033
1026 1034 vap->va_mask = AT_ALL;
1027 1035
1028 1036 if (na->na_type < NFNON || na->na_type > NFSOC)
1029 1037 vap->va_type = VBAD;
1030 1038 else
1031 1039 vap->va_type = nf_to_vt[na->na_type];
1032 1040 vap->va_mode = na->na_mode;
1033 1041 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
1034 1042 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
1035 1043 vap->va_fsid = vp->v_vfsp->vfs_dev;
1036 1044 vap->va_nodeid = na->na_nodeid;
1037 1045 vap->va_nlink = na->na_nlink;
1038 1046 vap->va_size = na->na_size; /* keep for cache validation */
1039 1047 /*
1040 1048 * nfs protocol defines times as unsigned so don't extend sign,
1041 1049 * unless sysadmin set nfs_allow_preepoch_time.
1042 1050 */
1043 1051 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
1044 1052 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
1045 1053 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
1046 1054 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
1047 1055 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
1048 1056 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
1049 1057 /*
1050 1058 * Shannon's law - uncompress the received dev_t
1051 1059 * if the top half of it is zero indicating a response
1052 1060 * from an `older style' OS. Except for when it is a
1053 1061 * `new style' OS sending the maj device of zero,
1054 1062 * in which case the algorithm still works because the
1055 1063 * fact that it is a new style server
1056 1064 * is hidden by the minor device not being greater
1057 1065 * than 255 (a requirement in this case).
1058 1066 */
1059 1067 if ((na->na_rdev & 0xffff0000) == 0)
1060 1068 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1061 1069 else
1062 1070 vap->va_rdev = expldev(na->na_rdev);
1063 1071
1064 1072 vap->va_nblocks = na->na_blocks;
1065 1073 switch (na->na_type) {
1066 1074 case NFBLK:
1067 1075 vap->va_blksize = DEV_BSIZE;
1068 1076 break;
1069 1077
1070 1078 case NFCHR:
1071 1079 vap->va_blksize = MAXBSIZE;
1072 1080 break;
1073 1081
1074 1082 case NFSOC:
1075 1083 default:
1076 1084 vap->va_blksize = na->na_blocksize;
1077 1085 break;
1078 1086 }
1079 1087 /*
1080 1088 * This bit of ugliness is a hack to preserve the
1081 1089 * over-the-wire protocols for named-pipe vnodes.
1082 1090 * It remaps the special over-the-wire type to the
1083 1091 * VFIFO type. (see note in nfs.h)
1084 1092 */
1085 1093 if (NA_ISFIFO(na)) {
1086 1094 vap->va_type = VFIFO;
1087 1095 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1088 1096 vap->va_rdev = 0;
1089 1097 vap->va_blksize = na->na_blocksize;
1090 1098 }
1091 1099 vap->va_seq = 0;
1092 1100 return (0);
1093 1101 }
1094 1102
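The "Shannon's law" comment in nattr_to_vattr() above refers to the old 16-bit dev_t encoding, in which the major number occupied the upper 8 bits and the minor number the lower 8. A rough sketch of the expansion idea (illustrative only; the real conversions are done by nfsv2_expdev() and expldev()):

    /* Expand an old-style 16-bit device number into a local dev_t. */
    static dev_t
    example_expdev(uint32_t odev)
    {
            return (makedevice((odev >> 8) & 0xff, odev & 0xff));
    }
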
1095 1103 /*
1096 1104 * Convert NFS Version 3 over the network attributes to the local
1097 1105 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1098 1106 * network representation and the local representation is done here.
1099 1107 */
1100 1108 vtype_t nf3_to_vt[] = {
1101 1109 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1102 1110 };
1103 1111
1104 1112 int
1105 1113 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1106 1114 {
1107 1115
1108 1116 #ifndef _LP64
1109 1117 /* overflow in time attributes? */
1110 1118 if (!NFS3_FATTR_TIME_OK(na))
1111 1119 return (EOVERFLOW);
1112 1120 #endif
1113 1121 if (!NFS3_SIZE_OK(na->size))
1114 1122 /* file too big */
1115 1123 return (EFBIG);
1116 1124
1117 1125 vap->va_mask = AT_ALL;
1118 1126
1119 1127 if (na->type < NF3REG || na->type > NF3FIFO)
1120 1128 vap->va_type = VBAD;
1121 1129 else
1122 1130 vap->va_type = nf3_to_vt[na->type];
1123 1131 vap->va_mode = na->mode;
1124 1132 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1125 1133 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1126 1134 vap->va_fsid = vp->v_vfsp->vfs_dev;
1127 1135 vap->va_nodeid = na->fileid;
1128 1136 vap->va_nlink = na->nlink;
1129 1137 vap->va_size = na->size;
1130 1138
1131 1139 /*
1132 1140 * nfs protocol defines times as unsigned so don't extend sign,
1133 1141 * unless sysadmin set nfs_allow_preepoch_time.
1134 1142 */
1135 1143 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1136 1144 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1137 1145 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1138 1146 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1139 1147 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1140 1148 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1141 1149
1142 1150 switch (na->type) {
1143 1151 case NF3BLK:
1144 1152 vap->va_rdev = makedevice(na->rdev.specdata1,
1145 1153 na->rdev.specdata2);
1146 1154 vap->va_blksize = DEV_BSIZE;
1147 1155 vap->va_nblocks = 0;
1148 1156 break;
1149 1157 case NF3CHR:
1150 1158 vap->va_rdev = makedevice(na->rdev.specdata1,
1151 1159 na->rdev.specdata2);
1152 1160 vap->va_blksize = MAXBSIZE;
1153 1161 vap->va_nblocks = 0;
1154 1162 break;
1155 1163 case NF3REG:
1156 1164 case NF3DIR:
1157 1165 case NF3LNK:
1158 1166 vap->va_rdev = 0;
1159 1167 vap->va_blksize = MAXBSIZE;
1160 1168 vap->va_nblocks = (u_longlong_t)
1161 1169 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1162 1170 (size3)DEV_BSIZE);
1163 1171 break;
1164 1172 case NF3SOCK:
1165 1173 case NF3FIFO:
1166 1174 default:
1167 1175 vap->va_rdev = 0;
1168 1176 vap->va_blksize = MAXBSIZE;
1169 1177 vap->va_nblocks = 0;
1170 1178 break;
1171 1179 }
1172 1180 vap->va_seq = 0;
1173 1181 return (0);
(1101 lines elided)
1174 1182 }
1175 1183
1176 1184 /*
1177 1185 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1178 1186 * for the demand-based allocation of async threads per-mount. The
1179 1187 * nfs_async_timeout is the amount of time a thread will live after it
1180 1188 * becomes idle, unless new I/O requests are received before the thread
1181 1189 * dies. See nfs_async_putpage and nfs_async_start.
1182 1190 */
1183 1191
1184 -int nfs_async_timeout = -1; /* uninitialized */
1192 +volatile int nfs_async_timeout = -1; /* uninitialized */
1185 1193
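The volatile qualifier added above (the "volatile keyword" change referenced by re #13613 in the header) keeps the compiler from caching the tunable in a register, so a value patched at run time, for example with mdb -kw, is seen by code that rereads the variable. A minimal illustration using a hypothetical tunable:

    volatile int example_tunable = -1;      /* hypothetical, for illustration */

    static void
    example_wait_for_tuning(void)
    {
            /*
             * Each loop iteration reloads example_tunable from memory;
             * without volatile the compiler could legally hoist the load
             * and spin on a stale register copy.
             */
            while (example_tunable == -1)
                    delay(1);
    }
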
1186 1194 static void nfs_async_start(struct vfs *);
1187 1195 static void nfs_async_pgops_start(struct vfs *);
1188 1196 static void nfs_async_common_start(struct vfs *, int);
1189 1197
1190 1198 static void
1191 1199 free_async_args(struct nfs_async_reqs *args)
1192 1200 {
1193 1201 rnode_t *rp;
1194 1202
1195 1203 if (args->a_io != NFS_INACTIVE) {
1196 1204 rp = VTOR(args->a_vp);
1197 1205 mutex_enter(&rp->r_statelock);
1198 1206 rp->r_count--;
1199 1207 if (args->a_io == NFS_PUTAPAGE ||
1200 1208 args->a_io == NFS_PAGEIO)
1201 1209 rp->r_awcount--;
1202 1210 cv_broadcast(&rp->r_cv);
1203 1211 mutex_exit(&rp->r_statelock);
1204 1212 VN_RELE(args->a_vp);
1205 1213 }
1206 1214 crfree(args->a_cred);
1207 1215 kmem_free(args, sizeof (*args));
1208 1216 }
1209 1217
1210 1218 /*
1211 1219 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1212 1220 * pageout(), running in the global zone, have legitimate reasons to do
1213 1221 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1214 1222 * use of a per-mount "asynchronous requests manager thread" which is
1215 1223 * signaled by the various asynchronous work routines when there is
1216 1224 * asynchronous work to be done. It is responsible for creating new
1217 1225 * worker threads if necessary, and notifying existing worker threads
1218 1226 * that there is work to be done.
1219 1227 *
1220 1228 * In other words, it will "take the specifications from the customers and
1221 1229 * give them to the engineers."
1222 1230 *
1223 1231 * Worker threads die off of their own accord if they are no longer
1224 1232 * needed.
1225 1233 *
1226 1234 * This thread is killed when the zone is going away or the filesystem
1227 1235 * is being unmounted.
1228 1236 */
1229 1237 void
1230 1238 nfs_async_manager(vfs_t *vfsp)
1231 1239 {
1232 1240 callb_cpr_t cprinfo;
1233 1241 mntinfo_t *mi;
1234 1242 uint_t max_threads;
1235 1243
1236 1244 mi = VFTOMI(vfsp);
1237 1245
1238 1246 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1239 1247 "nfs_async_manager");
1240 1248
1241 1249 mutex_enter(&mi->mi_async_lock);
1242 1250 /*
1243 1251 * We want to stash the max number of threads that this mount was
1244 1252 * allowed so we can use it later when the variable is set to zero as
1245 1253 * part of the zone/mount going away.
1246 1254 *
1247 1255 * We want to be able to create at least one thread to handle
1248 1256 * asynchronous inactive calls.
1249 1257 */
1250 1258 max_threads = MAX(mi->mi_max_threads, 1);
1251 1259 /*
1252 1260 * We don't want to wait for mi_max_threads to go to zero, since that
1253 1261 * happens as part of a failed unmount, but this thread should only
1254 1262 * exit when the mount/zone is really going away.
1255 1263 *
1256 1264 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1257 1265 * attempted: the various _async_*() functions know to do things
1258 1266 * inline if mi_max_threads == 0. Henceforth we just drain out the
1259 1267 * outstanding requests.
1260 1268 *
1261 1269 * Note that we still create zthreads even if we notice the zone is
1262 1270 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1263 1271 * shutdown sequence to take slightly longer in some cases, but
1264 1272 * doesn't violate the protocol, as all threads will exit as soon as
1265 1273 * they're done processing the remaining requests.
1266 1274 */
1267 1275 for (;;) {
1268 1276 while (mi->mi_async_req_count > 0) {
1269 1277 /*
1270 1278 * Paranoia: If the mount started out having
1271 1279 * (mi->mi_max_threads == 0), and the value was
1272 1280 * later changed (via a debugger or somesuch),
1273 1281 * we could be confused since we will think we
1274 1282 * can't create any threads, and the calling
1275 1283 * code (which looks at the current value of
1276 1284 * mi->mi_max_threads, now non-zero) thinks we
1277 1285 * can.
1278 1286 *
1279 1287 * So, because we're paranoid, we create threads
1280 1288 * up to the maximum of the original and the
1281 1289 * current value. This means that future
1282 1290 * (debugger-induced) lowerings of
1283 1291 * mi->mi_max_threads are ignored for our
1284 1292 * purposes, but who told them they could change
1285 1293 * random values on a live kernel anyhow?
1286 1294 */
1287 1295 if (mi->mi_threads[NFS_ASYNC_QUEUE] <
1288 1296 MAX(mi->mi_max_threads, max_threads)) {
1289 1297 mi->mi_threads[NFS_ASYNC_QUEUE]++;
1290 1298 mutex_exit(&mi->mi_async_lock);
1291 1299 VFS_HOLD(vfsp); /* hold for new thread */
1292 1300 (void) zthread_create(NULL, 0, nfs_async_start,
1293 1301 vfsp, 0, minclsyspri);
1294 1302 mutex_enter(&mi->mi_async_lock);
1295 1303 } else if (mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] <
1296 1304 NUM_ASYNC_PGOPS_THREADS) {
1297 1305 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE]++;
1298 1306 mutex_exit(&mi->mi_async_lock);
1299 1307 VFS_HOLD(vfsp); /* hold for new thread */
1300 1308 (void) zthread_create(NULL, 0,
1301 1309 nfs_async_pgops_start, vfsp, 0,
1302 1310 minclsyspri);
1303 1311 mutex_enter(&mi->mi_async_lock);
1304 1312 }
1305 1313 NFS_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1306 1314 ASSERT(mi->mi_async_req_count != 0);
1307 1315 mi->mi_async_req_count--;
1308 1316 }
1309 1317
1310 1318 mutex_enter(&mi->mi_lock);
1311 1319 if (mi->mi_flags & MI_ASYNC_MGR_STOP) {
1312 1320 mutex_exit(&mi->mi_lock);
1313 1321 break;
1314 1322 }
1315 1323 mutex_exit(&mi->mi_lock);
1316 1324
1317 1325 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318 1326 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1319 1327 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1320 1328 }
1321 1329 /*
1322 1330 * Let everyone know we're done.
1323 1331 */
1324 1332 mi->mi_manager_thread = NULL;
1325 1333 cv_broadcast(&mi->mi_async_cv);
1326 1334
1327 1335 /*
1328 1336 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1329 1337 * since CALLB_CPR_EXIT is actually responsible for releasing
1330 1338 * 'mi_async_lock'.
1331 1339 */
1332 1340 CALLB_CPR_EXIT(&cprinfo);
1333 1341 VFS_RELE(vfsp); /* release thread's hold */
1334 1342 zthread_exit();
1335 1343 }
1336 1344
1337 1345 /*
1338 1346 * Signal (and wait for) the async manager thread to clean up and go away.
1339 1347 */
1340 1348 void
1341 1349 nfs_async_manager_stop(vfs_t *vfsp)
1342 1350 {
1343 1351 mntinfo_t *mi = VFTOMI(vfsp);
1344 1352
1345 1353 mutex_enter(&mi->mi_async_lock);
1346 1354 mutex_enter(&mi->mi_lock);
(152 lines elided)
1347 1355 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1348 1356 mutex_exit(&mi->mi_lock);
1349 1357 cv_broadcast(&mi->mi_async_reqs_cv);
1350 1358 while (mi->mi_manager_thread != NULL)
1351 1359 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1352 1360 mutex_exit(&mi->mi_async_lock);
1353 1361 }
1354 1362
1355 1363 int
1356 1364 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1357 - struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1358 - u_offset_t, caddr_t, struct seg *, cred_t *))
1365 + struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *, u_offset_t,
1366 + caddr_t, struct seg *, cred_t *))
1359 1367 {
1360 1368 rnode_t *rp;
1361 1369 mntinfo_t *mi;
1362 1370 struct nfs_async_reqs *args;
1363 1371
1364 1372 rp = VTOR(vp);
1365 1373 ASSERT(rp->r_freef == NULL);
1366 1374
1367 1375 mi = VTOMI(vp);
1368 1376
1369 1377 /*
1370 1378 * If addr falls in a different segment, don't bother doing readahead.
1371 1379 */
1372 1380 if (addr >= seg->s_base + seg->s_size)
1373 1381 return (-1);
1374 1382
1375 1383 /*
1376 1384 * If we can't allocate a request structure, punt on the readahead.
1377 1385 */
1378 1386 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379 1387 return (-1);
1380 1388
1381 1389 /*
1382 1390 * If a lock operation is pending, don't initiate any new
1383 1391 * readaheads. Otherwise, bump r_count to indicate the new
1384 1392 * asynchronous I/O.
1385 1393 */
1386 1394 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1387 1395 kmem_free(args, sizeof (*args));
1388 1396 return (-1);
1389 1397 }
1390 1398 mutex_enter(&rp->r_statelock);
1391 1399 rp->r_count++;
1392 1400 mutex_exit(&rp->r_statelock);
1393 1401 nfs_rw_exit(&rp->r_lkserlock);
1394 1402
1395 1403 args->a_next = NULL;
1396 1404 #ifdef DEBUG
1397 1405 args->a_queuer = curthread;
1398 1406 #endif
1399 1407 VN_HOLD(vp);
1400 1408 args->a_vp = vp;
1401 1409 ASSERT(cr != NULL);
1402 1410 crhold(cr);
1403 1411 args->a_cred = cr;
1404 1412 args->a_io = NFS_READ_AHEAD;
1405 1413 args->a_nfs_readahead = readahead;
1406 1414 args->a_nfs_blkoff = blkoff;
1407 1415 args->a_nfs_seg = seg;
1408 1416 args->a_nfs_addr = addr;
1409 1417
1410 1418 mutex_enter(&mi->mi_async_lock);
1411 1419
1412 1420 /*
1413 1421 * If asyncio has been disabled, don't bother readahead.
1414 1422 */
1415 1423 if (mi->mi_max_threads == 0) {
1416 1424 mutex_exit(&mi->mi_async_lock);
1417 1425 goto noasync;
1418 1426 }
1419 1427
1420 1428 /*
1421 1429 * Link request structure into the async list and
1422 1430 * wakeup async thread to do the i/o.
1423 1431 */
1424 1432 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1425 1433 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1426 1434 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1427 1435 } else {
1428 1436 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1429 1437 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1430 1438 }
1431 1439
1432 1440 if (mi->mi_io_kstats) {
1433 1441 mutex_enter(&mi->mi_lock);
1434 1442 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1435 1443 mutex_exit(&mi->mi_lock);
1436 1444 }
1437 1445
1438 1446 mi->mi_async_req_count++;
1439 1447 ASSERT(mi->mi_async_req_count != 0);
1440 1448 cv_signal(&mi->mi_async_reqs_cv);
1441 1449 mutex_exit(&mi->mi_async_lock);
1442 1450 return (0);
1443 1451
1444 1452 noasync:
1445 1453 mutex_enter(&rp->r_statelock);
1446 1454 rp->r_count--;
(78 lines elided)
1447 1455 cv_broadcast(&rp->r_cv);
1448 1456 mutex_exit(&rp->r_statelock);
1449 1457 VN_RELE(vp);
1450 1458 crfree(cr);
1451 1459 kmem_free(args, sizeof (*args));
1452 1460 return (-1);
1453 1461 }
1454 1462
1455 1463 int
1456 1464 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1457 - int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1458 - u_offset_t, size_t, int, cred_t *))
1465 + int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *, u_offset_t,
1466 + size_t, int, cred_t *))
1459 1467 {
1460 1468 rnode_t *rp;
1461 1469 mntinfo_t *mi;
1462 1470 struct nfs_async_reqs *args;
1463 1471
1464 1472 ASSERT(flags & B_ASYNC);
1465 1473 ASSERT(vp->v_vfsp != NULL);
1466 1474
1467 1475 rp = VTOR(vp);
1468 1476 ASSERT(rp->r_count > 0);
1469 1477
1470 1478 mi = VTOMI(vp);
1471 1479
1472 1480 /*
1473 1481 * If we can't allocate a request structure, do the putpage
1474 1482 * operation synchronously in this thread's context.
1475 1483 */
1476 1484 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1477 1485 goto noasync;
1478 1486
1479 1487 args->a_next = NULL;
1480 1488 #ifdef DEBUG
1481 1489 args->a_queuer = curthread;
1482 1490 #endif
1483 1491 VN_HOLD(vp);
1484 1492 args->a_vp = vp;
1485 1493 ASSERT(cr != NULL);
1486 1494 crhold(cr);
1487 1495 args->a_cred = cr;
1488 1496 args->a_io = NFS_PUTAPAGE;
1489 1497 args->a_nfs_putapage = putapage;
1490 1498 args->a_nfs_pp = pp;
1491 1499 args->a_nfs_off = off;
1492 1500 args->a_nfs_len = (uint_t)len;
1493 1501 args->a_nfs_flags = flags;
1494 1502
1495 1503 mutex_enter(&mi->mi_async_lock);
1496 1504
1497 1505 /*
1498 1506 * If asyncio has been disabled, then make a synchronous request.
1499 1507 * This check is done a second time in case async io was disabled
1500 1508 * while this thread was blocked waiting for memory pressure to
1501 1509 * reduce or for the queue to drain.
1502 1510 */
1503 1511 if (mi->mi_max_threads == 0) {
1504 1512 mutex_exit(&mi->mi_async_lock);
1505 1513 goto noasync;
1506 1514 }
1507 1515
1508 1516 /*
1509 1517 * Link request structure into the async list and
1510 1518 * wakeup async thread to do the i/o.
1511 1519 */
1512 1520 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1513 1521 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1514 1522 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1515 1523 } else {
1516 1524 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1517 1525 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1518 1526 }
1519 1527
1520 1528 mutex_enter(&rp->r_statelock);
1521 1529 rp->r_count++;
1522 1530 rp->r_awcount++;
1523 1531 mutex_exit(&rp->r_statelock);
1524 1532
1525 1533 if (mi->mi_io_kstats) {
1526 1534 mutex_enter(&mi->mi_lock);
1527 1535 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1528 1536 mutex_exit(&mi->mi_lock);
1529 1537 }
1530 1538
1531 1539 mi->mi_async_req_count++;
1532 1540 ASSERT(mi->mi_async_req_count != 0);
1533 1541 cv_signal(&mi->mi_async_reqs_cv);
1534 1542 mutex_exit(&mi->mi_async_lock);
1535 1543 return (0);
1536 1544
1537 1545 noasync:
1538 1546 if (args != NULL) {
1539 1547 VN_RELE(vp);
1540 1548 crfree(cr);
1541 1549 kmem_free(args, sizeof (*args));
1542 1550 }
1543 1551
1544 1552 if (curproc == proc_pageout || curproc == proc_fsflush) {
1545 1553 /*
1546 1554 * If we get here in the context of the pageout/fsflush,
1547 1555 * we refuse to do a sync write, because this may hang
1548 1556 * pageout (and the machine). In this case, we just
1549 1557 * re-mark the page as dirty and punt on the page.
1550 1558 *
1551 1559 * Make sure B_FORCE isn't set. We can re-mark the
1552 1560 * pages as dirty and unlock the pages in one swoop by
1553 1561 * passing in B_ERROR to pvn_write_done(). However,
1554 1562 * we should make sure B_FORCE isn't set - we don't
1555 1563 * want the page tossed before it gets written out.
1556 1564 */
1557 1565 if (flags & B_FORCE)
1558 1566 flags &= ~(B_INVAL | B_FORCE);
1559 1567 pvn_write_done(pp, flags | B_ERROR);
1560 1568 return (0);
1561 1569 }
1562 1570 if (nfs_zone() != mi->mi_zone) {
1563 1571 /*
1564 1572 * So this was a cross-zone sync putpage. We pass in B_ERROR
1565 1573 * to pvn_write_done() to re-mark the pages as dirty and unlock
1566 1574 * them.
1567 1575 *
1568 1576 * We don't want to clear B_FORCE here as the caller presumably
(100 lines elided)
1569 1577 * knows what they're doing if they set it.
1570 1578 */
1571 1579 pvn_write_done(pp, flags | B_ERROR);
1572 1580 return (EPERM);
1573 1581 }
1574 1582 return ((*putapage)(vp, pp, off, len, flags, cr));
1575 1583 }
1576 1584
1577 1585 int
1578 1586 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1579 - int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1580 - size_t, int, cred_t *))
1587 + int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1588 + size_t, int, cred_t *))
1581 1589 {
1582 1590 rnode_t *rp;
1583 1591 mntinfo_t *mi;
1584 1592 struct nfs_async_reqs *args;
1585 1593
1586 1594 ASSERT(flags & B_ASYNC);
1587 1595 ASSERT(vp->v_vfsp != NULL);
1588 1596
1589 1597 rp = VTOR(vp);
1590 1598 ASSERT(rp->r_count > 0);
1591 1599
1592 1600 mi = VTOMI(vp);
1593 1601
1594 1602 /*
1595 1603 * If we can't allocate a request structure, do the pageio
1596 1604 * request synchronously in this thread's context.
1597 1605 */
1598 1606 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1599 1607 goto noasync;
1600 1608
1601 1609 args->a_next = NULL;
1602 1610 #ifdef DEBUG
1603 1611 args->a_queuer = curthread;
1604 1612 #endif
1605 1613 VN_HOLD(vp);
1606 1614 args->a_vp = vp;
1607 1615 ASSERT(cr != NULL);
1608 1616 crhold(cr);
1609 1617 args->a_cred = cr;
1610 1618 args->a_io = NFS_PAGEIO;
1611 1619 args->a_nfs_pageio = pageio;
1612 1620 args->a_nfs_pp = pp;
1613 1621 args->a_nfs_off = io_off;
1614 1622 args->a_nfs_len = (uint_t)io_len;
1615 1623 args->a_nfs_flags = flags;
1616 1624
1617 1625 mutex_enter(&mi->mi_async_lock);
1618 1626
1619 1627 /*
1620 1628 * If asyncio has been disabled, then make a synchronous request.
1621 1629 * This check is done a second time in case async io was disabled
1622 1630 * while this thread was blocked waiting for memory pressure to
1623 1631 * reduce or for the queue to drain.
1624 1632 */
1625 1633 if (mi->mi_max_threads == 0) {
1626 1634 mutex_exit(&mi->mi_async_lock);
1627 1635 goto noasync;
1628 1636 }
1629 1637
1630 1638 /*
1631 1639 * Link request structure into the async list and
1632 1640 * wakeup async thread to do the i/o.
1633 1641 */
1634 1642 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1635 1643 mi->mi_async_reqs[NFS_PAGEIO] = args;
1636 1644 mi->mi_async_tail[NFS_PAGEIO] = args;
1637 1645 } else {
1638 1646 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1639 1647 mi->mi_async_tail[NFS_PAGEIO] = args;
1640 1648 }
1641 1649
1642 1650 mutex_enter(&rp->r_statelock);
1643 1651 rp->r_count++;
1644 1652 rp->r_awcount++;
1645 1653 mutex_exit(&rp->r_statelock);
1646 1654
1647 1655 if (mi->mi_io_kstats) {
1648 1656 mutex_enter(&mi->mi_lock);
1649 1657 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1650 1658 mutex_exit(&mi->mi_lock);
1651 1659 }
1652 1660
1653 1661 mi->mi_async_req_count++;
1654 1662 ASSERT(mi->mi_async_req_count != 0);
1655 1663 cv_signal(&mi->mi_async_reqs_cv);
1656 1664 mutex_exit(&mi->mi_async_lock);
1657 1665 return (0);
1658 1666
1659 1667 noasync:
1660 1668 if (args != NULL) {
1661 1669 VN_RELE(vp);
1662 1670 crfree(cr);
1663 1671 kmem_free(args, sizeof (*args));
1664 1672 }
1665 1673
1666 1674 /*
1667 1675 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1668 1676 * the page list), for writes we do it synchronously, except for
1669 1677 * proc_pageout/proc_fsflush as described below.
1670 1678 */
1671 1679 if (flags & B_READ) {
1672 1680 pvn_read_done(pp, flags | B_ERROR);
1673 1681 return (0);
1674 1682 }
1675 1683
1676 1684 if (curproc == proc_pageout || curproc == proc_fsflush) {
1677 1685 /*
1678 1686 * If we get here in the context of the pageout/fsflush,
1679 1687 * we refuse to do a sync write, because this may hang
1680 1688 * pageout/fsflush (and the machine). In this case, we just
1681 1689 * re-mark the page as dirty and punt on the page.
1682 1690 *
1683 1691 * Make sure B_FORCE isn't set. We can re-mark the
1684 1692 * pages as dirty and unlock the pages in one swoop by
1685 1693 * passing in B_ERROR to pvn_write_done(). However,
1686 1694 * we should make sure B_FORCE isn't set - we don't
1687 1695 * want the page tossed before it gets written out.
1688 1696 */
1689 1697 if (flags & B_FORCE)
1690 1698 flags &= ~(B_INVAL | B_FORCE);
1691 1699 pvn_write_done(pp, flags | B_ERROR);
1692 1700 return (0);
1693 1701 }
1694 1702
1695 1703 if (nfs_zone() != mi->mi_zone) {
1696 1704 /*
1697 1705 * So this was a cross-zone sync pageio. We pass in B_ERROR
1698 1706 * to pvn_write_done() to re-mark the pages as dirty and unlock
1699 1707 * them.
1700 1708 *
1701 1709 * We don't want to clear B_FORCE here as the caller presumably
1702 1710 * knows what they're doing if they set it.
1703 1711 */
1704 1712 pvn_write_done(pp, flags | B_ERROR);
1705 1713 return (EPERM);
1706 1714 }
1707 1715 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1708 1716 }
1709 1717
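
The enqueue path in nfs_async_pageio() above, and in nfs_async_readdir() and nfs_async_commit() below, appends each request to a per-type singly linked queue that tracks both a head and a tail pointer, so the append is O(1) regardless of queue depth. A minimal userland sketch of that tail-insert idiom follows; the struct and function names are invented for illustration and are not the kernel's mntinfo_t fields.

    #include <stdio.h>
    #include <stdlib.h>

    struct req {                    /* stands in for struct nfs_async_reqs */
        int         id;
        struct req  *next;
    };

    struct queue {                  /* stands in for the head/tail pair */
        struct req  *head;
        struct req  *tail;
    };

    /* Append a request in O(1), mirroring the head/tail update above. */
    static void
    enqueue(struct queue *q, struct req *r)
    {
        r->next = NULL;
        if (q->head == NULL) {
            q->head = r;
            q->tail = r;
        } else {
            q->tail->next = r;
            q->tail = r;
        }
    }

    int
    main(void)
    {
        struct queue q = { NULL, NULL };
        struct req *r;

        for (int i = 0; i < 3; i++) {
            r = malloc(sizeof (*r));
            if (r == NULL)
                return (1);
            r->id = i;
            enqueue(&q, r);
        }
        while ((r = q.head) != NULL) {
            q.head = r->next;
            printf("req %d\n", r->id);
            free(r);
        }
        return (0);
    }
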
1710 1718 void
1711 1719 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1712 - int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1720 + int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1713 1721 {
1714 1722 rnode_t *rp;
1715 1723 mntinfo_t *mi;
1716 1724 struct nfs_async_reqs *args;
1717 1725
1718 1726 rp = VTOR(vp);
1719 1727 ASSERT(rp->r_freef == NULL);
1720 1728
1721 1729 mi = VTOMI(vp);
1722 1730
1723 1731 /*
1724 1732 * If we can't allocate a request structure, do the readdir
1725 1733 * operation synchronously in this thread's context.
1726 1734 */
1727 1735 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1728 1736 goto noasync;
1729 1737
1730 1738 args->a_next = NULL;
1731 1739 #ifdef DEBUG
1732 1740 args->a_queuer = curthread;
1733 1741 #endif
1734 1742 VN_HOLD(vp);
1735 1743 args->a_vp = vp;
1736 1744 ASSERT(cr != NULL);
1737 1745 crhold(cr);
1738 1746 args->a_cred = cr;
1739 1747 args->a_io = NFS_READDIR;
1740 1748 args->a_nfs_readdir = readdir;
1741 1749 args->a_nfs_rdc = rdc;
1742 1750
1743 1751 mutex_enter(&mi->mi_async_lock);
1744 1752
1745 1753 /*
1746 1754 * If asyncio has been disabled, then make a synchronous request.
1747 1755 */
1748 1756 if (mi->mi_max_threads == 0) {
1749 1757 mutex_exit(&mi->mi_async_lock);
1750 1758 goto noasync;
1751 1759 }
1752 1760
1753 1761 /*
1754 1762 * Link request structure into the async list and
1755 1763 * wakeup async thread to do the i/o.
1756 1764 */
1757 1765 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1758 1766 mi->mi_async_reqs[NFS_READDIR] = args;
1759 1767 mi->mi_async_tail[NFS_READDIR] = args;
1760 1768 } else {
1761 1769 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1762 1770 mi->mi_async_tail[NFS_READDIR] = args;
1763 1771 }
1764 1772
1765 1773 mutex_enter(&rp->r_statelock);
1766 1774 rp->r_count++;
1767 1775 mutex_exit(&rp->r_statelock);
1768 1776
1769 1777 if (mi->mi_io_kstats) {
1770 1778 mutex_enter(&mi->mi_lock);
1771 1779 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1772 1780 mutex_exit(&mi->mi_lock);
1773 1781 }
1774 1782
1775 1783 mi->mi_async_req_count++;
1776 1784 ASSERT(mi->mi_async_req_count != 0);
1777 1785 cv_signal(&mi->mi_async_reqs_cv);
1778 1786 mutex_exit(&mi->mi_async_lock);
1779 1787 return;
1780 1788
1781 1789 noasync:
1782 1790 if (args != NULL) {
1783 1791 VN_RELE(vp);
1784 1792 crfree(cr);
1785 1793 kmem_free(args, sizeof (*args));
1786 1794 }
1787 1795
1788 1796 rdc->entries = NULL;
1789 1797 mutex_enter(&rp->r_statelock);
1790 1798 ASSERT(rdc->flags & RDDIR);
1791 1799 rdc->flags &= ~RDDIR;
1792 1800 rdc->flags |= RDDIRREQ;
1793 1801 /*
1794 1802 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1795 1803 * is set, wakeup the thread sleeping in cv_wait_sig().
1796 1804 * The woken up thread will reset the flag to RDDIR and will
1797 1805 * continue with the readdir operation.
1798 1806 */
1799 1807 if (rdc->flags & RDDIRWAIT) {
1800 1808 rdc->flags &= ~RDDIRWAIT;
1801 1809 cv_broadcast(&rdc->cv);
1802 1810 }
1803 1811 mutex_exit(&rp->r_statelock);
1804 1812 rddir_cache_rele(rdc);
1805 1813 }
1806 1814
1807 1815 void
1808 1816 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1809 - cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1810 - cred_t *))
1817 + cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3, cred_t *))
1811 1818 {
1812 1819 rnode_t *rp;
1813 1820 mntinfo_t *mi;
1814 1821 struct nfs_async_reqs *args;
1815 1822 page_t *pp;
1816 1823
1817 1824 rp = VTOR(vp);
1818 1825 mi = VTOMI(vp);
1819 1826
1820 1827 /*
1821 1828 * If we can't allocate a request structure, do the commit
1822 1829 * operation synchronously in this thread's context.
1823 1830 */
1824 1831 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1825 1832 goto noasync;
1826 1833
1827 1834 args->a_next = NULL;
1828 1835 #ifdef DEBUG
1829 1836 args->a_queuer = curthread;
1830 1837 #endif
1831 1838 VN_HOLD(vp);
1832 1839 args->a_vp = vp;
1833 1840 ASSERT(cr != NULL);
1834 1841 crhold(cr);
1835 1842 args->a_cred = cr;
1836 1843 args->a_io = NFS_COMMIT;
1837 1844 args->a_nfs_commit = commit;
1838 1845 args->a_nfs_plist = plist;
1839 1846 args->a_nfs_offset = offset;
1840 1847 args->a_nfs_count = count;
1841 1848
1842 1849 mutex_enter(&mi->mi_async_lock);
1843 1850
1844 1851 /*
1845 1852 * If asyncio has been disabled, then make a synchronous request.
1846 1853 * This check is done a second time in case async io was disabled
1847 1854 * while this thread was blocked waiting for memory pressure to
1848 1855 * reduce or for the queue to drain.
1849 1856 */
1850 1857 if (mi->mi_max_threads == 0) {
1851 1858 mutex_exit(&mi->mi_async_lock);
1852 1859 goto noasync;
1853 1860 }
1854 1861
1855 1862 /*
1856 1863 * Link request structure into the async list and
1857 1864 * wakeup async thread to do the i/o.
1858 1865 */
1859 1866 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1860 1867 mi->mi_async_reqs[NFS_COMMIT] = args;
1861 1868 mi->mi_async_tail[NFS_COMMIT] = args;
1862 1869 } else {
1863 1870 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1864 1871 mi->mi_async_tail[NFS_COMMIT] = args;
1865 1872 }
1866 1873
1867 1874 mutex_enter(&rp->r_statelock);
1868 1875 rp->r_count++;
1869 1876 mutex_exit(&rp->r_statelock);
1870 1877
1871 1878 if (mi->mi_io_kstats) {
1872 1879 mutex_enter(&mi->mi_lock);
1873 1880 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1874 1881 mutex_exit(&mi->mi_lock);
1875 1882 }
1876 1883
1877 1884 mi->mi_async_req_count++;
1878 1885 ASSERT(mi->mi_async_req_count != 0);
1879 1886 cv_signal(&mi->mi_async_reqs_cv);
1880 1887 mutex_exit(&mi->mi_async_lock);
1881 1888 return;
1882 1889
1883 1890 noasync:
1884 1891 if (args != NULL) {
1885 1892 VN_RELE(vp);
1886 1893 crfree(cr);
1887 1894 kmem_free(args, sizeof (*args));
1888 1895 }
1889 1896
1890 1897 if (curproc == proc_pageout || curproc == proc_fsflush ||
1891 1898 nfs_zone() != mi->mi_zone) {
1892 1899 while (plist != NULL) {
1893 1900 pp = plist;
1894 1901 page_sub(&plist, pp);
1895 1902 pp->p_fsdata = C_COMMIT;
1896 1903 page_unlock(pp);
1897 1904 }
1898 1905 return;
1899 1906 }
1900 1907 (*commit)(vp, plist, offset, count, cr);
1901 1908 }
1902 1909
1903 1910 void
1904 1911 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1905 1912 void (*inactive)(vnode_t *, cred_t *, caller_context_t *))
1906 1913 {
1907 1914 mntinfo_t *mi;
1908 1915 struct nfs_async_reqs *args;
1909 1916
1910 1917 mi = VTOMI(vp);
1911 1918
1912 1919 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1913 1920 args->a_next = NULL;
1914 1921 #ifdef DEBUG
1915 1922 args->a_queuer = curthread;
1916 1923 #endif
1917 1924 args->a_vp = vp;
1918 1925 ASSERT(cr != NULL);
1919 1926 crhold(cr);
1920 1927 args->a_cred = cr;
1921 1928 args->a_io = NFS_INACTIVE;
1922 1929 args->a_nfs_inactive = inactive;
1923 1930
1924 1931 /*
1925 1932 * Note that we don't check mi->mi_max_threads here, since we
1926 1933 * *need* to get rid of this vnode regardless of whether someone
1927 1934 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1928 1935 *
1929 1936 * The manager thread knows about this and is willing to create
1930 1937 * at least one thread to accommodate us.
1931 1938 */
1932 1939 mutex_enter(&mi->mi_async_lock);
1933 1940 if (mi->mi_manager_thread == NULL) {
1934 1941 rnode_t *rp = VTOR(vp);
1935 1942
1936 1943 mutex_exit(&mi->mi_async_lock);
1937 1944 crfree(cr); /* drop our reference */
1938 1945 kmem_free(args, sizeof (*args));
1939 1946 /*
1940 1947 * We can't do an over-the-wire call since we're in the wrong
1941 1948 * zone, so we need to clean up state as best we can and then
1942 1949 * throw away the vnode.
1943 1950 */
1944 1951 mutex_enter(&rp->r_statelock);
1945 1952 if (rp->r_unldvp != NULL) {
1946 1953 vnode_t *unldvp;
1947 1954 char *unlname;
1948 1955 cred_t *unlcred;
1949 1956
1950 1957 unldvp = rp->r_unldvp;
1951 1958 rp->r_unldvp = NULL;
1952 1959 unlname = rp->r_unlname;
1953 1960 rp->r_unlname = NULL;
1954 1961 unlcred = rp->r_unlcred;
1955 1962 rp->r_unlcred = NULL;
1956 1963 mutex_exit(&rp->r_statelock);
1957 1964
1958 1965 VN_RELE(unldvp);
1959 1966 kmem_free(unlname, MAXNAMELEN);
1960 1967 crfree(unlcred);
1961 1968 } else {
1962 1969 mutex_exit(&rp->r_statelock);
1963 1970 }
1964 1971 /*
1965 1972 * No need to explicitly throw away any cached pages. The
1966 1973 * eventual rinactive() will attempt a synchronous
1967 1974 * VOP_PUTPAGE() which will immediately fail since the request
1968 1975 * is coming from the wrong zone, and then will proceed to call
1969 1976 * nfs_invalidate_pages() which will clean things up for us.
1970 1977 */
1971 1978 rp_addfree(VTOR(vp), cr);
1972 1979 return;
1973 1980 }
1974 1981
1975 1982 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1976 1983 mi->mi_async_reqs[NFS_INACTIVE] = args;
1977 1984 } else {
1978 1985 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1979 1986 }
1980 1987 mi->mi_async_tail[NFS_INACTIVE] = args;
1981 1988 /*
1982 1989 * Don't increment r_count, since we're trying to get rid of the vnode.
1983 1990 */
1984 1991
1985 1992 mi->mi_async_req_count++;
1986 1993 ASSERT(mi->mi_async_req_count != 0);
1987 1994 cv_signal(&mi->mi_async_reqs_cv);
1988 1995 mutex_exit(&mi->mi_async_lock);
1989 1996 }
1990 1997
1991 1998 static void
1992 1999 nfs_async_start(struct vfs *vfsp)
1993 2000 {
1994 2001 nfs_async_common_start(vfsp, NFS_ASYNC_QUEUE);
1995 2002 }
1996 2003
1997 2004 static void
1998 2005 nfs_async_pgops_start(struct vfs *vfsp)
1999 2006 {
2000 2007 nfs_async_common_start(vfsp, NFS_ASYNC_PGOPS_QUEUE);
2001 2008 }
2002 2009
2003 2010 /*
2004 2011 * The async queues for each mounted file system are arranged as a
2005 2012 * set of queues, one for each async i/o type. Requests are taken
2006 2013 * from the queues in a round-robin fashion. A number of consecutive
2007 2014 * requests are taken from each queue before moving on to the next
2008 2015 * queue. This functionality may allow the NFS Version 2 server to do
2009 2016 * write clustering, even if the client is mixing writes and reads
2010 2017 * because it will take multiple write requests from the queue
2011 2018 * before processing any of the other async i/o types.
2012 2019 *
2013 2020 * XXX The nfs_async_common_start thread is unsafe in the light of the present
2014 2021 * model defined by cpr to suspend the system. Specifically over the
2015 2022 * wire calls are cpr-unsafe. The thread should be reevaluated in
2016 2023 * case of future updates to the cpr model.
2017 2024 */
2018 2025 static void
2019 2026 nfs_async_common_start(struct vfs *vfsp, int async_queue)
2020 2027 {
2021 2028 struct nfs_async_reqs *args;
2022 2029 mntinfo_t *mi = VFTOMI(vfsp);
2023 2030 clock_t time_left = 1;
2024 2031 callb_cpr_t cprinfo;
2025 2032 int i;
2026 2033 int async_types;
2027 2034 kcondvar_t *async_work_cv;
2028 2035
2029 2036 if (async_queue == NFS_ASYNC_QUEUE) {
2030 2037 async_types = NFS_ASYNC_TYPES;
2031 2038 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_QUEUE];
2032 2039 } else {
2033 2040 async_types = NFS_ASYNC_PGOPS_TYPES;
2034 2041 async_work_cv = &mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE];
2035 2042 }
2036 2043
2037 2044 /*
2038 2045 * Dynamic initialization of nfs_async_timeout to allow nfs to be
2039 2046 * built in an implementation independent manner.
2040 2047 */
2041 2048 if (nfs_async_timeout == -1)
2042 2049 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
2043 2050
2044 2051 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
2045 2052
2046 2053 mutex_enter(&mi->mi_async_lock);
2047 2054 for (;;) {
2048 2055 /*
2049 2056 * Find the next queue containing an entry. We start
2050 2057 * at the current queue pointer and then round robin
2051 2058 * through all of them until we either find a non-empty
2052 2059 * queue or have looked through all of them.
2053 2060 */
2054 2061 for (i = 0; i < async_types; i++) {
2055 2062 args = *mi->mi_async_curr[async_queue];
2056 2063 if (args != NULL)
2057 2064 break;
2058 2065 mi->mi_async_curr[async_queue]++;
2059 2066 if (mi->mi_async_curr[async_queue] ==
2060 2067 &mi->mi_async_reqs[async_types]) {
2061 2068 mi->mi_async_curr[async_queue] =
2062 2069 &mi->mi_async_reqs[0];
2063 2070 }
2064 2071 }
2065 2072 /*
2066 2073 * If we didn't find an entry, then block until woken up
2067 2074 * again and then look through the queues again.
2068 2075 */
2069 2076 if (args == NULL) {
2070 2077 /*
2071 2078 * Exiting is considered to be safe for CPR as well
2072 2079 */
2073 2080 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2074 2081
2075 2082 /*
2076 2083 * Wakeup thread waiting to unmount the file
2077 2084 * system only if all async threads are inactive.
2078 2085 *
2079 2086 * If we've timed out and there's nothing to do,
2080 2087 * then get rid of this thread.
2081 2088 */
2082 2089 if (mi->mi_max_threads == 0 || time_left <= 0) {
2083 2090 --mi->mi_threads[async_queue];
2084 2091
2085 2092 if (mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
2086 2093 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0)
2087 2094 cv_signal(&mi->mi_async_cv);
2088 2095 CALLB_CPR_EXIT(&cprinfo);
2089 2096 VFS_RELE(vfsp); /* release thread's hold */
2090 2097 zthread_exit();
2091 2098 /* NOTREACHED */
2092 2099 }
2093 2100 time_left = cv_reltimedwait(async_work_cv,
2094 2101 &mi->mi_async_lock, nfs_async_timeout,
2095 2102 TR_CLOCK_TICK);
2096 2103
2097 2104 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
2098 2105
2099 2106 continue;
2100 2107 }
2101 2108 time_left = 1;
2102 2109
2103 2110 /*
2104 2111 * Remove the request from the async queue and then
2105 2112 * update the current async request queue pointer. If
2106 2113 * the current queue is empty or we have removed enough
2107 2114 * consecutive entries from it, then reset the counter
2108 2115 * for this queue and then move the current pointer to
2109 2116 * the next queue.
2110 2117 */
2111 2118 *mi->mi_async_curr[async_queue] = args->a_next;
2112 2119 if (*mi->mi_async_curr[async_queue] == NULL ||
2113 2120 --mi->mi_async_clusters[args->a_io] == 0) {
2114 2121 mi->mi_async_clusters[args->a_io] =
2115 2122 mi->mi_async_init_clusters;
2116 2123 mi->mi_async_curr[async_queue]++;
2117 2124 if (mi->mi_async_curr[async_queue] ==
2118 2125 &mi->mi_async_reqs[async_types]) {
2119 2126 mi->mi_async_curr[async_queue] =
2120 2127 &mi->mi_async_reqs[0];
2121 2128 }
2122 2129 }
2123 2130
2124 2131 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2125 2132 mutex_enter(&mi->mi_lock);
2126 2133 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2127 2134 mutex_exit(&mi->mi_lock);
2128 2135 }
2129 2136
2130 2137 mutex_exit(&mi->mi_async_lock);
2131 2138
2132 2139 /*
2133 2140 * Obtain arguments from the async request structure.
2134 2141 */
2135 2142 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2136 2143 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2137 2144 args->a_nfs_addr, args->a_nfs_seg,
2138 2145 args->a_cred);
2139 2146 } else if (args->a_io == NFS_PUTAPAGE) {
2140 2147 (void) (*args->a_nfs_putapage)(args->a_vp,
2141 2148 args->a_nfs_pp, args->a_nfs_off,
2142 2149 args->a_nfs_len, args->a_nfs_flags,
2143 2150 args->a_cred);
2144 2151 } else if (args->a_io == NFS_PAGEIO) {
2145 2152 (void) (*args->a_nfs_pageio)(args->a_vp,
2146 2153 args->a_nfs_pp, args->a_nfs_off,
2147 2154 args->a_nfs_len, args->a_nfs_flags,
2148 2155 args->a_cred);
2149 2156 } else if (args->a_io == NFS_READDIR) {
2150 2157 (void) ((*args->a_nfs_readdir)(args->a_vp,
2151 2158 args->a_nfs_rdc, args->a_cred));
2152 2159 } else if (args->a_io == NFS_COMMIT) {
2153 2160 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2154 2161 args->a_nfs_offset, args->a_nfs_count,
2155 2162 args->a_cred);
2156 2163 } else if (args->a_io == NFS_INACTIVE) {
2157 2164 (*args->a_nfs_inactive)(args->a_vp, args->a_cred, NULL);
2158 2165 }
2159 2166
2160 2167 /*
2161 2168 * Now, release the vnode and free the credentials
2162 2169 * structure.
2163 2170 */
2164 2171 free_async_args(args);
2165 2172 /*
2166 2173 * Reacquire the mutex because it will be needed above.
2167 2174 */
2168 2175 mutex_enter(&mi->mi_async_lock);
2169 2176 }
2170 2177 }
2171 2178
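
The comment above nfs_async_common_start() describes the round-robin service order; at its core is a cursor over an array of per-type queues that takes up to mi_async_init_clusters consecutive requests from one queue before moving on, wrapping when it runs off the end. A stand-alone sketch of just that cursor arithmetic, with plain counters standing in for the real request lists (all names here are invented for illustration):

    #include <stdio.h>

    #define	NTYPES	4

    static int depth[NTYPES]    = { 2, 0, 3, 1 };	/* queued requests per type */
    static int clusters[NTYPES] = { 2, 2, 2, 2 };	/* consecutive takes allowed */
    static const int init_clusters = 2;

    static int
    pending(void)
    {
        int i, n = 0;

        for (i = 0; i < NTYPES; i++)
            n += depth[i];
        return (n);
    }

    int
    main(void)
    {
        int curr = 0;

        while (pending() > 0) {
            int i, type = -1;

            /* Find the next non-empty queue, starting at the cursor. */
            for (i = 0; i < NTYPES; i++) {
                if (depth[curr] > 0) {
                    type = curr;
                    break;
                }
                curr = (curr + 1) % NTYPES;
            }
            if (type == -1)
                break;

            depth[type]--;			/* "service" one request */
            printf("service type %d\n", type);

            /* Move on once the queue drains or the cluster is used up. */
            if (depth[type] == 0 || --clusters[type] == 0) {
                clusters[type] = init_clusters;
                curr = (curr + 1) % NTYPES;
            }
        }
        return (0);
    }
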
2172 2179 void
2173 2180 nfs_async_stop(struct vfs *vfsp)
2174 2181 {
2175 2182 mntinfo_t *mi = VFTOMI(vfsp);
2176 2183
2177 2184 /*
2178 2185 * Wait for all outstanding async operations to complete and for the
2179 2186 * worker threads to exit.
2180 2187 */
2181 2188 mutex_enter(&mi->mi_async_lock);
2182 2189 mi->mi_max_threads = 0;
2183 2190 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2184 2191 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2185 2192 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0)
2186 2193 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2187 2194 mutex_exit(&mi->mi_async_lock);
2188 2195 }
2189 2196
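
nfs_async_stop() disables new work by zeroing mi_max_threads, wakes every worker, and then sleeps on mi_async_cv until both per-queue thread counts drop to zero (the last exiting worker signals it). The same "wake everyone, then wait for the count to drain" shape, reduced to a runnable pthreads sketch with invented names:

    #include <pthread.h>
    #include <stdio.h>

    #define	NWORKERS	3

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t work_cv = PTHREAD_COND_INITIALIZER;	/* workers wait */
    static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;	/* stopper waits */
    static int max_threads = NWORKERS;
    static int nthreads = NWORKERS;

    static void *
    worker(void *arg)
    {
        (void) pthread_mutex_lock(&lock);
        while (max_threads != 0)		/* no real work in this sketch */
            (void) pthread_cond_wait(&work_cv, &lock);
        if (--nthreads == 0)			/* last one out signals the stopper */
            (void) pthread_cond_signal(&done_cv);
        (void) pthread_mutex_unlock(&lock);
        return (arg);
    }

    static void
    async_stop(void)
    {
        (void) pthread_mutex_lock(&lock);
        max_threads = 0;			/* forbid new work */
        (void) pthread_cond_broadcast(&work_cv);	/* wake every worker */
        while (nthreads != 0)			/* wait for them all to exit */
            (void) pthread_cond_wait(&done_cv, &lock);
        (void) pthread_mutex_unlock(&lock);
    }

    int
    main(void)
    {
        pthread_t tids[NWORKERS];

        for (int i = 0; i < NWORKERS; i++)
            (void) pthread_create(&tids[i], NULL, worker, NULL);
        async_stop();
        for (int i = 0; i < NWORKERS; i++)
            (void) pthread_join(tids[i], NULL);
        printf("all workers stopped\n");
        return (0);
    }
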
2190 2197 /*
2191 2198 * nfs_async_stop_sig:
2192 2199 * Wait for all outstanding putpage operations to complete. If a signal
2193 2200 * is delivered we will abort and return non-zero. If we can put all the
2194 2201 * pages we will return 0. This routine is called from nfs_unmount and
2195 2202 * nfs3_unmount to make these operations interruptible.
2196 2203 */
2197 2204 int
2198 2205 nfs_async_stop_sig(struct vfs *vfsp)
2199 2206 {
2200 2207 mntinfo_t *mi = VFTOMI(vfsp);
2201 2208 ushort_t omax;
2202 2209 int rval;
2203 2210
2204 2211 /*
2205 2212 * Wait for all outstanding async operations to complete and for the
2206 2213 * worker threads to exit.
2207 2214 */
2208 2215 mutex_enter(&mi->mi_async_lock);
2209 2216 omax = mi->mi_max_threads;
2210 2217 mi->mi_max_threads = 0;
2211 2218 /*
2212 2219 * Tell all the worker threads to exit.
2213 2220 */
2214 2221 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2215 2222 while (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2216 2223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0) {
2217 2224 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2218 2225 break;
2219 2226 }
2220 2227 rval = (mi->mi_threads[NFS_ASYNC_QUEUE] != 0 ||
2221 2228 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] != 0); /* Interrupted */
2222 2229 if (rval)
2223 2230 mi->mi_max_threads = omax;
2224 2231 mutex_exit(&mi->mi_async_lock);
2225 2232
2226 2233 return (rval);
2227 2234 }
2228 2235
2229 2236 int
2230 2237 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2231 2238 {
2232 2239 int pagecreate;
2233 2240 int n;
2234 2241 int saved_n;
2235 2242 caddr_t saved_base;
2236 2243 u_offset_t offset;
2237 2244 int error;
2238 2245 int sm_error;
2239 2246 vnode_t *vp = RTOV(rp);
2240 2247
2241 2248 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2242 2249 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2243 2250 if (!vpm_enable) {
2244 2251 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2245 2252 }
2246 2253
2247 2254 /*
2248 2255 * Move bytes in at most PAGESIZE chunks. We must avoid
2249 2256 * spanning pages in uiomove() because page faults may cause
2250 2257 * the cache to be invalidated out from under us. The r_size is not
2251 2258 * updated until after the uiomove. If we push the last page of a
2252 2259 * file before r_size is correct, we will lose the data written past
2253 2260 * the current (and invalid) r_size.
2254 2261 */
2255 2262 do {
2256 2263 offset = uio->uio_loffset;
2257 2264 pagecreate = 0;
2258 2265
2259 2266 /*
2260 2267 * n is the number of bytes required to satisfy the request
2261 2268 * or the number of bytes to fill out the page.
2262 2269 */
2263 2270 n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2264 2271
2265 2272 /*
2266 2273 * Check to see if we can skip reading in the page
2267 2274 * and just allocate the memory. We can do this
2268 2275 * if we are going to rewrite the entire mapping
2269 2276 * or if we are going to write to or beyond the current
2270 2277 * end of file from the beginning of the mapping.
2271 2278 *
2272 2279 * The read of r_size is now protected by r_statelock.
2273 2280 */
2274 2281 mutex_enter(&rp->r_statelock);
2275 2282 /*
2276 2283 * When pgcreated is nonzero the caller has already done
2277 2284 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2278 2285 * segkpm this means we already have at least one page
2279 2286 * created and mapped at base.
2280 2287 */
2281 2288 pagecreate = pgcreated ||
2282 2289 ((offset & PAGEOFFSET) == 0 &&
2283 2290 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2284 2291
2285 2292 mutex_exit(&rp->r_statelock);
2286 2293 if (!vpm_enable && pagecreate) {
2287 2294 /*
2288 2295 * The last argument tells segmap_pagecreate() to
2289 2296 * always lock the page, as opposed to sometimes
2290 2297 * returning with the page locked. This way we avoid a
2291 2298 * fault on the ensuing uiomove(), but also
2292 2299 * more importantly (to fix bug 1094402) we can
2293 2300 * call segmap_fault() to unlock the page in all
2294 2301 * cases. An alternative would be to modify
2295 2302 * segmap_pagecreate() to tell us when it is
2296 2303 * locking a page, but that's a fairly major
2297 2304 * interface change.
2298 2305 */
2299 2306 if (pgcreated == 0)
2300 2307 (void) segmap_pagecreate(segkmap, base,
2301 2308 (uint_t)n, 1);
2302 2309 saved_base = base;
2303 2310 saved_n = n;
2304 2311 }
2305 2312
2306 2313 /*
2307 2314 * The number of bytes of data in the last page cannot
2308 2315 * be accurately determined while the page is being
2309 2316 * uiomove'd to and the size of the file is being updated.
2310 2317 * Thus, inform threads which need to know accurately
2311 2318 * how much data is in the last page of the file. They
2312 2319 * will not do the i/o immediately, but will arrange for
2313 2320 * the i/o to happen later when this modify operation
2314 2321 * will have finished.
2315 2322 */
2316 2323 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2317 2324 mutex_enter(&rp->r_statelock);
2318 2325 rp->r_flags |= RMODINPROGRESS;
2319 2326 rp->r_modaddr = (offset & MAXBMASK);
2320 2327 mutex_exit(&rp->r_statelock);
2321 2328
2322 2329 if (vpm_enable) {
2323 2330 /*
2324 2331 * Copy data. If new pages are created, part of
2325 2332 * the page that is not written will be initialized
2326 2333 * with zeros.
2327 2334 */
2328 2335 error = vpm_data_copy(vp, offset, n, uio,
2329 2336 !pagecreate, NULL, 0, S_WRITE);
2330 2337 } else {
2331 2338 error = uiomove(base, n, UIO_WRITE, uio);
2332 2339 }
2333 2340
2334 2341 /*
2335 2342 * r_size is the maximum number of
2336 2343 * bytes known to be in the file.
2337 2344 * Make sure it is at least as high as the
2338 2345 * first unwritten byte pointed to by uio_loffset.
2339 2346 */
2340 2347 mutex_enter(&rp->r_statelock);
2341 2348 if (rp->r_size < uio->uio_loffset)
2342 2349 rp->r_size = uio->uio_loffset;
2343 2350 rp->r_flags &= ~RMODINPROGRESS;
2344 2351 rp->r_flags |= RDIRTY;
2345 2352 mutex_exit(&rp->r_statelock);
2346 2353
2347 2354 /* n = # of bytes written */
2348 2355 n = (int)(uio->uio_loffset - offset);
2349 2356
2350 2357 if (!vpm_enable) {
2351 2358 base += n;
2352 2359 }
2353 2360 tcount -= n;
2354 2361 /*
2355 2362 * If we created pages w/o initializing them completely,
2356 2363 * we need to zero the part that wasn't set up.
2357 2364 * This happens in most EOF write cases and if
2358 2365 * we had some sort of error during the uiomove.
2359 2366 */
2360 2367 if (!vpm_enable && pagecreate) {
2361 2368 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2362 2369 (void) kzero(base, PAGESIZE - n);
2363 2370
2364 2371 if (pgcreated) {
2365 2372 /*
2366 2373 * Caller is responsible for this page,
2367 2374 * it was not created in this loop.
2368 2375 */
2369 2376 pgcreated = 0;
2370 2377 } else {
2371 2378 /*
2372 2379 * For bug 1094402: segmap_pagecreate locks
2373 2380 * page. Unlock it. This also unlocks the
2374 2381 * pages allocated by page_create_va() in
2375 2382 * segmap_pagecreate().
2376 2383 */
2377 2384 sm_error = segmap_fault(kas.a_hat, segkmap,
2378 2385 saved_base, saved_n,
2379 2386 F_SOFTUNLOCK, S_WRITE);
2380 2387 if (error == 0)
2381 2388 error = sm_error;
2382 2389 }
2383 2390 }
2384 2391 } while (tcount > 0 && error == 0);
2385 2392
2386 2393 return (error);
2387 2394 }
2388 2395
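
The do/while loop in writerp() never lets one uiomove() span a page boundary: each pass moves only as far as the end of the current page, capped by the bytes still outstanding. A tiny stand-alone illustration of that chunk-size arithmetic, with arbitrary example values:

    #include <stdio.h>

    #define	PAGESIZE	4096
    #define	PAGEOFFSET	(PAGESIZE - 1)
    #define	MIN(a, b)	((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        long long offset = 4000;	/* starts 96 bytes before a page boundary */
        int tcount = 10000;		/* bytes the caller wants to move */

        while (tcount > 0) {
            /* Bytes to the end of the current page, capped at what's left. */
            int n = (int)MIN(PAGESIZE - (offset & PAGEOFFSET), tcount);

            printf("copy %4d bytes at offset %lld\n", n, offset);
            offset += n;
            tcount -= n;
        }
        return (0);
    }

With these inputs the chunks come out as 96, 4096, 4096 and 1712 bytes, i.e. the copy aligns itself to page boundaries after the first partial chunk.
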
2389 2396 int
2390 2397 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2391 2398 {
2392 2399 rnode_t *rp;
2393 2400 page_t *pp;
2394 2401 u_offset_t eoff;
2395 2402 u_offset_t io_off;
2396 2403 size_t io_len;
2397 2404 int error;
2398 2405 int rdirty;
2399 2406 int err;
2400 2407
2401 2408 rp = VTOR(vp);
2402 2409 ASSERT(rp->r_count > 0);
2403 2410
2404 2411 if (!vn_has_cached_data(vp))
2405 2412 return (0);
2406 2413
2407 2414 ASSERT(vp->v_type != VCHR);
2408 2415
2409 2416 /*
2410 2417 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2411 2418 * writes. B_FORCE is set to force the VM system to actually
2412 2419 * invalidate the pages, even if the i/o failed. The pages
2413 2420 * need to get invalidated because they can't be written out
2414 2421 * because there isn't any space left on either the server's
2415 2422 * file system or in the user's disk quota. The B_FREE bit
2416 2423 * is cleared to avoid confusion as to whether this is a
2417 2424 * request to place the page on the freelist or to destroy
2418 2425 * it.
2419 2426 */
2420 2427 if ((rp->r_flags & ROUTOFSPACE) ||
2421 2428 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2422 2429 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2423 2430
2424 2431 if (len == 0) {
2425 2432 /*
2426 2433 * If doing a full file synchronous operation, then clear
2427 2434 * the RDIRTY bit. If a page gets dirtied while the flush
2428 2435 * is happening, then RDIRTY will get set again. The
2429 2436 * RDIRTY bit must get cleared before the flush so that
2430 2437 * we don't lose this information.
2431 2438 *
2432 2439 * If there are no full file async write operations
2433 2440 * pending and RDIRTY bit is set, clear it.
2434 2441 */
2435 2442 if (off == (u_offset_t)0 &&
2436 2443 !(flags & B_ASYNC) &&
2437 2444 (rp->r_flags & RDIRTY)) {
2438 2445 mutex_enter(&rp->r_statelock);
2439 2446 rdirty = (rp->r_flags & RDIRTY);
2440 2447 rp->r_flags &= ~RDIRTY;
2441 2448 mutex_exit(&rp->r_statelock);
2442 2449 } else if (flags & B_ASYNC && off == (u_offset_t)0) {
2443 2450 mutex_enter(&rp->r_statelock);
2444 2451 if (rp->r_flags & RDIRTY && rp->r_awcount == 0) {
2445 2452 rdirty = (rp->r_flags & RDIRTY);
2446 2453 rp->r_flags &= ~RDIRTY;
2447 2454 }
2448 2455 mutex_exit(&rp->r_statelock);
2449 2456 } else
2450 2457 rdirty = 0;
2451 2458
2452 2459 /*
2453 2460 * Search the entire vp list for pages >= off, and flush
2454 2461 * the dirty pages.
2455 2462 */
2456 2463 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2457 2464 flags, cr);
2458 2465
2459 2466 /*
2460 2467 * If an error occurred and the file was marked as dirty
2461 2468 * before and we aren't forcibly invalidating pages, then
2462 2469 * reset the RDIRTY flag.
2463 2470 */
2464 2471 if (error && rdirty &&
2465 2472 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2466 2473 mutex_enter(&rp->r_statelock);
2467 2474 rp->r_flags |= RDIRTY;
2468 2475 mutex_exit(&rp->r_statelock);
2469 2476 }
2470 2477 } else {
2471 2478 /*
2472 2479 * Do a range from [off...off + len) looking for pages
2473 2480 * to deal with.
2474 2481 */
2475 2482 error = 0;
2476 2483 #ifdef lint
2477 2484 io_len = 0;
2478 2485 #endif
2479 2486 eoff = off + len;
2480 2487 mutex_enter(&rp->r_statelock);
2481 2488 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2482 2489 io_off += io_len) {
2483 2490 mutex_exit(&rp->r_statelock);
2484 2491 /*
2485 2492 * If we are not invalidating, synchronously
2486 2493 * freeing or writing pages use the routine
2487 2494 * page_lookup_nowait() to prevent reclaiming
2488 2495 * them from the free list.
2489 2496 */
2490 2497 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2491 2498 pp = page_lookup(vp, io_off,
2492 2499 (flags & (B_INVAL | B_FREE)) ?
2493 2500 SE_EXCL : SE_SHARED);
2494 2501 } else {
2495 2502 pp = page_lookup_nowait(vp, io_off,
2496 2503 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2497 2504 }
2498 2505
2499 2506 if (pp == NULL || !pvn_getdirty(pp, flags))
2500 2507 io_len = PAGESIZE;
2501 2508 else {
2502 2509 err = (*rp->r_putapage)(vp, pp, &io_off,
2503 2510 &io_len, flags, cr);
2504 2511 if (!error)
2505 2512 error = err;
2506 2513 /*
2507 2514 * "io_off" and "io_len" are returned as
2508 2515 * the range of pages we actually wrote.
2509 2516 * This allows us to skip ahead more quickly
2510 2517 * since several pages may've been dealt
2511 2518 * with by this iteration of the loop.
2512 2519 */
2513 2520 }
2514 2521 mutex_enter(&rp->r_statelock);
2515 2522 }
2516 2523 mutex_exit(&rp->r_statelock);
2517 2524 }
2518 2525
2519 2526 return (error);
2520 2527 }
2521 2528
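
In the ranged branch of nfs_putpages() the loop steps by whatever extent the putapage callback reports back through io_off/io_len, so one iteration may cover several pages. A trivial model of that skip-ahead loop; the fake callback and its extent sizes are invented purely for illustration:

    #include <stdio.h>

    #define	PAGESIZE	4096ULL

    /*
     * Pretend "putapage": usually reports one page, but every third call
     * pretends it pushed out a larger three-page extent.
     */
    static unsigned long long
    fake_putapage(int call)
    {
        return ((call % 3 == 2) ? 3 * PAGESIZE : PAGESIZE);
    }

    int
    main(void)
    {
        unsigned long long eoff = 16 * PAGESIZE;
        unsigned long long io_off, io_len;
        int call = 0;

        for (io_off = 0; io_off < eoff; io_off += io_len) {
            io_len = fake_putapage(call++);
            printf("flushed [%llu, %llu)\n", io_off, io_off + io_len);
        }
        return (0);
    }
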
2522 2529 void
2523 2530 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2524 2531 {
2525 2532 rnode_t *rp;
2526 2533
2527 2534 rp = VTOR(vp);
2528 2535 mutex_enter(&rp->r_statelock);
2529 2536 while (rp->r_flags & RTRUNCATE)
2530 2537 cv_wait(&rp->r_cv, &rp->r_statelock);
2531 2538 rp->r_flags |= RTRUNCATE;
2532 2539 if (off == (u_offset_t)0) {
2533 2540 rp->r_flags &= ~RDIRTY;
2534 2541 if (!(rp->r_flags & RSTALE))
2535 2542 rp->r_error = 0;
2536 2543 }
2537 2544 rp->r_truncaddr = off;
2538 2545 mutex_exit(&rp->r_statelock);
2539 2546 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2540 2547 B_INVAL | B_TRUNC, cr);
2541 2548 mutex_enter(&rp->r_statelock);
2542 2549 rp->r_flags &= ~RTRUNCATE;
2543 2550 cv_broadcast(&rp->r_cv);
2544 2551 mutex_exit(&rp->r_statelock);
2545 2552 }
2546 2553
2547 -static int nfs_write_error_to_cons_only = 0;
2554 +volatile int nfs_write_error_to_cons_only = 0;
2548 2555 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2549 2556
2550 2557 /*
2551 2558 * Print a file handle
2552 2559 */
2553 2560 void
2554 2561 nfs_printfhandle(nfs_fhandle *fhp)
2555 2562 {
2556 2563 int *ip;
2557 2564 char *buf;
2558 2565 size_t bufsize;
2559 2566 char *cp;
2560 2567
2561 2568 /*
2562 2569 * 13 == "(file handle:"
2563 2570 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2564 2571 * 1 == ' '
2565 2572 * 8 == maximum strlen of "%x"
2566 2573 * 3 == ")\n\0"
2567 2574 */
2568 2575 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2569 2576 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2570 2577 if (buf == NULL)
2571 2578 return;
2572 2579
2573 2580 cp = buf;
2574 2581 (void) strcpy(cp, "(file handle:");
2575 2582 while (*cp != '\0')
2576 2583 cp++;
2577 2584 for (ip = (int *)fhp->fh_buf;
2578 2585 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2579 2586 ip++) {
2580 2587 (void) sprintf(cp, " %x", *ip);
2581 2588 while (*cp != '\0')
2582 2589 cp++;
2583 2590 }
2584 2591 (void) strcpy(cp, ")\n");
2585 2592
2586 2593 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2587 2594
2588 2595 kmem_free(buf, bufsize);
2589 2596 }
2590 2597
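
The sizing comment in nfs_printfhandle() is easiest to see worked through: 13 bytes for the "(file handle:" prefix, at most 1 + 8 bytes for each " %x" word, and 3 for ")\n" plus the terminating NUL. A stand-alone rendition of the same formatting, using a made-up handle (the length and contents are illustrative only):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define	FH_LEN	32	/* bytes in the made-up file handle */

    int
    main(void)
    {
        int fh_words[FH_LEN / sizeof (int)];
        char *buf, *cp;
        size_t bufsize, i;

        for (i = 0; i < FH_LEN / sizeof (int); i++)
            fh_words[i] = (int)(0x01010101u * (unsigned)i);

        /*
         * 13 == strlen("(file handle:"); each 32-bit word needs at most
         * " %x" == 1 + 8 characters; 3 covers the trailing ")\n\0".
         */
        bufsize = 13 + ((FH_LEN / sizeof (int)) * (1 + 8)) + 3;
        buf = malloc(bufsize);
        if (buf == NULL)
            return (1);

        cp = buf;
        cp += sprintf(cp, "(file handle:");
        for (i = 0; i < FH_LEN / sizeof (int); i++)
            cp += sprintf(cp, " %x", fh_words[i]);
        (void) strcpy(cp, ")\n");

        fputs(buf, stdout);
        free(buf);
        return (0);
    }
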
2591 2598 /*
2592 2599 * Notify the system administrator that an NFS write error has
2593 2600 * occurred.
2594 2601 */
2595 2602
2596 2603 /* seconds between ENOSPC/EDQUOT messages */
2597 -clock_t nfs_write_error_interval = 5;
2604 +volatile clock_t nfs_write_error_interval = 5;
2598 2605
2599 2606 void
2600 2607 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2601 2608 {
2602 2609 mntinfo_t *mi;
2603 2610 clock_t now;
2604 2611
2605 2612 mi = VTOMI(vp);
2606 2613 /*
2607 2614 * In case of forced unmount or zone shutdown, do not print any
2608 2615 * messages since it can flood the console with error messages.
2609 2616 */
2610 2617 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2611 2618 return;
2612 2619
2613 2620 /*
2614 2621 * No use in flooding the console with ENOSPC
2615 2622 * messages from the same file system.
2616 2623 */
2617 2624 now = ddi_get_lbolt();
2618 2625 if ((error != ENOSPC && error != EDQUOT) ||
2619 2626 now - mi->mi_printftime > 0) {
2620 2627 zoneid_t zoneid = mi->mi_zone->zone_id;
2621 2628
2622 2629 #ifdef DEBUG
2623 2630 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2624 2631 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2625 2632 #else
2626 2633 nfs_perror(error, "NFS write error on host %s: %m.\n",
2627 2634 VTOR(vp)->r_server->sv_hostname, NULL);
2628 2635 #endif
2629 2636 if (error == ENOSPC || error == EDQUOT) {
2630 2637 zcmn_err(zoneid, CE_CONT,
2631 2638 MSG("^File: userid=%d, groupid=%d\n"),
2632 2639 crgetuid(cr), crgetgid(cr));
2633 2640 if (crgetuid(CRED()) != crgetuid(cr) ||
2634 2641 crgetgid(CRED()) != crgetgid(cr)) {
2635 2642 zcmn_err(zoneid, CE_CONT,
2636 2643 MSG("^User: userid=%d, groupid=%d\n"),
2637 2644 crgetuid(CRED()), crgetgid(CRED()));
2638 2645 }
2639 2646 mi->mi_printftime = now +
2640 2647 nfs_write_error_interval * hz;
2641 2648 }
2642 2649 nfs_printfhandle(&VTOR(vp)->r_fh);
2643 2650 #ifdef DEBUG
2644 2651 if (error == EACCES) {
2645 2652 zcmn_err(zoneid, CE_CONT,
2646 2653 MSG("^nfs_bio: cred is%s kcred\n"),
2647 2654 cr == kcred ? "" : " not");
2648 2655 }
2649 2656 #endif
2650 2657 }
2651 2658 }
2652 2659
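
The ENOSPC/EDQUOT throttling in nfs_write_error() is a simple "next allowed time" scheme: print only once lbolt has passed mi_printftime, then push mi_printftime out by nfs_write_error_interval * hz. The same pattern in a stand-alone sketch, using wall-clock seconds instead of clock ticks (names invented for illustration):

    #include <stdio.h>
    #include <time.h>

    static time_t printftime;			/* next time a message is allowed */
    static const time_t error_interval = 5;	/* seconds between messages */

    /* Print at most one message per error_interval seconds. */
    static void
    throttled_error(const char *msg)
    {
        time_t now = time(NULL);

        if (now - printftime > 0) {
            printf("write error: %s\n", msg);
            printftime = now + error_interval;
        }
    }

    int
    main(void)
    {
        for (int i = 0; i < 3; i++)
            throttled_error("no space left on device");
        return (0);
    }
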
2653 2660 /* ARGSUSED */
2654 2661 static void *
2655 2662 nfs_mi_init(zoneid_t zoneid)
2656 2663 {
2657 2664 struct mi_globals *mig;
2658 2665
2659 2666 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2660 2667 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2661 2668 list_create(&mig->mig_list, sizeof (mntinfo_t),
2662 2669 offsetof(mntinfo_t, mi_zone_node));
2663 2670 mig->mig_destructor_called = B_FALSE;
2664 2671 return (mig);
2665 2672 }
2666 2673
2667 2674 /*
2668 2675 * Callback routine to tell all NFS mounts in the zone to stop creating new
2669 2676 * threads. Existing threads should exit.
2670 2677 */
2671 2678 /* ARGSUSED */
2672 2679 static void
2673 2680 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2674 2681 {
2675 2682 struct mi_globals *mig = data;
2676 2683 mntinfo_t *mi;
2677 2684
2678 2685 ASSERT(mig != NULL);
2679 2686 again:
2680 2687 mutex_enter(&mig->mig_lock);
2681 2688 for (mi = list_head(&mig->mig_list); mi != NULL;
2682 2689 mi = list_next(&mig->mig_list, mi)) {
2683 2690
2684 2691 /*
2685 2692 * If we've done the shutdown work for this FS, skip.
2686 2693 * Once we go off the end of the list, we're done.
2687 2694 */
2688 2695 if (mi->mi_flags & MI_DEAD)
2689 2696 continue;
2690 2697
2691 2698 /*
2692 2699 * We will do work, so not done. Get a hold on the FS.
2693 2700 */
2694 2701 VFS_HOLD(mi->mi_vfsp);
2695 2702
2696 2703 /*
2697 2704 * purge the DNLC for this filesystem
2698 2705 */
2699 2706 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2700 2707
2701 2708 mutex_enter(&mi->mi_async_lock);
2702 2709 /*
2703 2710 * Tell existing async worker threads to exit.
2704 2711 */
2705 2712 mi->mi_max_threads = 0;
2706 2713 NFS_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2707 2714 /*
2708 2715 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2709 2716 * getting ready to exit when it's done with its current work.
2710 2717 * Also set MI_DEAD to note we've acted on this FS.
2711 2718 */
2712 2719 mutex_enter(&mi->mi_lock);
2713 2720 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2714 2721 mutex_exit(&mi->mi_lock);
2715 2722 /*
2716 2723 * Wake up the async manager thread.
2717 2724 */
2718 2725 cv_broadcast(&mi->mi_async_reqs_cv);
2719 2726 mutex_exit(&mi->mi_async_lock);
2720 2727
2721 2728 /*
2722 2729 * Drop lock and release FS, which may change list, then repeat.
2723 2730 * We're done when every mi has been done or the list is empty.
2724 2731 */
2725 2732 mutex_exit(&mig->mig_lock);
2726 2733 VFS_RELE(mi->mi_vfsp);
2727 2734 goto again;
2728 2735 }
2729 2736 mutex_exit(&mig->mig_lock);
2730 2737 }
2731 2738
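
Because nfs_mi_shutdown() must drop mig_lock around VFS_RELE(), and the list can change while the lock is down, it marks each mount MI_DEAD and restarts the scan from the head after every drop. The bare mark-and-restart idiom, with the locking and the real linked list omitted for brevity (everything here is invented for illustration):

    #include <stdio.h>

    #define	NMOUNTS	4

    struct mount {
        int	id;
        int	dead;	/* stands in for MI_DEAD */
    };

    static struct mount mounts[NMOUNTS] = {
        { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }
    };

    int
    main(void)
    {
        int i;

    again:
        /* the list lock would be taken here */
        for (i = 0; i < NMOUNTS; i++) {
            if (mounts[i].dead)	/* already handled; skip it */
                continue;

            mounts[i].dead = 1;	/* note that we've acted on it */

            /*
             * Drop the lock, do the work that may change the list,
             * then rescan from the start.
             */
            printf("shutting down mount %d\n", mounts[i].id);
            goto again;
        }
        /* fell off the end: every entry is marked, so we're done */
        return (0);
    }
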
2732 2739 static void
2733 2740 nfs_mi_free_globals(struct mi_globals *mig)
2734 2741 {
2735 2742 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2736 2743 mutex_destroy(&mig->mig_lock);
2737 2744 kmem_free(mig, sizeof (*mig));
2738 2745
2739 2746 }
2740 2747
2741 2748 /* ARGSUSED */
2742 2749 static void
2743 2750 nfs_mi_destroy(zoneid_t zoneid, void *data)
2744 2751 {
2745 2752 struct mi_globals *mig = data;
2746 2753
2747 2754 ASSERT(mig != NULL);
2748 2755 mutex_enter(&mig->mig_lock);
2749 2756 if (list_head(&mig->mig_list) != NULL) {
2750 2757 /* Still waiting for VFS_FREEVFS() */
2751 2758 mig->mig_destructor_called = B_TRUE;
2752 2759 mutex_exit(&mig->mig_lock);
2753 2760 return;
2754 2761 }
2755 2762 nfs_mi_free_globals(mig);
2756 2763 }
2757 2764
2758 2765 /*
2759 2766 * Add an NFS mount to the per-zone list of NFS mounts.
2760 2767 */
2761 2768 void
2762 2769 nfs_mi_zonelist_add(mntinfo_t *mi)
2763 2770 {
2764 2771 struct mi_globals *mig;
2765 2772
2766 2773 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2767 2774 mutex_enter(&mig->mig_lock);
2768 2775 list_insert_head(&mig->mig_list, mi);
2769 2776 mutex_exit(&mig->mig_lock);
2770 2777 }
2771 2778
2772 2779 /*
2773 2780 * Remove an NFS mount from the per-zone list of NFS mounts.
2774 2781 */
2775 2782 static void
2776 2783 nfs_mi_zonelist_remove(mntinfo_t *mi)
2777 2784 {
2778 2785 struct mi_globals *mig;
2779 2786
2780 2787 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2781 2788 mutex_enter(&mig->mig_lock);
2782 2789 list_remove(&mig->mig_list, mi);
2783 2790 /*
2784 2791 * We can be called asynchronously by VFS_FREEVFS() after the zone
2785 2792 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2786 2793 * mi globals.
2787 2794 */
2788 2795 if (list_head(&mig->mig_list) == NULL &&
2789 2796 mig->mig_destructor_called == B_TRUE) {
2790 2797 nfs_mi_free_globals(mig);
2791 2798 return;
2792 2799 }
2793 2800 mutex_exit(&mig->mig_lock);
2794 2801 }
2795 2802
2796 2803 /*
2797 2804 * NFS Client initialization routine. This routine should only be called
2798 2805 * once. It performs the following tasks:
2799 2806 * - Initialize all global locks
2800 2807 * - Call sub-initialization routines (localize access to variables)
2801 2808 */
2802 2809 int
2803 2810 nfs_clntinit(void)
2804 2811 {
2805 2812 #ifdef DEBUG
2806 2813 static boolean_t nfs_clntup = B_FALSE;
2807 2814 #endif
2808 2815 int error;
2809 2816
2810 2817 #ifdef DEBUG
2811 2818 ASSERT(nfs_clntup == B_FALSE);
2812 2819 #endif
2813 2820
2814 2821 error = nfs_subrinit();
2815 2822 if (error)
2816 2823 return (error);
2817 2824
2818 2825 error = nfs_vfsinit();
2819 2826 if (error) {
2820 2827 /*
2821 2828 * Cleanup nfs_subrinit() work
2822 2829 */
2823 2830 nfs_subrfini();
2824 2831 return (error);
2825 2832 }
2826 2833 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2827 2834 nfs_mi_destroy);
2828 2835
2829 2836 nfs4_clnt_init();
2830 2837
2838 + nfscmd_init();
2839 +
2831 2840 #ifdef DEBUG
2832 2841 nfs_clntup = B_TRUE;
2833 2842 #endif
2834 2843
2835 2844 return (0);
2836 2845 }
2837 2846
2838 2847 /*
2839 2848 * This routine is only called if the NFS Client has been initialized but
2840 2849 * the module failed to be installed. This routine will cleanup the previously
2841 2850 * allocated/initialized work.
2842 2851 */
2843 2852 void
2844 2853 nfs_clntfini(void)
2845 2854 {
2846 2855 (void) zone_key_delete(mi_list_key);
2847 2856 nfs_subrfini();
2848 2857 nfs_vfsfini();
2849 2858 nfs4_clnt_fini();
2859 + nfscmd_fini();
2850 2860 }
2851 2861
2852 2862 /*
2853 2863 * nfs_lockrelease:
2854 2864 *
2855 2865 * Release any locks on the given vnode that are held by the current
2856 2866 * process.
2857 2867 */
2858 2868 void
2859 2869 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2860 2870 {
2861 2871 flock64_t ld;
2862 2872 struct shrlock shr;
2863 2873 char *buf;
2864 2874 int remote_lock_possible;
2865 2875 int ret;
2866 2876
2867 2877 ASSERT((uintptr_t)vp > KERNELBASE);
2868 2878
2869 2879 /*
2870 2880 * Generate an explicit unlock operation for the entire file. As a
2871 2881 * partial optimization, only generate the unlock if there is a
2872 2882 * lock registered for the file. We could check whether this
2873 2883 * particular process has any locks on the file, but that would
2874 2884 * require the local locking code to provide yet another query
2875 2885 * routine. Note that no explicit synchronization is needed here.
2876 2886 * At worst, flk_has_remote_locks() will return a false positive,
2877 2887 * in which case the unlock call wastes time but doesn't harm
2878 2888 * correctness.
2879 2889 *
2880 2890 * In addition, an unlock request is generated if the process
2881 2891 * is listed as possibly having a lock on the file because the
2882 2892 * server and client lock managers may have gotten out of sync.
2883 2893 * N.B. It is important to make sure nfs_remove_locking_id() is
2884 2894 * called here even if flk_has_remote_locks(vp) reports true.
2885 2895 * If it is not called and there is an entry on the process id
2886 2896 * list, that entry will never get removed.
2887 2897 */
2888 2898 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2889 2899 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2890 2900 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2891 2901 ld.l_type = F_UNLCK; /* set to unlock entire file */
2892 2902 ld.l_whence = 0; /* unlock from start of file */
2893 2903 ld.l_start = 0;
2894 2904 ld.l_len = 0; /* do entire file */
2895 2905 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr,
2896 2906 NULL);
2897 2907
2898 2908 if (ret != 0) {
2899 2909 /*
2900 2910 * If VOP_FRLOCK fails, make sure we unregister
2901 2911 * local locks before we continue.
2902 2912 */
2903 2913 ld.l_pid = ttoproc(curthread)->p_pid;
2904 2914 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2905 2915 #ifdef DEBUG
2906 2916 nfs_perror(ret,
2907 2917 "NFS lock release error on vp %p: %m.\n",
2908 2918 (void *)vp, NULL);
2909 2919 #endif
2910 2920 }
2911 2921
2912 2922 /*
2913 2923 * The call to VOP_FRLOCK may put the pid back on the
2914 2924 * list. We need to remove it.
2915 2925 */
2916 2926 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2917 2927 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2918 2928 }
2919 2929
2920 2930 /*
2921 2931 * As long as the vp has a share matching our pid,
2922 2932 * pluck it off and unshare it. There are circumstances in
2923 2933 * which the call to nfs_remove_locking_id() may put the
2924 2934 * owner back on the list, in which case we simply do a
2925 2935 * redundant and harmless unshare.
2926 2936 */
2927 2937 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2928 2938 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2929 2939 (char *)NULL, buf, &shr.s_own_len)) {
2930 2940 shr.s_owner = buf;
2931 2941 shr.s_access = 0;
2932 2942 shr.s_deny = 0;
2933 2943 shr.s_sysid = 0;
2934 2944 shr.s_pid = curproc->p_pid;
2935 2945
2936 2946 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr, NULL);
2937 2947 #ifdef DEBUG
2938 2948 if (ret != 0) {
2939 2949 nfs_perror(ret,
2940 2950 "NFS share release error on vp %p: %m.\n",
2941 2951 (void *)vp, NULL);
2942 2952 }
2943 2953 #endif
2944 2954 }
2945 2955 kmem_free(buf, MAX_SHR_OWNER_LEN);
2946 2956 }
2947 2957
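
The "unlock the entire file" record built above is the standard POSIX shape: F_UNLCK with l_whence = 0 (SEEK_SET), l_start = 0 and l_len = 0 covers the whole file. Issued from userland, roughly the same request looks like the following (the path and error handling are illustrative only):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        struct flock ld;
        int fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);

        if (fd < 0) {
            perror("open");
            return (1);
        }

        ld.l_type = F_UNLCK;	/* release, rather than acquire */
        ld.l_whence = SEEK_SET;	/* measure from the start of the file */
        ld.l_start = 0;
        ld.l_len = 0;		/* zero length means "to EOF", i.e. whole file */

        if (fcntl(fd, F_SETLK, &ld) < 0)
            perror("fcntl(F_SETLK, F_UNLCK)");

        (void) close(fd);
        return (0);
    }
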
2948 2958 /*
2949 2959 * nfs_lockcompletion:
2950 2960 *
2951 2961 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2952 2962 * as non cachable (set VNOCACHE bit).
2953 2963 */
2954 2964
2955 2965 void
2956 2966 nfs_lockcompletion(vnode_t *vp, int cmd)
2957 2967 {
2958 2968 #ifdef DEBUG
2959 2969 rnode_t *rp = VTOR(vp);
2960 2970
2961 2971 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2962 2972 #endif
2963 2973
2964 2974 if (cmd == F_SETLK || cmd == F_SETLKW) {
2965 2975 if (!lm_safemap(vp)) {
2966 2976 mutex_enter(&vp->v_lock);
2967 2977 vp->v_flag |= VNOCACHE;
2968 2978 mutex_exit(&vp->v_lock);
2969 2979 } else {
2970 2980 mutex_enter(&vp->v_lock);
2971 2981 vp->v_flag &= ~VNOCACHE;
2972 2982 mutex_exit(&vp->v_lock);
2973 2983 }
2974 2984 }
2975 2985 /*
2976 2986 * The cached attributes of the file are stale after acquiring
2977 2987 * the lock on the file. They were updated when the file was
2978 2988 * opened, but not updated when the lock was acquired. Therefore the
2979 2989 * cached attributes are invalidated after the lock is obtained.
2980 2990 */
2981 2991 PURGE_ATTRCACHE(vp);
2982 2992 }
2983 2993
2984 2994 /*
2985 2995 * The lock manager holds state making it possible for the client
2986 2996 * and server to be out of sync. For example, if the response from
2987 2997 * the server granting a lock request is lost, the server will think
2988 2998 * the lock is granted and the client will think the lock is lost.
2989 2999 * The client can tell when it is not positive if it is in sync with
2990 3000 * the server.
2991 3001 *
2992 3002 * To deal with this, a list of processes for which the client is
2993 3003 * not sure if the server holds a lock is attached to the rnode.
2994 3004 * When such a process closes the rnode, an unlock request is sent
2995 3005 * to the server to unlock the entire file.
2996 3006 *
2997 3007 * The list is kept as a singly linked, NULL-terminated list.
2998 3008 * Because it is only added to under extreme error conditions, the
2999 3009 * list shouldn't get very big. DEBUG kernels print a message if
3000 3010 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
3001 3011 * chosen to be 8, but can be tuned at runtime.
3002 3012 */
3003 3013 #ifdef DEBUG
3004 3014 /* int nfs_lmpl_high_water = 8; */
3005 3015 int nfs_lmpl_high_water = 128;
3006 3016 int nfs_cnt_add_locking_id = 0;
3007 3017 int nfs_len_add_locking_id = 0;
3008 3018 #endif /* DEBUG */
3009 3019
3010 3020 /*
3011 3021 * Record that the nfs lock manager server may be holding a lock on
3012 3022 * a vnode for a process.
3013 3023 *
3014 3024 * Because the nfs lock manager server holds state, it is possible
3015 3025 * for the server to get out of sync with the client. This routine is called
3016 3026 * from the client when it is no longer sure if the server is in sync
3017 3027 * with the client. nfs_lockrelease() will then notice this and send
3018 3028 * an unlock request when the file is closed
3019 3029 */
3020 3030 void
3021 3031 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
3022 3032 {
3023 3033 rnode_t *rp;
3024 3034 lmpl_t *new;
3025 3035 lmpl_t *cur;
3026 3036 lmpl_t **lmplp;
3027 3037 #ifdef DEBUG
3028 3038 int list_len = 1;
3029 3039 #endif /* DEBUG */
3030 3040
3031 3041 #ifdef DEBUG
3032 3042 ++nfs_cnt_add_locking_id;
3033 3043 #endif /* DEBUG */
3034 3044 /*
3035 3045 * allocate new lmpl_t now so we don't sleep
3036 3046 * later after grabbing mutexes
3037 3047 */
3038 3048 ASSERT(len < MAX_SHR_OWNER_LEN);
3039 3049 new = kmem_alloc(sizeof (*new), KM_SLEEP);
3040 3050 new->lmpl_type = type;
3041 3051 new->lmpl_pid = pid;
3042 3052 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
3043 3053 bcopy(id, new->lmpl_owner, len);
3044 3054 new->lmpl_own_len = len;
3045 3055 new->lmpl_next = (lmpl_t *)NULL;
3046 3056 #ifdef DEBUG
3047 3057 if (type == RLMPL_PID) {
3048 3058 ASSERT(len == sizeof (pid_t));
3049 3059 ASSERT(pid == *(pid_t *)new->lmpl_owner);
3050 3060 } else {
3051 3061 ASSERT(type == RLMPL_OWNER);
3052 3062 }
3053 3063 #endif
3054 3064
3055 3065 rp = VTOR(vp);
3056 3066 mutex_enter(&rp->r_statelock);
3057 3067
3058 3068 /*
3059 3069 * Add this id to the list for this rnode only if the
3060 3070 * rnode is active and the id is not already there.
3061 3071 */
3062 3072 ASSERT(rp->r_flags & RHASHED);
3063 3073 lmplp = &(rp->r_lmpl);
3064 3074 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3065 3075 if (cur->lmpl_pid == pid &&
3066 3076 cur->lmpl_type == type &&
3067 3077 cur->lmpl_own_len == len &&
3068 3078 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
3069 3079 kmem_free(new->lmpl_owner, len);
3070 3080 kmem_free(new, sizeof (*new));
3071 3081 break;
3072 3082 }
3073 3083 lmplp = &cur->lmpl_next;
3074 3084 #ifdef DEBUG
3075 3085 ++list_len;
3076 3086 #endif /* DEBUG */
3077 3087 }
3078 3088 if (cur == (lmpl_t *)NULL) {
3079 3089 *lmplp = new;
3080 3090 #ifdef DEBUG
3081 3091 if (list_len > nfs_len_add_locking_id) {
3082 3092 nfs_len_add_locking_id = list_len;
3083 3093 }
3084 3094 if (list_len > nfs_lmpl_high_water) {
3085 3095 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
3086 3096 "vp=%p is %d", (void *)vp, list_len);
3087 3097 }
3088 3098 #endif /* DEBUG */
3089 3099 }
3090 3100
3091 3101 #ifdef DEBUG
3092 3102 if (share_debug) {
3093 3103 int nitems = 0;
3094 3104 int npids = 0;
3095 3105 int nowners = 0;
3096 3106
3097 3107 /*
3098 3108 * Count the number of things left on r_lmpl after the remove.
3099 3109 */
3100 3110 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3101 3111 cur = cur->lmpl_next) {
3102 3112 nitems++;
3103 3113 if (cur->lmpl_type == RLMPL_PID) {
3104 3114 npids++;
3105 3115 } else if (cur->lmpl_type == RLMPL_OWNER) {
3106 3116 nowners++;
3107 3117 } else {
3108 3118 cmn_err(CE_PANIC, "nfs_add_locking_id: "
3109 3119 "unrecognized lmpl_type %d",
3110 3120 cur->lmpl_type);
3111 3121 }
3112 3122 }
3113 3123
3114 3124 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
3115 3125 "OWNs = %d items left on r_lmpl\n",
3116 3126 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
3117 3127 }
3118 3128 #endif
3119 3129
3120 3130 mutex_exit(&rp->r_statelock);
3121 3131 }
3122 3132
3123 3133 /*
3124 3134 * Remove an id from the lock manager id list.
3125 3135 *
3126 3136 * If the id is not in the list return 0. If it was found and
3127 3137 * removed, return 1.
3128 3138 */
3129 3139 static int
3130 3140 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3131 3141 {
3132 3142 lmpl_t *cur;
3133 3143 lmpl_t **lmplp;
3134 3144 rnode_t *rp;
3135 3145 int rv = 0;
3136 3146
3137 3147 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3138 3148
3139 3149 rp = VTOR(vp);
3140 3150
3141 3151 mutex_enter(&rp->r_statelock);
3142 3152 ASSERT(rp->r_flags & RHASHED);
3143 3153 lmplp = &(rp->r_lmpl);
3144 3154
3145 3155 /*
3146 3156 * Search through the list and remove the entry for this id
3147 3157 * if it is there. The special case id == NULL allows removal
3148 3158 * of the first share on the r_lmpl list belonging to the
3149 3159 * current process (if any), without regard to further details
3150 3160 * of its identity.
3151 3161 */
3152 3162 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3153 3163 if (cur->lmpl_type == type &&
3154 3164 cur->lmpl_pid == curproc->p_pid &&
3155 3165 (id == (char *)NULL ||
3156 3166 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3157 3167 *lmplp = cur->lmpl_next;
3158 3168 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3159 3169 if (rid != NULL) {
3160 3170 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3161 3171 *rlen = cur->lmpl_own_len;
3162 3172 }
3163 3173 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3164 3174 kmem_free(cur, sizeof (*cur));
3165 3175 rv = 1;
3166 3176 break;
3167 3177 }
3168 3178 lmplp = &cur->lmpl_next;
3169 3179 }
3170 3180
3171 3181 #ifdef DEBUG
3172 3182 if (share_debug) {
3173 3183 int nitems = 0;
3174 3184 int npids = 0;
3175 3185 int nowners = 0;
3176 3186
3177 3187 /*
3178 3188 * Count the number of things left on r_lmpl after the remove.
3179 3189 */
3180 3190 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3181 3191 cur = cur->lmpl_next) {
3182 3192 nitems++;
3183 3193 if (cur->lmpl_type == RLMPL_PID) {
3184 3194 npids++;
3185 3195 } else if (cur->lmpl_type == RLMPL_OWNER) {
3186 3196 nowners++;
3187 3197 } else {
3188 3198 cmn_err(CE_PANIC,
3189 3199 "nrli: unrecognized lmpl_type %d",
3190 3200 cur->lmpl_type);
3191 3201 }
3192 3202 }
3193 3203
3194 3204 cmn_err(CE_CONT,
3195 3205 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3196 3206 (type == RLMPL_PID) ? "P" : "O",
3197 3207 npids,
3198 3208 nowners,
3199 3209 nitems);
3200 3210 }
3201 3211 #endif
3202 3212
3203 3213 mutex_exit(&rp->r_statelock);
3204 3214 return (rv);
3205 3215 }
3206 3216
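
Both nfs_add_locking_id() and nfs_remove_locking_id() walk r_lmpl through a pointer-to-pointer (lmplp), which lets them splice an entry in or out without special-casing the head of the list. A minimal stand-alone sketch of that idiom, with invented types in place of lmpl_t:

    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        int		key;
        struct node	*next;
    };

    /* Append key if absent, using a pointer to the link being examined. */
    static void
    add_if_absent(struct node **headp, int key)
    {
        struct node **linkp, *cur, *new;

        for (linkp = headp; (cur = *linkp) != NULL; linkp = &cur->next) {
            if (cur->key == key)
                return;		/* already present */
        }
        new = malloc(sizeof (*new));
        if (new == NULL)
            return;
        new->key = key;
        new->next = NULL;
        *linkp = new;			/* works for head and tail alike */
    }

    /* Remove key if present; return 1 if something was removed. */
    static int
    remove_key(struct node **headp, int key)
    {
        struct node **linkp, *cur;

        for (linkp = headp; (cur = *linkp) != NULL; linkp = &cur->next) {
            if (cur->key == key) {
                *linkp = cur->next;	/* unlink without a "prev" pointer */
                free(cur);
                return (1);
            }
        }
        return (0);
    }

    int
    main(void)
    {
        struct node *head = NULL;

        add_if_absent(&head, 1);
        add_if_absent(&head, 2);
        add_if_absent(&head, 1);		/* duplicate, ignored */
        printf("removed 1: %d\n", remove_key(&head, 1));
        printf("removed 3: %d\n", remove_key(&head, 3));
        for (struct node *n = head; n != NULL; n = n->next)
            printf("left: %d\n", n->key);
        return (0);
    }
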
3207 3217 void
3208 3218 nfs_free_mi(mntinfo_t *mi)
3209 3219 {
3210 3220 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3211 3221 ASSERT(mi->mi_manager_thread == NULL);
3212 3222 ASSERT(mi->mi_threads[NFS_ASYNC_QUEUE] == 0 &&
3213 3223 mi->mi_threads[NFS_ASYNC_PGOPS_QUEUE] == 0);
3214 3224
3215 3225 /*
3216 3226 * Remove the node from the global list before we start tearing it down.
3217 3227 */
3218 3228 nfs_mi_zonelist_remove(mi);
3219 3229 if (mi->mi_klmconfig) {
3220 3230 lm_free_config(mi->mi_klmconfig);
3221 3231 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3222 3232 }
3223 3233 mutex_destroy(&mi->mi_lock);
3224 3234 mutex_destroy(&mi->mi_remap_lock);
3225 3235 mutex_destroy(&mi->mi_async_lock);
3226 3236 cv_destroy(&mi->mi_failover_cv);
3227 3237 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_QUEUE]);
3228 3238 cv_destroy(&mi->mi_async_work_cv[NFS_ASYNC_PGOPS_QUEUE]);
3229 3239 cv_destroy(&mi->mi_async_reqs_cv);
3230 3240 cv_destroy(&mi->mi_async_cv);
3231 3241 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFS);
3232 3242 kmem_free(mi, sizeof (*mi));
3233 3243 }
3234 3244
3235 3245 static int
3236 3246 mnt_kstat_update(kstat_t *ksp, int rw)
3237 3247 {
3238 3248 mntinfo_t *mi;
3239 3249 struct mntinfo_kstat *mik;
3240 3250 vfs_t *vfsp;
3241 3251 int i;
3242 3252
3243 3253 /* this is a read-only kstat. Bail out on a write */
3244 3254 if (rw == KSTAT_WRITE)
3245 3255 return (EACCES);
3246 3256
3247 3257 /*
3248 3258 * We don't want to wait here as kstat_chain_lock could be held by
3249 3259 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3250 3260 * and thus could lead to a deadlock.
3251 3261 */
3252 3262 vfsp = (struct vfs *)ksp->ks_private;
3253 3263
3254 3264
3255 3265 mi = VFTOMI(vfsp);
3256 3266
3257 3267 mik = (struct mntinfo_kstat *)ksp->ks_data;
3258 3268
3259 3269 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3260 3270 mik->mik_vers = (uint32_t)mi->mi_vers;
3261 3271 mik->mik_flags = mi->mi_flags;
3262 3272 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3263 3273 mik->mik_curread = (uint32_t)mi->mi_curread;
3264 3274 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3265 3275 mik->mik_retrans = mi->mi_retrans;
3266 3276 mik->mik_timeo = mi->mi_timeo;
3267 3277 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3268 3278 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3269 3279 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3270 3280 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3271 3281 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3272 3282 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3273 3283 mik->mik_timers[i].deviate =
3274 3284 (uint32_t)mi->mi_timers[i].rt_deviate;
3275 3285 mik->mik_timers[i].rtxcur =
3276 3286 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3277 3287 }
3278 3288 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3279 3289 mik->mik_failover = (uint32_t)mi->mi_failover;
3280 3290 mik->mik_remap = (uint32_t)mi->mi_remap;
3281 3291 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3282 3292
3283 3293 return (0);
3284 3294 }
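
mnt_kstat_update() only fills in ks_data when the raw "mntinfo" kstat is read, so its natural consumer is a libkstat client in userland (nfsstat -m reports this data). Below is a minimal sketch of such a reader; it assumes the standard libkstat interfaces and that struct mntinfo_kstat is visible to userland via <nfs/nfs_clnt.h>, and it trims error handling for brevity.

/* Hypothetical userland reader sketch; build with -lkstat. */
#include <stdio.h>
#include <string.h>
#include <kstat.h>
#include <nfs/nfs_clnt.h>

int
main(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;

	if (kc == NULL)
		return (1);

	/* Walk every "nfs" module kstat named "mntinfo" (one per mount). */
	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		struct mntinfo_kstat *mik;

		if (strcmp(ksp->ks_module, "nfs") != 0 ||
		    strcmp(ksp->ks_name, "mntinfo") != 0)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;

		/* mnt_kstat_update() populated ks_data during the read. */
		mik = (struct mntinfo_kstat *)ksp->ks_data;
		(void) printf("server=%s proto=%s vers=%u\n",
		    mik->mik_curserver, mik->mik_proto, mik->mik_vers);
	}
	(void) kstat_close(kc);
	return (0);
}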
3285 3295
3286 3296 void
3287 3297 nfs_mnt_kstat_init(struct vfs *vfsp)
3288 3298 {
3289 3299 mntinfo_t *mi = VFTOMI(vfsp);
3290 3300
3291 3301 /*
3292 3302 * Create the version specific kstats.
3293 3303 *
3294 3304 * PSARC 2001/697 Contract Private Interface
3295 3305 * All nfs kstats are under SunMC contract
3296 3306 * Please refer to the PSARC listed above and contact
3297 3307 * SunMC before making any changes!
3298 3308 *
3299 3309 * Changes must be reviewed by Solaris File Sharing
3300 3310 * Changes must be communicated to contract-2001-697@sun.com
3301 3311 *
3302 3312 */
3303 3313
3304 3314 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3305 3315 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3306 3316 if (mi->mi_io_kstats) {
3307 3317 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3308 3318 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3309 3319 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3310 3320 kstat_install(mi->mi_io_kstats);
3311 3321 }
3312 3322
3313 3323 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3314 3324 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3315 3325 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3316 3326 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3317 3327 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3318 3328 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3319 3329 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3320 3330 kstat_install(mi->mi_ro_kstats);
3321 3331 }
3322 3332 }
3323 3333
3324 3334 nfs_delmapcall_t *
3325 3335 nfs_init_delmapcall()
3326 3336 {
3327 3337 nfs_delmapcall_t *delmap_call;
3328 3338
3329 3339 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3330 3340 delmap_call->call_id = curthread;
3331 3341 delmap_call->error = 0;
3332 3342
3333 3343 return (delmap_call);
3334 3344 }
3335 3345
3336 3346 void
3337 3347 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3338 3348 {
3339 3349 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3340 3350 }
3341 3351
3342 3352 /*
3343 3353 * Searches for the current delmap caller (based on curthread) in the list of
3344 3354 * callers. If it is found, we remove it and free the delmap caller.
3345 3355 * Returns:
3346 3356 * 0 if the caller wasn't found
3347 3357 * 1 if the caller was found, removed and freed. *errp is set to what
3348 3358 * the result of the delmap was.
3349 3359 */
3350 3360 int
3351 3361 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3352 3362 {
3353 3363 nfs_delmapcall_t *delmap_call;
3354 3364
3355 3365 /*
3356 3366 * If the list doesn't exist yet, we create it and return
3357 3367 * that the caller wasn't found. No list = no callers.
3358 3368 */
3359 3369 mutex_enter(&rp->r_statelock);
3360 3370 if (!(rp->r_flags & RDELMAPLIST)) {
3361 3371 /* The list does not exist */
3362 3372 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3363 3373 offsetof(nfs_delmapcall_t, call_node));
3364 3374 rp->r_flags |= RDELMAPLIST;
3365 3375 mutex_exit(&rp->r_statelock);
3366 3376 return (0);
3367 3377 } else {
3368 3378 /* The list exists so search it */
3369 3379 for (delmap_call = list_head(&rp->r_indelmap);
3370 3380 delmap_call != NULL;
3371 3381 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3372 3382 if (delmap_call->call_id == curthread) {
3373 3383 /* current caller is in the list */
3374 3384 *errp = delmap_call->error;
3375 3385 list_remove(&rp->r_indelmap, delmap_call);
3376 3386 mutex_exit(&rp->r_statelock);
3377 3387 nfs_free_delmapcall(delmap_call);
3378 3388 return (1);
3379 3389 }
3380 3390 }
3381 3391 }
3382 3392 mutex_exit(&rp->r_statelock);
3383 3393 return (0);
3384 3394 }
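
Taken together, nfs_init_delmapcall(), nfs_free_delmapcall() and nfs_find_and_delete_delmapcall() implement a small per-rnode registry keyed on curthread: a re-entrant delmap caller can discover that its earlier call already ran and pick up the recorded result. Below is a sketch of the expected caller protocol, loosely modeled on how the delmap path in nfs_vnops.c uses these helpers; the actual consumer is more involved, and the asynchronous dispatch step is elided.

/* Caller-protocol sketch only, not the real delmap vnode op. */
static int
example_delmap(rnode_t *rp)
{
	nfs_delmapcall_t *delmap_call;
	int error;

	/*
	 * If this thread already has a delmap call recorded on
	 * r_indelmap, this is the recursive invocation: consume the
	 * recorded result and return it.
	 */
	if (nfs_find_and_delete_delmapcall(rp, &error))
		return (error);

	/*
	 * First time through: register this thread so the recursive
	 * call above can find the result later.  The list itself was
	 * created (if needed) by nfs_find_and_delete_delmapcall().
	 */
	delmap_call = nfs_init_delmapcall();

	mutex_enter(&rp->r_statelock);
	list_insert_tail(&rp->r_indelmap, delmap_call);
	mutex_exit(&rp->r_statelock);

	/*
	 * ... dispatch the real (possibly asynchronous) delmap work
	 * here; it records its outcome in delmap_call->error ...
	 */

	return (0);
}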
|
↓ open down ↓ |
525 lines elided |
↑ open up ↑ |