--- old/usr/src/uts/common/vm/vm_pvn.c
+++ new/usr/src/uts/common/vm/vm_pvn.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - paged vnode.
42 42 *
43 43 * This file supplies vm support for the vnode operations that deal with pages.
44 44 */
45 45 #include <sys/types.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/time.h>
51 51 #include <sys/buf.h>
52 52 #include <sys/vnode.h>
53 53 #include <sys/uio.h>
54 54 #include <sys/vmsystm.h>
55 55 #include <sys/mman.h>
56 56 #include <sys/vfs.h>
57 57 #include <sys/cred.h>
58 58 #include <sys/user.h>
59 59 #include <sys/kmem.h>
60 60 #include <sys/cmn_err.h>
61 61 #include <sys/debug.h>
62 62 #include <sys/cpuvar.h>
63 63 #include <sys/vtrace.h>
64 64 #include <sys/tnf_probe.h>
65 65
66 66 #include <vm/hat.h>
67 67 #include <vm/as.h>
68 68 #include <vm/seg.h>
69 69 #include <vm/rm.h>
70 70 #include <vm/pvn.h>
71 71 #include <vm/page.h>
72 72 #include <vm/seg_map.h>
73 73 #include <vm/seg_kmem.h>
74 74 #include <sys/fs/swapnode.h>
75 75
76 76 int pvn_nofodklust = 0;
77 77 int pvn_write_noklust = 0;
78 78
79 79 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
80 80 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
81 81 /* support for vmodsort for testing */
82 82
83 83 static struct kmem_cache *marker_cache = NULL;
84 84
85 85 /*
86 86 * Find the largest contiguous range of pages containing `addr' at
87 87 * file offset `offset' that lies within the file system block limits
88 88 * (`vp_off' and `vp_len') and the address space limits, for which no
89 89 * pages currently exist and which maps to consecutive file offsets.
90 90 */
91 91 page_t *
92 92 pvn_read_kluster(
93 93 struct vnode *vp,
94 94 u_offset_t off,
95 95 struct seg *seg,
96 96 caddr_t addr,
97 97 u_offset_t *offp, /* return values */
98 98 size_t *lenp, /* return values */
99 99 u_offset_t vp_off,
100 100 size_t vp_len,
101 101 int isra)
102 102 {
103 103 ssize_t deltaf, deltab;
104 104 page_t *pp;
105 105 page_t *plist = NULL;
106 106 spgcnt_t pagesavail;
107 107 u_offset_t vp_end;
108 108
109 109 ASSERT(off >= vp_off && off < vp_off + vp_len);
110 110
111 111 /*
112 112 * We only want to do klustering/read ahead if there
113 113 * are more than minfree pages currently available.
114 114 */
115 115 pagesavail = freemem - minfree;
116 116
117 117 if (pagesavail <= 0)
118 118 if (isra)
119 119 return ((page_t *)NULL); /* ra case - give up */
120 120 else
121 121 pagesavail = 1; /* must return a page */
122 122
123 123 /* We calculate in pages instead of bytes due to 32-bit overflows */
124 124 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
125 125 /*
126 126 * Don't have enough free memory for the
127 127 * max request, try sizing down vp request.
128 128 */
129 129 deltab = (ssize_t)(off - vp_off);
130 130 vp_len -= deltab;
131 131 vp_off += deltab;
132 132 if (pagesavail < btopr(vp_len)) {
133 133 /*
134 134 * Still not enough memory, just settle for
135 135 * pagesavail which is at least 1.
136 136 */
137 137 vp_len = ptob(pagesavail);
138 138 }
139 139 }
140 140
141 141 vp_end = vp_off + vp_len;
142 142 ASSERT(off >= vp_off && off < vp_end);
143 143
144 144 if (isra && SEGOP_KLUSTER(seg, addr, 0))
145 145 return ((page_t *)NULL); /* segment driver says no */
146 146
147 147 if ((plist = page_create_va(vp, off,
148 148 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
149 149 return ((page_t *)NULL);
150 150
151 151 if (vp_len <= PAGESIZE || pvn_nofodklust) {
152 152 *offp = off;
153 153 *lenp = MIN(vp_len, PAGESIZE);
154 154 } else {
155 155 /*
156 156 * Scan back from front by incrementing "deltab" and
157 157 * comparing "off" with "vp_off + deltab" to avoid
158 158 * "signed" versus "unsigned" conversion problems.
159 159 */
160 160 for (deltab = PAGESIZE; off >= vp_off + deltab;
161 161 deltab += PAGESIZE) {
162 162 /*
163 163 * Call back to the segment driver to verify that
164 164 * the klustering/read ahead operation makes sense.
165 165 */
166 166 if (SEGOP_KLUSTER(seg, addr, -deltab))
167 167 break; /* page not eligible */
168 168 if ((pp = page_create_va(vp, off - deltab,
169 169 PAGESIZE, PG_EXCL, seg, addr - deltab))
170 170 == NULL)
171 171 break; /* already have the page */
172 172 /*
173 173 * Add page to front of page list.
174 174 */
175 175 page_add(&plist, pp);
176 176 }
177 177 deltab -= PAGESIZE;
178 178
179 179 /* scan forward from front */
180 180 for (deltaf = PAGESIZE; off + deltaf < vp_end;
181 181 deltaf += PAGESIZE) {
182 182 /*
183 183 * Call back to the segment driver to verify that
184 184 * the klustering/read ahead operation makes sense.
185 185 */
186 186 if (SEGOP_KLUSTER(seg, addr, deltaf))
187 187 break; /* page not file extension */
188 188 if ((pp = page_create_va(vp, off + deltaf,
189 189 PAGESIZE, PG_EXCL, seg, addr + deltaf))
190 190 == NULL)
191 191 break; /* already have page */
192 192
193 193 /*
194 194 * Add page to end of page list.
195 195 */
196 196 page_add(&plist, pp);
197 197 plist = plist->p_next;
198 198 }
199 199 *offp = off = off - deltab;
200 200 *lenp = deltab + deltaf;
201 201 ASSERT(off >= vp_off);
202 202
203 203 /*
204 204 * If we ended up getting more than was actually
205 205 * requested, retract the returned length to only
206 206 * reflect what was requested. This might happen
207 207 * if we were allowed to kluster pages across a
208 208 * span of (say) 5 frags, and frag size is less
209 209 * than PAGESIZE. We need a whole number of
210 210 * pages to contain those frags, but the returned
211 211 * size should only allow the returned range to
212 212 * extend as far as the end of the frags.
213 213 */
214 214 if ((vp_off + vp_len) < (off + *lenp)) {
215 215 ASSERT(vp_end > off);
216 216 *lenp = vp_end - off;
217 217 }
218 218 }
219 219 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
220 220 "pvn_read_kluster:seg %p addr %x isra %x",
221 221 seg, addr, isra);
222 222 return (plist);
223 223 }
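
To make the calling convention concrete, here is a minimal sketch of a getapage-style helper built around pvn_read_kluster(). Everything prefixed my_ is hypothetical, the kluster window is an assumption, and error paths are trimmed; pageio_setup(), bdev_strategy(), biowait() and pageio_done() appear in their usual DDI roles, and device/block mapping is elided.

/*
 * Sketch only (hypothetical my_getapage): read the page at `off',
 * klustering neighbors within an assumed window, then hand the pages
 * to the caller via pvn_plist_init().
 */
static int
my_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	const u_offset_t klustsz = 8 * PAGESIZE;	/* assumed window */
	u_offset_t io_off;
	size_t io_len;
	page_t *pp;
	struct buf *bp;
	int err;

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
	    off & ~(klustsz - 1), klustsz, 0 /* not read-ahead */);
	if (pp == NULL)
		return (0);	/* already cached; a real fs would */
				/* page_lookup() it here (elided) */

	bp = pageio_setup(pp, io_len, vp, B_READ);
	/* fs-specific (elided): set bp->b_edev/bp->b_blkno via a bmap */
	(void) bdev_strategy(bp);
	err = biowait(bp);
	pageio_done(bp);

	if (err != 0) {
		pvn_read_done(pp, B_ERROR);	/* destroys the pages */
		return (err);
	}
	if (pl != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	else
		pvn_read_done(pp, 0);		/* read-ahead: release */
	return (0);
}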
224 224
225 225 /*
226 226 * Handle pages for this vnode on either side of the page "pp"
227 227 * which has been locked by the caller. This routine will also
228 228 * do klustering in the range [vp_off, vp_off + vp_len] up
229 229 * which has been locked by the caller. This routine will also
230 230 * do klustering in the range [vp_off, vp_off + vp_len], stopping

231 231 *
232 232 * Returns a list of dirty locked pages all ready to be
233 233 * written back.
234 234 */
235 235 page_t *
236 236 pvn_write_kluster(
237 237 struct vnode *vp,
238 238 page_t *pp,
239 239 u_offset_t *offp, /* return values */
240 240 size_t *lenp, /* return values */
241 241 u_offset_t vp_off,
242 242 size_t vp_len,
243 243 int flags)
244 244 {
245 245 u_offset_t off;
246 246 page_t *dirty;
247 247 size_t deltab, deltaf;
248 248 se_t se;
249 249 u_offset_t vp_end;
250 250
251 251 off = pp->p_offset;
252 252
253 253 /*
254 254 * Klustering should not be done if we are invalidating
255 255 * pages since we could destroy pages that belong to
256 256 * some other process if this is a swap vnode.
257 257 */
258 258 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
259 259 *offp = off;
260 260 *lenp = PAGESIZE;
261 261 return (pp);
262 262 }
263 263
264 264 if (flags & (B_FREE | B_INVAL))
265 265 se = SE_EXCL;
266 266 else
267 267 se = SE_SHARED;
268 268
269 269 dirty = pp;
270 270 /*
271 271 * Scan backwards looking for pages to kluster by incrementing
272 272 * "deltab" and comparing "off" with "vp_off + deltab" to
273 273 * avoid "signed" versus "unsigned" conversion problems.
274 274 */
275 275 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
276 276 pp = page_lookup_nowait(vp, off - deltab, se);
277 277 if (pp == NULL)
278 278 break; /* page not found */
279 279 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
280 280 break;
281 281 page_add(&dirty, pp);
282 282 }
283 283 deltab -= PAGESIZE;
284 284
285 285 vp_end = vp_off + vp_len;
286 286 /* now scan forwards looking for pages to kluster */
287 287 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
288 288 pp = page_lookup_nowait(vp, off + deltaf, se);
289 289 if (pp == NULL)
290 290 break; /* page not found */
291 291 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
292 292 break;
293 293 page_add(&dirty, pp);
294 294 dirty = dirty->p_next;
295 295 }
296 296
297 297 *offp = off - deltab;
298 298 *lenp = deltab + deltaf;
299 299 return (dirty);
300 300 }
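
The write-side counterpart: a sketch of the putapage shape that pairs pvn_write_kluster() with pvn_write_done(). Names prefixed my_ are hypothetical; the page is assumed to have come from pvn_getdirty() (dirty and io-locked), and fs block mapping is again elided.

/*
 * Sketch only: write back a dirty, io-locked page together with any
 * klusterable neighbors.  Synchronous completion shown; an async
 * caller would finish in a b_iodone routine instead (see the sketch
 * after pvn_write_done()).
 */
static int
my_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	const u_offset_t klustsz = 8 * PAGESIZE;	/* assumed window */
	u_offset_t io_off;
	size_t io_len;
	page_t *dirty;
	struct buf *bp;
	int err = 0;

	dirty = pvn_write_kluster(vp, pp, &io_off, &io_len,
	    pp->p_offset & ~(klustsz - 1), klustsz, flags);

	bp = pageio_setup(dirty, io_len, vp, B_WRITE | flags);
	/* fs-specific (elided): set bp->b_edev/bp->b_blkno via a bmap; */
	/* an async caller would also point bp->b_iodone at a callback */
	(void) bdev_strategy(bp);

	if ((flags & B_ASYNC) == 0) {
		err = biowait(bp);
		pageio_done(bp);
		pvn_write_done(dirty, (err ? B_ERROR : 0) | B_WRITE | flags);
	}
	if (offp != NULL)
		*offp = io_off;
	if (lenp != NULL)
		*lenp = io_len;
	return (err);
}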
301 301
302 302 /*
303 303 * Generic entry point used to release the "shared/exclusive" lock
304 304 * and the "p_iolock" on pages after i/o is complete.
305 305 */
306 306 void
307 307 pvn_io_done(page_t *plist)
308 308 {
309 309 page_t *pp;
310 310
311 311 while (plist != NULL) {
312 312 pp = plist;
313 313 page_sub(&plist, pp);
314 314 page_io_unlock(pp);
315 315 page_unlock(pp);
316 316 }
317 317 }
318 318
319 319 /*
320 320 * Entry point to be used by file system getpage subr's and
321 321 * other such routines which either want to unlock pages (B_ASYNC
322 322 * request) or destroy a list of pages if an error occurred.
323 323 */
324 324 void
325 325 pvn_read_done(page_t *plist, int flags)
326 326 {
327 327 page_t *pp;
328 328
329 329 while (plist != NULL) {
330 330 pp = plist;
331 331 page_sub(&plist, pp);
332 332 page_io_unlock(pp);
333 333 if (flags & B_ERROR) {
334 334 /*LINTED: constant in conditional context*/
335 335 VN_DISPOSE(pp, B_INVAL, 0, kcred);
336 336 } else {
337 337 (void) page_release(pp, 0);
338 338 }
339 339 }
340 340 }
341 341
342 342 /*
343 343 * Automagic pageout.
344 344 * When memory gets tight, start freeing pages popping out of the
345 345 * write queue.
346 346 */
347 347 int write_free = 1;
348 348 pgcnt_t pages_before_pager = 200; /* LMXXX */
349 349
350 350 /*
351 351 * Routine to be called when page-out's complete.
352 352 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
353 353 * after waiting for i/o to complete (biowait) to free the list of
354 354 * pages associated with the buffer. These pages must be locked
355 355 * before i/o is initiated.
356 356 *
357 357 * If a write error occurs, the pages are marked as modified
358 358 * so the write will be re-tried later.
359 359 */
360 360
361 361 void
362 362 pvn_write_done(page_t *plist, int flags)
363 363 {
364 364 int dfree = 0;
365 365 int pgrec = 0;
366 366 int pgout = 0;
367 367 int pgpgout = 0;
368 368 int anonpgout = 0;
369 369 int anonfree = 0;
370 370 int fspgout = 0;
371 371 int fsfree = 0;
372 372 int execpgout = 0;
373 373 int execfree = 0;
374 374 page_t *pp;
375 375 struct cpu *cpup;
376 376 struct vnode *vp = NULL; /* for probe */
377 377 uint_t ppattr;
378 378 kmutex_t *vphm = NULL;
379 379
380 380 ASSERT((flags & B_READ) == 0);
381 381
382 382 /*
383 383 * If we are about to start paging anyway, start freeing pages.
384 384 */
385 385 if (write_free && freemem < lotsfree + pages_before_pager &&
386 386 (flags & B_ERROR) == 0) {
387 387 flags |= B_FREE;
388 388 }
389 389
390 390 /*
391 391 * Handle each page involved in the i/o operation.
392 392 */
393 393 while (plist != NULL) {
394 394 pp = plist;
395 395 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
396 396 page_sub(&plist, pp);
397 397
398 398 /* Kernel probe support */
399 399 if (vp == NULL)
400 400 vp = pp->p_vnode;
401 401
402 402 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
403 403 /*
404 404 * Move page to the top of the v_page list.
405 405 * Skip pages modified during IO.
406 406 */
407 407 vphm = page_vnode_mutex(vp);
408 408 mutex_enter(vphm);
409 409 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
410 410 page_vpsub(&vp->v_pages, pp);
411 411 page_vpadd(&vp->v_pages, pp);
412 412 }
413 413 mutex_exit(vphm);
414 414 }
415 415
416 416 if (flags & B_ERROR) {
417 417 /*
418 418 * Write operation failed. We don't want
419 419 * to destroy (or free) the page unless B_FORCE
420 420 * is set. We set the mod bit again and release
421 421 * all locks on the page so that it will get written
422 422 * back again later when things are hopefully
423 423 * better again.
424 424 * If B_INVAL and B_FORCE are set we really have
425 425 * to destroy the page.
426 426 */
427 427 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
428 428 page_io_unlock(pp);
429 429 /*LINTED: constant in conditional context*/
430 430 VN_DISPOSE(pp, B_INVAL, 0, kcred);
431 431 } else {
432 432 hat_setmod_only(pp);
433 433 page_io_unlock(pp);
434 434 page_unlock(pp);
435 435 }
436 436 } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
437 437 /*
438 438 * If B_INVALCURONLY is set, then we handle that case
439 439 * in the next conditional if hat_page_is_mapped()
440 440 * indicates that there are no additional mappings
441 441 * to the page.
442 442 */
443 443
444 444 /*
445 445 * XXX - Failed writes with B_INVAL set are
446 446 * not handled appropriately.
447 447 */
448 448 page_io_unlock(pp);
449 449 /*LINTED: constant in conditional context*/
450 450 VN_DISPOSE(pp, B_INVAL, 0, kcred);
451 451 } else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
452 452 /*
453 453 * Update statistics for pages being paged out
454 454 */
455 455 if (pp->p_vnode) {
456 456 if (IS_SWAPFSVP(pp->p_vnode)) {
457 457 anonpgout++;
458 458 } else {
459 459 if (pp->p_vnode->v_flag & VVMEXEC) {
460 460 execpgout++;
461 461 } else {
462 462 fspgout++;
463 463 }
464 464 }
465 465 }
466 466 page_io_unlock(pp);
467 467 pgout = 1;
468 468 pgpgout++;
469 469 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
470 470 "page_ws_out:pp %p", pp);
471 471
472 472 /*
473 473 * The page_struct_lock need not be acquired to
474 474 * examine "p_lckcnt" and "p_cowcnt" since we'll
475 475 * have an "exclusive" lock if the upgrade succeeds.
476 476 */
477 477 if (page_tryupgrade(pp) &&
478 478 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
479 479 /*
480 480 * Check if someone has reclaimed the
481 481 * page. If ref and mod are not set, no
482 482 * one is using it so we can free it.
483 483 * The rest of the system is careful
484 484 * to use the NOSYNC flag to unload
485 485 * translations set up for i/o w/o
486 486 * affecting ref and mod bits.
487 487 *
488 488 * Obtain a copy of the real hardware
489 489 * mod bit using hat_pagesync(pp, HAT_SYNC_DONTZERO)
490 490 * to avoid having to flush the cache.
491 491 */
492 492 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
493 493 HAT_SYNC_STOPON_MOD);
494 494 ck_refmod:
495 495 if (!(ppattr & (P_REF | P_MOD))) {
496 496 if (hat_page_is_mapped(pp)) {
497 497 /*
498 498 * Doesn't look like the page
499 499 * was modified so now we
500 500 * really have to unload the
501 501 * translations. Meanwhile
502 502 * another CPU could've
503 503 * modified it so we have to
504 504 * check again. We don't loop
505 505 * forever here because now
506 506 * the translations are gone
507 507 * and no one can get a new one
508 508 * since we have the "exclusive"
509 509 * lock on the page.
510 510 */
511 511 (void) hat_pageunload(pp,
512 512 HAT_FORCE_PGUNLOAD);
513 513 ppattr = hat_page_getattr(pp,
514 514 P_REF | P_MOD);
515 515 goto ck_refmod;
516 516 }
517 517 /*
518 518 * Update statistics for pages being
519 519 * freed
520 520 */
521 521 if (pp->p_vnode) {
522 522 if (IS_SWAPFSVP(pp->p_vnode)) {
523 523 anonfree++;
524 524 } else {
525 525 if (pp->p_vnode->v_flag
526 526 & VVMEXEC) {
527 527 execfree++;
528 528 } else {
529 529 fsfree++;
530 530 }
531 531 }
532 532 }
533 533 /*LINTED: constant in conditional ctx*/
534 534 VN_DISPOSE(pp, B_FREE,
535 535 (flags & B_DONTNEED), kcred);
536 536 dfree++;
537 537 } else {
538 538 page_unlock(pp);
539 539 pgrec++;
540 540 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
541 541 "page_ws_free:pp %p", pp);
542 542 }
543 543 } else {
544 544 /*
545 545 * Page is either `locked' in memory
546 546 * or was reclaimed and now has a
547 547 * "shared" lock, so release it.
548 548 */
549 549 page_unlock(pp);
550 550 }
551 551 } else {
552 552 /*
553 553 * Neither B_FREE nor B_INVAL nor B_ERROR.
554 554 * Just release locks.
555 555 */
556 556 page_io_unlock(pp);
557 557 page_unlock(pp);
558 558 }
559 559 }
560 560
561 561 CPU_STATS_ENTER_K();
562 562 cpup = CPU; /* get cpup now that CPU cannot change */
563 563 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
564 564 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
565 565 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
566 566 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
567 567 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
568 568 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
569 569 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
570 570 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
571 571 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
572 572 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
573 573 CPU_STATS_EXIT_K();
574 574
575 575 /* Kernel probe */
576 576 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
577 577 tnf_opaque, vnode, vp,
578 578 tnf_ulong, pages_pageout, pgpgout,
579 579 tnf_ulong, pages_freed, dfree,
580 580 tnf_ulong, pages_reclaimed, pgrec);
581 581 }
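
For B_ASYNC writes the i/o completes in interrupt or taskq context, so the pvn_write_done() call typically lives in a b_iodone callback along these lines. This is a sketch only: my_iodone is hypothetical, and the flags are assumed to have been stashed somewhere the callback can recover them (hardcoded here).

/*
 * Sketch only: release the page list once an async write lands.
 * bp->b_pages still carries the list handed to pageio_setup().
 */
static int
my_iodone(struct buf *bp)
{
	int flags = B_WRITE | B_ASYNC;	/* assumed saved at issue time */

	if (bp->b_flags & B_ERROR)
		flags |= B_ERROR;	/* pages get re-dirtied, not freed */
	pvn_write_done(bp->b_pages, flags);
	pageio_done(bp);
	return (0);
}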
582 582
583 583 /*
584 584 * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
585 585 * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
586 586 * B_DELWRI indicates that this page is part of a kluster
587 587 * operation and is only to be considered if it doesn't involve any
588 588 * waiting here. B_TRUNC indicates that the file is being truncated
589 589 * and so no i/o needs to be done. B_FORCE indicates that the page
590 590 * must be destroyed so don't try writing it out.
591 591 *
592 592 * The caller must ensure that the page is locked. Returns 1 if
593 593 * the page should be written back (the "iolock" is held in this
594 594 * case), or 0 if the page has been dealt with or has been
595 595 * unlocked.
596 596 */
597 597 int
598 598 pvn_getdirty(page_t *pp, int flags)
599 599 {
600 600 ASSERT((flags & (B_INVAL | B_FREE)) ?
601 601 PAGE_EXCL(pp) : PAGE_SHARED(pp));
602 602 ASSERT(PP_ISFREE(pp) == 0);
603 603
604 604 /*
605 605 * If trying to invalidate or free a logically `locked' page,
606 606 * forget it. Don't need page_struct_lock to check p_lckcnt and
607 607 * p_cowcnt as the page is exclusively locked.
608 608 */
609 609 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
610 610 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
611 611 page_unlock(pp);
612 612 return (0);
613 613 }
614 614
615 615 /*
616 616 * Now acquire the i/o lock so we can add it to the dirty
617 617 * list (if necessary). We avoid blocking on the i/o lock
618 618 * in the following cases:
619 619 *
620 620 * If B_DELWRI is set, which implies that this request is
621 621 * due to a klustering operation.
622 622 *
623 623 * If this is an async (B_ASYNC) operation and we are not doing
624 624 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
625 625 * that the page is written out].
626 626 */
627 627 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
628 628 if (!page_io_trylock(pp)) {
629 629 page_unlock(pp);
630 630 return (0);
631 631 }
632 632 } else {
633 633 page_io_lock(pp);
634 634 }
635 635
636 636 /*
637 637 * If we want to free or invalidate the page then
638 638 * we need to unload it so that anyone who wants
639 639 * it will have to take a minor fault to get it.
640 640 * If we are only invalidating the page for the
641 641 * current process, then pass in a different flag.
642 642 * Otherwise, we're just writing the page back so we
643 643 * need to sync up the hardware and software mod bit to
644 644 * detect any future modifications. We clear the
645 645 * software mod bit when we put the page on the dirty
646 646 * list.
647 647 */
648 648 if (flags & B_INVALCURONLY) {
649 649 (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
650 650 } else if (flags & (B_INVAL | B_FREE)) {
651 651 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
652 652 } else {
653 653 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
654 654 }
655 655
656 656 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
657 657 /*
658 658 * Don't need to add it to the
659 659 * list after all.
660 660 */
661 661 page_io_unlock(pp);
662 662 if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
663 663 /*LINTED: constant in conditional context*/
664 664 VN_DISPOSE(pp, B_INVAL, 0, kcred);
665 665 } else if (flags & B_FREE) {
666 666 /*LINTED: constant in conditional context*/
667 667 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
668 668 } else {
669 669 /*
670 670 * This is the advisory path for the callers
671 671 * of VOP_PUTPAGE() who prefer freeing the
672 672 * page _only_ if no one else is accessing it.
673 673 * E.g. segmap_release()
674 674 * We also take this path for B_INVALCURONLY and
675 675 * let page_release call VN_DISPOSE if no one else is
676 676 * using the page.
677 677 *
678 678 * The above hat_ismod() check is useless because:
679 679 * (1) we may not be holding SE_EXCL lock;
680 680 * (2) we've not unloaded _all_ translations
681 681 *
682 682 * Let page_release() do the heavy-lifting.
683 683 */
684 684 (void) page_release(pp, 1);
685 685 }
686 686 return (0);
687 687 }
688 688
689 689 /*
690 690 * Page is dirty, get it ready for the write back
691 691 * and add page to the dirty list.
692 692 */
693 693 hat_clrrefmod(pp);
694 694
695 695 /*
696 696 * If we're going to free the page when we're done
697 697 * then we can let others try to use it starting now.
698 698 * We'll detect the fact that they used it when the
699 699 * i/o is done and avoid freeing the page.
700 700 */
701 701 if (flags & (B_FREE | B_INVALCURONLY))
702 702 page_downgrade(pp);
703 703
704 704
705 705 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
706 706
707 707 return (1);
708 708 }
709 709
710 710
711 711 /*ARGSUSED*/
712 712 static int
713 713 marker_constructor(void *buf, void *cdrarg, int kmflags)
714 714 {
715 715 page_t *mark = buf;
716 716 bzero(mark, sizeof (page_t));
717 717 mark->p_hash = PVN_VPLIST_HASH_TAG;
718 718 return (0);
719 719 }
720 720
721 721 void
722 722 pvn_init()
723 723 {
724 724 if (pvn_vmodsort_disable == 0)
725 725 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
726 726 marker_cache = kmem_cache_create("marker_cache",
727 727 sizeof (page_t), 0, marker_constructor,
728 728 NULL, NULL, NULL, NULL, 0);
729 729 }
730 730
731 731
732 732 /*
733 733 * Process a vnode's page list for all pages whose offset is >= off.
734 734 * Pages are to either be free'd, invalidated, or written back to disk.
735 735 *
736 736 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
737 737 * is specified, otherwise they are "shared" locked.
738 738 *
739 739 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
740 740 *
741 741 * Special marker page_t's are inserted in the list in order
742 742 * to keep track of where we are in the list when locks are dropped.
743 743 *
744 744 * Note the list is circular and insertions can happen only at the
745 745 * head and tail of the list. The algorithm ensures visiting all pages
746 746 * on the list in the following way:
747 747 *
748 748 * Drop two marker pages at the end of the list.
749 749 *
750 750 * Move one marker page backwards towards the start of the list until
751 751 * it is at the list head, processing the pages passed along the way.
752 752 *
753 753 * Due to race conditions when the vphm mutex is dropped, additional pages
754 754 * can be added to either end of the list, so we'll continue to move
755 755 * the marker and process pages until it is up against the end marker.
756 756 *
757 757 * There is one special exit condition. If we are processing a VMODSORT
758 758 * vnode and only writing back modified pages, we can stop as soon as
759 759 * we run into an unmodified page. This makes fsync(3) operations fast.
760 760 */
761 761 int
762 762 pvn_vplist_dirty(
763 763 vnode_t *vp,
764 764 u_offset_t off,
765 765 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
766 766 size_t *, int, cred_t *),
767 767 int flags,
768 768 cred_t *cred)
769 769 {
770 770 page_t *pp;
771 771 page_t *mark; /* marker page that moves toward head */
772 772 page_t *end; /* marker page at end of list */
773 773 int err = 0;
774 774 int error;
775 775 kmutex_t *vphm;
776 776 se_t se;
777 777 page_t **where_to_move;
778 778
779 779 ASSERT(vp->v_type != VCHR);
780 780
781 781 if (vp->v_pages == NULL)
782 782 return (0);
783 783
784 784
785 785 /*
786 786 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
787 787 *
788 788 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
789 789 * from getting blocked while flushing pages to a dead NFS server.
790 790 */
791 791 mutex_enter(&vp->v_lock);
792 792 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
793 793 mutex_exit(&vp->v_lock);
794 794 return (EAGAIN);
795 795 }
796 796
797 797 while (vp->v_flag & VVMLOCK)
798 798 cv_wait(&vp->v_cv, &vp->v_lock);
799 799
800 800 if (vp->v_pages == NULL) {
801 801 mutex_exit(&vp->v_lock);
802 802 return (0);
803 803 }
804 804
805 805 vp->v_flag |= VVMLOCK;
806 806 mutex_exit(&vp->v_lock);
807 807
808 808
809 809 /*
810 810 * Set up the marker pages used to walk the list
811 811 */
812 812 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
813 813 end->p_vnode = vp;
814 814 end->p_offset = (u_offset_t)-2;
815 815 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
816 816 mark->p_vnode = vp;
817 817 mark->p_offset = (u_offset_t)-1;
818 818
819 819 /*
820 820 * Grab the lock protecting the vnode's page list
821 821 * note that this lock is dropped at times in the loop.
822 822 */
823 823 vphm = page_vnode_mutex(vp);
824 824 mutex_enter(vphm);
825 825 if (vp->v_pages == NULL)
826 826 goto leave;
827 827
828 828 /*
829 829 * insert the markers and loop through the list of pages
830 830 */
831 831 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
832 832 page_vpadd(&mark->p_vpnext, end);
833 833 for (;;) {
834 834
835 835 /*
836 836 * If only doing an async write back, then we can
837 837 * stop as soon as we get to start of the list.
838 838 */
839 839 if (flags == B_ASYNC && vp->v_pages == mark)
840 840 break;
841 841
842 842 /*
843 843 * otherwise stop when we've gone through all the pages
844 844 */
845 845 if (mark->p_vpprev == end)
846 846 break;
847 847
848 848 pp = mark->p_vpprev;
849 849 if (vp->v_pages == pp)
850 850 where_to_move = &vp->v_pages;
851 851 else
852 852 where_to_move = &pp->p_vpprev->p_vpnext;
853 853
854 854 ASSERT(pp->p_vnode == vp);
855 855
856 856 /*
857 857 * If just flushing dirty pages to disk and this vnode
858 858 * is using a sorted list of pages, we can stop processing
859 859 * as soon as we find an unmodified page, since all the
860 860 * modified pages are visited first.
861 861 */
862 862 if (IS_VMODSORT(vp) &&
863 863 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
864 864 if (!hat_ismod(pp) && !page_io_locked(pp)) {
865 865 #ifdef DEBUG
866 866 /*
867 867 * For debug kernels examine what should be
868 868 * all the remaining clean pages, asserting
869 869 * that they are not modified.
870 870 */
871 871 page_t *chk = pp;
872 872 int attr;
873 873
874 874 page_vpsub(&vp->v_pages, mark);
875 875 page_vpadd(where_to_move, mark);
876 876 do {
877 877 chk = chk->p_vpprev;
878 878 ASSERT(chk != end);
879 879 if (chk == mark)
880 880 continue;
881 881 attr = hat_page_getattr(chk, P_MOD |
882 882 P_REF);
883 883 if ((attr & P_MOD) == 0)
884 884 continue;
885 885 panic("v_pages list not all clean: "
886 886 "page_t*=%p vnode=%p off=%lx "
887 887 "attr=0x%x last clean page_t*=%p\n",
888 888 (void *)chk, (void *)chk->p_vnode,
889 889 (long)chk->p_offset, attr,
890 890 (void *)pp);
891 891 } while (chk != vp->v_pages);
892 892 #endif
893 893 break;
894 894 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
895 895 /*
896 896 * Couldn't get io lock, wait until IO is done.
897 897 * Block only for sync IO since we don't want
898 898 * to block async IO.
899 899 */
900 900 mutex_exit(vphm);
901 901 page_io_wait(pp);
902 902 mutex_enter(vphm);
903 903 continue;
904 904 }
905 905 }
906 906
907 907 /*
908 908 * Skip this page if the offset is out of the desired range.
909 909 * Just move the marker and continue.
910 910 */
911 911 if (pp->p_offset < off) {
912 912 page_vpsub(&vp->v_pages, mark);
913 913 page_vpadd(where_to_move, mark);
914 914 continue;
915 915 }
916 916
917 917 /*
918 918 * If we are supposed to invalidate or free this
919 919 * page, then we need an exclusive lock.
920 920 */
921 921 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
922 922
923 923 /*
924 924 * We must acquire the page lock for all synchronous
925 925 * operations (invalidate, free and write).
926 926 */
927 927 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
928 928 /*
929 929 * If the page_lock() drops the mutex
930 930 * we must retry the loop.
931 931 */
932 932 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
933 933 continue;
934 934
935 935 /*
936 936 * It's ok to move the marker page now.
937 937 */
938 938 page_vpsub(&vp->v_pages, mark);
939 939 page_vpadd(where_to_move, mark);
940 940 } else {
941 941
942 942 /*
943 943 * update the marker page for all remaining cases
944 944 */
945 945 page_vpsub(&vp->v_pages, mark);
946 946 page_vpadd(where_to_move, mark);
947 947
948 948 /*
949 949 * For write backs, if we can't lock the page, it's
950 950 * invalid or in the process of being destroyed. Skip
951 951 * it, assuming someone else is writing it.
952 952 */
953 953 if (!page_trylock(pp, se))
954 954 continue;
955 955 }
956 956
957 957 ASSERT(pp->p_vnode == vp);
958 958
959 959 /*
960 960 * Successfully locked the page, now figure out what to
961 961 * do with it. Free pages are easily dealt with: invalidate
962 962 * if desired, or just go on to the next page.
963 963 */
964 964 if (PP_ISFREE(pp)) {
965 965 if ((flags & B_INVAL) == 0) {
966 966 page_unlock(pp);
967 967 continue;
968 968 }
969 969
970 970 /*
971 971 * Invalidate (destroy) the page.
972 972 */
973 973 mutex_exit(vphm);
974 974 page_destroy_free(pp);
975 975 mutex_enter(vphm);
976 976 continue;
977 977 }
978 978
979 979 /*
980 980 * pvn_getdirty() figures out what to do with a dirty page.
981 981 * If the page is dirty, the putapage() routine will write it
982 982 * and will kluster any other adjacent dirty pages it can.
983 983 *
984 984 * pvn_getdirty() and `(*putapage)' unlock the page.
985 985 */
986 986 mutex_exit(vphm);
987 987 if (pvn_getdirty(pp, flags)) {
988 988 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
989 989 if (!err)
990 990 err = error;
991 991 }
992 992 mutex_enter(vphm);
993 993 }
994 994 page_vpsub(&vp->v_pages, mark);
995 995 page_vpsub(&vp->v_pages, end);
996 996
997 997 leave:
998 998 /*
999 999 * Release the v_pages mutex and VVMLOCK, and wake up blocked threads
1000 1000 */
1001 1001 mutex_exit(vphm);
1002 1002 kmem_cache_free(marker_cache, mark);
1003 1003 kmem_cache_free(marker_cache, end);
1004 1004 mutex_enter(&vp->v_lock);
1005 1005 vp->v_flag &= ~VVMLOCK;
1006 1006 cv_broadcast(&vp->v_cv);
1007 1007 mutex_exit(&vp->v_lock);
1008 1008 return (err);
1009 1009 }
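
The two-marker walk described in the block comment above is easy to see in isolation. The following userland-only demo (illustrative, not kernel code) replays the idea on a plain circular doubly-linked list: the end marker parks at the tail while mark migrates backwards toward the head, and the node in front of mark is processed at each step, so nodes added at either end while locks are dropped in the real code would still be visited.

#include <stdio.h>

typedef struct node {
	struct node *next, *prev;
	int val;			/* -1 flags a marker node */
} node_t;

static void
insert_before(node_t *pos, node_t *n)
{
	n->next = pos;
	n->prev = pos->prev;
	pos->prev->next = n;
	pos->prev = n;
}

static void
unlink_node(node_t *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	node_t nodes[5], mark, end, *head;
	int i;

	/* Build a circular list with values 0..4. */
	head = &nodes[0];
	head->next = head->prev = head;
	head->val = 0;
	for (i = 1; i < 5; i++) {
		nodes[i].val = i;
		insert_before(head, &nodes[i]);	/* append at the tail */
	}

	/* Park both markers at the tail: ..., 4, mark, end, (head). */
	mark.val = end.val = -1;
	insert_before(head, &mark);
	insert_before(head, &end);

	/*
	 * Process the node in front of "mark", then hop "mark" over it,
	 * until "end" sits directly in front of "mark" -- i.e. the walk
	 * has wrapped past the head.
	 */
	while (mark.prev != &end) {
		node_t *pp = mark.prev;
		printf("processing %d\n", pp->val);	/* 4, 3, 2, 1, 0 */
		unlink_node(&mark);
		insert_before(pp, &mark);
	}
	return (0);
}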
1010 1010
1011 1011 /*
1012 1012 * Walk the vp->v_pages list, for every page call the callback function
1013 1013 * pointed to by *page_check. If page_check returns non-zero, then mark the
1014 1014 * page as modified and if VMODSORT is set, move it to the end of v_pages
1015 1015 * list. Moving makes sense only if we have at least two pages - this also
1016 1016 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1017 1017 * if there was just one page.
1018 1018 */
1019 1019 void
1020 1020 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1021 1021 {
1022 1022 page_t *pp, *next, *end;
1023 1023 kmutex_t *vphm;
1024 1024 int shuffle;
1025 1025
1026 1026 vphm = page_vnode_mutex(vp);
1027 1027 mutex_enter(vphm);
1028 1028
1029 1029 if (vp->v_pages == NULL) {
1030 1030 mutex_exit(vphm);
1031 1031 return;
1032 1032 }
1033 1033
1034 1034 end = vp->v_pages->p_vpprev;
1035 1035 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1036 1036 pp = vp->v_pages;
1037 1037
1038 1038 for (;;) {
1039 1039 next = pp->p_vpnext;
1040 1040 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1041 1041 /*
1042 1042 * hat_setmod_only() in contrast to hat_setmod() does
1043 1043 * not shuffle the pages and does not grab the mutex
1044 1044 * page_vnode_mutex. Exactly what we need.
1045 1045 */
1046 1046 hat_setmod_only(pp);
1047 1047 if (shuffle) {
1048 1048 page_vpsub(&vp->v_pages, pp);
1049 1049 ASSERT(vp->v_pages != NULL);
1050 1050 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1051 1051 pp);
1052 1052 }
1053 1053 }
1054 1054 /* Stop if we have just processed the last page. */
1055 1055 if (pp == end)
1056 1056 break;
1057 1057 pp = next;
1058 1058 }
1059 1059
1060 1060 mutex_exit(vphm);
1061 1061 }
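
A usage sketch for the callback contract (hypothetical names): the predicate runs with the page_vnode_mutex held, so it should be cheap and must not block.

/*
 * Sketch only: mark every resident page of `vp' modified.  A real
 * predicate would test per-page state instead of returning 1.
 */
static int
my_page_check(page_t *pp)
{
	return (1);		/* treat every page as dirty */
}

static void
my_dirty_whole_vnode(vnode_t *vp)
{
	pvn_vplist_setdirty(vp, my_page_check);
}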
1062 1062
1063 1063 /*
1064 1064 * Zero out zbytes worth of data. Caller should be aware that this
1065 1065 * routine may enter back into the fs layer (xxx_getpage). Locks
1066 1066 * that the xxx_getpage routine may need should not be held while
1067 1067 * calling this.
1068 1068 */
1069 1069 void
1070 1070 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1071 1071 {
1072 1072 caddr_t addr;
1073 1073
1074 1074 ASSERT(vp->v_type != VCHR);
1075 1075
1076 1076 if (vp->v_pages == NULL)
1077 1077 return;
1078 1078
1079 1079 /*
1080 1080 * zbytes may be zero but there still may be some portion of
1081 1081 * a page which needs clearing (since zbytes is a function
1082 1082 * of filesystem block size, not pagesize.)
1083 1083 */
1084 1084 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1085 1085 return;
1086 1086
1087 1087 /*
1088 1088 * We get the last page and handle the partial
1089 1089 * zeroing via kernel mappings. This will make the page
1090 1090 * dirty so that we know that when this page is written
1091 1091 * back, the zeroed information will go out with it. If
1092 1092 * the page is not currently in memory, then the kzero
1093 1093 * operation will cause it to be brought in. We use kzero
1094 1094 * instead of bzero so that if the page cannot be read in
1095 1095 * for any reason, the system will not panic. We need
1096 1096 * to zero out a minimum of the fs-given zbytes, but we
1097 1097 * might also have to do more to get the entire last page.
1098 1098 */
1099 1099
1100 1100 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1101 1101 panic("pvn_vptrunc zbytes");
1102 1102 addr = segmap_getmapflt(segkmap, vp, vplen,
1103 1103 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1104 1104 (void) kzero(addr + (vplen & MAXBOFFSET),
1105 1105 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1106 1106 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1107 1107 }
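
The MAX() terms are the heart of it: zero at least the fs-given zbytes, but never less than the tail of the page containing vplen. A standalone sketch of that computation, with illustrative numbers assuming 4K pages and an 8K file system block:

#include <stdio.h>

#define MY_PAGESIZE	4096UL
#define MY_PAGEOFFSET	(MY_PAGESIZE - 1)

/* Bytes pvn_vpzero() would pass to kzero() starting at `vplen'. */
static unsigned long
zero_len(unsigned long vplen, unsigned long zbytes)
{
	unsigned long page_tail = MY_PAGESIZE - (vplen & MY_PAGEOFFSET);

	return (zbytes > page_tail ? zbytes : page_tail);
}

int
main(void)
{
	/* File ends at 9000; the 8K block ends at 16384, so zbytes = 7384. */
	printf("%lu\n", zero_len(9000, 7384));	/* 7384: whole block tail */
	/* zbytes == 0, but the page tail 9000..12287 still needs clearing. */
	printf("%lu\n", zero_len(9000, 0));	/* 3288 */
	return (0);
}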
1108 1108
1109 1109 /*
1110 1110 * Handles common work of the VOP_GETPAGE routines by iterating page by page
1111 1111 * calling the getpage helper for each.
1112 1112 */
1113 1113 int
1114 1114 pvn_getpages(
1115 1115 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1116 1116 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1117 1117 struct vnode *vp,
1118 1118 u_offset_t off,
1119 1119 size_t len,
1120 1120 uint_t *protp,
1121 1121 page_t *pl[],
1122 1122 size_t plsz,
1123 1123 struct seg *seg,
1124 1124 caddr_t addr,
1125 1125 enum seg_rw rw,
1126 1126 struct cred *cred)
1127 1127 {
1128 1128 page_t **ppp;
1129 1129 u_offset_t o, eoff;
1130 1130 size_t sz, xlen;
1131 1131 int err;
1132 1132
1133 1133 /* ensure that we have enough space */
1134 1134 ASSERT(pl == NULL || plsz >= len);
1135 1135
1136 1136 /*
1137 1137 * Loop one page at a time and let getapage function fill
1138 1138 * in the next page in array. We only allow one page to be
1139 1139 * returned at a time (except for the last page) so that we
1140 1140 * don't have any problems with duplicates and other such
1141 1141 * painful problems. This is a very simple-minded algorithm,
1142 1142 * but it does the job correctly. We hope that the cost of a
1143 1143 * getapage call for a resident page that we might have been
1144 1144 * able to get from an earlier call doesn't cost too much.
1145 1145 */
1146 1146 ppp = pl;
1147 1147 sz = (pl != NULL) ? PAGESIZE : 0;
1148 1148 eoff = off + len;
1149 1149 xlen = len;
1150 1150 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1151 1151 xlen -= PAGESIZE) {
1152 1152 if (o + PAGESIZE >= eoff && pl != NULL) {
1153 1153 /*
1154 1154 * Last time through - allow all of
1155 1155 * what's left of the pl[] array to be used.
1156 1156 */
1157 1157 sz = plsz - (o - off);
1158 1158 }
1159 1159 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1160 1160 rw, cred);
1161 1161 if (err) {
1162 1162 /*
1163 1163 * Release any pages we already got.
1164 1164 */
1165 1165 if (o > off && pl != NULL) {
1166 1166 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1167 1167 (void) page_release(*ppp, 1);
1168 1168 }
1169 1169 break;
1170 1170 }
1171 1171 if (pl != NULL)
1172 1172 ppp++;
1173 1173 }
1174 1174 return (err);
1175 1175 }
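
Tying it together, here is the usual shape of a VOP_GETPAGE entry point that delegates the iteration to pvn_getpages(), using a page-at-a-time helper like the my_getapage sketched after pvn_read_kluster() (names hypothetical; ufs_getpage() and ufs_getapage() are the classic real pairing).

/*
 * Sketch only: the VOP_GETPAGE entry hands the iteration to
 * pvn_getpages(), which calls my_getapage() once per page.
 */
static int
my_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	if (protp != NULL)
		*protp = PROT_ALL;	/* fs-specific protections elided */

	return (pvn_getpages(my_getapage, vp, (u_offset_t)off, len,
	    protp, pl, plsz, seg, addr, rw, cr));
}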
1176 1176
1177 1177 /*
1178 1178 * Initialize the page list array.
1179 1179 */
1180 1180 /*ARGSUSED*/
1181 1181 void
1182 1182 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1183 1183 u_offset_t off, size_t io_len, enum seg_rw rw)
1184 1184 {
1185 1185 ssize_t sz;
1186 1186 page_t *ppcur, **ppp;
1187 1187
1188 1188 /*
1189 1189 * Set up to load plsz worth
1190 1190 * starting at the needed page.
1191 1191 */
1192 1192 while (pp != NULL && pp->p_offset != off) {
1193 1193 /*
1194 1194 * Remove page from the i/o list,
1195 1195 * release the i/o and the page lock.
1196 1196 */
1197 1197 ppcur = pp;
1198 1198 page_sub(&pp, ppcur);
1199 1199 page_io_unlock(ppcur);
1200 1200 (void) page_release(ppcur, 1);
1201 1201 }
1202 1202
1203 1203 if (pp == NULL) {
1204 1204 pl[0] = NULL;
1205 1205 return;
1206 1206 }
1207 1207
1208 1208 sz = plsz;
1209 1209
1210 1210 /*
1211 1211 * Initialize the page list array.
1212 1212 */
1213 1213 ppp = pl;
1214 1214 do {
1215 1215 ppcur = pp;
1216 1216 *ppp++ = ppcur;
1217 1217 page_sub(&pp, ppcur);
1218 1218 page_io_unlock(ppcur);
1219 1219 if (rw != S_CREATE)
1220 1220 page_downgrade(ppcur);
1221 1221 sz -= PAGESIZE;
1222 1222 } while (sz > 0 && pp != NULL);
1223 1223 *ppp = NULL; /* terminate list */
1224 1224
1225 1225 /*
1226 1226 * Now free the remaining pages that weren't
1227 1227 * loaded in the page list.
1228 1228 */
1229 1229 while (pp != NULL) {
1230 1230 ppcur = pp;
1231 1231 page_sub(&pp, ppcur);
1232 1232 page_io_unlock(ppcur);
1233 1233 (void) page_release(ppcur, 1);
1234 1234 }
1235 1235 }