Print this page
re #13613 rb4516 Tunables needs volatile keyword
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/os/vm_pageout.c
+++ new/usr/src/uts/common/os/vm_pageout.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 +/*
26 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 + */
25 28
26 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 30 /* All Rights Reserved */
28 31
29 32 /*
30 33 * University Copyright- Copyright (c) 1982, 1986, 1988
31 34 * The Regents of the University of California
32 35 * All Rights Reserved
33 36 *
34 37 * University Acknowledgment- Portions of this document are derived from
35 38 * software developed by the University of California, Berkeley, and its
36 39 * contributors.
37 40 */
38 41
39 42 #include <sys/types.h>
40 43 #include <sys/t_lock.h>
41 44 #include <sys/param.h>
42 45 #include <sys/buf.h>
43 46 #include <sys/uio.h>
44 47 #include <sys/proc.h>
45 48 #include <sys/systm.h>
46 49 #include <sys/mman.h>
47 50 #include <sys/cred.h>
48 51 #include <sys/vnode.h>
49 52 #include <sys/vm.h>
50 53 #include <sys/vmparam.h>
51 54 #include <sys/vtrace.h>
52 55 #include <sys/cmn_err.h>
53 56 #include <sys/cpuvar.h>
54 57 #include <sys/user.h>
55 58 #include <sys/kmem.h>
56 59 #include <sys/debug.h>
57 60 #include <sys/callb.h>
58 61 #include <sys/tnf_probe.h>
59 62 #include <sys/mem_cage.h>
60 63 #include <sys/time.h>
61 64
62 65 #include <vm/hat.h>
63 66 #include <vm/as.h>
64 67 #include <vm/seg.h>
65 68 #include <vm/page.h>
66 69 #include <vm/pvn.h>
67 70 #include <vm/seg_kmem.h>
|
↓ open down ↓ |
33 lines elided |
↑ open up ↑ |
68 71
69 72 static int checkpage(page_t *, int);
70 73
71 74 /*
72 75 * The following parameters control operation of the page replacement
73 76 * algorithm. They are initialized to 0, and then computed at boot time
74 77 * based on the size of the system. If they are patched non-zero in
75 78 * a loaded vmunix they are left alone and may thus be changed per system
76 79 * using adb on the loaded system.
77 80 */
78 -pgcnt_t slowscan = 0;
79 -pgcnt_t fastscan = 0;
81 +volatile pgcnt_t slowscan = 0;
82 +volatile pgcnt_t fastscan = 0;
80 83
81 -static pgcnt_t handspreadpages = 0;
84 +volatile pgcnt_t handspreadpages = 0;
82 85 static int loopfraction = 2;
83 86 static pgcnt_t looppages;
84 -static int min_percent_cpu = 4;
87 +volatile int min_percent_cpu = 4;
85 88 static int max_percent_cpu = 80;
86 89 static pgcnt_t maxfastscan = 0;
87 90 static pgcnt_t maxslowscan = 100;
88 91
89 -pgcnt_t maxpgio = 0;
90 -pgcnt_t minfree = 0;
91 -pgcnt_t desfree = 0;
92 -pgcnt_t lotsfree = 0;
92 +volatile pgcnt_t maxpgio = 0;
93 +volatile pgcnt_t minfree = 0;
94 +volatile pgcnt_t desfree = 0;
95 +volatile pgcnt_t lotsfree = 0;
93 96 pgcnt_t needfree = 0;
94 -pgcnt_t throttlefree = 0;
95 -pgcnt_t pageout_reserve = 0;
97 +volatile pgcnt_t throttlefree = 0;
98 +volatile pgcnt_t pageout_reserve = 0;
96 99
97 100 pgcnt_t deficit;
98 101 pgcnt_t nscan;
99 102 pgcnt_t desscan;
100 103
101 104 /*
102 105 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
103 106 * are the number of ticks in each wakeup cycle that gives the
104 107 * equivalent of some underlying %CPU duty cycle.
105 108 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
106 109 * awakened every 25 clock ticks. So, converting from %CPU to ticks
107 110 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
108 111 * So, for example, 4% == 1 tick and 80% == 20 ticks.
109 112 *
110 113 * min_pageout_ticks:
111 114 * ticks/wakeup equivalent of min_percent_cpu.
112 115 *
113 116 * max_pageout_ticks:
114 117 * ticks/wakeup equivalent of max_percent_cpu.
115 118 *
116 119 * pageout_ticks:
117 120 * Number of clock ticks budgeted for each wakeup cycle.
118 121 * Computed each time around by schedpaging().
119 122 * Varies between min_pageout_ticks .. max_pageout_ticks,
120 123 * depending on memory pressure.
121 124 *
122 125 * pageout_lbolt:
123 126 * Timestamp of the last time pageout_scanner woke up and started
124 127 * (or resumed) scanning for not recently referenced pages.
125 128 */
126 129
127 130 static clock_t min_pageout_ticks;
128 131 static clock_t max_pageout_ticks;
129 132 static clock_t pageout_ticks;
130 133 static clock_t pageout_lbolt;
131 134
132 135 static uint_t reset_hands;
133 136
134 137 #define PAGES_POLL_MASK 1023
135 138
136 139 /*
137 140 * pageout_sample_lim:
138 141 * The limit on the number of samples needed to establish a value
139 142 * for new pageout parameters, fastscan, slowscan, and handspreadpages.
140 143 *
141 144 * pageout_sample_cnt:
142 145 * Current sample number. Once the sample gets large enough,
143 146 * set new values for handspreadpages, fastscan and slowscan.
144 147 *
145 148 * pageout_sample_pages:
146 149 * The accumulated number of pages scanned during sampling.
147 150 *
148 151 * pageout_sample_ticks:
149 152 * The accumulated clock ticks for the sample.
150 153 *
151 154 * pageout_rate:
152 155 * Rate in pages/nanosecond, computed at the end of sampling.
153 156 *
154 157 * pageout_new_spread:
155 158 * The new value to use for fastscan and handspreadpages.
156 159 * Calculated after enough samples have been taken.
157 160 */
158 161
159 162 typedef hrtime_t hrrate_t;
160 163
161 164 static uint64_t pageout_sample_lim = 4;
162 165 static uint64_t pageout_sample_cnt = 0;
163 166 static pgcnt_t pageout_sample_pages = 0;
164 167 static hrrate_t pageout_rate = 0;
165 168 static pgcnt_t pageout_new_spread = 0;
166 169
167 170 static clock_t pageout_cycle_ticks;
168 171 static hrtime_t sample_start, sample_end;
169 172 static hrtime_t pageout_sample_etime = 0;
170 173
171 174 /*
172 175 * Record number of times a pageout_scanner wakeup cycle finished because it
173 176 * timed out (exceeded its CPU budget), rather than because it visited
174 177 * its budgeted number of pages.
175 178 */
176 179 uint64_t pageout_timeouts = 0;
177 180
178 181 #ifdef VM_STATS
179 182 static struct pageoutvmstats_str {
180 183 ulong_t checkpage[3];
181 184 } pageoutvmstats;
182 185 #endif /* VM_STATS */
183 186
184 187 /*
185 188 * Threads waiting for free memory use this condition variable and lock until
186 189 * memory becomes available.
187 190 */
188 191 kmutex_t memavail_lock;
189 192 kcondvar_t memavail_cv;
190 193
191 194 /*
192 195 * The size of the clock loop.
193 196 */
194 197 #define LOOPPAGES total_pages
195 198
196 199 /*
197 200 * Set up the paging constants for the clock algorithm.
198 201 * Called after the system is initialized and the amount of memory
199 202 * and number of paging devices is known.
200 203 *
201 204 * lotsfree is 1/64 of memory, but at least 512K.
202 205 * desfree is 1/2 of lotsfree.
203 206 * minfree is 1/2 of desfree.
204 207 *
205 208 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
206 209 *
207 210 * lotsfree = btop(512K)
208 211 * desfree = btop(200K)
209 212 * minfree = btop(100K)
210 213 * throttlefree = INT_MIN
211 214 * max_percent_cpu = 4
212 215 */
213 216 void
214 217 setupclock(int recalc)
215 218 {
216 219
217 220 static spgcnt_t init_lfree, init_dfree, init_mfree;
218 221 static spgcnt_t init_tfree, init_preserve, init_mpgio;
219 222 static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
220 223
221 224 looppages = LOOPPAGES;
222 225
223 226 /*
224 227 * setupclock can now be called to recalculate the paging
225 228 * parameters in the case of dynamic addition of memory.
226 229 * So to make sure we make the proper calculations, if such a
227 230 * situation should arise, we save away the initial values
228 231 * of each parameter so we can recall them when needed. This
229 232 * way we don't lose the settings an admin might have made
230 233 * through the /etc/system file.
231 234 */
232 235
233 236 if (!recalc) {
234 237 init_lfree = lotsfree;
235 238 init_dfree = desfree;
236 239 init_mfree = minfree;
237 240 init_tfree = throttlefree;
238 241 init_preserve = pageout_reserve;
239 242 init_mpgio = maxpgio;
240 243 init_mfscan = maxfastscan;
241 244 init_fscan = fastscan;
242 245 init_sscan = slowscan;
243 246 init_hspages = handspreadpages;
244 247 }
245 248
246 249 /*
247 250 * Set up thresholds for paging:
248 251 */
249 252
250 253 /*
251 254 * Lotsfree is threshold where paging daemon turns on.
252 255 */
253 256 if (init_lfree == 0 || init_lfree >= looppages)
254 257 lotsfree = MAX(looppages / 64, btop(512 * 1024));
255 258 else
256 259 lotsfree = init_lfree;
257 260
258 261 /*
259 262 * Desfree is amount of memory desired free.
260 263 * If less than this for extended period, start swapping.
261 264 */
262 265 if (init_dfree == 0 || init_dfree >= lotsfree)
263 266 desfree = lotsfree / 2;
264 267 else
265 268 desfree = init_dfree;
266 269
267 270 /*
268 271 * Minfree is minimal amount of free memory which is tolerable.
269 272 */
270 273 if (init_mfree == 0 || init_mfree >= desfree)
271 274 minfree = desfree / 2;
272 275 else
273 276 minfree = init_mfree;
274 277
275 278 /*
276 279 * Throttlefree is the point at which we start throttling
277 280 * PG_WAIT requests until enough memory becomes available.
278 281 */
279 282 if (init_tfree == 0 || init_tfree >= desfree)
280 283 throttlefree = minfree;
281 284 else
282 285 throttlefree = init_tfree;
283 286
284 287 /*
285 288 * Pageout_reserve is the number of pages that we keep in
286 289 * stock for pageout's own use. Having a few such pages
287 290 * provides insurance against system deadlock due to
288 291 * pageout needing pages. When freemem < pageout_reserve,
289 292 * non-blocking allocations are denied to any threads
290 293 * other than pageout and sched. (At some point we might
291 294 * want to consider a per-thread flag like T_PUSHING_PAGES
292 295 * to indicate that a thread is part of the page-pushing
293 296 * dance (e.g. an interrupt thread) and thus is entitled
294 297 * to the same special dispensation we accord pageout.)
295 298 */
296 299 if (init_preserve == 0 || init_preserve >= throttlefree)
297 300 pageout_reserve = throttlefree / 2;
298 301 else
299 302 pageout_reserve = init_preserve;
300 303
301 304 /*
302 305 * Maxpgio thresholds how much paging is acceptable.
303 306 * This figures that 2/3 busy on an arm is all that is
304 307 * tolerable for paging. We assume one operation per disk rev.
305 308 *
306 309 * XXX - Does not account for multiple swap devices.
307 310 */
308 311 if (init_mpgio == 0)
309 312 maxpgio = (DISKRPM * 2) / 3;
310 313 else
311 314 maxpgio = init_mpgio;
312 315
313 316 /*
314 317 * The clock scan rate varies between fastscan and slowscan
315 318 * based on the amount of free memory available. Fastscan
316 319 * rate should be set based on the number of pages that can be
317 320 * scanned per sec using ~10% of processor time. Since this
318 321 * value depends on the processor, MMU, Mhz etc., it is
319 322 * difficult to determine it in a generic manner for all
320 323 * architectures.
321 324 *
322 325 * Instead of trying to determine the number of pages scanned
323 326 * per sec for every processor, fastscan is set to be the smaller
324 327 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
325 328 * time is limited to ~4% of processor time.
326 329 *
327 330 * Setting fastscan to be 1/2 of memory allows pageout to scan
328 331 * all of memory in ~2 secs. This implies that user pages not
329 332 * accessed within 1 sec (assuming, handspreadpages == fastscan)
330 333 * can be reclaimed when free memory is very low. Stealing pages
331 334 * not accessed within 1 sec seems reasonable and ensures that
332 335 * active user processes don't thrash.
333 336 *
334 337 * Smaller values of fastscan result in scanning fewer pages
335 338 * every second and consequently pageout may not be able to free
336 339 * sufficient memory to maintain the minimum threshold. Larger
337 340 * values of fastscan result in scanning a lot more pages which
338 341 * could lead to thrashing and higher CPU usage.
339 342 *
340 343 * Fastscan needs to be limited to a maximum value and should not
341 344 * scale with memory to prevent pageout from consuming too much
342 345 * time for scanning on slow CPU's and avoid thrashing, as a
343 346 * result of scanning too many pages, on faster CPU's.
344 347 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
345 348 * (the upper bound for fastscan) based on the average number
346 349 * of pages that can potentially be scanned in ~1 sec (using ~4%
347 350 * of the CPU) on some of the following machines that currently
348 351 * run Solaris 2.x:
349 352 *
350 353 * average memory scanned in ~1 sec
351 354 *
352 355 * 25 Mhz SS1+: 23 Meg
353 356 * LX: 37 Meg
354 357 * 50 Mhz SC2000: 68 Meg
355 358 *
356 359 * 40 Mhz 486: 26 Meg
357 360 * 66 Mhz 486: 42 Meg
358 361 *
359 362 * When free memory falls just below lotsfree, the scan rate
360 363 * goes from 0 to slowscan (i.e., pageout starts running). This
361 364 * transition needs to be smooth and is achieved by ensuring that
362 365 * pageout scans a small number of pages to satisfy the transient
363 366 * memory demand. This is set to not exceed 100 pages/sec (25 per
364 367 * wakeup) since scanning that many pages has no noticeable impact
365 368 * on system performance.
366 369 *
367 370 * In addition to setting fastscan and slowscan, pageout is
368 371 * limited to using ~4% of the CPU. This results in increasing
369 372 * the time taken to scan all of memory, which in turn means that
370 373 * user processes have a better opportunity of preventing their
371 374 * pages from being stolen. This has a positive effect on
372 375 * interactive and overall system performance when memory demand
373 376 * is high.
374 377 *
375 378 * Thus, the rate at which pages are scanned for replacement will
376 379 * vary linearly between slowscan and the number of pages that
377 380 * can be scanned using ~4% of processor time instead of varying
378 381 * linearly between slowscan and fastscan.
379 382 *
380 383 * Also, the processor time used by pageout will vary from ~1%
381 384 * at slowscan to ~4% at fastscan instead of varying between
382 385 * ~1% at slowscan and ~10% at fastscan.
383 386 *
384 387 * The values chosen for the various VM parameters (fastscan,
385 388 * handspreadpages, etc) are not universally true for all machines,
386 389 * but appear to be a good rule of thumb for the machines we've
387 390 * tested. They have the following ranges:
388 391 *
389 392 * cpu speed: 20 to 70 Mhz
390 393 * page size: 4K to 8K
391 394 * memory size: 16M to 5G
392 395 * page scan rate: 4000 - 17400 4K pages per sec
393 396 *
394 397 * The values need to be re-examined for machines which don't
395 398 * fall into the various ranges (e.g., slower or faster CPUs,
396 399 * smaller or larger pagesizes etc) shown above.
397 400 *
398 401 * On an MP machine, pageout is often unable to maintain the
399 402 * minimum paging thresholds under heavy load. This is due to
400 403 * the fact that user processes running on other CPU's can be
401 404 * dirtying memory at a much faster pace than pageout can find
402 405 * pages to free. The memory demands could be met by enabling
403 406 * more than one CPU to run the clock algorithm in such a manner
404 407 * that the various clock hands don't overlap. This also makes
405 408 * it more difficult to determine the values for fastscan, slowscan
406 409 * and handspreadpages.
407 410 *
408 411 * The swapper is currently used to free up memory when pageout
409 412 * is unable to meet memory demands by swapping out processes.
410 413 * In addition to freeing up memory, swapping also reduces the
411 414 * demand for memory by preventing user processes from running
412 415 * and thereby consuming memory.
413 416 */
414 417 if (init_mfscan == 0) {
415 418 if (pageout_new_spread != 0)
416 419 maxfastscan = pageout_new_spread;
417 420 else
418 421 maxfastscan = MAXHANDSPREADPAGES;
419 422 } else {
420 423 maxfastscan = init_mfscan;
421 424 }
422 425 if (init_fscan == 0)
423 426 fastscan = MIN(looppages / loopfraction, maxfastscan);
424 427 else
425 428 fastscan = init_fscan;
426 429 if (fastscan > looppages / loopfraction)
427 430 fastscan = looppages / loopfraction;
428 431
429 432 /*
430 433 * Set slow scan time to 1/10 the fast scan time, but
431 434 * not to exceed maxslowscan.
432 435 */
433 436 if (init_sscan == 0)
434 437 slowscan = MIN(fastscan / 10, maxslowscan);
435 438 else
436 439 slowscan = init_sscan;
437 440 if (slowscan > fastscan / 2)
438 441 slowscan = fastscan / 2;
439 442
440 443 /*
441 444 * Handspreadpages is distance (in pages) between front and back
442 445 * pageout daemon hands. The amount of time to reclaim a page
443 446 * once pageout examines it increases with this distance and
444 447 * decreases as the scan rate rises. It must be < the amount
445 448 * of pageable memory.
446 449 *
447 450 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
448 451 * to be "fastscan" results in the front hand being a few secs
449 452 * (varies based on the processor speed) ahead of the back hand
450 453 * at fastscan rates. This distance can be further reduced, if
451 454 * necessary, by increasing the processor time used by pageout
452 455 * to be more than ~4% and preferably not more than ~10%.
453 456 *
454 457 * As a result, user processes have a much better chance of
455 458 * referencing their pages before the back hand examines them.
456 459 * This also significantly lowers the number of reclaims from
457 460 * the freelist since pageout does not end up freeing pages which
458 461 * may be referenced a sec later.
459 462 */
460 463 if (init_hspages == 0)
461 464 handspreadpages = fastscan;
462 465 else
463 466 handspreadpages = init_hspages;
464 467
465 468 /*
466 469 * Make sure that back hand follows front hand by at least
467 470 * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible
468 471 * for the back hand to look at a page during the same wakeup of
469 472 * the pageout daemon in which the front hand cleared its ref bit.
470 473 */
471 474 if (handspreadpages >= looppages)
472 475 handspreadpages = looppages - 1;
473 476
474 477 /*
475 478 * If we have been called to recalculate the parameters,
476 479 * set a flag to re-evaluate the clock hand pointers.
477 480 */
478 481 if (recalc)
479 482 reset_hands = 1;
480 483 }
481 484
482 485 /*
483 486 * Pageout scheduling.
484 487 *
485 488 * Schedpaging controls the rate at which the page out daemon runs by
486 489 * setting the global variables nscan and desscan RATETOSCHEDPAGING
487 490 * times a second. Nscan records the number of pages pageout has examined
488 491 * in its current pass; schedpaging resets this value to zero each time
489 492 * it runs. Desscan records the number of pages pageout should examine
490 493 * in its next pass; schedpaging sets this value based on the amount of
491 494 * currently available memory.
492 495 */
493 496
494 497 #define RATETOSCHEDPAGING 4 /* hz that is */
495 498
496 499 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
497 500
498 501 /*
499 502 * Pool of available async pageout putpage requests.
500 503 */
501 504 static struct async_reqs *push_req;
502 505 static struct async_reqs *req_freelist; /* available req structs */
503 506 static struct async_reqs *push_list; /* pending reqs */
504 507 static kmutex_t push_lock; /* protects req pool */
505 508 static kcondvar_t push_cv;
506 509
507 510 static int async_list_size = 256; /* number of async request structs */
508 511
509 512 static void pageout_scanner(void);
510 513
511 514 /*
512 515 * If a page is being shared more than "po_share" times
513 516 * then leave it alone- don't page it out.
514 517 */
515 518 #define MIN_PO_SHARE (8)
516 519 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
517 520 ulong_t po_share = MIN_PO_SHARE;
518 521
519 522 /*
520 523 * Schedule rate for paging.
521 524 * Rate is linear interpolation between
522 525 * slowscan with lotsfree and fastscan when out of memory.
523 526 */
524 527 static void
525 528 schedpaging(void *arg)
526 529 {
527 530 spgcnt_t vavail;
528 531
529 532 if (freemem < lotsfree + needfree + kmem_reapahead)
530 533 kmem_reap();
531 534
532 535 if (freemem < lotsfree + needfree)
533 536 seg_preap();
534 537
535 538 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
536 539 kcage_cageout_wakeup();
537 540
538 541 if (mutex_tryenter(&pageout_mutex)) {
539 542 /* pageout() not running */
540 543 nscan = 0;
541 544 vavail = freemem - deficit;
542 545 if (pageout_new_spread != 0)
543 546 vavail -= needfree;
544 547 if (vavail < 0)
545 548 vavail = 0;
546 549 if (vavail > lotsfree)
547 550 vavail = lotsfree;
548 551
549 552 /*
550 553 * Fix for 1161438 (CRS SPR# 73922). All variables
551 554 * in the original calculation for desscan were 32 bit signed
552 555 * ints. As freemem approaches 0x0 on a system with 1 Gig or
553 556 * more of memory, the calculation can overflow. When this
554 557 * happens, desscan becomes negative and pageout_scanner()
555 558 * stops paging out.
556 559 */
557 560 if ((needfree) && (pageout_new_spread == 0)) {
558 561 /*
559 562 * If we've not yet collected enough samples to
560 563 * calculate a spread, use the old logic of kicking
561 564 * into high gear anytime needfree is non-zero.
562 565 */
563 566 desscan = fastscan / RATETOSCHEDPAGING;
564 567 } else {
565 568 /*
566 569 * Once we've calculated a spread based on system
567 570 * memory and usage, just treat needfree as another
568 571 * form of deficit.
569 572 */
570 573 spgcnt_t faststmp, slowstmp, result;
571 574
572 575 slowstmp = slowscan * vavail;
573 576 faststmp = fastscan * (lotsfree - vavail);
574 577 result = (slowstmp + faststmp) /
575 578 nz(lotsfree) / RATETOSCHEDPAGING;
576 579 desscan = (pgcnt_t)result;
577 580 }
578 581
579 582 pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
580 583 (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
581 584
582 585 if (freemem < lotsfree + needfree ||
583 586 pageout_sample_cnt < pageout_sample_lim) {
584 587 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
585 588 "pageout_cv_signal:freemem %ld", freemem);
586 589 cv_signal(&proc_pageout->p_cv);
587 590 } else {
588 591 /*
589 592 * There are enough free pages, no need to
590 593 * kick the scanner thread. And next time
591 594 * around, keep more of the `highly shared'
592 595 * pages.
593 596 */
594 597 cv_signal_pageout();
595 598 if (po_share > MIN_PO_SHARE) {
596 599 po_share >>= 1;
597 600 }
598 601 }
599 602 mutex_exit(&pageout_mutex);
600 603 }
601 604
602 605 /*
603 606 * Signal threads waiting for available memory.
604 607 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
605 608 * in this case it is not needed - the waiters will be woken up during
606 609 * the next invocation of this function.
607 610 */
608 611 if (kmem_avail() > 0)
609 612 cv_broadcast(&memavail_cv);
610 613
611 614 (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
612 615 }
613 616
614 617 pgcnt_t pushes;
615 618 ulong_t push_list_size; /* # of requests on pageout queue */
616 619
617 620 #define FRONT 1
618 621 #define BACK 2
619 622
620 623 int dopageout = 1; /* must be non-zero to turn page stealing on */
621 624
622 625 /*
623 626 * The page out daemon, which runs as process 2.
624 627 *
625 628 * As long as there are at least lotsfree pages,
626 629 * this process is not run. When the number of free
627 630 * pages stays in the range desfree to lotsfree,
628 631 * this daemon runs through the pages in the loop
629 632 * at a rate determined in schedpaging(). Pageout manages
630 633 * two hands on the clock. The front hand moves through
631 634 * memory, clearing the reference bit,
632 635 * and stealing pages from procs that are over maxrss.
633 636 * The back hand travels a distance behind the front hand,
634 637 * freeing the pages that have not been referenced in the time
635 638 * since the front hand passed. If modified, they are pushed to
636 639 * swap before being freed.
637 640 *
638 641 * There are 2 threads that act on behalf of the pageout process.
639 642 * One thread scans pages (pageout_scanner) and frees them up if
640 643 * they don't require any VOP_PUTPAGE operation. If a page must be
641 644 * written back to its backing store, the request is put on a list
642 645 * and the other (pageout) thread is signaled. The pageout thread
643 646 * grabs VOP_PUTPAGE requests from the list, and processes them.
644 647 * Some filesystems may require resources for the VOP_PUTPAGE
645 648 * operations (like memory) and hence can block the pageout
646 649 * thread, but the scanner thread can still operate. There is still
647 650 * no guarantee that memory deadlocks cannot occur.
648 651 *
649 652 * For now, this thing is in very rough form.
650 653 */
651 654 void
652 655 pageout()
653 656 {
654 657 struct async_reqs *arg;
655 658 pri_t pageout_pri;
656 659 int i;
657 660 pgcnt_t max_pushes;
658 661 callb_cpr_t cprinfo;
659 662
660 663 proc_pageout = ttoproc(curthread);
661 664 proc_pageout->p_cstime = 0;
662 665 proc_pageout->p_stime = 0;
663 666 proc_pageout->p_cutime = 0;
664 667 proc_pageout->p_utime = 0;
665 668 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
666 669 bcopy("pageout", PTOU(curproc)->u_comm, 7);
667 670
668 671 /*
669 672 * Create pageout scanner thread
670 673 */
671 674 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
672 675 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
673 676
674 677 /*
675 678 * Allocate and initialize the async request structures
676 679 * for pageout.
677 680 */
678 681 push_req = (struct async_reqs *)
679 682 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
680 683
681 684 req_freelist = push_req;
682 685 for (i = 0; i < async_list_size - 1; i++)
683 686 push_req[i].a_next = &push_req[i + 1];
684 687
685 688 pageout_pri = curthread->t_pri;
686 689
687 690 /* Create the pageout scanner thread. */
688 691 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
689 692 pageout_pri - 1);
690 693
691 694 /*
692 695 * kick off pageout scheduler.
693 696 */
694 697 schedpaging(NULL);
695 698
696 699 /*
697 700 * Create kernel cage thread.
698 701 * The kernel cage thread is started under the pageout process
699 702 * to take advantage of the less restricted page allocation
700 703 * in page_create_throttle().
701 704 */
702 705 kcage_cageout_init();
703 706
704 707 /*
705 708 * Limit pushes to avoid saturating pageout devices.
706 709 */
707 710 max_pushes = maxpgio / RATETOSCHEDPAGING;
708 711 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
709 712
710 713 for (;;) {
711 714 mutex_enter(&push_lock);
712 715
713 716 while ((arg = push_list) == NULL || pushes > max_pushes) {
714 717 CALLB_CPR_SAFE_BEGIN(&cprinfo);
715 718 cv_wait(&push_cv, &push_lock);
716 719 pushes = 0;
717 720 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
718 721 }
719 722 push_list = arg->a_next;
720 723 arg->a_next = NULL;
721 724 mutex_exit(&push_lock);
722 725
723 726 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
724 727 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
725 728 pushes++;
726 729 }
727 730
728 731 /* vp held by checkpage() */
729 732 VN_RELE(arg->a_vp);
730 733
731 734 mutex_enter(&push_lock);
732 735 arg->a_next = req_freelist; /* back on freelist */
733 736 req_freelist = arg;
734 737 push_list_size--;
735 738 mutex_exit(&push_lock);
736 739 }
737 740 }
738 741
739 742 /*
740 743 * Kernel thread that scans pages looking for ones to free
741 744 */
742 745 static void
743 746 pageout_scanner(void)
744 747 {
745 748 struct page *fronthand, *backhand;
746 749 uint_t count;
747 750 callb_cpr_t cprinfo;
748 751 pgcnt_t nscan_limit;
749 752 pgcnt_t pcount;
750 753
751 754 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
752 755 mutex_enter(&pageout_mutex);
753 756
754 757 /*
755 758 * The restart case does not attempt to point the hands at roughly
756 759 * the right point on the assumption that after one circuit things
757 760 * will have settled down - and restarts shouldn't be that often.
758 761 */
759 762
760 763 /*
761 764 * Set the two clock hands to be separated by a reasonable amount,
762 765 * but no more than 360 degrees apart.
763 766 */
764 767 backhand = page_first();
765 768 if (handspreadpages >= total_pages)
766 769 fronthand = page_nextn(backhand, total_pages - 1);
767 770 else
768 771 fronthand = page_nextn(backhand, handspreadpages);
769 772
770 773 min_pageout_ticks = MAX(1,
771 774 ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
772 775 max_pageout_ticks = MAX(min_pageout_ticks,
773 776 ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
774 777
775 778 loop:
776 779 cv_signal_pageout();
777 780
778 781 CALLB_CPR_SAFE_BEGIN(&cprinfo);
779 782 cv_wait(&proc_pageout->p_cv, &pageout_mutex);
780 783 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
781 784
782 785 if (!dopageout)
783 786 goto loop;
784 787
785 788 if (reset_hands) {
786 789 reset_hands = 0;
787 790
788 791 backhand = page_first();
789 792 if (handspreadpages >= total_pages)
790 793 fronthand = page_nextn(backhand, total_pages - 1);
791 794 else
792 795 fronthand = page_nextn(backhand, handspreadpages);
793 796 }
794 797
795 798 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
796 799 count = 0;
797 800
798 801 TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
799 802 "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
800 803 freemem, lotsfree, nscan, desscan);
801 804
802 805 /* Kernel probe */
803 806 TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
804 807 tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
805 808
806 809 pcount = 0;
807 810 if (pageout_sample_cnt < pageout_sample_lim) {
808 811 nscan_limit = total_pages;
809 812 } else {
810 813 nscan_limit = desscan;
811 814 }
812 815 pageout_lbolt = ddi_get_lbolt();
813 816 sample_start = gethrtime();
814 817
815 818 /*
816 819 * Scan the appropriate number of pages for a single duty cycle.
817 820 * However, stop scanning as soon as there is enough free memory.
818 821 * For a short while, we will be sampling the performance of the
819 822 * scanner and need to keep running just to get sample data, in
820 823 * which case we keep going and don't pay attention to whether
821 824 * or not there is enough free memory.
822 825 */
823 826
824 827 while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
825 828 pageout_sample_cnt < pageout_sample_lim)) {
826 829 int rvfront, rvback;
827 830
828 831 /*
829 832 * Check to see if we have exceeded our %CPU budget
830 833 * for this wakeup, but not on every single page visited,
831 834 * just every once in a while.
832 835 */
833 836 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
834 837 pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
835 838 if (pageout_cycle_ticks >= pageout_ticks) {
836 839 ++pageout_timeouts;
837 840 break;
838 841 }
839 842 }
840 843
841 844 /*
842 845 * If checkpage manages to add a page to the free list,
843 846 * we give ourselves another couple of trips around the loop.
844 847 */
845 848 if ((rvfront = checkpage(fronthand, FRONT)) == 1)
846 849 count = 0;
847 850 if ((rvback = checkpage(backhand, BACK)) == 1)
848 851 count = 0;
849 852
850 853 ++pcount;
851 854
852 855 /*
853 856 * protected by pageout_mutex instead of cpu_stat_lock
854 857 */
855 858 CPU_STATS_ADDQ(CPU, vm, scan, 1);
856 859
857 860 /*
858 861 * Don't include ineligible pages in the number scanned.
859 862 */
860 863 if (rvfront != -1 || rvback != -1)
861 864 nscan++;
862 865
863 866 backhand = page_next(backhand);
864 867
865 868 /*
866 869 * backhand update and wraparound check are done separately
867 870 * because lint barks when it finds an empty "if" body
868 871 */
869 872
870 873 if ((fronthand = page_next(fronthand)) == page_first()) {
871 874 TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
872 875 "pageout_hand_wrap:freemem %ld whichhand %d",
873 876 freemem, FRONT);
874 877
875 878 /*
876 879 * protected by pageout_mutex instead of cpu_stat_lock
877 880 */
878 881 CPU_STATS_ADDQ(CPU, vm, rev, 1);
879 882 if (++count > 1) {
880 883 /*
881 884 * Extremely unlikely, but it happens.
882 885 * We went around the loop at least once
883 886 * and didn't get far enough.
884 887 * If we are still skipping `highly shared'
885 888 * pages, skip fewer of them. Otherwise,
886 889 * give up till the next clock tick.
887 890 */
888 891 if (po_share < MAX_PO_SHARE) {
889 892 po_share <<= 1;
890 893 } else {
891 894 /*
892 895 * Really a "goto loop", but
893 896 * if someone is TRACing or
894 897 * TNF_PROBE_ing, at least
895 898 * make records to show
896 899 * where we are.
897 900 */
898 901 break;
899 902 }
900 903 }
901 904 }
902 905 }
903 906
904 907 sample_end = gethrtime();
905 908
906 909 TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
907 910 "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
908 911 freemem, lotsfree, nscan, desscan, count);
909 912
910 913 /* Kernel probe */
911 914 TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
912 915 tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
913 916
914 917 if (pageout_sample_cnt < pageout_sample_lim) {
915 918 pageout_sample_pages += pcount;
916 919 pageout_sample_etime += sample_end - sample_start;
917 920 ++pageout_sample_cnt;
918 921 }
919 922 if (pageout_sample_cnt >= pageout_sample_lim &&
920 923 pageout_new_spread == 0) {
921 924 pageout_rate = (hrrate_t)pageout_sample_pages *
922 925 (hrrate_t)(NANOSEC) / pageout_sample_etime;
923 926 pageout_new_spread = pageout_rate / 10;
924 927 setupclock(1);
925 928 }
926 929
927 930 goto loop;
928 931 }
929 932
930 933 /*
931 934 * Look at the page at hand. If it is locked (e.g., for physical i/o),
932 935 * system (u., page table) or free, then leave it alone. Otherwise,
933 936 * if we are running the front hand, turn off the page's reference bit.
934 937 * If the proc is over maxrss, we take it. If running the back hand,
935 938 * check whether the page has been reclaimed. If not, free the page,
936 939 * pushing it to disk first if necessary.
937 940 *
938 941 * Return values:
939 942 * -1 if the page is not a candidate at all,
940 943 * 0 if not freed, or
941 944 * 1 if we freed it.
942 945 */
943 946 static int
944 947 checkpage(struct page *pp, int whichhand)
945 948 {
946 949 int ppattr;
947 950 int isfs = 0;
948 951 int isexec = 0;
949 952 int pagesync_flag;
950 953
951 954 /*
952 955 * Skip pages:
953 956 * - associated with the kernel vnode since
954 957 * they are always "exclusively" locked.
955 958 * - that are free
956 959 * - that are shared more than po_share'd times
957 960 * - its already locked
958 961 *
959 962 * NOTE: These optimizations assume that reads are atomic.
960 963 */
961 964
962 965 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
963 966 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
964 967 hat_page_checkshare(pp, po_share)) {
965 968 return (-1);
966 969 }
967 970
968 971 if (!page_trylock(pp, SE_EXCL)) {
969 972 /*
970 973 * Skip the page if we can't acquire the "exclusive" lock.
971 974 */
972 975 return (-1);
973 976 } else if (PP_ISFREE(pp)) {
974 977 /*
975 978 * It became free between the above check and our actually
976 979 * locking the page. Oh, well there will be other pages.
977 980 */
978 981 page_unlock(pp);
979 982 return (-1);
980 983 }
981 984
982 985 /*
983 986 * Reject pages that cannot be freed. The page_struct_lock
984 987 * need not be acquired to examine these
985 988 * fields since the page has an "exclusive" lock.
986 989 */
987 990 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
988 991 page_unlock(pp);
989 992 return (-1);
990 993 }
991 994
992 995 /*
993 996 * Maintain statistics for what we are freeing
994 997 */
995 998
996 999 if (pp->p_vnode != NULL) {
997 1000 if (pp->p_vnode->v_flag & VVMEXEC)
998 1001 isexec = 1;
999 1002
1000 1003 if (!IS_SWAPFSVP(pp->p_vnode))
1001 1004 isfs = 1;
1002 1005 }
1003 1006
1004 1007 /*
1005 1008 * Turn off REF and MOD bits with the front hand.
1006 1009 * The back hand examines the REF bit and always considers
1007 1010 * SHARED pages as referenced.
1008 1011 */
1009 1012 if (whichhand == FRONT)
1010 1013 pagesync_flag = HAT_SYNC_ZERORM;
1011 1014 else
1012 1015 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1013 1016 HAT_SYNC_STOPON_SHARED;
1014 1017
1015 1018 ppattr = hat_pagesync(pp, pagesync_flag);
1016 1019
1017 1020 recheck:
1018 1021 /*
1019 1022 * If page is referenced; make unreferenced but reclaimable.
1020 1023 * If this page is not referenced, then it must be reclaimable
1021 1024 * and we can add it to the free list.
1022 1025 */
1023 1026 if (ppattr & P_REF) {
1024 1027 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1025 1028 "pageout_isref:pp %p whichhand %d", pp, whichhand);
1026 1029 if (whichhand == FRONT) {
1027 1030 /*
1028 1031 * Checking of rss or madvise flags needed here...
1029 1032 *
1030 1033 * If not "well-behaved", fall through into the code
1031 1034 * for not referenced.
1032 1035 */
1033 1036 hat_clrref(pp);
1034 1037 }
1035 1038 /*
1036 1039 * Somebody referenced the page since the front
1037 1040 * hand went by, so it's not a candidate for
1038 1041 * freeing up.
1039 1042 */
1040 1043 page_unlock(pp);
1041 1044 return (0);
1042 1045 }
1043 1046
1044 1047 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1045 1048
1046 1049 /*
1047 1050 * If large page, attempt to demote it. If successfully demoted,
1048 1051 * retry the checkpage.
1049 1052 */
1050 1053 if (pp->p_szc != 0) {
1051 1054 if (!page_try_demote_pages(pp)) {
1052 1055 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1053 1056 page_unlock(pp);
1054 1057 return (-1);
1055 1058 }
1056 1059 ASSERT(pp->p_szc == 0);
1057 1060 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1058 1061 /*
1059 1062 * since page_try_demote_pages() could have unloaded some
1060 1063 * mappings it makes sense to reload ppattr.
1061 1064 */
1062 1065 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1063 1066 }
1064 1067
1065 1068 /*
1066 1069 * If the page is currently dirty, we have to arrange
1067 1070 * to have it cleaned before it can be freed.
1068 1071 *
1069 1072 * XXX - ASSERT(pp->p_vnode != NULL);
1070 1073 */
1071 1074 if ((ppattr & P_MOD) && pp->p_vnode) {
1072 1075 struct vnode *vp = pp->p_vnode;
1073 1076 u_offset_t offset = pp->p_offset;
1074 1077
1075 1078 /*
1076 1079 * XXX - Test for process being swapped out or about to exit?
1077 1080 * [Can't get back to process(es) using the page.]
1078 1081 */
1079 1082
1080 1083 /*
1081 1084 * Hold the vnode before releasing the page lock to
1082 1085 * prevent it from being freed and re-used by some
1083 1086 * other thread.
1084 1087 */
1085 1088 VN_HOLD(vp);
1086 1089 page_unlock(pp);
1087 1090
1088 1091 /*
1089 1092 * Queue i/o request for the pageout thread.
1090 1093 */
1091 1094 if (!queue_io_request(vp, offset)) {
1092 1095 VN_RELE(vp);
1093 1096 return (0);
1094 1097 }
1095 1098 return (1);
1096 1099 }
1097 1100
1098 1101 /*
1099 1102 * Now we unload all the translations,
1100 1103 * and put the page back on to the free list.
1101 1104 * If the page was used (referenced or modified) after
1102 1105 * the pagesync but before it was unloaded we catch it
1103 1106 * and handle the page properly.
1104 1107 */
1105 1108 TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1106 1109 "pageout_free:pp %p whichhand %d", pp, whichhand);
1107 1110 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1108 1111 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1109 1112 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1110 1113 goto recheck;
1111 1114
1112 1115 /*LINTED: constant in conditional context*/
1113 1116 VN_DISPOSE(pp, B_FREE, 0, kcred);
1114 1117
1115 1118 CPU_STATS_ADD_K(vm, dfree, 1);
1116 1119
1117 1120 if (isfs) {
1118 1121 if (isexec) {
1119 1122 CPU_STATS_ADD_K(vm, execfree, 1);
1120 1123 } else {
1121 1124 CPU_STATS_ADD_K(vm, fsfree, 1);
1122 1125 }
1123 1126 } else {
1124 1127 CPU_STATS_ADD_K(vm, anonfree, 1);
1125 1128 }
1126 1129
1127 1130 return (1); /* freed a page! */
1128 1131 }
1129 1132
1130 1133 /*
1131 1134 * Queue async i/o request from pageout_scanner and segment swapout
1132 1135 * routines on one common list. This ensures that pageout devices (swap)
1133 1136 * are not saturated by pageout_scanner or swapout requests.
1134 1137 * The pageout thread empties this list by initiating i/o operations.
1135 1138 */
1136 1139 int
1137 1140 queue_io_request(vnode_t *vp, u_offset_t off)
1138 1141 {
1139 1142 struct async_reqs *arg;
1140 1143
1141 1144 /*
1142 1145 * If we cannot allocate an async request struct,
1143 1146 * skip this page.
1144 1147 */
1145 1148 mutex_enter(&push_lock);
1146 1149 if ((arg = req_freelist) == NULL) {
1147 1150 mutex_exit(&push_lock);
1148 1151 return (0);
1149 1152 }
1150 1153 req_freelist = arg->a_next; /* adjust freelist */
1151 1154 push_list_size++;
1152 1155
1153 1156 arg->a_vp = vp;
1154 1157 arg->a_off = off;
1155 1158 arg->a_len = PAGESIZE;
1156 1159 arg->a_flags = B_ASYNC | B_FREE;
1157 1160 arg->a_cred = kcred; /* always held */
1158 1161
1159 1162 /*
1160 1163 * Add to list of pending write requests.
1161 1164 */
1162 1165 arg->a_next = push_list;
1163 1166 push_list = arg;
1164 1167
1165 1168 if (req_freelist == NULL) {
1166 1169 /*
1167 1170 * No free async requests left. The lock is held so we
1168 1171 * might as well signal the pusher thread now.
1169 1172 */
1170 1173 cv_signal(&push_cv);
1171 1174 }
1172 1175 mutex_exit(&push_lock);
1173 1176 return (1);
1174 1177 }
1175 1178
1176 1179 /*
1177 1180 * Wakeup pageout to initiate i/o if push_list is not empty.
1178 1181 */
1179 1182 void
1180 1183 cv_signal_pageout()
1181 1184 {
1182 1185 if (push_list != NULL) {
1183 1186 mutex_enter(&push_lock);
1184 1187 cv_signal(&push_cv);
1185 1188 mutex_exit(&push_lock);
1186 1189 }
1187 1190 }
|
↓ open down ↓ |
1082 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX