5513 KM_NORMALPRI should be documented in kmem_alloc(9f) and kmem_cache_create(9f) man pages
14465 Present KM_NOSLEEP_LAZY as documented interface
Change-Id: I002ec28ddf390650f1fcba1ca94f6abfdb241439
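As background for the synopsis above, here is a minimal usage sketch (not part of this webrev; the helper name is hypothetical) of the flag being documented. KM_NOSLEEP_LAZY is simply (KM_NOSLEEP | KM_NORMALPRI): the allocation never sleeps and, under memory pressure, may fail earlier rather than being granted out of the pageout reserve described in the comment below.

	#include <sys/types.h>
	#include <sys/kmem.h>

	/*
	 * Hypothetical helper: allocate an optional scratch buffer from a
	 * context that must not block.  With KM_NOSLEEP_LAZY the request
	 * is subject to the throttle once free memory approaches
	 * throttlefree, instead of drawing down the reserve pool.
	 */
	static void *
	xx_alloc_scratch(size_t len)
	{
		return (kmem_alloc(len, KM_NOSLEEP_LAZY));
	}

Callers of such a helper must tolerate a NULL return and degrade gracefully.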
--- old/usr/src/uts/common/os/vm_pageout.c
+++ new/usr/src/uts/common/os/vm_pageout.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2021 Oxide Computer Company
24 24 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 29 * Use is subject to license terms.
30 30 */
31 31
32 32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
33 33 /* All Rights Reserved */
34 34
35 35 /*
36 36 * University Copyright- Copyright (c) 1982, 1986, 1988
37 37 * The Regents of the University of California
38 38 * All Rights Reserved
39 39 *
40 40 * University Acknowledgment- Portions of this document are derived from
41 41 * software developed by the University of California, Berkeley, and its
42 42 * contributors.
43 43 */
44 44
45 45 #include <sys/types.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/buf.h>
49 49 #include <sys/uio.h>
50 50 #include <sys/proc.h>
51 51 #include <sys/systm.h>
52 52 #include <sys/mman.h>
53 53 #include <sys/cred.h>
54 54 #include <sys/vnode.h>
55 55 #include <sys/vm.h>
56 56 #include <sys/vmparam.h>
57 57 #include <sys/vtrace.h>
58 58 #include <sys/cmn_err.h>
59 59 #include <sys/cpuvar.h>
60 60 #include <sys/user.h>
61 61 #include <sys/kmem.h>
62 62 #include <sys/debug.h>
63 63 #include <sys/callb.h>
64 64 #include <sys/tnf_probe.h>
65 65 #include <sys/mem_cage.h>
66 66 #include <sys/time.h>
67 67 #include <sys/stdbool.h>
68 68
69 69 #include <vm/hat.h>
70 70 #include <vm/as.h>
71 71 #include <vm/seg.h>
72 72 #include <vm/page.h>
73 73 #include <vm/pvn.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 /*
77 77 * FREE MEMORY MANAGEMENT
78 78 *
79 79 * Management of the pool of free pages is a tricky business. There are
80 80 * several critical threshold values which constrain our allocation of new
81 81 * pages and inform the rate of paging out of memory to swap. These threshold
82 82 * values, and the behaviour they induce, are described below in descending
83 83 * order of size -- and thus increasing order of severity!
84 84 *
85 85 * +---------------------------------------------------- physmem (all memory)
86 86 * |
87 87 * | Ordinarily there are no particular constraints placed on page
88 88 * v allocation. The page scanner is not running and page_create_va()
89 89 * | will effectively grant all page requests (whether from the kernel
90 90 * | or from user processes) without artificial delay.
91 91 * |
92 92 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
93 93 * |
94 94 * | When we have less than "lotsfree" pages, pageout_scanner() is
95 95 * v signalled by schedpaging() to begin looking for pages that can
96 96 * | be evicted to disk to bring us back above lotsfree. At this
97 97 * | stage there is still no constraint on allocation of free pages.
98 98 * |
99 99 * | For small systems, we set a lower bound of 16MB for lotsfree;
100 100 * v this is the natural value for a system with 1GB memory. This is
101 101 * | to ensure that the pageout reserve pool contains at least 4MB
102 102 * | for use by ZFS.
103 103 * |
104 104 * | For systems with a large amount of memory, we constrain lotsfree
105 105 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
106 106 * v at some point the required slack relates more closely to the
107 107 * | rate at which paging can occur than to the total amount of memory.
108 108 * |
109 109 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
110 110 * |
111 111 * | When we drop below desfree, a number of kernel facilities will
112 112 * v wait before allocating more memory, under the assumption that
113 113 * | pageout or reaping will make progress and free up some memory.
114 114 * | This behaviour is not especially coordinated; look for comparisons
115 115 * | of desfree and freemem.
116 116 * |
117 117 * | In addition to various attempts at advisory caution, clock()
118 118 * | will wake up the thread that is ordinarily parked in sched().
119 119 * | This routine is responsible for the heavy-handed swapping out
120 120 * v of entire processes in an attempt to arrest the slide of free
121 121 * | memory. See comments in sched.c for more details.
122 122 * |
123 123 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
124 124 * |
125 125 * | These two separate tunables have, by default, the same value.
126 126 * v Various parts of the kernel use minfree to signal the need for
127 127 * | more aggressive reclamation of memory, and sched() is more
128 128 * | aggressive at swapping processes out.
129 129 * |
130 130 * | If free memory falls below throttlefree, page_create_va() will
131 131 * | use page_create_throttle() to begin holding most requests for
132 132 * | new pages while pageout and reaping free up memory. Sleeping
133 133 * v allocations (e.g., KM_SLEEP) are held here while we wait for
134 134 * | more memory. Non-sleeping allocations are generally allowed to
135 135 * | proceed, unless their priority is explicitly lowered with
136 - * | KM_NORMALPRI.
136 + * | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI)).
137 137 * |
138 138 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
139 139 * |
140 140 * | When we hit throttlefree, the situation is already dire. The
141 141 * v system is generally paging out memory and swapping out entire
142 142 * | processes in order to free up memory for continued operation.
143 143 * |
144 144 * | Unfortunately, evicting memory to disk generally requires short
145 145 * | term use of additional memory; e.g., allocation of buffers for
146 146 * | storage drivers, updating maps of free and used blocks, etc.
147 147 * | As such, pageout_reserve is the number of pages that we keep in
148 148 * | special reserve for use by pageout() and sched() and by any
149 149 * v other parts of the kernel that need to be working for those to
150 150 * | make forward progress such as the ZFS I/O pipeline.
151 151 * |
152 152 * | When we are below pageout_reserve, we fail or hold any allocation
153 153 * | that has not explicitly requested access to the reserve pool.
154 154 * | Access to the reserve is generally granted via the KM_PUSHPAGE
155 155 * | flag, or by marking a thread T_PUSHPAGE such that all allocations
156 156 * | can implicitly tap the reserve. For more details, see the
157 157 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
158 158 * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
159 159 * |
160 160 * +---------------------------------------------------------- no free memory
161 161 * |
162 162 * | If we have arrived here, things are very bad indeed. It is
163 163 * v surprisingly difficult to tell if this condition is even fatal,
164 164 * | as enough memory may have been granted to pageout() and to the
165 165 * | ZFS I/O pipeline that requests for eviction that have already been
166 166 * | made will complete and free up memory some time soon.
167 167 * |
168 168 * | If free memory does not materialise, the system generally remains
169 169 * | deadlocked. The pageout_deadman() below is run once per second
170 170 * | from clock(), seeking to limit the amount of time a single request
171 171 * v to page out can be blocked before the system panics to get a crash
172 172 * | dump and return to service.
173 173 * |
174 174 * +-------------------------------------------------------------------------
175 175 */
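A worked example of the cascade above (illustrative only, assuming the default tunables on a machine with 16 GB of physical memory):

	lotsfree        = physmem / 64          = 256 MB
	desfree         = lotsfree / 2          = 128 MB
	minfree         = 3 * desfree / 4       =  96 MB
	throttlefree    = minfree               =  96 MB
	pageout_reserve = 3 * throttlefree / 4  =  72 MB

These are the same calculations performed by setupclock() further down in this file.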
176 176
177 177 /*
178 178 * The following parameters control operation of the page replacement
179 179 * algorithm. They are initialized to 0, and then computed at boot time based
180 180 * on the size of the system; see setupclock(). If they are patched non-zero
181 181 * in a loaded vmunix they are left alone and may thus be changed per system
182 182 * using "mdb -kw" on the loaded system.
183 183 */
184 184 pgcnt_t slowscan = 0;
185 185 pgcnt_t fastscan = 0;
186 186
187 187 static pgcnt_t handspreadpages = 0;
188 188
189 189 /*
190 190 * looppages:
191 191 * Cached copy of the total number of pages in the system (total_pages).
192 192 *
193 193 * loopfraction:
194 194 * Divisor used to relate fastscan to looppages in setupclock().
195 195 */
196 196 static uint_t loopfraction = 2;
197 197 static pgcnt_t looppages;
198 198
199 199 static uint_t min_percent_cpu = 4;
200 200 static uint_t max_percent_cpu = 80;
201 201 static pgcnt_t maxfastscan = 0;
202 202 static pgcnt_t maxslowscan = 100;
203 203
204 204 #define MEGABYTES (1024ULL * 1024ULL)
205 205
206 206 /*
207 207 * pageout_threshold_style:
208 208 * set to 1 to use the previous default threshold size calculation;
209 209 * i.e., each threshold is half of the next largest value.
210 210 */
211 211 uint_t pageout_threshold_style = 0;
212 212
213 213 /*
214 214 * The operator may override these tunables to request a different minimum or
215 215 * maximum lotsfree value, or to change the divisor we use for automatic
216 216 * sizing.
217 217 *
218 218 * By default, we make lotsfree 1/64th of the total memory in the machine. The
219 219 * minimum and maximum are specified in bytes, rather than pages; a zero value
220 220 * means the default values (below) are used.
221 221 */
222 222 uint_t lotsfree_fraction = 64;
223 223 pgcnt_t lotsfree_min = 0;
224 224 pgcnt_t lotsfree_max = 0;
225 225
226 226 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
227 227 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
228 228
229 229 /*
230 230 * If these tunables are set to non-zero values in /etc/system, and provided
231 231 * the value is not larger than the threshold above, the specified value will
232 232 * be used directly without any additional calculation or adjustment. The boot
233 233 * time value of these overrides is preserved in the "clockinit" struct. More
234 234 * detail is available in the comment at the top of the file.
235 235 */
236 236 pgcnt_t maxpgio = 0;
237 237 pgcnt_t minfree = 0;
238 238 pgcnt_t desfree = 0;
239 239 pgcnt_t lotsfree = 0;
240 240 pgcnt_t needfree = 0;
241 241 pgcnt_t throttlefree = 0;
242 242 pgcnt_t pageout_reserve = 0;
243 243
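For illustration only (not part of this change), an operator override of the automatic sizing might look like the following /etc/system entries; the values are arbitrary and, for these variables, expressed in pages:

	set lotsfree=262144
	set maxpgio=1024

A loaded system can be patched in the same spirit with mdb -kw, e.g. 'fastscan/Z 0t200000' on a 64-bit kernel, as noted in the comment above.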
244 244 pgcnt_t deficit;
245 245 pgcnt_t nscan;
246 246 pgcnt_t desscan;
247 247
248 248 /*
249 249 * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
250 250 * number of nanoseconds in each wakeup cycle that gives the equivalent of some
251 251 * underlying %CPU duty cycle.
252 252 *
253 253 * min_pageout_nsec:
254 254 * nanoseconds/wakeup equivalent of min_percent_cpu.
255 255 *
256 256 * max_pageout_nsec:
257 257 * nanoseconds/wakeup equivalent of max_percent_cpu.
258 258 *
259 259 * pageout_nsec:
260 260 * Number of nanoseconds budgeted for each wakeup cycle.
261 261 * Computed each time around by schedpaging().
262 262 * Varies between min_pageout_nsec and max_pageout_nsec,
263 263 * depending on memory pressure.
264 264 */
265 265 static hrtime_t min_pageout_nsec;
266 266 static hrtime_t max_pageout_nsec;
267 267 static hrtime_t pageout_nsec;
268 268
269 269 static uint_t reset_hands;
270 270
271 271 #define PAGES_POLL_MASK 1023
272 272
273 273 /*
274 274 * pageout_sample_lim:
275 275 * The limit on the number of samples needed to establish a value for new
276 276 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
277 277 * handspreadpages.
278 278 *
279 279 * pageout_sample_cnt:
280 280 * Current sample number. Once the sample gets large enough, set new
281 281 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
282 282 *
283 283 * pageout_sample_pages:
284 284 * The accumulated number of pages scanned during sampling.
285 285 *
286 286 * pageout_sample_etime:
287 287 * The accumulated nanoseconds for the sample.
288 288 *
289 289 * pageout_rate:
290 290 * Rate in pages/nanosecond, computed at the end of sampling.
291 291 *
292 292 * pageout_new_spread:
293 293 * Initially zero while the system scan rate is measured by
294 294 * pageout_scanner(), which then sets this value once per system boot after
295 295 * enough samples have been recorded (pageout_sample_cnt). Once set, this
296 296 * new value is used for fastscan and handspreadpages.
297 297 *
298 298 * sample_start, sample_end:
299 299 * The hrtime at which the last pageout_scanner() sample began and ended.
300 300 */
301 301 typedef hrtime_t hrrate_t;
302 302
303 303 static uint64_t pageout_sample_lim = 4;
304 304 static uint64_t pageout_sample_cnt = 0;
305 305 static pgcnt_t pageout_sample_pages = 0;
306 306 static hrrate_t pageout_rate = 0;
307 307 static pgcnt_t pageout_new_spread = 0;
308 308
309 309 static hrtime_t pageout_cycle_nsec;
310 310 static hrtime_t sample_start, sample_end;
311 311 static hrtime_t pageout_sample_etime = 0;
312 312
313 313 /*
314 314 * Record number of times a pageout_scanner() wakeup cycle finished because it
315 315 * timed out (exceeded its CPU budget), rather than because it visited
316 316 * its budgeted number of pages.
317 317 */
318 318 uint64_t pageout_timeouts = 0;
319 319
320 320 #ifdef VM_STATS
321 321 static struct pageoutvmstats_str {
322 322 ulong_t checkpage[3];
323 323 } pageoutvmstats;
324 324 #endif /* VM_STATS */
325 325
326 326 /*
327 327 * Threads waiting for free memory use this condition variable and lock until
328 328 * memory becomes available.
329 329 */
330 330 kmutex_t memavail_lock;
331 331 kcondvar_t memavail_cv;
332 332
333 333 typedef enum pageout_hand {
334 334 POH_FRONT = 1,
335 335 POH_BACK,
336 336 } pageout_hand_t;
337 337
338 338 typedef enum {
339 339 CKP_INELIGIBLE,
340 340 CKP_NOT_FREED,
341 341 CKP_FREED,
342 342 } checkpage_result_t;
343 343
344 344 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
345 345
346 346 static struct clockinit {
347 347 bool ci_init;
348 348 pgcnt_t ci_lotsfree_min;
349 349 pgcnt_t ci_lotsfree_max;
350 350 pgcnt_t ci_lotsfree;
351 351 pgcnt_t ci_desfree;
352 352 pgcnt_t ci_minfree;
353 353 pgcnt_t ci_throttlefree;
354 354 pgcnt_t ci_pageout_reserve;
355 355 pgcnt_t ci_maxpgio;
356 356 pgcnt_t ci_maxfastscan;
357 357 pgcnt_t ci_fastscan;
358 358 pgcnt_t ci_slowscan;
359 359 pgcnt_t ci_handspreadpages;
360 360 } clockinit = { .ci_init = false };
361 361
362 362 static pgcnt_t
363 363 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
364 364 {
365 365 if (value < minimum) {
366 366 return (minimum);
367 367 } else if (value > maximum) {
368 368 return (maximum);
369 369 } else {
370 370 return (value);
371 371 }
372 372 }
373 373
374 374 static pgcnt_t
375 375 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
376 376 {
377 377 if (initval == 0 || initval >= initval_ceiling) {
378 378 return (defval);
379 379 } else {
380 380 return (initval);
381 381 }
382 382 }
383 383
384 384 /*
385 385 * Set up the paging constants for the clock algorithm used by
386 386 * pageout_scanner(), and by the virtual memory system overall. See the
387 387 * comments at the top of this file for more information about the threshold
388 388 * values and system responses to memory pressure.
389 389 *
390 390 * This routine is called once by main() at startup, after the initial size of
391 391 * physical memory is determined. It may be called again later if memory is
392 392 * added to or removed from the system, or if new measurements of the page scan
393 393 * rate become available.
394 394 */
395 395 void
396 396 setupclock(void)
397 397 {
398 398 pgcnt_t defval;
399 399 bool half = (pageout_threshold_style == 1);
400 400 bool recalc = true;
401 401
402 402 looppages = total_pages;
403 403
404 404 /*
405 405 * The operator may have provided specific values for some of the
406 406 * tunables via /etc/system. On our first call, we preserve those
407 407 * values so that they can be used for subsequent recalculations.
408 408 *
409 409 * A value of zero for any tunable means we will use the default
410 410 * sizing.
411 411 */
412 412 if (!clockinit.ci_init) {
413 413 clockinit.ci_init = true;
414 414
415 415 clockinit.ci_lotsfree_min = lotsfree_min;
416 416 clockinit.ci_lotsfree_max = lotsfree_max;
417 417 clockinit.ci_lotsfree = lotsfree;
418 418 clockinit.ci_desfree = desfree;
419 419 clockinit.ci_minfree = minfree;
420 420 clockinit.ci_throttlefree = throttlefree;
421 421 clockinit.ci_pageout_reserve = pageout_reserve;
422 422 clockinit.ci_maxpgio = maxpgio;
423 423 clockinit.ci_maxfastscan = maxfastscan;
424 424 clockinit.ci_fastscan = fastscan;
425 425 clockinit.ci_slowscan = slowscan;
426 426 clockinit.ci_handspreadpages = handspreadpages;
427 427
428 428 /*
429 429 * The first call does not trigger a recalculation, only
430 430 * subsequent calls.
431 431 */
432 432 recalc = false;
433 433 }
434 434
435 435 /*
436 436 * Configure paging threshold values. For more details on what each
437 437 * threshold signifies, see the comments at the top of this file.
438 438 */
439 439 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
440 440 btop(LOTSFREE_MAX_DEFAULT));
441 441 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
442 442 btop(LOTSFREE_MIN_DEFAULT));
443 443
444 444 lotsfree = tune(clockinit.ci_lotsfree, looppages,
445 445 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
446 446
447 447 desfree = tune(clockinit.ci_desfree, lotsfree,
448 448 lotsfree / 2);
449 449
450 450 minfree = tune(clockinit.ci_minfree, desfree,
451 451 half ? desfree / 2 : 3 * desfree / 4);
452 452
453 453 throttlefree = tune(clockinit.ci_throttlefree, desfree,
454 454 minfree);
455 455
456 456 pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
457 457 half ? throttlefree / 2 : 3 * throttlefree / 4);
458 458
459 459 /*
460 460 * Maxpgio thresholds how much paging is acceptable.
461 461 * This figures that 2/3 busy on an arm is all that is
462 462 * tolerable for paging. We assume one operation per disk rev.
463 463 *
464 464 * XXX - Does not account for multiple swap devices.
465 465 */
466 466 if (clockinit.ci_maxpgio == 0) {
467 467 maxpgio = (DISKRPM * 2) / 3;
468 468 } else {
469 469 maxpgio = clockinit.ci_maxpgio;
470 470 }
471 471
472 472 /*
473 473 * The clock scan rate varies between fastscan and slowscan
474 474 * based on the amount of free memory available. Fastscan
475 475 * rate should be set based on the number pages that can be
476 476 * scanned per sec using ~10% of processor time. Since this
477 477 * value depends on the processor, MMU, Mhz etc., it is
478 478 * difficult to determine it in a generic manner for all
479 479 * architectures.
480 480 *
481 481 * Instead of trying to determine the number of pages scanned
482 482 * per sec for every processor, fastscan is set to be the smaller
483 483 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
484 484 * time is limited to ~4% of processor time.
485 485 *
486 486 * Setting fastscan to be 1/2 of memory allows pageout to scan
487 487 * all of memory in ~2 secs. This implies that user pages not
488 488 * accessed within 1 sec (assuming, handspreadpages == fastscan)
489 489 * can be reclaimed when free memory is very low. Stealing pages
490 490 * not accessed within 1 sec seems reasonable and ensures that
491 491 * active user processes don't thrash.
492 492 *
493 493 * Smaller values of fastscan result in scanning fewer pages
494 494 * every second and consequently pageout may not be able to free
495 495 * sufficient memory to maintain the minimum threshold. Larger
496 496 * values of fastscan result in scanning a lot more pages which
497 497 * could lead to thrashing and higher CPU usage.
498 498 *
499 499 * Fastscan needs to be limited to a maximum value and should not
500 500 * scale with memory to prevent pageout from consuming too much
501 501 * time for scanning on slow CPUs and avoid thrashing, as a
502 502 * result of scanning too many pages, on faster CPUs.
503 503 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
504 504 * (the upper bound for fastscan) based on the average number
505 505 * of pages that can potentially be scanned in ~1 sec (using ~4%
506 506 * of the CPU) on some of the following machines that currently
507 507 * run Solaris 2.x:
508 508 *
509 509 * average memory scanned in ~1 sec
510 510 *
511 511 * 25 Mhz SS1+: 23 Meg
512 512 * LX: 37 Meg
513 513 * 50 Mhz SC2000: 68 Meg
514 514 *
515 515 * 40 Mhz 486: 26 Meg
516 516 * 66 Mhz 486: 42 Meg
517 517 *
518 518 * When free memory falls just below lotsfree, the scan rate
519 519 * goes from 0 to slowscan (i.e., pageout starts running). This
520 520 * transition needs to be smooth and is achieved by ensuring that
521 521 * pageout scans a small number of pages to satisfy the transient
522 522 * memory demand. This is set to not exceed 100 pages/sec (25 per
523 523 * wakeup) since scanning that many pages has no noticeable impact
524 524 * on system performance.
525 525 *
526 526 * In addition to setting fastscan and slowscan, pageout is
527 527 * limited to using ~4% of the CPU. This results in increasing
528 528 * the time taken to scan all of memory, which in turn means that
529 529 * user processes have a better opportunity of preventing their
530 530 * pages from being stolen. This has a positive effect on
531 531 * interactive and overall system performance when memory demand
532 532 * is high.
533 533 *
534 534 * Thus, the rate at which pages are scanned for replacement will
535 535 * vary linearly between slowscan and the number of pages that
536 536 * can be scanned using ~4% of processor time instead of varying
537 537 * linearly between slowscan and fastscan.
538 538 *
539 539 * Also, the processor time used by pageout will vary from ~1%
540 540 * at slowscan to ~4% at fastscan instead of varying between
541 541 * ~1% at slowscan and ~10% at fastscan.
542 542 *
543 543 * The values chosen for the various VM parameters (fastscan,
544 544 * handspreadpages, etc) are not universally true for all machines,
545 545 * but appear to be a good rule of thumb for the machines we've
546 546 * tested. They have the following ranges:
547 547 *
548 548 * cpu speed: 20 to 70 Mhz
549 549 * page size: 4K to 8K
550 550 * memory size: 16M to 5G
551 551 * page scan rate: 4000 - 17400 4K pages per sec
552 552 *
553 553 * The values need to be re-examined for machines which don't
554 554 * fall into the various ranges (e.g., slower or faster CPUs,
555 555 * smaller or larger pagesizes etc) shown above.
556 556 *
557 557 * On an MP machine, pageout is often unable to maintain the
558 558 * minimum paging thresholds under heavy load. This is due to
559 559 * the fact that user processes running on other CPUs can be
560 560 * dirtying memory at a much faster pace than pageout can find
561 561 * pages to free. The memory demands could be met by enabling
562 562 * more than one CPU to run the clock algorithm in such a manner
563 563 * that the various clock hands don't overlap. This also makes
564 564 * it more difficult to determine the values for fastscan, slowscan
565 565 * and handspreadpages.
566 566 *
567 567 * The swapper is currently used to free up memory when pageout
568 568 * is unable to meet memory demands by swapping out processes.
569 569 * In addition to freeing up memory, swapping also reduces the
570 570 * demand for memory by preventing user processes from running
571 571 * and thereby consuming memory.
572 572 */
573 573 if (clockinit.ci_maxfastscan == 0) {
574 574 if (pageout_new_spread != 0) {
575 575 maxfastscan = pageout_new_spread;
576 576 } else {
577 577 maxfastscan = MAXHANDSPREADPAGES;
578 578 }
579 579 } else {
580 580 maxfastscan = clockinit.ci_maxfastscan;
581 581 }
582 582
583 583 if (clockinit.ci_fastscan == 0) {
584 584 fastscan = MIN(looppages / loopfraction, maxfastscan);
585 585 } else {
586 586 fastscan = clockinit.ci_fastscan;
587 587 }
588 588
589 589 if (fastscan > looppages / loopfraction) {
590 590 fastscan = looppages / loopfraction;
591 591 }
592 592
593 593 /*
594 594 * Set slow scan time to 1/10 the fast scan time, but
595 595 * not to exceed maxslowscan.
596 596 */
597 597 if (clockinit.ci_slowscan == 0) {
598 598 slowscan = MIN(fastscan / 10, maxslowscan);
599 599 } else {
600 600 slowscan = clockinit.ci_slowscan;
601 601 }
602 602
603 603 if (slowscan > fastscan / 2) {
604 604 slowscan = fastscan / 2;
605 605 }
606 606
607 607 /*
608 608 * Handspreadpages is distance (in pages) between front and back
609 609 * pageout daemon hands. The amount of time to reclaim a page
610 610 * once pageout examines it increases with this distance and
611 611 * decreases as the scan rate rises. It must be < the amount
612 612 * of pageable memory.
613 613 *
614 614 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
615 615 * to be "fastscan" results in the front hand being a few secs
616 616 * (varies based on the processor speed) ahead of the back hand
617 617 * at fastscan rates. This distance can be further reduced, if
618 618 * necessary, by increasing the processor time used by pageout
619 619 * to be more than ~4% and preferably not more than ~10%.
620 620 *
621 621 * As a result, user processes have a much better chance of
622 622 * referencing their pages before the back hand examines them.
623 623 * This also significantly lowers the number of reclaims from
624 624 * the freelist since pageout does not end up freeing pages which
625 625 * may be referenced a sec later.
626 626 */
627 627 if (clockinit.ci_handspreadpages == 0) {
628 628 handspreadpages = fastscan;
629 629 } else {
630 630 handspreadpages = clockinit.ci_handspreadpages;
631 631 }
632 632
633 633 /*
634 634 * Make sure that back hand follows front hand by at least
635 635 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
636 636 * back hand to look at a page during the same wakeup of the pageout
637 637 * daemon in which the front hand cleared its ref bit.
638 638 */
639 639 if (handspreadpages >= looppages) {
640 640 handspreadpages = looppages - 1;
641 641 }
642 642
643 643 /*
644 644 * If we have been called to recalculate the parameters, set a flag to
645 645 * re-evaluate the clock hand pointers.
646 646 */
647 647 if (recalc) {
648 648 reset_hands = 1;
649 649 }
650 650 }
651 651
652 652 /*
653 653 * Pageout scheduling.
654 654 *
655 655 * Schedpaging controls the rate at which the page out daemon runs by
656 656 * setting the global variables nscan and desscan SCHEDPAGING_HZ
657 657 * times a second. Nscan records the number of pages pageout has examined
658 658 * in its current pass; schedpaging() resets this value to zero each time
659 659 * it runs. Desscan records the number of pages pageout should examine
660 660 * in its next pass; schedpaging() sets this value based on the amount of
661 661 * currently available memory.
662 662 */
663 663 #define SCHEDPAGING_HZ 4
664 664
665 665 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
666 666
667 667 /*
668 668 * Pool of available async pageout putpage requests.
669 669 */
670 670 static struct async_reqs *push_req;
671 671 static struct async_reqs *req_freelist; /* available req structs */
672 672 static struct async_reqs *push_list; /* pending reqs */
673 673 static kmutex_t push_lock; /* protects req pool */
674 674 static kcondvar_t push_cv;
675 675
676 676 /*
677 677 * If pageout() is stuck on a single push for this many seconds,
678 678 * pageout_deadman() will assume the system has hit a memory deadlock. If set
679 679 * to 0, the deadman will have no effect.
680 680 *
681 681 * Note that we are only looking for stalls in the calls that pageout() makes
682 682 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
683 683 * I/O, which should not take long unless the underlying strategy call blocks
684 684 * indefinitely for memory. The actual I/O request happens (or fails) later.
685 685 */
686 686 uint_t pageout_deadman_seconds = 90;
687 687
688 688 static uint_t pageout_stucktime = 0;
689 689 static bool pageout_pushing = false;
690 690 static uint64_t pageout_pushcount = 0;
691 691 static uint64_t pageout_pushcount_seen = 0;
692 692
693 693 static int async_list_size = 256; /* number of async request structs */
694 694
695 695 static void pageout_scanner(void);
696 696
697 697 /*
698 698 * If a page is being shared more than "po_share" times
699 699 * then leave it alone- don't page it out.
700 700 */
701 701 #define MIN_PO_SHARE (8)
702 702 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
703 703 ulong_t po_share = MIN_PO_SHARE;
704 704
705 705 /*
706 706 * Schedule rate for paging.
707 707 * Rate is linear interpolation between
708 708 * slowscan with lotsfree and fastscan when out of memory.
709 709 */
710 710 static void
711 711 schedpaging(void *arg)
712 712 {
713 713 spgcnt_t vavail;
714 714
715 715 if (freemem < lotsfree + needfree + kmem_reapahead)
716 716 kmem_reap();
717 717
718 718 if (freemem < lotsfree + needfree)
719 719 seg_preap();
720 720
721 721 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
722 722 kcage_cageout_wakeup();
723 723
724 724 if (mutex_tryenter(&pageout_mutex)) {
725 725 /* pageout() not running */
726 726 nscan = 0;
727 727 vavail = freemem - deficit;
728 728 if (pageout_new_spread != 0)
729 729 vavail -= needfree;
730 730 if (vavail < 0)
731 731 vavail = 0;
732 732 if (vavail > lotsfree)
733 733 vavail = lotsfree;
734 734
735 735 /*
736 736 * Fix for 1161438 (CRS SPR# 73922). All variables
737 737 * in the original calculation for desscan were 32 bit signed
738 738 * ints. As freemem approaches 0x0 on a system with 1 Gig or
739 739 * more of memory, the calculation can overflow. When this
740 740 * happens, desscan becomes negative and pageout_scanner()
741 741 * stops paging out.
742 742 */
743 743 if (needfree > 0 && pageout_new_spread == 0) {
744 744 /*
745 745 * If we've not yet collected enough samples to
746 746 * calculate a spread, use the old logic of kicking
747 747 * into high gear anytime needfree is non-zero.
748 748 */
749 749 desscan = fastscan / SCHEDPAGING_HZ;
750 750 } else {
751 751 /*
752 752 * Once we've calculated a spread based on system
753 753 * memory and usage, just treat needfree as another
754 754 * form of deficit.
755 755 */
756 756 spgcnt_t faststmp, slowstmp, result;
757 757
758 758 slowstmp = slowscan * vavail;
759 759 faststmp = fastscan * (lotsfree - vavail);
760 760 result = (slowstmp + faststmp) /
761 761 nz(lotsfree) / SCHEDPAGING_HZ;
762 762 desscan = (pgcnt_t)result;
763 763 }
764 764
765 765 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
766 766 (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
767 767
768 768 if (freemem < lotsfree + needfree ||
769 769 pageout_sample_cnt < pageout_sample_lim) {
770 770 /*
771 771 * Either we need more memory, or we still need to
772 772 * measure the average scan rate. Wake the scanner.
773 773 */
774 774 DTRACE_PROBE(pageout__cv__signal);
775 775 cv_signal(&proc_pageout->p_cv);
776 776 } else {
777 777 /*
778 778 * There are enough free pages, no need to
779 779 * kick the scanner thread. And next time
780 780 * around, keep more of the `highly shared'
781 781 * pages.
782 782 */
783 783 cv_signal_pageout();
784 784 if (po_share > MIN_PO_SHARE) {
785 785 po_share >>= 1;
786 786 }
787 787 }
788 788 mutex_exit(&pageout_mutex);
789 789 }
790 790
791 791 /*
792 792 * Signal threads waiting for available memory.
793 793 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
794 794 * in this case it is not needed - the waiters will be woken up during
795 795 * the next invocation of this function.
796 796 */
797 797 if (kmem_avail() > 0)
798 798 cv_broadcast(&memavail_cv);
799 799
800 800 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
801 801 }
802 802
803 803 pgcnt_t pushes;
804 804 ulong_t push_list_size; /* # of requests on pageout queue */
805 805
806 806 /*
807 807 * Paging out should always be enabled. This tunable exists to hold pageout
808 808 * for debugging purposes. If set to 0, pageout_scanner() will go back to
809 809 * sleep each time it is woken by schedpaging().
810 810 */
811 811 uint_t dopageout = 1;
812 812
813 813 /*
814 814 * The page out daemon, which runs as process 2.
815 815 *
816 816 * As long as there are at least lotsfree pages,
817 817 * this process is not run. When the number of free
818 818 * pages stays in the range desfree to lotsfree,
819 819 * this daemon runs through the pages in the loop
820 820 * at a rate determined in schedpaging(). Pageout manages
821 821 * two hands on the clock. The front hand moves through
822 822 * memory, clearing the reference bit,
823 823 * and stealing pages from procs that are over maxrss.
824 824 * The back hand travels a distance behind the front hand,
825 825 * freeing the pages that have not been referenced in the time
826 826 * since the front hand passed. If modified, they are pushed to
827 827 * swap before being freed.
828 828 *
829 829 * There are 2 threads that act on behalf of the pageout process.
830 830 * One thread scans pages (pageout_scanner) and frees them up if
831 831 * they don't require any VOP_PUTPAGE operation. If a page must be
832 832 * written back to its backing store, the request is put on a list
833 833 * and the other (pageout) thread is signaled. The pageout thread
834 834 * grabs VOP_PUTPAGE requests from the list, and processes them.
835 835 * Some filesystems may require resources for the VOP_PUTPAGE
836 836 * operations (like memory) and hence can block the pageout
837 837 * thread, but the scanner thread can still operate. There is still
838 838 * no guarantee that memory deadlocks cannot occur.
839 839 *
840 840 * For now, this thing is in very rough form.
841 841 */
842 842 void
843 843 pageout()
844 844 {
845 845 struct async_reqs *arg;
846 846 pri_t pageout_pri;
847 847 int i;
848 848 pgcnt_t max_pushes;
849 849 callb_cpr_t cprinfo;
850 850
851 851 proc_pageout = ttoproc(curthread);
852 852 proc_pageout->p_cstime = 0;
853 853 proc_pageout->p_stime = 0;
854 854 proc_pageout->p_cutime = 0;
855 855 proc_pageout->p_utime = 0;
856 856 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
857 857 bcopy("pageout", PTOU(curproc)->u_comm, 7);
858 858
859 859 /*
860 860 * Create pageout scanner thread
861 861 */
862 862 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
863 863 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
864 864
865 865 /*
866 866 * Allocate and initialize the async request structures
867 867 * for pageout.
868 868 */
869 869 push_req = (struct async_reqs *)
870 870 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
871 871
872 872 req_freelist = push_req;
873 873 for (i = 0; i < async_list_size - 1; i++) {
874 874 push_req[i].a_next = &push_req[i + 1];
875 875 }
876 876
877 877 pageout_pri = curthread->t_pri;
878 878
879 879 /* Create the pageout scanner thread. */
880 880 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
881 881 pageout_pri - 1);
882 882
883 883 /*
884 884 * kick off pageout scheduler.
885 885 */
886 886 schedpaging(NULL);
887 887
888 888 /*
889 889 * Create kernel cage thread.
890 890 * The kernel cage thread is started under the pageout process
891 891 * to take advantage of the less restricted page allocation
892 892 * in page_create_throttle().
893 893 */
894 894 kcage_cageout_init();
895 895
896 896 /*
897 897 * Limit pushes to avoid saturating pageout devices.
898 898 */
899 899 max_pushes = maxpgio / SCHEDPAGING_HZ;
900 900 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
901 901
902 902 for (;;) {
903 903 mutex_enter(&push_lock);
904 904
905 905 while ((arg = push_list) == NULL || pushes > max_pushes) {
906 906 CALLB_CPR_SAFE_BEGIN(&cprinfo);
907 907 cv_wait(&push_cv, &push_lock);
908 908 pushes = 0;
909 909 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
910 910 }
911 911 push_list = arg->a_next;
912 912 arg->a_next = NULL;
913 913 pageout_pushing = true;
914 914 mutex_exit(&push_lock);
915 915
916 916 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
917 917 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
918 918 pushes++;
919 919 }
920 920
921 921 /* vp held by checkpage() */
922 922 VN_RELE(arg->a_vp);
923 923
924 924 mutex_enter(&push_lock);
925 925 pageout_pushing = false;
926 926 pageout_pushcount++;
927 927 arg->a_next = req_freelist; /* back on freelist */
928 928 req_freelist = arg;
929 929 push_list_size--;
930 930 mutex_exit(&push_lock);
931 931 }
932 932 }
933 933
934 934 /*
935 935 * Kernel thread that scans pages looking for ones to free
936 936 */
937 937 static void
938 938 pageout_scanner(void)
939 939 {
940 940 struct page *fronthand, *backhand;
941 941 uint_t laps;
942 942 callb_cpr_t cprinfo;
943 943 pgcnt_t nscan_limit;
944 944 pgcnt_t pcount;
945 945 bool sampling;
946 946
947 947 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
948 948 mutex_enter(&pageout_mutex);
949 949
950 950 /*
951 951 * The restart case does not attempt to point the hands at roughly
952 952 * the right point on the assumption that after one circuit things
953 953 * will have settled down, and restarts shouldn't be that often.
954 954 */
955 955
956 956 /*
957 957 * Set the two clock hands to be separated by a reasonable amount,
958 958 * but no more than 360 degrees apart.
959 959 */
960 960 backhand = page_first();
961 961 if (handspreadpages >= total_pages) {
962 962 fronthand = page_nextn(backhand, total_pages - 1);
963 963 } else {
964 964 fronthand = page_nextn(backhand, handspreadpages);
965 965 }
966 966
967 967 /*
968 968 * Establish the minimum and maximum length of time to be spent
969 969 * scanning pages per wakeup, limiting the scanner duty cycle. The
970 970 * input percentage values (0-100) must be converted to a fraction of
971 971 * the number of nanoseconds in a second of wall time, then further
972 972 * scaled down by the number of scanner wakeups in a second:
973 973 */
974 974 min_pageout_nsec = MAX(1,
975 975 NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
976 976 max_pageout_nsec = MAX(min_pageout_nsec,
977 977 NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
978 978
979 979 loop:
980 980 cv_signal_pageout();
981 981
982 982 CALLB_CPR_SAFE_BEGIN(&cprinfo);
983 983 cv_wait(&proc_pageout->p_cv, &pageout_mutex);
984 984 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
985 985
986 986 /*
987 987 * Check if pageout has been disabled for debugging purposes:
988 988 */
989 989 if (!dopageout) {
990 990 goto loop;
991 991 }
992 992
993 993 /*
994 994 * One may reset the clock hands for debugging purposes. Hands will
995 995 * also be reset if memory is added to or removed from the system.
996 996 */
997 997 if (reset_hands) {
998 998 reset_hands = 0;
999 999
1000 1000 backhand = page_first();
1001 1001 if (handspreadpages >= total_pages) {
1002 1002 fronthand = page_nextn(backhand, total_pages - 1);
1003 1003 } else {
1004 1004 fronthand = page_nextn(backhand, handspreadpages);
1005 1005 }
1006 1006 }
1007 1007
1008 1008 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1009 1009
1010 1010 /*
1011 1011 * Keep track of the number of times we have scanned all the way around
1012 1012 * the loop:
1013 1013 */
1014 1014 laps = 0;
1015 1015
1016 1016 DTRACE_PROBE(pageout__start);
1017 1017
1018 1018 /*
1019 1019 * Track the number of pages visited during this scan so that we can
1020 1020 * periodically measure our duty cycle.
1021 1021 */
1022 1022 pcount = 0;
1023 1023
1024 1024 if (pageout_sample_cnt < pageout_sample_lim) {
1025 1025 /*
1026 1026 * We need to measure the rate at which the system is able to
1027 1027 * scan pages of memory. Each of these initial samples is a
1028 1028 * scan of all system memory, regardless of whether or not we
1029 1029 * are experiencing memory pressure.
1030 1030 */
1031 1031 nscan_limit = total_pages;
1032 1032 sampling = true;
1033 1033 } else {
1034 1034 nscan_limit = desscan;
1035 1035 sampling = false;
1036 1036 }
1037 1037
1038 1038 sample_start = gethrtime();
1039 1039
1040 1040 /*
1041 1041 * Scan the appropriate number of pages for a single duty cycle.
1042 1042 */
1043 1043 while (nscan < nscan_limit) {
1044 1044 checkpage_result_t rvfront, rvback;
1045 1045
1046 1046 if (!sampling && freemem >= lotsfree + needfree) {
1047 1047 /*
1048 1048 * We are not sampling and enough memory has become
1049 1049 * available that scanning is no longer required.
1050 1050 */
1051 1051 break;
1052 1052 }
1053 1053
1054 1054 /*
1055 1055 * Periodically check to see if we have exceeded the CPU duty
1056 1056 * cycle for a single wakeup.
1057 1057 */
1058 1058 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1059 1059 pageout_cycle_nsec = gethrtime() - sample_start;
1060 1060 if (pageout_cycle_nsec >= pageout_nsec) {
1061 1061 ++pageout_timeouts;
1062 1062 break;
1063 1063 }
1064 1064 }
1065 1065
1066 1066 /*
1067 1067 * If checkpage manages to add a page to the free list,
1068 1068 * we give ourselves another couple of trips around the loop.
1069 1069 */
1070 1070 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1071 1071 laps = 0;
1072 1072 }
1073 1073 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1074 1074 laps = 0;
1075 1075 }
1076 1076
1077 1077 ++pcount;
1078 1078
1079 1079 /*
1080 1080 * Protected by pageout_mutex instead of cpu_stat_lock:
1081 1081 */
1082 1082 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1083 1083
1084 1084 /*
1085 1085 * Don't include ineligible pages in the number scanned.
1086 1086 */
1087 1087 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1088 1088 nscan++;
1089 1089 }
1090 1090
1091 1091 backhand = page_next(backhand);
1092 1092 fronthand = page_next(fronthand);
1093 1093
1094 1094 /*
1095 1095 * The front hand has wrapped around to the first page in the
1096 1096 * loop.
1097 1097 */
1098 1098 if (fronthand == page_first()) {
1099 1099 laps++;
1100 1100 DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);
1101 1101
1102 1102 /*
1103 1103 * Protected by pageout_mutex instead of cpu_stat_lock:
1104 1104 */
1105 1105 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1106 1106
1107 1107 if (laps > 1) {
1108 1108 /*
1109 1109 * Extremely unlikely, but it happens.
1110 1110 * We went around the loop at least once
1111 1111 * and didn't get far enough.
1112 1112 * If we are still skipping `highly shared'
1113 1113 * pages, skip fewer of them. Otherwise,
1114 1114 * give up till the next clock tick.
1115 1115 */
1116 1116 if (po_share < MAX_PO_SHARE) {
1117 1117 po_share <<= 1;
1118 1118 } else {
1119 1119 break;
1120 1120 }
1121 1121 }
1122 1122 }
1123 1123 }
1124 1124
1125 1125 sample_end = gethrtime();
1126 1126
1127 1127 DTRACE_PROBE1(pageout__end, uint_t, laps);
1128 1128
1129 1129 if (pageout_new_spread == 0) {
1130 1130 if (pageout_sample_cnt < pageout_sample_lim) {
1131 1131 /*
1132 1132 * Continue accumulating samples until we have enough
1133 1133 * to get a reasonable value for average scan rate:
1134 1134 */
1135 1135 pageout_sample_pages += pcount;
1136 1136 pageout_sample_etime += sample_end - sample_start;
1137 1137 ++pageout_sample_cnt;
1138 1138 }
1139 1139
1140 1140 if (pageout_sample_cnt >= pageout_sample_lim) {
1141 1141 /*
1142 1142 * We have enough samples, set the spread.
1143 1143 */
1144 1144 pageout_rate = (hrrate_t)pageout_sample_pages *
1145 1145 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1146 1146 pageout_new_spread = pageout_rate / 10;
1147 1147 setupclock();
1148 1148 }
1149 1149 }
1150 1150
1151 1151 goto loop;
1152 1152 }
1153 1153
1154 1154 /*
1155 1155 * The pageout deadman is run once per second by clock().
1156 1156 */
1157 1157 void
1158 1158 pageout_deadman(void)
1159 1159 {
1160 1160 if (panicstr != NULL) {
1161 1161 /*
1162 1162 * There is no pageout after panic.
1163 1163 */
1164 1164 return;
1165 1165 }
1166 1166
1167 1167 if (pageout_deadman_seconds == 0) {
1168 1168 /*
1169 1169 * The deadman is not enabled.
1170 1170 */
1171 1171 return;
1172 1172 }
1173 1173
1174 1174 if (!pageout_pushing) {
1175 1175 goto reset;
1176 1176 }
1177 1177
1178 1178 /*
1179 1179 * We are pushing a page. Check to see if it is the same call we saw
1180 1180 * last time we looked:
1181 1181 */
1182 1182 if (pageout_pushcount != pageout_pushcount_seen) {
1183 1183 /*
1184 1184 * It is a different call from the last check, so we are not
1185 1185 * stuck.
1186 1186 */
1187 1187 goto reset;
1188 1188 }
1189 1189
1190 1190 if (++pageout_stucktime >= pageout_deadman_seconds) {
1191 1191 panic("pageout_deadman: stuck pushing the same page for %d "
1192 1192 "seconds (freemem is %lu)", pageout_deadman_seconds,
1193 1193 freemem);
1194 1194 }
1195 1195
1196 1196 return;
1197 1197
1198 1198 reset:
1199 1199 /*
1200 1200 * Reset our tracking state to reflect that we are not stuck:
1201 1201 */
1202 1202 pageout_stucktime = 0;
1203 1203 pageout_pushcount_seen = pageout_pushcount;
1204 1204 }
1205 1205
1206 1206 /*
1207 1207 * Look at the page at hand. If it is locked (e.g., for physical i/o),
1208 1208 * system (u., page table) or free, then leave it alone. Otherwise,
1209 1209 * if we are running the front hand, turn off the page's reference bit.
1210 1210 * If the proc is over maxrss, we take it. If running the back hand,
1211 1211 * check whether the page has been reclaimed. If not, free the page,
1212 1212 * pushing it to disk first if necessary.
1213 1213 *
1214 1214 * Return values:
1215 1215 * CKP_INELIGIBLE if the page is not a candidate at all,
1216 1216 * CKP_NOT_FREED if the page was not freed, or
1217 1217 * CKP_FREED if we freed it.
1218 1218 */
1219 1219 static checkpage_result_t
1220 1220 checkpage(struct page *pp, pageout_hand_t whichhand)
1221 1221 {
1222 1222 int ppattr;
1223 1223 int isfs = 0;
1224 1224 int isexec = 0;
1225 1225 int pagesync_flag;
1226 1226
1227 1227 /*
1228 1228 * Skip pages:
1229 1229 * - associated with the kernel vnode since
1230 1230 * they are always "exclusively" locked.
1231 1231 * - that are free
1232 1232 * - that are shared more than po_share'd times
1233 1233 * - that are already locked
1234 1234 *
1235 1235 * NOTE: These optimizations assume that reads are atomic.
1236 1236 */
1237 1237
1238 1238 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1239 1239 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1240 1240 hat_page_checkshare(pp, po_share)) {
1241 1241 return (CKP_INELIGIBLE);
1242 1242 }
1243 1243
1244 1244 if (!page_trylock(pp, SE_EXCL)) {
1245 1245 /*
1246 1246 * Skip the page if we can't acquire the "exclusive" lock.
1247 1247 */
1248 1248 return (CKP_INELIGIBLE);
1249 1249 } else if (PP_ISFREE(pp)) {
1250 1250 /*
1251 1251 * It became free between the above check and our actually
1252 1252 * locking the page. Oh well, there will be other pages.
1253 1253 */
1254 1254 page_unlock(pp);
1255 1255 return (CKP_INELIGIBLE);
1256 1256 }
1257 1257
1258 1258 /*
1259 1259 * Reject pages that cannot be freed. The page_struct_lock
1260 1260 * need not be acquired to examine these
1261 1261 * fields since the page has an "exclusive" lock.
1262 1262 */
1263 1263 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1264 1264 page_unlock(pp);
1265 1265 return (CKP_INELIGIBLE);
1266 1266 }
1267 1267
1268 1268 /*
1269 1269 * Maintain statistics for what we are freeing
1270 1270 */
1271 1271 if (pp->p_vnode != NULL) {
1272 1272 if (pp->p_vnode->v_flag & VVMEXEC)
1273 1273 isexec = 1;
1274 1274
1275 1275 if (!IS_SWAPFSVP(pp->p_vnode))
1276 1276 isfs = 1;
1277 1277 }
1278 1278
1279 1279 /*
1280 1280 * Turn off REF and MOD bits with the front hand.
1281 1281 * The back hand examines the REF bit and always considers
1282 1282 * SHARED pages as referenced.
1283 1283 */
1284 1284 if (whichhand == POH_FRONT) {
1285 1285 pagesync_flag = HAT_SYNC_ZERORM;
1286 1286 } else {
1287 1287 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1288 1288 HAT_SYNC_STOPON_SHARED;
1289 1289 }
1290 1290
1291 1291 ppattr = hat_pagesync(pp, pagesync_flag);
1292 1292
1293 1293 recheck:
1294 1294 /*
1295 1295 * If the page is referenced, make it unreferenced but reclaimable.
1296 1296 * If this page is not referenced, then it must be reclaimable
1297 1297 * and we can add it to the free list.
1298 1298 */
1299 1299 if (ppattr & P_REF) {
1300 1300 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1301 1301 pageout_hand_t, whichhand);
1302 1302
1303 1303 if (whichhand == POH_FRONT) {
1304 1304 /*
1305 1305 * Checking of rss or madvise flags needed here...
1306 1306 *
1307 1307 * If not "well-behaved", fall through into the code
1308 1308 * for not referenced.
1309 1309 */
1310 1310 hat_clrref(pp);
1311 1311 }
1312 1312
1313 1313 /*
1314 1314 * Somebody referenced the page since the front
1315 1315 * hand went by, so it's not a candidate for
1316 1316 * freeing up.
1317 1317 */
1318 1318 page_unlock(pp);
1319 1319 return (CKP_NOT_FREED);
1320 1320 }
1321 1321
1322 1322 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1323 1323
1324 1324 /*
1325 1325 * If large page, attempt to demote it. If successfully demoted,
1326 1326 * retry the checkpage.
1327 1327 */
1328 1328 if (pp->p_szc != 0) {
1329 1329 if (!page_try_demote_pages(pp)) {
1330 1330 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1331 1331 page_unlock(pp);
1332 1332 return (CKP_INELIGIBLE);
1333 1333 }
1334 1334
1335 1335 ASSERT(pp->p_szc == 0);
1336 1336 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1337 1337
1338 1338 /*
1339 1339 * Since page_try_demote_pages() could have unloaded some
1340 1340 * mappings it makes sense to reload ppattr.
1341 1341 */
1342 1342 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1343 1343 }
1344 1344
1345 1345 /*
1346 1346 * If the page is currently dirty, we have to arrange to have it
1347 1347 * cleaned before it can be freed.
1348 1348 *
1349 1349 * XXX - ASSERT(pp->p_vnode != NULL);
1350 1350 */
1351 1351 if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1352 1352 struct vnode *vp = pp->p_vnode;
1353 1353 u_offset_t offset = pp->p_offset;
1354 1354
1355 1355 /*
1356 1356 * XXX - Test for process being swapped out or about to exit?
1357 1357 * [Can't get back to process(es) using the page.]
1358 1358 */
1359 1359
1360 1360 /*
1361 1361 * Hold the vnode before releasing the page lock to
1362 1362 * prevent it from being freed and re-used by some
1363 1363 * other thread.
1364 1364 */
1365 1365 VN_HOLD(vp);
1366 1366 page_unlock(pp);
1367 1367
1368 1368 /*
1369 1369 * Queue I/O request for the pageout thread.
1370 1370 */
1371 1371 if (!queue_io_request(vp, offset)) {
1372 1372 VN_RELE(vp);
1373 1373 return (CKP_NOT_FREED);
1374 1374 }
1375 1375 return (CKP_FREED);
1376 1376 }
1377 1377
1378 1378 /*
1379 1379 * Now we unload all the translations and put the page back on to the
1380 1380 * free list. If the page was used (referenced or modified) after the
1381 1381 * pagesync but before it was unloaded we catch it and handle the page
1382 1382 * properly.
1383 1383 */
1384 1384 DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1385 1385 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1386 1386 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1387 1387 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1388 1388 goto recheck;
1389 1389 }
1390 1390
1391 1391 VN_DISPOSE(pp, B_FREE, 0, kcred);
1392 1392
1393 1393 CPU_STATS_ADD_K(vm, dfree, 1);
1394 1394
1395 1395 if (isfs) {
1396 1396 if (isexec) {
1397 1397 CPU_STATS_ADD_K(vm, execfree, 1);
1398 1398 } else {
1399 1399 CPU_STATS_ADD_K(vm, fsfree, 1);
1400 1400 }
1401 1401 } else {
1402 1402 CPU_STATS_ADD_K(vm, anonfree, 1);
1403 1403 }
1404 1404
1405 1405 return (CKP_FREED);
1406 1406 }
1407 1407
1408 1408 /*
1409 1409 * Queue async i/o request from pageout_scanner and segment swapout
1410 1410 * routines on one common list. This ensures that pageout devices (swap)
1411 1411 * are not saturated by pageout_scanner or swapout requests.
1412 1412 * The pageout thread empties this list by initiating i/o operations.
1413 1413 */
1414 1414 int
1415 1415 queue_io_request(vnode_t *vp, u_offset_t off)
1416 1416 {
1417 1417 struct async_reqs *arg;
1418 1418
1419 1419 /*
1420 1420 * If we cannot allocate an async request struct,
1421 1421 * skip this page.
1422 1422 */
1423 1423 mutex_enter(&push_lock);
1424 1424 if ((arg = req_freelist) == NULL) {
1425 1425 mutex_exit(&push_lock);
1426 1426 return (0);
1427 1427 }
1428 1428 req_freelist = arg->a_next; /* adjust freelist */
1429 1429 push_list_size++;
1430 1430
1431 1431 arg->a_vp = vp;
1432 1432 arg->a_off = off;
1433 1433 arg->a_len = PAGESIZE;
1434 1434 arg->a_flags = B_ASYNC | B_FREE;
1435 1435 arg->a_cred = kcred; /* always held */
1436 1436
1437 1437 /*
1438 1438 * Add to list of pending write requests.
1439 1439 */
1440 1440 arg->a_next = push_list;
1441 1441 push_list = arg;
1442 1442
1443 1443 if (req_freelist == NULL) {
1444 1444 /*
1445 1445 * No free async requests left. The lock is held so we
1446 1446 * might as well signal the pusher thread now.
1447 1447 */
1448 1448 cv_signal(&push_cv);
1449 1449 }
1450 1450 mutex_exit(&push_lock);
1451 1451 return (1);
1452 1452 }
1453 1453
1454 1454 /*
1455 1455 * Wakeup pageout to initiate i/o if push_list is not empty.
1456 1456 */
1457 1457 void
1458 1458 cv_signal_pageout()
1459 1459 {
1460 1460 if (push_list != NULL) {
1461 1461 mutex_enter(&push_lock);
1462 1462 cv_signal(&push_cv);
1463 1463 mutex_exit(&push_lock);
1464 1464 }
1465 1465 }