1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2021 Oxide Computer Company
24 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 * Copyright 2018 Joyent, Inc.
31 */
32
33 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
34 /* All Rights Reserved */
35
36 /*
37 * University Copyright- Copyright (c) 1982, 1986, 1988
38 * The Regents of the University of California
39 * All Rights Reserved
40 *
41 * University Acknowledgment- Portions of this document are derived from
42 * software developed by the University of California, Berkeley, and its
43 * contributors.
44 */
45
46 #include <sys/types.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/buf.h>
50 #include <sys/uio.h>
51 #include <sys/proc.h>
52 #include <sys/systm.h>
53 #include <sys/mman.h>
54 #include <sys/cred.h>
55 #include <sys/vnode.h>
56 #include <sys/vm.h>
57 #include <sys/vmparam.h>
58 #include <sys/vtrace.h>
59 #include <sys/cmn_err.h>
60 #include <sys/cpuvar.h>
61 #include <sys/user.h>
62 #include <sys/kmem.h>
63 #include <sys/debug.h>
64 #include <sys/callb.h>
65 #include <sys/tnf_probe.h>
66 #include <sys/mem_cage.h>
67 #include <sys/time.h>
68 #include <sys/zone.h>
69 #include <sys/stdbool.h>
70
71 #include <vm/hat.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/page.h>
75 #include <vm/pvn.h>
76 #include <vm/seg_kmem.h>
77
78 /*
79 * FREE MEMORY MANAGEMENT
80 *
81 * Management of the pool of free pages is a tricky business. There are
82 * several critical threshold values which constrain our allocation of new
83 * pages and inform the rate of paging out of memory to swap. These threshold
84 * values, and the behaviour they induce, are described below in descending
85 * order of size -- and thus increasing order of severity!
86 *
87 * +---------------------------------------------------- physmem (all memory)
88 * |
89 * | Ordinarily there are no particular constraints placed on page
90 * v allocation. The page scanner is not running and page_create_va()
91 * | will effectively grant all page requests (whether from the kernel
92 * | or from user processes) without artificial delay.
93 * |
94 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
95 * |
96 * | When we have less than "lotsfree" pages, pageout_scanner() is
97 * v signalled by schedpaging() to begin looking for pages that can
98 * | be evicted to disk to bring us back above lotsfree. At this
99 * | stage there is still no constraint on allocation of free pages.
100 * |
101 * | For small systems, we set a lower bound of 16MB for lotsfree;
102 * v this is the natural value for a system with 1GB memory. This is
103 * | to ensure that the pageout reserve pool contains at least 4MB
104 * | for use by ZFS.
105 * |
106 * | For systems with a large amount of memory, we constrain lotsfree
107 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
108 * v at some point the required slack relates more closely to the
109 * | rate at which paging can occur than to the total amount of memory.
110 * |
111 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
112 * |
113 * | When we drop below desfree, a number of kernel facilities will
114 * v wait before allocating more memory, under the assumption that
115 * | pageout or reaping will make progress and free up some memory.
116 * | This behaviour is not especially coordinated; look for comparisons
117 * | of desfree and freemem.
118 * |
119 * | In addition to various attempts at advisory caution, clock()
120 * | will wake up the thread that is ordinarily parked in sched().
121 * | This routine is responsible for the heavy-handed swapping out
122 * v of entire processes in an attempt to arrest the slide of free
123 * | memory. See comments in sched.c for more details.
124 * |
125 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
126 * |
127 * | These two separate tunables have, by default, the same value.
128 * v Various parts of the kernel use minfree to signal the need for
129 * | more aggressive reclamation of memory, and sched() is more
130 * | aggressive at swapping processes out.
131 * |
132 * | If free memory falls below throttlefree, page_create_va() will
133 * | use page_create_throttle() to begin holding most requests for
134 * | new pages while pageout and reaping free up memory. Sleeping
135 * v allocations (e.g., KM_SLEEP) are held here while we wait for
136 * | more memory. Non-sleeping allocations are generally allowed to
137 * | proceed, unless their priority is explicitly lowered with
138 * | KM_NORMALPRI.
139 * |
140 * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
141 * |
142 * | When we hit throttlefree, the situation is already dire. The
143 * v system is generally paging out memory and swapping out entire
144 * | processes in order to free up memory for continued operation.
145 * |
146 * | Unfortunately, evicting memory to disk generally requires short
147 * | term use of additional memory; e.g., allocation of buffers for
148 * | storage drivers, updating maps of free and used blocks, etc.
149 * | As such, pageout_reserve is the number of pages that we keep in
150 * | special reserve for use by pageout() and sched() and by any
151 * v other parts of the kernel that need to be working for those to
152 * | make forward progress such as the ZFS I/O pipeline.
153 * |
154 * | When we are below pageout_reserve, we fail or hold any allocation
155 * | that has not explicitly requested access to the reserve pool.
156 * | Access to the reserve is generally granted via the KM_PUSHPAGE
157 * | flag, or by marking a thread T_PUSHPAGE such that all allocations
158 * | can implicitly tap the reserve. For more details, see the
159 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
160 * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
161 * |
162 * +---------------------------------------------------------- no free memory
163 * |
164 * | If we have arrived here, things are very bad indeed. It is
165 * v surprisingly difficult to tell if this condition is even fatal,
166 * | as enough memory may have been granted to pageout() and to the
167 * | ZFS I/O pipeline that requests for eviction that have already been
168 * | made will complete and free up memory some time soon.
169 * |
170 * | If free memory does not materialise, the system generally remains
171 * | deadlocked. The pageout_deadman() below is run once per second
172 * | from clock(), seeking to limit the amount of time a single request
173 * v to page out can be blocked before the system panics to get a crash
174 * | dump and return to service.
175 * |
176 * +-------------------------------------------------------------------------
177 */
178
179 /*
180 * The following parameters control operation of the page replacement
181 * algorithm. They are initialized to 0, and then computed at boot time based
182 * on the size of the system; see setupclock(). If they are patched non-zero
183 * in a loaded vmunix they are left alone and may thus be changed per system
184 * using "mdb -kw" on the loaded system.
185 */
186 pgcnt_t slowscan = 0;
187 pgcnt_t fastscan = 0;
188
189 static pgcnt_t handspreadpages = 0;
190
191 /*
192 * looppages:
193 * Cached copy of the total number of pages in the system (total_pages).
194 *
195 * loopfraction:
196 * Divisor used to relate fastscan to looppages in setupclock().
197 */
198 static uint_t loopfraction = 2;
199 static pgcnt_t looppages;
200
201 static uint_t min_percent_cpu = 4;
202 static uint_t max_percent_cpu = 80;
203 static pgcnt_t maxfastscan = 0;
204 static pgcnt_t maxslowscan = 100;
205
206 #define MEGABYTES (1024ULL * 1024ULL)
207
208 /*
209 * pageout_threshold_style:
210 * set to 1 to use the previous default threshold size calculation;
211 * i.e., each threshold is half of the next largest value.
212 */
213 uint_t pageout_threshold_style = 0;
214
215 /*
216 * The operator may override these tunables to request a different minimum or
217 * maximum lotsfree value, or to change the divisor we use for automatic
218 * sizing.
219 *
220 * By default, we make lotsfree 1/64th of the total memory in the machine. The
221 * minimum and maximum are specified in bytes, rather than pages; a zero value
222 * means the default values (below) are used.
223 */
224 uint_t lotsfree_fraction = 64;
225 pgcnt_t lotsfree_min = 0;
226 pgcnt_t lotsfree_max = 0;
227
228 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
229 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230
231 /*
232 * If these tunables are set to non-zero values in /etc/system, and provided
233 * the value is not larger than the threshold above, the specified value will
234 * be used directly without any additional calculation or adjustment. The boot
235 * time value of these overrides is preserved in the "clockinit" struct. More
236 * detail is available in the comment at the top of the file.
237 */
238 pgcnt_t maxpgio = 0;
239 pgcnt_t minfree = 0;
240 pgcnt_t desfree = 0;
241 pgcnt_t lotsfree = 0;
242 pgcnt_t needfree = 0;
243 pgcnt_t throttlefree = 0;
244 pgcnt_t pageout_reserve = 0;
245
246 pgcnt_t deficit;
247 pgcnt_t nscan;
248 pgcnt_t desscan;
249
250 /* kstats */
251 uint64_t low_mem_scan;
252 uint64_t zone_cap_scan;
253 uint64_t n_throttle;
254
255 /*
256 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
257 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
258 * that gives the equivalent of some underlying %CPU duty cycle.
259 *
260 * min_pageout_nsec:
261 * nanoseconds/wakeup equivalent of min_percent_cpu.
262 *
263 * max_pageout_nsec:
264 * nanoseconds/wakeup equivalent of max_percent_cpu.
265 *
266 * pageout_nsec:
267 * Number of nanoseconds budgeted for each wakeup cycle.
268 * Computed each time around by schedpaging().
269 * Varies between min_pageout_nsec and max_pageout_nsec,
270 * depending on memory pressure or zones over their cap.
271 *
272 * zone_pageout_nsec:
273 * Number of nanoseconds budget for each cycle when a zone
274 * is over its memory cap. If this is zero, then the value
275 * of max_pageout_nsec is used instead.
276 */
277
278 static hrtime_t min_pageout_nsec;
279 static hrtime_t max_pageout_nsec;
280 static hrtime_t pageout_nsec;
281 static hrtime_t zone_pageout_nsec;
282
283 #define MAX_PSCAN_THREADS 16
284 static boolean_t reset_hands[MAX_PSCAN_THREADS];
285
286 /*
287 * These can be tuned in /etc/system or set with mdb.
288 * 'des_page_scanners' is the desired number of page scanner threads. The
289 * system will bring the actual number of threads into line with the desired
290 * number. If des_page_scanners is set to an invalid value, the system will
291 * correct the setting.
292 */
293 uint_t des_page_scanners;
294 uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
295
296 uint_t n_page_scanners;
297 static pgcnt_t pscan_region_sz; /* informational only */
298
299 #define PAGES_POLL_MASK 1023
300
301 /*
302 * pageout_sample_lim:
303 * The limit on the number of samples needed to establish a value for new
304 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
305 * handspreadpages.
306 *
307 * pageout_sample_cnt:
308 * Current sample number. Once the sample gets large enough, set new
309 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
310 *
311 * pageout_sample_pages:
312 * The accumulated number of pages scanned during sampling.
313 *
314 * pageout_sample_etime:
315 * The accumulated nanoseconds for the sample.
316 *
317 * pageout_rate:
318 * Rate in pages/nanosecond, computed at the end of sampling.
319 *
320 * pageout_new_spread:
321 * Initially zero while the system scan rate is measured by
322 * pageout_scanner(), which then sets this value once per system boot after
323 * enough samples have been recorded (pageout_sample_cnt). Once set, this
324 * new value is used for fastscan and handspreadpages.
325 */
326
327 typedef hrtime_t hrrate_t;
328
329 static uint64_t pageout_sample_lim = 4;
330 static uint64_t pageout_sample_cnt = 0;
331 static pgcnt_t pageout_sample_pages = 0;
332 static hrrate_t pageout_rate = 0;
333 static pgcnt_t pageout_new_spread = 0;
334
335 static hrtime_t pageout_sample_etime = 0;
336
337 /* True if page scanner is first starting up */
338 #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
339
340 /*
341 * Record number of times a pageout_scanner() wakeup cycle finished because it
342 * timed out (exceeded its CPU budget), rather than because it visited
343 * its budgeted number of pages. This is only done when scanning under low
344 * free memory conditions, not when scanning for zones over their cap.
345 */
346 uint64_t pageout_timeouts = 0;
347
348 #ifdef VM_STATS
349 static struct pageoutvmstats_str {
350 ulong_t checkpage[3];
351 } pageoutvmstats;
352 #endif /* VM_STATS */
353
354 /*
355 * Threads waiting for free memory use this condition variable and lock until
356 * memory becomes available.
357 */
358 kmutex_t memavail_lock;
359 kcondvar_t memavail_cv;
360
361 typedef enum pageout_hand {
362 POH_FRONT = 1,
363 POH_BACK,
364 } pageout_hand_t;
365
366 typedef enum {
367 CKP_INELIGIBLE,
368 CKP_NOT_FREED,
369 CKP_FREED,
370 } checkpage_result_t;
371
372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
373
374 static struct clockinit {
375 bool ci_init;
376 pgcnt_t ci_lotsfree_min;
377 pgcnt_t ci_lotsfree_max;
378 pgcnt_t ci_lotsfree;
379 pgcnt_t ci_desfree;
380 pgcnt_t ci_minfree;
381 pgcnt_t ci_throttlefree;
382 pgcnt_t ci_pageout_reserve;
383 pgcnt_t ci_maxpgio;
384 pgcnt_t ci_maxfastscan;
385 pgcnt_t ci_fastscan;
386 pgcnt_t ci_slowscan;
387 pgcnt_t ci_handspreadpages;
388 } clockinit = { .ci_init = false };
389
390 static pgcnt_t
391 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
392 {
393 if (value < minimum) {
394 return (minimum);
395 } else if (value > maximum) {
396 return (maximum);
397 } else {
398 return (value);
399 }
400 }
401
402 static pgcnt_t
403 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
404 {
405 if (initval == 0 || initval >= initval_ceiling) {
406 return (defval);
407 } else {
408 return (initval);
409 }
410 }
411
412 /*
413 * Local boolean to control scanning when zones are over their cap. Avoids
414 * accessing the zone_num_over_cap variable except within schedpaging(), which
415 * only runs periodically. This is here only to reduce our access to
416 * zone_num_over_cap, since it is already accessed a lot during paging, and
417 * the page scanner accesses the zones_over variable on each page during a
418 * scan. There is no lock needed for zone_num_over_cap since schedpaging()
419 * doesn't modify the variable, it only cares if the variable is 0 or non-0.
420 */
421 static boolean_t zones_over = B_FALSE;
422
423 /*
424 * Set up the paging constants for the clock algorithm used by
425 * pageout_scanner(), and by the virtual memory system overall. See the
426 * comments at the top of this file for more information about the threshold
427 * values and system responses to memory pressure.
428 *
429 * This routine is called once by main() at startup, after the initial size of
430 * physical memory is determined. It may be called again later if memory is
431 * added to or removed from the system, or if new measurements of the page scan
432 * rate become available.
433 */
434 void
435 setupclock(void)
436 {
437 uint_t i;
438 pgcnt_t sz, tmp;
439 pgcnt_t defval;
440 bool half = (pageout_threshold_style == 1);
441 bool recalc = true;
442
443 looppages = total_pages;
444
445 /*
446 * The operator may have provided specific values for some of the
447 * tunables via /etc/system. On our first call, we preserve those
448 * values so that they can be used for subsequent recalculations.
449 *
450 * A value of zero for any tunable means we will use the default
451 * sizing.
452 */
453
454 if (!clockinit.ci_init) {
455 clockinit.ci_init = true;
456
457 clockinit.ci_lotsfree_min = lotsfree_min;
458 clockinit.ci_lotsfree_max = lotsfree_max;
459 clockinit.ci_lotsfree = lotsfree;
460 clockinit.ci_desfree = desfree;
461 clockinit.ci_minfree = minfree;
462 clockinit.ci_throttlefree = throttlefree;
463 clockinit.ci_pageout_reserve = pageout_reserve;
464 clockinit.ci_maxpgio = maxpgio;
465 clockinit.ci_maxfastscan = maxfastscan;
466 clockinit.ci_fastscan = fastscan;
467 clockinit.ci_slowscan = slowscan;
468 clockinit.ci_handspreadpages = handspreadpages;
469
470 /*
471 * The first call does not trigger a recalculation, only
472 * subsequent calls.
473 */
474 recalc = false;
475 }
476
477 /*
478 * Configure paging threshold values. For more details on what each
479 * threshold signifies, see the comments at the top of this file.
480 */
481 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
482 btop(LOTSFREE_MAX_DEFAULT));
483 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
484 btop(LOTSFREE_MIN_DEFAULT));
485
486 lotsfree = tune(clockinit.ci_lotsfree, looppages,
487 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
488
489 desfree = tune(clockinit.ci_desfree, lotsfree,
490 lotsfree / 2);
491
492 minfree = tune(clockinit.ci_minfree, desfree,
493 half ? desfree / 2 : 3 * desfree / 4);
494
495 throttlefree = tune(clockinit.ci_throttlefree, desfree,
496 minfree);
497
498 pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
499 half ? throttlefree / 2 : 3 * throttlefree / 4);
500
501 /*
502 * Maxpgio thresholds how much paging is acceptable.
503 * This figures that 2/3 busy on an arm is all that is
504 * tolerable for paging. We assume one operation per disk rev.
505 *
506 * XXX - Does not account for multiple swap devices.
507 */
508 if (clockinit.ci_maxpgio == 0) {
509 maxpgio = (DISKRPM * 2) / 3;
510 } else {
511 maxpgio = clockinit.ci_maxpgio;
512 }
513
514 /*
515 * The clock scan rate varies between fastscan and slowscan
516 * based on the amount of free memory available. Fastscan
517 * rate should be set based on the number pages that can be
518 * scanned per sec using ~10% of processor time. Since this
519 * value depends on the processor, MMU, Mhz etc., it is
520 * difficult to determine it in a generic manner for all
521 * architectures.
522 *
523 * Instead of trying to determine the number of pages scanned
524 * per sec for every processor, fastscan is set to be the smaller
525 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
526 * time is limited to ~4% of processor time.
527 *
528 * Setting fastscan to be 1/2 of memory allows pageout to scan
529 * all of memory in ~2 secs. This implies that user pages not
530 * accessed within 1 sec (assuming, handspreadpages == fastscan)
531 * can be reclaimed when free memory is very low. Stealing pages
532 * not accessed within 1 sec seems reasonable and ensures that
533 * active user processes don't thrash.
534 *
535 * Smaller values of fastscan result in scanning fewer pages
536 * every second and consequently pageout may not be able to free
537 * sufficient memory to maintain the minimum threshold. Larger
538 * values of fastscan result in scanning a lot more pages which
539 * could lead to thrashing and higher CPU usage.
540 *
541 * Fastscan needs to be limited to a maximum value and should not
542 * scale with memory to prevent pageout from consuming too much
543 * time for scanning on slow CPU's and avoid thrashing, as a
544 * result of scanning too many pages, on faster CPU's.
545 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
546 * (the upper bound for fastscan) based on the average number
547 * of pages that can potentially be scanned in ~1 sec (using ~4%
548 * of the CPU) on some of the following machines that currently
549 * run Solaris 2.x:
550 *
551 * average memory scanned in ~1 sec
552 *
553 * 25 Mhz SS1+: 23 Meg
554 * LX: 37 Meg
555 * 50 Mhz SC2000: 68 Meg
556 *
557 * 40 Mhz 486: 26 Meg
558 * 66 Mhz 486: 42 Meg
559 *
560 * When free memory falls just below lotsfree, the scan rate
561 * goes from 0 to slowscan (i.e., pageout starts running). This
562 * transition needs to be smooth and is achieved by ensuring that
563 * pageout scans a small number of pages to satisfy the transient
564 * memory demand. This is set to not exceed 100 pages/sec (25 per
565 * wakeup) since scanning that many pages has no noticible impact
566 * on system performance.
567 *
568 * In addition to setting fastscan and slowscan, pageout is
569 * limited to using ~4% of the CPU. This results in increasing
570 * the time taken to scan all of memory, which in turn means that
571 * user processes have a better opportunity of preventing their
572 * pages from being stolen. This has a positive effect on
573 * interactive and overall system performance when memory demand
574 * is high.
575 *
576 * Thus, the rate at which pages are scanned for replacement will
577 * vary linearly between slowscan and the number of pages that
578 * can be scanned using ~4% of processor time instead of varying
579 * linearly between slowscan and fastscan.
580 *
581 * Also, the processor time used by pageout will vary from ~1%
582 * at slowscan to ~4% at fastscan instead of varying between
583 * ~1% at slowscan and ~10% at fastscan.
584 *
585 * The values chosen for the various VM parameters (fastscan,
586 * handspreadpages, etc) are not universally true for all machines,
587 * but appear to be a good rule of thumb for the machines we've
588 * tested. They have the following ranges:
589 *
590 * cpu speed: 20 to 70 Mhz
591 * page size: 4K to 8K
592 * memory size: 16M to 5G
593 * page scan rate: 4000 - 17400 4K pages per sec
594 *
595 * The values need to be re-examined for machines which don't
596 * fall into the various ranges (e.g., slower or faster CPUs,
597 * smaller or larger pagesizes etc) shown above.
598 *
599 * On an MP machine, pageout is often unable to maintain the
600 * minimum paging thresholds under heavy load. This is due to
601 * the fact that user processes running on other CPU's can be
602 * dirtying memory at a much faster pace than pageout can find
603 * pages to free. The memory demands could be met by enabling
604 * more than one CPU to run the clock algorithm in such a manner
605 * that the various clock hands don't overlap. This also makes
606 * it more difficult to determine the values for fastscan, slowscan
607 * and handspreadpages.
608 *
609 * The swapper is currently used to free up memory when pageout
610 * is unable to meet memory demands by swapping out processes.
611 * In addition to freeing up memory, swapping also reduces the
612 * demand for memory by preventing user processes from running
613 * and thereby consuming memory.
614 */
615 if (clockinit.ci_maxfastscan == 0) {
616 if (pageout_new_spread != 0) {
617 maxfastscan = pageout_new_spread;
618 } else {
619 maxfastscan = MAXHANDSPREADPAGES;
620 }
621 } else {
622 maxfastscan = clockinit.ci_maxfastscan;
623 }
624
625 if (clockinit.ci_fastscan == 0) {
626 fastscan = MIN(looppages / loopfraction, maxfastscan);
627 } else {
628 fastscan = clockinit.ci_fastscan;
629 }
630
631 if (fastscan > looppages / loopfraction) {
632 fastscan = looppages / loopfraction;
633 }
634
635 /*
636 * Set slow scan time to 1/10 the fast scan time, but
637 * not to exceed maxslowscan.
638 */
639 if (clockinit.ci_slowscan == 0) {
640 slowscan = MIN(fastscan / 10, maxslowscan);
641 } else {
642 slowscan = clockinit.ci_slowscan;
643 }
644
645 if (slowscan > fastscan / 2) {
646 slowscan = fastscan / 2;
647 }
648
649 /*
650 * Handspreadpages is distance (in pages) between front and back
651 * pageout daemon hands. The amount of time to reclaim a page
652 * once pageout examines it increases with this distance and
653 * decreases as the scan rate rises. It must be < the amount
654 * of pageable memory.
655 *
656 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
657 * to be "fastscan" results in the front hand being a few secs
658 * (varies based on the processor speed) ahead of the back hand
659 * at fastscan rates. This distance can be further reduced, if
660 * necessary, by increasing the processor time used by pageout
661 * to be more than ~4% and preferrably not more than ~10%.
662 *
663 * As a result, user processes have a much better chance of
664 * referencing their pages before the back hand examines them.
665 * This also significantly lowers the number of reclaims from
666 * the freelist since pageout does not end up freeing pages which
667 * may be referenced a sec later.
668 */
669 if (clockinit.ci_handspreadpages == 0) {
670 handspreadpages = fastscan;
671 } else {
672 handspreadpages = clockinit.ci_handspreadpages;
673 }
674
675 /*
676 * Make sure that back hand follows front hand by at least
677 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
678 * back hand to look at a page during the same wakeup of the pageout
679 * daemon in which the front hand cleared its ref bit.
680 */
681 if (handspreadpages >= looppages) {
682 handspreadpages = looppages - 1;
683 }
684
685 if (!recalc) {
686 /*
687 * Setup basic values at initialization.
688 */
689 pscan_region_sz = total_pages;
690 des_page_scanners = n_page_scanners = 1;
691 reset_hands[0] = B_TRUE;
692 return;
693 }
694
695 /*
696 * Recalculating
697 *
698 * We originally set the number of page scanners to 1. Now that we
699 * know what the handspreadpages is for a scanner, figure out how many
700 * scanners we should run. We want to ensure that the regions don't
701 * overlap and that they are not touching.
702 *
703 * A default 64GB region size is used as the initial value to calculate
704 * how many scanner threads we should create on lower memory systems.
705 * The idea is to limit the number of threads to a practical value
706 * (e.g. a 64GB machine really only needs one scanner thread). For very
707 * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
708 * threads.
709 *
710 * The scanner threads themselves are evenly spread out around the
711 * memory "clock" in pageout_scanner when we reset the hands, and each
712 * thread will scan all of memory.
713 */
714 sz = (btop(64ULL * 0x40000000ULL));
715 if (sz < handspreadpages) {
716 /*
717 * 64GB is smaller than the separation between the front
718 * and back hands; use double handspreadpages.
719 */
720 sz = handspreadpages << 1;
721 }
722 if (sz > total_pages) {
723 sz = total_pages;
724 }
725 /* Record region size for inspection with mdb, otherwise unused */
726 pscan_region_sz = sz;
727
728 tmp = sz;
729 for (i = 1; tmp < total_pages; i++) {
730 tmp += sz;
731 }
732
733 if (i > MAX_PSCAN_THREADS)
734 i = MAX_PSCAN_THREADS;
735
736 des_page_scanners = i;
737 }
738
739 /*
740 * Pageout scheduling.
741 *
742 * Schedpaging controls the rate at which the page out daemon runs by
743 * setting the global variables nscan and desscan SCHEDPAGING_HZ
744 * times a second. Nscan records the number of pages pageout has examined
745 * in its current pass; schedpaging() resets this value to zero each time
746 * it runs. Desscan records the number of pages pageout should examine
747 * in its next pass; schedpaging() sets this value based on the amount of
748 * currently available memory.
749 */
750 #define SCHEDPAGING_HZ 4
751
752 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
753
754 /*
755 * Pool of available async pageout putpage requests.
756 */
757 static struct async_reqs *push_req;
758 static struct async_reqs *req_freelist; /* available req structs */
759 static struct async_reqs *push_list; /* pending reqs */
760 static kmutex_t push_lock; /* protects req pool */
761 static kcondvar_t push_cv;
762
763 /*
764 * If pageout() is stuck on a single push for this many seconds,
765 * pageout_deadman() will assume the system has hit a memory deadlock. If set
766 * to 0, the deadman will have no effect.
767 *
768 * Note that we are only looking for stalls in the calls that pageout() makes
769 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
770 * I/O, which should not take long unless the underlying strategy call blocks
771 * indefinitely for memory. The actual I/O request happens (or fails) later.
772 */
773 uint_t pageout_deadman_seconds = 90;
774
775 static uint_t pageout_stucktime = 0;
776 static bool pageout_pushing = false;
777 static uint64_t pageout_pushcount = 0;
778 static uint64_t pageout_pushcount_seen = 0;
779
780 static int async_list_size = 256; /* number of async request structs */
781
782 static void pageout_scanner(void *);
783
784 /*
785 * If a page is being shared more than "po_share" times
786 * then leave it alone- don't page it out.
787 */
788 #define MIN_PO_SHARE (8)
789 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
790 ulong_t po_share = MIN_PO_SHARE;
791
792 /*
793 * Schedule rate for paging.
794 * Rate is linear interpolation between
795 * slowscan with lotsfree and fastscan when out of memory.
796 */
797 static void
798 schedpaging(void *arg)
799 {
800 spgcnt_t vavail;
801
802 if (freemem < lotsfree + needfree + kmem_reapahead)
803 kmem_reap();
804
805 if (freemem < lotsfree + needfree)
806 seg_preap();
807
808 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
809 kcage_cageout_wakeup();
810
811 (void) atomic_swap_ulong(&nscan, 0);
812 vavail = freemem - deficit;
813 if (pageout_new_spread != 0)
814 vavail -= needfree;
815 if (vavail < 0)
816 vavail = 0;
817 if (vavail > lotsfree)
818 vavail = lotsfree;
819
820 /*
821 * Fix for 1161438 (CRS SPR# 73922). All variables
822 * in the original calculation for desscan were 32 bit signed
823 * ints. As freemem approaches 0x0 on a system with 1 Gig or
824 * more of memory, the calculation can overflow. When this
825 * happens, desscan becomes negative and pageout_scanner()
826 * stops paging out.
827 */
828 if (needfree > 0 && pageout_new_spread == 0) {
829 /*
830 * If we've not yet collected enough samples to
831 * calculate a spread, kick into high gear anytime
832 * needfree is non-zero. Note that desscan will not be
833 * the limiting factor for systems with larger memory;
834 * the %CPU will limit the scan. That will also be
835 * maxed out below.
836 */
837 desscan = fastscan / SCHEDPAGING_HZ;
838 } else {
839 /*
840 * Once we've calculated a spread based on system
841 * memory and usage, just treat needfree as another
842 * form of deficit.
843 */
844 spgcnt_t faststmp, slowstmp, result;
845
846 slowstmp = slowscan * vavail;
847 faststmp = fastscan * (lotsfree - vavail);
848 result = (slowstmp + faststmp) /
849 nz(lotsfree) / SCHEDPAGING_HZ;
850 desscan = (pgcnt_t)result;
851 }
852
853 /*
854 * If we've not yet collected enough samples to calculate a
855 * spread, also kick %CPU to the max.
856 */
857 if (pageout_new_spread == 0) {
858 pageout_nsec = max_pageout_nsec;
859 } else {
860 pageout_nsec = min_pageout_nsec +
861 (lotsfree - vavail) *
862 (max_pageout_nsec - min_pageout_nsec) /
863 nz(lotsfree);
864 }
865
866 if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
867 /*
868 * We have finished the pagescan initialization and the desired
869 * number of page scanners has changed, either because
870 * initialization just finished, because of a memory DR, or
871 * because des_page_scanners has been modified on the fly (i.e.
872 * by mdb). If we need more scanners, start them now, otherwise
873 * the excess scanners will terminate on their own when they
874 * reset their hands.
875 */
876 uint_t i;
877 uint_t curr_nscan = n_page_scanners;
878 pgcnt_t max = total_pages / handspreadpages;
879
880 if (des_page_scanners > max)
881 des_page_scanners = max;
882
883 if (des_page_scanners > MAX_PSCAN_THREADS) {
884 des_page_scanners = MAX_PSCAN_THREADS;
885 } else if (des_page_scanners == 0) {
886 des_page_scanners = 1;
887 }
888
889 /*
890 * Each thread has its own entry in the reset_hands array, so
891 * we don't need any locking in pageout_scanner to check the
892 * thread's reset_hands entry. Thus, we use a pre-allocated
893 * fixed size reset_hands array and upper limit on the number
894 * of pagescan threads.
895 *
896 * The reset_hands entries need to be true before we start new
897 * scanners, but if we're reducing, we don't want a race on the
898 * recalculation for the existing threads, so we set
899 * n_page_scanners first.
900 */
901 n_page_scanners = des_page_scanners;
902 for (i = 0; i < MAX_PSCAN_THREADS; i++) {
903 reset_hands[i] = B_TRUE;
904 }
905
906 if (des_page_scanners > curr_nscan) {
907 /* Create additional pageout scanner threads. */
908 for (i = curr_nscan; i < des_page_scanners; i++) {
909 (void) lwp_kernel_create(proc_pageout,
910 pageout_scanner, (void *)(uintptr_t)i,
911 TS_RUN, curthread->t_pri);
912 }
913 }
914 }
915
916 zones_over = B_FALSE;
917
918 if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
919 if (!PAGE_SCAN_STARTUP)
920 low_mem_scan++;
921 /*
922 * Either we need more memory, or we still need to
923 * measure the average scan rate. Wake the scanner.
924 */
925 DTRACE_PROBE(schedpage__wake__low);
926 WAKE_PAGEOUT_SCANNER();
927
928 } else if (zone_num_over_cap > 0) {
929 /* One or more zones are over their cap. */
930
931 /* No page limit */
932 desscan = total_pages;
933
934 /*
935 * Increase the scanning CPU% to the max. This implies
936 * 80% of one CPU/sec if the scanner can run each
937 * opportunity. Can also be tuned via setting
938 * zone_pageout_nsec in /etc/system or with mdb.
939 */
940 pageout_nsec = (zone_pageout_nsec != 0) ?
941 zone_pageout_nsec : max_pageout_nsec;
942
943 zones_over = B_TRUE;
944 zone_cap_scan++;
945
946 DTRACE_PROBE(schedpage__wake__zone);
947 WAKE_PAGEOUT_SCANNER();
948
949 } else {
950 /*
951 * There are enough free pages, no need to
952 * kick the scanner thread. And next time
953 * around, keep more of the `highly shared'
954 * pages.
955 */
956 cv_signal_pageout();
957
958 mutex_enter(&pageout_mutex);
959 if (po_share > MIN_PO_SHARE) {
960 po_share >>= 1;
961 }
962 mutex_exit(&pageout_mutex);
963 }
964
965 /*
966 * Signal threads waiting for available memory.
967 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
968 * in this case it is not needed - the waiters will be waken up during
969 * the next invocation of this function.
970 */
971 if (kmem_avail() > 0)
972 cv_broadcast(&memavail_cv);
973
974 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
975 }
976
977 pgcnt_t pushes;
978 ulong_t push_list_size; /* # of requests on pageout queue */
979
980 /*
981 * Paging out should always be enabled. This tunable exists to hold pageout
982 * for debugging purposes. If set to 0, pageout_scanner() will go back to
983 * sleep each time it is woken by schedpaging().
984 */
985 uint_t dopageout = 1;
986
987 /*
988 * The page out daemon, which runs as process 2.
989 *
990 * Page out occurs when either:
991 * a) there is less than lotsfree pages,
992 * b) there are one or more zones over their physical memory cap.
993 *
994 * The daemon treats physical memory as a circular array of pages and scans the
995 * pages using a 'two-handed clock' algorithm. The front hand moves through
996 * the pages, clearing the reference bit. The back hand travels a distance
997 * (handspreadpages) behind the front hand, freeing the pages that have not
998 * been referenced in the time since the front hand passed. If modified, they
999 * are first written to their backing store before being freed.
1000 *
1001 * In order to make page invalidation more responsive on machines with larger
1002 * memory, multiple pageout_scanner threads may be created. In this case, the
1003 * threads are evenly distributed around the the memory "clock face" so that
1004 * memory can be reclaimed more quickly (that is, there can be large regions in
1005 * which no pages can be reclaimed by a single thread, leading to lag which
1006 * causes undesirable behavior such as htable stealing).
1007 *
1008 * As long as there are at least lotsfree pages, or no zones over their cap,
1009 * then pageout_scanner threads are not run. When pageout_scanner threads are
1010 * running for case (a), all pages are considered for pageout. For case (b),
1011 * only pages belonging to a zone over its cap will be considered for pageout.
1012 *
1013 * There are multiple threads that act on behalf of the pageout process.
1014 * A set of threads scan pages (pageout_scanner) and frees them up if
1015 * they don't require any VOP_PUTPAGE operation. If a page must be
1016 * written back to its backing store, the request is put on a list
1017 * and the other (pageout) thread is signaled. The pageout thread
1018 * grabs VOP_PUTPAGE requests from the list, and processes them.
1019 * Some filesystems may require resources for the VOP_PUTPAGE
1020 * operations (like memory) and hence can block the pageout
1021 * thread, but the pageout_scanner threads can still operate. There is still
1022 * no guarantee that memory deadlocks cannot occur.
1023 *
1024 * The pageout_scanner parameters are determined in schedpaging().
1025 */
1026 void
1027 pageout()
1028 {
1029 struct async_reqs *arg;
1030 pri_t pageout_pri;
1031 int i;
1032 pgcnt_t max_pushes;
1033 callb_cpr_t cprinfo;
1034
1035 proc_pageout = ttoproc(curthread);
1036 proc_pageout->p_cstime = 0;
1037 proc_pageout->p_stime = 0;
1038 proc_pageout->p_cutime = 0;
1039 proc_pageout->p_utime = 0;
1040 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1041 bcopy("pageout", PTOU(curproc)->u_comm, 7);
1042
1043 /*
1044 * Create pageout scanner thread
1045 */
1046 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1047 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1048
1049 /*
1050 * Allocate and initialize the async request structures
1051 * for pageout.
1052 */
1053 push_req = (struct async_reqs *)
1054 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055
1056 req_freelist = push_req;
1057 for (i = 0; i < async_list_size - 1; i++) {
1058 push_req[i].a_next = &push_req[i + 1];
1059 }
1060
1061 pageout_pri = curthread->t_pri;
1062
1063 /* Create the (first) pageout scanner thread. */
1064 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065 pageout_pri - 1);
1066
1067 /*
1068 * kick off pageout scheduler.
1069 */
1070 schedpaging(NULL);
1071
1072 /*
1073 * Create kernel cage thread.
1074 * The kernel cage thread is started under the pageout process
1075 * to take advantage of the less restricted page allocation
1076 * in page_create_throttle().
1077 */
1078 kcage_cageout_init();
1079
1080 /*
1081 * Limit pushes to avoid saturating pageout devices.
1082 */
1083 max_pushes = maxpgio / SCHEDPAGING_HZ;
1084 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1085
1086 for (;;) {
1087 mutex_enter(&push_lock);
1088
1089 while ((arg = push_list) == NULL || pushes > max_pushes) {
1090 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1091 cv_wait(&push_cv, &push_lock);
1092 pushes = 0;
1093 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094 }
1095 push_list = arg->a_next;
1096 arg->a_next = NULL;
1097 pageout_pushing = true;
1098 mutex_exit(&push_lock);
1099
1100 DTRACE_PROBE(pageout__push);
1101 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103 pushes++;
1104 }
1105
1106 /* vp held by checkpage() */
1107 VN_RELE(arg->a_vp);
1108
1109 mutex_enter(&push_lock);
1110 pageout_pushing = false;
1111 pageout_pushcount++;
1112 arg->a_next = req_freelist; /* back on freelist */
1113 req_freelist = arg;
1114 push_list_size--;
1115 mutex_exit(&push_lock);
1116 }
1117 }
1118
1119 /*
1120 * Kernel thread that scans pages looking for ones to free
1121 */
1122 static void
1123 pageout_scanner(void *a)
1124 {
1125 struct page *fronthand, *backhand;
1126 uint_t laps, iter = 0;
1127 callb_cpr_t cprinfo;
1128 pgcnt_t nscan_cnt, nscan_limit;
1129 pgcnt_t pcount;
1130 uint_t inst = (uint_t)(uintptr_t)a;
1131 hrtime_t sample_start, sample_end;
1132 kmutex_t pscan_mutex;
1133 bool sampling;
1134
1135 VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136
1137 mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
1138
1139 CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140 mutex_enter(&pscan_mutex);
1141
1142 /*
1143 * Establish the minimum and maximum length of time to be spent
1144 * scanning pages per wakeup, limiting the scanner duty cycle. The
1145 * input percentage values (0-100) must be converted to a fraction of
1146 * the number of nanoseconds in a second of wall time, then further
1147 * scaled down by the number of scanner wakeups in a second:
1148 */
1149 min_pageout_nsec = MAX(1,
1150 NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151 max_pageout_nsec = MAX(min_pageout_nsec,
1152 NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
1153
1154 loop:
1155 cv_signal_pageout();
1156
1157 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158 cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159 CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
1160
1161 /*
1162 * Check if pageout has been disabled for debugging purposes:
1163 */
1164 if (!dopageout) {
1165 goto loop;
1166 }
1167
1168 /*
1169 * One may reset the clock hands for debugging purposes. Hands will
1170 * also be reset if memory is added to or removed from the system.
1171 */
1172 if (reset_hands[inst]) {
1173 struct page *first;
1174 pgcnt_t offset = total_pages / n_page_scanners;
1175
1176 reset_hands[inst] = B_FALSE;
1177 if (inst >= n_page_scanners) {
1178 /*
1179 * The desired number of page scanners has been
1180 * reduced and this instance is no longer wanted.
1181 * Exit the lwp.
1182 */
1183 VERIFY3U(inst, !=, 0);
1184 mutex_exit(&pscan_mutex);
1185 mutex_enter(&curproc->p_lock);
1186 lwp_exit();
1187 }
1188
1189 /*
1190 * The reset case repositions the hands at the proper place
1191 * on the memory clock face to prevent creep into another
1192 * thread's active region or when the number of threads has
1193 * changed.
1194 *
1195 * Set the two clock hands to be separated by a reasonable
1196 * amount, but no more than 360 degrees apart.
1197 *
1198 * If inst == 0, backhand starts at first page, otherwise
1199 * it is (inst * offset) around the memory "clock face" so that
1200 * we spread out each scanner instance evenly.
1201 */
1202 first = page_first();
1203 backhand = page_nextn(first, offset * inst);
1204 if (handspreadpages >= total_pages) {
1205 fronthand = page_nextn(backhand, total_pages - 1);
1206 } else {
1207 fronthand = page_nextn(backhand, handspreadpages);
1208 }
1209 }
1210
1211 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212
1213 /*
1214 * Keep track of the number of times we have scanned all the way around
1215 * the loop:
1216 */
1217 laps = 0;
1218
1219 /*
1220 * Track the number of pages visited during this scan so that we can
1221 * periodically measure our duty cycle.
1222 */
1223 pcount = 0;
1224 nscan_cnt = 0;
1225
1226 if (PAGE_SCAN_STARTUP) {
1227 /*
1228 * We need to measure the rate at which the system is able to
1229 * scan pages of memory. Each of these initial samples is a
1230 * scan of all system memory, regardless of whether or not we
1231 * are experiencing memory pressure.
1232 */
1233 nscan_limit = total_pages;
1234 sampling = true;
1235 } else {
1236 nscan_limit = desscan;
1237 sampling = false;
1238 }
1239
1240 DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241 page_t *, backhand, page_t *, fronthand);
1242
1243 sample_start = gethrtime();
1244
1245 /*
1246 * Scan the appropriate number of pages for a single duty cycle.
1247 * Only scan while at least one of these is true:
1248 * 1) one or more zones is over its cap
1249 * 2) there is not enough free memory
1250 * 3) during page scan startup when determining sample data
1251 */
1252 while (nscan_cnt < nscan_limit) {
1253 checkpage_result_t rvfront, rvback;
1254
1255 if (!sampling && !zones_over &&
1256 freemem >= lotsfree + needfree) {
1257 /*
1258 * We are not sampling and enough memory has become
1259 * available that scanning is no longer required.
1260 */
1261 break;
1262 }
1263
1264 DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
1265
1266 /*
1267 * Periodically check to see if we have exceeded the CPU duty
1268 * cycle for a single wakeup.
1269 */
1270 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271 hrtime_t pageout_cycle_nsec;
1272
1273 pageout_cycle_nsec = gethrtime() - sample_start;
1274 if (pageout_cycle_nsec >= pageout_nsec) {
1275 /*
1276 * This is where we normally break out of the
1277 * loop when scanning zones or sampling.
1278 */
1279 if (!zones_over) {
1280 atomic_inc_64(&pageout_timeouts);
1281 }
1282 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283 break;
1284 }
1285 }
1286
1287 /*
1288 * If checkpage manages to add a page to the free list,
1289 * we give ourselves another couple of trips around the loop.
1290 */
1291 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292 laps = 0;
1293 }
1294 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295 laps = 0;
1296 }
1297
1298 ++pcount;
1299
1300 /*
1301 * This CPU kstat is only incremented here and we're obviously
1302 * on this CPU, so no lock.
1303 */
1304 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305
1306 /*
1307 * Don't include ineligible pages in the number scanned.
1308 */
1309 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310 nscan_cnt++;
1311 }
1312
1313 backhand = page_next(backhand);
1314 fronthand = page_next(fronthand);
1315
1316 /*
1317 * The front hand has wrapped around to the first page in the
1318 * loop.
1319 */
1320 if (fronthand == page_first()) {
1321 DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
1322
1323 /*
1324 * Every 64 wraps we reposition our hands within our
1325 * region to prevent creep into another thread.
1326 */
1327 if ((++iter % pageout_reset_cnt) == 0)
1328 reset_hands[inst] = B_TRUE;
1329
1330 /*
1331 * This CPU kstat is only incremented here and we're
1332 * obviously on this CPU, so no lock.
1333 */
1334 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335
1336 /*
1337 * If scanning because the system is low on memory,
1338 * then when we wraparound memory we want to try to
1339 * reclaim more pages.
1340 * If scanning only because zones are over their cap,
1341 * then wrapping is common and we simply keep going.
1342 */
1343 if (freemem < lotsfree + needfree && ++laps > 1) {
1344 /*
1345 * The system is low on memory.
1346 * Extremely unlikely, but it happens.
1347 * We went around the loop at least once
1348 * and didn't get far enough.
1349 * If we are still skipping `highly shared'
1350 * pages, skip fewer of them. Otherwise,
1351 * give up till the next clock tick.
1352 */
1353 mutex_enter(&pageout_mutex);
1354 if (po_share < MAX_PO_SHARE) {
1355 po_share <<= 1;
1356 mutex_exit(&pageout_mutex);
1357 } else {
1358 mutex_exit(&pageout_mutex);
1359 break;
1360 }
1361 }
1362 }
1363 }
1364
1365 atomic_add_long(&nscan, nscan_cnt);
1366
1367 sample_end = gethrtime();
1368
1369 DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370 uint_t, inst);
1371
1372 /*
1373 * The following two blocks are only relevant when the scanner is
1374 * first started up. After the scanner runs for a while, neither of
1375 * the conditions will ever be true again.
1376 *
1377 * The global variables used below are only modified by this thread and
1378 * only during initial scanning when there is a single page scanner
1379 * thread running. Thus, we don't use any locking.
1380 */
1381 if (pageout_new_spread == 0) {
1382 VERIFY3U(inst, ==, 0);
1383 if (PAGE_SCAN_STARTUP) {
1384 /*
1385 * Continue accumulating samples until we have enough
1386 * to get a reasonable value for average scan rate:
1387 */
1388 pageout_sample_pages += pcount;
1389 pageout_sample_etime += sample_end - sample_start;
1390 ++pageout_sample_cnt;
1391 }
1392
1393 if (!PAGE_SCAN_STARTUP) {
1394 /*
1395 * We have enough samples, set the spread.
1396 */
1397 pageout_rate = (hrrate_t)pageout_sample_pages *
1398 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1399 pageout_new_spread = pageout_rate / 10;
1400 setupclock();
1401 }
1402 }
1403
1404 goto loop;
1405 }
1406
1407 /*
1408 * The pageout deadman is run once per second by clock().
1409 */
1410 void
1411 pageout_deadman(void)
1412 {
1413 if (panicstr != NULL) {
1414 /*
1415 * There is no pageout after panic.
1416 */
1417 return;
1418 }
1419
1420 if (pageout_deadman_seconds == 0) {
1421 /*
1422 * The deadman is not enabled.
1423 */
1424 return;
1425 }
1426
1427 if (!pageout_pushing) {
1428 goto reset;
1429 }
1430
1431 /*
1432 * We are pushing a page. Check to see if it is the same call we saw
1433 * last time we looked:
1434 */
1435 if (pageout_pushcount != pageout_pushcount_seen) {
1436 /*
1437 * It is a different call from the last check, so we are not
1438 * stuck.
1439 */
1440 goto reset;
1441 }
1442
1443 if (++pageout_stucktime >= pageout_deadman_seconds) {
1444 panic("pageout_deadman: stuck pushing the same page for %d "
1445 "seconds (freemem is %lu)", pageout_deadman_seconds,
1446 freemem);
1447 }
1448
1449 return;
1450
1451 reset:
1452 /*
1453 * Reset our tracking state to reflect that we are not stuck:
1454 */
1455 pageout_stucktime = 0;
1456 pageout_pushcount_seen = pageout_pushcount;
1457 }
1458
1459 /*
1460 * Look at the page at hand. If it is locked (e.g., for physical i/o),
1461 * system (u., page table) or free, then leave it alone. Otherwise,
1462 * if we are running the front hand, turn off the page's reference bit.
1463 * If the proc is over maxrss, we take it. If running the back hand,
1464 * check whether the page has been reclaimed. If not, free the page,
1465 * pushing it to disk first if necessary.
1466 *
1467 * Return values:
1468 * CKP_INELIGIBLE if the page is not a candidate at all,
1469 * CKP_NOT_FREED if the page was not freed, or
1470 * CKP_FREED if we freed it.
1471 */
1472 static checkpage_result_t
1473 checkpage(struct page *pp, pageout_hand_t whichhand)
1474 {
1475 int ppattr;
1476 int isfs = 0;
1477 int isexec = 0;
1478 int pagesync_flag;
1479 zoneid_t zid = ALL_ZONES;
1480
1481 /*
1482 * Skip pages:
1483 * - associated with the kernel vnode since
1484 * they are always "exclusively" locked.
1485 * - that are free
1486 * - that are shared more than po_share'd times
1487 * - its already locked
1488 *
1489 * NOTE: These optimizations assume that reads are atomic.
1490 */
1491
1492 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1493 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1494 hat_page_checkshare(pp, po_share)) {
1495 return (CKP_INELIGIBLE);
1496 }
1497
1498 if (!page_trylock(pp, SE_EXCL)) {
1499 /*
1500 * Skip the page if we can't acquire the "exclusive" lock.
1501 */
1502 return (CKP_INELIGIBLE);
1503 } else if (PP_ISFREE(pp)) {
1504 /*
1505 * It became free between the above check and our actually
1506 * locking the page. Oh well, there will be other pages.
1507 */
1508 page_unlock(pp);
1509 return (CKP_INELIGIBLE);
1510 }
1511
1512 /*
1513 * Reject pages that cannot be freed. The page_struct_lock
1514 * need not be acquired to examine these
1515 * fields since the page has an "exclusive" lock.
1516 */
1517 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1518 page_unlock(pp);
1519 return (CKP_INELIGIBLE);
1520 }
1521
1522 if (zones_over) {
1523 ASSERT(pp->p_zoneid == ALL_ZONES ||
1524 pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525 if (pp->p_zoneid == ALL_ZONES ||
1526 zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527 /*
1528 * Cross-zone shared page, or zone not over it's cap.
1529 * Leave the page alone.
1530 */
1531 page_unlock(pp);
1532 return (CKP_INELIGIBLE);
1533 }
1534 zid = pp->p_zoneid;
1535 }
1536
1537 /*
1538 * Maintain statistics for what we are freeing
1539 */
1540
1541 if (pp->p_vnode != NULL) {
1542 if (pp->p_vnode->v_flag & VVMEXEC)
1543 isexec = 1;
1544
1545 if (!IS_SWAPFSVP(pp->p_vnode))
1546 isfs = 1;
1547 }
1548
1549 /*
1550 * Turn off REF and MOD bits with the front hand.
1551 * The back hand examines the REF bit and always considers
1552 * SHARED pages as referenced.
1553 */
1554 if (whichhand == POH_FRONT) {
1555 pagesync_flag = HAT_SYNC_ZERORM;
1556 } else {
1557 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558 HAT_SYNC_STOPON_SHARED;
1559 }
1560
1561 ppattr = hat_pagesync(pp, pagesync_flag);
1562
1563 recheck:
1564 /*
1565 * If page is referenced; make unreferenced but reclaimable.
1566 * If this page is not referenced, then it must be reclaimable
1567 * and we can add it to the free list.
1568 */
1569 if (ppattr & P_REF) {
1570 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1571 pageout_hand_t, whichhand);
1572
1573 if (whichhand == POH_FRONT) {
1574 /*
1575 * Checking of rss or madvise flags needed here...
1576 *
1577 * If not "well-behaved", fall through into the code
1578 * for not referenced.
1579 */
1580 hat_clrref(pp);
1581 }
1582
1583 /*
1584 * Somebody referenced the page since the front
1585 * hand went by, so it's not a candidate for
1586 * freeing up.
1587 */
1588 page_unlock(pp);
1589 return (CKP_NOT_FREED);
1590 }
1591
1592 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1593
1594 /*
1595 * If large page, attempt to demote it. If successfully demoted,
1596 * retry the checkpage.
1597 */
1598 if (pp->p_szc != 0) {
1599 if (!page_try_demote_pages(pp)) {
1600 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1601 page_unlock(pp);
1602 return (CKP_INELIGIBLE);
1603 }
1604
1605 ASSERT(pp->p_szc == 0);
1606 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1607
1608 /*
1609 * Since page_try_demote_pages() could have unloaded some
1610 * mappings it makes sense to reload ppattr.
1611 */
1612 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1613 }
1614
1615 /*
1616 * If the page is currently dirty, we have to arrange to have it
1617 * cleaned before it can be freed.
1618 *
1619 * XXX - ASSERT(pp->p_vnode != NULL);
1620 */
1621 if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1622 struct vnode *vp = pp->p_vnode;
1623 u_offset_t offset = pp->p_offset;
1624
1625 /*
1626 * XXX - Test for process being swapped out or about to exit?
1627 * [Can't get back to process(es) using the page.]
1628 */
1629
1630 /*
1631 * Hold the vnode before releasing the page lock to
1632 * prevent it from being freed and re-used by some
1633 * other thread.
1634 */
1635 VN_HOLD(vp);
1636 page_unlock(pp);
1637
1638 /*
1639 * Queue I/O request for the pageout thread.
1640 */
1641 if (!queue_io_request(vp, offset)) {
1642 VN_RELE(vp);
1643 return (CKP_NOT_FREED);
1644 }
1645 if (isfs) {
1646 zone_pageout_stat(zid, ZPO_DIRTY);
1647 } else {
1648 zone_pageout_stat(zid, ZPO_ANONDIRTY);
1649 }
1650 return (CKP_FREED);
1651 }
1652
1653 /*
1654 * Now we unload all the translations and put the page back on to the
1655 * free list. If the page was used (referenced or modified) after the
1656 * pagesync but before it was unloaded we catch it and handle the page
1657 * properly.
1658 */
1659 DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1660 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1661 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1662 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1663 goto recheck;
1664 }
1665
1666 VN_DISPOSE(pp, B_FREE, 0, kcred);
1667
1668 CPU_STATS_ADD_K(vm, dfree, 1);
1669
1670 if (isfs) {
1671 if (isexec) {
1672 CPU_STATS_ADD_K(vm, execfree, 1);
1673 } else {
1674 CPU_STATS_ADD_K(vm, fsfree, 1);
1675 }
1676 zone_pageout_stat(zid, ZPO_FS);
1677 } else {
1678 CPU_STATS_ADD_K(vm, anonfree, 1);
1679 zone_pageout_stat(zid, ZPO_ANON);
1680 }
1681
1682 return (CKP_FREED);
1683 }
1684
1685 /*
1686 * Queue async i/o request from pageout_scanner and segment swapout
1687 * routines on one common list. This ensures that pageout devices (swap)
1688 * are not saturated by pageout_scanner or swapout requests.
1689 * The pageout thread empties this list by initiating i/o operations.
1690 */
1691 int
1692 queue_io_request(vnode_t *vp, u_offset_t off)
1693 {
1694 struct async_reqs *arg;
1695
1696 /*
1697 * If we cannot allocate an async request struct,
1698 * skip this page.
1699 */
1700 mutex_enter(&push_lock);
1701 if ((arg = req_freelist) == NULL) {
1702 mutex_exit(&push_lock);
1703 return (0);
1704 }
1705 req_freelist = arg->a_next; /* adjust freelist */
1706 push_list_size++;
1707
1708 arg->a_vp = vp;
1709 arg->a_off = off;
1710 arg->a_len = PAGESIZE;
1711 arg->a_flags = B_ASYNC | B_FREE;
1712 arg->a_cred = kcred; /* always held */
1713
1714 /*
1715 * Add to list of pending write requests.
1716 */
1717 arg->a_next = push_list;
1718 push_list = arg;
1719
1720 if (req_freelist == NULL) {
1721 /*
1722 * No free async requests left. The lock is held so we
1723 * might as well signal the pusher thread now.
1724 */
1725 cv_signal(&push_cv);
1726 }
1727 mutex_exit(&push_lock);
1728 return (1);
1729 }
1730
1731 /*
1732 * Wakeup pageout to initiate i/o if push_list is not empty.
1733 */
1734 void
1735 cv_signal_pageout()
1736 {
1737 if (push_list != NULL) {
1738 mutex_enter(&push_lock);
1739 cv_signal(&push_cv);
1740 mutex_exit(&push_lock);
1741 }
1742 }