1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2021 Oxide Computer Company
24 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 * Copyright 2018 Joyent, Inc.
31 */
32
33 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
34 /* All Rights Reserved */
35
36 /*
37 * University Copyright- Copyright (c) 1982, 1986, 1988
38 * The Regents of the University of California
39 * All Rights Reserved
40 *
41 * University Acknowledgment- Portions of this document are derived from
42 * software developed by the University of California, Berkeley, and its
43 * contributors.
44 */
45
46 #include <sys/types.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/buf.h>
50 #include <sys/uio.h>
51 #include <sys/proc.h>
52 #include <sys/systm.h>
53 #include <sys/mman.h>
54 #include <sys/cred.h>
55 #include <sys/vnode.h>
56 #include <sys/vm.h>
57 #include <sys/vmparam.h>
58 #include <sys/vtrace.h>
59 #include <sys/cmn_err.h>
60 #include <sys/cpuvar.h>
61 #include <sys/user.h>
62 #include <sys/kmem.h>
63 #include <sys/debug.h>
64 #include <sys/callb.h>
65 #include <sys/tnf_probe.h>
66 #include <sys/mem_cage.h>
67 #include <sys/time.h>
68 #include <sys/zone.h>
69 #include <sys/stdbool.h>
70
71 #include <vm/hat.h>
72 #include <vm/as.h>
73 #include <vm/seg.h>
74 #include <vm/page.h>
75 #include <vm/pvn.h>
76 #include <vm/seg_kmem.h>
77
78 /*
79 * FREE MEMORY MANAGEMENT
80 *
81 * Management of the pool of free pages is a tricky business. There are
82 * several critical threshold values which constrain our allocation of new
83 * pages and inform the rate of paging out of memory to swap. These threshold
84 * values, and the behaviour they induce, are described below in descending
85 * order of size -- and thus increasing order of severity!
86 *
87 * +---------------------------------------------------- physmem (all memory)
88 * |
89 * | Ordinarily there are no particular constraints placed on page
90 * v allocation. The page scanner is not running and page_create_va()
91 * | will effectively grant all page requests (whether from the kernel
92 * | or from user processes) without artificial delay.
93 * |
94 * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
95 * |
96 * | When we have less than "lotsfree" pages, pageout_scanner() is
97 * v signalled by schedpaging() to begin looking for pages that can
98 * | be evicted to disk to bring us back above lotsfree. At this
99 * | stage there is still no constraint on allocation of free pages.
100 * |
101 * | For small systems, we set a lower bound of 16MB for lotsfree;
102 * v this is the natural value for a system with 1GB memory. This is
103 * | to ensure that the pageout reserve pool contains at least 4MB
104 * | for use by ZFS.
105 * |
106 * | For systems with a large amount of memory, we constrain lotsfree
107 * | to be at most 2GB (with a pageout reserve of around 0.5GB), as
108 * v at some point the required slack relates more closely to the
109 * | rate at which paging can occur than to the total amount of memory.
110 * |
111 * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
112 * |
113 * | When we drop below desfree, a number of kernel facilities will
114 * v wait before allocating more memory, under the assumption that
115 * | pageout or reaping will make progress and free up some memory.
116 * | This behaviour is not especially coordinated; look for comparisons
117 * | of desfree and freemem.
118 * |
119 * | In addition to various attempts at advisory caution, clock()
120 * | will wake up the thread that is ordinarily parked in sched().
121 * | This routine is responsible for the heavy-handed swapping out
122 * v of entire processes in an attempt to arrest the slide of free
123 * | memory. See comments in sched.c for more details.
124 * |
125 * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
126 * |
127 * | These two separate tunables have, by default, the same value.
128 * v Various parts of the kernel use minfree to signal the need for
129 * | more aggressive reclamation of memory, and sched() is more
130 * | aggressive at swapping processes out.
131 * |
132 * | If free memory falls below throttlefree, page_create_va() will
133 * | use page_create_throttle() to begin holding most requests for
134 * | new pages while pageout and reaping free up memory. Sleeping
135 * v allocations (e.g., KM_SLEEP) are held here while we wait for
136 * | more memory. Non-sleeping allocations are generally allowed to
137 * | proceed, unless their priority is explicitly lowered with
138 * | KM_NORMALPRI.
139 * |
 * +----- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4.5MB)
141 * |
142 * | When we hit throttlefree, the situation is already dire. The
143 * v system is generally paging out memory and swapping out entire
144 * | processes in order to free up memory for continued operation.
145 * |
146 * | Unfortunately, evicting memory to disk generally requires short
147 * | term use of additional memory; e.g., allocation of buffers for
148 * | storage drivers, updating maps of free and used blocks, etc.
149 * | As such, pageout_reserve is the number of pages that we keep in
150 * | special reserve for use by pageout() and sched() and by any
151 * v other parts of the kernel that need to be working for those to
152 * | make forward progress such as the ZFS I/O pipeline.
153 * |
154 * | When we are below pageout_reserve, we fail or hold any allocation
155 * | that has not explicitly requested access to the reserve pool.
156 * | Access to the reserve is generally granted via the KM_PUSHPAGE
157 * | flag, or by marking a thread T_PUSHPAGE such that all allocations
158 * | can implicitly tap the reserve. For more details, see the
159 * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
160 * | and VM_PUSHPAGE allocation flags, and page_create_throttle().
161 * |
162 * +---------------------------------------------------------- no free memory
163 * |
164 * | If we have arrived here, things are very bad indeed. It is
165 * v surprisingly difficult to tell if this condition is even fatal,
166 * | as enough memory may have been granted to pageout() and to the
167 * | ZFS I/O pipeline that requests for eviction that have already been
168 * | made will complete and free up memory some time soon.
169 * |
170 * | If free memory does not materialise, the system generally remains
171 * | deadlocked. The pageout_deadman() below is run once per second
172 * | from clock(), seeking to limit the amount of time a single request
173 * v to page out can be blocked before the system panics to get a crash
174 * | dump and return to service.
175 * |
176 * +-------------------------------------------------------------------------
177 */
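
/*
 * As a purely illustrative worked example of the default cascade above, a
 * hypothetical system with 64GB of physical memory and no tuning would end
 * up with roughly:
 *
 *	lotsfree	= 64GB / 64		= 1GB
 *	desfree		= lotsfree / 2		= 512MB
 *	minfree		= 3/4 of desfree	= 384MB
 *	throttlefree	= minfree		= 384MB
 *	pageout_reserve	= 3/4 of throttlefree	= 288MB
 *
 * Here lotsfree falls between the 16MB minimum and 2GB maximum, so no
 * clamping applies.  See setupclock() below for the actual calculation.
 */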
178
179 /*
180 * The following parameters control operation of the page replacement
181 * algorithm. They are initialized to 0, and then computed at boot time based
182 * on the size of the system; see setupclock(). If they are patched non-zero
183 * in a loaded vmunix they are left alone and may thus be changed per system
184 * using "mdb -kw" on the loaded system.
185 */
186 pgcnt_t slowscan = 0;
187 pgcnt_t fastscan = 0;
188
189 static pgcnt_t handspreadpages = 0;
190
191 /*
192 * looppages:
193 * Cached copy of the total number of pages in the system (total_pages).
194 *
195 * loopfraction:
196 * Divisor used to relate fastscan to looppages in setupclock().
197 */
198 static uint_t loopfraction = 2;
199 static pgcnt_t looppages;
200
201 static uint_t min_percent_cpu = 4;
202 static uint_t max_percent_cpu = 80;
203 static pgcnt_t maxfastscan = 0;
204 static pgcnt_t maxslowscan = 100;
205
206 #define MEGABYTES (1024ULL * 1024ULL)
207
208 /*
209 * pageout_threshold_style:
210 * set to 1 to use the previous default threshold size calculation;
211 * i.e., each threshold is half of the next largest value.
212 */
213 uint_t pageout_threshold_style = 0;
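
/*
 * With pageout_threshold_style set to 1, the illustrative 64GB example above
 * would instead come out with minfree = throttlefree = 256MB and
 * pageout_reserve = 128MB; see the corresponding "half" calculations in
 * setupclock().
 */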
214
215 /*
216 * The operator may override these tunables to request a different minimum or
217 * maximum lotsfree value, or to change the divisor we use for automatic
218 * sizing.
219 *
220 * By default, we make lotsfree 1/64th of the total memory in the machine. The
221 * minimum and maximum are specified in bytes, rather than pages; a zero value
222 * means the default values (below) are used.
223 */
224 uint_t lotsfree_fraction = 64;
225 pgcnt_t lotsfree_min = 0;
226 pgcnt_t lotsfree_max = 0;
227
228 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
229 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230
231 /*
232 * If these tunables are set to non-zero values in /etc/system, and provided
233 * the value is not larger than the threshold above, the specified value will
234 * be used directly without any additional calculation or adjustment. The boot
235 * time value of these overrides is preserved in the "clockinit" struct. More
236 * detail is available in the comment at the top of the file.
237 */
238 pgcnt_t maxpgio = 0;
239 pgcnt_t minfree = 0;
240 pgcnt_t desfree = 0;
241 pgcnt_t lotsfree = 0;
242 pgcnt_t needfree = 0;
243 pgcnt_t throttlefree = 0;
244 pgcnt_t pageout_reserve = 0;
245 pri_t pageout_pri;
246
247 pgcnt_t deficit;
248 pgcnt_t nscan;
249 pgcnt_t desscan;
250
251 /* kstats */
252 uint64_t low_mem_scan;
253 uint64_t zone_cap_scan;
254
255 #define MAX_PSCAN_THREADS 16
256
257 /*
258 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
259 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
260 * that gives the equivalent of some underlying %CPU duty cycle.
261 *
262 * min_pageout_nsec:
263 * nanoseconds/wakeup equivalent of min_percent_cpu.
264 *
265 * max_pageout_nsec:
266 * nanoseconds/wakeup equivalent of max_percent_cpu.
267 *
268 * pageout_nsec:
269 * Number of nanoseconds budgeted for each wakeup cycle.
270 * Computed each time around by schedpaging().
271 * Varies between min_pageout_nsec and max_pageout_nsec,
272 * depending on memory pressure or zones over their cap.
273 *
274 * zone_pageout_nsec:
 *	Number of nanoseconds budgeted for each cycle when a zone
276 * is over its memory cap. If this is zero, then the value
277 * of max_pageout_nsec is used instead.
278 */
279 static hrtime_t min_pageout_nsec;
280 static hrtime_t max_pageout_nsec;
281 static hrtime_t pageout_nsec;
282 static hrtime_t zone_pageout_nsec;
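
/*
 * For illustration, with the default min_percent_cpu (4), max_percent_cpu
 * (80) and SCHEDPAGING_HZ (4), setupclock() arrives at a scanning budget of
 * roughly 10ms (min_pageout_nsec) to 200ms (max_pageout_nsec) out of each
 * quarter-second wakeup; see the duty cycle computation near the end of
 * setupclock().
 */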
283
284 static boolean_t reset_hands[MAX_PSCAN_THREADS];
285
286 #define PAGES_POLL_MASK 1023
287 #define SCHEDPAGING_HZ 4
288
289 /*
290 * despagescanners:
291 * The desired number of page scanner threads. The value can be set in
292 * /etc/system or tuned directly with 'mdb -kw'. The system will bring
293 * the actual number of threads into line with the desired number. If set
294 * to an invalid value, the system will correct the setting.
295 */
296 uint_t despagescanners = 0;
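
/*
 * As an illustrative example only: an operator could ask for four scanner
 * threads with "set despagescanners = 4" in /etc/system, or on a running
 * system with "echo 'despagescanners/W 4' | mdb -kw".  Either way the value
 * is re-validated and, if necessary, clamped by recalc_pagescanners().
 */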
297
298 /*
299 * pageout_sample_lim:
300 * The limit on the number of samples needed to establish a value for new
301 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
302 * handspreadpages.
303 *
304 * pageout_sample_cnt:
305 * Current sample number. Once the sample gets large enough, set new
306 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
307 *
308 * pageout_sample_pages:
309 * The accumulated number of pages scanned during sampling.
310 *
311 * pageout_sample_etime:
312 * The accumulated nanoseconds for the sample.
313 *
314 * pageout_rate:
 *	Rate in pages per second, computed at the end of sampling.
316 *
317 * pageout_new_spread:
318 * Initially zero while the system scan rate is measured by
319 * pageout_scanner(), which then sets this value once per system boot after
320 * enough samples have been recorded (pageout_sample_cnt). Once set, this
321 * new value is used for fastscan and handspreadpages.
322 */
323 typedef hrtime_t hrrate_t;
324
325 static uint64_t pageout_sample_lim = 4;
326 static uint64_t pageout_sample_cnt = 0;
327 static pgcnt_t pageout_sample_pages = 0;
328 static hrtime_t pageout_sample_etime = 0;
329 static hrrate_t pageout_rate = 0;
330 static pgcnt_t pageout_new_spread = 0;
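
/*
 * As a worked example with purely illustrative numbers: if the initial
 * pageout_sample_lim scans covered a combined 2,000,000 pages over 0.5
 * seconds of scan time, pageout_rate would be computed as
 * 2,000,000 * NANOSEC / 500,000,000 = 4,000,000 pages/second, and
 * pageout_new_spread would then be a tenth of that (400,000), feeding back
 * into maxfastscan and handspreadpages via setupclock().
 */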
331
332 /* True if the page scanner is first starting up */
333 #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
334
335 /* The current number of page scanner threads */
336 static uint_t n_page_scanners = 1;
337 /* The number of page scanner threads that are actively scanning. */
338 static uint_t pageouts_running;
339
340 /*
341 * Record number of times a pageout_scanner() wakeup cycle finished because it
342 * timed out (exceeded its CPU budget), rather than because it visited
343 * its budgeted number of pages. This is only done when scanning under low
344 * free memory conditions, not when scanning for zones over their cap.
345 */
346 uint64_t pageout_timeouts = 0;
347
348 #ifdef VM_STATS
349 static struct pageoutvmstats_str {
350 ulong_t checkpage[3];
351 } pageoutvmstats;
352 #endif /* VM_STATS */
353
354 /*
355 * Threads waiting for free memory use this condition variable and lock until
356 * memory becomes available.
357 */
358 kmutex_t memavail_lock;
359 kcondvar_t memavail_cv;
360
361 typedef enum pageout_hand {
362 POH_FRONT = 1,
363 POH_BACK,
364 } pageout_hand_t;
365
366 typedef enum {
367 CKP_INELIGIBLE,
368 CKP_NOT_FREED,
369 CKP_FREED,
370 } checkpage_result_t;
371
372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
373
374 static struct clockinit {
375 bool ci_init;
376 pgcnt_t ci_lotsfree_min;
377 pgcnt_t ci_lotsfree_max;
378 pgcnt_t ci_lotsfree;
379 pgcnt_t ci_desfree;
380 pgcnt_t ci_minfree;
381 pgcnt_t ci_throttlefree;
382 pgcnt_t ci_pageout_reserve;
383 pgcnt_t ci_maxpgio;
384 pgcnt_t ci_maxfastscan;
385 pgcnt_t ci_fastscan;
386 pgcnt_t ci_slowscan;
387 pgcnt_t ci_handspreadpages;
388 uint_t ci_despagescanners;
389 } clockinit = { .ci_init = false };
390
391 static inline pgcnt_t
392 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
393 {
394 if (value < minimum) {
395 return (minimum);
396 } else if (value > maximum) {
397 return (maximum);
398 } else {
399 return (value);
400 }
401 }
402
403 static pgcnt_t
404 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
405 {
406 if (initval == 0 || initval >= initval_ceiling) {
407 return (defval);
408 } else {
409 return (initval);
410 }
411 }
412
413 /*
 * Local boolean to control scanning when zones are over their cap. It avoids
 * accessing the zone_num_over_cap variable except within schedpaging(), which
 * only runs periodically. zone_num_over_cap is already accessed frequently
 * elsewhere during paging, and the page scanner consults the zones_over
 * variable for every page during a scan, so this keeps that hot path cheap.
 * No lock is needed for zone_num_over_cap since schedpaging() does not modify
 * the variable; it only cares whether the value is zero or non-zero.
421 */
422 static boolean_t zones_over = B_FALSE;
423
424 /*
425 * On large memory systems, multiple instances of the page scanner are run,
426 * each responsible for a separate region of memory. This speeds up page
427 * invalidation under low memory conditions.
428 *
429 * despagescanners can be set in /etc/system or via mdb and it will
430 * be used as a guide for how many page scanners to create; the value
431 * will be adjusted if it is not sensible. Otherwise, the number of
432 * page scanners is determined dynamically based on handspreadpages.
433 */
434 static void
435 recalc_pagescanners(void)
436 {
437 pgcnt_t sz;
438 uint_t des;
439
440 /* If the initial calibration has not been done, take no action. */
441 if (pageout_new_spread == 0)
442 return;
443
444 /*
445 * If the desired number of scanners is set in /etc/system
446 * then try to use it.
447 */
448 if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
449 despagescanners = clockinit.ci_despagescanners;
450
451 if (despagescanners != 0) {
452 /*
453 * We have a desired number of page scanners, either from
454 * /etc/system or set via mdb. Try and use it (it will be
455 * clamped below).
456 */
457 des = despagescanners;
458 } else {
459 /*
460 * Calculate the number of desired scanners based on the
461 * system's memory size.
462 *
463 * A 64GiB region size is used as the basis for calculating how
464 * many scanner threads should be created. For systems with up
465 * to 64GiB of RAM, a single thread is used; for very large
466 * memory systems the threads are limited to MAX_PSCAN_THREADS.
467 */
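		/*
		 * For example (illustrative), a machine with 256GiB of RAM
		 * would settle on four scanner threads here, one per 64GiB
		 * region, before the clamping below is applied.
		 */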
468 sz = btop(64ULL << 30);
469
470 if (sz > looppages) {
471 des = 1;
472 } else {
473 pgcnt_t tmp = sz;
474
475 for (des = 1; tmp < looppages; des++)
476 tmp += sz;
477 }
478 }
479
480 /*
481 * clamp the number of scanners so that we are under MAX_PSCAN_THREADS
482 * and so that each scanner covers at least 10% more than
483 * handspreadpages.
484 */
485 des = clamp(des, 1,
486 looppages / (handspreadpages + handspreadpages / 10));
487 despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
488 }
489
490 /*
491 * Set up the paging constants for the clock algorithm used by
492 * pageout_scanner(), and by the virtual memory system overall. See the
493 * comments at the top of this file for more information about the threshold
494 * values and system responses to memory pressure.
495 *
496 * This routine is called once by main() at startup, after the initial size of
497 * physical memory is determined. It may be called again later if memory is
498 * added to or removed from the system, or if new measurements of the page scan
499 * rate become available.
500 */
501 void
502 setupclock(void)
503 {
504 bool half = (pageout_threshold_style == 1);
505 bool recalc = true;
506
507 looppages = total_pages;
508
509 /*
510 * The operator may have provided specific values for some of the
511 * tunables via /etc/system. On our first call, we preserve those
512 * values so that they can be used for subsequent recalculations.
513 *
514 * A value of zero for any tunable means we will use the default
515 * sizing.
516 */
517 if (!clockinit.ci_init) {
518 clockinit.ci_init = true;
519
520 clockinit.ci_lotsfree_min = lotsfree_min;
521 clockinit.ci_lotsfree_max = lotsfree_max;
522 clockinit.ci_lotsfree = lotsfree;
523 clockinit.ci_desfree = desfree;
524 clockinit.ci_minfree = minfree;
525 clockinit.ci_throttlefree = throttlefree;
526 clockinit.ci_pageout_reserve = pageout_reserve;
527 clockinit.ci_maxpgio = maxpgio;
528 clockinit.ci_maxfastscan = maxfastscan;
529 clockinit.ci_fastscan = fastscan;
530 clockinit.ci_slowscan = slowscan;
531 clockinit.ci_handspreadpages = handspreadpages;
532 clockinit.ci_despagescanners = despagescanners;
533
534 /*
535 * The first call does not trigger a recalculation, only
536 * subsequent calls.
537 */
538 recalc = false;
539 }
540
541 /*
542 * Configure paging threshold values. For more details on what each
543 * threshold signifies, see the comments at the top of this file.
544 */
545 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
546 btop(LOTSFREE_MAX_DEFAULT));
547 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
548 btop(LOTSFREE_MIN_DEFAULT));
549
550 lotsfree = tune(clockinit.ci_lotsfree, looppages,
551 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
552
553 desfree = tune(clockinit.ci_desfree, lotsfree,
554 lotsfree / 2);
555
556 minfree = tune(clockinit.ci_minfree, desfree,
557 half ? desfree / 2 : 3 * desfree / 4);
558
559 throttlefree = tune(clockinit.ci_throttlefree, desfree,
560 minfree);
561
562 pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
563 half ? throttlefree / 2 : 3 * throttlefree / 4);
564
565 /*
	 * maxpgio limits how much paging is acceptable.  This assumes that
	 * 2/3 busy on a disk arm is all that is tolerable for paging, with
	 * one operation per disk revolution.
569 *
570 * XXX - Does not account for multiple swap devices.
571 */
572 if (clockinit.ci_maxpgio == 0) {
573 maxpgio = (DISKRPM * 2) / 3;
574 } else {
575 maxpgio = clockinit.ci_maxpgio;
576 }
577
578 /*
579 * The clock scan rate varies between fastscan and slowscan
580 * based on the amount of free memory available. Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz, etc., it is
584 * difficult to determine it in a generic manner for all
585 * architectures.
586 *
587 * Instead of trying to determine the number of pages scanned
588 * per sec for every processor, fastscan is set to be the smaller
589 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
590 * time is limited to ~4% of processor time.
591 *
592 * Setting fastscan to be 1/2 of memory allows pageout to scan
593 * all of memory in ~2 secs. This implies that user pages not
594 * accessed within 1 sec (assuming, handspreadpages == fastscan)
595 * can be reclaimed when free memory is very low. Stealing pages
596 * not accessed within 1 sec seems reasonable and ensures that
597 * active user processes don't thrash.
598 *
599 * Smaller values of fastscan result in scanning fewer pages
600 * every second and consequently pageout may not be able to free
601 * sufficient memory to maintain the minimum threshold. Larger
602 * values of fastscan result in scanning a lot more pages which
603 * could lead to thrashing and higher CPU usage.
604 *
605 * Fastscan needs to be limited to a maximum value and should not
606 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
609 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
610 * (the upper bound for fastscan) based on the average number
611 * of pages that can potentially be scanned in ~1 sec (using ~4%
612 * of the CPU) on some of the following machines that currently
613 * run Solaris 2.x:
614 *
615 * average memory scanned in ~1 sec
616 *
617 * 25 Mhz SS1+: 23 Meg
618 * LX: 37 Meg
619 * 50 Mhz SC2000: 68 Meg
620 *
621 * 40 Mhz 486: 26 Meg
622 * 66 Mhz 486: 42 Meg
623 *
624 * When free memory falls just below lotsfree, the scan rate
625 * goes from 0 to slowscan (i.e., pageout starts running). This
626 * transition needs to be smooth and is achieved by ensuring that
627 * pageout scans a small number of pages to satisfy the transient
628 * memory demand. This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
630 * on system performance.
631 *
632 * In addition to setting fastscan and slowscan, pageout is
633 * limited to using ~4% of the CPU. This results in increasing
634 * the time taken to scan all of memory, which in turn means that
635 * user processes have a better opportunity of preventing their
636 * pages from being stolen. This has a positive effect on
637 * interactive and overall system performance when memory demand
638 * is high.
639 *
640 * Thus, the rate at which pages are scanned for replacement will
641 * vary linearly between slowscan and the number of pages that
642 * can be scanned using ~4% of processor time instead of varying
643 * linearly between slowscan and fastscan.
644 *
645 * Also, the processor time used by pageout will vary from ~1%
646 * at slowscan to ~4% at fastscan instead of varying between
647 * ~1% at slowscan and ~10% at fastscan.
648 *
649 * The values chosen for the various VM parameters (fastscan,
650 * handspreadpages, etc) are not universally true for all machines,
651 * but appear to be a good rule of thumb for the machines we've
652 * tested. They have the following ranges:
653 *
654 * cpu speed: 20 to 70 Mhz
655 * page size: 4K to 8K
656 * memory size: 16M to 5G
657 * page scan rate: 4000 - 17400 4K pages per sec
658 *
659 * The values need to be re-examined for machines which don't
660 * fall into the various ranges (e.g., slower or faster CPUs,
661 * smaller or larger pagesizes etc) shown above.
662 *
663 * On an MP machine, pageout is often unable to maintain the
664 * minimum paging thresholds under heavy load. This is due to
	 * the fact that user processes running on other CPUs can be
666 * dirtying memory at a much faster pace than pageout can find
667 * pages to free. The memory demands could be met by enabling
668 * more than one CPU to run the clock algorithm in such a manner
669 * that the various clock hands don't overlap. This also makes
670 * it more difficult to determine the values for fastscan, slowscan
671 * and handspreadpages.
672 *
673 * The swapper is currently used to free up memory when pageout
674 * is unable to meet memory demands by swapping out processes.
675 * In addition to freeing up memory, swapping also reduces the
676 * demand for memory by preventing user processes from running
677 * and thereby consuming memory.
678 */
679 if (clockinit.ci_maxfastscan == 0) {
680 if (pageout_new_spread != 0) {
681 maxfastscan = pageout_new_spread;
682 } else {
683 maxfastscan = MAXHANDSPREADPAGES;
684 }
685 } else {
686 maxfastscan = clockinit.ci_maxfastscan;
687 }
688
689 if (clockinit.ci_fastscan == 0) {
690 fastscan = MIN(looppages / loopfraction, maxfastscan);
691 } else {
692 fastscan = clockinit.ci_fastscan;
693 }
694
695 if (fastscan > looppages / loopfraction) {
696 fastscan = looppages / loopfraction;
697 }
698
699 /*
700 * Set slow scan time to 1/10 the fast scan time, but
701 * not to exceed maxslowscan.
702 */
703 if (clockinit.ci_slowscan == 0) {
704 slowscan = MIN(fastscan / 10, maxslowscan);
705 } else {
706 slowscan = clockinit.ci_slowscan;
707 }
708
709 if (slowscan > fastscan / 2) {
710 slowscan = fastscan / 2;
711 }
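
	/*
	 * To make the defaults concrete (illustrative numbers only): before
	 * the scan rate has been calibrated, maxfastscan is
	 * MAXHANDSPREADPAGES, so on any system with more than twice that much
	 * pageable memory fastscan is simply MAXHANDSPREADPAGES, and slowscan
	 * is then clamped at maxslowscan (100 pages/second, or 25 pages per
	 * SCHEDPAGING_HZ wakeup), matching the transition behaviour described
	 * above.
	 */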
712
713 /*
714 * Handspreadpages is the distance (in pages) between front and back
715 * pageout daemon hands. The amount of time to reclaim a page
716 * once pageout examines it increases with this distance and
717 * decreases as the scan rate rises. It must be < the amount
718 * of pageable memory.
719 *
720 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
721 * to be "fastscan" results in the front hand being a few secs
722 * (varies based on the processor speed) ahead of the back hand
723 * at fastscan rates. This distance can be further reduced, if
724 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
726 *
727 * As a result, user processes have a much better chance of
728 * referencing their pages before the back hand examines them.
729 * This also significantly lowers the number of reclaims from
730 * the freelist since pageout does not end up freeing pages which
731 * may be referenced a sec later.
732 */
733 if (clockinit.ci_handspreadpages == 0) {
734 handspreadpages = fastscan;
735 } else {
736 handspreadpages = clockinit.ci_handspreadpages;
737 }
738
739 /*
740 * Make sure that back hand follows front hand by at least
741 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
742 * back hand to look at a page during the same wakeup of the pageout
743 * daemon in which the front hand cleared its ref bit.
744 */
745 if (handspreadpages >= looppages) {
746 handspreadpages = looppages - 1;
747 }
748
749 /*
750 * Establish the minimum and maximum length of time to be spent
751 * scanning pages per wakeup, limiting the scanner duty cycle. The
752 * input percentage values (0-100) must be converted to a fraction of
753 * the number of nanoseconds in a second of wall time, then further
754 * scaled down by the number of scanner wakeups in a second.
755 */
756 min_pageout_nsec = MAX(1,
757 NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
758 max_pageout_nsec = MAX(min_pageout_nsec,
759 NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
760
761 /*
762 * If not called for recalculation, return and skip the remaining
763 * steps.
764 */
765 if (!recalc)
766 return;
767
768 /*
769 * Set a flag to re-evaluate the clock hand positions.
770 */
771 for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
772 reset_hands[i] = B_TRUE;
773
774 recalc_pagescanners();
775 }
776
777 /*
778 * Pageout scheduling.
779 *
780 * Schedpaging controls the rate at which the page out daemon runs by
781 * setting the global variables nscan and desscan SCHEDPAGING_HZ
782 * times a second. Nscan records the number of pages pageout has examined
783 * in its current pass; schedpaging() resets this value to zero each time
784 * it runs. Desscan records the number of pages pageout should examine
785 * in its next pass; schedpaging() sets this value based on the amount of
786 * currently available memory.
787 */
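
/*
 * For instance (illustrative, ignoring deficit and needfree): once
 * calibration is complete and free memory sits halfway between zero and
 * lotsfree, schedpaging() sets desscan to roughly the midpoint of slowscan
 * and fastscan divided by SCHEDPAGING_HZ, i.e. (slowscan + fastscan) / 2 / 4
 * pages for that quarter-second wakeup, and pageout_nsec to the midpoint of
 * min_pageout_nsec and max_pageout_nsec.
 */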
788
789 static kmutex_t pageout_mutex;
790
791 /*
792 * Pool of available async pageout putpage requests.
793 */
794 static struct async_reqs *push_req;
795 static struct async_reqs *req_freelist; /* available req structs */
796 static struct async_reqs *push_list; /* pending reqs */
797 static kmutex_t push_lock; /* protects req pool */
798 static kcondvar_t push_cv;
799
800 /*
801 * If pageout() is stuck on a single push for this many seconds,
802 * pageout_deadman() will assume the system has hit a memory deadlock. If set
803 * to 0, the deadman will have no effect.
804 *
805 * Note that we are only looking for stalls in the calls that pageout() makes
806 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
807 * I/O, which should not take long unless the underlying strategy call blocks
808 * indefinitely for memory. The actual I/O request happens (or fails) later.
809 */
810 uint_t pageout_deadman_seconds = 90;
811
812 static uint_t pageout_stucktime = 0;
813 static bool pageout_pushing = false;
814 static uint64_t pageout_pushcount = 0;
815 static uint64_t pageout_pushcount_seen = 0;
816
817 static int async_list_size = 8192; /* number of async request structs */
818
819 static void pageout_scanner(void *);
820
821 /*
822 * If a page is being shared more than "po_share" times
823 * then leave it alone- don't page it out.
824 */
825 #define MIN_PO_SHARE (8)
826 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
827 ulong_t po_share = MIN_PO_SHARE;
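
/*
 * po_share therefore starts at 8 mappings and can grow as high as
 * MAX_PO_SHARE (8 << 24, i.e. 134217728): the scanner doubles it when a
 * full lap fails to free enough memory, and schedpaging() halves it again
 * once free memory is plentiful.
 */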
828
829 /*
830 * Schedule rate for paging.
831 * Rate is linear interpolation between
832 * slowscan with lotsfree and fastscan when out of memory.
833 */
834 static void
835 schedpaging(void *arg)
836 {
837 spgcnt_t vavail;
838
839 if (freemem < lotsfree + needfree + kmem_reapahead)
840 kmem_reap();
841
842 if (freemem < lotsfree + needfree)
843 seg_preap();
844
845 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
846 kcage_cageout_wakeup();
847
848 if (mutex_tryenter(&pageout_mutex)) {
849
850 if (pageouts_running != 0)
851 goto out;
852
853 /* No pageout scanner threads running. */
854 nscan = 0;
855 vavail = freemem - deficit;
856 if (pageout_new_spread != 0)
857 vavail -= needfree;
858 vavail = clamp(vavail, 0, lotsfree);
859
860 if (needfree > 0 && pageout_new_spread == 0) {
861 /*
862 * If we've not yet collected enough samples to
863 * calculate a spread, use the old logic of kicking
864 * into high gear anytime needfree is non-zero.
865 */
866 desscan = fastscan / SCHEDPAGING_HZ;
867 } else {
868 /*
869 * Once we've calculated a spread based on system
870 * memory and usage, just treat needfree as another
871 * form of deficit.
872 */
873 spgcnt_t faststmp, slowstmp, result;
874
875 slowstmp = slowscan * vavail;
876 faststmp = fastscan * (lotsfree - vavail);
877 result = (slowstmp + faststmp) /
878 nz(lotsfree) / SCHEDPAGING_HZ;
879 desscan = (pgcnt_t)result;
880 }
881
882 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
883 (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
884
885 DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
886 pageout_nsec);
887
888 if (pageout_new_spread != 0 && despagescanners != 0 &&
889 despagescanners != n_page_scanners) {
890 /*
891 * We have finished the pagescan initialisation and the
892 * desired number of page scanners has changed, either
893 * because initialisation just finished, because of a
894 * memory DR, or because despagescanners has been
			 * modified on the fly (e.g. by mdb).
896 */
897 uint_t i, curr_nscan = n_page_scanners;
898
899 /* Re-validate despagescanners */
900 recalc_pagescanners();
901
902 n_page_scanners = despagescanners;
903
904 for (i = 0; i < MAX_PSCAN_THREADS; i++)
905 reset_hands[i] = B_TRUE;
906
907 /* If we need more scanners, start them now. */
908 if (n_page_scanners > curr_nscan) {
909 for (i = curr_nscan; i < n_page_scanners; i++) {
910 (void) lwp_kernel_create(proc_pageout,
911 pageout_scanner,
912 (void *)(uintptr_t)i, TS_RUN,
913 pageout_pri);
914 }
915 }
916
917 /*
918 * If the number of scanners has decreased, trigger a
919 * wakeup so that the excess threads will terminate.
920 */
921 if (n_page_scanners < curr_nscan) {
922 WAKE_PAGEOUT_SCANNER();
923 }
924 }
925
926 zones_over = B_FALSE;
927
928 if (PAGE_SCAN_STARTUP) {
929 /*
930 * We still need to measure the rate at which the
931 * system is able to scan pages of memory. Each of
932 * these initial samples is a scan of as much system
933 * memory as practical, regardless of whether or not we
934 * are experiencing memory pressure.
935 */
936 desscan = total_pages;
937 pageout_nsec = max_pageout_nsec;
938
939 DTRACE_PROBE(schedpage__wake__sample);
940 WAKE_PAGEOUT_SCANNER();
941 } else if (freemem < lotsfree + needfree) {
942 /*
943 * We need more memory.
944 */
945 low_mem_scan++;
946
947 DTRACE_PROBE(schedpage__wake__low);
948 WAKE_PAGEOUT_SCANNER();
949 } else if (zone_num_over_cap > 0) {
950 /*
			 * One or more zones are over their cap.
952 */
953
954 /* No page limit */
955 desscan = total_pages;
956
957 /*
958 * Increase the scanning CPU% to the max. This implies
959 * 80% of one CPU/sec if the scanner can run each
960 * opportunity. Can also be tuned via setting
961 * zone_pageout_nsec in /etc/system or with mdb.
962 */
963 pageout_nsec = (zone_pageout_nsec != 0) ?
964 zone_pageout_nsec : max_pageout_nsec;
965
966 zones_over = B_TRUE;
967 zone_cap_scan++;
968
969 DTRACE_PROBE(schedpage__wake__zone);
970 WAKE_PAGEOUT_SCANNER();
971 } else {
972 /*
973 * There are enough free pages, no need to
974 * kick the scanner thread. And next time
975 * around, keep more of the `highly shared'
976 * pages.
977 */
978 cv_signal_pageout();
979 if (po_share > MIN_PO_SHARE) {
980 po_share >>= 1;
981 }
982 }
983 out:
984 mutex_exit(&pageout_mutex);
985 }
986
987 /*
988 * Signal threads waiting for available memory.
989 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 * in this case it is not needed - the waiters will be woken up during
991 * the next invocation of this function.
992 */
993 if (kmem_avail() > 0)
994 cv_broadcast(&memavail_cv);
995
996 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
997 }
998
999 pgcnt_t pushes;
1000 ulong_t push_list_size; /* # of requests on pageout queue */
1001
1002 /*
1003 * Paging out should always be enabled. This tunable exists to hold pageout
1004 * for debugging purposes. If set to 0, pageout_scanner() will go back to
1005 * sleep each time it is woken by schedpaging().
1006 */
1007 uint_t dopageout = 1;
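
/*
 * As an illustrative example only: scanning could be suspended on a live
 * system with "echo 'dopageout/W 0' | mdb -kw" and re-enabled by writing 1
 * back; the scanner threads still wake up, but immediately go back to sleep
 * while the value is 0.
 */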
1008
1009 /*
1010 * The page out daemon, which runs as process 2.
1011 *
1012 * The daemon treats physical memory as a circular array of pages and scans
1013 * the pages using a 'two-handed clock' algorithm. The front hand moves
1014 * through the pages, clearing the reference bit. The back hand travels a
1015 * distance (handspreadpages) behind the front hand, freeing the pages that
1016 * have not been referenced in the time since the front hand passed. If
1017 * modified, they are first written to their backing store before being
1018 * freed.
1019 *
1020 * In order to make page invalidation more responsive on machines with
1021 * larger memory, multiple pageout_scanner threads may be created. In this
1022 * case, each thread is given a segment of the memory "clock face" so that
1023 * memory can be reclaimed more quickly.
1024 *
 * As long as there are at least lotsfree free pages and no zones are over
 * their cap, pageout_scanner threads are not run. When the threads are
 * running because free memory is low, all pages are considered for pageout.
 * When they are running only because one or more zones are over their cap,
 * only pages belonging to a zone over its cap will be considered for
 * pageout.
1030 *
1031 * There are multiple threads that act on behalf of the pageout process. A
 * set of threads scans pages (pageout_scanner) and frees them up if they
1033 * don't require any VOP_PUTPAGE operation. If a page must be written back
1034 * to its backing store, the request is put on a list and the other
1035 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
1036 * requests from the list, and processes them. Some filesystems may require
1037 * resources for the VOP_PUTPAGE operations (like memory) and hence can
1038 * block the pageout thread, but the scanner thread can still operate.
1039 * There is still no guarantee that memory deadlocks cannot occur.
1040 */
1041 void
1042 pageout()
1043 {
1044 struct async_reqs *arg;
1045 int i;
1046 pgcnt_t max_pushes;
1047 callb_cpr_t cprinfo;
1048
1049 proc_pageout = ttoproc(curthread);
1050 proc_pageout->p_cstime = 0;
1051 proc_pageout->p_stime = 0;
1052 proc_pageout->p_cutime = 0;
1053 proc_pageout->p_utime = 0;
1054 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1055 bcopy("pageout", PTOU(curproc)->u_comm, 7);
1056
1057 /*
1058 * Create pageout scanner thread
1059 */
1060 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1061 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1062
1063 /*
1064 * Allocate and initialize the async request structures
1065 * for pageout.
1066 */
1067 push_req = (struct async_reqs *)
1068 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1069
1070 req_freelist = push_req;
1071 for (i = 0; i < async_list_size - 1; i++) {
1072 push_req[i].a_next = &push_req[i + 1];
1073 }
1074
1075 pageout_pri = curthread->t_pri - 1;
1076
1077 /* Create the first pageout scanner thread. */
1078 (void) lwp_kernel_create(proc_pageout, pageout_scanner,
1079 (void *)0, /* this is instance 0, not NULL */
1080 TS_RUN, pageout_pri);
1081
1082 /*
1083 * kick off pageout scheduler.
1084 */
1085 schedpaging(NULL);
1086
1087 /*
1088 * Create kernel cage thread.
1089 * The kernel cage thread is started under the pageout process
1090 * to take advantage of the less restricted page allocation
1091 * in page_create_throttle().
1092 */
1093 kcage_cageout_init();
1094
1095 /*
1096 * Limit pushes to avoid saturating pageout devices.
1097 */
1098 max_pushes = maxpgio / SCHEDPAGING_HZ;
1099 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1100
1101 for (;;) {
1102 mutex_enter(&push_lock);
1103
1104 while ((arg = push_list) == NULL || pushes > max_pushes) {
1105 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1106 cv_wait(&push_cv, &push_lock);
1107 pushes = 0;
1108 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1109 }
1110 push_list = arg->a_next;
1111 arg->a_next = NULL;
1112 pageout_pushing = true;
1113 mutex_exit(&push_lock);
1114
1115 DTRACE_PROBE(pageout__push);
1116
1117 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1118 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1119 pushes++;
1120 }
1121
1122 /* vp held by checkpage() */
1123 VN_RELE(arg->a_vp);
1124
1125 mutex_enter(&push_lock);
1126 pageout_pushing = false;
1127 pageout_pushcount++;
1128 arg->a_next = req_freelist; /* back on freelist */
1129 req_freelist = arg;
1130 push_list_size--;
1131 mutex_exit(&push_lock);
1132 }
1133 }
1134
1135 /*
1136 * Kernel thread that scans pages looking for ones to free
1137 */
1138 static void
1139 pageout_scanner(void *a)
1140 {
1141 struct page *fronthand, *backhand, *fronthandstart;
1142 struct page *regionstart, *regionend;
1143 uint_t laps;
1144 callb_cpr_t cprinfo;
1145 pgcnt_t nscan_cnt, tick;
1146 pgcnt_t pcount;
	boolean_t bhwrapping, fhwrapping;
1148 hrtime_t sample_start, sample_end;
1149 uint_t inst = (uint_t)(uintptr_t)a;
1150
1151 VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1152
1153 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
1154 mutex_enter(&pageout_mutex);
1155
1156 /*
1157 * The restart case does not attempt to point the hands at roughly
1158 * the right point on the assumption that after one circuit things
1159 * will have settled down, and restarts shouldn't be that often.
1160 */
1161 reset_hands[inst] = B_TRUE;
1162
1163 pageouts_running++;
1164 mutex_exit(&pageout_mutex);
1165
1166 loop:
1167 cv_signal_pageout();
1168
1169 mutex_enter(&pageout_mutex);
1170 pageouts_running--;
1171 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1172 cv_wait(&proc_pageout->p_cv, &pageout_mutex);
1173 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
1174 pageouts_running++;
1175 mutex_exit(&pageout_mutex);
1176
1177 /*
1178 * Check if pageout has been disabled for debugging purposes.
1179 */
1180 if (!dopageout) {
1181 goto loop;
1182 }
1183
1184 /*
1185 * One may reset the clock hands and scanned region for debugging
1186 * purposes. Hands will also be reset on first thread startup, if
1187 * the number of scanning threads (n_page_scanners) changes, or if
1188 * memory is added to, or removed from, the system.
1189 */
1190 if (reset_hands[inst]) {
1191 struct page *first;
1192
1193 reset_hands[inst] = B_FALSE;
1194
1195 if (inst >= n_page_scanners) {
1196 /*
1197 * The desired number of page scanners has been
1198 * reduced and this instance is no longer wanted.
1199 * Exit the lwp.
1200 */
1201 VERIFY3U(inst, !=, 0);
1202 DTRACE_PROBE1(pageout__exit, uint_t, inst);
1203 mutex_enter(&pageout_mutex);
1204 pageouts_running--;
1205 mutex_exit(&pageout_mutex);
1206 mutex_enter(&curproc->p_lock);
1207 lwp_exit();
1208 /* NOTREACHED */
1209 }
1210
1211 first = page_first();
1212
1213 /*
1214 * Each scanner thread gets its own sector of the memory
1215 * clock face.
1216 */
1217 pgcnt_t span, offset;
1218
1219 span = looppages / n_page_scanners;
1220 VERIFY3U(span, >, handspreadpages);
1221
1222 offset = inst * span;
1223 regionstart = page_nextn(first, offset);
1224 if (inst == n_page_scanners - 1) {
1225 /* The last instance goes up to the last page */
1226 regionend = page_nextn(first, looppages - 1);
1227 } else {
1228 regionend = page_nextn(regionstart, span - 1);
1229 }
1230
1231 backhand = regionstart;
1232 fronthand = page_nextn(backhand, handspreadpages);
1233 tick = 1;
1234
1235 bhwrapping = fhwrapping = B_FALSE;
1236
1237 DTRACE_PROBE4(pageout__reset, uint_t, inst,
1238 pgcnt_t, regionstart, pgcnt_t, regionend,
1239 pgcnt_t, fronthand);
1240 }
1241
1242 /*
1243 * This CPU kstat is only incremented here and we're obviously
1244 * on this CPU, so no lock.
1245 */
1246 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1247
1248 /*
1249 * Keep track of the number of times we have scanned all the way around
1250 * the loop on this wakeup.
1251 */
1252 laps = 0;
1253
1254 /*
1255 * Track the number of pages visited during this scan so that we can
1256 * periodically measure our duty cycle.
1257 */
1258 nscan_cnt = 0;
1259 pcount = 0;
1260
1261 DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
1262 hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1263
1264 /*
1265 * Record the initial position of the front hand for this cycle so
1266 * that we can detect when the hand wraps around.
1267 */
1268 fronthandstart = fronthand;
1269
1270 sample_start = gethrtime();
1271
1272 /*
1273 * Scan the appropriate number of pages for a single duty cycle.
1274 */
1275 while (nscan_cnt < desscan) {
1276 checkpage_result_t rvfront, rvback;
1277
1278 /*
1279 * Only scan while at least one of these is true:
1280 * 1) one or more zones is over its cap
1281 * 2) there is not enough free memory
1282 * 3) during page scan startup when determining sample data
1283 */
1284 if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
1285 !zones_over) {
1286 /*
1287 * We are not sampling and enough memory has become
1288 * available that scanning is no longer required.
1289 */
1290 DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1291 break;
1292 }
1293
1294 DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1295
1296 /*
1297 * Periodically check to see if we have exceeded the CPU duty
1298 * cycle for a single wakeup.
1299 */
1300 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1301 hrtime_t pageout_cycle_nsec;
1302
1303 pageout_cycle_nsec = gethrtime() - sample_start;
1304 if (pageout_cycle_nsec >= pageout_nsec) {
1305 if (!zones_over)
1306 atomic_inc_64(&pageout_timeouts);
1307 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1308 break;
1309 }
1310 }
1311
1312 /*
1313 * If checkpage manages to add a page to the free list,
1314 * we give ourselves another couple of trips around the loop.
1315 */
1316 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1317 laps = 0;
1318 }
1319 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1320 laps = 0;
1321 }
1322
1323 ++pcount;
1324
1325 /*
1326 * This CPU kstat is only incremented here and we're obviously
1327 * on this CPU, so no lock.
1328 */
1329 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1330
1331 /*
1332 * Don't include ineligible pages in the number scanned.
1333 */
1334 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1335 nscan_cnt++;
1336 }
1337
1338 if (bhwrapping) {
1339 backhand = regionstart;
1340 bhwrapping = B_FALSE;
1341 } else {
1342 backhand = page_nextn(backhand, tick);
1343 if (backhand == regionend)
1344 bhwrapping = B_TRUE;
1345 }
1346
1347 if (fhwrapping) {
1348 fronthand = regionstart;
1349 fhwrapping = B_FALSE;
1350 } else {
1351 fronthand = page_nextn(fronthand, tick);
1352 if (fronthand == regionend)
1353 fhwrapping = B_TRUE;
1354 }
1355
1356 /*
1357 * The front hand has wrapped around during this wakeup.
1358 */
1359 if (fronthand == fronthandstart) {
1360 laps++;
1361 DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
1362 uint_t, laps);
1363
1364 /*
1365 * This CPU kstat is only incremented here and we're
1366 * obviously on this CPU, so no lock.
1367 */
1368 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1369
1370 /*
			 * When we wrap around memory we want to try to
			 * reclaim more pages.
			 * If we are scanning only because zones are over
			 * their cap, then wrapping is common and we simply
			 * keep going.
1375 */
1376 if (laps > 1 && freemem < lotsfree + needfree) {
1377 /*
1378 * Extremely unlikely, but it happens.
1379 * We went around the loop at least once
1380 * and didn't get far enough.
1381 * If we are still skipping `highly shared'
1382 * pages, skip fewer of them. Otherwise,
1383 * give up till the next clock tick.
1384 */
1385 if (po_share < MAX_PO_SHARE) {
1386 po_share <<= 1;
1387 } else {
1388 break;
1389 }
1390 }
1391 }
1392 }
1393
1394 sample_end = gethrtime();
1395 atomic_add_long(&nscan, nscan_cnt);
1396
1397 DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
	    pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1399
1400 /*
1401 * The global variables used below are only modified by this thread and
1402 * only during initial scanning when there is a single page scanner
1403 * thread running.
1404 */
1405 if (pageout_new_spread == 0) {
1406 VERIFY3U(inst, ==, 0);
1407
1408 if (PAGE_SCAN_STARTUP) {
1409 /*
1410 * Continue accumulating samples until we have enough
1411 * to get a reasonable value for average scan rate.
1412 */
1413 pageout_sample_pages += pcount;
1414 pageout_sample_etime += sample_end - sample_start;
1415 ++pageout_sample_cnt;
1416 }
1417
1418 if (!PAGE_SCAN_STARTUP) {
1419 /*
1420 * We have enough samples, set the spread.
1421 */
1422 pageout_rate = (hrrate_t)pageout_sample_pages *
1423 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1424 pageout_new_spread = pageout_rate / 10;
1425 setupclock();
1426 }
1427 }
1428
1429 goto loop;
1430 }
1431
1432 /*
1433 * The pageout deadman is run once per second by clock().
1434 */
1435 void
1436 pageout_deadman(void)
1437 {
1438 if (panicstr != NULL) {
1439 /*
1440 * There is no pageout after panic.
1441 */
1442 return;
1443 }
1444
1445 if (pageout_deadman_seconds == 0) {
1446 /*
1447 * The deadman is not enabled.
1448 */
1449 return;
1450 }
1451
1452 if (!pageout_pushing) {
1453 goto reset;
1454 }
1455
1456 /*
1457 * We are pushing a page. Check to see if it is the same call we saw
1458 * last time we looked:
1459 */
1460 if (pageout_pushcount != pageout_pushcount_seen) {
1461 /*
1462 * It is a different call from the last check, so we are not
1463 * stuck.
1464 */
1465 goto reset;
1466 }
1467
1468 if (++pageout_stucktime >= pageout_deadman_seconds) {
1469 panic("pageout_deadman: stuck pushing the same page for %d "
1470 "seconds (freemem is %lu)", pageout_deadman_seconds,
1471 freemem);
1472 }
1473
1474 return;
1475
1476 reset:
1477 /*
1478 * Reset our tracking state to reflect that we are not stuck:
1479 */
1480 pageout_stucktime = 0;
1481 pageout_pushcount_seen = pageout_pushcount;
1482 }
1483
1484 /*
1485 * Look at the page at hand. If it is locked (e.g., for physical i/o),
1486 * system (u., page table) or free, then leave it alone. Otherwise,
1487 * if we are running the front hand, turn off the page's reference bit.
1488 * If the proc is over maxrss, we take it. If running the back hand,
1489 * check whether the page has been reclaimed. If not, free the page,
1490 * pushing it to disk first if necessary.
1491 *
1492 * Return values:
1493 * CKP_INELIGIBLE if the page is not a candidate at all,
1494 * CKP_NOT_FREED if the page was not freed, or
1495 * CKP_FREED if we freed it.
1496 */
1497 static checkpage_result_t
1498 checkpage(struct page *pp, pageout_hand_t whichhand)
1499 {
1500 int ppattr;
1501 int isfs = 0;
1502 int isexec = 0;
1503 int pagesync_flag;
1504 zoneid_t zid = ALL_ZONES;
1505
1506 /*
1507 * Skip pages:
	 *	- associated with the kernel vnode, since they are always
	 *	    "exclusively" locked;
	 *	- that are free;
	 *	- that are shared more than po_share times; or
	 *	- that are already locked.
1513 *
1514 * NOTE: These optimizations assume that reads are atomic.
1515 */
1516
1517 if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1518 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1519 hat_page_checkshare(pp, po_share)) {
1520 return (CKP_INELIGIBLE);
1521 }
1522
1523 if (!page_trylock(pp, SE_EXCL)) {
1524 /*
1525 * Skip the page if we can't acquire the "exclusive" lock.
1526 */
1527 return (CKP_INELIGIBLE);
1528 } else if (PP_ISFREE(pp)) {
1529 /*
1530 * It became free between the above check and our actually
1531 * locking the page. Oh well, there will be other pages.
1532 */
1533 page_unlock(pp);
1534 return (CKP_INELIGIBLE);
1535 }
1536
1537 /*
1538 * Reject pages that cannot be freed. The page_struct_lock
1539 * need not be acquired to examine these
1540 * fields since the page has an "exclusive" lock.
1541 */
1542 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1543 page_unlock(pp);
1544 return (CKP_INELIGIBLE);
1545 }
1546
1547 if (zones_over) {
		ASSERT(pp->p_zoneid == ALL_ZONES ||
		    (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
1550 if (pp->p_zoneid == ALL_ZONES ||
1551 zone_pdata[pp->p_zoneid].zpers_over == 0) {
1552 /*
			 * Cross-zone shared page, or zone not over its cap.
1554 * Leave the page alone.
1555 */
1556 page_unlock(pp);
1557 return (CKP_INELIGIBLE);
1558 }
1559 zid = pp->p_zoneid;
1560 }
1561
1562 /*
1563 * Maintain statistics for what we are freeing
1564 */
1565 if (pp->p_vnode != NULL) {
1566 if (pp->p_vnode->v_flag & VVMEXEC)
1567 isexec = 1;
1568
1569 if (!IS_SWAPFSVP(pp->p_vnode))
1570 isfs = 1;
1571 }
1572
1573 /*
1574 * Turn off REF and MOD bits with the front hand.
1575 * The back hand examines the REF bit and always considers
1576 * SHARED pages as referenced.
1577 */
1578 if (whichhand == POH_FRONT) {
1579 pagesync_flag = HAT_SYNC_ZERORM;
1580 } else {
1581 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1582 HAT_SYNC_STOPON_SHARED;
1583 }
1584
1585 ppattr = hat_pagesync(pp, pagesync_flag);
1586
1587 recheck:
1588 /*
	 * If the page is referenced, make it unreferenced but reclaimable.
1590 * If this page is not referenced, then it must be reclaimable
1591 * and we can add it to the free list.
1592 */
1593 if (ppattr & P_REF) {
1594 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1595 pageout_hand_t, whichhand);
1596
1597 if (whichhand == POH_FRONT) {
1598 /*
1599 * Checking of rss or madvise flags needed here...
1600 *
1601 * If not "well-behaved", fall through into the code
1602 * for not referenced.
1603 */
1604 hat_clrref(pp);
1605 }
1606
1607 /*
1608 * Somebody referenced the page since the front
1609 * hand went by, so it's not a candidate for
1610 * freeing up.
1611 */
1612 page_unlock(pp);
1613 return (CKP_NOT_FREED);
1614 }
1615
1616 VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1617
1618 /*
1619 * If large page, attempt to demote it. If successfully demoted,
1620 * retry the checkpage.
1621 */
1622 if (pp->p_szc != 0) {
1623 if (!page_try_demote_pages(pp)) {
1624 VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1625 page_unlock(pp);
1626 return (CKP_INELIGIBLE);
1627 }
1628
1629 ASSERT(pp->p_szc == 0);
1630 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1631
1632 /*
1633 * Since page_try_demote_pages() could have unloaded some
1634 * mappings it makes sense to reload ppattr.
1635 */
1636 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1637 }
1638
1639 /*
1640 * If the page is currently dirty, we have to arrange to have it
1641 * cleaned before it can be freed.
1642 *
1643 * XXX - ASSERT(pp->p_vnode != NULL);
1644 */
1645 if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1646 struct vnode *vp = pp->p_vnode;
1647 u_offset_t offset = pp->p_offset;
1648
1649 /*
1650 * XXX - Test for process being swapped out or about to exit?
1651 * [Can't get back to process(es) using the page.]
1652 */
1653
1654 /*
1655 * Hold the vnode before releasing the page lock to
1656 * prevent it from being freed and re-used by some
1657 * other thread.
1658 */
1659 VN_HOLD(vp);
1660 page_unlock(pp);
1661
1662 /*
1663 * Queue I/O request for the pageout thread.
1664 */
1665 if (!queue_io_request(vp, offset)) {
1666 VN_RELE(vp);
1667 return (CKP_NOT_FREED);
1668 }
1669 if (isfs) {
1670 zone_pageout_stat(zid, ZPO_DIRTY);
1671 } else {
1672 zone_pageout_stat(zid, ZPO_ANONDIRTY);
1673 }
1674 return (CKP_FREED);
1675 }
1676
1677 /*
1678 * Now we unload all the translations and put the page back on to the
1679 * free list. If the page was used (referenced or modified) after the
1680 * pagesync but before it was unloaded we catch it and handle the page
1681 * properly.
1682 */
1683 DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1684 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1685 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1686 if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1687 goto recheck;
1688 }
1689
1690 VN_DISPOSE(pp, B_FREE, 0, kcred);
1691
1692 CPU_STATS_ADD_K(vm, dfree, 1);
1693
1694 if (isfs) {
1695 if (isexec) {
1696 CPU_STATS_ADD_K(vm, execfree, 1);
1697 } else {
1698 CPU_STATS_ADD_K(vm, fsfree, 1);
1699 }
1700 zone_pageout_stat(zid, ZPO_FS);
1701 } else {
1702 CPU_STATS_ADD_K(vm, anonfree, 1);
1703 zone_pageout_stat(zid, ZPO_ANON);
1704 }
1705
1706 return (CKP_FREED);
1707 }
1708
1709 /*
1710 * Queue async i/o request from pageout_scanner and segment swapout
1711 * routines on one common list. This ensures that pageout devices (swap)
1712 * are not saturated by pageout_scanner or swapout requests.
1713 * The pageout thread empties this list by initiating i/o operations.
1714 */
1715 int
1716 queue_io_request(vnode_t *vp, u_offset_t off)
1717 {
1718 struct async_reqs *arg;
1719
1720 /*
1721 * If we cannot allocate an async request struct,
1722 * skip this page.
1723 */
1724 mutex_enter(&push_lock);
1725 if ((arg = req_freelist) == NULL) {
1726 mutex_exit(&push_lock);
1727 return (0);
1728 }
1729 req_freelist = arg->a_next; /* adjust freelist */
1730 push_list_size++;
1731
1732 arg->a_vp = vp;
1733 arg->a_off = off;
1734 arg->a_len = PAGESIZE;
1735 arg->a_flags = B_ASYNC | B_FREE;
1736 arg->a_cred = kcred; /* always held */
1737
1738 /*
1739 * Add to list of pending write requests.
1740 */
1741 arg->a_next = push_list;
1742 push_list = arg;
1743
1744 if (req_freelist == NULL) {
1745 /*
1746 * No free async requests left. The lock is held so we
1747 * might as well signal the pusher thread now.
1748 */
1749 cv_signal(&push_cv);
1750 }
1751 mutex_exit(&push_lock);
1752 return (1);
1753 }
1754
1755 /*
1756 * Wakeup pageout to initiate i/o if push_list is not empty.
1757 */
1758 void
1759 cv_signal_pageout()
1760 {
1761 if (push_list != NULL) {
1762 mutex_enter(&push_lock);
1763 cv_signal(&push_cv);
1764 mutex_exit(&push_lock);
1765 }
1766 }