/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
volatile pgcnt_t slowscan = 0;
volatile pgcnt_t fastscan = 0;

volatile pgcnt_t handspreadpages = 0;
static int loopfraction = 2;
static pgcnt_t looppages;
volatile int min_percent_cpu = 4;
static int max_percent_cpu = 80;
static pgcnt_t maxfastscan = 0;
static pgcnt_t maxslowscan = 100;

volatile pgcnt_t maxpgio = 0;
volatile pgcnt_t minfree = 0;
volatile pgcnt_t desfree = 0;
volatile pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
volatile pgcnt_t throttlefree = 0;
volatile pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *	ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *	ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *	Number of clock ticks budgeted for each wakeup cycle.
 *	Computed each time around by schedpaging().
 *	Varies between min_pageout_ticks .. max_pageout_ticks,
 *	depending on memory pressure.
 *
 * pageout_lbolt:
 *	Timestamp of the last time pageout_scanner woke up and started
 *	(or resumed) scanning for not recently referenced pages.
 */

static clock_t min_pageout_ticks;
static clock_t max_pageout_ticks;
static clock_t pageout_ticks;
static clock_t pageout_lbolt;

static uint_t reset_hands;

#define	PAGES_POLL_MASK	1023
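
/*
 * The scanner consults its %CPU budget only once every
 * (PAGES_POLL_MASK + 1) pages visited (every 1024th page with the value
 * above); see the budget check in pageout_scanner():
 *
 *	if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK)
 *		compare elapsed ticks against pageout_ticks;
 *
 * which amortizes the cost of reading the clock over many page visits.
 */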

/*
 * pageout_sample_lim:
 *	The limit on the number of samples needed to establish a value
 *	for new pageout parameters, fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *	Current sample number.  Once the sample gets large enough,
 *	set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *	The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *	The accumulated nanoseconds of elapsed time for the sample.
 *
 * pageout_rate:
 *	Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *	The new value to use for fastscan and handspreadpages.
 *	Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static clock_t	pageout_cycle_ticks;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;
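
/*
 * Worked example (hypothetical numbers): if the sampling wakeups together
 * scan 1,000,000 pages over 2 seconds of scanner run time
 * (pageout_sample_etime == 2 * NANOSEC), then at the end of sampling
 * pageout_scanner() computes
 *
 *	pageout_rate = 1000000 * NANOSEC / (2 * NANOSEC)
 *		     = 500000 pages/sec
 *	pageout_new_spread = pageout_rate / 10 = 50000 pages
 *
 * and the new spread is fed back into setupclock() as the basis for
 * maxfastscan and handspreadpages.
 */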

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif	/* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t memavail_lock;
kcondvar_t memavail_cv;

/*
 * The size of the clock loop.
 */
#define	LOOPPAGES	total_pages

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices are known.
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *	lotsfree = btop(512K)
 *	desfree = btop(200K)
 *	minfree = btop(100K)
 *	throttlefree = INT_MIN
 *	max_percent_cpu = 4
 */
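/*
 * Worked example (hypothetical, not a measured configuration): on a
 * 1 GB system with 4 KB pages and no /etc/system overrides,
 * looppages = 262144, so the defaults below come out to:
 *
 *	lotsfree	= MAX(262144 / 64, btop(512K)) = 4096 pages (16 MB)
 *	desfree		= lotsfree / 2		= 2048 pages
 *	minfree		= desfree / 2		= 1024 pages
 *	throttlefree	= minfree		= 1024 pages
 *	pageout_reserve	= throttlefree / 2	= 512 pages
 */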
void
setupclock(int recalc)
{

	static spgcnt_t init_lfree, init_dfree, init_mfree;
	static spgcnt_t init_tfree, init_preserve, init_mpgio;
	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

	looppages = LOOPPAGES;

	/*
	 * setupclock can now be called to recalculate the paging
	 * parameters in the case of dynamic addition of memory.
	 * So to make sure we make the proper calculations, if such a
	 * situation should arise, we save away the initial values
	 * of each parameter so we can recall them when needed. This
	 * way we don't lose the settings an admin might have made
	 * through the /etc/system file.
	 */

	if (!recalc) {
		init_lfree = lotsfree;
		init_dfree = desfree;
		init_mfree = minfree;
		init_tfree = throttlefree;
		init_preserve = pageout_reserve;
		init_mpgio = maxpgio;
		init_mfscan = maxfastscan;
		init_fscan = fastscan;
		init_sscan = slowscan;
		init_hspages = handspreadpages;
	}

	/*
	 * Set up thresholds for paging:
	 */

	/*
	 * Lotsfree is the threshold at which the paging daemon turns on.
	 */
	if (init_lfree == 0 || init_lfree >= looppages)
		lotsfree = MAX(looppages / 64, btop(512 * 1024));
	else
		lotsfree = init_lfree;

	/*
	 * Desfree is the amount of memory desired free.
	 * If less than this for an extended period, start swapping.
	 */
	if (init_dfree == 0 || init_dfree >= lotsfree)
		desfree = lotsfree / 2;
	else
		desfree = init_dfree;

	/*
	 * Minfree is the minimal amount of free memory that is tolerable.
	 */
	if (init_mfree == 0 || init_mfree >= desfree)
		minfree = desfree / 2;
	else
		minfree = init_mfree;

	/*
	 * Throttlefree is the point at which we start throttling
	 * PG_WAIT requests until enough memory becomes available.
	 */
	if (init_tfree == 0 || init_tfree >= desfree)
		throttlefree = minfree;
	else
		throttlefree = init_tfree;

	/*
	 * Pageout_reserve is the number of pages that we keep in
	 * stock for pageout's own use.  Having a few such pages
	 * provides insurance against system deadlock due to
	 * pageout needing pages.  When freemem < pageout_reserve,
	 * non-blocking allocations are denied to any threads
	 * other than pageout and sched.  (At some point we might
	 * want to consider a per-thread flag like T_PUSHING_PAGES
	 * to indicate that a thread is part of the page-pushing
	 * dance (e.g. an interrupt thread) and thus is entitled
	 * to the same special dispensation we accord pageout.)
	 */
	if (init_preserve == 0 || init_preserve >= throttlefree)
		pageout_reserve = throttlefree / 2;
	else
		pageout_reserve = init_preserve;

	/*
	 * Maxpgio limits how much paging is acceptable.
	 * This figures that 2/3 busy on an arm is all that is
	 * tolerable for paging.  We assume one operation per disk rev.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (init_mpgio == 0)
		maxpgio = (DISKRPM * 2) / 3;
	else
		maxpgio = init_mpgio;

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (init_mfscan == 0) {
		if (pageout_new_spread != 0)
			maxfastscan = pageout_new_spread;
		else
			maxfastscan = MAXHANDSPREADPAGES;
	} else {
		maxfastscan = init_mfscan;
	}
	if (init_fscan == 0)
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	else
		fastscan = init_fscan;
	if (fastscan > looppages / loopfraction)
		fastscan = looppages / loopfraction;

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (init_sscan == 0)
		slowscan = MIN(fastscan / 10, maxslowscan);
	else
		slowscan = init_sscan;
	if (slowscan > fastscan / 2)
		slowscan = fastscan / 2;
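
	/*
	 * Continuing the hypothetical 1 GB / 4 KB page example from
	 * above: before sampling completes (pageout_new_spread == 0),
	 * maxfastscan is MAXHANDSPREADPAGES (64 Meg, i.e. 16384 4 KB
	 * pages), so fastscan = MIN(262144 / 2, 16384) = 16384 pages/sec
	 * and slowscan = MIN(16384 / 10, 100) = 100 pages/sec.
	 */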

	/*
	 * Handspreadpages is the distance (in pages) between the front
	 * and back pageout daemon hands.  The amount of time to reclaim
	 * a page once pageout examines it increases with this distance
	 * and decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (init_hspages == 0)
		handspreadpages = fastscan;
	else
		handspreadpages = init_hspages;

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
	 * for the back hand to look at a page during the same wakeup of
	 * the pageout daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages)
		handspreadpages = looppages - 1;

	/*
	 * If we have been called to recalculate the parameters,
	 * set a flag to re-evaluate the clock hand pointers.
	 */
	if (recalc)
		reset_hands = 1;
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define	RATETOSCHEDPAGING	4	/* times per second */

static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;
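
/*
 * po_share adapts at run time: pageout_scanner() doubles it (up to
 * MAX_PO_SHARE) when a full revolution of the clock hands fails to make
 * progress, and schedpaging() halves it (down to MIN_PO_SHARE) when
 * there is enough free memory.
 */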

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
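/*
 * Worked example (hypothetical values): with lotsfree = 4096,
 * vavail = 2048 (halfway), slowscan = 100 and fastscan = 16384,
 * schedpaging() computes
 *
 *	desscan = (100 * 2048 + 16384 * (4096 - 2048)) / 4096 / 4
 *		= 8242 / 4
 *		= 2060 pages per wakeup,
 *
 * a scan rate halfway between slowscan and fastscan, spread over the
 * RATETOSCHEDPAGING wakeups in each second.
 */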
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	if (mutex_tryenter(&pageout_mutex)) {
		/* pageout() not running */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		if (vavail < 0)
			vavail = 0;
		if (vavail > lotsfree)
			vavail = lotsfree;

		/*
		 * Fix for 1161438 (CRS SPR# 73922).  All variables
		 * in the original calculation for desscan were 32 bit signed
		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
		 * more of memory, the calculation can overflow.  When this
		 * happens, desscan becomes negative and pageout_scanner()
		 * stops paging out.
		 */
		if ((needfree) && (pageout_new_spread == 0)) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / RATETOSCHEDPAGING;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / RATETOSCHEDPAGING;
			desscan = (pgcnt_t)result;
		}

		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

		if (freemem < lotsfree + needfree ||
		    pageout_sample_cnt < pageout_sample_lim) {
			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
			    "pageout_cv_signal:freemem %ld", freemem);
			cv_signal(&proc_pageout->p_cv);
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast,
	 * but in this case it is not needed - the waiters will be woken up
	 * during the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}

pgcnt_t pushes;
ulong_t push_list_size;		/* # of requests on pageout queue */

#define	FRONT	1
#define	BACK	2

int dopageout = 1;	/* must be non-zero to turn page stealing on */

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are two threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++)
		push_req[i].a_next = &push_req[i + 1];

	pageout_pri = curthread->t_pri;

	/* Create the pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / RATETOSCHEDPAGING;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		mutex_exit(&push_lock);

		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void)
{
	struct page *fronthand, *backhand;
	uint_t count;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_limit;
	pgcnt_t	pcount;

	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pageout_mutex);

	/*
	 * The restart case does not attempt to point the hands at roughly
	 * the right point on the assumption that after one circuit things
	 * will have settled down - and restarts shouldn't be that often.
	 */

	/*
	 * Set the two clock hands to be separated by a reasonable amount,
	 * but no more than 360 degrees apart.
	 */
	backhand = page_first();
	if (handspreadpages >= total_pages)
		fronthand = page_nextn(backhand, total_pages - 1);
	else
		fronthand = page_nextn(backhand, handspreadpages);

	min_pageout_ticks = MAX(1,
	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
	max_pageout_ticks = MAX(min_pageout_ticks,
	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
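
	/*
	 * For example, with hz = 100 these come out to
	 * MAX(1, (100 * 4 / 100) / 4) = 1 tick and
	 * (100 * 80 / 100) / 4 = 20 ticks per wakeup, matching the 4%
	 * and 80% duty cycles worked out at the top of this file.
	 */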

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

	if (!dopageout)
		goto loop;

	if (reset_hands) {
		reset_hands = 0;

		backhand = page_first();
		if (handspreadpages >= total_pages)
			fronthand = page_nextn(backhand, total_pages - 1);
		else
			fronthand = page_nextn(backhand, handspreadpages);
	}

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
	count = 0;

	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
	    freemem, lotsfree, nscan, desscan);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);

	pcount = 0;
	if (pageout_sample_cnt < pageout_sample_lim) {
		nscan_limit = total_pages;
	} else {
		nscan_limit = desscan;
	}
	pageout_lbolt = ddi_get_lbolt();
	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 * However, stop scanning as soon as there is enough free memory.
	 * For a short while, we will be sampling the performance of the
	 * scanner and need to keep running just to get sample data, in
	 * which case we keep going and don't pay attention to whether
	 * or not there is enough free memory.
	 */
	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
	    pageout_sample_cnt < pageout_sample_lim)) {
		int rvfront, rvback;

		/*
		 * Check to see if we have exceeded our %CPU budget
		 * for this wakeup, but not on every single page visited,
		 * just every once in a while.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
			if (pageout_cycle_ticks >= pageout_ticks) {
				++pageout_timeouts;
				break;
			}
		}

		/*
		 * If checkpage manages to add a page to the free list,
		 * we give ourselves another couple of trips around the loop.
		 */
		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
			count = 0;
		if ((rvback = checkpage(backhand, BACK)) == 1)
			count = 0;

		++pcount;

		/*
		 * protected by pageout_mutex instead of cpu_stat_lock
		 */
		CPU_STATS_ADDQ(CPU, vm, scan, 1);

		/*
		 * Don't include ineligible pages in the number scanned.
		 */
		if (rvfront != -1 || rvback != -1)
			nscan++;

		backhand = page_next(backhand);

		/*
		 * backhand update and wraparound check are done separately
		 * because lint barks when it finds an empty "if" body
		 */

		if ((fronthand = page_next(fronthand)) == page_first()) {
			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
			    "pageout_hand_wrap:freemem %ld whichhand %d",
			    freemem, FRONT);

			/*
			 * protected by pageout_mutex instead of cpu_stat_lock
			 */
			CPU_STATS_ADDQ(CPU, vm, rev, 1);
			if (++count > 1) {
				/*
				 * Extremely unlikely, but it happens.
				 * We went around the loop at least once
				 * and didn't get far enough.
				 * If we are still skipping `highly shared'
				 * pages, skip fewer of them.  Otherwise,
				 * give up till the next clock tick.
				 */
				if (po_share < MAX_PO_SHARE) {
					po_share <<= 1;
				} else {
					/*
					 * Really a "goto loop", but
					 * if someone is TRACing or
					 * TNF_PROBE_ing, at least
					 * make records to show
					 * where we are.
					 */
					break;
				}
			}
		}
	}

	sample_end = gethrtime();

	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
	    freemem, lotsfree, nscan, desscan, count);

	/* Kernel probe */
	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);

	if (pageout_sample_cnt < pageout_sample_lim) {
		pageout_sample_pages += pcount;
		pageout_sample_etime += sample_end - sample_start;
		++pageout_sample_cnt;
	}
	if (pageout_sample_cnt >= pageout_sample_lim &&
	    pageout_new_spread == 0) {
		pageout_rate = (hrrate_t)pageout_sample_pages *
		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
		pageout_new_spread = pageout_rate / 10;
		setupclock(1);
	}

	goto loop;
}

/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * a system page (u-area, page table) or free, then leave it alone.
 * Otherwise, if we are running the front hand, turn off the page's
 * reference bit.  If the proc is over maxrss, we take it.  If running
 * the back hand, check whether the page has been reclaimed.  If not,
 * free the page, pushing it to disk first if necessary.
 *
 * Return values:
 *	-1 if the page is not a candidate at all,
 *	 0 if not freed, or
 *	 1 if we freed it.
 */
static int
checkpage(struct page *pp, int whichhand)
{
	int ppattr;
	int isfs = 0;
	int isexec = 0;
	int pagesync_flag;

	/*
	 * Skip pages:
	 *	- associated with the kernel vnode since
	 *	    they are always "exclusively" locked.
	 *	- that are free
	 *	- that are shared more than po_share times
	 *	- that are already locked
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */

	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
	    hat_page_checkshare(pp, po_share)) {
		return (-1);
	}

	if (!page_trylock(pp, SE_EXCL)) {
		/*
		 * Skip the page if we can't acquire the "exclusive" lock.
		 */
		return (-1);
	} else if (PP_ISFREE(pp)) {
		/*
		 * It became free between the above check and our actually
		 * locking the page.  Oh, well there will be other pages.
		 */
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Reject pages that cannot be freed.  The page_struct_lock
	 * need not be acquired to examine these
	 * fields since the page has an "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		page_unlock(pp);
		return (-1);
	}

	/*
	 * Maintain statistics for what we are freeing
	 */

	if (pp->p_vnode != NULL) {
		if (pp->p_vnode->v_flag & VVMEXEC)
			isexec = 1;

		if (!IS_SWAPFSVP(pp->p_vnode))
			isfs = 1;
	}

	/*
	 * Turn off REF and MOD bits with the front hand.
	 * The back hand examines the REF bit and always considers
	 * SHARED pages as referenced.
	 */
	if (whichhand == FRONT)
		pagesync_flag = HAT_SYNC_ZERORM;
	else
		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
		    HAT_SYNC_STOPON_SHARED;

	ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If this page is not referenced, then it must be reclaimable
	 * and we can add it to the free list.
	 */
	if (ppattr & P_REF) {
		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
		if (whichhand == FRONT) {
			/*
			 * Checking of rss or madvise flags needed here...
			 *
			 * If not "well-behaved", fall through into the code
			 * for not referenced.
			 */
			hat_clrref(pp);
		}
		/*
		 * Somebody referenced the page since the front
		 * hand went by, so it's not a candidate for
		 * freeing up.
		 */
		page_unlock(pp);
		return (0);
	}

	VM_STAT_ADD(pageoutvmstats.checkpage[0]);

	/*
	 * If large page, attempt to demote it.  If successfully demoted,
	 * retry the checkpage.
	 */
	if (pp->p_szc != 0) {
		if (!page_try_demote_pages(pp)) {
			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
			page_unlock(pp);
			return (-1);
		}
		ASSERT(pp->p_szc == 0);
		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings, it makes sense to reload ppattr.
		 */
		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	}

	/*
	 * If the page is currently dirty, we have to arrange
	 * to have it cleaned before it can be freed.
	 *
	 * XXX - ASSERT(pp->p_vnode != NULL);
	 */
	if ((ppattr & P_MOD) && pp->p_vnode) {
		struct vnode *vp = pp->p_vnode;
		u_offset_t offset = pp->p_offset;

		/*
		 * XXX - Test for process being swapped out or about to exit?
		 * [Can't get back to process(es) using the page.]
		 */

		/*
		 * Hold the vnode before releasing the page lock to
		 * prevent it from being freed and re-used by some
		 * other thread.
		 */
		VN_HOLD(vp);
		page_unlock(pp);

		/*
		 * Queue i/o request for the pageout thread.
		 */
		if (!queue_io_request(vp, offset)) {
			VN_RELE(vp);
			return (0);
		}
		return (1);
	}

	/*
	 * Now we unload all the translations,
	 * and put the page back on to the free list.
	 * If the page was used (referenced or modified) after
	 * the pagesync but before it was unloaded, we catch it
	 * and handle the page properly.
	 */
	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
	    "pageout_free:pp %p whichhand %d", pp, whichhand);
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
		goto recheck;

	/*LINTED: constant in conditional context*/
	VN_DISPOSE(pp, B_FREE, 0, kcred);

	CPU_STATS_ADD_K(vm, dfree, 1);

	if (isfs) {
		if (isexec) {
			CPU_STATS_ADD_K(vm, execfree, 1);
		} else {
			CPU_STATS_ADD_K(vm, fsfree, 1);
		}
	} else {
		CPU_STATS_ADD_K(vm, anonfree, 1);
	}

	return (1);		/* freed a page! */
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
	struct async_reqs *arg;

	/*
	 * If we cannot allocate an async request struct,
	 * skip this page.
	 */
	mutex_enter(&push_lock);
	if ((arg = req_freelist) == NULL) {
		mutex_exit(&push_lock);
		return (0);
	}
	req_freelist = arg->a_next;	/* adjust freelist */
	push_list_size++;

	arg->a_vp = vp;
	arg->a_off = off;
	arg->a_len = PAGESIZE;
	arg->a_flags = B_ASYNC | B_FREE;
	arg->a_cred = kcred;		/* always held */

	/*
	 * Add to list of pending write requests.
	 */
	arg->a_next = push_list;
	push_list = arg;

	if (req_freelist == NULL) {
		/*
		 * No free async requests left. The lock is held so we
		 * might as well signal the pusher thread now.
		 */
		cv_signal(&push_cv);
	}
	mutex_exit(&push_lock);
	return (1);
}

/*
 * Wakeup pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
	if (push_list != NULL) {
		mutex_enter(&push_lock);
		cv_signal(&push_cv);
		mutex_exit(&push_lock);
	}
}