225 pgcnt_t lotsfree_min = 0;
226 pgcnt_t lotsfree_max = 0;
227
228 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
229 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230
231 /*
232 * If these tunables are set to non-zero values in /etc/system, and provided
233 * the value is not larger than the threshold above, the specified value will
234 * be used directly without any additional calculation or adjustment. The boot
235 * time value of these overrides is preserved in the "clockinit" struct. More
236 * detail is available in the comment at the top of the file.
237 */
238 pgcnt_t maxpgio = 0;
239 pgcnt_t minfree = 0;
240 pgcnt_t desfree = 0;
241 pgcnt_t lotsfree = 0;
242 pgcnt_t needfree = 0;
243 pgcnt_t throttlefree = 0;
244 pgcnt_t pageout_reserve = 0;
245
246 pgcnt_t deficit;
247 pgcnt_t nscan;
248 pgcnt_t desscan;
249
250 /* kstats */
251 uint64_t low_mem_scan;
252 uint64_t zone_cap_scan;
253 uint64_t n_throttle;
254
255 /*
256 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
257 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
258 * that gives the equivalent of some underlying %CPU duty cycle.
259 *
260 * min_pageout_nsec:
261 * nanoseconds/wakeup equivalent of min_percent_cpu.
262 *
263 * max_pageout_nsec:
264 * nanoseconds/wakeup equivalent of max_percent_cpu.
265 *
266 * pageout_nsec:
267 * Number of nanoseconds budgeted for each wakeup cycle.
268 * Computed each time around by schedpaging().
269 * Varies between min_pageout_nsec and max_pageout_nsec,
270 * depending on memory pressure or zones over their cap.
271 *
272 * zone_pageout_nsec:
273 * Number of nanoseconds budget for each cycle when a zone
274 * is over its memory cap. If this is zero, then the value
275 * of max_pageout_nsec is used instead.
276 */
277
278 static hrtime_t min_pageout_nsec;
279 static hrtime_t max_pageout_nsec;
280 static hrtime_t pageout_nsec;
281 static hrtime_t zone_pageout_nsec;
282
283 #define MAX_PSCAN_THREADS 16
284 static boolean_t reset_hands[MAX_PSCAN_THREADS];
285
286 /*
287 * These can be tuned in /etc/system or set with mdb.
288 * 'des_page_scanners' is the desired number of page scanner threads. The
289 * system will bring the actual number of threads into line with the desired
290 * number. If des_page_scanners is set to an invalid value, the system will
291 * correct the setting.
292 */
293 uint_t des_page_scanners;
294 uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
295
296 uint_t n_page_scanners;
297 static pgcnt_t pscan_region_sz; /* informational only */
298
299 #define PAGES_POLL_MASK 1023
300
301 /*
302 * pageout_sample_lim:
303 * The limit on the number of samples needed to establish a value for new
304 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
305 * handspreadpages.
306 *
307 * pageout_sample_cnt:
308 * Current sample number. Once the sample gets large enough, set new
309 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
310 *
311 * pageout_sample_pages:
312 * The accumulated number of pages scanned during sampling.
313 *
314 * pageout_sample_etime:
315 * The accumulated nanoseconds for the sample.
316 *
317 * pageout_rate:
318 * Rate in pages/nanosecond, computed at the end of sampling.
319 *
320 * pageout_new_spread:
321 * Initially zero while the system scan rate is measured by
322 * pageout_scanner(), which then sets this value once per system boot after
323 * enough samples have been recorded (pageout_sample_cnt). Once set, this
324 * new value is used for fastscan and handspreadpages.
325 */
326
327 typedef hrtime_t hrrate_t;
328
329 static uint64_t pageout_sample_lim = 4;
330 static uint64_t pageout_sample_cnt = 0;
331 static pgcnt_t pageout_sample_pages = 0;
332 static hrrate_t pageout_rate = 0;
333 static pgcnt_t pageout_new_spread = 0;
334
335 static hrtime_t pageout_sample_etime = 0;
336
337 /* True if page scanner is first starting up */
338 #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
339
340 /*
341 * Record number of times a pageout_scanner() wakeup cycle finished because it
342 * timed out (exceeded its CPU budget), rather than because it visited
343 * its budgeted number of pages. This is only done when scanning under low
344 * free memory conditions, not when scanning for zones over their cap.
345 */
346 uint64_t pageout_timeouts = 0;
347
348 #ifdef VM_STATS
349 static struct pageoutvmstats_str {
350 ulong_t checkpage[3];
351 } pageoutvmstats;
352 #endif /* VM_STATS */
353
354 /*
355 * Threads waiting for free memory use this condition variable and lock until
356 * memory becomes available.
357 */
358 kmutex_t memavail_lock;
359 kcondvar_t memavail_cv;
368 CKP_NOT_FREED,
369 CKP_FREED,
370 } checkpage_result_t;
371
372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
373
374 static struct clockinit {
375 bool ci_init;
376 pgcnt_t ci_lotsfree_min;
377 pgcnt_t ci_lotsfree_max;
378 pgcnt_t ci_lotsfree;
379 pgcnt_t ci_desfree;
380 pgcnt_t ci_minfree;
381 pgcnt_t ci_throttlefree;
382 pgcnt_t ci_pageout_reserve;
383 pgcnt_t ci_maxpgio;
384 pgcnt_t ci_maxfastscan;
385 pgcnt_t ci_fastscan;
386 pgcnt_t ci_slowscan;
387 pgcnt_t ci_handspreadpages;
388 } clockinit = { .ci_init = false };
389
390 static pgcnt_t
391 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
392 {
393 if (value < minimum) {
394 return (minimum);
395 } else if (value > maximum) {
396 return (maximum);
397 } else {
398 return (value);
399 }
400 }
401
402 static pgcnt_t
403 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
404 {
405 if (initval == 0 || initval >= initval_ceiling) {
406 return (defval);
407 } else {
408 return (initval);
409 }
410 }
411
412 /*
413 * Local boolean to control scanning when zones are over their cap. Avoids
414 * accessing the zone_num_over_cap variable except within schedpaging(), which
415 * only runs periodically. This is here only to reduce our access to
416 * zone_num_over_cap, since it is already accessed a lot during paging, and
417 * the page scanner accesses the zones_over variable on each page during a
418 * scan. There is no lock needed for zone_num_over_cap since schedpaging()
419 * doesn't modify the variable, it only cares if the variable is 0 or non-0.
420 */
421 static boolean_t zones_over = B_FALSE;
422
423 /*
424 * Set up the paging constants for the clock algorithm used by
425 * pageout_scanner(), and by the virtual memory system overall. See the
426 * comments at the top of this file for more information about the threshold
427 * values and system responses to memory pressure.
428 *
429 * This routine is called once by main() at startup, after the initial size of
430 * physical memory is determined. It may be called again later if memory is
431 * added to or removed from the system, or if new measurements of the page scan
432 * rate become available.
433 */
434 void
435 setupclock(void)
436 {
437 uint_t i;
438 pgcnt_t sz, tmp;
439 pgcnt_t defval;
440 bool half = (pageout_threshold_style == 1);
441 bool recalc = true;
442
443 looppages = total_pages;
444
445 /*
446 * The operator may have provided specific values for some of the
447 * tunables via /etc/system. On our first call, we preserve those
448 * values so that they can be used for subsequent recalculations.
449 *
450 * A value of zero for any tunable means we will use the default
451 * sizing.
452 */
453
454 if (!clockinit.ci_init) {
455 clockinit.ci_init = true;
456
457 clockinit.ci_lotsfree_min = lotsfree_min;
458 clockinit.ci_lotsfree_max = lotsfree_max;
459 clockinit.ci_lotsfree = lotsfree;
460 clockinit.ci_desfree = desfree;
461 clockinit.ci_minfree = minfree;
462 clockinit.ci_throttlefree = throttlefree;
463 clockinit.ci_pageout_reserve = pageout_reserve;
464 clockinit.ci_maxpgio = maxpgio;
465 clockinit.ci_maxfastscan = maxfastscan;
466 clockinit.ci_fastscan = fastscan;
467 clockinit.ci_slowscan = slowscan;
468 clockinit.ci_handspreadpages = handspreadpages;
469
470 /*
471 * The first call does not trigger a recalculation, only
472 * subsequent calls.
473 */
474 recalc = false;
475 }
476
477 /*
478 * Configure paging threshold values. For more details on what each
479 * threshold signifies, see the comments at the top of this file.
480 */
481 lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
482 btop(LOTSFREE_MAX_DEFAULT));
483 lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
484 btop(LOTSFREE_MIN_DEFAULT));
485
486 lotsfree = tune(clockinit.ci_lotsfree, looppages,
487 clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

630
631 if (fastscan > looppages / loopfraction) {
632 fastscan = looppages / loopfraction;
633 }
634
635 /*
636 * Set slow scan time to 1/10 the fast scan time, but
637 * not to exceed maxslowscan.
638 */
639 if (clockinit.ci_slowscan == 0) {
640 slowscan = MIN(fastscan / 10, maxslowscan);
641 } else {
642 slowscan = clockinit.ci_slowscan;
643 }
644
/* Keep slowscan at no more than half of fastscan regardless of source. */
645 if (slowscan > fastscan / 2) {
646 slowscan = fastscan / 2;
647 }
648
649 /*
650 * Handspreadpages is distance (in pages) between front and back
651 * pageout daemon hands. The amount of time to reclaim a page
652 * once pageout examines it increases with this distance and
653 * decreases as the scan rate rises. It must be < the amount
654 * of pageable memory.
655 *
656 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
657 * to be "fastscan" results in the front hand being a few secs
658 * (varies based on the processor speed) ahead of the back hand
659 * at fastscan rates. This distance can be further reduced, if
660 * necessary, by increasing the processor time used by pageout
661 * to be more than ~4% and preferably not more than ~10%.
662 *
663 * As a result, user processes have a much better chance of
664 * referencing their pages before the back hand examines them.
665 * This also significantly lowers the number of reclaims from
666 * the freelist since pageout does not end up freeing pages which
667 * may be referenced a sec later.
668 */
669 if (clockinit.ci_handspreadpages == 0) {
670 handspreadpages = fastscan;
671 } else {
672 handspreadpages = clockinit.ci_handspreadpages;
673 }
674
675 /*
676 * Make sure that back hand follows front hand by at least
677 * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the
678 * back hand to look at a page during the same wakeup of the pageout
679 * daemon in which the front hand cleared its ref bit.
680 */
681 if (handspreadpages >= looppages) {
682 handspreadpages = looppages - 1;
683 }
684
685 if (!recalc) {
686 /*
687 * Setup basic values at initialization.
688 */
689 pscan_region_sz = total_pages;
690 des_page_scanners = n_page_scanners = 1;
691 reset_hands[0] = B_TRUE;
692 return;
693 }
694
695 /*
696 * Recalculating
697 *
698 * We originally set the number of page scanners to 1. Now that we
699 * know what the handspreadpages is for a scanner, figure out how many
700 * scanners we should run. We want to ensure that the regions don't
701 * overlap and that they are not touching.
702 *
703 * A default 64GB region size is used as the initial value to calculate
704 * how many scanner threads we should create on lower memory systems.
705 * The idea is to limit the number of threads to a practical value
706 * (e.g. a 64GB machine really only needs one scanner thread). For very
707 * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
708 * threads.
709 *
710 * The scanner threads themselves are evenly spread out around the
711 * memory "clock" in pageout_scanner when we reset the hands, and each
712 * thread will scan all of memory.
713 */
/* 64GB expressed in pages: btop() of 64 * 2^30 bytes. */
714 sz = (btop(64ULL * 0x40000000ULL));
715 if (sz < handspreadpages) {
716 /*
717 * 64GB is smaller than the separation between the front
718 * and back hands; use double handspreadpages.
719 */
720 sz = handspreadpages << 1;
721 }
722 if (sz > total_pages) {
723 sz = total_pages;
724 }
725 /* Record region size for inspection with mdb, otherwise unused */
726 pscan_region_sz = sz;
727
/*
 * Ceiling division: count how many sz-sized regions are needed to
 * cover total_pages; that is the desired scanner thread count,
 * capped at MAX_PSCAN_THREADS below.
 */
728 tmp = sz;
729 for (i = 1; tmp < total_pages; i++) {
730 tmp += sz;
731 }
732
733 if (i > MAX_PSCAN_THREADS)
734 i = MAX_PSCAN_THREADS;
735
/*
 * schedpaging() notices the difference between des_page_scanners and
 * n_page_scanners and adjusts the number of running threads.
 */
736 des_page_scanners = i;
737 }
738
739 /*
740 * Pageout scheduling.
741 *
742 * Schedpaging controls the rate at which the page out daemon runs by
743 * setting the global variables nscan and desscan SCHEDPAGING_HZ
744 * times a second. Nscan records the number of pages pageout has examined
745 * in its current pass; schedpaging() resets this value to zero each time
746 * it runs. Desscan records the number of pages pageout should examine
747 * in its next pass; schedpaging() sets this value based on the amount of
748 * currently available memory.
749 */
750 #define SCHEDPAGING_HZ 4
751
752 static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
753
754 /*
755 * Pool of available async pageout putpage requests.
756 */
757 static struct async_reqs *push_req;
758 static struct async_reqs *req_freelist; /* available req structs */
759 static struct async_reqs *push_list; /* pending reqs */
760 static kmutex_t push_lock; /* protects req pool */
761 static kcondvar_t push_cv;
762
763 /*
764 * If pageout() is stuck on a single push for this many seconds,
765 * pageout_deadman() will assume the system has hit a memory deadlock. If set
766 * to 0, the deadman will have no effect.
767 *
768 * Note that we are only looking for stalls in the calls that pageout() makes
769 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
770 * I/O, which should not take long unless the underlying strategy call blocks
771 * indefinitely for memory. The actual I/O request happens (or fails) later.
772 */
773 uint_t pageout_deadman_seconds = 90;
774
775 static uint_t pageout_stucktime = 0;
776 static bool pageout_pushing = false;
777 static uint64_t pageout_pushcount = 0;
778 static uint64_t pageout_pushcount_seen = 0;
779
780 static int async_list_size = 256; /* number of async request structs */
781
782 static void pageout_scanner(void *);
783
784 /*
785 * If a page is being shared more than "po_share" times
786 * then leave it alone- don't page it out.
787 */
788 #define MIN_PO_SHARE (8)
789 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
790 ulong_t po_share = MIN_PO_SHARE;
791
792 /*
793 * Schedule rate for paging.
794 * Rate is linear interpolation between
795 * slowscan with lotsfree and fastscan when out of memory.
796 */
797 static void
798 schedpaging(void *arg)
799 {
800 spgcnt_t vavail;
801
/* Reap kernel memory caches if free memory is running low. */
802 if (freemem < lotsfree + needfree + kmem_reapahead)
803 kmem_reap();
804
805 if (freemem < lotsfree + needfree)
806 seg_preap();
807
808 if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
809 kcage_cageout_wakeup();
810
/* Reset the count of pages scanned for the pass now beginning. */
811 (void) atomic_swap_ulong(&nscan, 0);
812 vavail = freemem - deficit;
813 if (pageout_new_spread != 0)
814 vavail -= needfree;
815 if (vavail < 0)
816 vavail = 0;
817 if (vavail > lotsfree)
818 vavail = lotsfree;
819
820 /*
821 * Fix for 1161438 (CRS SPR# 73922). All variables
822 * in the original calculation for desscan were 32 bit signed
823 * ints. As freemem approaches 0x0 on a system with 1 Gig or
824 * more of memory, the calculation can overflow. When this
825 * happens, desscan becomes negative and pageout_scanner()
826 * stops paging out.
827 */
828 if (needfree > 0 && pageout_new_spread == 0) {
829 /*
830 * If we've not yet collected enough samples to
831 * calculate a spread, kick into high gear anytime
832 * needfree is non-zero. Note that desscan will not be
833 * the limiting factor for systems with larger memory;
834 * the %CPU will limit the scan. That will also be
835 * maxed out below.
836 */
837 desscan = fastscan / SCHEDPAGING_HZ;
838 } else {
839 /*
840 * Once we've calculated a spread based on system
841 * memory and usage, just treat needfree as another
842 * form of deficit.
843 */
844 spgcnt_t faststmp, slowstmp, result;
845
/*
 * Linear interpolation between slowscan (at lotsfree available)
 * and fastscan (at zero available), scaled per wakeup.
 */
846 slowstmp = slowscan * vavail;
847 faststmp = fastscan * (lotsfree - vavail);
848 result = (slowstmp + faststmp) /
849 nz(lotsfree) / SCHEDPAGING_HZ;
850 desscan = (pgcnt_t)result;
851 }
852
853 /*
854 * If we've not yet collected enough samples to calculate a
855 * spread, also kick %CPU to the max.
856 */
857 if (pageout_new_spread == 0) {
858 pageout_nsec = max_pageout_nsec;
859 } else {
/*
 * Interpolate the per-wakeup CPU budget the same way:
 * min_pageout_nsec at lotsfree available, max_pageout_nsec
 * at zero available.
 */
860 pageout_nsec = min_pageout_nsec +
861 (lotsfree - vavail) *
862 (max_pageout_nsec - min_pageout_nsec) /
863 nz(lotsfree);
864 }
865
866 if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
867 /*
868 * We have finished the pagescan initialization and the desired
869 * number of page scanners has changed, either because
870 * initialization just finished, because of a memory DR, or
871 * because des_page_scanners has been modified on the fly (i.e.
872 * by mdb). If we need more scanners, start them now, otherwise
873 * the excess scanners will terminate on their own when they
874 * reset their hands.
875 */
876 uint_t i;
877 uint_t curr_nscan = n_page_scanners;
878 pgcnt_t max = total_pages / handspreadpages;
879
/* Never run so many scanners that their hand spans overlap. */
880 if (des_page_scanners > max)
881 des_page_scanners = max;
882
883 if (des_page_scanners > MAX_PSCAN_THREADS) {
884 des_page_scanners = MAX_PSCAN_THREADS;
885 } else if (des_page_scanners == 0) {
886 des_page_scanners = 1;
887 }
888
889 /*
890 * Each thread has its own entry in the reset_hands array, so
891 * we don't need any locking in pageout_scanner to check the
892 * thread's reset_hands entry. Thus, we use a pre-allocated
893 * fixed size reset_hands array and upper limit on the number
894 * of pagescan threads.
895 *
896 * The reset_hands entries need to be true before we start new
897 * scanners, but if we're reducing, we don't want a race on the
898 * recalculation for the existing threads, so we set
899 * n_page_scanners first.
900 */
901 n_page_scanners = des_page_scanners;
902 for (i = 0; i < MAX_PSCAN_THREADS; i++) {
903 reset_hands[i] = B_TRUE;
904 }
905
906 if (des_page_scanners > curr_nscan) {
907 /* Create additional pageout scanner threads. */
908 for (i = curr_nscan; i < des_page_scanners; i++) {
909 (void) lwp_kernel_create(proc_pageout,
910 pageout_scanner, (void *)(uintptr_t)i,
911 TS_RUN, curthread->t_pri);
912 }
913 }
914 }
915
/* Recomputed below; only set when a zone is found over its cap. */
916 zones_over = B_FALSE;
917
918 if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
919 if (!PAGE_SCAN_STARTUP)
920 low_mem_scan++;
921 /*
922 * Either we need more memory, or we still need to
923 * measure the average scan rate. Wake the scanner.
924 */
925 DTRACE_PROBE(schedpage__wake__low);
926 WAKE_PAGEOUT_SCANNER();
927
928 } else if (zone_num_over_cap > 0) {
929 /* One or more zones are over their cap. */
930
931 /* No page limit */
932 desscan = total_pages;
933
934 /*
935 * Increase the scanning CPU% to the max. This implies
936 * 80% of one CPU/sec if the scanner can run each
937 * opportunity. Can also be tuned via setting
938 * zone_pageout_nsec in /etc/system or with mdb.
939 */
940 pageout_nsec = (zone_pageout_nsec != 0) ?
941 zone_pageout_nsec : max_pageout_nsec;
942
943 zones_over = B_TRUE;
944 zone_cap_scan++;
945
946 DTRACE_PROBE(schedpage__wake__zone);
947 WAKE_PAGEOUT_SCANNER();
948
949 } else {
950 /*
951 * There are enough free pages, no need to
952 * kick the scanner thread. And next time
953 * around, keep more of the `highly shared'
954 * pages.
955 */
956 cv_signal_pageout();
957
958 mutex_enter(&pageout_mutex);
959 if (po_share > MIN_PO_SHARE) {
960 po_share >>= 1;
961 }
962 mutex_exit(&pageout_mutex);
963 }
964
965 /*
966 * Signal threads waiting for available memory.
967 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
968 * in this case it is not needed - the waiters will be woken up during
969 * the next invocation of this function.
970 */
971 if (kmem_avail() > 0)
972 cv_broadcast(&memavail_cv);
973
/* Re-arm ourselves so we run SCHEDPAGING_HZ times per second. */
974 (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
975 }
976
977 pgcnt_t pushes;
978 ulong_t push_list_size; /* # of requests on pageout queue */
979
980 /*
981 * Paging out should always be enabled. This tunable exists to hold pageout
982 * for debugging purposes. If set to 0, pageout_scanner() will go back to
983 * sleep each time it is woken by schedpaging().
984 */
985 uint_t dopageout = 1;
986
987 /*
988 * The page out daemon, which runs as process 2.
989 *
990 * Page out occurs when either:
991 * a) there is less than lotsfree pages,
992 * b) there are one or more zones over their physical memory cap.
993 *
994 * The daemon treats physical memory as a circular array of pages and scans the
995 * pages using a 'two-handed clock' algorithm. The front hand moves through
996 * the pages, clearing the reference bit. The back hand travels a distance
997 * (handspreadpages) behind the front hand, freeing the pages that have not
998 * been referenced in the time since the front hand passed. If modified, they
999 * are first written to their backing store before being freed.
1000 *
1001 * In order to make page invalidation more responsive on machines with larger
1002 * memory, multiple pageout_scanner threads may be created. In this case, the
1003 * threads are evenly distributed around the memory "clock face" so that
1004 * memory can be reclaimed more quickly (that is, there can be large regions in
1005 * which no pages can be reclaimed by a single thread, leading to lag which
1006 * causes undesirable behavior such as htable stealing).
1007 *
1008 * As long as there are at least lotsfree pages, or no zones over their cap,
1009 * then pageout_scanner threads are not run. When pageout_scanner threads are
1010 * running for case (a), all pages are considered for pageout. For case (b),
1011 * only pages belonging to a zone over its cap will be considered for pageout.
1012 *
1013 * There are multiple threads that act on behalf of the pageout process.
1014 * A set of threads scan pages (pageout_scanner) and frees them up if
1015 * they don't require any VOP_PUTPAGE operation. If a page must be
1016 * written back to its backing store, the request is put on a list
1017 * and the other (pageout) thread is signaled. The pageout thread
1018 * grabs VOP_PUTPAGE requests from the list, and processes them.
1019 * Some filesystems may require resources for the VOP_PUTPAGE
1020 * operations (like memory) and hence can block the pageout
1021 * thread, but the pageout_scanner threads can still operate. There is still
1022 * no guarantee that memory deadlocks cannot occur.
1023 *
1024 * The pageout_scanner parameters are determined in schedpaging().
1025 */
1026 void
1027 pageout()
1028 {
1029 struct async_reqs *arg;
1030 pri_t pageout_pri;
1031 int i;
1032 pgcnt_t max_pushes;
1033 callb_cpr_t cprinfo;
1034
/* Record our proc and zero its accounting times. */
1035 proc_pageout = ttoproc(curthread);
1036 proc_pageout->p_cstime = 0;
1037 proc_pageout->p_stime = 0;
1038 proc_pageout->p_cutime = 0;
1039 proc_pageout->p_utime = 0;
1040 bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1041 bcopy("pageout", PTOU(curproc)->u_comm, 7);
1042
1043 /*
1044 * Create pageout scanner thread
1045 */
1046 mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1047 mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1048
1049 /*
1050 * Allocate and initialize the async request structures
1051 * for pageout.
1052 */
1053 push_req = (struct async_reqs *)
1054 kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055
/*
 * Chain the structs into a freelist; the final element's a_next is
 * left NULL by kmem_zalloc, terminating the list.
 */
1056 req_freelist = push_req;
1057 for (i = 0; i < async_list_size - 1; i++) {
1058 push_req[i].a_next = &push_req[i + 1];
1059 }
1060
1061 pageout_pri = curthread->t_pri;
1062
1063 /* Create the (first) pageout scanner thread. */
1064 (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065 pageout_pri - 1);
1066
1067 /*
1068 * kick off pageout scheduler.
1069 */
1070 schedpaging(NULL);
1071
1072 /*
1073 * Create kernel cage thread.
1074 * The kernel cage thread is started under the pageout process
1075 * to take advantage of the less restricted page allocation
1076 * in page_create_throttle().
1077 */
1078 kcage_cageout_init();
1079
1080 /*
1081 * Limit pushes to avoid saturating pageout devices.
1082 */
1083 max_pushes = maxpgio / SCHEDPAGING_HZ;
1084 CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1085
/*
 * Service loop: dequeue VOP_PUTPAGE requests queued by checkpage(),
 * issue them, and return the request structs to the freelist.  Sleeps
 * (CPR-safe) when the list is empty or the per-cycle push quota is
 * exhausted; pushes is reset each time we are signalled awake.
 */
1086 for (;;) {
1087 mutex_enter(&push_lock);
1088
1089 while ((arg = push_list) == NULL || pushes > max_pushes) {
1090 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1091 cv_wait(&push_cv, &push_lock);
1092 pushes = 0;
1093 CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094 }
1095 push_list = arg->a_next;
1096 arg->a_next = NULL;
1097 pageout_pushing = true;
1098 mutex_exit(&push_lock);
1099
1100 DTRACE_PROBE(pageout__push);
/* Asynchronous page push; only successful requests count. */
1101 if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102 arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103 pushes++;
1104 }
1105
1106 /* vp held by checkpage() */
1107 VN_RELE(arg->a_vp);
1108
/* pageout_pushing/pageout_pushcount are watched by the deadman. */
1109 mutex_enter(&push_lock);
1110 pageout_pushing = false;
1111 pageout_pushcount++;
1112 arg->a_next = req_freelist; /* back on freelist */
1113 req_freelist = arg;
1114 push_list_size--;
1115 mutex_exit(&push_lock);
1116 }
1117 }
1118
1119 /*
1120 * Kernel thread that scans pages looking for ones to free
1121 */
1122 static void
1123 pageout_scanner(void *a)
1124 {
1125 struct page *fronthand, *backhand;
1126 uint_t laps, iter = 0;
1127 callb_cpr_t cprinfo;
1128 pgcnt_t nscan_cnt, nscan_limit;
1129 pgcnt_t pcount;
/* Scanner instance number, passed by schedpaging()/pageout(). */
1130 uint_t inst = (uint_t)(uintptr_t)a;
1131 hrtime_t sample_start, sample_end;
1132 kmutex_t pscan_mutex;
1133 bool sampling;
1134
1135 VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136
1137 mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
1138
1139 CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140 mutex_enter(&pscan_mutex);
1141
1142 /*
1143 * Establish the minimum and maximum length of time to be spent
1144 * scanning pages per wakeup, limiting the scanner duty cycle. The
1145 * input percentage values (0-100) must be converted to a fraction of
1146 * the number of nanoseconds in a second of wall time, then further
1147 * scaled down by the number of scanner wakeups in a second:
1148 */
1149 min_pageout_nsec = MAX(1,
1150 NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151 max_pageout_nsec = MAX(min_pageout_nsec,
1152 NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
1153
/*
 * Top of one wakeup cycle: sleep until signalled by schedpaging()
 * (via WAKE_PAGEOUT_SCANNER), then scan and jump back here.
 */
1154 loop:
1155 cv_signal_pageout();
1156
1157 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158 cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159 CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
1160
1161 /*
1162 * Check if pageout has been disabled for debugging purposes:
1163 */
1164 if (!dopageout) {
1165 goto loop;
1166 }
1167
1168 /*
1169 * One may reset the clock hands for debugging purposes. Hands will
1170 * also be reset if memory is added to or removed from the system.
1171 *
1172 * NOTE(review): fronthand/backhand are only initialized here; this
1173 * relies on reset_hands[inst] being set (by setupclock/schedpaging)
1174 * before this thread's first wakeup.
1175 */
1172 if (reset_hands[inst]) {
1173 struct page *first;
1174 pgcnt_t offset = total_pages / n_page_scanners;
1175
1176 reset_hands[inst] = B_FALSE;
1177 if (inst >= n_page_scanners) {
1178 /*
1179 * The desired number of page scanners has been
1180 * reduced and this instance is no longer wanted.
1181 * Exit the lwp.
1182 */
1183 VERIFY3U(inst, !=, 0);
1184 mutex_exit(&pscan_mutex);
1185 mutex_enter(&curproc->p_lock);
1186 lwp_exit();
1187 }
1188
1189 /*
1190 * The reset case repositions the hands at the proper place
1191 * on the memory clock face to prevent creep into another
1192 * thread's active region or when the number of threads has
1193 * changed.
1194 *
1195 * Set the two clock hands to be separated by a reasonable
1196 * amount, but no more than 360 degrees apart.
1197 *
1198 * If inst == 0, backhand starts at first page, otherwise
1199 * it is (inst * offset) around the memory "clock face" so that
1200 * we spread out each scanner instance evenly.
1201 */
1202 first = page_first();
1203 backhand = page_nextn(first, offset * inst);
1204 if (handspreadpages >= total_pages) {
1205 fronthand = page_nextn(backhand, total_pages - 1);
1206 } else {
1207 fronthand = page_nextn(backhand, handspreadpages);
1208 }
1209 }
1210
1211 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212
1213 /*
1214 * Keep track of the number of times we have scanned all the way around
1215 * the loop:
1216 */
1217 laps = 0;
1218
1219 /*
1220 * Track the number of pages visited during this scan so that we can
1221 * periodically measure our duty cycle.
1222 */
1223 pcount = 0;
1224 nscan_cnt = 0;
1225
1226 if (PAGE_SCAN_STARTUP) {
1227 /*
1228 * We need to measure the rate at which the system is able to
1229 * scan pages of memory. Each of these initial samples is a
1230 * scan of all system memory, regardless of whether or not we
1231 * are experiencing memory pressure.
1232 */
1233 nscan_limit = total_pages;
1234 sampling = true;
1235 } else {
1236 nscan_limit = desscan;
1237 sampling = false;
1238 }
1239
1240 DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241 page_t *, backhand, page_t *, fronthand);
1242
1243 sample_start = gethrtime();
1244
1245 /*
1246 * Scan the appropriate number of pages for a single duty cycle.
1247 * Only scan while at least one of these is true:
1248 * 1) one or more zones is over its cap
1249 * 2) there is not enough free memory
1250 * 3) during page scan startup when determining sample data
1251 */
1252 while (nscan_cnt < nscan_limit) {
1253 checkpage_result_t rvfront, rvback;
1254
1255 if (!sampling && !zones_over &&
1256 freemem >= lotsfree + needfree) {
1257 /*
1258 * We are not sampling and enough memory has become
1259 * available that scanning is no longer required.
1260 */
1261 break;
1262 }
1263
1264 DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
1265
1266 /*
1267 * Periodically check to see if we have exceeded the CPU duty
1268 * cycle for a single wakeup.
1269 */
1270 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271 hrtime_t pageout_cycle_nsec;
1272
1273 pageout_cycle_nsec = gethrtime() - sample_start;
1274 if (pageout_cycle_nsec >= pageout_nsec) {
1275 /*
1276 * This is where we normally break out of the
1277 * loop when scanning zones or sampling.
1278 */
1279 if (!zones_over) {
1280 atomic_inc_64(&pageout_timeouts);
1281 }
1282 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283 break;
1284 }
1285 }
1286
1287 /*
1288 * If checkpage manages to add a page to the free list,
1289 * we give ourselves another couple of trips around the loop.
1290 */
1291 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292 laps = 0;
1293 }
1294 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295 laps = 0;
1296 }
1297
1298 ++pcount;
1299
1300 /*
1301 * This CPU kstat is only incremented here and we're obviously
1302 * on this CPU, so no lock.
1303 */
1304 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305
1306 /*
1307 * Don't include ineligible pages in the number scanned.
1308 */
1309 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310 nscan_cnt++;
1311 }
1312
/* Advance both clock hands by one page. */
1313 backhand = page_next(backhand);
1314 fronthand = page_next(fronthand);
1315
1316 /*
1317 * The front hand has wrapped around to the first page in the
1318 * loop.
1319 */
1320 if (fronthand == page_first()) {
1321 DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
1322
1323 /*
1324 * Every pageout_reset_cnt (default 64) wraps we
1325 * reposition our hands within our region to prevent
1326 * creep into another thread.
1327 */
1327 if ((++iter % pageout_reset_cnt) == 0)
1328 reset_hands[inst] = B_TRUE;
1329
1330 /*
1331 * This CPU kstat is only incremented here and we're
1332 * obviously on this CPU, so no lock.
1333 */
1334 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335
1336 /*
1337 * If scanning because the system is low on memory,
1338 * then when we wraparound memory we want to try to
1339 * reclaim more pages.
1340 * If scanning only because zones are over their cap,
1341 * then wrapping is common and we simply keep going.
1342 */
1343 if (freemem < lotsfree + needfree && ++laps > 1) {
1344 /*
1345 * The system is low on memory.
1346 * Extremely unlikely, but it happens.
1347 * We went around the loop at least once
1348 * and didn't get far enough.
1349 * If we are still skipping `highly shared'
1350 * pages, skip fewer of them. Otherwise,
1351 * give up till the next clock tick.
1352 */
1353 mutex_enter(&pageout_mutex);
1354 if (po_share < MAX_PO_SHARE) {
1355 po_share <<= 1;
1356 mutex_exit(&pageout_mutex);
1357 } else {
1358 mutex_exit(&pageout_mutex);
1359 break;
1360 }
1361 }
1362 }
1363 }
1364
/* Publish this cycle's scan count for schedpaging() to observe. */
1365 atomic_add_long(&nscan, nscan_cnt);
1366
1367 sample_end = gethrtime();
1368
1369 DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370 uint_t, inst);
1371
1372 /*
1373 * The following two blocks are only relevant when the scanner is
1374 * first started up. After the scanner runs for a while, neither of
1375 * the conditions will ever be true again.
1376 *
1377 * The global variables used below are only modified by this thread and
1378 * only during initial scanning when there is a single page scanner
1379 * thread running. Thus, we don't use any locking.
1380 */
1381 if (pageout_new_spread == 0) {
1382 VERIFY3U(inst, ==, 0);
1383 if (PAGE_SCAN_STARTUP) {
1384 /*
1385 * Continue accumulating samples until we have enough
1386 * to get a reasonable value for average scan rate:
1387 */
1388 pageout_sample_pages += pcount;
1389 pageout_sample_etime += sample_end - sample_start;
1390 ++pageout_sample_cnt;
1391 }
1392
1393 if (!PAGE_SCAN_STARTUP) {
1394 /*
1395 * We have enough samples, set the spread.
1396 */
1397 pageout_rate = (hrrate_t)pageout_sample_pages *
1398 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1399 pageout_new_spread = pageout_rate / 10;
/* Recompute scan parameters now that the rate is known. */
1400 setupclock();
1401 }
1402 }
1403
1404 goto loop;
1405 }
1406
1520 }
1521
1522 if (zones_over) {
1523 ASSERT(pp->p_zoneid == ALL_ZONES ||
1524 pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525 if (pp->p_zoneid == ALL_ZONES ||
1526 zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527 /*
			 * Cross-zone shared page, or zone not over its cap.
1529 * Leave the page alone.
1530 */
1531 page_unlock(pp);
1532 return (CKP_INELIGIBLE);
1533 }
1534 zid = pp->p_zoneid;
1535 }
1536
1537 /*
1538 * Maintain statistics for what we are freeing
1539 */
1540
1541 if (pp->p_vnode != NULL) {
1542 if (pp->p_vnode->v_flag & VVMEXEC)
1543 isexec = 1;
1544
1545 if (!IS_SWAPFSVP(pp->p_vnode))
1546 isfs = 1;
1547 }
1548
1549 /*
1550 * Turn off REF and MOD bits with the front hand.
1551 * The back hand examines the REF bit and always considers
1552 * SHARED pages as referenced.
1553 */
1554 if (whichhand == POH_FRONT) {
1555 pagesync_flag = HAT_SYNC_ZERORM;
1556 } else {
1557 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558 HAT_SYNC_STOPON_SHARED;
1559 }
1560
|
225 pgcnt_t lotsfree_min = 0;
226 pgcnt_t lotsfree_max = 0;
227
228 #define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES)
229 #define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES)
230
231 /*
232 * If these tunables are set to non-zero values in /etc/system, and provided
233 * the value is not larger than the threshold above, the specified value will
234 * be used directly without any additional calculation or adjustment. The boot
235 * time value of these overrides is preserved in the "clockinit" struct. More
236 * detail is available in the comment at the top of the file.
237 */
238 pgcnt_t maxpgio = 0;
239 pgcnt_t minfree = 0;
240 pgcnt_t desfree = 0;
241 pgcnt_t lotsfree = 0;
242 pgcnt_t needfree = 0;
243 pgcnt_t throttlefree = 0;
244 pgcnt_t pageout_reserve = 0;
245 pri_t pageout_pri;
246
247 pgcnt_t deficit;
248 pgcnt_t nscan;
249 pgcnt_t desscan;
250
251 /* kstats */
252 uint64_t low_mem_scan;
253 uint64_t zone_cap_scan;
254
255 #define MAX_PSCAN_THREADS 16
256
257 /*
258 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
259 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
260 * that gives the equivalent of some underlying %CPU duty cycle.
261 *
262 * min_pageout_nsec:
263 * nanoseconds/wakeup equivalent of min_percent_cpu.
264 *
265 * max_pageout_nsec:
266 * nanoseconds/wakeup equivalent of max_percent_cpu.
267 *
268 * pageout_nsec:
269 * Number of nanoseconds budgeted for each wakeup cycle.
270 * Computed each time around by schedpaging().
271 * Varies between min_pageout_nsec and max_pageout_nsec,
272 * depending on memory pressure or zones over their cap.
273 *
274 * zone_pageout_nsec:
 *	Number of nanoseconds budgeted for each cycle when a zone
276 * is over its memory cap. If this is zero, then the value
277 * of max_pageout_nsec is used instead.
278 */
279 static hrtime_t min_pageout_nsec;
280 static hrtime_t max_pageout_nsec;
281 static hrtime_t pageout_nsec;
282 static hrtime_t zone_pageout_nsec;
283
284 static boolean_t reset_hands[MAX_PSCAN_THREADS];
285
286 #define PAGES_POLL_MASK 1023
287 #define SCHEDPAGING_HZ 4
288
289 /*
290 * despagescanners:
291 * The desired number of page scanner threads. The value can be set in
292 * /etc/system or tuned directly with 'mdb -kw'. The system will bring
293 * the actual number of threads into line with the desired number. If set
294 * to an invalid value, the system will correct the setting.
295 */
296 uint_t despagescanners = 0;
297
298 /*
299 * pageout_sample_lim:
300 * The limit on the number of samples needed to establish a value for new
301 * pageout parameters: fastscan, slowscan, pageout_new_spread, and
302 * handspreadpages.
303 *
304 * pageout_sample_cnt:
305 * Current sample number. Once the sample gets large enough, set new
306 * values for handspreadpages, pageout_new_spread, fastscan and slowscan.
307 *
308 * pageout_sample_pages:
309 * The accumulated number of pages scanned during sampling.
310 *
311 * pageout_sample_etime:
312 * The accumulated nanoseconds for the sample.
313 *
314 * pageout_rate:
315 * Rate in pages/nanosecond, computed at the end of sampling.
316 *
317 * pageout_new_spread:
318 * Initially zero while the system scan rate is measured by
319 * pageout_scanner(), which then sets this value once per system boot after
320 * enough samples have been recorded (pageout_sample_cnt). Once set, this
321 * new value is used for fastscan and handspreadpages.
322 */
323 typedef hrtime_t hrrate_t;
324
325 static uint64_t pageout_sample_lim = 4;
326 static uint64_t pageout_sample_cnt = 0;
327 static pgcnt_t pageout_sample_pages = 0;
328 static hrtime_t pageout_sample_etime = 0;
329 static hrrate_t pageout_rate = 0;
330 static pgcnt_t pageout_new_spread = 0;
331
332 /* True if the page scanner is first starting up */
333 #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
334
335 /* The current number of page scanner threads */
336 static uint_t n_page_scanners = 1;
337 /* The number of page scanner threads that are actively scanning. */
338 static uint_t pageouts_running;
339
340 /*
341 * Record number of times a pageout_scanner() wakeup cycle finished because it
342 * timed out (exceeded its CPU budget), rather than because it visited
343 * its budgeted number of pages. This is only done when scanning under low
344 * free memory conditions, not when scanning for zones over their cap.
345 */
346 uint64_t pageout_timeouts = 0;
347
348 #ifdef VM_STATS
349 static struct pageoutvmstats_str {
350 ulong_t checkpage[3];
351 } pageoutvmstats;
352 #endif /* VM_STATS */
353
354 /*
355 * Threads waiting for free memory use this condition variable and lock until
356 * memory becomes available.
357 */
358 kmutex_t memavail_lock;
359 kcondvar_t memavail_cv;
368 CKP_NOT_FREED,
369 CKP_FREED,
370 } checkpage_result_t;
371
372 static checkpage_result_t checkpage(page_t *, pageout_hand_t);
373
/*
 * Boot-time snapshot of the paging tunables, taken on the first call to
 * setupclock().  ci_init flips to true once the snapshot has been made.
 * The remaining fields preserve any /etc/system overrides (a value of
 * zero means "use the default sizing") so that later recalculations can
 * reapply the operator's settings.
 */
static struct clockinit {
	bool ci_init;		/* has the boot-time snapshot been taken? */
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
	uint_t ci_despagescanners;
} clockinit = { .ci_init = false };
390
391 static inline pgcnt_t
392 clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
393 {
394 if (value < minimum) {
395 return (minimum);
396 } else if (value > maximum) {
397 return (maximum);
398 } else {
399 return (value);
400 }
401 }
402
403 static pgcnt_t
404 tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
405 {
406 if (initval == 0 || initval >= initval_ceiling) {
407 return (defval);
408 } else {
409 return (initval);
410 }
411 }
412
413 /*
414 * Local boolean to control scanning when zones are over their cap. Avoids
415 * accessing the zone_num_over_cap variable except within schedpaging(), which
416 * only runs periodically. This is here only to reduce our access to
417 * zone_num_over_cap, since it is already accessed a lot during paging, and
418 * the page scanner accesses the zones_over variable on each page during a
419 * scan. There is no lock needed for zone_num_over_cap since schedpaging()
420 * doesn't modify the variable, it only cares if the variable is 0 or non-0.
421 */
422 static boolean_t zones_over = B_FALSE;
423
424 /*
425 * On large memory systems, multiple instances of the page scanner are run,
426 * each responsible for a separate region of memory. This speeds up page
427 * invalidation under low memory conditions.
428 *
429 * despagescanners can be set in /etc/system or via mdb and it will
430 * be used as a guide for how many page scanners to create; the value
431 * will be adjusted if it is not sensible. Otherwise, the number of
432 * page scanners is determined dynamically based on handspreadpages.
433 */
434 static void
435 recalc_pagescanners(void)
436 {
437 pgcnt_t sz;
438 uint_t des;
439
440 /* If the initial calibration has not been done, take no action. */
441 if (pageout_new_spread == 0)
442 return;
443
444 /*
445 * If the desired number of scanners is set in /etc/system
446 * then try to use it.
447 */
448 if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
449 despagescanners = clockinit.ci_despagescanners;
450
451 if (despagescanners != 0) {
452 /*
453 * We have a desired number of page scanners, either from
454 * /etc/system or set via mdb. Try and use it (it will be
455 * clamped below).
456 */
457 des = despagescanners;
458 } else {
459 /*
460 * Calculate the number of desired scanners based on the
461 * system's memory size.
462 *
463 * A 64GiB region size is used as the basis for calculating how
464 * many scanner threads should be created. For systems with up
465 * to 64GiB of RAM, a single thread is used; for very large
466 * memory systems the threads are limited to MAX_PSCAN_THREADS.
467 */
468 sz = btop(64ULL << 30);
469
470 if (sz > looppages) {
471 des = 1;
472 } else {
473 pgcnt_t tmp = sz;
474
475 for (des = 1; tmp < looppages; des++)
476 tmp += sz;
477 }
478 }
479
480 /*
481 * clamp the number of scanners so that we are under MAX_PSCAN_THREADS
482 * and so that each scanner covers at least 10% more than
483 * handspreadpages.
484 */
485 des = clamp(des, 1,
486 looppages / (handspreadpages + handspreadpages / 10));
487 despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
488 }
489
490 /*
491 * Set up the paging constants for the clock algorithm used by
492 * pageout_scanner(), and by the virtual memory system overall. See the
493 * comments at the top of this file for more information about the threshold
494 * values and system responses to memory pressure.
495 *
496 * This routine is called once by main() at startup, after the initial size of
497 * physical memory is determined. It may be called again later if memory is
498 * added to or removed from the system, or if new measurements of the page scan
499 * rate become available.
500 */
void
setupclock(void)
{
	/*
	 * "half" selects the alternate threshold sizing style; it is
	 * consumed by threshold calculations not visible in this extract
	 * (NOTE(review): confirm against the full function body).
	 */
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */
	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;
		clockinit.ci_despagescanners = despagescanners;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 * Each tunable uses the preserved /etc/system value if sane, and
	 * otherwise a default derived from memory size.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	/* Cap the fast scan rate at a fixed fraction of memory. */
	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	/* Regardless of origin, slowscan never exceeds half of fastscan. */
	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between front and back
	 * pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second.
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

	/*
	 * If not called for recalculation, return and skip the remaining
	 * steps.
	 */
	if (!recalc)
		return;

	/*
	 * Set a flag so each scanner thread re-evaluates its clock hand
	 * position on its next wakeup (thresholds above may have moved).
	 */
	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
		reset_hands[i] = B_TRUE;

	recalc_pagescanners();
}
776
777 /*
778 * Pageout scheduling.
779 *
780 * Schedpaging controls the rate at which the page out daemon runs by
781 * setting the global variables nscan and desscan SCHEDPAGING_HZ
782 * times a second. Nscan records the number of pages pageout has examined
783 * in its current pass; schedpaging() resets this value to zero each time
784 * it runs. Desscan records the number of pages pageout should examine
785 * in its next pass; schedpaging() sets this value based on the amount of
786 * currently available memory.
787 */
788
789 static kmutex_t pageout_mutex;
790
791 /*
792 * Pool of available async pageout putpage requests.
793 */
794 static struct async_reqs *push_req;
795 static struct async_reqs *req_freelist; /* available req structs */
796 static struct async_reqs *push_list; /* pending reqs */
797 static kmutex_t push_lock; /* protects req pool */
798 static kcondvar_t push_cv;
799
800 /*
801 * If pageout() is stuck on a single push for this many seconds,
802 * pageout_deadman() will assume the system has hit a memory deadlock. If set
803 * to 0, the deadman will have no effect.
804 *
805 * Note that we are only looking for stalls in the calls that pageout() makes
806 * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging
807 * I/O, which should not take long unless the underlying strategy call blocks
808 * indefinitely for memory. The actual I/O request happens (or fails) later.
809 */
810 uint_t pageout_deadman_seconds = 90;
811
812 static uint_t pageout_stucktime = 0;
813 static bool pageout_pushing = false;
814 static uint64_t pageout_pushcount = 0;
815 static uint64_t pageout_pushcount_seen = 0;
816
817 static int async_list_size = 8192; /* number of async request structs */
818
819 static void pageout_scanner(void *);
820
821 /*
822 * If a page is being shared more than "po_share" times
823 * then leave it alone- don't page it out.
824 */
825 #define MIN_PO_SHARE (8)
826 #define MAX_PO_SHARE ((MIN_PO_SHARE) << 24)
827 ulong_t po_share = MIN_PO_SHARE;
828
829 /*
830 * Schedule rate for paging.
831 * Rate is linear interpolation between
832 * slowscan with lotsfree and fastscan when out of memory.
833 */
/*
 * Runs SCHEDPAGING_HZ times per second (it rearms itself via timeout() at
 * the bottom).  Each pass: reaps kernel memory if needed, computes the
 * scan target (desscan) and CPU budget (pageout_nsec) for the next scanner
 * wakeup, adjusts the number of scanner threads, and wakes the scanners
 * when memory is low, zones are over cap, or sampling is in progress.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	/*
	 * If the mutex is already held (e.g. by a scanner waking up), skip
	 * this pass entirely; we will be called again in 1/SCHEDPAGING_HZ
	 * seconds.
	 */
	if (mutex_tryenter(&pageout_mutex)) {

		/* Don't change the parameters while scanners are active. */
		if (pageouts_running != 0)
			goto out;

		/* No pageout scanner threads running. */
		nscan = 0;
		vavail = freemem - deficit;
		if (pageout_new_spread != 0)
			vavail -= needfree;
		vavail = clamp(vavail, 0, lotsfree);

		if (needfree > 0 && pageout_new_spread == 0) {
			/*
			 * If we've not yet collected enough samples to
			 * calculate a spread, use the old logic of kicking
			 * into high gear anytime needfree is non-zero.
			 */
			desscan = fastscan / SCHEDPAGING_HZ;
		} else {
			/*
			 * Once we've calculated a spread based on system
			 * memory and usage, just treat needfree as another
			 * form of deficit.  desscan is a linear
			 * interpolation between slowscan (at lotsfree) and
			 * fastscan (at zero available memory).
			 */
			spgcnt_t faststmp, slowstmp, result;

			slowstmp = slowscan * vavail;
			faststmp = fastscan * (lotsfree - vavail);
			result = (slowstmp + faststmp) /
			    nz(lotsfree) / SCHEDPAGING_HZ;
			desscan = (pgcnt_t)result;
		}

		/* Interpolate the CPU budget for the coming cycle, too. */
		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
		    pageout_nsec);

		if (pageout_new_spread != 0 && despagescanners != 0 &&
		    despagescanners != n_page_scanners) {
			/*
			 * We have finished the pagescan initialisation and the
			 * desired number of page scanners has changed, either
			 * because initialisation just finished, because of a
			 * memory DR, or because despagescanners has been
			 * modified on the fly (i.e. by mdb).
			 */
			uint_t i, curr_nscan = n_page_scanners;

			/* Re-validate despagescanners */
			recalc_pagescanners();

			n_page_scanners = despagescanners;

			/* All hands must move to their new regions. */
			for (i = 0; i < MAX_PSCAN_THREADS; i++)
				reset_hands[i] = B_TRUE;

			/* If we need more scanners, start them now. */
			if (n_page_scanners > curr_nscan) {
				for (i = curr_nscan; i < n_page_scanners; i++) {
					(void) lwp_kernel_create(proc_pageout,
					    pageout_scanner,
					    (void *)(uintptr_t)i, TS_RUN,
					    pageout_pri);
				}
			}

			/*
			 * If the number of scanners has decreased, trigger a
			 * wakeup so that the excess threads will terminate.
			 */
			if (n_page_scanners < curr_nscan) {
				WAKE_PAGEOUT_SCANNER();
			}
		}

		zones_over = B_FALSE;

		if (PAGE_SCAN_STARTUP) {
			/*
			 * We still need to measure the rate at which the
			 * system is able to scan pages of memory.  Each of
			 * these initial samples is a scan of as much system
			 * memory as practical, regardless of whether or not we
			 * are experiencing memory pressure.
			 */
			desscan = total_pages;
			pageout_nsec = max_pageout_nsec;

			DTRACE_PROBE(schedpage__wake__sample);
			WAKE_PAGEOUT_SCANNER();
		} else if (freemem < lotsfree + needfree) {
			/*
			 * We need more memory.
			 */
			low_mem_scan++;

			DTRACE_PROBE(schedpage__wake__low);
			WAKE_PAGEOUT_SCANNER();
		} else if (zone_num_over_cap > 0) {
			/*
			 * One or more zones are over their cap.
			 */

			/* No page limit */
			desscan = total_pages;

			/*
			 * Increase the scanning CPU% to the max.  This implies
			 * 80% of one CPU/sec if the scanner can run each
			 * opportunity.  Can also be tuned via setting
			 * zone_pageout_nsec in /etc/system or with mdb.
			 */
			pageout_nsec = (zone_pageout_nsec != 0) ?
			    zone_pageout_nsec : max_pageout_nsec;

			zones_over = B_TRUE;
			zone_cap_scan++;

			DTRACE_PROBE(schedpage__wake__zone);
			WAKE_PAGEOUT_SCANNER();
		} else {
			/*
			 * There are enough free pages, no need to
			 * kick the scanner thread.  And next time
			 * around, keep more of the `highly shared'
			 * pages.
			 */
			cv_signal_pageout();
			if (po_share > MIN_PO_SHARE) {
				po_share >>= 1;
			}
		}
out:
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
	 * in this case it is not needed - the waiters will be woken up during
	 * the next invocation of this function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	/* Rearm ourselves for the next cycle. */
	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}
998
999 pgcnt_t pushes;
1000 ulong_t push_list_size; /* # of requests on pageout queue */
1001
1002 /*
1003 * Paging out should always be enabled. This tunable exists to hold pageout
1004 * for debugging purposes. If set to 0, pageout_scanner() will go back to
1005 * sleep each time it is woken by schedpaging().
1006 */
1007 uint_t dopageout = 1;
1008
1009 /*
1010 * The page out daemon, which runs as process 2.
1011 *
1012 * The daemon treats physical memory as a circular array of pages and scans
1013 * the pages using a 'two-handed clock' algorithm. The front hand moves
1014 * through the pages, clearing the reference bit. The back hand travels a
1015 * distance (handspreadpages) behind the front hand, freeing the pages that
1016 * have not been referenced in the time since the front hand passed. If
1017 * modified, they are first written to their backing store before being
1018 * freed.
1019 *
1020 * In order to make page invalidation more responsive on machines with
1021 * larger memory, multiple pageout_scanner threads may be created. In this
1022 * case, each thread is given a segment of the memory "clock face" so that
1023 * memory can be reclaimed more quickly.
1024 *
1025 * As long as there are at least lotsfree pages, or no zones over their
1026 * cap, then pageout_scanner threads are not run. When pageout_scanner
1027 * threads are running for case (a), all pages are considered for pageout.
1028 * For case (b), only pages belonging to a zone over its cap will be
1029 * considered for pageout.
1030 *
1031 * There are multiple threads that act on behalf of the pageout process. A
1032 * set of threads scan pages (pageout_scanner) and frees them up if they
1033 * don't require any VOP_PUTPAGE operation. If a page must be written back
1034 * to its backing store, the request is put on a list and the other
1035 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
1036 * requests from the list, and processes them. Some filesystems may require
1037 * resources for the VOP_PUTPAGE operations (like memory) and hence can
1038 * block the pageout thread, but the scanner thread can still operate.
1039 * There is still no guarantee that memory deadlocks cannot occur.
1040 */
/*
 * Entry point of the pageout daemon (process 2).  Performs one-time
 * initialisation (locks, async request pool, first scanner thread, the
 * schedpaging() timer, and the kernel cage thread) and then loops forever
 * servicing queued VOP_PUTPAGE push requests.  Never returns.
 */
void
pageout()
{
	struct async_reqs *arg;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	/* Chain all request structs onto the free list. */
	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri - 1;

	/* Create the first pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
	    (void *)0, /* this is instance 0, not NULL */
	    TS_RUN, pageout_pri);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	/* Service loop: pop a push request, issue the putpage, recycle it. */
	for (;;) {
		mutex_enter(&push_lock);

		/*
		 * Sleep until work arrives; also park here once the per-cycle
		 * push budget is spent (pushes is reset on each wakeup).
		 */
		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);

		/* Issue the asynchronous page push; count only successes. */
		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;	/* watched by pageout_deadman() */
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}
1134
1135 /*
1136 * Kernel thread that scans pages looking for ones to free
1137 */
1138 static void
1139 pageout_scanner(void *a)
1140 {
1141 struct page *fronthand, *backhand, *fronthandstart;
1142 struct page *regionstart, *regionend;
1143 uint_t laps;
1144 callb_cpr_t cprinfo;
1145 pgcnt_t nscan_cnt, tick;
1146 pgcnt_t pcount;
1147 bool bhwrapping, fhwrapping;
1148 hrtime_t sample_start, sample_end;
1149 uint_t inst = (uint_t)(uintptr_t)a;
1150
1151 VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1152
1153 CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
1154 mutex_enter(&pageout_mutex);
1155
1156 /*
1157 * The restart case does not attempt to point the hands at roughly
1158 * the right point on the assumption that after one circuit things
1159 * will have settled down, and restarts shouldn't be that often.
1160 */
1161 reset_hands[inst] = B_TRUE;
1162
1163 pageouts_running++;
1164 mutex_exit(&pageout_mutex);
1165
1166 loop:
1167 cv_signal_pageout();
1168
1169 mutex_enter(&pageout_mutex);
1170 pageouts_running--;
1171 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1172 cv_wait(&proc_pageout->p_cv, &pageout_mutex);
1173 CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
1174 pageouts_running++;
1175 mutex_exit(&pageout_mutex);
1176
1177 /*
1178 * Check if pageout has been disabled for debugging purposes.
1179 */
1180 if (!dopageout) {
1181 goto loop;
1182 }
1183
1184 /*
1185 * One may reset the clock hands and scanned region for debugging
1186 * purposes. Hands will also be reset on first thread startup, if
1187 * the number of scanning threads (n_page_scanners) changes, or if
1188 * memory is added to, or removed from, the system.
1189 */
1190 if (reset_hands[inst]) {
1191 struct page *first;
1192
1193 reset_hands[inst] = B_FALSE;
1194
1195 if (inst >= n_page_scanners) {
1196 /*
1197 * The desired number of page scanners has been
1198 * reduced and this instance is no longer wanted.
1199 * Exit the lwp.
1200 */
1201 VERIFY3U(inst, !=, 0);
1202 DTRACE_PROBE1(pageout__exit, uint_t, inst);
1203 mutex_enter(&pageout_mutex);
1204 pageouts_running--;
1205 mutex_exit(&pageout_mutex);
1206 mutex_enter(&curproc->p_lock);
1207 lwp_exit();
1208 /* NOTREACHED */
1209 }
1210
1211 first = page_first();
1212
1213 /*
1214 * Each scanner thread gets its own sector of the memory
1215 * clock face.
1216 */
1217 pgcnt_t span, offset;
1218
1219 span = looppages / n_page_scanners;
1220 VERIFY3U(span, >, handspreadpages);
1221
1222 offset = inst * span;
1223 regionstart = page_nextn(first, offset);
1224 if (inst == n_page_scanners - 1) {
1225 /* The last instance goes up to the last page */
1226 regionend = page_nextn(first, looppages - 1);
1227 } else {
1228 regionend = page_nextn(regionstart, span - 1);
1229 }
1230
1231 backhand = regionstart;
1232 fronthand = page_nextn(backhand, handspreadpages);
1233 tick = 1;
1234
1235 bhwrapping = fhwrapping = B_FALSE;
1236
1237 DTRACE_PROBE4(pageout__reset, uint_t, inst,
1238 pgcnt_t, regionstart, pgcnt_t, regionend,
1239 pgcnt_t, fronthand);
1240 }
1241
1242 /*
1243 * This CPU kstat is only incremented here and we're obviously
1244 * on this CPU, so no lock.
1245 */
1246 CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1247
1248 /*
1249 * Keep track of the number of times we have scanned all the way around
1250 * the loop on this wakeup.
1251 */
1252 laps = 0;
1253
1254 /*
1255 * Track the number of pages visited during this scan so that we can
1256 * periodically measure our duty cycle.
1257 */
1258 nscan_cnt = 0;
1259 pcount = 0;
1260
1261 DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
1262 hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1263
1264 /*
1265 * Record the initial position of the front hand for this cycle so
1266 * that we can detect when the hand wraps around.
1267 */
1268 fronthandstart = fronthand;
1269
1270 sample_start = gethrtime();
1271
1272 /*
1273 * Scan the appropriate number of pages for a single duty cycle.
1274 */
1275 while (nscan_cnt < desscan) {
1276 checkpage_result_t rvfront, rvback;
1277
1278 /*
1279 * Only scan while at least one of these is true:
1280 * 1) one or more zones is over its cap
1281 * 2) there is not enough free memory
1282 * 3) during page scan startup when determining sample data
1283 */
1284 if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
1285 !zones_over) {
1286 /*
1287 * We are not sampling and enough memory has become
1288 * available that scanning is no longer required.
1289 */
1290 DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1291 break;
1292 }
1293
1294 DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1295
1296 /*
1297 * Periodically check to see if we have exceeded the CPU duty
1298 * cycle for a single wakeup.
1299 */
1300 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1301 hrtime_t pageout_cycle_nsec;
1302
1303 pageout_cycle_nsec = gethrtime() - sample_start;
1304 if (pageout_cycle_nsec >= pageout_nsec) {
1305 if (!zones_over)
1306 atomic_inc_64(&pageout_timeouts);
1307 DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1308 break;
1309 }
1310 }
1311
1312 /*
1313 * If checkpage manages to add a page to the free list,
1314 * we give ourselves another couple of trips around the loop.
1315 */
1316 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1317 laps = 0;
1318 }
1319 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1320 laps = 0;
1321 }
1322
1323 ++pcount;
1324
1325 /*
1326 * This CPU kstat is only incremented here and we're obviously
1327 * on this CPU, so no lock.
1328 */
1329 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1330
1331 /*
1332 * Don't include ineligible pages in the number scanned.
1333 */
1334 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1335 nscan_cnt++;
1336 }
1337
1338 if (bhwrapping) {
1339 backhand = regionstart;
1340 bhwrapping = B_FALSE;
1341 } else {
1342 backhand = page_nextn(backhand, tick);
1343 if (backhand == regionend)
1344 bhwrapping = B_TRUE;
1345 }
1346
1347 if (fhwrapping) {
1348 fronthand = regionstart;
1349 fhwrapping = B_FALSE;
1350 } else {
1351 fronthand = page_nextn(fronthand, tick);
1352 if (fronthand == regionend)
1353 fhwrapping = B_TRUE;
1354 }
1355
1356 /*
1357 * The front hand has wrapped around during this wakeup.
1358 */
1359 if (fronthand == fronthandstart) {
1360 laps++;
1361 DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
1362 uint_t, laps);
1363
1364 /*
1365 * This CPU kstat is only incremented here and we're
1366 * obviously on this CPU, so no lock.
1367 */
1368 CPU_STATS_ADDQ(CPU, vm, rev, 1);
1369
1370			/*
1371			 * When we wrap all the way around memory, we want
1372			 * to try harder to reclaim pages.
1373			 * If scanning only because zones are over their cap,
1374			 * then wrapping is common and we simply keep going.
1375			 */
1376 if (laps > 1 && freemem < lotsfree + needfree) {
1377 /*
1378 * Extremely unlikely, but it happens.
1379 * We went around the loop at least once
1380 * and didn't get far enough.
1381 * If we are still skipping `highly shared'
1382 * pages, skip fewer of them. Otherwise,
1383 * give up till the next clock tick.
1384 */
1385 if (po_share < MAX_PO_SHARE) {
1386 po_share <<= 1;
1387 } else {
1388 break;
1389 }
1390 }
1391 }
1392 }
1393
1394 sample_end = gethrtime();
1395 atomic_add_long(&nscan, nscan_cnt);
1396
1397 DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
1398 pgcnt_t, nscan_cnt, pgcnt_t, pcount)
1399
1400 /*
1401 * The global variables used below are only modified by this thread and
1402 * only during initial scanning when there is a single page scanner
1403 * thread running.
1404 */
1405 if (pageout_new_spread == 0) {
1406 VERIFY3U(inst, ==, 0);
1407
1408 if (PAGE_SCAN_STARTUP) {
1409 /*
1410 * Continue accumulating samples until we have enough
1411 * to get a reasonable value for average scan rate.
1412 */
1413 pageout_sample_pages += pcount;
1414 pageout_sample_etime += sample_end - sample_start;
1415 ++pageout_sample_cnt;
1416 }
1417
1418 if (!PAGE_SCAN_STARTUP) {
1419 /*
1420 * We have enough samples, set the spread.
1421 */
1422 pageout_rate = (hrrate_t)pageout_sample_pages *
1423 (hrrate_t)(NANOSEC) / pageout_sample_etime;
1424 pageout_new_spread = pageout_rate / 10;
1425 setupclock();
1426 }
1427 }
1428
1429 goto loop;
1430 }
1431
1545 }
1546
1547 if (zones_over) {
1548 ASSERT(pp->p_zoneid == ALL_ZONES ||
1549 pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1550 if (pp->p_zoneid == ALL_ZONES ||
1551 zone_pdata[pp->p_zoneid].zpers_over == 0) {
1552 /*
1553			 * Cross-zone shared page, or zone not over its cap.
1554 * Leave the page alone.
1555 */
1556 page_unlock(pp);
1557 return (CKP_INELIGIBLE);
1558 }
1559 zid = pp->p_zoneid;
1560 }
1561
1562 /*
1563 * Maintain statistics for what we are freeing
1564 */
1565 if (pp->p_vnode != NULL) {
1566 if (pp->p_vnode->v_flag & VVMEXEC)
1567 isexec = 1;
1568
1569 if (!IS_SWAPFSVP(pp->p_vnode))
1570 isfs = 1;
1571 }
1572
1573 /*
1574 * Turn off REF and MOD bits with the front hand.
1575 * The back hand examines the REF bit and always considers
1576 * SHARED pages as referenced.
1577 */
1578 if (whichhand == POH_FRONT) {
1579 pagesync_flag = HAT_SYNC_ZERORM;
1580 } else {
1581 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1582 HAT_SYNC_STOPON_SHARED;
1583 }
1584
|