Print this page
13097 improve VM tunables for modern systems (fix mismerge)
*** 240,259 ****
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;
pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;
/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;
- uint64_t n_throttle;
/*
* Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
* zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
* that gives the equivalent of some underlying %CPU duty cycle.
*
--- 240,261 ----
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;
+ pri_t pageout_pri;
pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;
/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;
+ #define MAX_PSCAN_THREADS 16
+
/*
* Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
* zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
* that gives the equivalent of some underlying %CPU duty cycle.
*
*** 272,305 ****
* zone_pageout_nsec:
* Number of nanoseconds budget for each cycle when a zone
* is over its memory cap. If this is zero, then the value
* of max_pageout_nsec is used instead.
*/
-
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;
static hrtime_t zone_pageout_nsec;
- #define MAX_PSCAN_THREADS 16
static boolean_t reset_hands[MAX_PSCAN_THREADS];
/*
! * These can be tuned in /etc/system or set with mdb.
! * 'des_page_scanners' is the desired number of page scanner threads. The
! * system will bring the actual number of threads into line with the desired
! * number. If des_page_scanners is set to an invalid value, the system will
! * correct the setting.
*/
! uint_t des_page_scanners;
! uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */
- uint_t n_page_scanners;
- static pgcnt_t pscan_region_sz; /* informational only */
-
- #define PAGES_POLL_MASK 1023
-
/*
* pageout_sample_lim:
* The limit on the number of samples needed to establish a value for new
* pageout parameters: fastscan, slowscan, pageout_new_spread, and
* handspreadpages.
--- 274,302 ----
* zone_pageout_nsec:
* Number of nanoseconds budget for each cycle when a zone
* is over its memory cap. If this is zero, then the value
* of max_pageout_nsec is used instead.
*/
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;
static hrtime_t zone_pageout_nsec;
static boolean_t reset_hands[MAX_PSCAN_THREADS];
+ #define PAGES_POLL_MASK 1023
+ #define SCHEDPAGING_HZ 4
+
/*
! * despagescanners:
! * The desired number of page scanner threads. The value can be set in
! * /etc/system or tuned directly with 'mdb -kw'. The system will bring
! * the actual number of threads into line with the desired number. If set
! * to an invalid value, the system will correct the setting.
*/
! uint_t despagescanners = 0;
/*
* pageout_sample_lim:
* The limit on the number of samples needed to establish a value for new
* pageout parameters: fastscan, slowscan, pageout_new_spread, and
* handspreadpages.
*** 321,344 ****
* Initially zero while the system scan rate is measured by
* pageout_scanner(), which then sets this value once per system boot after
* enough samples have been recorded (pageout_sample_cnt). Once set, this
* new value is used for fastscan and handspreadpages.
*/
-
typedef hrtime_t hrrate_t;
static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
! static hrtime_t pageout_sample_etime = 0;
!
! /* True if page scanner is first starting up */
#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
/*
* Record number of times a pageout_scanner() wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
* its budgeted number of pages. This is only done when scanning under low
* free memory conditions, not when scanning for zones over their cap.
--- 318,344 ----
* Initially zero while the system scan rate is measured by
* pageout_scanner(), which then sets this value once per system boot after
* enough samples have been recorded (pageout_sample_cnt). Once set, this
* new value is used for fastscan and handspreadpages.
*/
typedef hrtime_t hrrate_t;
static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t pageout_sample_pages = 0;
+ static hrtime_t pageout_sample_etime = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t pageout_new_spread = 0;
! /* True if the page scanner is first starting up */
#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim)
+ /* The current number of page scanner threads */
+ static uint_t n_page_scanners = 1;
+ /* The number of page scanner threads that are actively scanning. */
+ static uint_t pageouts_running;
+
/*
* Record number of times a pageout_scanner() wakeup cycle finished because it
* timed out (exceeded its CPU budget), rather than because it visited
* its budgeted number of pages. This is only done when scanning under low
* free memory conditions, not when scanning for zones over their cap.
*** 383,395 ****
pgcnt_t ci_maxpgio;
pgcnt_t ci_maxfastscan;
pgcnt_t ci_fastscan;
pgcnt_t ci_slowscan;
pgcnt_t ci_handspreadpages;
} clockinit = { .ci_init = false };
! static pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
if (value < minimum) {
return (minimum);
} else if (value > maximum) {
--- 383,396 ----
pgcnt_t ci_maxpgio;
pgcnt_t ci_maxfastscan;
pgcnt_t ci_fastscan;
pgcnt_t ci_slowscan;
pgcnt_t ci_handspreadpages;
+ uint_t ci_despagescanners;
} clockinit = { .ci_init = false };
! static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
if (value < minimum) {
return (minimum);
} else if (value > maximum) {
*** 419,428 ****
--- 420,495 ----
* doesn't modify the variable, it only cares if the variable is 0 or non-0.
*/
static boolean_t zones_over = B_FALSE;
/*
+ * On large memory systems, multiple instances of the page scanner are run,
+ * each responsible for a separate region of memory. This speeds up page
+ * invalidation under low memory conditions.
+ *
+ * despagescanners can be set in /etc/system or via mdb and it will
+ * be used as a guide for how many page scanners to create; the value
+ * will be adjusted if it is not sensible. Otherwise, the number of
+ * page scanners is determined dynamically based on handspreadpages.
+ */
+ static void
+ recalc_pagescanners(void)
+ {
+ pgcnt_t sz;
+ uint_t des;
+
+ /* If the initial calibration has not been done, take no action. */
+ if (pageout_new_spread == 0)
+ return;
+
+ /*
+ * If the desired number of scanners is set in /etc/system
+ * then try to use it.
+ */
+ if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
+ despagescanners = clockinit.ci_despagescanners;
+
+ if (despagescanners != 0) {
+ /*
+ * We have a desired number of page scanners, either from
+ * /etc/system or set via mdb. Try and use it (it will be
+ * clamped below).
+ */
+ des = despagescanners;
+ } else {
+ /*
+ * Calculate the number of desired scanners based on the
+ * system's memory size.
+ *
+ * A 64GiB region size is used as the basis for calculating how
+ * many scanner threads should be created. For systems with up
+ * to 64GiB of RAM, a single thread is used; for very large
+ * memory systems the threads are limited to MAX_PSCAN_THREADS.
+ */
+ sz = btop(64ULL << 30);
+
+ if (sz > looppages) {
+ des = 1;
+ } else {
+ pgcnt_t tmp = sz;
+
+ for (des = 1; tmp < looppages; des++)
+ tmp += sz;
+ }
+ }
+
+ /*
+ * clamp the number of scanners so that we are under MAX_PSCAN_THREADS
+ * and so that each scanner covers at least 10% more than
+ * handspreadpages.
+ */
+ des = clamp(des, 1,
+ looppages / (handspreadpages + handspreadpages / 10));
+ despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
+ }
+
+ /*
* Set up the paging constants for the clock algorithm used by
* pageout_scanner(), and by the virtual memory system overall. See the
* comments at the top of this file for more information about the threshold
* values and system responses to memory pressure.
*
*** 432,444 ****
* rate become available.
*/
void
setupclock(void)
{
- uint_t i;
- pgcnt_t sz, tmp;
- pgcnt_t defval;
bool half = (pageout_threshold_style == 1);
bool recalc = true;
looppages = total_pages;
--- 499,508 ----
*** 448,458 ****
* values so that they can be used for subsequent recalculations.
*
* A value of zero for any tunable means we will use the default
* sizing.
*/
-
if (!clockinit.ci_init) {
clockinit.ci_init = true;
clockinit.ci_lotsfree_min = lotsfree_min;
clockinit.ci_lotsfree_max = lotsfree_max;
--- 512,521 ----
*** 464,473 ****
--- 527,537 ----
clockinit.ci_maxpgio = maxpgio;
clockinit.ci_maxfastscan = maxfastscan;
clockinit.ci_fastscan = fastscan;
clockinit.ci_slowscan = slowscan;
clockinit.ci_handspreadpages = handspreadpages;
+ clockinit.ci_despagescanners = despagescanners;
/*
* The first call does not trigger a recalculation, only
* subsequent calls.
*/
*** 645,655 ****
if (slowscan > fastscan / 2) {
slowscan = fastscan / 2;
}
/*
! * Handspreadpages is distance (in pages) between front and back
* pageout daemon hands. The amount of time to reclaim a page
* once pageout examines it increases with this distance and
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
--- 709,719 ----
if (slowscan > fastscan / 2) {
slowscan = fastscan / 2;
}
/*
! * Handspreadpages is the distance (in pages) between front and back
* pageout daemon hands. The amount of time to reclaim a page
* once pageout examines it increases with this distance and
* decreases as the scan rate rises. It must be < the amount
* of pageable memory.
*
*** 680,741 ****
*/
if (handspreadpages >= looppages) {
handspreadpages = looppages - 1;
}
- if (!recalc) {
/*
! * Setup basic values at initialization.
*/
! pscan_region_sz = total_pages;
! des_page_scanners = n_page_scanners = 1;
! reset_hands[0] = B_TRUE;
! return;
! }
/*
! * Recalculating
! *
! * We originally set the number of page scanners to 1. Now that we
! * know what the handspreadpages is for a scanner, figure out how many
! * scanners we should run. We want to ensure that the regions don't
! * overlap and that they are not touching.
! *
! * A default 64GB region size is used as the initial value to calculate
! * how many scanner threads we should create on lower memory systems.
! * The idea is to limit the number of threads to a practical value
! * (e.g. a 64GB machine really only needs one scanner thread). For very
! * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
! * threads.
! *
! * The scanner threads themselves are evenly spread out around the
! * memory "clock" in pageout_scanner when we reset the hands, and each
! * thread will scan all of memory.
*/
! sz = (btop(64ULL * 0x40000000ULL));
! if (sz < handspreadpages) {
/*
! * 64GB is smaller than the separation between the front
! * and back hands; use double handspreadpages.
*/
! sz = handspreadpages << 1;
! }
! if (sz > total_pages) {
! sz = total_pages;
! }
! /* Record region size for inspection with mdb, otherwise unused */
! pscan_region_sz = sz;
! tmp = sz;
! for (i = 1; tmp < total_pages; i++) {
! tmp += sz;
! }
!
! if (i > MAX_PSCAN_THREADS)
! i = MAX_PSCAN_THREADS;
!
! des_page_scanners = i;
}
/*
* Pageout scheduling.
*
--- 744,779 ----
*/
if (handspreadpages >= looppages) {
handspreadpages = looppages - 1;
}
/*
! * Establish the minimum and maximum length of time to be spent
! * scanning pages per wakeup, limiting the scanner duty cycle. The
! * input percentage values (0-100) must be converted to a fraction of
! * the number of nanoseconds in a second of wall time, then further
! * scaled down by the number of scanner wakeups in a second.
*/
! min_pageout_nsec = MAX(1,
! NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
! max_pageout_nsec = MAX(min_pageout_nsec,
! NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
/*
! * If not called for recalculation, return and skip the remaining
! * steps.
*/
! if (!recalc)
! return;
!
/*
! * Set a flag to re-evaluate the clock hand positions.
*/
! for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
! reset_hands[i] = B_TRUE;
! recalc_pagescanners();
}
/*
* Pageout scheduling.
*
*** 745,757 ****
* in its current pass; schedpaging() resets this value to zero each time
* it runs. Desscan records the number of pages pageout should examine
* in its next pass; schedpaging() sets this value based on the amount of
* currently available memory.
*/
- #define SCHEDPAGING_HZ 4
! static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */
/*
* Pool of available async pageout putpage requests.
*/
static struct async_reqs *push_req;
--- 783,794 ----
* in its current pass; schedpaging() resets this value to zero each time
* it runs. Desscan records the number of pages pageout should examine
* in its next pass; schedpaging() sets this value based on the amount of
* currently available memory.
*/
! static kmutex_t pageout_mutex;
/*
* Pool of available async pageout putpage requests.
*/
static struct async_reqs *push_req;
*** 775,785 ****
static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;
! static int async_list_size = 256; /* number of async request structs */
static void pageout_scanner(void *);
/*
* If a page is being shared more than "po_share" times
--- 812,822 ----
static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;
! static int async_list_size = 8192; /* number of async request structs */
static void pageout_scanner(void *);
/*
* If a page is being shared more than "po_share" times
*** 806,840 ****
seg_preap();
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
! (void) atomic_swap_ulong(&nscan, 0);
vavail = freemem - deficit;
if (pageout_new_spread != 0)
vavail -= needfree;
! if (vavail < 0)
! vavail = 0;
! if (vavail > lotsfree)
! vavail = lotsfree;
- /*
- * Fix for 1161438 (CRS SPR# 73922). All variables
- * in the original calculation for desscan were 32 bit signed
- * ints. As freemem approaches 0x0 on a system with 1 Gig or
- * more of memory, the calculation can overflow. When this
- * happens, desscan becomes negative and pageout_scanner()
- * stops paging out.
- */
if (needfree > 0 && pageout_new_spread == 0) {
/*
* If we've not yet collected enough samples to
! * calculate a spread, kick into high gear anytime
! * needfree is non-zero. Note that desscan will not be
! * the limiting factor for systems with larger memory;
! * the %CPU will limit the scan. That will also be
! * maxed out below.
*/
desscan = fastscan / SCHEDPAGING_HZ;
} else {
/*
* Once we've calculated a spread based on system
--- 843,869 ----
seg_preap();
if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
kcage_cageout_wakeup();
! if (mutex_tryenter(&pageout_mutex)) {
!
! if (pageouts_running != 0)
! goto out;
!
! /* No pageout scanner threads running. */
! nscan = 0;
vavail = freemem - deficit;
if (pageout_new_spread != 0)
vavail -= needfree;
! vavail = clamp(vavail, 0, lotsfree);
if (needfree > 0 && pageout_new_spread == 0) {
/*
* If we've not yet collected enough samples to
! * calculate a spread, use the old logic of kicking
! * into high gear anytime needfree is non-zero.
*/
desscan = fastscan / SCHEDPAGING_HZ;
} else {
/*
* Once we've calculated a spread based on system
*** 848,934 ****
result = (slowstmp + faststmp) /
nz(lotsfree) / SCHEDPAGING_HZ;
desscan = (pgcnt_t)result;
}
! /*
! * If we've not yet collected enough samples to calculate a
! * spread, also kick %CPU to the max.
! */
! if (pageout_new_spread == 0) {
! pageout_nsec = max_pageout_nsec;
! } else {
! pageout_nsec = min_pageout_nsec +
! (lotsfree - vavail) *
! (max_pageout_nsec - min_pageout_nsec) /
! nz(lotsfree);
! }
! if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
/*
! * We have finished the pagescan initialization and the desired
! * number of page scanners has changed, either because
! * initialization just finished, because of a memory DR, or
! * because des_page_scanners has been modified on the fly (i.e.
! * by mdb). If we need more scanners, start them now, otherwise
! * the excess scanners will terminate on their own when they
! * reset their hands.
*/
! uint_t i;
! uint_t curr_nscan = n_page_scanners;
! pgcnt_t max = total_pages / handspreadpages;
! if (des_page_scanners > max)
! des_page_scanners = max;
! if (des_page_scanners > MAX_PSCAN_THREADS) {
! des_page_scanners = MAX_PSCAN_THREADS;
! } else if (des_page_scanners == 0) {
! des_page_scanners = 1;
! }
! /*
! * Each thread has its own entry in the reset_hands array, so
! * we don't need any locking in pageout_scanner to check the
! * thread's reset_hands entry. Thus, we use a pre-allocated
! * fixed size reset_hands array and upper limit on the number
! * of pagescan threads.
! *
! * The reset_hands entries need to be true before we start new
! * scanners, but if we're reducing, we don't want a race on the
! * recalculation for the existing threads, so we set
! * n_page_scanners first.
! */
! n_page_scanners = des_page_scanners;
! for (i = 0; i < MAX_PSCAN_THREADS; i++) {
reset_hands[i] = B_TRUE;
- }
! if (des_page_scanners > curr_nscan) {
! /* Create additional pageout scanner threads. */
! for (i = curr_nscan; i < des_page_scanners; i++) {
(void) lwp_kernel_create(proc_pageout,
! pageout_scanner, (void *)(uintptr_t)i,
! TS_RUN, curthread->t_pri);
}
}
}
zones_over = B_FALSE;
! if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
! if (!PAGE_SCAN_STARTUP)
! low_mem_scan++;
/*
! * Either we need more memory, or we still need to
! * measure the average scan rate. Wake the scanner.
*/
! DTRACE_PROBE(schedpage__wake__low);
WAKE_PAGEOUT_SCANNER();
} else if (zone_num_over_cap > 0) {
! /* One or more zones are over their cap. */
/* No page limit */
desscan = total_pages;
/*
--- 877,957 ----
result = (slowstmp + faststmp) /
nz(lotsfree) / SCHEDPAGING_HZ;
desscan = (pgcnt_t)result;
}
! pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
! (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
! DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
! pageout_nsec);
!
! if (pageout_new_spread != 0 && despagescanners != 0 &&
! despagescanners != n_page_scanners) {
/*
! * We have finished the pagescan initialisation and the
! * desired number of page scanners has changed, either
! * because initialisation just finished, because of a
! * memory DR, or because despagescanners has been
! * modified on the fly (i.e. by mdb).
*/
! uint_t i, curr_nscan = n_page_scanners;
! /* Re-validate despagescanners */
! recalc_pagescanners();
! n_page_scanners = despagescanners;
! for (i = 0; i < MAX_PSCAN_THREADS; i++)
reset_hands[i] = B_TRUE;
! /* If we need more scanners, start them now. */
! if (n_page_scanners > curr_nscan) {
! for (i = curr_nscan; i < n_page_scanners; i++) {
(void) lwp_kernel_create(proc_pageout,
! pageout_scanner,
! (void *)(uintptr_t)i, TS_RUN,
! pageout_pri);
}
}
+
+ /*
+ * If the number of scanners has decreased, trigger a
+ * wakeup so that the excess threads will terminate.
+ */
+ if (n_page_scanners < curr_nscan) {
+ WAKE_PAGEOUT_SCANNER();
}
+ }
zones_over = B_FALSE;
! if (PAGE_SCAN_STARTUP) {
/*
! * We still need to measure the rate at which the
! * system is able to scan pages of memory. Each of
! * these initial samples is a scan of as much system
! * memory as practical, regardless of whether or not we
! * are experiencing memory pressure.
*/
! desscan = total_pages;
! pageout_nsec = max_pageout_nsec;
!
! DTRACE_PROBE(schedpage__wake__sample);
WAKE_PAGEOUT_SCANNER();
+ } else if (freemem < lotsfree + needfree) {
+ /*
+ * We need more memory.
+ */
+ low_mem_scan++;
+ DTRACE_PROBE(schedpage__wake__low);
+ WAKE_PAGEOUT_SCANNER();
} else if (zone_num_over_cap > 0) {
! /*
! * One or more zones are over their cap.
! */
/* No page limit */
desscan = total_pages;
/*
*** 943,966 ****
zones_over = B_TRUE;
zone_cap_scan++;
DTRACE_PROBE(schedpage__wake__zone);
WAKE_PAGEOUT_SCANNER();
-
} else {
/*
* There are enough free pages, no need to
* kick the scanner thread. And next time
* around, keep more of the `highly shared'
* pages.
*/
cv_signal_pageout();
-
- mutex_enter(&pageout_mutex);
if (po_share > MIN_PO_SHARE) {
po_share >>= 1;
}
mutex_exit(&pageout_mutex);
}
/*
* Signal threads waiting for available memory.
--- 966,988 ----
zones_over = B_TRUE;
zone_cap_scan++;
DTRACE_PROBE(schedpage__wake__zone);
WAKE_PAGEOUT_SCANNER();
} else {
/*
* There are enough free pages, no need to
* kick the scanner thread. And next time
* around, keep more of the `highly shared'
* pages.
*/
cv_signal_pageout();
if (po_share > MIN_PO_SHARE) {
po_share >>= 1;
}
+ }
+ out:
mutex_exit(&pageout_mutex);
}
/*
* Signal threads waiting for available memory.
*** 985,1035 ****
uint_t dopageout = 1;
/*
* The page out daemon, which runs as process 2.
*
! * Page out occurs when either:
! * a) there is less than lotsfree pages,
! * b) there are one or more zones over their physical memory cap.
*
! * The daemon treats physical memory as a circular array of pages and scans the
! * pages using a 'two-handed clock' algorithm. The front hand moves through
! * the pages, clearing the reference bit. The back hand travels a distance
! * (handspreadpages) behind the front hand, freeing the pages that have not
! * been referenced in the time since the front hand passed. If modified, they
! * are first written to their backing store before being freed.
*
! * In order to make page invalidation more responsive on machines with larger
! * memory, multiple pageout_scanner threads may be created. In this case, the
! * threads are evenly distributed around the the memory "clock face" so that
! * memory can be reclaimed more quickly (that is, there can be large regions in
! * which no pages can be reclaimed by a single thread, leading to lag which
! * causes undesirable behavior such as htable stealing).
*
! * As long as there are at least lotsfree pages, or no zones over their cap,
! * then pageout_scanner threads are not run. When pageout_scanner threads are
! * running for case (a), all pages are considered for pageout. For case (b),
! * only pages belonging to a zone over its cap will be considered for pageout.
! *
! * There are multiple threads that act on behalf of the pageout process.
! * A set of threads scan pages (pageout_scanner) and frees them up if
! * they don't require any VOP_PUTPAGE operation. If a page must be
! * written back to its backing store, the request is put on a list
! * and the other (pageout) thread is signaled. The pageout thread
! * grabs VOP_PUTPAGE requests from the list, and processes them.
! * Some filesystems may require resources for the VOP_PUTPAGE
! * operations (like memory) and hence can block the pageout
! * thread, but the pageout_scanner threads can still operate. There is still
! * no guarantee that memory deadlocks cannot occur.
! *
! * The pageout_scanner parameters are determined in schedpaging().
*/
void
pageout()
{
struct async_reqs *arg;
- pri_t pageout_pri;
int i;
pgcnt_t max_pushes;
callb_cpr_t cprinfo;
proc_pageout = ttoproc(curthread);
--- 1007,1049 ----
uint_t dopageout = 1;
/*
* The page out daemon, which runs as process 2.
*
! * The daemon treats physical memory as a circular array of pages and scans
! * the pages using a 'two-handed clock' algorithm. The front hand moves
! * through the pages, clearing the reference bit. The back hand travels a
! * distance (handspreadpages) behind the front hand, freeing the pages that
! * have not been referenced in the time since the front hand passed. If
! * modified, they are first written to their backing store before being
! * freed.
*
! * In order to make page invalidation more responsive on machines with
! * larger memory, multiple pageout_scanner threads may be created. In this
! * case, each thread is given a segment of the memory "clock face" so that
! * memory can be reclaimed more quickly.
*
! * As long as there are at least lotsfree pages, or no zones over their
! * cap, then pageout_scanner threads are not run. When scanning due to a
! * shortage of free memory, all pages are considered for pageout; when
! * scanning because one or more zones are over their memory cap, only
! * pages belonging to an over-cap zone are considered for pageout.
*
! * There are multiple threads that act on behalf of the pageout process. A
! * set of threads scan pages (pageout_scanner) and frees them up if they
! * don't require any VOP_PUTPAGE operation. If a page must be written back
! * to its backing store, the request is put on a list and the other
! * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
! * requests from the list, and processes them. Some filesystems may require
! * resources for the VOP_PUTPAGE operations (like memory) and hence can
! * block the pageout thread, but the scanner thread can still operate.
! * There is still no guarantee that memory deadlocks cannot occur.
*/
void
pageout()
{
struct async_reqs *arg;
int i;
pgcnt_t max_pushes;
callb_cpr_t cprinfo;
proc_pageout = ttoproc(curthread);
*** 1056,1070 ****
req_freelist = push_req;
for (i = 0; i < async_list_size - 1; i++) {
push_req[i].a_next = &push_req[i + 1];
}
! pageout_pri = curthread->t_pri;
! /* Create the (first) pageout scanner thread. */
! (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
! pageout_pri - 1);
/*
* kick off pageout scheduler.
*/
schedpaging(NULL);
--- 1070,1085 ----
req_freelist = push_req;
for (i = 0; i < async_list_size - 1; i++) {
push_req[i].a_next = &push_req[i + 1];
}
! pageout_pri = curthread->t_pri - 1;
! /* Create the first pageout scanner thread. */
! (void) lwp_kernel_create(proc_pageout, pageout_scanner,
! (void *)0, /* this is instance 0, not NULL */
! TS_RUN, pageout_pri);
/*
* kick off pageout scheduler.
*/
schedpaging(NULL);
*** 1096,1105 ****
--- 1111,1121 ----
arg->a_next = NULL;
pageout_pushing = true;
mutex_exit(&push_lock);
DTRACE_PROBE(pageout__push);
+
if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
pushes++;
}
*** 1120,1269 ****
* Kernel thread that scans pages looking for ones to free
*/
static void
pageout_scanner(void *a)
{
! struct page *fronthand, *backhand;
! uint_t laps, iter = 0;
callb_cpr_t cprinfo;
! pgcnt_t nscan_cnt, nscan_limit;
pgcnt_t pcount;
! uint_t inst = (uint_t)(uintptr_t)a;
hrtime_t sample_start, sample_end;
! kmutex_t pscan_mutex;
! bool sampling;
VERIFY3U(inst, <, MAX_PSCAN_THREADS);
! mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
- CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
- mutex_enter(&pscan_mutex);
-
/*
! * Establish the minimum and maximum length of time to be spent
! * scanning pages per wakeup, limiting the scanner duty cycle. The
! * input percentage values (0-100) must be converted to a fraction of
! * the number of nanoseconds in a second of wall time, then further
! * scaled down by the number of scanner wakeups in a second:
*/
! min_pageout_nsec = MAX(1,
! NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
! max_pageout_nsec = MAX(min_pageout_nsec,
! NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
loop:
cv_signal_pageout();
CALLB_CPR_SAFE_BEGIN(&cprinfo);
! cv_wait(&proc_pageout->p_cv, &pscan_mutex);
! CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
/*
! * Check if pageout has been disabled for debugging purposes:
*/
if (!dopageout) {
goto loop;
}
/*
! * One may reset the clock hands for debugging purposes. Hands will
! * also be reset if memory is added to or removed from the system.
*/
if (reset_hands[inst]) {
struct page *first;
- pgcnt_t offset = total_pages / n_page_scanners;
reset_hands[inst] = B_FALSE;
if (inst >= n_page_scanners) {
/*
* The desired number of page scanners has been
* reduced and this instance is no longer wanted.
* Exit the lwp.
*/
VERIFY3U(inst, !=, 0);
! mutex_exit(&pscan_mutex);
mutex_enter(&curproc->p_lock);
lwp_exit();
}
/*
! * The reset case repositions the hands at the proper place
! * on the memory clock face to prevent creep into another
! * thread's active region or when the number of threads has
! * changed.
! *
! * Set the two clock hands to be separated by a reasonable
! * amount, but no more than 360 degrees apart.
! *
! * If inst == 0, backhand starts at first page, otherwise
! * it is (inst * offset) around the memory "clock face" so that
! * we spread out each scanner instance evenly.
*/
! first = page_first();
! backhand = page_nextn(first, offset * inst);
! if (handspreadpages >= total_pages) {
! fronthand = page_nextn(backhand, total_pages - 1);
} else {
fronthand = page_nextn(backhand, handspreadpages);
}
- }
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
/*
* Keep track of the number of times we have scanned all the way around
! * the loop:
*/
laps = 0;
/*
* Track the number of pages visited during this scan so that we can
* periodically measure our duty cycle.
*/
- pcount = 0;
nscan_cnt = 0;
! if (PAGE_SCAN_STARTUP) {
/*
! * We need to measure the rate at which the system is able to
! * scan pages of memory. Each of these initial samples is a
! * scan of all system memory, regardless of whether or not we
! * are experiencing memory pressure.
*/
! nscan_limit = total_pages;
! sampling = true;
! } else {
! nscan_limit = desscan;
! sampling = false;
! }
- DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
- page_t *, backhand, page_t *, fronthand);
-
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
* Only scan while at least one of these is true:
* 1) one or more zones is over its cap
* 2) there is not enough free memory
* 3) during page scan startup when determining sample data
*/
! while (nscan_cnt < nscan_limit) {
! checkpage_result_t rvfront, rvback;
!
! if (!sampling && !zones_over &&
! freemem >= lotsfree + needfree) {
/*
* We are not sampling and enough memory has become
* available that scanning is no longer required.
*/
break;
}
! DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
/*
* Periodically check to see if we have exceeded the CPU duty
* cycle for a single wakeup.
*/
--- 1136,1299 ----
* Kernel thread that scans pages looking for ones to free
*/
static void
pageout_scanner(void *a)
{
! struct page *fronthand, *backhand, *fronthandstart;
! struct page *regionstart, *regionend;
! uint_t laps;
callb_cpr_t cprinfo;
! pgcnt_t nscan_cnt, tick;
pgcnt_t pcount;
! bool bhwrapping, fhwrapping;
hrtime_t sample_start, sample_end;
! uint_t inst = (uint_t)(uintptr_t)a;
VERIFY3U(inst, <, MAX_PSCAN_THREADS);
! CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
! mutex_enter(&pageout_mutex);
/*
! * The restart case does not attempt to point the hands at roughly
! * the right point on the assumption that after one circuit things
! * will have settled down, and restarts shouldn't be that often.
*/
! reset_hands[inst] = B_TRUE;
+ pageouts_running++;
+ mutex_exit(&pageout_mutex);
+
loop:
cv_signal_pageout();
+ mutex_enter(&pageout_mutex);
+ pageouts_running--;
CALLB_CPR_SAFE_BEGIN(&cprinfo);
! cv_wait(&proc_pageout->p_cv, &pageout_mutex);
! CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
! pageouts_running++;
! mutex_exit(&pageout_mutex);
/*
! * Check if pageout has been disabled for debugging purposes.
*/
if (!dopageout) {
goto loop;
}
/*
! * One may reset the clock hands and scanned region for debugging
! * purposes. Hands will also be reset on first thread startup, if
! * the number of scanning threads (n_page_scanners) changes, or if
! * memory is added to, or removed from, the system.
*/
if (reset_hands[inst]) {
struct page *first;
reset_hands[inst] = B_FALSE;
+
if (inst >= n_page_scanners) {
/*
* The desired number of page scanners has been
* reduced and this instance is no longer wanted.
* Exit the lwp.
*/
VERIFY3U(inst, !=, 0);
! DTRACE_PROBE1(pageout__exit, uint_t, inst);
! mutex_enter(&pageout_mutex);
! pageouts_running--;
! mutex_exit(&pageout_mutex);
mutex_enter(&curproc->p_lock);
lwp_exit();
+ /* NOTREACHED */
}
+ first = page_first();
+
/*
! * Each scanner thread gets its own sector of the memory
! * clock face.
*/
! pgcnt_t span, offset;
!
! span = looppages / n_page_scanners;
! VERIFY3U(span, >, handspreadpages);
!
! offset = inst * span;
! regionstart = page_nextn(first, offset);
! if (inst == n_page_scanners - 1) {
! /* The last instance goes up to the last page */
! regionend = page_nextn(first, looppages - 1);
} else {
+ regionend = page_nextn(regionstart, span - 1);
+ }
+
+ backhand = regionstart;
fronthand = page_nextn(backhand, handspreadpages);
+ tick = 1;
+
+ bhwrapping = fhwrapping = B_FALSE;
+
+ DTRACE_PROBE4(pageout__reset, uint_t, inst,
+ pgcnt_t, regionstart, pgcnt_t, regionend,
+ pgcnt_t, fronthand);
}
+ /*
+ * This CPU kstat is only incremented here and we're obviously
+ * on this CPU, so no lock.
+ */
CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
/*
* Keep track of the number of times we have scanned all the way around
! * the loop on this wakeup.
*/
laps = 0;
/*
* Track the number of pages visited during this scan so that we can
* periodically measure our duty cycle.
*/
nscan_cnt = 0;
+ pcount = 0;
! DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
! hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
!
/*
! * Record the initial position of the front hand for this cycle so
! * that we can detect when the hand wraps around.
*/
! fronthandstart = fronthand;
sample_start = gethrtime();
/*
* Scan the appropriate number of pages for a single duty cycle.
+ */
+ while (nscan_cnt < desscan) {
+ checkpage_result_t rvfront, rvback;
+
+ /*
* Only scan while at least one of these is true:
* 1) one or more zones is over its cap
* 2) there is not enough free memory
* 3) during page scan startup when determining sample data
*/
! if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
! !zones_over) {
/*
* We are not sampling and enough memory has become
* available that scanning is no longer required.
*/
+ DTRACE_PROBE1(pageout__memfree, uint_t, inst);
break;
}
! DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
/*
* Periodically check to see if we have exceeded the CPU duty
* cycle for a single wakeup.
*/
*** 1270,1286 ****
if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
hrtime_t pageout_cycle_nsec;
pageout_cycle_nsec = gethrtime() - sample_start;
if (pageout_cycle_nsec >= pageout_nsec) {
! /*
! * This is where we normally break out of the
! * loop when scanning zones or sampling.
! */
! if (!zones_over) {
atomic_inc_64(&pageout_timeouts);
- }
DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
}
--- 1300,1311 ----
if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
hrtime_t pageout_cycle_nsec;
pageout_cycle_nsec = gethrtime() - sample_start;
if (pageout_cycle_nsec >= pageout_nsec) {
! if (!zones_over)
atomic_inc_64(&pageout_timeouts);
DTRACE_PROBE1(pageout__timeout, uint_t, inst);
break;
}
}
*** 1308,1391 ****
*/
if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
nscan_cnt++;
}
! backhand = page_next(backhand);
! fronthand = page_next(fronthand);
! /*
! * The front hand has wrapped around to the first page in the
! * loop.
! */
! if (fronthand == page_first()) {
! DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
/*
! * Every 64 wraps we reposition our hands within our
! * region to prevent creep into another thread.
*/
! if ((++iter % pageout_reset_cnt) == 0)
! reset_hands[inst] = B_TRUE;
/*
* This CPU kstat is only incremented here and we're
* obviously on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, rev, 1);
/*
- * If scanning because the system is low on memory,
* then when we wraparound memory we want to try to
* reclaim more pages.
* If scanning only because zones are over their cap,
* then wrapping is common and we simply keep going.
*/
! if (freemem < lotsfree + needfree && ++laps > 1) {
/*
- * The system is low on memory.
* Extremely unlikely, but it happens.
* We went around the loop at least once
* and didn't get far enough.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
- mutex_enter(&pageout_mutex);
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
- mutex_exit(&pageout_mutex);
} else {
- mutex_exit(&pageout_mutex);
break;
}
}
}
}
atomic_add_long(&nscan, nscan_cnt);
! sample_end = gethrtime();
- DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
- uint_t, inst);
-
/*
- * The following two blocks are only relevant when the scanner is
- * first started up. After the scanner runs for a while, neither of
- * the conditions will ever be true again.
- *
* The global variables used below are only modified by this thread and
* only during initial scanning when there is a single page scanner
! * thread running. Thus, we don't use any locking.
*/
if (pageout_new_spread == 0) {
VERIFY3U(inst, ==, 0);
if (PAGE_SCAN_STARTUP) {
/*
* Continue accumulating samples until we have enough
! * to get a reasonable value for average scan rate:
*/
pageout_sample_pages += pcount;
pageout_sample_etime += sample_end - sample_start;
++pageout_sample_cnt;
}
--- 1333,1416 ----
*/
if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
nscan_cnt++;
}
! if (bhwrapping) {
! backhand = regionstart;
! bhwrapping = B_FALSE;
! } else {
! backhand = page_nextn(backhand, tick);
! if (backhand == regionend)
! bhwrapping = B_TRUE;
! }
! if (fhwrapping) {
! fronthand = regionstart;
! fhwrapping = B_FALSE;
! } else {
! fronthand = page_nextn(fronthand, tick);
! if (fronthand == regionend)
! fhwrapping = B_TRUE;
! }
/*
! * The front hand has wrapped around during this wakeup.
*/
! if (fronthand == fronthandstart) {
! laps++;
! DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
! uint_t, laps);
/*
* This CPU kstat is only incremented here and we're
* obviously on this CPU, so no lock.
*/
CPU_STATS_ADDQ(CPU, vm, rev, 1);
/*
* If scanning because the system is low on memory,
* then when we wraparound memory we want to try to
* reclaim more pages.
* If scanning only because zones are over their cap,
* then wrapping is common and we simply keep going.
*/
! if (laps > 1 && freemem < lotsfree + needfree) {
/*
* Extremely unlikely, but it happens.
* We went around the loop at least once
* and didn't get far enough.
* If we are still skipping `highly shared'
* pages, skip fewer of them. Otherwise,
* give up till the next clock tick.
*/
if (po_share < MAX_PO_SHARE) {
po_share <<= 1;
} else {
break;
}
}
}
}
+ sample_end = gethrtime();
atomic_add_long(&nscan, nscan_cnt);
! DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
! pgcnt_t, nscan_cnt, pgcnt_t, pcount)
/*
* The global variables used below are only modified by this thread and
* only during initial scanning when there is a single page scanner
! * thread running.
*/
if (pageout_new_spread == 0) {
VERIFY3U(inst, ==, 0);
+
if (PAGE_SCAN_STARTUP) {
/*
* Continue accumulating samples until we have enough
! * to get a reasonable value for average scan rate.
*/
pageout_sample_pages += pcount;
pageout_sample_etime += sample_end - sample_start;
++pageout_sample_cnt;
}
*** 1535,1545 ****
}
/*
* Maintain statistics for what we are freeing
*/
-
if (pp->p_vnode != NULL) {
if (pp->p_vnode->v_flag & VVMEXEC)
isexec = 1;
if (!IS_SWAPFSVP(pp->p_vnode))
--- 1560,1569 ----