Print this page
13097 improve VM tunables for modern systems (fix mismerge)
        
*** 240,259 ****
  pgcnt_t         desfree = 0;
  pgcnt_t         lotsfree = 0;
  pgcnt_t         needfree = 0;
  pgcnt_t         throttlefree = 0;
  pgcnt_t         pageout_reserve = 0;
  
  pgcnt_t         deficit;
  pgcnt_t         nscan;
  pgcnt_t         desscan;
  
  /* kstats */
  uint64_t low_mem_scan;
  uint64_t zone_cap_scan;
- uint64_t n_throttle;
  
  /*
   * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
   * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
   * that gives the equivalent of some underlying %CPU duty cycle.
   *
--- 240,261 ----
  pgcnt_t         desfree = 0;
  pgcnt_t         lotsfree = 0;
  pgcnt_t         needfree = 0;
  pgcnt_t         throttlefree = 0;
  pgcnt_t         pageout_reserve = 0;
+ pri_t           pageout_pri;
  
  pgcnt_t         deficit;
  pgcnt_t         nscan;
  pgcnt_t         desscan;
  
  /* kstats */
  uint64_t low_mem_scan;
  uint64_t zone_cap_scan;
  
+ #define MAX_PSCAN_THREADS       16
+ 
  /*
   * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
   * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
   * that gives the equivalent of some underlying %CPU duty cycle.
   *
*** 272,305 ****
   * zone_pageout_nsec:
   *     Number of nanoseconds budget for each cycle when a zone
   *     is over its memory cap. If this is zero, then the value
   *     of max_pageout_nsec is used instead.
   */
- 
  static hrtime_t min_pageout_nsec;
  static hrtime_t max_pageout_nsec;
  static hrtime_t pageout_nsec;
  static hrtime_t zone_pageout_nsec;
  
- #define MAX_PSCAN_THREADS       16
  static boolean_t reset_hands[MAX_PSCAN_THREADS];
  
  /*
!  * These can be tuned in /etc/system or set with mdb.
!  * 'des_page_scanners' is the desired number of page scanner threads. The
!  * system will bring the actual number of threads into line with the desired
!  * number. If des_page_scanners is set to an invalid value, the system will
!  * correct the setting.
   */
! uint_t des_page_scanners;
! uint_t pageout_reset_cnt = 64;  /* num. cycles for pageout_scanner hand reset */
  
- uint_t n_page_scanners;
- static pgcnt_t  pscan_region_sz; /* informational only */
- 
- #define PAGES_POLL_MASK 1023
- 
  /*
   * pageout_sample_lim:
   *     The limit on the number of samples needed to establish a value for new
   *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
   *     handspreadpages.
--- 274,302 ----
   * zone_pageout_nsec:
   *      Number of nanoseconds budget for each cycle when a zone
   *      is over its memory cap. If this is zero, then the value
   *      of max_pageout_nsec is used instead.
   */
  static hrtime_t min_pageout_nsec;
  static hrtime_t max_pageout_nsec;
  static hrtime_t pageout_nsec;
  static hrtime_t zone_pageout_nsec;
  
  static boolean_t        reset_hands[MAX_PSCAN_THREADS];
  
+ #define PAGES_POLL_MASK 1023
+ #define SCHEDPAGING_HZ  4
+ 
  /*
!  * despagescanners:
!  *      The desired number of page scanner threads. The value can be set in
!  *      /etc/system or tuned directly with 'mdb -kw'.  The system will bring
!  *      the actual number of threads into line with the desired number. If set
!  *      to an invalid value, the system will correct the setting.
   */
! uint_t despagescanners = 0;
  
  /*
   * pageout_sample_lim:
   *     The limit on the number of samples needed to establish a value for new
   *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
   *     handspreadpages.
*** 321,344 ****
   *     Initially zero while the system scan rate is measured by
   *     pageout_scanner(), which then sets this value once per system boot after
   *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
   *     new value is used for fastscan and handspreadpages.
   */
- 
  typedef hrtime_t hrrate_t;
  
  static uint64_t pageout_sample_lim = 4;
  static uint64_t pageout_sample_cnt = 0;
  static pgcnt_t  pageout_sample_pages = 0;
  static hrrate_t pageout_rate = 0;
  static pgcnt_t  pageout_new_spread = 0;
  
! static hrtime_t pageout_sample_etime = 0;
! 
! /* True if page scanner is first starting up */
  #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
  
  /*
   * Record number of times a pageout_scanner() wakeup cycle finished because it
   * timed out (exceeded its CPU budget), rather than because it visited
   * its budgeted number of pages. This is only done when scanning under low
   * free memory conditions, not when scanning for zones over their cap.
--- 318,344 ----
   *     Initially zero while the system scan rate is measured by
   *     pageout_scanner(), which then sets this value once per system boot after
   *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
   *     new value is used for fastscan and handspreadpages.
   */
  typedef hrtime_t hrrate_t;
  
  static uint64_t pageout_sample_lim = 4;
  static uint64_t pageout_sample_cnt = 0;
  static pgcnt_t  pageout_sample_pages = 0;
+ static hrtime_t pageout_sample_etime = 0;
  static hrrate_t pageout_rate = 0;
  static pgcnt_t  pageout_new_spread = 0;
  
! /* True if the page scanner is first starting up */
  #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
  
+ /* The current number of page scanner threads */
+ static uint_t n_page_scanners = 1;
+ /* The number of page scanner threads that are actively scanning. */
+ static uint_t pageouts_running;
+ 
  /*
   * Record number of times a pageout_scanner() wakeup cycle finished because it
   * timed out (exceeded its CPU budget), rather than because it visited
   * its budgeted number of pages. This is only done when scanning under low
   * free memory conditions, not when scanning for zones over their cap.
*** 383,395 ****
          pgcnt_t ci_maxpgio;
          pgcnt_t ci_maxfastscan;
          pgcnt_t ci_fastscan;
          pgcnt_t ci_slowscan;
          pgcnt_t ci_handspreadpages;
  } clockinit = { .ci_init = false };
  
! static pgcnt_t
  clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
  {
          if (value < minimum) {
                  return (minimum);
          } else if (value > maximum) {
--- 383,396 ----
          pgcnt_t ci_maxpgio;
          pgcnt_t ci_maxfastscan;
          pgcnt_t ci_fastscan;
          pgcnt_t ci_slowscan;
          pgcnt_t ci_handspreadpages;
+         uint_t  ci_despagescanners;
  } clockinit = { .ci_init = false };
  
! static inline pgcnt_t
  clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
  {
          if (value < minimum) {
                  return (minimum);
          } else if (value > maximum) {
*** 419,428 ****
--- 420,495 ----
   * doesn't modify the variable, it only cares if the variable is 0 or non-0.
   */
  static boolean_t zones_over = B_FALSE;
  
  /*
+  * On large memory systems, multiple instances of the page scanner are run,
+  * each responsible for a separate region of memory. This speeds up page
+  * invalidation under low memory conditions.
+  *
+  * despagescanners can be set in /etc/system or via mdb and it will be used
+  * as a guide for how many page scanners to create; the value is clamped if
+  * it is not sensible. Otherwise, one scanner per 64GiB of looppages is used,
+  * limited by MAX_PSCAN_THREADS and by the size of handspreadpages.
+  */
+ static void
+ recalc_pagescanners(void)
+ {
+         pgcnt_t sz;
+         uint_t des;
+ 
+         /* If the initial calibration has not been done, take no action. */
+         if (pageout_new_spread == 0)
+                 return;
+ 
+         /*
+          * If despagescanners is unset, fall back to the value saved by
+          * setupclock() on its first call (i.e. the /etc/system setting).
+          */
+         if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
+                 despagescanners = clockinit.ci_despagescanners;
+ 
+         if (despagescanners != 0) {
+                 /*
+                  * We have a desired number of page scanners, either from
+                  * /etc/system or set via mdb. Try and use it (it will be
+                  * clamped below).
+                  */
+                 des = despagescanners;
+         } else {
+                 /*
+                  * Calculate the number of desired scanners based on the
+                  * system's memory size (looppages).
+                  *
+                  * One scanner is wanted per 64GiB region, rounding up for a
+                  * partial region. For systems with up to 64GiB of RAM, a
+                  * single thread is used; for very large memory systems the
+                  * count is limited to MAX_PSCAN_THREADS by the clamp below.
+                  */
+                 sz = btop(64ULL << 30);
+ 
+                 if (sz > looppages) {
+                         des = 1;
+                 } else {
+                         pgcnt_t tmp = sz;
+ 
+                         for (des = 1; tmp < looppages; des++)
+                                 tmp += sz;
+                 }
+         }
+ 
+         /*
+          * Clamp the number of scanners so that we do not exceed
+          * MAX_PSCAN_THREADS and so that each scanner covers at least 10%
+          * more than handspreadpages. At least one scanner is always kept.
+          */
+         des = clamp(des, 1,
+             looppages / (handspreadpages + handspreadpages / 10));
+         despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
+ }
+ 
+ /*
   * Set up the paging constants for the clock algorithm used by
   * pageout_scanner(), and by the virtual memory system overall.  See the
   * comments at the top of this file for more information about the threshold
   * values and system responses to memory pressure.
   *
*** 432,444 ****
   * rate become available.
   */
  void
  setupclock(void)
  {
-         uint_t i;
-         pgcnt_t sz, tmp;
-         pgcnt_t defval;
          bool half = (pageout_threshold_style == 1);
          bool recalc = true;
  
          looppages = total_pages;
  
--- 499,508 ----
*** 448,458 ****
           * values so that they can be used for subsequent recalculations.
           *
           * A value of zero for any tunable means we will use the default
           * sizing.
           */
- 
          if (!clockinit.ci_init) {
                  clockinit.ci_init = true;
  
                  clockinit.ci_lotsfree_min = lotsfree_min;
                  clockinit.ci_lotsfree_max = lotsfree_max;
--- 512,521 ----
*** 464,473 ****
--- 527,537 ----
                  clockinit.ci_maxpgio = maxpgio;
                  clockinit.ci_maxfastscan = maxfastscan;
                  clockinit.ci_fastscan = fastscan;
                  clockinit.ci_slowscan = slowscan;
                  clockinit.ci_handspreadpages = handspreadpages;
+                 clockinit.ci_despagescanners = despagescanners;
  
                  /*
                   * The first call does not trigger a recalculation, only
                   * subsequent calls.
                   */
*** 645,655 ****
          if (slowscan > fastscan / 2) {
                  slowscan = fastscan / 2;
          }
  
          /*
!          * Handspreadpages is distance (in pages) between front and back
           * pageout daemon hands.  The amount of time to reclaim a page
           * once pageout examines it increases with this distance and
           * decreases as the scan rate rises. It must be < the amount
           * of pageable memory.
           *
--- 709,719 ----
          if (slowscan > fastscan / 2) {
                  slowscan = fastscan / 2;
          }
  
          /*
!          * Handspreadpages is the distance (in pages) between front and back
           * pageout daemon hands.  The amount of time to reclaim a page
           * once pageout examines it increases with this distance and
           * decreases as the scan rate rises. It must be < the amount
           * of pageable memory.
           *
*** 680,741 ****
           */
          if (handspreadpages >= looppages) {
                  handspreadpages = looppages - 1;
          }
  
-         if (!recalc) {
                  /*
!                  * Setup basic values at initialization.
                   */
!                 pscan_region_sz = total_pages;
!                 des_page_scanners = n_page_scanners = 1;
!                 reset_hands[0] = B_TRUE;
!                 return;
!         }
  
          /*
!          * Recalculating
!          *
!          * We originally set the number of page scanners to 1. Now that we
!          * know what the handspreadpages is for a scanner, figure out how many
!          * scanners we should run. We want to ensure that the regions don't
!          * overlap and that they are not touching.
!          *
!          * A default 64GB region size is used as the initial value to calculate
!          * how many scanner threads we should create on lower memory systems.
!          * The idea is to limit the number of threads to a practical value
!          * (e.g. a 64GB machine really only needs one scanner thread). For very
!          * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
!          * threads.
!          *
!          * The scanner threads themselves are evenly spread out around the
!          * memory "clock" in pageout_scanner when we reset the hands, and each
!          * thread will scan all of memory.
           */
!         sz = (btop(64ULL * 0x40000000ULL));
!         if (sz < handspreadpages) {
                  /*
!                  * 64GB is smaller than the separation between the front
!                  * and back hands; use double handspreadpages.
                   */
!                 sz = handspreadpages << 1;
!         }
!         if (sz > total_pages) {
!                 sz = total_pages;
!         }
!         /* Record region size for inspection with mdb, otherwise unused */
!         pscan_region_sz = sz;
  
!         tmp = sz;
!         for (i = 1; tmp < total_pages; i++) {
!                 tmp += sz;
!         }
! 
!         if (i > MAX_PSCAN_THREADS)
!                 i = MAX_PSCAN_THREADS;
! 
!         des_page_scanners = i;
  }
  
  /*
   * Pageout scheduling.
   *
--- 744,779 ----
           */
          if (handspreadpages >= looppages) {
                  handspreadpages = looppages - 1;
          }
  
          /*
!          * Establish the minimum and maximum length of time to be spent
!          * scanning pages per wakeup, limiting the scanner duty cycle.  The
!          * input percentage values (0-100) must be converted to a fraction of
!          * the number of nanoseconds in a second of wall time, then further
!          * scaled down by the number of scanner wakeups in a second.
           */
!         min_pageout_nsec = MAX(1,
!             NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
!         max_pageout_nsec = MAX(min_pageout_nsec,
!             NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
  
          /*
!          * If not called for recalculation, return and skip the remaining
!          * steps.
           */
!         if (!recalc)
!                 return;
! 
          /*
!          * Set a flag to re-evaluate the clock hand positions.
           */
!         for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
!                 reset_hands[i] = B_TRUE;
  
!         recalc_pagescanners();
  }
  
  /*
   * Pageout scheduling.
   *
*** 745,757 ****
   * in its current pass; schedpaging() resets this value to zero each time
   * it runs.  Desscan records the number of pages pageout should examine
   * in its next pass; schedpaging() sets this value based on the amount of
   * currently available memory.
   */
- #define SCHEDPAGING_HZ  4
  
! static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
  
  /*
   * Pool of available async pageout putpage requests.
   */
  static struct async_reqs *push_req;
--- 783,794 ----
   * in its current pass; schedpaging() resets this value to zero each time
   * it runs.  Desscan records the number of pages pageout should examine
   * in its next pass; schedpaging() sets this value based on the amount of
   * currently available memory.
   */
  
! static kmutex_t pageout_mutex;
  
  /*
   * Pool of available async pageout putpage requests.
   */
  static struct async_reqs *push_req;
*** 775,785 ****
  static uint_t pageout_stucktime = 0;
  static bool pageout_pushing = false;
  static uint64_t pageout_pushcount = 0;
  static uint64_t pageout_pushcount_seen = 0;
  
! static int async_list_size = 256;       /* number of async request structs */
  
  static void pageout_scanner(void *);
  
  /*
   * If a page is being shared more than "po_share" times
--- 812,822 ----
  static uint_t pageout_stucktime = 0;
  static bool pageout_pushing = false;
  static uint64_t pageout_pushcount = 0;
  static uint64_t pageout_pushcount_seen = 0;
  
! static int async_list_size = 8192;      /* number of async request structs */
  
  static void pageout_scanner(void *);
  
  /*
   * If a page is being shared more than "po_share" times
*** 806,840 ****
                  seg_preap();
  
          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                  kcage_cageout_wakeup();
  
!         (void) atomic_swap_ulong(&nscan, 0);
          vavail = freemem - deficit;
          if (pageout_new_spread != 0)
                  vavail -= needfree;
!         if (vavail < 0)
!                 vavail = 0;
!         if (vavail > lotsfree)
!                 vavail = lotsfree;
  
-         /*
-          * Fix for 1161438 (CRS SPR# 73922).  All variables
-          * in the original calculation for desscan were 32 bit signed
-          * ints.  As freemem approaches 0x0 on a system with 1 Gig or
-          * more of memory, the calculation can overflow.  When this
-          * happens, desscan becomes negative and pageout_scanner()
-          * stops paging out.
-          */
          if (needfree > 0 && pageout_new_spread == 0) {
                  /*
                   * If we've not yet collected enough samples to
!                  * calculate a spread, kick into high gear anytime
!                  * needfree is non-zero. Note that desscan will not be
!                  * the limiting factor for systems with larger memory;
!                  * the %CPU will limit the scan. That will also be
!                  * maxed out below.
                   */
                  desscan = fastscan / SCHEDPAGING_HZ;
          } else {
                  /*
                   * Once we've calculated a spread based on system
--- 843,869 ----
                  seg_preap();
  
          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                  kcage_cageout_wakeup();
  
!         if (mutex_tryenter(&pageout_mutex)) {
! 
!                 if (pageouts_running != 0)
!                         goto out;
! 
!                 /* No pageout scanner threads running. */
!                 nscan = 0;
                  vavail = freemem - deficit;
                  if (pageout_new_spread != 0)
                          vavail -= needfree;
!                 vavail = clamp(vavail, 0, lotsfree);
  
                  if (needfree > 0 && pageout_new_spread == 0) {
                          /*
                           * If we've not yet collected enough samples to
!                          * calculate a spread, use the old logic of kicking
!                          * into high gear anytime needfree is non-zero.
                           */
                          desscan = fastscan / SCHEDPAGING_HZ;
                  } else {
                          /*
                           * Once we've calculated a spread based on system
*** 848,934 ****
                  result = (slowstmp + faststmp) /
                      nz(lotsfree) / SCHEDPAGING_HZ;
                  desscan = (pgcnt_t)result;
          }
  
!         /*
!          * If we've not yet collected enough samples to calculate a
!          * spread, also kick %CPU to the max.
!          */
!         if (pageout_new_spread == 0) {
!                 pageout_nsec = max_pageout_nsec;
!         } else {
!                 pageout_nsec = min_pageout_nsec +
!                     (lotsfree - vavail) *
!                     (max_pageout_nsec - min_pageout_nsec) /
!                     nz(lotsfree);
!         }
  
!         if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
                  /*
!                  * We have finished the pagescan initialization and the desired
!                  * number of page scanners has changed, either because
!                  * initialization just finished, because of a memory DR, or
!                  * because des_page_scanners has been modified on the fly (i.e.
!                  * by mdb). If we need more scanners, start them now, otherwise
!                  * the excess scanners will terminate on their own when they
!                  * reset their hands.
                   */
!                 uint_t i;
!                 uint_t curr_nscan = n_page_scanners;
!                 pgcnt_t max = total_pages / handspreadpages;
  
!                 if (des_page_scanners > max)
!                         des_page_scanners = max;
  
!                 if (des_page_scanners > MAX_PSCAN_THREADS) {
!                         des_page_scanners = MAX_PSCAN_THREADS;
!                 } else if (des_page_scanners == 0) {
!                         des_page_scanners = 1;
!                 }
  
!                 /*
!                  * Each thread has its own entry in the reset_hands array, so
!                  * we don't need any locking in pageout_scanner to check the
!                  * thread's reset_hands entry. Thus, we use a pre-allocated
!                  * fixed size reset_hands array and upper limit on the number
!                  * of pagescan threads.
!                  *
!                  * The reset_hands entries need to be true before we start new
!                  * scanners, but if we're reducing, we don't want a race on the
!                  * recalculation for the existing threads, so we set
!                  * n_page_scanners first.
!                  */
!                 n_page_scanners = des_page_scanners;
!                 for (i = 0; i < MAX_PSCAN_THREADS; i++) {
                          reset_hands[i] = B_TRUE;
-                 }
  
!                 if (des_page_scanners > curr_nscan) {
!                         /* Create additional pageout scanner threads. */
!                         for (i = curr_nscan; i < des_page_scanners; i++) {
                                  (void) lwp_kernel_create(proc_pageout,
!                                     pageout_scanner, (void *)(uintptr_t)i,
!                                     TS_RUN, curthread->t_pri);
                          }
                  }
          }
  
          zones_over = B_FALSE;
  
!         if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
!                 if (!PAGE_SCAN_STARTUP)
!                         low_mem_scan++;
                  /*
!                  * Either we need more memory, or we still need to
!                  * measure the average scan rate.  Wake the scanner.
                   */
!                 DTRACE_PROBE(schedpage__wake__low);
                  WAKE_PAGEOUT_SCANNER();
  
          } else if (zone_num_over_cap > 0) {
!                 /* One or more zones are over their cap. */
  
                  /* No page limit */
                  desscan = total_pages;
  
                  /*
--- 877,957 ----
                          result = (slowstmp + faststmp) /
                              nz(lotsfree) / SCHEDPAGING_HZ;
                          desscan = (pgcnt_t)result;
                  }
  
!                 pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
!                     (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
  
!                 DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
!                     pageout_nsec);
! 
!                 if (pageout_new_spread != 0 && despagescanners != 0 &&
!                     despagescanners != n_page_scanners) {
                          /*
!                         * We have finished the pagescan initialisation and the
!                         * desired number of page scanners has changed, either
!                         * because initialisation just finished, because of a
!                         * memory DR, or because despagescanners has been
!                         * modified on the fly (i.e. by mdb).
                          */
!                         uint_t i, curr_nscan = n_page_scanners;
  
!                         /* Re-validate despagescanners */
!                         recalc_pagescanners();
  
!                         n_page_scanners = despagescanners;
  
!                         for (i = 0; i < MAX_PSCAN_THREADS; i++)
                                  reset_hands[i] = B_TRUE;
  
!                         /* If we need more scanners, start them now. */
!                         if (n_page_scanners > curr_nscan) {
!                                 for (i = curr_nscan; i < n_page_scanners; i++) {
                                          (void) lwp_kernel_create(proc_pageout,
!                                             pageout_scanner,
!                                             (void *)(uintptr_t)i, TS_RUN,
!                                             pageout_pri);
                                  }
                          }
+ 
+                         /*
+                          * If the number of scanners has decreased, trigger a
+                          * wakeup so that the excess threads will terminate.
+                          */
+                         if (n_page_scanners < curr_nscan) {
+                                 WAKE_PAGEOUT_SCANNER();
                          }
+                 }
  
                  zones_over = B_FALSE;
  
!                 if (PAGE_SCAN_STARTUP) {
                          /*
!                          * We still need to measure the rate at which the
!                          * system is able to scan pages of memory. Each of
!                          * these initial samples is a scan of as much system
!                          * memory as practical, regardless of whether or not we
!                          * are experiencing memory pressure.
                           */
!                         desscan = total_pages;
!                         pageout_nsec = max_pageout_nsec;
! 
!                         DTRACE_PROBE(schedpage__wake__sample);
                          WAKE_PAGEOUT_SCANNER();
+                 } else if (freemem < lotsfree + needfree) {
+                         /*
+                          * We need more memory.
+                          */
+                         low_mem_scan++;
  
+                         DTRACE_PROBE(schedpage__wake__low);
+                         WAKE_PAGEOUT_SCANNER();
                  } else if (zone_num_over_cap > 0) {
!                         /*
!                          * One or more zones are over their cap.
!                          */
  
                          /* No page limit */
                          desscan = total_pages;
  
                          /*
*** 943,966 ****
                  zones_over = B_TRUE;
                  zone_cap_scan++;
  
                  DTRACE_PROBE(schedpage__wake__zone);
                  WAKE_PAGEOUT_SCANNER();
- 
          } else {
                  /*
                   * There are enough free pages, no need to
                   * kick the scanner thread.  And next time
                   * around, keep more of the `highly shared'
                   * pages.
                   */
                  cv_signal_pageout();
- 
-                 mutex_enter(&pageout_mutex);
                  if (po_share > MIN_PO_SHARE) {
                          po_share >>= 1;
                  }
                  mutex_exit(&pageout_mutex);
          }
  
          /*
           * Signal threads waiting for available memory.
--- 966,988 ----
                          zones_over = B_TRUE;
                          zone_cap_scan++;
  
                          DTRACE_PROBE(schedpage__wake__zone);
                          WAKE_PAGEOUT_SCANNER();
                  } else {
                          /*
                           * There are enough free pages, no need to
                           * kick the scanner thread.  And next time
                           * around, keep more of the `highly shared'
                           * pages.
                           */
                          cv_signal_pageout();
                          if (po_share > MIN_PO_SHARE) {
                                  po_share >>= 1;
                          }
+                 }
+ out:
                  mutex_exit(&pageout_mutex);
          }
  
          /*
           * Signal threads waiting for available memory.
*** 985,1035 ****
  uint_t dopageout = 1;
  
  /*
   * The page out daemon, which runs as process 2.
   *
!  * Page out occurs when either:
!  * a) there is less than lotsfree pages,
!  * b) there are one or more zones over their physical memory cap.
   *
!  * The daemon treats physical memory as a circular array of pages and scans the
!  * pages using a 'two-handed clock' algorithm. The front hand moves through
!  * the pages, clearing the reference bit. The back hand travels a distance
!  * (handspreadpages) behind the front hand, freeing the pages that have not
!  * been referenced in the time since the front hand passed. If modified, they
!  * are first written to their backing store before being freed.
   *
!  * In order to make page invalidation more responsive on machines with larger
!  * memory, multiple pageout_scanner threads may be created. In this case, the
!  * threads are evenly distributed around the the memory "clock face" so that
!  * memory can be reclaimed more quickly (that is, there can be large regions in
!  * which no pages can be reclaimed by a single thread, leading to lag which
!  * causes undesirable behavior such as htable stealing).
   *
!  * As long as there are at least lotsfree pages, or no zones over their cap,
!  * then pageout_scanner threads are not run. When pageout_scanner threads are
!  * running for case (a), all pages are considered for pageout. For case (b),
!  * only pages belonging to a zone over its cap will be considered for pageout.
!  *
!  * There are multiple threads that act on behalf of the pageout process.
!  * A set of threads scan pages (pageout_scanner) and frees them up if
!  * they don't require any VOP_PUTPAGE operation. If a page must be
!  * written back to its backing store, the request is put on a list
!  * and the other (pageout) thread is signaled. The pageout thread
!  * grabs VOP_PUTPAGE requests from the list, and processes them.
!  * Some filesystems may require resources for the VOP_PUTPAGE
!  * operations (like memory) and hence can block the pageout
!  * thread, but the pageout_scanner threads can still operate. There is still
!  * no guarantee that memory deadlocks cannot occur.
!  *
!  * The pageout_scanner parameters are determined in schedpaging().
   */
  void
  pageout()
  {
          struct async_reqs *arg;
-         pri_t pageout_pri;
          int i;
          pgcnt_t max_pushes;
          callb_cpr_t cprinfo;
  
          proc_pageout = ttoproc(curthread);
--- 1007,1049 ----
  uint_t dopageout = 1;
  
  /*
   * The page out daemon, which runs as process 2.
   *
!  * The daemon treats physical memory as a circular array of pages and scans
!  * the pages using a 'two-handed clock' algorithm. The front hand moves
!  * through the pages, clearing the reference bit. The back hand travels a
!  * distance (handspreadpages) behind the front hand, freeing the pages that
!  * have not been referenced in the time since the front hand passed. If
!  * modified, they are first written to their backing store before being
!  * freed.
   *
!  * In order to make page invalidation more responsive on machines with
!  * larger memory, multiple pageout_scanner threads may be created. In this
!  * case, each thread is given a segment of the memory "clock face" so that
!  * memory can be reclaimed more quickly.
   *
!  * As long as there are at least lotsfree pages free and no zones are over
!  * their cap, the pageout_scanner threads are not run. When the scanners
!  * run because (a) free memory is low, all pages are considered for
!  * pageout. When they run only because (b) a zone is over its cap, only
!  * pages belonging to a zone over its cap are considered for pageout.
   *
!  * There are multiple threads that act on behalf of the pageout process. A
!  * set of threads (pageout_scanner) scans pages and frees them up if they
!  * don't require any VOP_PUTPAGE operation. If a page must be written back
!  * to its backing store, the request is put on a list and the other
!  * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
!  * requests from the list, and processes them. Some filesystems may require
!  * resources for the VOP_PUTPAGE operations (like memory) and hence can
!  * block the pageout thread, but the pageout_scanner threads can still
!  * operate. There is still no guarantee that memory deadlocks cannot occur.
   */
  void
  pageout()
  {
          struct async_reqs *arg;
          int i;
          pgcnt_t max_pushes;
          callb_cpr_t cprinfo;
  
          proc_pageout = ttoproc(curthread);
*** 1056,1070 ****
          req_freelist = push_req;
          for (i = 0; i < async_list_size - 1; i++) {
                  push_req[i].a_next = &push_req[i + 1];
          }
  
!         pageout_pri = curthread->t_pri;
  
!         /* Create the (first) pageout scanner thread. */
!         (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
!             pageout_pri - 1);
  
          /*
           * kick off pageout scheduler.
           */
          schedpaging(NULL);
--- 1070,1085 ----
          req_freelist = push_req;
          for (i = 0; i < async_list_size - 1; i++) {
                  push_req[i].a_next = &push_req[i + 1];
          }
  
!         pageout_pri = curthread->t_pri - 1;
  
!         /* Create the first pageout scanner thread. */
!         (void) lwp_kernel_create(proc_pageout, pageout_scanner,
!             (void *)0,  /* this is instance 0, not NULL */
!             TS_RUN, pageout_pri);
  
          /*
           * kick off pageout scheduler.
           */
          schedpaging(NULL);
*** 1096,1105 ****
--- 1111,1121 ----
                  arg->a_next = NULL;
                  pageout_pushing = true;
                  mutex_exit(&push_lock);
  
                  DTRACE_PROBE(pageout__push);
+ 
                  if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
                      arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
                          pushes++;
                  }
  
*** 1120,1269 ****
   * Kernel thread that scans pages looking for ones to free
   */
  static void
  pageout_scanner(void *a)
  {
!         struct page *fronthand, *backhand;
!         uint_t laps, iter = 0;
          callb_cpr_t cprinfo;
!         pgcnt_t nscan_cnt, nscan_limit;
          pgcnt_t pcount;
!         uint_t inst = (uint_t)(uintptr_t)a;
          hrtime_t sample_start, sample_end;
!         kmutex_t pscan_mutex;
!         bool sampling;
  
          VERIFY3U(inst, <, MAX_PSCAN_THREADS);
  
!         mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
  
-         CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
-         mutex_enter(&pscan_mutex);
- 
          /*
!          * Establish the minimum and maximum length of time to be spent
!          * scanning pages per wakeup, limiting the scanner duty cycle.  The
!          * input percentage values (0-100) must be converted to a fraction of
!          * the number of nanoseconds in a second of wall time, then further
!          * scaled down by the number of scanner wakeups in a second:
           */
!         min_pageout_nsec = MAX(1,
!             NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
!         max_pageout_nsec = MAX(min_pageout_nsec,
!             NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
  
  loop:
          cv_signal_pageout();
  
          CALLB_CPR_SAFE_BEGIN(&cprinfo);
!         cv_wait(&proc_pageout->p_cv, &pscan_mutex);
!         CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
  
          /*
!          * Check if pageout has been disabled for debugging purposes:
           */
          if (!dopageout) {
                  goto loop;
          }
  
          /*
!          * One may reset the clock hands for debugging purposes.  Hands will
!          * also be reset if memory is added to or removed from the system.
           */
          if (reset_hands[inst]) {
                  struct page *first;
-                 pgcnt_t offset = total_pages / n_page_scanners;
  
                  reset_hands[inst] = B_FALSE;
                  if (inst >= n_page_scanners) {
                          /*
                           * The desired number of page scanners has been
                           * reduced and this instance is no longer wanted.
                           * Exit the lwp.
                           */
                          VERIFY3U(inst, !=, 0);
!                         mutex_exit(&pscan_mutex);
                          mutex_enter(&curproc->p_lock);
                          lwp_exit();
                  }
  
                  /*
!                  * The reset case repositions the hands at the proper place
!                  * on the memory clock face to prevent creep into another
!                  * thread's active region or when the number of threads has
!                  * changed.
!                  *
!                  * Set the two clock hands to be separated by a reasonable
!                  * amount, but no more than 360 degrees apart.
!                  *
!                  * If inst == 0, backhand starts at first page, otherwise
!                  * it is (inst * offset) around the memory "clock face" so that
!                  * we spread out each scanner instance evenly.
                   */
!                 first = page_first();
!                 backhand = page_nextn(first, offset * inst);
!                 if (handspreadpages >= total_pages) {
!                         fronthand = page_nextn(backhand, total_pages - 1);
                  } else {
                          fronthand = page_nextn(backhand, handspreadpages);
                  }
-         }
  
          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
  
          /*
           * Keep track of the number of times we have scanned all the way around
!          * the loop:
           */
          laps = 0;
  
          /*
           * Track the number of pages visited during this scan so that we can
           * periodically measure our duty cycle.
           */
-         pcount = 0;
          nscan_cnt = 0;
  
!         if (PAGE_SCAN_STARTUP) {
                  /*
!                  * We need to measure the rate at which the system is able to
!                  * scan pages of memory.  Each of these initial samples is a
!                  * scan of all system memory, regardless of whether or not we
!                  * are experiencing memory pressure.
                   */
!                 nscan_limit = total_pages;
!                 sampling = true;
!         } else {
!                 nscan_limit = desscan;
!                 sampling = false;
!         }
  
-         DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
-             page_t *, backhand, page_t *, fronthand);
- 
          sample_start = gethrtime();
  
          /*
           * Scan the appropriate number of pages for a single duty cycle.
           * Only scan while at least one of these is true:
           * 1) one or more zones is over its cap
           * 2) there is not enough free memory
           * 3) during page scan startup when determining sample data
           */
!         while (nscan_cnt < nscan_limit) {
!                 checkpage_result_t rvfront, rvback;
! 
!                 if (!sampling && !zones_over &&
!                     freemem >= lotsfree + needfree) {
                          /*
                           * We are not sampling and enough memory has become
                           * available that scanning is no longer required.
                           */
                          break;
                  }
  
!                 DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
  
                  /*
                   * Periodically check to see if we have exceeded the CPU duty
                   * cycle for a single wakeup.
                   */
--- 1136,1299 ----
   * Kernel thread that scans pages looking for ones to free
   */
  static void
  pageout_scanner(void *a)
  {
!         struct page *fronthand, *backhand, *fronthandstart;
!         struct page *regionstart, *regionend;
!         uint_t laps;
          callb_cpr_t cprinfo;
!         pgcnt_t nscan_cnt, tick;
          pgcnt_t pcount;
!         bool bhwrapping, fhwrapping;
          hrtime_t sample_start, sample_end;
!         uint_t inst = (uint_t)(uintptr_t)a;
  
          VERIFY3U(inst, <, MAX_PSCAN_THREADS);
  
!         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
!         mutex_enter(&pageout_mutex);
  
          /*
!          * The restart case does not attempt to point the hands at roughly
!          * the right point on the assumption that after one circuit things
!          * will have settled down, and restarts shouldn't happen that often.
           */
!         reset_hands[inst] = B_TRUE;
  
+         pageouts_running++;
+         mutex_exit(&pageout_mutex);
+ 
  loop:
          cv_signal_pageout();
  
+         mutex_enter(&pageout_mutex);
+         pageouts_running--;
          CALLB_CPR_SAFE_BEGIN(&cprinfo);
!         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
!         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
!         pageouts_running++;
!         mutex_exit(&pageout_mutex);
  
          /*
!          * Check if pageout has been disabled for debugging purposes.
           */
          if (!dopageout) {
                  goto loop;
          }
  
          /*
!          * One may reset the clock hands and scanned region for debugging
!          * purposes. Hands will also be reset on first thread startup, if
!          * the number of scanning threads (n_page_scanners) changes, or if
!          * memory is added to, or removed from, the system.
           */
          if (reset_hands[inst]) {
                  struct page *first;
  
                  reset_hands[inst] = B_FALSE;
+ 
                  if (inst >= n_page_scanners) {
                          /*
                          * The desired number of page scanners has been
                          * reduced and this instance is no longer wanted.
                          * Exit the lwp.
                          */
                          VERIFY3U(inst, !=, 0);
!                         DTRACE_PROBE1(pageout__exit, uint_t, inst);
!                         mutex_enter(&pageout_mutex);
!                         pageouts_running--;
!                         mutex_exit(&pageout_mutex);
                          mutex_enter(&curproc->p_lock);
                          lwp_exit();
+                         /* NOTREACHED */
                  }
  
+                 first = page_first();
+ 
                  /*
!                  * Each scanner thread gets its own sector of the memory
!                  * clock face.
                   */
!                 pgcnt_t span, offset;
! 
!                 span = looppages / n_page_scanners;
!                 VERIFY3U(span, >, handspreadpages);
! 
!                 offset = inst * span;
!                 regionstart = page_nextn(first, offset);
!                 if (inst == n_page_scanners - 1) {
!                         /* The last instance goes up to the last page */
!                         regionend = page_nextn(first, looppages - 1);
                  } else {
+                         regionend = page_nextn(regionstart, span - 1);
+                 }
+ 
+                 backhand = regionstart;
                  fronthand = page_nextn(backhand, handspreadpages);
+                 tick = 1;
+ 
+                 bhwrapping = fhwrapping = B_FALSE;
+ 
+                 DTRACE_PROBE4(pageout__reset, uint_t, inst,
+                     pgcnt_t, regionstart, pgcnt_t, regionend,
+                     pgcnt_t, fronthand);
          }
  
+         /*
+          * This CPU kstat is only incremented here and we're obviously
+          * on this CPU, so no lock.
+          */
          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
  
          /*
           * Keep track of the number of times we have scanned all the way around
!          * the loop on this wakeup.
           */
          laps = 0;
  
          /*
           * Track the number of pages visited during this scan so that we can
           * periodically measure our duty cycle.
           */
          nscan_cnt = 0;
+         pcount = 0;
  
!         DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
!             hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
! 
          /*
!          * Record the initial position of the front hand for this cycle so
!          * that we can detect when the hand wraps around.
           */
!         fronthandstart = fronthand;
  
          sample_start = gethrtime();
  
          /*
           * Scan the appropriate number of pages for a single duty cycle.
+          */
+         while (nscan_cnt < desscan) {
+                 checkpage_result_t rvfront, rvback;
+ 
+                 /*
                   * Only scan while at least one of these is true:
                   *  1) one or more zones is over its cap
                   *  2) there is not enough free memory
                   *  3) during page scan startup when determining sample data
                   */
!                 if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
!                     !zones_over) {
                          /*
                           * We are not sampling and enough memory has become
                           * available that scanning is no longer required.
                           */
+                         DTRACE_PROBE1(pageout__memfree, uint_t, inst);
                          break;
                  }
  
!                 DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
  
                  /*
                   * Periodically check to see if we have exceeded the CPU duty
                   * cycle for a single wakeup.
                   */
*** 1270,1286 ****
                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
                          hrtime_t pageout_cycle_nsec;
  
                          pageout_cycle_nsec = gethrtime() - sample_start;
                          if (pageout_cycle_nsec >= pageout_nsec) {
!                                 /*
!                                  * This is where we normally break out of the
!                                  * loop when scanning zones or sampling.
!                                  */
!                                 if (!zones_over) {
                                          atomic_inc_64(&pageout_timeouts);
-                                 }
                                  DTRACE_PROBE1(pageout__timeout, uint_t, inst);
                                  break;
                          }
                  }
  
--- 1300,1311 ----
                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
                          hrtime_t pageout_cycle_nsec;
  
                          pageout_cycle_nsec = gethrtime() - sample_start;
                          if (pageout_cycle_nsec >= pageout_nsec) {
!                                 if (!zones_over)
                                          atomic_inc_64(&pageout_timeouts);
                                  DTRACE_PROBE1(pageout__timeout, uint_t, inst);
                                  break;
                          }
                  }
  
*** 1308,1391 ****
                   */
                  if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
                          nscan_cnt++;
                  }
  
!                 backhand = page_next(backhand);
!                 fronthand = page_next(fronthand);
  
!                 /*
!                  * The front hand has wrapped around to the first page in the
!                  * loop.
!                  */
!                 if (fronthand == page_first())  {
!                         DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
  
                          /*
!                          * Every 64 wraps we reposition our hands within our
!                          * region to prevent creep into another thread.
                           */
!                         if ((++iter % pageout_reset_cnt) == 0)
!                                 reset_hands[inst] = B_TRUE;
  
                          /*
                           * This CPU kstat is only incremented here and we're
                           * obviously on this CPU, so no lock.
                           */
                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
  
                          /*
-                          * If scanning because the system is low on memory,
                           * then when we wraparound memory we want to try to
                           * reclaim more pages.
                           * If scanning only because zones are over their cap,
                           * then wrapping is common and we simply keep going.
                           */
!                         if (freemem < lotsfree + needfree && ++laps > 1) {
                                  /*
-                                  * The system is low on memory.
                                   * Extremely unlikely, but it happens.
                                   * We went around the loop at least once
                                   * and didn't get far enough.
                                   * If we are still skipping `highly shared'
                                   * pages, skip fewer of them.  Otherwise,
                                   * give up till the next clock tick.
                                   */
-                                 mutex_enter(&pageout_mutex);
                                  if (po_share < MAX_PO_SHARE) {
                                          po_share <<= 1;
-                                         mutex_exit(&pageout_mutex);
                                  } else {
-                                         mutex_exit(&pageout_mutex);
                                          break;
                                  }
                          }
                  }
          }
  
          atomic_add_long(&nscan, nscan_cnt);
  
!         sample_end = gethrtime();
  
-         DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
-             uint_t, inst);
- 
          /*
-          * The following two blocks are only relevant when the scanner is
-          * first started up. After the scanner runs for a while, neither of
-          * the conditions will ever be true again.
-          *
           * The global variables used below are only modified by this thread and
           * only during initial scanning when there is a single page scanner
!          * thread running. Thus, we don't use any locking.
           */
          if (pageout_new_spread == 0) {
                  VERIFY3U(inst, ==, 0);
                  if (PAGE_SCAN_STARTUP) {
                          /*
                           * Continue accumulating samples until we have enough
!                          * to get a reasonable value for average scan rate:
                           */
                          pageout_sample_pages += pcount;
                          pageout_sample_etime += sample_end - sample_start;
                          ++pageout_sample_cnt;
                  }
--- 1333,1416 ----
                   */
                  if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
                          nscan_cnt++;
                  }
  
!                 if (bhwrapping) {
!                         backhand = regionstart;
!                         bhwrapping = B_FALSE;
!                 } else {
!                         backhand = page_nextn(backhand, tick);
!                         if (backhand == regionend)
!                                 bhwrapping = B_TRUE;
!                 }
  
!                 if (fhwrapping) {
!                         fronthand = regionstart;
!                         fhwrapping = B_FALSE;
!                 } else {
!                         fronthand = page_nextn(fronthand, tick);
!                         if (fronthand == regionend)
!                                 fhwrapping = B_TRUE;
!                 }
  
                  /*
!                  * The front hand has wrapped around during this wakeup.
                   */
!                 if (fronthand == fronthandstart) {
!                         laps++;
!                         DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
!                             uint_t, laps);
  
                          /*
                           * This CPU kstat is only incremented here and we're
                           * obviously on this CPU, so no lock.
                           */
                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
  
                          /*
                           * If the system is low on memory when we wrap around
                           * memory, we want to try to reclaim more pages.
                           * If scanning only because zones are over their cap,
                           * then wrapping is common and we simply keep going.
                          */
!                         if (laps > 1 && freemem < lotsfree + needfree) {
                                  /*
                                   * Extremely unlikely, but it happens.
                                   * We went around the loop at least once
                                   * and didn't get far enough.
                                   * If we are still skipping `highly shared'
                                   * pages, skip fewer of them.  Otherwise,
                                   * give up till the next clock tick.
                                   */
                                  if (po_share < MAX_PO_SHARE) {
                                          po_share <<= 1;
                                  } else {
                                          break;
                                  }
                          }
                  }
          }
  
+         sample_end = gethrtime();
          atomic_add_long(&nscan, nscan_cnt);
  
!         DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
!             pgcnt_t, nscan_cnt, pgcnt_t, pcount)
  
          /*
           * The global variables used below are only modified by this thread and
           * only during initial scanning when there is a single page scanner
!          * thread running.
           */
          if (pageout_new_spread == 0) {
                  VERIFY3U(inst, ==, 0);
+ 
                  if (PAGE_SCAN_STARTUP) {
                          /*
                           * Continue accumulating samples until we have enough
!                          * to get a reasonable value for average scan rate.
                           */
                          pageout_sample_pages += pcount;
                          pageout_sample_etime += sample_end - sample_start;
                          ++pageout_sample_cnt;
                  }
*** 1535,1545 ****
          }
  
          /*
           * Maintain statistics for what we are freeing
           */
- 
          if (pp->p_vnode != NULL) {
                  if (pp->p_vnode->v_flag & VVMEXEC)
                          isexec = 1;
  
                  if (!IS_SWAPFSVP(pp->p_vnode))
--- 1560,1569 ----