13097 improve VM tunables for modern systems (fix mismerge)

          --- old/usr/src/uts/common/os/vm_pageout.c
          +++ new/usr/src/uts/common/os/vm_pageout.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2021 Oxide Computer Company
  24   24   * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29   29   * Use is subject to license terms.
  30   30   * Copyright 2018 Joyent, Inc.
  31   31   */
  32   32  
  33   33  /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  34   34  /* All Rights Reserved */
  35   35  
  36   36  /*
  37   37   * University Copyright- Copyright (c) 1982, 1986, 1988
  38   38   * The Regents of the University of California
  39   39   * All Rights Reserved
  40   40   *
  41   41   * University Acknowledgment- Portions of this document are derived from
  42   42   * software developed by the University of California, Berkeley, and its
  43   43   * contributors.
  44   44   */
  45   45  
  46   46  #include <sys/types.h>
  47   47  #include <sys/t_lock.h>
  48   48  #include <sys/param.h>
  49   49  #include <sys/buf.h>
  50   50  #include <sys/uio.h>
  51   51  #include <sys/proc.h>
  52   52  #include <sys/systm.h>
  53   53  #include <sys/mman.h>
  54   54  #include <sys/cred.h>
  55   55  #include <sys/vnode.h>
  56   56  #include <sys/vm.h>
  57   57  #include <sys/vmparam.h>
  58   58  #include <sys/vtrace.h>
  59   59  #include <sys/cmn_err.h>
  60   60  #include <sys/cpuvar.h>
  61   61  #include <sys/user.h>
  62   62  #include <sys/kmem.h>
  63   63  #include <sys/debug.h>
  64   64  #include <sys/callb.h>
  65   65  #include <sys/tnf_probe.h>
  66   66  #include <sys/mem_cage.h>
  67   67  #include <sys/time.h>
  68   68  #include <sys/zone.h>
  69   69  #include <sys/stdbool.h>
  70   70  
  71   71  #include <vm/hat.h>
  72   72  #include <vm/as.h>
  73   73  #include <vm/seg.h>
  74   74  #include <vm/page.h>
  75   75  #include <vm/pvn.h>
  76   76  #include <vm/seg_kmem.h>
  77   77  
  78   78  /*
  79   79   * FREE MEMORY MANAGEMENT
  80   80   *
  81   81   * Management of the pool of free pages is a tricky business.  There are
  82   82   * several critical threshold values which constrain our allocation of new
  83   83   * pages and inform the rate of paging out of memory to swap.  These threshold
  84   84   * values, and the behaviour they induce, are described below in descending
  85   85   * order of size -- and thus increasing order of severity!
  86   86   *
  87   87   *   +---------------------------------------------------- physmem (all memory)
  88   88   *   |
  89   89   *   | Ordinarily there are no particular constraints placed on page
  90   90   *   v allocation.  The page scanner is not running and page_create_va()
  91   91   *   | will effectively grant all page requests (whether from the kernel
  92   92   *   | or from user processes) without artificial delay.
  93   93   *   |
  94   94   *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
  95   95   *   |
  96   96   *   | When we have less than "lotsfree" pages, pageout_scanner() is
  97   97   *   v signalled by schedpaging() to begin looking for pages that can
  98   98   *   | be evicted to disk to bring us back above lotsfree.  At this
  99   99   *   | stage there is still no constraint on allocation of free pages.
 100  100   *   |
 101  101   *   | For small systems, we set a lower bound of 16MB for lotsfree;
 102  102   *   v this is the natural value for a system with 1GB memory.  This is
 103  103   *   | to ensure that the pageout reserve pool contains at least 4MB
 104  104   *   | for use by ZFS.
 105  105   *   |
 106  106   *   | For systems with a large amount of memory, we constrain lotsfree
 107  107   *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 108  108   *   v at some point the required slack relates more closely to the
 109  109   *   | rate at which paging can occur than to the total amount of memory.
 110  110   *   |
 111  111   *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 112  112   *   |
 113  113   *   | When we drop below desfree, a number of kernel facilities will
 114  114   *   v wait before allocating more memory, under the assumption that
 115  115   *   | pageout or reaping will make progress and free up some memory.
 116  116   *   | This behaviour is not especially coordinated; look for comparisons
 117  117   *   | of desfree and freemem.
 118  118   *   |
 119  119   *   | In addition to various attempts at advisory caution, clock()
 120  120   *   | will wake up the thread that is ordinarily parked in sched().
 121  121   *   | This routine is responsible for the heavy-handed swapping out
 122  122   *   v of entire processes in an attempt to arrest the slide of free
 123  123   *   | memory.  See comments in sched.c for more details.
 124  124   *   |
 125  125   *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 126  126   *   |
 127  127   *   | These two separate tunables have, by default, the same value.
 128  128   *   v Various parts of the kernel use minfree to signal the need for
 129  129   *   | more aggressive reclamation of memory, and sched() is more
 130  130   *   | aggressive at swapping processes out.
 131  131   *   |
 132  132   *   | If free memory falls below throttlefree, page_create_va() will
 133  133   *   | use page_create_throttle() to begin holding most requests for
 134  134   *   | new pages while pageout and reaping free up memory.  Sleeping
 135  135   *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 136  136   *   | more memory.  Non-sleeping allocations are generally allowed to
 137  137   *   | proceed, unless their priority is explicitly lowered with
 138  138   *   | KM_NORMALPRI.
 139  139   *   |
 140  140   *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 141  141   *   |
 142  142   *   | When we hit throttlefree, the situation is already dire.  The
 143  143   *   v system is generally paging out memory and swapping out entire
 144  144   *   | processes in order to free up memory for continued operation.
 145  145   *   |
 146  146   *   | Unfortunately, evicting memory to disk generally requires short
 147  147   *   | term use of additional memory; e.g., allocation of buffers for
 148  148   *   | storage drivers, updating maps of free and used blocks, etc.
 149  149   *   | As such, pageout_reserve is the number of pages that we keep in
 150  150   *   | special reserve for use by pageout() and sched() and by any
 151  151   *   v other parts of the kernel that need to be working for those to
 152  152   *   | make forward progress such as the ZFS I/O pipeline.
 153  153   *   |
 154  154   *   | When we are below pageout_reserve, we fail or hold any allocation
 155  155   *   | that has not explicitly requested access to the reserve pool.
 156  156   *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 157  157   *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 158  158   *   | can implicitly tap the reserve.  For more details, see the
 159  159   *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 160  160   *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 161  161   *   |
 162  162   *   +---------------------------------------------------------- no free memory
 163  163   *   |
 164  164   *   | If we have arrived here, things are very bad indeed.  It is
 165  165   *   v surprisingly difficult to tell if this condition is even fatal,
 166  166   *   | as enough memory may have been granted to pageout() and to the
 167  167   *   | ZFS I/O pipeline that requests for eviction that have already been
 168  168   *   | made will complete and free up memory some time soon.
 169  169   *   |
 170  170   *   | If free memory does not materialise, the system generally remains
 171  171   *   | deadlocked.  The pageout_deadman() below is run once per second
 172  172   *   | from clock(), seeking to limit the amount of time a single request
 173  173   *   v to page out can be blocked before the system panics to get a crash
 174  174   *   | dump and return to service.
 175  175   *   |
 176  176   *   +-------------------------------------------------------------------------
 177  177   */
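
As a concrete illustration of the cascade above, here is a standalone
sketch (not part of this change) that mirrors the default calculations
in setupclock() for a hypothetical 8 GiB machine with 4 KiB pages:

        #include <stdio.h>

        int
        main(void)
        {
                unsigned long long pagesize = 4096;
                unsigned long long pages = (8ULL << 30) / pagesize;

                /* lotsfree: 1/64th of memory, clamped to [16 MiB, 2 GiB] */
                unsigned long long lotsfree = pages / 64;
                unsigned long long lf_min = (16ULL << 20) / pagesize;
                unsigned long long lf_max = (2048ULL << 20) / pagesize;
                if (lotsfree < lf_min)
                        lotsfree = lf_min;
                if (lotsfree > lf_max)
                        lotsfree = lf_max;

                /* The remaining thresholds cascade down from lotsfree. */
                unsigned long long desfree = lotsfree / 2;
                unsigned long long minfree = 3 * desfree / 4;
                unsigned long long throttlefree = minfree;
                unsigned long long pageout_reserve = 3 * throttlefree / 4;

                printf("lotsfree=%llu desfree=%llu minfree=%llu "
                    "throttlefree=%llu pageout_reserve=%llu (pages)\n",
                    lotsfree, desfree, minfree, throttlefree,
                    pageout_reserve);
                return (0);
        }

For the hypothetical 8 GiB machine this prints lotsfree=32768 (128 MiB),
desfree=16384, minfree=12288, throttlefree=12288 and pageout_reserve=9216
pages.
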
 178  178  
 179  179  /*
 180  180   * The following parameters control operation of the page replacement
 181  181   * algorithm.  They are initialized to 0, and then computed at boot time based
 182  182   * on the size of the system; see setupclock().  If they are patched non-zero
 183  183   * in a loaded vmunix they are left alone and may thus be changed per system
 184  184   * using "mdb -kw" on the loaded system.
 185  185   */
 186  186  pgcnt_t         slowscan = 0;
 187  187  pgcnt_t         fastscan = 0;
 188  188  
 189  189  static pgcnt_t  handspreadpages = 0;
 190  190  
 191  191  /*
 192  192   * looppages:
 193  193   *     Cached copy of the total number of pages in the system (total_pages).
 194  194   *
 195  195   * loopfraction:
 196  196   *     Divisor used to relate fastscan to looppages in setupclock().
 197  197   */
 198  198  static uint_t   loopfraction = 2;
 199  199  static pgcnt_t  looppages;
 200  200  
 201  201  static uint_t   min_percent_cpu = 4;
 202  202  static uint_t   max_percent_cpu = 80;
 203  203  static pgcnt_t  maxfastscan = 0;
 204  204  static pgcnt_t  maxslowscan = 100;
 205  205  
 206  206  #define         MEGABYTES               (1024ULL * 1024ULL)
 207  207  
 208  208  /*
 209  209   * pageout_threshold_style:
 210  210   *     set to 1 to use the previous default threshold size calculation;
 211  211   *     i.e., each threshold is half of the next largest value.
 212  212   */
 213  213  uint_t          pageout_threshold_style = 0;
 214  214  
 215  215  /*
 216  216   * The operator may override these tunables to request a different minimum or
 217  217   * maximum lotsfree value, or to change the divisor we use for automatic
 218  218   * sizing.
 219  219   *
 220  220   * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 221  221   * minimum and maximum are specified in bytes, rather than pages; a zero value
 222  222   * means the default values (below) are used.
 223  223   */
 224  224  uint_t          lotsfree_fraction = 64;
 225  225  pgcnt_t         lotsfree_min = 0;
 226  226  pgcnt_t         lotsfree_max = 0;
 227  227  
 228  228  #define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
 229  229  #define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)
 230  230  
 231  231  /*
 232  232   * If these tunables are set to non-zero values in /etc/system, and provided
 233  233   * the value is not larger than the threshold above, the specified value will
 234  234   * be used directly without any additional calculation or adjustment.  The boot
 235  235   * time value of these overrides is preserved in the "clockinit" struct.  More
 236  236   * detail is available in the comment at the top of the file.
 237  237   */
 238  238  pgcnt_t         maxpgio = 0;
 239  239  pgcnt_t         minfree = 0;
 240  240  pgcnt_t         desfree = 0;
 241  241  pgcnt_t         lotsfree = 0;
 242  242  pgcnt_t         needfree = 0;
 243  243  pgcnt_t         throttlefree = 0;
 244  244  pgcnt_t         pageout_reserve = 0;
      245 +pri_t           pageout_pri;
 245  246  
 246  247  pgcnt_t         deficit;
 247  248  pgcnt_t         nscan;
 248  249  pgcnt_t         desscan;
 249  250  
 250  251  /* kstats */
 251  252  uint64_t low_mem_scan;
 252  253  uint64_t zone_cap_scan;
 253      -uint64_t n_throttle;
 254  254  
      255 +#define MAX_PSCAN_THREADS       16
      256 +
 255  257  /*
 256  258   * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 257  259   * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 258  260   * that gives the equivalent of some underlying %CPU duty cycle.
 259  261   *
 260  262   * min_pageout_nsec:
 261  263   *     nanoseconds/wakeup equivalent of min_percent_cpu.
 262  264   *
 263  265   * max_pageout_nsec:
 264  266   *     nanoseconds/wakeup equivalent of max_percent_cpu.
 265  267   *
 266  268   * pageout_nsec:
 267  269   *     Number of nanoseconds budgeted for each wakeup cycle.
 268  270   *     Computed each time around by schedpaging().
 269  271   *     Varies between min_pageout_nsec and max_pageout_nsec,
 270  272   *     depending on memory pressure or zones over their cap.
 271  273   *
 272  274   * zone_pageout_nsec:
 273      - *     Number of nanoseconds budget for each cycle when a zone
 274      - *     is over its memory cap. If this is zero, then the value
 275      - *     of max_pageout_nsec is used instead.
      275 + *     Number of nanoseconds budgeted for each cycle when a zone
      276 + *     is over its memory cap. If this is zero, then the value
      277 + *     of max_pageout_nsec is used instead.
 276  278   */
 277      -
 278  279  static hrtime_t min_pageout_nsec;
 279  280  static hrtime_t max_pageout_nsec;
 280  281  static hrtime_t pageout_nsec;
 281  282  static hrtime_t zone_pageout_nsec;
 282  283  
 283      -#define MAX_PSCAN_THREADS       16
 284      -static boolean_t reset_hands[MAX_PSCAN_THREADS];
      284 +static boolean_t        reset_hands[MAX_PSCAN_THREADS];
 285  285  
      286 +#define PAGES_POLL_MASK 1023
      287 +#define SCHEDPAGING_HZ  4
      288 +
 286  289  /*
 287      - * These can be tuned in /etc/system or set with mdb.
 288      - * 'des_page_scanners' is the desired number of page scanner threads. The
 289      - * system will bring the actual number of threads into line with the desired
 290      - * number. If des_page_scanners is set to an invalid value, the system will
 291      - * correct the setting.
      290 + * despagescanners:
      291 + *      The desired number of page scanner threads. The value can be set in
      292 + *      /etc/system or tuned directly with 'mdb -kw'.  The system will bring
      293 + *      the actual number of threads into line with the desired number. If set
      294 + *      to an invalid value, the system will correct the setting.
 292  295   */
 293      -uint_t des_page_scanners;
 294      -uint_t pageout_reset_cnt = 64;  /* num. cycles for pageout_scanner hand reset */
      296 +uint_t despagescanners = 0;
 295  297  
 296      -uint_t n_page_scanners;
 297      -static pgcnt_t  pscan_region_sz; /* informational only */
 298      -
 299      -#define PAGES_POLL_MASK 1023
 300      -
 301  298  /*
 302  299   * pageout_sample_lim:
 303  300   *     The limit on the number of samples needed to establish a value for new
 304  301   *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 305  302   *     handspreadpages.
 306  303   *
 307  304   * pageout_sample_cnt:
 308  305   *     Current sample number.  Once the sample gets large enough, set new
 309  306   *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 310  307   *
 311  308   * pageout_sample_pages:
 312  309   *     The accumulated number of pages scanned during sampling.
 313  310   *
 314  311   * pageout_sample_etime:
 315  312   *     The accumulated nanoseconds for the sample.
 316  313   *
 317  314   * pageout_rate:
 318  315   *     Rate in pages/nanosecond, computed at the end of sampling.
 319  316   *
 320  317   * pageout_new_spread:
 321  318   *     Initially zero while the system scan rate is measured by
 322  319   *     pageout_scanner(), which then sets this value once per system boot after
 323  320   *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 324  321   *     new value is used for fastscan and handspreadpages.
 325  322   */
 326      -
 327  323  typedef hrtime_t hrrate_t;
 328  324  
 329  325  static uint64_t pageout_sample_lim = 4;
 330  326  static uint64_t pageout_sample_cnt = 0;
 331  327  static pgcnt_t  pageout_sample_pages = 0;
      328 +static hrtime_t pageout_sample_etime = 0;
 332  329  static hrrate_t pageout_rate = 0;
 333  330  static pgcnt_t  pageout_new_spread = 0;
 334  331  
 335      -static hrtime_t pageout_sample_etime = 0;
 336      -
 337      -/* True if page scanner is first starting up */
      332 +/* True if the page scanner is first starting up */
 338  333  #define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
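
As a back-of-envelope illustration of these counters (hypothetical
numbers, and one plausible scaling for readability; the in-kernel rate
computation itself is performed elsewhere in this file):

        uint64_t sample_pages = 2097152;        /* hypothetical pages visited */
        hrtime_t sample_etime = 500000000;      /* hypothetical 0.5 s, in ns */

        /* Scale to pages/second before dividing to avoid truncating to 0. */
        uint64_t pages_per_sec = sample_pages * NANOSEC / sample_etime;
        /* => 4194304, i.e. roughly 4.2 million pages per second */
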
 339  334  
      335 +/* The current number of page scanner threads */
      336 +static uint_t n_page_scanners = 1;
      337 +/* The number of page scanner threads that are actively scanning. */
      338 +static uint_t pageouts_running;
      339 +
 340  340  /*
 341  341   * Record number of times a pageout_scanner() wakeup cycle finished because it
 342  342   * timed out (exceeded its CPU budget), rather than because it visited
 343  343   * its budgeted number of pages. This is only done when scanning under low
 344  344   * free memory conditions, not when scanning for zones over their cap.
 345  345   */
 346  346  uint64_t        pageout_timeouts = 0;
 347  347  
 348  348  #ifdef VM_STATS
 349  349  static struct pageoutvmstats_str {
 350  350          ulong_t checkpage[3];
 351  351  } pageoutvmstats;
 352  352  #endif /* VM_STATS */
 353  353  
 354  354  /*
 355  355   * Threads waiting for free memory use this condition variable and lock until
 356  356   * memory becomes available.
 357  357   */
 358  358  kmutex_t        memavail_lock;
 359  359  kcondvar_t      memavail_cv;
 360  360  
 361  361  typedef enum pageout_hand {
 362  362          POH_FRONT = 1,
 363  363          POH_BACK,
 364  364  } pageout_hand_t;
 365  365  
 366  366  typedef enum {
 367  367          CKP_INELIGIBLE,
 368  368          CKP_NOT_FREED,
 369  369          CKP_FREED,
 370  370  } checkpage_result_t;
 371  371  
 372  372  static checkpage_result_t checkpage(page_t *, pageout_hand_t);
 373  373  
 374  374  static struct clockinit {
 375  375          bool ci_init;
 376  376          pgcnt_t ci_lotsfree_min;
 377  377          pgcnt_t ci_lotsfree_max;
 378  378          pgcnt_t ci_lotsfree;
 379  379          pgcnt_t ci_desfree;
 380  380          pgcnt_t ci_minfree;
 381  381          pgcnt_t ci_throttlefree;
 382  382          pgcnt_t ci_pageout_reserve;
 383  383          pgcnt_t ci_maxpgio;
 384  384          pgcnt_t ci_maxfastscan;
 385  385          pgcnt_t ci_fastscan;
 386  386          pgcnt_t ci_slowscan;
 387  387          pgcnt_t ci_handspreadpages;
      388 +        uint_t  ci_despagescanners;
 388  389  } clockinit = { .ci_init = false };
 389  390  
 390      -static pgcnt_t
      391 +static inline pgcnt_t
 391  392  clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 392  393  {
 393  394          if (value < minimum) {
 394  395                  return (minimum);
 395  396          } else if (value > maximum) {
 396  397                  return (maximum);
 397  398          } else {
 398  399                  return (value);
 399  400          }
 400  401  }
 401  402  
 402  403  static pgcnt_t
 403  404  tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
 404  405  {
 405  406          if (initval == 0 || initval >= initval_ceiling) {
 406  407                  return (defval);
 407  408          } else {
 408  409                  return (initval);
 409  410          }
 410  411  }
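
Taken together, clamp() and tune() mean that a preserved /etc/system
override is honoured only when it is non-zero and below its ceiling;
otherwise the computed default is used. For example, the lotsfree
calculation later in setupclock() is:

        lotsfree = tune(clockinit.ci_lotsfree, looppages,
            clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

so tune(0, ceiling, def) and tune(ceiling, ceiling, def) both yield the
default, while any in-range operator-supplied value is used verbatim.
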
 411  412  
 412  413  /*
 413  414   * Local boolean to control scanning when zones are over their cap. Avoids
  
 414  415   * accessing the zone_num_over_cap variable except within schedpaging(), which
 415  416   * only runs periodically. This is here only to reduce our access to
 416  417   * zone_num_over_cap, since it is already accessed a lot during paging, and
 417  418   * the page scanner accesses the zones_over variable on each page during a
 418  419   * scan. There is no lock needed for zone_num_over_cap since schedpaging()
 419  420   * doesn't modify the variable, it only cares if the variable is 0 or non-0.
 420  421   */
 421  422  static boolean_t zones_over = B_FALSE;
 422  423  
 423  424  /*
      425 + * On large memory systems, multiple instances of the page scanner are run,
      426 + * each responsible for a separate region of memory. This speeds up page
      427 + * invalidation under low memory conditions.
      428 + *
      429 + * despagescanners can be set in /etc/system or via mdb and it will
      430 + * be used as a guide for how many page scanners to create; the value
      431 + * will be adjusted if it is not sensible. Otherwise, the number of
      432 + * page scanners is determined dynamically based on handspreadpages.
      433 + */
      434 +static void
      435 +recalc_pagescanners(void)
      436 +{
      437 +        pgcnt_t sz;
      438 +        uint_t des;
      439 +
      440 +        /* If the initial calibration has not been done, take no action. */
      441 +        if (pageout_new_spread == 0)
      442 +                return;
      443 +
      444 +        /*
      445 +         * If the desired number of scanners is set in /etc/system
      446 +         * then try to use it.
      447 +         */
      448 +        if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
      449 +                despagescanners = clockinit.ci_despagescanners;
      450 +
      451 +        if (despagescanners != 0) {
      452 +                /*
      453 +                 * We have a desired number of page scanners, either from
      454 +                 * /etc/system or set via mdb. Try and use it (it will be
      455 +                 * clamped below).
      456 +                 */
      457 +                des = despagescanners;
      458 +        } else {
      459 +                /*
      460 +                 * Calculate the number of desired scanners based on the
      461 +                 * system's memory size.
      462 +                 *
      463 +                 * A 64GiB region size is used as the basis for calculating how
      464 +                 * many scanner threads should be created. For systems with up
      465 +                 * to 64GiB of RAM, a single thread is used; for very large
      466 +                 * memory systems the threads are limited to MAX_PSCAN_THREADS.
      467 +                 */
      468 +                sz = btop(64ULL << 30);
      469 +
      470 +                if (sz > looppages) {
      471 +                        des = 1;
      472 +                } else {
      473 +                        pgcnt_t tmp = sz;
      474 +
      475 +                        for (des = 1; tmp < looppages; des++)
      476 +                                tmp += sz;
      477 +                }
      478 +        }
      479 +
      480 +        /*
      481 +         * Clamp the number of scanners so that it does not exceed
      482 +         * MAX_PSCAN_THREADS, and so that each scanner covers at least 10%
      483 +         * more than handspreadpages.
      484 +         */
      485 +        des = clamp(des, 1,
      486 +            looppages / (handspreadpages + handspreadpages / 10));
      487 +        despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
      488 +}
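
A worked example of the calculation above, for a hypothetical 256 GiB
machine (the variable names here are illustrative only):

        pgcnt_t mempages = btop(256ULL << 30); /* hypothetical system size */
        pgcnt_t sz = btop(64ULL << 30);        /* 64 GiB region basis */
        uint_t des = 1;

        for (pgcnt_t tmp = sz; tmp < mempages; tmp += sz)
                des++;
        /* des == 4: one scanner thread per 64 GiB region */

The result is then clamped against both MAX_PSCAN_THREADS and the
handspreadpages constraint, exactly as in recalc_pagescanners() above.
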
      489 +
      490 +/*
 424  491   * Set up the paging constants for the clock algorithm used by
 425  492   * pageout_scanner(), and by the virtual memory system overall.  See the
 426  493   * comments at the top of this file for more information about the threshold
 427  494   * values and system responses to memory pressure.
 428  495   *
 429  496   * This routine is called once by main() at startup, after the initial size of
 430  497   * physical memory is determined.  It may be called again later if memory is
 431  498   * added to or removed from the system, or if new measurements of the page scan
 432  499   * rate become available.
 433  500   */
 434  501  void
 435  502  setupclock(void)
 436  503  {
 437      -        uint_t i;
 438      -        pgcnt_t sz, tmp;
 439      -        pgcnt_t defval;
 440  504          bool half = (pageout_threshold_style == 1);
 441  505          bool recalc = true;
 442  506  
 443  507          looppages = total_pages;
 444  508  
 445  509          /*
 446  510           * The operator may have provided specific values for some of the
 447  511           * tunables via /etc/system.  On our first call, we preserve those
 448  512           * values so that they can be used for subsequent recalculations.
 449  513           *
 450  514           * A value of zero for any tunable means we will use the default
 451  515           * sizing.
 452  516           */
 453      -
 454  517          if (!clockinit.ci_init) {
 455  518                  clockinit.ci_init = true;
 456  519  
 457  520                  clockinit.ci_lotsfree_min = lotsfree_min;
 458  521                  clockinit.ci_lotsfree_max = lotsfree_max;
 459  522                  clockinit.ci_lotsfree = lotsfree;
 460  523                  clockinit.ci_desfree = desfree;
 461  524                  clockinit.ci_minfree = minfree;
 462  525                  clockinit.ci_throttlefree = throttlefree;
 463  526                  clockinit.ci_pageout_reserve = pageout_reserve;
 464  527                  clockinit.ci_maxpgio = maxpgio;
 465  528                  clockinit.ci_maxfastscan = maxfastscan;
 466  529                  clockinit.ci_fastscan = fastscan;
 467  530                  clockinit.ci_slowscan = slowscan;
 468  531                  clockinit.ci_handspreadpages = handspreadpages;
      532 +                clockinit.ci_despagescanners = despagescanners;
 469  533  
 470  534                  /*
 471  535                   * The first call does not trigger a recalculation, only
 472  536                   * subsequent calls.
 473  537                   */
 474  538                  recalc = false;
 475  539          }
 476  540  
 477  541          /*
 478  542           * Configure paging threshold values.  For more details on what each
 479  543           * threshold signifies, see the comments at the top of this file.
 480  544           */
 481  545          lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
 482  546              btop(LOTSFREE_MAX_DEFAULT));
 483  547          lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
 484  548              btop(LOTSFREE_MIN_DEFAULT));
 485  549  
 486  550          lotsfree = tune(clockinit.ci_lotsfree, looppages,
 487  551              clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
 488  552  
 489  553          desfree = tune(clockinit.ci_desfree, lotsfree,
 490  554              lotsfree / 2);
 491  555  
 492  556          minfree = tune(clockinit.ci_minfree, desfree,
 493  557              half ? desfree / 2 : 3 * desfree / 4);
 494  558  
 495  559          throttlefree = tune(clockinit.ci_throttlefree, desfree,
 496  560              minfree);
 497  561  
 498  562          pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
 499  563              half ? throttlefree / 2 : 3 * throttlefree / 4);
 500  564  
 501  565          /*
 502  566           * Maxpgio thresholds how much paging is acceptable.
 503  567           * This figures that 2/3 busy on an arm is all that is
 504  568           * tolerable for paging.  We assume one operation per disk rev.
 505  569           *
 506  570           * XXX - Does not account for multiple swap devices.
 507  571           */
 508  572          if (clockinit.ci_maxpgio == 0) {
 509  573                  maxpgio = (DISKRPM * 2) / 3;
 510  574          } else {
 511  575                  maxpgio = clockinit.ci_maxpgio;
 512  576          }
 513  577  
 514  578          /*
 515  579           * The clock scan rate varies between fastscan and slowscan
 516  580           * based on the amount of free memory available.  Fastscan
 517  581           * rate should be set based on the number of pages that can be
 518  582           * scanned per sec using ~10% of processor time.  Since this
 519  583           * value depends on the processor, MMU, Mhz etc., it is
 520  584           * difficult to determine it in a generic manner for all
 521  585           * architectures.
 522  586           *
 523  587           * Instead of trying to determine the number of pages scanned
 524  588           * per sec for every processor, fastscan is set to be the smaller
 525  589           * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
 526  590           * time is limited to ~4% of processor time.
 527  591           *
 528  592           * Setting fastscan to be 1/2 of memory allows pageout to scan
 529  593           * all of memory in ~2 secs.  This implies that user pages not
 530  594           * accessed within 1 sec (assuming, handspreadpages == fastscan)
 531  595           * can be reclaimed when free memory is very low.  Stealing pages
 532  596           * not accessed within 1 sec seems reasonable and ensures that
 533  597           * active user processes don't thrash.
 534  598           *
 535  599           * Smaller values of fastscan result in scanning fewer pages
 536  600           * every second and consequently pageout may not be able to free
 537  601           * sufficient memory to maintain the minimum threshold.  Larger
 538  602           * values of fastscan result in scanning a lot more pages which
 539  603           * could lead to thrashing and higher CPU usage.
 540  604           *
 541  605           * Fastscan needs to be limited to a maximum value and should not
 542  606           * scale with memory to prevent pageout from consuming too much
 543  607           * time for scanning on slow CPU's and avoid thrashing, as a
 544  608           * result of scanning too many pages, on faster CPU's.
 545  609           * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
 546  610           * (the upper bound for fastscan) based on the average number
 547  611           * of pages that can potentially be scanned in ~1 sec (using ~4%
 548  612           * of the CPU) on some of the following machines that currently
 549  613           * run Solaris 2.x:
 550  614           *
 551  615           *                      average memory scanned in ~1 sec
 552  616           *
 553  617           *      25 Mhz SS1+:            23 Meg
 554  618           *      LX:                     37 Meg
 555  619           *      50 Mhz SC2000:          68 Meg
 556  620           *
 557  621           *      40 Mhz 486:             26 Meg
 558  622           *      66 Mhz 486:             42 Meg
 559  623           *
 560  624           * When free memory falls just below lotsfree, the scan rate
 561  625           * goes from 0 to slowscan (i.e., pageout starts running).  This
 562  626           * transition needs to be smooth and is achieved by ensuring that
 563  627           * pageout scans a small number of pages to satisfy the transient
 564  628           * memory demand.  This is set to not exceed 100 pages/sec (25 per
 565  629           * wakeup) since scanning that many pages has no noticeable impact
 566  630           * on system performance.
 567  631           *
 568  632           * In addition to setting fastscan and slowscan, pageout is
 569  633           * limited to using ~4% of the CPU.  This results in increasing
 570  634           * the time taken to scan all of memory, which in turn means that
 571  635           * user processes have a better opportunity of preventing their
 572  636           * pages from being stolen.  This has a positive effect on
 573  637           * interactive and overall system performance when memory demand
 574  638           * is high.
 575  639           *
 576  640           * Thus, the rate at which pages are scanned for replacement will
 577  641           * vary linearly between slowscan and the number of pages that
 578  642           * can be scanned using ~4% of processor time instead of varying
 579  643           * linearly between slowscan and fastscan.
 580  644           *
 581  645           * Also, the processor time used by pageout will vary from ~1%
 582  646           * at slowscan to ~4% at fastscan instead of varying between
 583  647           * ~1% at slowscan and ~10% at fastscan.
 584  648           *
 585  649           * The values chosen for the various VM parameters (fastscan,
 586  650           * handspreadpages, etc) are not universally true for all machines,
 587  651           * but appear to be a good rule of thumb for the machines we've
 588  652           * tested.  They have the following ranges:
 589  653           *
 590  654           *      cpu speed:      20 to 70 Mhz
 591  655           *      page size:      4K to 8K
 592  656           *      memory size:    16M to 5G
 593  657           *      page scan rate: 4000 - 17400 4K pages per sec
 594  658           *
 595  659           * The values need to be re-examined for machines which don't
 596  660           * fall into the various ranges (e.g., slower or faster CPUs,
 597  661           * smaller or larger pagesizes etc) shown above.
 598  662           *
 599  663           * On an MP machine, pageout is often unable to maintain the
 600  664           * minimum paging thresholds under heavy load.  This is due to
 601  665           * the fact that user processes running on other CPU's can be
 602  666           * dirtying memory at a much faster pace than pageout can find
 603  667           * pages to free.  The memory demands could be met by enabling
 604  668           * more than one CPU to run the clock algorithm in such a manner
 605  669           * that the various clock hands don't overlap.  This also makes
 606  670           * it more difficult to determine the values for fastscan, slowscan
 607  671           * and handspreadpages.
 608  672           *
 609  673           * The swapper is currently used to free up memory when pageout
 610  674           * is unable to meet memory demands by swapping out processes.
 611  675           * In addition to freeing up memory, swapping also reduces the
 612  676           * demand for memory by preventing user processes from running
 613  677           * and thereby consuming memory.
 614  678           */
 615  679          if (clockinit.ci_maxfastscan == 0) {
 616  680                  if (pageout_new_spread != 0) {
 617  681                          maxfastscan = pageout_new_spread;
 618  682                  } else {
 619  683                          maxfastscan = MAXHANDSPREADPAGES;
 620  684                  }
 621  685          } else {
 622  686                  maxfastscan = clockinit.ci_maxfastscan;
 623  687          }
 624  688  
 625  689          if (clockinit.ci_fastscan == 0) {
 626  690                  fastscan = MIN(looppages / loopfraction, maxfastscan);
 627  691          } else {
 628  692                  fastscan = clockinit.ci_fastscan;
 629  693          }
 630  694  
 631  695          if (fastscan > looppages / loopfraction) {
 632  696                  fastscan = looppages / loopfraction;
 633  697          }
 634  698  
 635  699          /*
 636  700           * Set slow scan time to 1/10 the fast scan time, but
 637  701           * not to exceed maxslowscan.
 638  702           */
 639  703          if (clockinit.ci_slowscan == 0) {
 640  704                  slowscan = MIN(fastscan / 10, maxslowscan);
 641  705          } else {
 642  706                  slowscan = clockinit.ci_slowscan;
 643  707          }
 644  708  
 645  709          if (slowscan > fastscan / 2) {
 646  710                  slowscan = fastscan / 2;
 647  711          }
 648  712  
 649  713          /*
 650      -         * Handspreadpages is distance (in pages) between front and back
      714 +         * Handspreadpages is the distance (in pages) between front and back
 651  715           * pageout daemon hands.  The amount of time to reclaim a page
 652  716           * once pageout examines it increases with this distance and
 653  717           * decreases as the scan rate rises. It must be < the amount
 654  718           * of pageable memory.
 655  719           *
 656  720           * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 657  721           * to be "fastscan" results in the front hand being a few secs
 658  722           * (varies based on the processor speed) ahead of the back hand
 659  723           * at fastscan rates.  This distance can be further reduced, if
 660  724           * necessary, by increasing the processor time used by pageout
 661  725           * to be more than ~4% and preferably not more than ~10%.
 662  726           *
 663  727           * As a result, user processes have a much better chance of
 664  728           * referencing their pages before the back hand examines them.
 665  729           * This also significantly lowers the number of reclaims from
 666  730           * the freelist since pageout does not end up freeing pages which
 667  731           * may be referenced a sec later.
 668  732           */
 669  733          if (clockinit.ci_handspreadpages == 0) {
 670  734                  handspreadpages = fastscan;
 671  735          } else {
 672  736                  handspreadpages = clockinit.ci_handspreadpages;
 673  737          }
 674  738  
 675  739          /*
 676  740           * Make sure that back hand follows front hand by at least
 677  741           * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 678  742           * back hand to look at a page during the same wakeup of the pageout
 679  743           * daemon in which the front hand cleared its ref bit.
 680  744           */
 681  745          if (handspreadpages >= looppages) {
 682  746                  handspreadpages = looppages - 1;
 683  747          }
 684  748  
 685      -        if (!recalc) {
 686      -                /*
 687      -                 * Setup basic values at initialization.
 688      -                 */
 689      -                pscan_region_sz = total_pages;
 690      -                des_page_scanners = n_page_scanners = 1;
 691      -                reset_hands[0] = B_TRUE;
      749 +        /*
      750 +         * Establish the minimum and maximum length of time to be spent
      751 +         * scanning pages per wakeup, limiting the scanner duty cycle.  The
      752 +         * input percentage values (0-100) must be converted to a fraction of
      753 +         * the number of nanoseconds in a second of wall time, then further
      754 +         * scaled down by the number of scanner wakeups in a second.
      755 +         */
      756 +        min_pageout_nsec = MAX(1,
      757 +            NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
      758 +        max_pageout_nsec = MAX(min_pageout_nsec,
      759 +            NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
      760 +
      761 +        /*
      762 +         * If not called for recalculation, return and skip the remaining
      763 +         * steps.
      764 +         */
      765 +        if (!recalc)
 692  766                  return;
 693      -        }
 694  767  
 695  768          /*
 696      -         * Recalculating
 697      -         *
 698      -         * We originally set the number of page scanners to 1. Now that we
 699      -         * know what the handspreadpages is for a scanner, figure out how many
 700      -         * scanners we should run. We want to ensure that the regions don't
 701      -         * overlap and that they are not touching.
 702      -         *
 703      -         * A default 64GB region size is used as the initial value to calculate
 704      -         * how many scanner threads we should create on lower memory systems.
 705      -         * The idea is to limit the number of threads to a practical value
 706      -         * (e.g. a 64GB machine really only needs one scanner thread). For very
 707      -         * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
 708      -         * threads.
 709      -         *
 710      -         * The scanner threads themselves are evenly spread out around the
 711      -         * memory "clock" in pageout_scanner when we reset the hands, and each
 712      -         * thread will scan all of memory.
      769 +         * Set a flag to re-evaluate the clock hand positions.
 713  770           */
 714      -        sz = (btop(64ULL * 0x40000000ULL));
 715      -        if (sz < handspreadpages) {
 716      -                /*
 717      -                 * 64GB is smaller than the separation between the front
 718      -                 * and back hands; use double handspreadpages.
 719      -                 */
 720      -                sz = handspreadpages << 1;
 721      -        }
 722      -        if (sz > total_pages) {
 723      -                sz = total_pages;
 724      -        }
 725      -        /* Record region size for inspection with mdb, otherwise unused */
 726      -        pscan_region_sz = sz;
      771 +        for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
      772 +                reset_hands[i] = B_TRUE;
 727  773  
 728      -        tmp = sz;
 729      -        for (i = 1; tmp < total_pages; i++) {
 730      -                tmp += sz;
 731      -        }
 732      -
 733      -        if (i > MAX_PSCAN_THREADS)
 734      -                i = MAX_PSCAN_THREADS;
 735      -
 736      -        des_page_scanners = i;
      774 +        recalc_pagescanners();
 737  775  }
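
With the defaults above (min_percent_cpu = 4, max_percent_cpu = 80 and
SCHEDPAGING_HZ = 4), the duty-cycle budgets established in setupclock()
work out as follows (worked arithmetic only):

        min_pageout_nsec = NANOSEC * 4 / 100 / 4;   /* 10,000,000 ns = 10 ms */
        max_pageout_nsec = NANOSEC * 80 / 100 / 4;  /* 200,000,000 ns = 200 ms */

That is, each of the four scanner wakeups per second may spend between
10 ms (~4% of one CPU) and 200 ms (~80% of one CPU) scanning.
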
 738  776  
 739  777  /*
 740  778   * Pageout scheduling.
 741  779   *
 742  780   * Schedpaging controls the rate at which the page out daemon runs by
 743  781   * setting the global variables nscan and desscan SCHEDPAGING_HZ
 744  782   * times a second.  Nscan records the number of pages pageout has examined
 745  783   * in its current pass; schedpaging() resets this value to zero each time
 746  784   * it runs.  Desscan records the number of pages pageout should examine
 747  785   * in its next pass; schedpaging() sets this value based on the amount of
 748  786   * currently available memory.
 749  787   */
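
The interpolation that schedpaging() performs below can be checked with
sample numbers (slowscan = 100, fastscan = 100000, lotsfree = 32768 and
vavail = 16384 are all hypothetical):

        spgcnt_t slowstmp = 100 * 16384;                /* slowscan * vavail */
        spgcnt_t faststmp = 100000 * (32768 - 16384);   /* fastscan * shortfall */
        spgcnt_t result = (slowstmp + faststmp) / 32768 / SCHEDPAGING_HZ;
        /* result == 12512 pages to examine in the next wakeup */
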
 750      -#define SCHEDPAGING_HZ  4
 751  788  
 752      -static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
      789 +static kmutex_t pageout_mutex;
 753  790  
 754  791  /*
 755  792   * Pool of available async pageout putpage requests.
 756  793   */
 757  794  static struct async_reqs *push_req;
 758  795  static struct async_reqs *req_freelist; /* available req structs */
 759  796  static struct async_reqs *push_list;    /* pending reqs */
 760  797  static kmutex_t push_lock;              /* protects req pool */
 761  798  static kcondvar_t push_cv;
 762  799  
 763  800  /*
 764  801   * If pageout() is stuck on a single push for this many seconds,
 765  802   * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 766  803   * to 0, the deadman will have no effect.
 767  804   *
 768  805   * Note that we are only looking for stalls in the calls that pageout() makes
 769  806   * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 770  807   * I/O, which should not take long unless the underlying strategy call blocks
 771  808   * indefinitely for memory.  The actual I/O request happens (or fails) later.
 772  809   */
 773  810  uint_t pageout_deadman_seconds = 90;
 774  811  
 775  812  static uint_t pageout_stucktime = 0;
 776  813  static bool pageout_pushing = false;
 777  814  static uint64_t pageout_pushcount = 0;
 778  815  static uint64_t pageout_pushcount_seen = 0;
 779  816  
 780      -static int async_list_size = 256;       /* number of async request structs */
      817 +static int async_list_size = 8192;      /* number of async request structs */
 781  818  
 782  819  static void pageout_scanner(void *);
 783  820  
 784  821  /*
 785  822   * If a page is being shared more than "po_share" times
 786  823   * then leave it alone- don't page it out.
 787  824   */
 788  825  #define MIN_PO_SHARE    (8)
 789  826  #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 790  827  ulong_t po_share = MIN_PO_SHARE;
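
Worked out, the bounds above give po_share a range of 8 up to
8 << 24 = 134217728 (a sketch, using the macros as defined):

        ulong_t lo = MIN_PO_SHARE;      /* 8 */
        ulong_t hi = MAX_PO_SHARE;      /* 8 << 24 = 134217728 */
        /* A page shared more than po_share times is not paged out. */
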
 791  828  
 792  829  /*
 793  830   * Schedule rate for paging.
 794  831   * Rate is linear interpolation between
 795  832   * slowscan with lotsfree and fastscan when out of memory.
 796  833   */
 797  834  static void
 798  835  schedpaging(void *arg)
 799  836  {
 800  837          spgcnt_t vavail;
  
 801  838  
 802  839          if (freemem < lotsfree + needfree + kmem_reapahead)
 803  840                  kmem_reap();
 804  841  
 805  842          if (freemem < lotsfree + needfree)
 806  843                  seg_preap();
 807  844  
 808  845          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 809  846                  kcage_cageout_wakeup();
 810  847  
 811      -        (void) atomic_swap_ulong(&nscan, 0);
 812      -        vavail = freemem - deficit;
 813      -        if (pageout_new_spread != 0)
 814      -                vavail -= needfree;
 815      -        if (vavail < 0)
 816      -                vavail = 0;
 817      -        if (vavail > lotsfree)
 818      -                vavail = lotsfree;
      848 +        if (mutex_tryenter(&pageout_mutex)) {
 819  849  
 820      -        /*
 821      -         * Fix for 1161438 (CRS SPR# 73922).  All variables
 822      -         * in the original calculation for desscan were 32 bit signed
 823      -         * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 824      -         * more of memory, the calculation can overflow.  When this
 825      -         * happens, desscan becomes negative and pageout_scanner()
 826      -         * stops paging out.
 827      -         */
 828      -        if (needfree > 0 && pageout_new_spread == 0) {
 829      -                /*
 830      -                 * If we've not yet collected enough samples to
 831      -                 * calculate a spread, kick into high gear anytime
 832      -                 * needfree is non-zero. Note that desscan will not be
 833      -                 * the limiting factor for systems with larger memory;
 834      -                 * the %CPU will limit the scan. That will also be
 835      -                 * maxed out below.
 836      -                 */
 837      -                desscan = fastscan / SCHEDPAGING_HZ;
 838      -        } else {
 839      -                /*
 840      -                 * Once we've calculated a spread based on system
 841      -                 * memory and usage, just treat needfree as another
 842      -                 * form of deficit.
 843      -                 */
 844      -                spgcnt_t faststmp, slowstmp, result;
      850 +                if (pageouts_running != 0)
      851 +                        goto out;
 845  852  
 846      -                slowstmp = slowscan * vavail;
 847      -                faststmp = fastscan * (lotsfree - vavail);
 848      -                result = (slowstmp + faststmp) /
 849      -                    nz(lotsfree) / SCHEDPAGING_HZ;
 850      -                desscan = (pgcnt_t)result;
 851      -        }
      853 +                /* No pageout scanner threads running. */
      854 +                nscan = 0;
      855 +                vavail = freemem - deficit;
      856 +                if (pageout_new_spread != 0)
      857 +                        vavail -= needfree;
      858 +                vavail = clamp(MAX(vavail, 0), 0, lotsfree);
 852  859  
 853      -        /*
 854      -         * If we've not yet collected enough samples to calculate a
 855      -         * spread, also kick %CPU to the max.
 856      -         */
 857      -        if (pageout_new_spread == 0) {
 858      -                pageout_nsec = max_pageout_nsec;
 859      -        } else {
 860      -                pageout_nsec = min_pageout_nsec +
 861      -                    (lotsfree - vavail) *
 862      -                    (max_pageout_nsec - min_pageout_nsec) /
 863      -                    nz(lotsfree);
 864      -        }
      860 +                if (needfree > 0 && pageout_new_spread == 0) {
      861 +                        /*
      862 +                         * If we've not yet collected enough samples to
      863 +                         * calculate a spread, use the old logic of kicking
      864 +                         * into high gear anytime needfree is non-zero.
      865 +                         */
      866 +                        desscan = fastscan / SCHEDPAGING_HZ;
      867 +                } else {
      868 +                        /*
      869 +                         * Once we've calculated a spread based on system
      870 +                         * memory and usage, just treat needfree as another
      871 +                         * form of deficit.
      872 +                         */
      873 +                        spgcnt_t faststmp, slowstmp, result;
 865  874  
 866      -        if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
 867      -                /*
 868      -                 * We have finished the pagescan initialization and the desired
 869      -                 * number of page scanners has changed, either because
 870      -                 * initialization just finished, because of a memory DR, or
 871      -                 * because des_page_scanners has been modified on the fly (i.e.
 872      -                 * by mdb). If we need more scanners, start them now, otherwise
 873      -                 * the excess scanners will terminate on their own when they
 874      -                 * reset their hands.
 875      -                 */
 876      -                uint_t i;
 877      -                uint_t curr_nscan = n_page_scanners;
 878      -                pgcnt_t max = total_pages / handspreadpages;
      875 +                        slowstmp = slowscan * vavail;
      876 +                        faststmp = fastscan * (lotsfree - vavail);
      877 +                        result = (slowstmp + faststmp) /
      878 +                            nz(lotsfree) / SCHEDPAGING_HZ;
      879 +                        desscan = (pgcnt_t)result;
      880 +                }
 879  881  
 880      -                if (des_page_scanners > max)
 881      -                        des_page_scanners = max;
      882 +                pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
      883 +                    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
 882  884  
 883      -                if (des_page_scanners > MAX_PSCAN_THREADS) {
 884      -                        des_page_scanners = MAX_PSCAN_THREADS;
 885      -                } else if (des_page_scanners == 0) {
 886      -                        des_page_scanners = 1;
 887      -                }
      885 +                DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
      886 +                    pageout_nsec);
 888  887  
 889      -                /*
 890      -                 * Each thread has its own entry in the reset_hands array, so
 891      -                 * we don't need any locking in pageout_scanner to check the
 892      -                 * thread's reset_hands entry. Thus, we use a pre-allocated
 893      -                 * fixed size reset_hands array and upper limit on the number
 894      -                 * of pagescan threads.
 895      -                 *
 896      -                 * The reset_hands entries need to be true before we start new
 897      -                 * scanners, but if we're reducing, we don't want a race on the
 898      -                 * recalculation for the existing threads, so we set
 899      -                 * n_page_scanners first.
 900      -                 */
 901      -                n_page_scanners = des_page_scanners;
 902      -                for (i = 0; i < MAX_PSCAN_THREADS; i++) {
 903      -                        reset_hands[i] = B_TRUE;
 904      -                }
      888 +                if (pageout_new_spread != 0 && despagescanners != 0 &&
      889 +                    despagescanners != n_page_scanners) {
      890 +                        /*
      891 +                        * We have finished the pagescan initialisation and the
      892 +                         * We have finished the pagescan initialisation and the
      893 +                         * desired number of page scanners has changed, either
      894 +                         * because initialisation just finished, because of a
      895 +                         * memory DR, or because despagescanners has been
      896 +                         * modified on the fly (i.e. by mdb).
      897 +                        uint_t i, curr_nscan = n_page_scanners;
 905  898  
 906      -                if (des_page_scanners > curr_nscan) {
 907      -                        /* Create additional pageout scanner threads. */
 908      -                        for (i = curr_nscan; i < des_page_scanners; i++) {
 909      -                                (void) lwp_kernel_create(proc_pageout,
 910      -                                    pageout_scanner, (void *)(uintptr_t)i,
 911      -                                    TS_RUN, curthread->t_pri);
      899 +                        /* Re-validate despagescanners */
      900 +                        recalc_pagescanners();
      901 +
      902 +                        n_page_scanners = despagescanners;
      903 +
      904 +                        for (i = 0; i < MAX_PSCAN_THREADS; i++)
      905 +                                reset_hands[i] = B_TRUE;
      906 +
      907 +                        /* If we need more scanners, start them now. */
      908 +                        if (n_page_scanners > curr_nscan) {
      909 +                                for (i = curr_nscan; i < n_page_scanners; i++) {
      910 +                                        (void) lwp_kernel_create(proc_pageout,
      911 +                                            pageout_scanner,
      912 +                                            (void *)(uintptr_t)i, TS_RUN,
      913 +                                            pageout_pri);
      914 +                                }
 912  915                          }
      916 +
      917 +                        /*
      918 +                         * If the number of scanners has decreased, trigger a
      919 +                         * wakeup so that the excess threads will terminate.
      920 +                         */
      921 +                        if (n_page_scanners < curr_nscan) {
      922 +                                WAKE_PAGEOUT_SCANNER();
      923 +                        }
 913  924                  }
 914      -        }
 915  925  
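
The hunk above resizes the scanner pool in two steps: n_page_scanners is published first, additional kernel LWPs are created when the pool grows, and a wakeup is posted when it shrinks so that surplus instances notice inst >= n_page_scanners and exit by themselves. A minimal userland sketch of the same resize pattern, using pthreads; all of the names here are illustrative rather than taken from this file:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wakeup = PTHREAD_COND_INITIALIZER;
static unsigned n_workers;              /* analogue of n_page_scanners */

static void *
worker(void *a)
{
        unsigned inst = (unsigned)(uintptr_t)a;

        for (;;) {
                pthread_mutex_lock(&lock);
                pthread_cond_wait(&wakeup, &lock);
                if (inst >= n_workers) {
                        /* Surplus instance: exit, as pageout_scanner does. */
                        pthread_mutex_unlock(&lock);
                        return (NULL);
                }
                pthread_mutex_unlock(&lock);
                /* ... perform one scan cycle ... */
        }
}

static void
resize_pool(unsigned desired)
{
        pthread_mutex_lock(&lock);
        unsigned curr = n_workers;
        n_workers = desired;            /* publish the new count first */
        pthread_mutex_unlock(&lock);

        /* Grow: create the additional workers. */
        for (unsigned i = curr; i < desired; i++) {
                pthread_t t;
                (void) pthread_create(&t, NULL, worker,
                    (void *)(uintptr_t)i);
        }

        /* Shrink: wake everyone so surplus workers see the new count. */
        if (desired < curr)
                pthread_cond_broadcast(&wakeup);
}

int
main(void)
{
        resize_pool(4);                 /* start four workers */
        resize_pool(2);                 /* later, shrink back to two */
        pthread_exit(NULL);             /* keep remaining workers alive */
}
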
 916      -        zones_over = B_FALSE;
      926 +                zones_over = B_FALSE;
 917  927  
 918      -        if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
 919      -                if (!PAGE_SCAN_STARTUP)
      928 +                if (PAGE_SCAN_STARTUP) {
      929 +                        /*
      930 +                         * We still need to measure the rate at which the
      931 +                         * system is able to scan pages of memory. Each of
      932 +                         * these initial samples is a scan of as much system
      933 +                         * memory as practical, regardless of whether or not we
      934 +                         * are experiencing memory pressure.
      935 +                         */
      936 +                        desscan = total_pages;
      937 +                        pageout_nsec = max_pageout_nsec;
      938 +
      939 +                        DTRACE_PROBE(schedpage__wake__sample);
      940 +                        WAKE_PAGEOUT_SCANNER();
      941 +                } else if (freemem < lotsfree + needfree) {
      942 +                        /*
      943 +                         * We need more memory.
      944 +                         */
 920  945                          low_mem_scan++;
 921      -                /*
 922      -                 * Either we need more memory, or we still need to
 923      -                 * measure the average scan rate.  Wake the scanner.
 924      -                 */
 925      -                DTRACE_PROBE(schedpage__wake__low);
 926      -                WAKE_PAGEOUT_SCANNER();
 927  946  
 928      -        } else if (zone_num_over_cap > 0) {
 929      -                /* One or more zones are over their cap. */
      947 +                        DTRACE_PROBE(schedpage__wake__low);
      948 +                        WAKE_PAGEOUT_SCANNER();
      949 +                } else if (zone_num_over_cap > 0) {
      950 +                        /*
      951 +                         * One or more zones are over their cap.
      952 +                         */
 930  953  
 931      -                /* No page limit */
 932      -                desscan = total_pages;
      954 +                        /* No page limit */
      955 +                        desscan = total_pages;
 933  956  
 934      -                /*
 935      -                 * Increase the scanning CPU% to the max. This implies
 936      -                 * 80% of one CPU/sec if the scanner can run each
 937      -                 * opportunity. Can also be tuned via setting
 938      -                 * zone_pageout_nsec in /etc/system or with mdb.
 939      -                 */
 940      -                pageout_nsec = (zone_pageout_nsec != 0) ?
 941      -                    zone_pageout_nsec : max_pageout_nsec;
      957 +                        /*
      958 +                         * Increase the scanning CPU% to the max. This implies
      959 +                         * 80% of one CPU/sec if the scanner can run each
      960 +                         * opportunity. Can also be tuned via setting
      961 +                         * zone_pageout_nsec in /etc/system or with mdb.
      962 +                         */
      963 +                        pageout_nsec = (zone_pageout_nsec != 0) ?
      964 +                            zone_pageout_nsec : max_pageout_nsec;
 942  965  
 943      -                zones_over = B_TRUE;
 944      -                zone_cap_scan++;
      966 +                        zones_over = B_TRUE;
      967 +                        zone_cap_scan++;
 945  968  
 946      -                DTRACE_PROBE(schedpage__wake__zone);
 947      -                WAKE_PAGEOUT_SCANNER();
 948      -
 949      -        } else {
 950      -                /*
 951      -                 * There are enough free pages, no need to
 952      -                 * kick the scanner thread.  And next time
 953      -                 * around, keep more of the `highly shared'
 954      -                 * pages.
 955      -                 */
 956      -                cv_signal_pageout();
 957      -
 958      -                mutex_enter(&pageout_mutex);
 959      -                if (po_share > MIN_PO_SHARE) {
 960      -                        po_share >>= 1;
      969 +                        DTRACE_PROBE(schedpage__wake__zone);
      970 +                        WAKE_PAGEOUT_SCANNER();
      971 +                } else {
      972 +                        /*
      973 +                         * There are enough free pages, no need to
      974 +                         * kick the scanner thread.  And next time
      975 +                         * around, keep more of the `highly shared'
      976 +                         * pages.
      977 +                         */
      978 +                        cv_signal_pageout();
      979 +                        if (po_share > MIN_PO_SHARE) {
      980 +                                po_share >>= 1;
      981 +                        }
 961  982                  }
      983 +out:
 962  984                  mutex_exit(&pageout_mutex);
 963  985          }
 964  986  
 965  987          /*
 966  988           * Signal threads waiting for available memory.
 967  989           * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
 968  990           * in this case it is not needed - the waiters will be woken up during
 969  991           * the next invocation of this function.
 970  992           */
 971  993          if (kmem_avail() > 0)
 972  994                  cv_broadcast(&memavail_cv);
 973  995  
 974  996          (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
 975  997  }
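
Note that the function re-arms itself with timeout() on every pass, so it runs SCHEDPAGING_HZ times per second without needing a dedicated thread. A trivial userland sketch of a pass on the same cadence, with printf standing in for the real work:

#include <stdio.h>
#include <unistd.h>

#define SCHEDPAGING_HZ  4       /* scheduling passes per second */

int
main(void)
{
        for (;;) {
                /* One pass: recompute desscan, wake scanners, etc. */
                (void) printf("schedpaging pass\n");

                /* hz / SCHEDPAGING_HZ clock ticks is 250ms here. */
                (void) usleep(1000000 / SCHEDPAGING_HZ);
        }
}
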
 976  998  
 977  999  pgcnt_t         pushes;
 978 1000  ulong_t         push_list_size;         /* # of requests on pageout queue */
 979 1001  
  
     8 lines elided
  
 980 1002  /*
 981 1003   * Paging out should always be enabled.  This tunable exists to hold pageout
 982 1004   * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 983 1005   * sleep each time it is woken by schedpaging().
 984 1006   */
 985 1007  uint_t dopageout = 1;
 986 1008  
 987 1009  /*
 988 1010   * The page out daemon, which runs as process 2.
 989 1011   *
 990      - * Page out occurs when either:
 991      - * a) there is less than lotsfree pages,
 992      - * b) there are one or more zones over their physical memory cap.
     1012 + * The daemon treats physical memory as a circular array of pages and scans
     1013 + * the pages using a 'two-handed clock' algorithm. The front hand moves
     1014 + * through the pages, clearing the reference bit. The back hand travels a
     1015 + * distance (handspreadpages) behind the front hand, freeing the pages that
     1016 + * have not been referenced in the time since the front hand passed. If
     1017 + * modified, they are first written to their backing store before being
     1018 + * freed.
 993 1019   *
 994      - * The daemon treats physical memory as a circular array of pages and scans the
 995      - * pages using a 'two-handed clock' algorithm. The front hand moves through
 996      - * the pages, clearing the reference bit. The back hand travels a distance
 997      - * (handspreadpages) behind the front hand, freeing the pages that have not
 998      - * been referenced in the time since the front hand passed. If modified, they
 999      - * are first written to their backing store before being freed.
     1020 + * In order to make page invalidation more responsive on machines with
     1021 + * larger memory, multiple pageout_scanner threads may be created. In this
     1022 + * case, each thread is given a segment of the memory "clock face" so that
     1023 + * memory can be reclaimed more quickly.
1000 1024   *
1001      - * In order to make page invalidation more responsive on machines with larger
1002      - * memory, multiple pageout_scanner threads may be created. In this case, the
1003      - * threads are evenly distributed around the the memory "clock face" so that
1004      - * memory can be reclaimed more quickly (that is, there can be large regions in
1005      - * which no pages can be reclaimed by a single thread, leading to lag which
1006      - * causes undesirable behavior such as htable stealing).
     1025 + * As long as there are at least lotsfree pages and no zones over their
     1026 + * cap, pageout_scanner threads are not run. They are woken when either
     1027 + * (a) freemem drops below lotsfree, or (b) one or more zones exceed their
     1028 + * physical memory cap. For case (a), all pages are considered for pageout;
     1029 + * for case (b), only pages belonging to a zone over its cap are considered.
1007 1030   *
1008      - * As long as there are at least lotsfree pages, or no zones over their cap,
1009      - * then pageout_scanner threads are not run. When pageout_scanner threads are
1010      - * running for case (a), all pages are considered for pageout. For case (b),
1011      - * only pages belonging to a zone over its cap will be considered for pageout.
1012      - *
1013      - * There are multiple threads that act on behalf of the pageout process.
1014      - * A set of threads scan pages (pageout_scanner) and frees them up if
1015      - * they don't require any VOP_PUTPAGE operation. If a page must be
1016      - * written back to its backing store, the request is put on a list
1017      - * and the other (pageout) thread is signaled. The pageout thread
1018      - * grabs VOP_PUTPAGE requests from the list, and processes them.
1019      - * Some filesystems may require resources for the VOP_PUTPAGE
1020      - * operations (like memory) and hence can block the pageout
1021      - * thread, but the pageout_scanner threads can still operate. There is still
1022      - * no guarantee that memory deadlocks cannot occur.
1023      - *
1024      - * The pageout_scanner parameters are determined in schedpaging().
     1031 + * There are multiple threads that act on behalf of the pageout process. A
     1032 + * set of threads scans pages (pageout_scanner) and frees them up if they
     1033 + * don't require any VOP_PUTPAGE operation. If a page must be written back
     1034 + * to its backing store, the request is put on a list and the other
     1035 + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
     1036 + * requests from the list, and processes them. Some filesystems may require
     1037 + * resources for the VOP_PUTPAGE operations (like memory) and hence can
     1038 + * block the pageout thread, but the scanner threads can still operate.
     1039 + * There is still no guarantee that memory deadlocks cannot occur.
1025 1040   */
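
The two-handed scan described above can be modelled in a few lines. This is a toy userland illustration, not kernel code: reference bits live in an array, HANDSPREAD stands in for handspreadpages, and a page is freed when the back hand finds it still unreferenced:

#include <stdio.h>

#define NPAGES          16
#define HANDSPREAD      4       /* stand-in for handspreadpages */

int
main(void)
{
        /* 1 = referenced since the front hand last passed, 0 = not. */
        int ref[NPAGES] = { 1, 0, 1, 1, 0, 0, 1, 0,
                            1, 1, 0, 1, 0, 0, 0, 1 };
        int back = 0, front = HANDSPREAD, freed = 0;

        for (int step = 0; step < NPAGES; step++) {
                /* Front hand: clear the reference bit. */
                ref[front] = 0;

                /* Back hand: free anything still unreferenced. */
                if (ref[back] == 0) {
                        (void) printf("freeing page %d\n", back);
                        freed++;
                }

                front = (front + 1) % NPAGES;
                back = (back + 1) % NPAGES;
        }
        (void) printf("%d pages freed\n", freed);
        return (0);
}
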
1026 1041  void
1027 1042  pageout()
1028 1043  {
1029 1044          struct async_reqs *arg;
1030      -        pri_t pageout_pri;
1031 1045          int i;
1032 1046          pgcnt_t max_pushes;
1033 1047          callb_cpr_t cprinfo;
1034 1048  
1035 1049          proc_pageout = ttoproc(curthread);
1036 1050          proc_pageout->p_cstime = 0;
1037 1051          proc_pageout->p_stime =  0;
1038 1052          proc_pageout->p_cutime =  0;
1039 1053          proc_pageout->p_utime = 0;
1040 1054          bcopy("pageout", PTOU(curproc)->u_psargs, 8);
1041 1055          bcopy("pageout", PTOU(curproc)->u_comm, 7);
1042 1056  
1043 1057          /*
1044 1058           * Create pageout scanner thread
1045 1059           */
1046 1060          mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
1047 1061          mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
1048 1062  
1049 1063          /*
1050 1064           * Allocate and initialize the async request structures
  
     10 lines elided
  
1051 1065           * for pageout.
1052 1066           */
1053 1067          push_req = (struct async_reqs *)
1054 1068              kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
1055 1069  
1056 1070          req_freelist = push_req;
1057 1071          for (i = 0; i < async_list_size - 1; i++) {
1058 1072                  push_req[i].a_next = &push_req[i + 1];
1059 1073          }
1060 1074  
1061      -        pageout_pri = curthread->t_pri;
     1075 +        pageout_pri = curthread->t_pri - 1;
1062 1076  
1063      -        /* Create the (first) pageout scanner thread. */
1064      -        (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
1065      -            pageout_pri - 1);
     1077 +        /* Create the first pageout scanner thread. */
     1078 +        (void) lwp_kernel_create(proc_pageout, pageout_scanner,
     1079 +            (void *)(uintptr_t)0,       /* instance 0, not a null arg */
     1080 +            TS_RUN, pageout_pri);
1066 1081  
1067 1082          /*
1068 1083           * kick off pageout scheduler.
1069 1084           */
1070 1085          schedpaging(NULL);
1071 1086  
1072 1087          /*
1073 1088           * Create kernel cage thread.
1074 1089           * The kernel cage thread is started under the pageout process
1075 1090           * to take advantage of the less restricted page allocation
1076 1091           * in page_create_throttle().
1077 1092           */
1078 1093          kcage_cageout_init();
1079 1094  
1080 1095          /*
1081 1096           * Limit pushes to avoid saturating pageout devices.
1082 1097           */
1083 1098          max_pushes = maxpgio / SCHEDPAGING_HZ;
1084 1099          CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
1085 1100  
1086 1101          for (;;) {
1087 1102                  mutex_enter(&push_lock);
1088 1103  
1089 1104                  while ((arg = push_list) == NULL || pushes > max_pushes) {
1090 1105                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
  
     15 lines elided
  
1091 1106                          cv_wait(&push_cv, &push_lock);
1092 1107                          pushes = 0;
1093 1108                          CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
1094 1109                  }
1095 1110                  push_list = arg->a_next;
1096 1111                  arg->a_next = NULL;
1097 1112                  pageout_pushing = true;
1098 1113                  mutex_exit(&push_lock);
1099 1114  
1100 1115                  DTRACE_PROBE(pageout__push);
     1116 +
1101 1117                  if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
1102 1118                      arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
1103 1119                          pushes++;
1104 1120                  }
1105 1121  
1106 1122                  /* vp held by checkpage() */
1107 1123                  VN_RELE(arg->a_vp);
1108 1124  
1109 1125                  mutex_enter(&push_lock);
1110 1126                  pageout_pushing = false;
1111 1127                  pageout_pushcount++;
1112 1128                  arg->a_next = req_freelist;     /* back on freelist */
1113 1129                  req_freelist = arg;
1114 1130                  push_list_size--;
  
     4 lines elided
  
1115 1131                  mutex_exit(&push_lock);
1116 1132          }
1117 1133  }
1118 1134  
1119 1135  /*
1120 1136   * Kernel thread that scans pages looking for ones to free
1121 1137   */
1122 1138  static void
1123 1139  pageout_scanner(void *a)
1124 1140  {
1125      -        struct page *fronthand, *backhand;
1126      -        uint_t laps, iter = 0;
     1141 +        struct page *fronthand, *backhand, *fronthandstart;
     1142 +        struct page *regionstart, *regionend;
     1143 +        uint_t laps;
1127 1144          callb_cpr_t cprinfo;
1128      -        pgcnt_t nscan_cnt, nscan_limit;
     1145 +        pgcnt_t nscan_cnt, tick;
1129 1146          pgcnt_t pcount;
1130      -        uint_t inst = (uint_t)(uintptr_t)a;
     1147 +        bool bhwrapping, fhwrapping;
1131 1148          hrtime_t sample_start, sample_end;
1132      -        kmutex_t pscan_mutex;
1133      -        bool sampling;
     1149 +        uint_t inst = (uint_t)(uintptr_t)a;
1134 1150  
1135 1151          VERIFY3U(inst, <, MAX_PSCAN_THREADS);
1136 1152  
1137      -        mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);
     1153 +        CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
     1154 +        mutex_enter(&pageout_mutex);
1138 1155  
1139      -        CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
1140      -        mutex_enter(&pscan_mutex);
1141      -
1142 1156          /*
1143      -         * Establish the minimum and maximum length of time to be spent
1144      -         * scanning pages per wakeup, limiting the scanner duty cycle.  The
1145      -         * input percentage values (0-100) must be converted to a fraction of
1146      -         * the number of nanoseconds in a second of wall time, then further
1147      -         * scaled down by the number of scanner wakeups in a second:
     1157 +         * The restart case does not attempt to position the hands at
     1158 +         * roughly the right place, on the assumption that things will
     1159 +         * have settled down after one circuit and that restarts are rare.
1148 1160           */
1149      -        min_pageout_nsec = MAX(1,
1150      -            NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
1151      -        max_pageout_nsec = MAX(min_pageout_nsec,
1152      -            NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
     1161 +        reset_hands[inst] = B_TRUE;
1153 1162  
     1163 +        pageouts_running++;
     1164 +        mutex_exit(&pageout_mutex);
     1165 +
1154 1166  loop:
1155 1167          cv_signal_pageout();
1156 1168  
     1169 +        mutex_enter(&pageout_mutex);
     1170 +        pageouts_running--;
1157 1171          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1158      -        cv_wait(&proc_pageout->p_cv, &pscan_mutex);
1159      -        CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);
     1172 +        cv_wait(&proc_pageout->p_cv, &pageout_mutex);
     1173 +        CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
     1174 +        pageouts_running++;
     1175 +        mutex_exit(&pageout_mutex);
1160 1176  
1161 1177          /*
1162      -         * Check if pageout has been disabled for debugging purposes:
     1178 +         * Check if pageout has been disabled for debugging purposes.
1163 1179           */
1164 1180          if (!dopageout) {
1165 1181                  goto loop;
1166 1182          }
1167 1183  
1168 1184          /*
1169      -         * One may reset the clock hands for debugging purposes.  Hands will
1170      -         * also be reset if memory is added to or removed from the system.
     1185 +         * One may reset the clock hands and scanned region for debugging
     1186 +         * purposes. Hands will also be reset on first thread startup, if
     1187 +         * the number of scanning threads (n_page_scanners) changes, or if
     1188 +         * memory is added to, or removed from, the system.
1171 1189           */
1172 1190          if (reset_hands[inst]) {
1173 1191                  struct page *first;
1174      -                pgcnt_t offset = total_pages / n_page_scanners;
1175 1192  
1176 1193                  reset_hands[inst] = B_FALSE;
     1194 +
1177 1195                  if (inst >= n_page_scanners) {
1178 1196                          /*
1179 1197                           * The desired number of page scanners has been
1180 1198                           * reduced and this instance is no longer wanted.
1181 1199                           * Exit the lwp.
1182 1200                           */
1183 1201                          VERIFY3U(inst, !=, 0);
1184      -                        mutex_exit(&pscan_mutex);
     1202 +                        DTRACE_PROBE1(pageout__exit, uint_t, inst);
     1203 +                        mutex_enter(&pageout_mutex);
     1204 +                        pageouts_running--;
     1205 +                        mutex_exit(&pageout_mutex);
1185 1206                          mutex_enter(&curproc->p_lock);
1186 1207                          lwp_exit();
     1208 +                        /* NOTREACHED */
1187 1209                  }
1188 1210  
     1211 +                first = page_first();
     1212 +
1189 1213                  /*
1190      -                 * The reset case repositions the hands at the proper place
1191      -                 * on the memory clock face to prevent creep into another
1192      -                 * thread's active region or when the number of threads has
1193      -                 * changed.
1194      -                 *
1195      -                 * Set the two clock hands to be separated by a reasonable
1196      -                 * amount, but no more than 360 degrees apart.
1197      -                 *
1198      -                 * If inst == 0, backhand starts at first page, otherwise
1199      -                 * it is (inst * offset) around the memory "clock face" so that
1200      -                 * we spread out each scanner instance evenly.
     1214 +                 * Each scanner thread gets its own sector of the memory
     1215 +                 * clock face.
1201 1216                   */
1202      -                first = page_first();
1203      -                backhand = page_nextn(first, offset * inst);
1204      -                if (handspreadpages >= total_pages) {
1205      -                        fronthand = page_nextn(backhand, total_pages - 1);
     1217 +                pgcnt_t span, offset;
     1218 +
     1219 +                span = looppages / n_page_scanners;
     1220 +                VERIFY3U(span, >, handspreadpages);
     1221 +
     1222 +                offset = inst * span;
     1223 +                regionstart = page_nextn(first, offset);
     1224 +                if (inst == n_page_scanners - 1) {
     1225 +                        /* The last instance goes up to the last page */
     1226 +                        regionend = page_nextn(first, looppages - 1);
1206 1227                  } else {
1207      -                        fronthand = page_nextn(backhand, handspreadpages);
     1228 +                        regionend = page_nextn(regionstart, span - 1);
1208 1229                  }
     1230 +
     1231 +                backhand = regionstart;
     1232 +                fronthand = page_nextn(backhand, handspreadpages);
     1233 +                tick = 1;
     1234 +
     1235 +                bhwrapping = fhwrapping = false;
     1236 +
     1237 +                DTRACE_PROBE4(pageout__reset, uint_t, inst,
     1238 +                    pgcnt_t, regionstart, pgcnt_t, regionend,
     1239 +                    pgcnt_t, fronthand);
1209 1240          }
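
The sector arithmetic is easiest to see with concrete numbers. In this sketch the page and scanner counts are made-up examples; it shows how looppages is split into per-instance sectors, with the last instance absorbing the rounding remainder:

#include <stdio.h>

int
main(void)
{
        unsigned long looppages = 1048576;      /* e.g. 4 GiB of 4 KiB pages */
        unsigned n_page_scanners = 3;
        unsigned long span = looppages / n_page_scanners;       /* 349525 */

        for (unsigned inst = 0; inst < n_page_scanners; inst++) {
                unsigned long start = inst * span;
                unsigned long end = (inst == n_page_scanners - 1) ?
                    looppages - 1 :     /* last sector absorbs the remainder */
                    start + span - 1;
                (void) printf("scanner %u: pages %lu .. %lu\n",
                    inst, start, end);
        }
        return (0);
}
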
1210 1241  
     1242 +        /*
     1243 +         * This CPU kstat is only incremented here and we're obviously
     1244 +         * on this CPU, so no lock.
     1245 +         */
1211 1246          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1212 1247  
1213 1248          /*
1214 1249           * Keep track of the number of times we have scanned all the way around
1215      -         * the loop:
     1250 +         * the loop on this wakeup.
1216 1251           */
1217 1252          laps = 0;
1218 1253  
1219 1254          /*
1220 1255           * Track the number of pages visited during this scan so that we can
1221 1256           * periodically measure our duty cycle.
1222 1257           */
1223      -        pcount = 0;
1224 1258          nscan_cnt = 0;
     1259 +        pcount = 0;
1225 1260  
1226      -        if (PAGE_SCAN_STARTUP) {
1227      -                /*
1228      -                 * We need to measure the rate at which the system is able to
1229      -                 * scan pages of memory.  Each of these initial samples is a
1230      -                 * scan of all system memory, regardless of whether or not we
1231      -                 * are experiencing memory pressure.
1232      -                 */
1233      -                nscan_limit = total_pages;
1234      -                sampling = true;
1235      -        } else {
1236      -                nscan_limit = desscan;
1237      -                sampling = false;
1238      -        }
     1261 +        DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
     1262 +            hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);
1239 1263  
1240      -        DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
1241      -            page_t *, backhand, page_t *, fronthand);
     1264 +        /*
     1265 +         * Record the initial position of the front hand for this cycle so
     1266 +         * that we can detect when the hand wraps around.
     1267 +         */
     1268 +        fronthandstart = fronthand;
1242 1269  
1243 1270          sample_start = gethrtime();
1244 1271  
1245 1272          /*
1246 1273           * Scan the appropriate number of pages for a single duty cycle.
1247      -         * Only scan while at least one of these is true:
1248      -         * 1) one or more zones is over its cap
1249      -         * 2) there is not enough free memory
1250      -         * 3) during page scan startup when determining sample data
1251 1274           */
1252      -        while (nscan_cnt < nscan_limit) {
     1275 +        while (nscan_cnt < desscan) {
1253 1276                  checkpage_result_t rvfront, rvback;
1254 1277  
1255      -                if (!sampling && !zones_over &&
1256      -                    freemem >= lotsfree + needfree) {
     1278 +                /*
     1279 +                 * Only scan while at least one of these is true:
     1280 +                 *  1) one or more zones is over its cap
     1281 +                 *  2) there is not enough free memory
     1282 +                 *  3) during page scan startup when determining sample data
     1283 +                 */
     1284 +                if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
     1285 +                    !zones_over) {
1257 1286                          /*
1258 1287                           * We are not sampling and enough memory has become
1259 1288                           * available that scanning is no longer required.
1260 1289                           */
     1290 +                        DTRACE_PROBE1(pageout__memfree, uint_t, inst);
1261 1291                          break;
1262 1292                  }
1263 1293  
1264      -                DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);
     1294 +                DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
1265 1295  
1266 1296                  /*
1267 1297                   * Periodically check to see if we have exceeded the CPU duty
1268 1298                   * cycle for a single wakeup.
1269 1299                   */
1270 1300                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1271 1301                          hrtime_t pageout_cycle_nsec;
1272 1302  
1273 1303                          pageout_cycle_nsec = gethrtime() - sample_start;
1274 1304                          if (pageout_cycle_nsec >= pageout_nsec) {
1275      -                                /*
1276      -                                 * This is where we normally break out of the
1277      -                                 * loop when scanning zones or sampling.
1278      -                                 */
1279      -                                if (!zones_over) {
     1305 +                                if (!zones_over)
1280 1306                                          atomic_inc_64(&pageout_timeouts);
1281      -                                }
1282 1307                                  DTRACE_PROBE1(pageout__timeout, uint_t, inst);
1283 1308                                  break;
1284 1309                          }
1285 1310                  }
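
Calling gethrtime() for every page would be wasteful, so the deadline is only checked once every PAGES_POLL_MASK + 1 pages. A userland sketch of the same amortised polling; the mask value and the 80% duty-cycle budget are assumptions for illustration, since the real values are established elsewhere in this file:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NANOSEC          1000000000LL
#define SCHEDPAGING_HZ   4
#define PAGES_POLL_MASK  1023   /* assumed value, of the form 2^n - 1 */

static int64_t
now_ns(void)
{
        struct timespec ts;

        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((int64_t)ts.tv_sec * NANOSEC + ts.tv_nsec);
}

int
main(void)
{
        /* 80% of one CPU split across 4 wakeups = 200ms per wakeup. */
        int64_t budget = NANOSEC * 80 / 100 / SCHEDPAGING_HZ;
        int64_t start = now_ns();
        uint64_t pcount = 0;

        for (;;) {
                /* ... scan one page ... */

                /* Check the clock only once per 1024 pages. */
                if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK &&
                    now_ns() - start >= budget)
                        break;
                pcount++;
        }
        (void) printf("scanned %llu pages in this duty cycle\n",
            (unsigned long long)pcount);
        return (0);
}
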
1286 1311  
1287 1312                  /*
1288 1313                   * If checkpage manages to add a page to the free list,
1289 1314                   * we give ourselves another couple of trips around the loop.
1290 1315                   */
1291 1316                  if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292 1317                          laps = 0;
1293 1318                  }
1294 1319                  if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295 1320                          laps = 0;
1296 1321                  }
1297 1322  
1298 1323                  ++pcount;
1299 1324  
1300 1325                  /*
1301 1326                   * This CPU kstat is only incremented here and we're obviously
1302 1327                   * on this CPU, so no lock.
  
     11 lines elided
  
1303 1328                   */
1304 1329                  CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305 1330  
1306 1331                  /*
1307 1332                   * Don't include ineligible pages in the number scanned.
1308 1333                   */
1309 1334                  if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310 1335                          nscan_cnt++;
1311 1336                  }
1312 1337  
1313      -                backhand = page_next(backhand);
1314      -                fronthand = page_next(fronthand);
     1338 +                if (bhwrapping) {
     1339 +                        backhand = regionstart;
     1340 +                        bhwrapping = false;
     1341 +                } else {
     1342 +                        backhand = page_nextn(backhand, tick);
     1343 +                        if (backhand == regionend)
     1344 +                                bhwrapping = true;
     1345 +                }
1315 1346  
     1347 +                if (fhwrapping) {
     1348 +                        fronthand = regionstart;
     1349 +                        fhwrapping = false;
     1350 +                } else {
     1351 +                        fronthand = page_nextn(fronthand, tick);
     1352 +                        if (fronthand == regionend)
     1353 +                                fhwrapping = true;
     1354 +                }
     1355 +
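
The wrap flags exist because page_nextn() walks the global page list: without them, a hand that stepped past regionend would creep into the next scanner's sector. A toy sketch of the same bounded walk, with integer indices standing in for pages; note that regionend itself is visited before the hand jumps back:

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
        int regionstart = 0, regionend = 9;     /* a ten-page sector */
        int tick = 1;
        int hand = regionstart;
        bool wrapping = false;

        for (int i = 0; i < 25; i++) {
                (void) printf("%d ", hand);     /* visit this page */
                if (wrapping) {
                        /* Jump back to the start of our own sector. */
                        hand = regionstart;
                        wrapping = false;
                } else {
                        hand += tick;
                        if (hand == regionend)
                                wrapping = true;
                }
        }
        (void) printf("\n");
        return (0);
}
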
1316 1356                  /*
1317      -                 * The front hand has wrapped around to the first page in the
1318      -                 * loop.
     1357 +                 * The front hand has wrapped around during this wakeup.
1319 1358                   */
1320      -                if (fronthand == page_first())  {
1321      -                        DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
     1359 +                if (fronthand == fronthandstart) {
     1360 +                        laps++;
     1361 +                        DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
     1362 +                            uint_t, laps);
1322 1363  
1323 1364                          /*
1324      -                         * Every 64 wraps we reposition our hands within our
1325      -                         * region to prevent creep into another thread.
1326      -                         */
1327      -                        if ((++iter % pageout_reset_cnt) == 0)
1328      -                                reset_hands[inst] = B_TRUE;
1329      -
1330      -                        /*
1331 1365                           * This CPU kstat is only incremented here and we're
1332 1366                           * obviously on this CPU, so no lock.
1333 1367                           */
1334 1368                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335 1369  
1336 1370                          /*
1337      -                         * If scanning because the system is low on memory,
1338 1371                           * then when we wraparound memory we want to try to
1339 1372                           * reclaim more pages.
1340 1373                           * If scanning only because zones are over their cap,
1341 1374                           * then wrapping is common and we simply keep going.
1342 1375                           */
1343      -                        if (freemem < lotsfree + needfree && ++laps > 1) {
     1376 +                        if (laps > 1 && freemem < lotsfree + needfree) {
1344 1377                                  /*
1345      -                                 * The system is low on memory.
1346 1378                                   * Extremely unlikely, but it happens.
1347 1379                                   * We went around the loop at least once
1348 1380                                   * and didn't get far enough.
1349 1381                                   * If we are still skipping `highly shared'
1350 1382                                   * pages, skip fewer of them.  Otherwise,
1351 1383                                   * give up till the next clock tick.
1352 1384                                   */
1353      -                                mutex_enter(&pageout_mutex);
1354 1385                                  if (po_share < MAX_PO_SHARE) {
1355 1386                                          po_share <<= 1;
1356      -                                        mutex_exit(&pageout_mutex);
1357 1387                                  } else {
1358      -                                        mutex_exit(&pageout_mutex);
1359 1388                                          break;
1360 1389                                  }
1361 1390                          }
1362 1391                  }
1363 1392          }
1364 1393  
     1394 +        sample_end = gethrtime();
1365 1395          atomic_add_long(&nscan, nscan_cnt);
1366 1396  
1367      -        sample_end = gethrtime();
     1397 +        DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
     1398 +            pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1368 1399  
1369      -        DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370      -            uint_t, inst);
1371      -
1372 1400          /*
1373      -         * The following two blocks are only relevant when the scanner is
1374      -         * first started up. After the scanner runs for a while, neither of
1375      -         * the conditions will ever be true again.
1376      -         *
1377 1401           * The global variables used below are only modified by this thread and
1378 1402           * only during initial scanning when there is a single page scanner
1379      -         * thread running. Thus, we don't use any locking.
     1403 +         * thread running.
1380 1404           */
1381 1405          if (pageout_new_spread == 0) {
1382 1406                  VERIFY3U(inst, ==, 0);
     1407 +
1383 1408                  if (PAGE_SCAN_STARTUP) {
1384 1409                          /*
1385 1410                           * Continue accumulating samples until we have enough
1386      -                         * to get a reasonable value for average scan rate:
     1411 +                         * to get a reasonable value for average scan rate.
1387 1412                           */
1388 1413                          pageout_sample_pages += pcount;
1389 1414                          pageout_sample_etime += sample_end - sample_start;
1390 1415                          ++pageout_sample_cnt;
1391 1416                  }
1392 1417  
1393 1418                  if (!PAGE_SCAN_STARTUP) {
1394 1419                          /*
1395 1420                           * We have enough samples, set the spread.
1396 1421                           */
1397 1422                          pageout_rate = (hrrate_t)pageout_sample_pages *
1398 1423                              (hrrate_t)(NANOSEC) / pageout_sample_etime;
1399 1424                          pageout_new_spread = pageout_rate / 10;
1400 1425                          setupclock();
1401 1426                  }
1402 1427          }
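
The spread calculation reduces to scaling the sampled page count to a per-second rate and taking a tenth of it. A worked example with assumed sample totals:

#include <stdio.h>
#include <stdint.h>

#define NANOSEC 1000000000LL

int
main(void)
{
        /* Assumed sample totals, for illustration only. */
        int64_t sample_pages = 2000000;         /* pages visited */
        int64_t sample_etime = 500000000;       /* 0.5s spent scanning */

        int64_t rate = sample_pages * NANOSEC / sample_etime;
        int64_t spread = rate / 10;

        /* 4,000,000 pages/sec scanned; spread = 400,000 pages. */
        (void) printf("pageout_rate = %lld pages/sec\n", (long long)rate);
        (void) printf("pageout_new_spread = %lld pages\n", (long long)spread);
        return (0);
}
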
1403 1428  
1404 1429          goto loop;
1405 1430  }
1406 1431  
1407 1432  /*
1408 1433   * The pageout deadman is run once per second by clock().
1409 1434   */
1410 1435  void
1411 1436  pageout_deadman(void)
1412 1437  {
1413 1438          if (panicstr != NULL) {
1414 1439                  /*
1415 1440                   * There is no pageout after panic.
1416 1441                   */
1417 1442                  return;
1418 1443          }
1419 1444  
1420 1445          if (pageout_deadman_seconds == 0) {
1421 1446                  /*
1422 1447                   * The deadman is not enabled.
1423 1448                   */
1424 1449                  return;
1425 1450          }
1426 1451  
1427 1452          if (!pageout_pushing) {
1428 1453                  goto reset;
1429 1454          }
1430 1455  
1431 1456          /*
1432 1457           * We are pushing a page.  Check to see if it is the same call we saw
1433 1458           * last time we looked:
1434 1459           */
1435 1460          if (pageout_pushcount != pageout_pushcount_seen) {
1436 1461                  /*
1437 1462                   * It is a different call from the last check, so we are not
1438 1463                   * stuck.
1439 1464                   */
1440 1465                  goto reset;
1441 1466          }
1442 1467  
1443 1468          if (++pageout_stucktime >= pageout_deadman_seconds) {
1444 1469                  panic("pageout_deadman: stuck pushing the same page for %d "
1445 1470                      "seconds (freemem is %lu)", pageout_deadman_seconds,
1446 1471                      freemem);
1447 1472          }
1448 1473  
1449 1474          return;
1450 1475  
1451 1476  reset:
1452 1477          /*
1453 1478           * Reset our tracking state to reflect that we are not stuck:
1454 1479           */
1455 1480          pageout_stucktime = 0;
1456 1481          pageout_pushcount_seen = pageout_pushcount;
1457 1482  }
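
This is a classic progress watchdog: a progress counter is sampled once a second, and the pusher is only declared stuck after it has sat in the same VOP_PUTPAGE call, with an unchanged pushcount, for pageout_deadman_seconds consecutive samples. A userland sketch of the pattern, with abort() standing in for panic():

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool pushing;
static unsigned long pushcount;

static unsigned long pushcount_seen;
static unsigned stucktime;
static unsigned deadman_seconds = 90;   /* like pageout_deadman_seconds */

/* Called once per second, as clock() calls pageout_deadman(). */
static void
deadman(void)
{
        if (deadman_seconds == 0)
                return;                 /* watchdog disabled */

        if (pushing && pushcount == pushcount_seen) {
                if (++stucktime >= deadman_seconds) {
                        (void) fprintf(stderr, "stuck on the same push\n");
                        abort();        /* stands in for panic() */
                }
                return;
        }

        /* Either idle or progress was made since the last check: reset. */
        stucktime = 0;
        pushcount_seen = pushcount;
}

int
main(void)
{
        pushing = true;                 /* simulate a push that never ends */
        for (int sec = 0; sec < 100; sec++)
                deadman();              /* aborts at 90 simulated seconds */
        return (0);
}
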
1458 1483  
1459 1484  /*
1460 1485   * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1461 1486   * system (u., page table) or free, then leave it alone.  Otherwise,
1462 1487   * if we are running the front hand, turn off the page's reference bit.
1463 1488   * If the proc is over maxrss, we take it.  If running the back hand,
1464 1489   * check whether the page has been reclaimed.  If not, free the page,
1465 1490   * pushing it to disk first if necessary.
1466 1491   *
1467 1492   * Return values:
1468 1493   *      CKP_INELIGIBLE if the page is not a candidate at all,
1469 1494   *      CKP_NOT_FREED  if the page was not freed, or
1470 1495   *      CKP_FREED      if we freed it.
1471 1496   */
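
For reference, the scan loop earlier in this file consumes these results in two ways: a CKP_FREED from either hand resets the lap counter, and a page only counts toward the scan target when at least one hand found it eligible. A self-contained toy version; checkpage_stub() is invented purely for illustration:

#include <stdio.h>

typedef enum {
        CKP_INELIGIBLE,
        CKP_NOT_FREED,
        CKP_FREED
} checkpage_result_t;

/* A stand-in for checkpage(); the real one examines a struct page. */
static checkpage_result_t
checkpage_stub(int page)
{
        return (page % 3 == 0 ? CKP_FREED :
            (page % 3 == 1 ? CKP_NOT_FREED : CKP_INELIGIBLE));
}

int
main(void)
{
        unsigned laps = 1, nscan_cnt = 0;

        for (int pg = 0; pg < 12; pg++) {
                checkpage_result_t rvfront = checkpage_stub(pg);
                checkpage_result_t rvback = checkpage_stub(pg + 4);

                if (rvfront == CKP_FREED || rvback == CKP_FREED)
                        laps = 0;       /* progress: allow more circuits */

                /* Ineligible pages don't count toward the scan target. */
                if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
                        nscan_cnt++;
        }
        (void) printf("laps=%u nscan_cnt=%u\n", laps, nscan_cnt);
        return (0);
}
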
1472 1497  static checkpage_result_t
1473 1498  checkpage(struct page *pp, pageout_hand_t whichhand)
1474 1499  {
1475 1500          int ppattr;
1476 1501          int isfs = 0;
1477 1502          int isexec = 0;
1478 1503          int pagesync_flag;
1479 1504          zoneid_t zid = ALL_ZONES;
1480 1505  
1481 1506          /*
1482 1507           * Skip pages:
1483 1508           *      - associated with the kernel vnode since
1484 1509           *          they are always "exclusively" locked.
1485 1510           *      - that are free
1486 1511           *      - that are shared more than po_share'd times
1487 1512           *      - that are already locked
1488 1513           *
1489 1514           * NOTE:  These optimizations assume that reads are atomic.
1490 1515           */
1491 1516  
1492 1517          if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1493 1518              pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1494 1519              hat_page_checkshare(pp, po_share)) {
1495 1520                  return (CKP_INELIGIBLE);
1496 1521          }
1497 1522  
1498 1523          if (!page_trylock(pp, SE_EXCL)) {
1499 1524                  /*
1500 1525                   * Skip the page if we can't acquire the "exclusive" lock.
1501 1526                   */
1502 1527                  return (CKP_INELIGIBLE);
1503 1528          } else if (PP_ISFREE(pp)) {
1504 1529                  /*
1505 1530                   * It became free between the above check and our actually
1506 1531                   * locking the page.  Oh well, there will be other pages.
1507 1532                   */
1508 1533                  page_unlock(pp);
1509 1534                  return (CKP_INELIGIBLE);
1510 1535          }
1511 1536  
1512 1537          /*
1513 1538           * Reject pages that cannot be freed. The page_struct_lock
1514 1539           * need not be acquired to examine these
1515 1540           * fields since the page has an "exclusive" lock.
1516 1541           */
1517 1542          if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
  
     121 lines elided
  
1518 1543                  page_unlock(pp);
1519 1544                  return (CKP_INELIGIBLE);
1520 1545          }
1521 1546  
1522 1547          if (zones_over) {
1523 1548                  ASSERT(pp->p_zoneid == ALL_ZONES ||
1524 1549                      pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID);
1525 1550                  if (pp->p_zoneid == ALL_ZONES ||
1526 1551                      zone_pdata[pp->p_zoneid].zpers_over == 0) {
1527 1552                          /*
1528      -                         * Cross-zone shared page, or zone not over it's cap.
     1553 +                         * Cross-zone shared page, or zone not over its cap.
1529 1554                           * Leave the page alone.
1530 1555                           */
1531 1556                          page_unlock(pp);
1532 1557                          return (CKP_INELIGIBLE);
1533 1558                  }
1534 1559                  zid = pp->p_zoneid;
1535 1560          }
1536 1561  
1537 1562          /*
1538 1563           * Maintain statistics for what we are freeing
1539 1564           */
1540      -
1541 1565          if (pp->p_vnode != NULL) {
1542 1566                  if (pp->p_vnode->v_flag & VVMEXEC)
1543 1567                          isexec = 1;
1544 1568  
1545 1569                  if (!IS_SWAPFSVP(pp->p_vnode))
1546 1570                          isfs = 1;
1547 1571          }
1548 1572  
1549 1573          /*
1550 1574           * Turn off REF and MOD bits with the front hand.
1551 1575           * The back hand examines the REF bit and always considers
1552 1576           * SHARED pages as referenced.
1553 1577           */
1554 1578          if (whichhand == POH_FRONT) {
1555 1579                  pagesync_flag = HAT_SYNC_ZERORM;
1556 1580          } else {
1557 1581                  pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558 1582                      HAT_SYNC_STOPON_SHARED;
1559 1583          }
1560 1584  
1561 1585          ppattr = hat_pagesync(pp, pagesync_flag);
1562 1586  
1563 1587  recheck:
1564 1588          /*
1565 1589           * If page is referenced; make unreferenced but reclaimable.
1566 1590           * If this page is not referenced, then it must be reclaimable
1567 1591           * and we can add it to the free list.
1568 1592           */
1569 1593          if (ppattr & P_REF) {
1570 1594                  DTRACE_PROBE2(pageout__isref, page_t *, pp,
1571 1595                      pageout_hand_t, whichhand);
1572 1596  
1573 1597                  if (whichhand == POH_FRONT) {
1574 1598                          /*
1575 1599                           * Checking of rss or madvise flags needed here...
1576 1600                           *
1577 1601                           * If not "well-behaved", fall through into the code
1578 1602                           * for not referenced.
1579 1603                           */
1580 1604                          hat_clrref(pp);
1581 1605                  }
1582 1606  
1583 1607                  /*
1584 1608                   * Somebody referenced the page since the front
1585 1609                   * hand went by, so it's not a candidate for
1586 1610                   * freeing up.
1587 1611                   */
1588 1612                  page_unlock(pp);
1589 1613                  return (CKP_NOT_FREED);
1590 1614          }
1591 1615  
1592 1616          VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1593 1617  
1594 1618          /*
1595 1619           * If large page, attempt to demote it. If successfully demoted,
1596 1620           * retry the checkpage.
1597 1621           */
1598 1622          if (pp->p_szc != 0) {
1599 1623                  if (!page_try_demote_pages(pp)) {
1600 1624                          VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1601 1625                          page_unlock(pp);
1602 1626                          return (CKP_INELIGIBLE);
1603 1627                  }
1604 1628  
1605 1629                  ASSERT(pp->p_szc == 0);
1606 1630                  VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1607 1631  
1608 1632                  /*
1609 1633                   * Since page_try_demote_pages() could have unloaded some
1610 1634                   * mappings it makes sense to reload ppattr.
1611 1635                   */
1612 1636                  ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1613 1637          }
1614 1638  
1615 1639          /*
1616 1640           * If the page is currently dirty, we have to arrange to have it
1617 1641           * cleaned before it can be freed.
1618 1642           *
1619 1643           * XXX - ASSERT(pp->p_vnode != NULL);
1620 1644           */
1621 1645          if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1622 1646                  struct vnode *vp = pp->p_vnode;
1623 1647                  u_offset_t offset = pp->p_offset;
1624 1648  
1625 1649                  /*
1626 1650                   * XXX - Test for process being swapped out or about to exit?
1627 1651                   * [Can't get back to process(es) using the page.]
1628 1652                   */
1629 1653  
1630 1654                  /*
1631 1655                   * Hold the vnode before releasing the page lock to
1632 1656                   * prevent it from being freed and re-used by some
1633 1657                   * other thread.
1634 1658                   */
1635 1659                  VN_HOLD(vp);
1636 1660                  page_unlock(pp);
1637 1661  
1638 1662                  /*
1639 1663                   * Queue I/O request for the pageout thread.
1640 1664                   */
1641 1665                  if (!queue_io_request(vp, offset)) {
1642 1666                          VN_RELE(vp);
1643 1667                          return (CKP_NOT_FREED);
1644 1668                  }
1645 1669                  if (isfs) {
1646 1670                          zone_pageout_stat(zid, ZPO_DIRTY);
1647 1671                  } else {
1648 1672                          zone_pageout_stat(zid, ZPO_ANONDIRTY);
1649 1673                  }
1650 1674                  return (CKP_FREED);
1651 1675          }
1652 1676  
1653 1677          /*
1654 1678           * Now we unload all the translations and put the page back on to the
1655 1679           * free list.  If the page was used (referenced or modified) after the
1656 1680           * pagesync but before it was unloaded we catch it and handle the page
1657 1681           * properly.
1658 1682           */
1659 1683          DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1660 1684          (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1661 1685          ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1662 1686          if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1663 1687                  goto recheck;
1664 1688          }
1665 1689  
1666 1690          VN_DISPOSE(pp, B_FREE, 0, kcred);
1667 1691  
1668 1692          CPU_STATS_ADD_K(vm, dfree, 1);
1669 1693  
1670 1694          if (isfs) {
1671 1695                  if (isexec) {
1672 1696                          CPU_STATS_ADD_K(vm, execfree, 1);
1673 1697                  } else {
1674 1698                          CPU_STATS_ADD_K(vm, fsfree, 1);
1675 1699                  }
1676 1700                  zone_pageout_stat(zid, ZPO_FS);
1677 1701          } else {
1678 1702                  CPU_STATS_ADD_K(vm, anonfree, 1);
1679 1703                  zone_pageout_stat(zid, ZPO_ANON);
1680 1704          }
1681 1705  
1682 1706          return (CKP_FREED);
1683 1707  }
1684 1708  
1685 1709  /*
1686 1710   * Queue async i/o request from pageout_scanner and segment swapout
1687 1711   * routines on one common list.  This ensures that pageout devices (swap)
1688 1712   * are not saturated by pageout_scanner or swapout requests.
1689 1713   * The pageout thread empties this list by initiating i/o operations.
1690 1714   */
1691 1715  int
1692 1716  queue_io_request(vnode_t *vp, u_offset_t off)
1693 1717  {
1694 1718          struct async_reqs *arg;
1695 1719  
1696 1720          /*
1697 1721           * If we cannot allocate an async request struct,
1698 1722           * skip this page.
1699 1723           */
1700 1724          mutex_enter(&push_lock);
1701 1725          if ((arg = req_freelist) == NULL) {
1702 1726                  mutex_exit(&push_lock);
1703 1727                  return (0);
1704 1728          }
1705 1729          req_freelist = arg->a_next;             /* adjust freelist */
1706 1730          push_list_size++;
1707 1731  
1708 1732          arg->a_vp = vp;
1709 1733          arg->a_off = off;
1710 1734          arg->a_len = PAGESIZE;
1711 1735          arg->a_flags = B_ASYNC | B_FREE;
1712 1736          arg->a_cred = kcred;            /* always held */
1713 1737  
1714 1738          /*
1715 1739           * Add to list of pending write requests.
1716 1740           */
1717 1741          arg->a_next = push_list;
1718 1742          push_list = arg;
1719 1743  
1720 1744          if (req_freelist == NULL) {
1721 1745                  /*
1722 1746                   * No free async requests left. The lock is held so we
1723 1747                   * might as well signal the pusher thread now.
1724 1748                   */
1725 1749                  cv_signal(&push_cv);
1726 1750          }
1727 1751          mutex_exit(&push_lock);
1728 1752          return (1);
1729 1753  }
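
This is the producer half of a bounded producer/consumer: request structures come from a fixed-size freelist, are threaded onto push_list, and the pusher is signalled early when the pool runs dry. A compact userland sketch of the freelist discipline; the names mirror the kernel's, but the code itself is illustrative:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct req {
        struct req *a_next;
        long a_off;             /* request payload (offset, etc.) */
};

#define NREQS 8

static struct req pool[NREQS];
static struct req *freelist;
static struct req *push_list;
static pthread_mutex_t push_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t push_cv = PTHREAD_COND_INITIALIZER;

static void
init_pool(void)
{
        for (int i = 0; i < NREQS - 1; i++)
                pool[i].a_next = &pool[i + 1];
        freelist = pool;
}

/* Returns 0 (skip this page) when no request structure is free. */
static int
queue_request(long off)
{
        pthread_mutex_lock(&push_lock);
        struct req *arg = freelist;
        if (arg == NULL) {
                pthread_mutex_unlock(&push_lock);
                return (0);
        }
        freelist = arg->a_next;

        arg->a_off = off;
        arg->a_next = push_list;
        push_list = arg;

        if (freelist == NULL)
                pthread_cond_signal(&push_cv);  /* pool exhausted: push now */
        pthread_mutex_unlock(&push_lock);
        return (1);
}

int
main(void)
{
        init_pool();
        for (long off = 0; off < 10; off++) {
                if (!queue_request(off * 4096))
                        (void) printf("no request structs; skipping page\n");
        }
        return (0);
}
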
1730 1754  
1731 1755  /*
1732 1756   * Wakeup pageout to initiate i/o if push_list is not empty.
1733 1757   */
1734 1758  void
1735 1759  cv_signal_pageout()
1736 1760  {
1737 1761          if (push_list != NULL) {
1738 1762                  mutex_enter(&push_lock);
1739 1763                  cv_signal(&push_cv);
1740 1764                  mutex_exit(&push_lock);
1741 1765          }
1742 1766  }
  
     192 lines elided
  