5513 KM_NORMALPRI should be documented in kmem_alloc(9f) and kmem_cache_create(9f) man pages
14465 Present KM_NOSLEEP_LAZY as documented interface
Change-Id: I002ec28ddf390650f1fcba1ca94f6abfdb241439
    
      
    
    
          --- old/usr/src/uts/common/os/vm_pageout.c
          +++ new/usr/src/uts/common/os/vm_pageout.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2021 Oxide Computer Company
  24   24   * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29   29   * Use is subject to license terms.
  30   30   */
  31   31  
  32   32  /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  33   33  /* All Rights Reserved */
  34   34  
  35   35  /*
  36   36   * University Copyright- Copyright (c) 1982, 1986, 1988
  37   37   * The Regents of the University of California
  38   38   * All Rights Reserved
  39   39   *
  40   40   * University Acknowledgment- Portions of this document are derived from
  41   41   * software developed by the University of California, Berkeley, and its
  42   42   * contributors.
  43   43   */
  44   44  
  45   45  #include <sys/types.h>
  46   46  #include <sys/t_lock.h>
  47   47  #include <sys/param.h>
  48   48  #include <sys/buf.h>
  49   49  #include <sys/uio.h>
  50   50  #include <sys/proc.h>
  51   51  #include <sys/systm.h>
  52   52  #include <sys/mman.h>
  53   53  #include <sys/cred.h>
  54   54  #include <sys/vnode.h>
  55   55  #include <sys/vm.h>
  56   56  #include <sys/vmparam.h>
  57   57  #include <sys/vtrace.h>
  58   58  #include <sys/cmn_err.h>
  59   59  #include <sys/cpuvar.h>
  60   60  #include <sys/user.h>
  61   61  #include <sys/kmem.h>
  62   62  #include <sys/debug.h>
  63   63  #include <sys/callb.h>
  64   64  #include <sys/tnf_probe.h>
  65   65  #include <sys/mem_cage.h>
  66   66  #include <sys/time.h>
  67   67  #include <sys/stdbool.h>
  68   68  
  69   69  #include <vm/hat.h>
  70   70  #include <vm/as.h>
  71   71  #include <vm/seg.h>
  72   72  #include <vm/page.h>
  73   73  #include <vm/pvn.h>
  74   74  #include <vm/seg_kmem.h>
  75   75  
  76   76  /*
  77   77   * FREE MEMORY MANAGEMENT
  78   78   *
  79   79   * Management of the pool of free pages is a tricky business.  There are
  80   80   * several critical threshold values which constrain our allocation of new
  81   81   * pages and inform the rate of paging out of memory to swap.  These threshold
  82   82   * values, and the behaviour they induce, are described below in descending
  83   83   * order of size -- and thus increasing order of severity!
  84   84   *
  85   85   *   +---------------------------------------------------- physmem (all memory)
  86   86   *   |
  87   87   *   | Ordinarily there are no particular constraints placed on page
  88   88   *   v allocation.  The page scanner is not running and page_create_va()
  89   89   *   | will effectively grant all page requests (whether from the kernel
  90   90   *   | or from user processes) without artificial delay.
  91   91   *   |
  92   92   *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
  93   93   *   |
  94   94   *   | When we have less than "lotsfree" pages, pageout_scanner() is
  95   95   *   v signalled by schedpaging() to begin looking for pages that can
  96   96   *   | be evicted to disk to bring us back above lotsfree.  At this
  97   97   *   | stage there is still no constraint on allocation of free pages.
  98   98   *   |
  99   99   *   | For small systems, we set a lower bound of 16MB for lotsfree;
 100  100   *   v this is the natural value for a system with 1GB memory.  This is
 101  101   *   | to ensure that the pageout reserve pool contains at least 4MB
 102  102   *   | for use by ZFS.
 103  103   *   |
 104  104   *   | For systems with a large amount of memory, we constrain lotsfree
 105  105   *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 106  106   *   v at some point the required slack relates more closely to the
 107  107   *   | rate at which paging can occur than to the total amount of memory.
 108  108   *   |
 109  109   *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 110  110   *   |
 111  111   *   | When we drop below desfree, a number of kernel facilities will
 112  112   *   v wait before allocating more memory, under the assumption that
 113  113   *   | pageout or reaping will make progress and free up some memory.
 114  114   *   | This behaviour is not especially coordinated; look for comparisons
 115  115   *   | of desfree and freemem.
 116  116   *   |
 117  117   *   | In addition to various attempts at advisory caution, clock()
 118  118   *   | will wake up the thread that is ordinarily parked in sched().
 119  119   *   | This routine is responsible for the heavy-handed swapping out
 120  120   *   v of entire processes in an attempt to arrest the slide of free
 121  121   *   | memory.  See comments in sched.c for more details.
 122  122   *   |
 123  123   *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 124  124   *   |
 125  125   *   | These two separate tunables have, by default, the same value.
  
  
 126  126   *   v Various parts of the kernel use minfree to signal the need for
 127  127   *   | more aggressive reclamation of memory, and sched() is more
 128  128   *   | aggressive at swapping processes out.
 129  129   *   |
 130  130   *   | If free memory falls below throttlefree, page_create_va() will
 131  131   *   | use page_create_throttle() to begin holding most requests for
 132  132   *   | new pages while pageout and reaping free up memory.  Sleeping
 133  133   *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 134  134   *   | more memory.  Non-sleeping allocations are generally allowed to
 135  135   *   | proceed, unless their priority is explicitly lowered with
 136      - *   | KM_NORMALPRI.
       136 + *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI)).
 137  137   *   |
 138  138   *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 139  139   *   |
 140  140   *   | When we hit throttlefree, the situation is already dire.  The
 141  141   *   v system is generally paging out memory and swapping out entire
 142  142   *   | processes in order to free up memory for continued operation.
 143  143   *   |
 144  144   *   | Unfortunately, evicting memory to disk generally requires short
 145  145   *   | term use of additional memory; e.g., allocation of buffers for
 146  146   *   | storage drivers, updating maps of free and used blocks, etc.
 147  147   *   | As such, pageout_reserve is the number of pages that we keep in
 148  148   *   | special reserve for use by pageout() and sched() and by any
 149  149   *   v other parts of the kernel that need to be working for those to
 150  150   *   | make forward progress such as the ZFS I/O pipeline.
 151  151   *   |
 152  152   *   | When we are below pageout_reserve, we fail or hold any allocation
 153  153   *   | that has not explicitly requested access to the reserve pool.
 154  154   *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 155  155   *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 156  156   *   | can implicitly tap the reserve.  For more details, see the
 157  157   *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 158  158   *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 159  159   *   |
 160  160   *   +---------------------------------------------------------- no free memory
 161  161   *   |
 162  162   *   | If we have arrived here, things are very bad indeed.  It is
 163  163   *   v surprisingly difficult to tell if this condition is even fatal,
 164  164   *   | as enough memory may have been granted to pageout() and to the
 165  165   *   | ZFS I/O pipeline that requests for eviction that have already been
 166  166   *   | made will complete and free up memory some time soon.
 167  167   *   |
 168  168   *   | If free memory does not materialise, the system generally remains
 169  169   *   | deadlocked.  The pageout_deadman() below is run once per second
 170  170   *   | from clock(), seeking to limit the amount of time a single request
 171  171   *   v to page out can be blocked before the system panics to get a crash
 172  172   *   | dump and return to service.
 173  173   *   |
 174  174   *   +-------------------------------------------------------------------------
 175  175   */
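
As an illustrative aside (not part of the diff itself): the KM_NOSLEEP_LAZY flag noted in the hunk above, being (KM_NOSLEEP | KM_NORMALPRI), is intended for callers that must not block but are willing to be throttled once free memory drops below throttlefree, rather than competing with pageout for the last free pages.  A minimal sketch of such a caller follows; the function and parameter names are hypothetical.

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/kmem.h>

static int
example_alloc_scratch(size_t len, void **bufp)
{
	/*
	 * Never block, and lower the allocation priority so that it fails
	 * early under memory pressure instead of dipping into the pages
	 * reserved for pageout.
	 */
	void *buf = kmem_alloc(len, KM_NOSLEEP_LAZY);

	if (buf == NULL)
		return (ENOMEM);

	*bufp = buf;
	return (0);
}
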
 176  176  
 177  177  /*
 178  178   * The following parameters control operation of the page replacement
 179  179   * algorithm.  They are initialized to 0, and then computed at boot time based
 180  180   * on the size of the system; see setupclock().  If they are patched non-zero
 181  181   * in a loaded vmunix they are left alone and may thus be changed per system
 182  182   * using "mdb -kw" on the loaded system.
 183  183   */
 184  184  pgcnt_t         slowscan = 0;
 185  185  pgcnt_t         fastscan = 0;
 186  186  
 187  187  static pgcnt_t  handspreadpages = 0;
 188  188  
 189  189  /*
 190  190   * looppages:
 191  191   *     Cached copy of the total number of pages in the system (total_pages).
 192  192   *
 193  193   * loopfraction:
 194  194   *     Divisor used to relate fastscan to looppages in setupclock().
 195  195   */
 196  196  static uint_t   loopfraction = 2;
 197  197  static pgcnt_t  looppages;
 198  198  
 199  199  static uint_t   min_percent_cpu = 4;
 200  200  static uint_t   max_percent_cpu = 80;
 201  201  static pgcnt_t  maxfastscan = 0;
 202  202  static pgcnt_t  maxslowscan = 100;
 203  203  
 204  204  #define         MEGABYTES               (1024ULL * 1024ULL)
 205  205  
 206  206  /*
 207  207   * pageout_threshold_style:
 208  208   *     set to 1 to use the previous default threshold size calculation;
 209  209   *     i.e., each threshold is half of the next largest value.
 210  210   */
 211  211  uint_t          pageout_threshold_style = 0;
 212  212  
 213  213  /*
 214  214   * The operator may override these tunables to request a different minimum or
 215  215   * maximum lotsfree value, or to change the divisor we use for automatic
 216  216   * sizing.
 217  217   *
 218  218   * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 219  219   * minimum and maximum are specified in bytes, rather than pages; a zero value
 220  220   * means the default values (below) are used.
 221  221   */
 222  222  uint_t          lotsfree_fraction = 64;
 223  223  pgcnt_t         lotsfree_min = 0;
 224  224  pgcnt_t         lotsfree_max = 0;
 225  225  
 226  226  #define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
 227  227  #define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)
 228  228  
 229  229  /*
 230  230   * If these tunables are set to non-zero values in /etc/system, and provided
 231  231   * the value is not larger than the threshold above, the specified value will
 232  232   * be used directly without any additional calculation or adjustment.  The boot
 233  233   * time value of these overrides is preserved in the "clockinit" struct.  More
 234  234   * detail is available in the comment at the top of the file.
 235  235   */
 236  236  pgcnt_t         maxpgio = 0;
 237  237  pgcnt_t         minfree = 0;
 238  238  pgcnt_t         desfree = 0;
 239  239  pgcnt_t         lotsfree = 0;
 240  240  pgcnt_t         needfree = 0;
 241  241  pgcnt_t         throttlefree = 0;
 242  242  pgcnt_t         pageout_reserve = 0;
 243  243  
 244  244  pgcnt_t         deficit;
 245  245  pgcnt_t         nscan;
 246  246  pgcnt_t         desscan;
 247  247  
 248  248  /*
 249  249   * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
 250  250   * number of nanoseconds in each wakeup cycle that gives the equivalent of some
 251  251   * underlying %CPU duty cycle.
 252  252   *
 253  253   * min_pageout_nsec:
 254  254   *     nanoseconds/wakeup equivalent of min_percent_cpu.
 255  255   *
 256  256   * max_pageout_nsec:
 257  257   *     nanoseconds/wakeup equivalent of max_percent_cpu.
 258  258   *
 259  259   * pageout_nsec:
 260  260   *     Number of nanoseconds budgeted for each wakeup cycle.
 261  261   *     Computed each time around by schedpaging().
 262  262   *     Varies between min_pageout_nsec and max_pageout_nsec,
 263  263   *     depending on memory pressure.
 264  264   */
 265  265  static hrtime_t min_pageout_nsec;
 266  266  static hrtime_t max_pageout_nsec;
 267  267  static hrtime_t pageout_nsec;
 268  268  
 269  269  static uint_t   reset_hands;
 270  270  
 271  271  #define PAGES_POLL_MASK 1023
 272  272  
 273  273  /*
 274  274   * pageout_sample_lim:
 275  275   *     The limit on the number of samples needed to establish a value for new
 276  276   *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 277  277   *     handspreadpages.
 278  278   *
 279  279   * pageout_sample_cnt:
 280  280   *     Current sample number.  Once the sample gets large enough, set new
 281  281   *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 282  282   *
 283  283   * pageout_sample_pages:
 284  284   *     The accumulated number of pages scanned during sampling.
 285  285   *
 286  286   * pageout_sample_etime:
 287  287   *     The accumulated nanoseconds for the sample.
 288  288   *
 289  289   * pageout_rate:
 290  290   *     Rate in pages/nanosecond, computed at the end of sampling.
 291  291   *
 292  292   * pageout_new_spread:
 293  293   *     Initially zero while the system scan rate is measured by
 294  294   *     pageout_scanner(), which then sets this value once per system boot after
 295  295   *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 296  296   *     new value is used for fastscan and handspreadpages.
 297  297   *
 298  298   * sample_start, sample_end:
 299  299   *     The hrtime at which the last pageout_scanner() sample began and ended.
 300  300   */
 301  301  typedef hrtime_t hrrate_t;
 302  302  
 303  303  static uint64_t pageout_sample_lim = 4;
 304  304  static uint64_t pageout_sample_cnt = 0;
 305  305  static pgcnt_t  pageout_sample_pages = 0;
 306  306  static hrrate_t pageout_rate = 0;
 307  307  static pgcnt_t  pageout_new_spread = 0;
 308  308  
 309  309  static hrtime_t pageout_cycle_nsec;
 310  310  static hrtime_t sample_start, sample_end;
 311  311  static hrtime_t pageout_sample_etime = 0;
 312  312  
 313  313  /*
 314  314   * Record number of times a pageout_scanner() wakeup cycle finished because it
 315  315   * timed out (exceeded its CPU budget), rather than because it visited
 316  316   * its budgeted number of pages.
 317  317   */
 318  318  uint64_t        pageout_timeouts = 0;
 319  319  
 320  320  #ifdef VM_STATS
 321  321  static struct pageoutvmstats_str {
 322  322          ulong_t checkpage[3];
 323  323  } pageoutvmstats;
 324  324  #endif /* VM_STATS */
 325  325  
 326  326  /*
 327  327   * Threads waiting for free memory use this condition variable and lock until
 328  328   * memory becomes available.
 329  329   */
 330  330  kmutex_t        memavail_lock;
 331  331  kcondvar_t      memavail_cv;
 332  332  
 333  333  typedef enum pageout_hand {
 334  334          POH_FRONT = 1,
 335  335          POH_BACK,
 336  336  } pageout_hand_t;
 337  337  
 338  338  typedef enum {
 339  339          CKP_INELIGIBLE,
 340  340          CKP_NOT_FREED,
 341  341          CKP_FREED,
 342  342  } checkpage_result_t;
 343  343  
 344  344  static checkpage_result_t checkpage(page_t *, pageout_hand_t);
 345  345  
 346  346  static struct clockinit {
 347  347          bool ci_init;
 348  348          pgcnt_t ci_lotsfree_min;
 349  349          pgcnt_t ci_lotsfree_max;
 350  350          pgcnt_t ci_lotsfree;
 351  351          pgcnt_t ci_desfree;
 352  352          pgcnt_t ci_minfree;
 353  353          pgcnt_t ci_throttlefree;
 354  354          pgcnt_t ci_pageout_reserve;
 355  355          pgcnt_t ci_maxpgio;
 356  356          pgcnt_t ci_maxfastscan;
 357  357          pgcnt_t ci_fastscan;
 358  358          pgcnt_t ci_slowscan;
 359  359          pgcnt_t ci_handspreadpages;
 360  360  } clockinit = { .ci_init = false };
 361  361  
 362  362  static pgcnt_t
 363  363  clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
 364  364  {
 365  365          if (value < minimum) {
 366  366                  return (minimum);
 367  367          } else if (value > maximum) {
 368  368                  return (maximum);
 369  369          } else {
 370  370                  return (value);
 371  371          }
 372  372  }
 373  373  
 374  374  static pgcnt_t
 375  375  tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
 376  376  {
 377  377          if (initval == 0 || initval >= initval_ceiling) {
 378  378                  return (defval);
 379  379          } else {
 380  380                  return (initval);
 381  381          }
 382  382  }
 383  383  
 384  384  /*
 385  385   * Set up the paging constants for the clock algorithm used by
 386  386   * pageout_scanner(), and by the virtual memory system overall.  See the
 387  387   * comments at the top of this file for more information about the threshold
 388  388   * values and system responses to memory pressure.
 389  389   *
 390  390   * This routine is called once by main() at startup, after the initial size of
 391  391   * physical memory is determined.  It may be called again later if memory is
 392  392   * added to or removed from the system, or if new measurements of the page scan
 393  393   * rate become available.
 394  394   */
 395  395  void
 396  396  setupclock(void)
 397  397  {
 398  398          pgcnt_t defval;
 399  399          bool half = (pageout_threshold_style == 1);
 400  400          bool recalc = true;
 401  401  
 402  402          looppages = total_pages;
 403  403  
 404  404          /*
 405  405           * The operator may have provided specific values for some of the
 406  406           * tunables via /etc/system.  On our first call, we preserve those
 407  407           * values so that they can be used for subsequent recalculations.
 408  408           *
 409  409           * A value of zero for any tunable means we will use the default
 410  410           * sizing.
 411  411           */
 412  412          if (!clockinit.ci_init) {
 413  413                  clockinit.ci_init = true;
 414  414  
 415  415                  clockinit.ci_lotsfree_min = lotsfree_min;
 416  416                  clockinit.ci_lotsfree_max = lotsfree_max;
 417  417                  clockinit.ci_lotsfree = lotsfree;
 418  418                  clockinit.ci_desfree = desfree;
 419  419                  clockinit.ci_minfree = minfree;
 420  420                  clockinit.ci_throttlefree = throttlefree;
 421  421                  clockinit.ci_pageout_reserve = pageout_reserve;
 422  422                  clockinit.ci_maxpgio = maxpgio;
 423  423                  clockinit.ci_maxfastscan = maxfastscan;
 424  424                  clockinit.ci_fastscan = fastscan;
 425  425                  clockinit.ci_slowscan = slowscan;
 426  426                  clockinit.ci_handspreadpages = handspreadpages;
 427  427  
 428  428                  /*
 429  429                   * The first call does not trigger a recalculation, only
 430  430                   * subsequent calls.
 431  431                   */
 432  432                  recalc = false;
 433  433          }
 434  434  
 435  435          /*
 436  436           * Configure paging threshold values.  For more details on what each
 437  437           * threshold signifies, see the comments at the top of this file.
 438  438           */
 439  439          lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
 440  440              btop(LOTSFREE_MAX_DEFAULT));
 441  441          lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
 442  442              btop(LOTSFREE_MIN_DEFAULT));
 443  443  
 444  444          lotsfree = tune(clockinit.ci_lotsfree, looppages,
 445  445              clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
 446  446  
 447  447          desfree = tune(clockinit.ci_desfree, lotsfree,
 448  448              lotsfree / 2);
 449  449  
 450  450          minfree = tune(clockinit.ci_minfree, desfree,
 451  451              half ? desfree / 2 : 3 * desfree / 4);
 452  452  
 453  453          throttlefree = tune(clockinit.ci_throttlefree, desfree,
 454  454              minfree);
 455  455  
 456  456          pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
 457  457              half ? throttlefree / 2 : 3 * throttlefree / 4);
 458  458  
 459  459          /*
 460  460           * Maxpgio thresholds how much paging is acceptable.
 461  461           * This figures that 2/3 busy on an arm is all that is
 462  462           * tolerable for paging.  We assume one operation per disk rev.
 463  463           *
 464  464           * XXX - Does not account for multiple swap devices.
 465  465           */
 466  466          if (clockinit.ci_maxpgio == 0) {
 467  467                  maxpgio = (DISKRPM * 2) / 3;
 468  468          } else {
 469  469                  maxpgio = clockinit.ci_maxpgio;
 470  470          }
 471  471  
 472  472          /*
 473  473           * The clock scan rate varies between fastscan and slowscan
 474  474           * based on the amount of free memory available.  Fastscan
  475  475           * rate should be set based on the number of pages that can be
 476  476           * scanned per sec using ~10% of processor time.  Since this
 477  477           * value depends on the processor, MMU, Mhz etc., it is
 478  478           * difficult to determine it in a generic manner for all
 479  479           * architectures.
 480  480           *
 481  481           * Instead of trying to determine the number of pages scanned
 482  482           * per sec for every processor, fastscan is set to be the smaller
 483  483           * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
 484  484           * time is limited to ~4% of processor time.
 485  485           *
 486  486           * Setting fastscan to be 1/2 of memory allows pageout to scan
 487  487           * all of memory in ~2 secs.  This implies that user pages not
 488  488           * accessed within 1 sec (assuming, handspreadpages == fastscan)
 489  489           * can be reclaimed when free memory is very low.  Stealing pages
 490  490           * not accessed within 1 sec seems reasonable and ensures that
 491  491           * active user processes don't thrash.
 492  492           *
 493  493           * Smaller values of fastscan result in scanning fewer pages
 494  494           * every second and consequently pageout may not be able to free
 495  495           * sufficient memory to maintain the minimum threshold.  Larger
 496  496           * values of fastscan result in scanning a lot more pages which
 497  497           * could lead to thrashing and higher CPU usage.
 498  498           *
 499  499           * Fastscan needs to be limited to a maximum value and should not
 500  500           * scale with memory to prevent pageout from consuming too much
 501  501           * time for scanning on slow CPU's and avoid thrashing, as a
 502  502           * result of scanning too many pages, on faster CPU's.
 503  503           * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
 504  504           * (the upper bound for fastscan) based on the average number
 505  505           * of pages that can potentially be scanned in ~1 sec (using ~4%
 506  506           * of the CPU) on some of the following machines that currently
 507  507           * run Solaris 2.x:
 508  508           *
 509  509           *                      average memory scanned in ~1 sec
 510  510           *
 511  511           *      25 Mhz SS1+:            23 Meg
 512  512           *      LX:                     37 Meg
 513  513           *      50 Mhz SC2000:          68 Meg
 514  514           *
 515  515           *      40 Mhz 486:             26 Meg
 516  516           *      66 Mhz 486:             42 Meg
 517  517           *
 518  518           * When free memory falls just below lotsfree, the scan rate
 519  519           * goes from 0 to slowscan (i.e., pageout starts running).  This
 520  520           * transition needs to be smooth and is achieved by ensuring that
 521  521           * pageout scans a small number of pages to satisfy the transient
 522  522           * memory demand.  This is set to not exceed 100 pages/sec (25 per
  523  523           * wakeup) since scanning that many pages has no noticeable impact
 524  524           * on system performance.
 525  525           *
 526  526           * In addition to setting fastscan and slowscan, pageout is
 527  527           * limited to using ~4% of the CPU.  This results in increasing
 528  528           * the time taken to scan all of memory, which in turn means that
 529  529           * user processes have a better opportunity of preventing their
 530  530           * pages from being stolen.  This has a positive effect on
 531  531           * interactive and overall system performance when memory demand
 532  532           * is high.
 533  533           *
 534  534           * Thus, the rate at which pages are scanned for replacement will
 535  535           * vary linearly between slowscan and the number of pages that
 536  536           * can be scanned using ~4% of processor time instead of varying
 537  537           * linearly between slowscan and fastscan.
 538  538           *
 539  539           * Also, the processor time used by pageout will vary from ~1%
 540  540           * at slowscan to ~4% at fastscan instead of varying between
 541  541           * ~1% at slowscan and ~10% at fastscan.
 542  542           *
 543  543           * The values chosen for the various VM parameters (fastscan,
 544  544           * handspreadpages, etc) are not universally true for all machines,
 545  545           * but appear to be a good rule of thumb for the machines we've
 546  546           * tested.  They have the following ranges:
 547  547           *
 548  548           *      cpu speed:      20 to 70 Mhz
 549  549           *      page size:      4K to 8K
 550  550           *      memory size:    16M to 5G
 551  551           *      page scan rate: 4000 - 17400 4K pages per sec
 552  552           *
 553  553           * The values need to be re-examined for machines which don't
 554  554           * fall into the various ranges (e.g., slower or faster CPUs,
 555  555           * smaller or larger pagesizes etc) shown above.
 556  556           *
 557  557           * On an MP machine, pageout is often unable to maintain the
 558  558           * minimum paging thresholds under heavy load.  This is due to
 559  559           * the fact that user processes running on other CPU's can be
 560  560           * dirtying memory at a much faster pace than pageout can find
 561  561           * pages to free.  The memory demands could be met by enabling
 562  562           * more than one CPU to run the clock algorithm in such a manner
 563  563           * that the various clock hands don't overlap.  This also makes
 564  564           * it more difficult to determine the values for fastscan, slowscan
 565  565           * and handspreadpages.
 566  566           *
 567  567           * The swapper is currently used to free up memory when pageout
 568  568           * is unable to meet memory demands by swapping out processes.
 569  569           * In addition to freeing up memory, swapping also reduces the
 570  570           * demand for memory by preventing user processes from running
 571  571           * and thereby consuming memory.
 572  572           */
 573  573          if (clockinit.ci_maxfastscan == 0) {
 574  574                  if (pageout_new_spread != 0) {
 575  575                          maxfastscan = pageout_new_spread;
 576  576                  } else {
 577  577                          maxfastscan = MAXHANDSPREADPAGES;
 578  578                  }
 579  579          } else {
 580  580                  maxfastscan = clockinit.ci_maxfastscan;
 581  581          }
 582  582  
 583  583          if (clockinit.ci_fastscan == 0) {
 584  584                  fastscan = MIN(looppages / loopfraction, maxfastscan);
 585  585          } else {
 586  586                  fastscan = clockinit.ci_fastscan;
 587  587          }
 588  588  
 589  589          if (fastscan > looppages / loopfraction) {
 590  590                  fastscan = looppages / loopfraction;
 591  591          }
 592  592  
 593  593          /*
 594  594           * Set slow scan time to 1/10 the fast scan time, but
 595  595           * not to exceed maxslowscan.
 596  596           */
 597  597          if (clockinit.ci_slowscan == 0) {
 598  598                  slowscan = MIN(fastscan / 10, maxslowscan);
 599  599          } else {
 600  600                  slowscan = clockinit.ci_slowscan;
 601  601          }
 602  602  
 603  603          if (slowscan > fastscan / 2) {
 604  604                  slowscan = fastscan / 2;
 605  605          }
 606  606  
 607  607          /*
 608  608           * Handspreadpages is distance (in pages) between front and back
 609  609           * pageout daemon hands.  The amount of time to reclaim a page
 610  610           * once pageout examines it increases with this distance and
 611  611           * decreases as the scan rate rises. It must be < the amount
 612  612           * of pageable memory.
 613  613           *
 614  614           * Since pageout is limited to ~4% of the CPU, setting handspreadpages
 615  615           * to be "fastscan" results in the front hand being a few secs
 616  616           * (varies based on the processor speed) ahead of the back hand
 617  617           * at fastscan rates.  This distance can be further reduced, if
 618  618           * necessary, by increasing the processor time used by pageout
  619  619           * to be more than ~4% and preferably not more than ~10%.
 620  620           *
 621  621           * As a result, user processes have a much better chance of
 622  622           * referencing their pages before the back hand examines them.
 623  623           * This also significantly lowers the number of reclaims from
 624  624           * the freelist since pageout does not end up freeing pages which
 625  625           * may be referenced a sec later.
 626  626           */
 627  627          if (clockinit.ci_handspreadpages == 0) {
 628  628                  handspreadpages = fastscan;
 629  629          } else {
 630  630                  handspreadpages = clockinit.ci_handspreadpages;
 631  631          }
 632  632  
 633  633          /*
 634  634           * Make sure that back hand follows front hand by at least
 635  635           * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
 636  636           * back hand to look at a page during the same wakeup of the pageout
 637  637           * daemon in which the front hand cleared its ref bit.
 638  638           */
 639  639          if (handspreadpages >= looppages) {
 640  640                  handspreadpages = looppages - 1;
 641  641          }
 642  642  
 643  643          /*
 644  644           * If we have been called to recalculate the parameters, set a flag to
 645  645           * re-evaluate the clock hand pointers.
 646  646           */
 647  647          if (recalc) {
 648  648                  reset_hands = 1;
 649  649          }
 650  650  }
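
To make the default cascade concrete, take a hypothetical 16 GB machine with 4 KB pages (figures chosen for illustration, not taken from the change): physmem is 4,194,304 pages, so lotsfree = physmem / 64 = 65,536 pages (256 MB), comfortably inside the 16 MB..2 GB clamp; desfree = lotsfree / 2 = 128 MB; minfree and throttlefree = 3/4 of desfree = 96 MB; and pageout_reserve = 3/4 of throttlefree = 72 MB.  These agree with the approximate percentages (1.56%, 0.78%, 0.59% and 0.44% of physmem) shown in the threshold diagram at the top of the file.
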
 651  651  
 652  652  /*
 653  653   * Pageout scheduling.
 654  654   *
 655  655   * Schedpaging controls the rate at which the page out daemon runs by
 656  656   * setting the global variables nscan and desscan SCHEDPAGING_HZ
 657  657   * times a second.  Nscan records the number of pages pageout has examined
 658  658   * in its current pass; schedpaging() resets this value to zero each time
 659  659   * it runs.  Desscan records the number of pages pageout should examine
 660  660   * in its next pass; schedpaging() sets this value based on the amount of
 661  661   * currently available memory.
 662  662   */
 663  663  #define SCHEDPAGING_HZ  4
 664  664  
 665  665  static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */
 666  666  
 667  667  /*
 668  668   * Pool of available async pageout putpage requests.
 669  669   */
 670  670  static struct async_reqs *push_req;
 671  671  static struct async_reqs *req_freelist; /* available req structs */
 672  672  static struct async_reqs *push_list;    /* pending reqs */
 673  673  static kmutex_t push_lock;              /* protects req pool */
 674  674  static kcondvar_t push_cv;
 675  675  
 676  676  /*
 677  677   * If pageout() is stuck on a single push for this many seconds,
 678  678   * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 679  679   * to 0, the deadman will have no effect.
 680  680   *
 681  681   * Note that we are only looking for stalls in the calls that pageout() makes
 682  682   * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 683  683   * I/O, which should not take long unless the underlying strategy call blocks
 684  684   * indefinitely for memory.  The actual I/O request happens (or fails) later.
 685  685   */
 686  686  uint_t pageout_deadman_seconds = 90;
 687  687  
 688  688  static uint_t pageout_stucktime = 0;
 689  689  static bool pageout_pushing = false;
 690  690  static uint64_t pageout_pushcount = 0;
 691  691  static uint64_t pageout_pushcount_seen = 0;
 692  692  
 693  693  static int async_list_size = 256;       /* number of async request structs */
 694  694  
 695  695  static void pageout_scanner(void);
 696  696  
 697  697  /*
 698  698   * If a page is being shared more than "po_share" times
 699  699   * then leave it alone- don't page it out.
 700  700   */
 701  701  #define MIN_PO_SHARE    (8)
 702  702  #define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
 703  703  ulong_t po_share = MIN_PO_SHARE;
 704  704  
 705  705  /*
 706  706   * Schedule rate for paging.
 707  707   * Rate is linear interpolation between
 708  708   * slowscan with lotsfree and fastscan when out of memory.
 709  709   */
 710  710  static void
 711  711  schedpaging(void *arg)
 712  712  {
 713  713          spgcnt_t vavail;
 714  714  
 715  715          if (freemem < lotsfree + needfree + kmem_reapahead)
 716  716                  kmem_reap();
 717  717  
 718  718          if (freemem < lotsfree + needfree)
 719  719                  seg_preap();
 720  720  
 721  721          if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
 722  722                  kcage_cageout_wakeup();
 723  723  
 724  724          if (mutex_tryenter(&pageout_mutex)) {
 725  725                  /* pageout() not running */
 726  726                  nscan = 0;
 727  727                  vavail = freemem - deficit;
 728  728                  if (pageout_new_spread != 0)
 729  729                          vavail -= needfree;
 730  730                  if (vavail < 0)
 731  731                          vavail = 0;
 732  732                  if (vavail > lotsfree)
 733  733                          vavail = lotsfree;
 734  734  
 735  735                  /*
 736  736                   * Fix for 1161438 (CRS SPR# 73922).  All variables
 737  737                   * in the original calculation for desscan were 32 bit signed
 738  738                   * ints.  As freemem approaches 0x0 on a system with 1 Gig or
 739  739                   * more of memory, the calculation can overflow.  When this
 740  740                   * happens, desscan becomes negative and pageout_scanner()
 741  741                   * stops paging out.
 742  742                   */
 743  743                  if (needfree > 0 && pageout_new_spread == 0) {
 744  744                          /*
 745  745                           * If we've not yet collected enough samples to
 746  746                           * calculate a spread, use the old logic of kicking
 747  747                           * into high gear anytime needfree is non-zero.
 748  748                           */
 749  749                          desscan = fastscan / SCHEDPAGING_HZ;
 750  750                  } else {
 751  751                          /*
 752  752                           * Once we've calculated a spread based on system
 753  753                           * memory and usage, just treat needfree as another
 754  754                           * form of deficit.
 755  755                           */
 756  756                          spgcnt_t faststmp, slowstmp, result;
 757  757  
 758  758                          slowstmp = slowscan * vavail;
 759  759                          faststmp = fastscan * (lotsfree - vavail);
 760  760                          result = (slowstmp + faststmp) /
 761  761                              nz(lotsfree) / SCHEDPAGING_HZ;
 762  762                          desscan = (pgcnt_t)result;
 763  763                  }
 764  764  
 765  765                  pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
 766  766                      (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
 767  767  
 768  768                  if (freemem < lotsfree + needfree ||
 769  769                      pageout_sample_cnt < pageout_sample_lim) {
 770  770                          /*
 771  771                           * Either we need more memory, or we still need to
 772  772                           * measure the average scan rate.  Wake the scanner.
 773  773                           */
 774  774                          DTRACE_PROBE(pageout__cv__signal);
 775  775                          cv_signal(&proc_pageout->p_cv);
 776  776                  } else {
 777  777                          /*
 778  778                           * There are enough free pages, no need to
 779  779                           * kick the scanner thread.  And next time
 780  780                           * around, keep more of the `highly shared'
 781  781                           * pages.
 782  782                           */
 783  783                          cv_signal_pageout();
 784  784                          if (po_share > MIN_PO_SHARE) {
 785  785                                  po_share >>= 1;
 786  786                          }
 787  787                  }
 788  788                  mutex_exit(&pageout_mutex);
 789  789          }
 790  790  
 791  791          /*
 792  792           * Signal threads waiting for available memory.
 793  793           * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
  794  794           * in this case it is not needed - the waiters will be woken up during
 795  795           * the next invocation of this function.
 796  796           */
 797  797          if (kmem_avail() > 0)
 798  798                  cv_broadcast(&memavail_cv);
 799  799  
 800  800          (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
 801  801  }
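
A worked example of the interpolation above, using hypothetical but plausible figures: with lotsfree = 65,536 pages, slowscan = 100, fastscan = 16,384 (64 MB of 4 KB pages, the MAXHANDSPREADPAGES value mentioned later in this file) and vavail at exactly half of lotsfree, desscan = (100 * 32768 + 16384 * 32768) / 65536 / 4 = 2,060 pages per wakeup, or roughly 8,240 pages per second, midway between slowscan and fastscan as intended.  The duty cycle scales the same way: with min_percent_cpu = 4 and max_percent_cpu = 80, min_pageout_nsec and max_pageout_nsec work out to 10 ms and 200 ms per quarter-second wakeup, so the same halfway point budgets about 105 ms of scanning.
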
 802  802  
 803  803  pgcnt_t         pushes;
 804  804  ulong_t         push_list_size;         /* # of requests on pageout queue */
 805  805  
 806  806  /*
 807  807   * Paging out should always be enabled.  This tunable exists to hold pageout
 808  808   * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 809  809   * sleep each time it is woken by schedpaging().
 810  810   */
 811  811  uint_t dopageout = 1;
 812  812  
 813  813  /*
 814  814   * The page out daemon, which runs as process 2.
 815  815   *
 816  816   * As long as there are at least lotsfree pages,
 817  817   * this process is not run.  When the number of free
 818  818   * pages stays in the range desfree to lotsfree,
 819  819   * this daemon runs through the pages in the loop
 820  820   * at a rate determined in schedpaging().  Pageout manages
 821  821   * two hands on the clock.  The front hand moves through
 822  822   * memory, clearing the reference bit,
 823  823   * and stealing pages from procs that are over maxrss.
 824  824   * The back hand travels a distance behind the front hand,
 825  825   * freeing the pages that have not been referenced in the time
 826  826   * since the front hand passed.  If modified, they are pushed to
 827  827   * swap before being freed.
 828  828   *
 829  829   * There are 2 threads that act on behalf of the pageout process.
 830  830   * One thread scans pages (pageout_scanner) and frees them up if
 831  831   * they don't require any VOP_PUTPAGE operation. If a page must be
 832  832   * written back to its backing store, the request is put on a list
 833  833   * and the other (pageout) thread is signaled. The pageout thread
 834  834   * grabs VOP_PUTPAGE requests from the list, and processes them.
 835  835   * Some filesystems may require resources for the VOP_PUTPAGE
 836  836   * operations (like memory) and hence can block the pageout
 837  837   * thread, but the scanner thread can still operate. There is still
 838  838   * no guarantee that memory deadlocks cannot occur.
 839  839   *
 840  840   * For now, this thing is in very rough form.
 841  841   */
 842  842  void
 843  843  pageout()
 844  844  {
 845  845          struct async_reqs *arg;
 846  846          pri_t pageout_pri;
 847  847          int i;
 848  848          pgcnt_t max_pushes;
 849  849          callb_cpr_t cprinfo;
 850  850  
 851  851          proc_pageout = ttoproc(curthread);
 852  852          proc_pageout->p_cstime = 0;
 853  853          proc_pageout->p_stime =  0;
 854  854          proc_pageout->p_cutime =  0;
 855  855          proc_pageout->p_utime = 0;
 856  856          bcopy("pageout", PTOU(curproc)->u_psargs, 8);
 857  857          bcopy("pageout", PTOU(curproc)->u_comm, 7);
 858  858  
 859  859          /*
 860  860           * Create pageout scanner thread
 861  861           */
 862  862          mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
 863  863          mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
 864  864  
 865  865          /*
 866  866           * Allocate and initialize the async request structures
 867  867           * for pageout.
 868  868           */
 869  869          push_req = (struct async_reqs *)
 870  870              kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
 871  871  
 872  872          req_freelist = push_req;
 873  873          for (i = 0; i < async_list_size - 1; i++) {
 874  874                  push_req[i].a_next = &push_req[i + 1];
 875  875          }
 876  876  
 877  877          pageout_pri = curthread->t_pri;
 878  878  
 879  879          /* Create the pageout scanner thread. */
 880  880          (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
 881  881              pageout_pri - 1);
 882  882  
 883  883          /*
 884  884           * kick off pageout scheduler.
 885  885           */
 886  886          schedpaging(NULL);
 887  887  
 888  888          /*
 889  889           * Create kernel cage thread.
 890  890           * The kernel cage thread is started under the pageout process
 891  891           * to take advantage of the less restricted page allocation
 892  892           * in page_create_throttle().
 893  893           */
 894  894          kcage_cageout_init();
 895  895  
 896  896          /*
 897  897           * Limit pushes to avoid saturating pageout devices.
 898  898           */
 899  899          max_pushes = maxpgio / SCHEDPAGING_HZ;
 900  900          CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
 901  901  
 902  902          for (;;) {
 903  903                  mutex_enter(&push_lock);
 904  904  
 905  905                  while ((arg = push_list) == NULL || pushes > max_pushes) {
 906  906                          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 907  907                          cv_wait(&push_cv, &push_lock);
 908  908                          pushes = 0;
 909  909                          CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
 910  910                  }
 911  911                  push_list = arg->a_next;
 912  912                  arg->a_next = NULL;
 913  913                  pageout_pushing = true;
 914  914                  mutex_exit(&push_lock);
 915  915  
 916  916                  if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
 917  917                      arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
 918  918                          pushes++;
 919  919                  }
 920  920  
 921  921                  /* vp held by checkpage() */
 922  922                  VN_RELE(arg->a_vp);
 923  923  
 924  924                  mutex_enter(&push_lock);
 925  925                  pageout_pushing = false;
 926  926                  pageout_pushcount++;
 927  927                  arg->a_next = req_freelist;     /* back on freelist */
 928  928                  req_freelist = arg;
 929  929                  push_list_size--;
 930  930                  mutex_exit(&push_lock);
 931  931          }
 932  932  }
 933  933  
 934  934  /*
 935  935   * Kernel thread that scans pages looking for ones to free
 936  936   */
 937  937  static void
 938  938  pageout_scanner(void)
 939  939  {
 940  940          struct page *fronthand, *backhand;
 941  941          uint_t laps;
 942  942          callb_cpr_t cprinfo;
 943  943          pgcnt_t nscan_limit;
 944  944          pgcnt_t pcount;
 945  945          bool sampling;
 946  946  
 947  947          CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 948  948          mutex_enter(&pageout_mutex);
 949  949  
 950  950          /*
 951  951           * The restart case does not attempt to point the hands at roughly
 952  952           * the right point on the assumption that after one circuit things
 953  953           * will have settled down, and restarts shouldn't be that often.
 954  954           */
 955  955  
 956  956          /*
 957  957           * Set the two clock hands to be separated by a reasonable amount,
 958  958           * but no more than 360 degrees apart.
 959  959           */
 960  960          backhand = page_first();
 961  961          if (handspreadpages >= total_pages) {
 962  962                  fronthand = page_nextn(backhand, total_pages - 1);
 963  963          } else {
 964  964                  fronthand = page_nextn(backhand, handspreadpages);
 965  965          }
 966  966  
 967  967          /*
 968  968           * Establish the minimum and maximum length of time to be spent
 969  969           * scanning pages per wakeup, limiting the scanner duty cycle.  The
 970  970           * input percentage values (0-100) must be converted to a fraction of
 971  971           * the number of nanoseconds in a second of wall time, then further
 972  972           * scaled down by the number of scanner wakeups in a second:
 973  973           */
 974  974          min_pageout_nsec = MAX(1,
 975  975              NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
 976  976          max_pageout_nsec = MAX(min_pageout_nsec,
 977  977              NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
 978  978  
 979  979  loop:
 980  980          cv_signal_pageout();
 981  981  
 982  982          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 983  983          cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 984  984          CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 985  985  
 986  986          /*
 987  987           * Check if pageout has been disabled for debugging purposes:
 988  988           */
 989  989          if (!dopageout) {
 990  990                  goto loop;
 991  991          }
 992  992  
 993  993          /*
 994  994           * One may reset the clock hands for debugging purposes.  Hands will
 995  995           * also be reset if memory is added to or removed from the system.
 996  996           */
 997  997          if (reset_hands) {
 998  998                  reset_hands = 0;
 999  999  
1000 1000                  backhand = page_first();
1001 1001                  if (handspreadpages >= total_pages) {
1002 1002                          fronthand = page_nextn(backhand, total_pages - 1);
1003 1003                  } else {
1004 1004                          fronthand = page_nextn(backhand, handspreadpages);
1005 1005                  }
1006 1006          }
1007 1007  
1008 1008          CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
1009 1009  
1010 1010          /*
1011 1011           * Keep track of the number of times we have scanned all the way around
1012 1012           * the loop:
1013 1013           */
1014 1014          laps = 0;
1015 1015  
1016 1016          DTRACE_PROBE(pageout__start);
1017 1017  
1018 1018          /*
1019 1019           * Track the number of pages visited during this scan so that we can
1020 1020           * periodically measure our duty cycle.
1021 1021           */
1022 1022          pcount = 0;
1023 1023  
1024 1024          if (pageout_sample_cnt < pageout_sample_lim) {
1025 1025                  /*
1026 1026                   * We need to measure the rate at which the system is able to
1027 1027                   * scan pages of memory.  Each of these initial samples is a
1028 1028                   * scan of all system memory, regardless of whether or not we
1029 1029                   * are experiencing memory pressure.
1030 1030                   */
1031 1031                  nscan_limit = total_pages;
1032 1032                  sampling = true;
1033 1033          } else {
1034 1034                  nscan_limit = desscan;
1035 1035                  sampling = false;
1036 1036          }
1037 1037  
1038 1038          sample_start = gethrtime();
1039 1039  
1040 1040          /*
1041 1041           * Scan the appropriate number of pages for a single duty cycle.
1042 1042           */
1043 1043          while (nscan < nscan_limit) {
1044 1044                  checkpage_result_t rvfront, rvback;
1045 1045  
1046 1046                  if (!sampling && freemem >= lotsfree + needfree) {
1047 1047                          /*
1048 1048                           * We are not sampling and enough memory has become
1049 1049                           * available that scanning is no longer required.
1050 1050                           */
1051 1051                          break;
1052 1052                  }
1053 1053  
1054 1054                  /*
1055 1055                   * Periodically check to see if we have exceeded the CPU duty
1056 1056                   * cycle for a single wakeup.
1057 1057                   */
1058 1058                  if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
1059 1059                          pageout_cycle_nsec = gethrtime() - sample_start;
1060 1060                          if (pageout_cycle_nsec >= pageout_nsec) {
1061 1061                                  ++pageout_timeouts;
1062 1062                                  break;
1063 1063                          }
1064 1064                  }
1065 1065  
1066 1066                  /*
1067 1067                   * If checkpage manages to add a page to the free list,
1068 1068                   * we give ourselves another couple of trips around the loop.
1069 1069                   */
1070 1070                  if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1071 1071                          laps = 0;
1072 1072                  }
1073 1073                  if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1074 1074                          laps = 0;
1075 1075                  }
1076 1076  
1077 1077                  ++pcount;
1078 1078  
1079 1079                  /*
1080 1080                   * Protected by pageout_mutex instead of cpu_stat_lock:
1081 1081                   */
1082 1082                  CPU_STATS_ADDQ(CPU, vm, scan, 1);
1083 1083  
1084 1084                  /*
1085 1085                   * Don't include ineligible pages in the number scanned.
1086 1086                   */
1087 1087                  if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1088 1088                          nscan++;
1089 1089                  }
1090 1090  
1091 1091                  backhand = page_next(backhand);
1092 1092                  fronthand = page_next(fronthand);
1093 1093  
1094 1094                  /*
1095 1095                   * The front hand has wrapped around to the first page in the
1096 1096                   * loop.
1097 1097                   */
1098 1098                  if (fronthand == page_first()) {
1099 1099                          laps++;
1100 1100                          DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps);
1101 1101  
1102 1102                          /*
1103 1103                           * Protected by pageout_mutex instead of cpu_stat_lock:
1104 1104                           */
1105 1105                          CPU_STATS_ADDQ(CPU, vm, rev, 1);
1106 1106  
1107 1107                          if (laps > 1) {
1108 1108                                  /*
1109 1109                                   * Extremely unlikely, but it happens.
1110 1110                                   * We went around the loop at least once
1111 1111                                   * and didn't get far enough.
1112 1112                                   * If we are still skipping `highly shared'
1113 1113                                   * pages, skip fewer of them.  Otherwise,
1114 1114                                   * give up till the next clock tick.
1115 1115                                   */
1116 1116                                  if (po_share < MAX_PO_SHARE) {
1117 1117                                          po_share <<= 1;
1118 1118                                  } else {
1119 1119                                          break;
1120 1120                                  }
1121 1121                          }
1122 1122                  }
1123 1123          }
1124 1124  
1125 1125          sample_end = gethrtime();
1126 1126  
1127 1127          DTRACE_PROBE1(pageout__end, uint_t, laps);
1128 1128  
1129 1129          if (pageout_new_spread == 0) {
1130 1130                  if (pageout_sample_cnt < pageout_sample_lim) {
1131 1131                          /*
1132 1132                           * Continue accumulating samples until we have enough
1133 1133                           * to get a reasonable value for average scan rate:
1134 1134                           */
1135 1135                          pageout_sample_pages += pcount;
1136 1136                          pageout_sample_etime += sample_end - sample_start;
1137 1137                          ++pageout_sample_cnt;
1138 1138                  }
1139 1139  
1140 1140                  if (pageout_sample_cnt >= pageout_sample_lim) {
1141 1141                          /*
1142 1142                           * We have enough samples, set the spread.
1143 1143                           */
1144 1144                          pageout_rate = (hrrate_t)pageout_sample_pages *
1145 1145                              (hrrate_t)(NANOSEC) / pageout_sample_etime;
1146 1146                          pageout_new_spread = pageout_rate / 10;
1147 1147                          setupclock();
1148 1148                  }
1149 1149          }
1150 1150  
1151 1151          goto loop;
1152 1152  }
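As a rough illustration of the spread calculation just above: pageout_rate is the sampled scan throughput in pages per second, and pageout_new_spread is one tenth of that rate. The following stand-alone sketch uses made-up sample figures (it is not part of the change); NANOSEC is nanoseconds per second, as in the kernel.

#include <stdio.h>
#include <stdint.h>

#define	NANOSEC	1000000000LL	/* nanoseconds per second */

int
main(void)
{
	/* Hypothetical totals accumulated over the sampling period: */
	uint64_t sample_pages = 200000;		/* pages scanned */
	uint64_t sample_etime = 500000000;	/* elapsed time: 0.5 s in ns */

	uint64_t rate = sample_pages * NANOSEC / sample_etime;
	uint64_t spread = rate / 10;

	/* Prints: rate 400000 pages/s, spread 40000 */
	(void) printf("rate %llu pages/s, spread %llu\n",
	    (unsigned long long)rate, (unsigned long long)spread);
	return (0);
}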
1153 1153  
1154 1154  /*
1155 1155   * The pageout deadman is run once per second by clock().
1156 1156   */
1157 1157  void
1158 1158  pageout_deadman(void)
1159 1159  {
1160 1160          if (panicstr != NULL) {
1161 1161                  /*
1162 1162                   * There is no pageout after panic.
1163 1163                   */
1164 1164                  return;
1165 1165          }
1166 1166  
1167 1167          if (pageout_deadman_seconds == 0) {
1168 1168                  /*
1169 1169                   * The deadman is not enabled.
1170 1170                   */
1171 1171                  return;
1172 1172          }
1173 1173  
1174 1174          if (!pageout_pushing) {
1175 1175                  goto reset;
1176 1176          }
1177 1177  
1178 1178          /*
1179 1179           * We are pushing a page.  Check to see if it is the same call we saw
1180 1180           * last time we looked:
1181 1181           */
1182 1182          if (pageout_pushcount != pageout_pushcount_seen) {
1183 1183                  /*
1184 1184                   * It is a different call from the last check, so we are not
1185 1185                   * stuck.
1186 1186                   */
1187 1187                  goto reset;
1188 1188          }
1189 1189  
1190 1190          if (++pageout_stucktime >= pageout_deadman_seconds) {
1191 1191                  panic("pageout_deadman: stuck pushing the same page for %d "
1192 1192                      "seconds (freemem is %lu)", pageout_deadman_seconds,
1193 1193                      freemem);
1194 1194          }
1195 1195  
1196 1196          return;
1197 1197  
1198 1198  reset:
1199 1199          /*
1200 1200           * Reset our tracking state to reflect that we are not stuck:
1201 1201           */
1202 1202          pageout_stucktime = 0;
1203 1203          pageout_pushcount_seen = pageout_pushcount;
1204 1204  }
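For context, the stuck-detection above can be modelled by a minimal user-space sketch, assuming the routine is called once per second as the comment states. All names and the 90-second threshold below are invented for illustration; they are not the kernel's values.

#include <stdio.h>

static unsigned int pushcount, pushcount_seen, stucktime;
static int pushing = 1;				/* pretend a push is in progress */
static const unsigned int deadman_seconds = 90;	/* hypothetical threshold */

static void
deadman_tick(void)
{
	if (!pushing || pushcount != pushcount_seen) {
		/* Not pushing, or a different push since last check: not stuck. */
		stucktime = 0;
		pushcount_seen = pushcount;
		return;
	}
	if (++stucktime >= deadman_seconds)
		(void) printf("would panic: same push for %u seconds\n", stucktime);
}

int
main(void)
{
	/* clock() would invoke the check once per second. */
	for (unsigned int sec = 0; sec < deadman_seconds; sec++)
		deadman_tick();
	return (0);
}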
1205 1205  
1206 1206  /*
1207 1207   * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1208 1208   * a system page (u-area, page table) or free, then leave it alone.  Otherwise,
1209 1209   * if we are running the front hand, turn off the page's reference bit.
1210 1210   * If the proc is over maxrss, we take it.  If running the back hand,
1211 1211   * check whether the page has been reclaimed.  If not, free the page,
1212 1212   * pushing it to disk first if necessary.
1213 1213   *
1214 1214   * Return values:
1215 1215   *      CKP_INELIGIBLE if the page is not a candidate at all,
1216 1216   *      CKP_NOT_FREED  if the page was not freed, or
1217 1217   *      CKP_FREED      if we freed it.
1218 1218   */
1219 1219  static checkpage_result_t
1220 1220  checkpage(struct page *pp, pageout_hand_t whichhand)
1221 1221  {
1222 1222          int ppattr;
1223 1223          int isfs = 0;
1224 1224          int isexec = 0;
1225 1225          int pagesync_flag;
1226 1226  
1227 1227          /*
1228 1228           * Skip pages:
1229 1229           *      - associated with the kernel vnode since
1230 1230           *          they are always "exclusively" locked.
1231 1231           *      - that are free
1232 1232           *      - that are shared more than po_share'd times
1233 1233           *      - that are already locked
1234 1234           *
1235 1235           * NOTE:  These optimizations assume that reads are atomic.
1236 1236           */
1237 1237  
1238 1238          if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1239 1239              pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1240 1240              hat_page_checkshare(pp, po_share)) {
1241 1241                  return (CKP_INELIGIBLE);
1242 1242          }
1243 1243  
1244 1244          if (!page_trylock(pp, SE_EXCL)) {
1245 1245                  /*
1246 1246                   * Skip the page if we can't acquire the "exclusive" lock.
1247 1247                   */
1248 1248                  return (CKP_INELIGIBLE);
1249 1249          } else if (PP_ISFREE(pp)) {
1250 1250                  /*
1251 1251                   * It became free between the above check and our actually
1252 1252                   * locking the page.  Oh well, there will be other pages.
1253 1253                   */
1254 1254                  page_unlock(pp);
1255 1255                  return (CKP_INELIGIBLE);
1256 1256          }
1257 1257  
1258 1258          /*
1259 1259           * Reject pages that cannot be freed. The page_struct_lock
1260 1260           * need not be acquired to examine these
1261 1261           * fields since the page has an "exclusive" lock.
1262 1262           */
1263 1263          if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1264 1264                  page_unlock(pp);
1265 1265                  return (CKP_INELIGIBLE);
1266 1266          }
1267 1267  
1268 1268          /*
1269 1269           * Maintain statistics for what we are freeing
1270 1270           */
1271 1271          if (pp->p_vnode != NULL) {
1272 1272                  if (pp->p_vnode->v_flag & VVMEXEC)
1273 1273                          isexec = 1;
1274 1274  
1275 1275                  if (!IS_SWAPFSVP(pp->p_vnode))
1276 1276                          isfs = 1;
1277 1277          }
1278 1278  
1279 1279          /*
1280 1280           * Turn off REF and MOD bits with the front hand.
1281 1281           * The back hand examines the REF bit and always considers
1282 1282           * SHARED pages as referenced.
1283 1283           */
1284 1284          if (whichhand == POH_FRONT) {
1285 1285                  pagesync_flag = HAT_SYNC_ZERORM;
1286 1286          } else {
1287 1287                  pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1288 1288                      HAT_SYNC_STOPON_SHARED;
1289 1289          }
1290 1290  
1291 1291          ppattr = hat_pagesync(pp, pagesync_flag);
1292 1292  
1293 1293  recheck:
1294 1294          /*
1295 1295           * If the page is referenced, make it unreferenced but reclaimable.
1296 1296           * If this page is not referenced, then it must be reclaimable
1297 1297           * and we can add it to the free list.
1298 1298           */
1299 1299          if (ppattr & P_REF) {
1300 1300                  DTRACE_PROBE2(pageout__isref, page_t *, pp,
1301 1301                      pageout_hand_t, whichhand);
1302 1302  
1303 1303                  if (whichhand == POH_FRONT) {
1304 1304                          /*
1305 1305                           * Checking of rss or madvise flags needed here...
1306 1306                           *
1307 1307                           * If not "well-behaved", fall through into the code
1308 1308                           * for not referenced.
1309 1309                           */
1310 1310                          hat_clrref(pp);
1311 1311                  }
1312 1312  
1313 1313                  /*
1314 1314                   * Somebody referenced the page since the front
1315 1315                   * hand went by, so it's not a candidate for
1316 1316                   * freeing up.
1317 1317                   */
1318 1318                  page_unlock(pp);
1319 1319                  return (CKP_NOT_FREED);
1320 1320          }
1321 1321  
1322 1322          VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1323 1323  
1324 1324          /*
1325 1325           * If large page, attempt to demote it. If successfully demoted,
1326 1326           * retry the checkpage.
1327 1327           */
1328 1328          if (pp->p_szc != 0) {
1329 1329                  if (!page_try_demote_pages(pp)) {
1330 1330                          VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1331 1331                          page_unlock(pp);
1332 1332                          return (CKP_INELIGIBLE);
1333 1333                  }
1334 1334  
1335 1335                  ASSERT(pp->p_szc == 0);
1336 1336                  VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1337 1337  
1338 1338                  /*
1339 1339                   * Since page_try_demote_pages() could have unloaded some
1340 1340           * mappings, it makes sense to reload ppattr.
1341 1341                   */
1342 1342                  ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1343 1343          }
1344 1344  
1345 1345          /*
1346 1346           * If the page is currently dirty, we have to arrange to have it
1347 1347           * cleaned before it can be freed.
1348 1348           *
1349 1349           * XXX - ASSERT(pp->p_vnode != NULL);
1350 1350           */
1351 1351          if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1352 1352                  struct vnode *vp = pp->p_vnode;
1353 1353                  u_offset_t offset = pp->p_offset;
1354 1354  
1355 1355                  /*
1356 1356                   * XXX - Test for process being swapped out or about to exit?
1357 1357                   * [Can't get back to process(es) using the page.]
1358 1358                   */
1359 1359  
1360 1360                  /*
1361 1361                   * Hold the vnode before releasing the page lock to
1362 1362                   * prevent it from being freed and re-used by some
1363 1363                   * other thread.
1364 1364                   */
1365 1365                  VN_HOLD(vp);
1366 1366                  page_unlock(pp);
1367 1367  
1368 1368                  /*
1369 1369                   * Queue I/O request for the pageout thread.
1370 1370                   */
1371 1371                  if (!queue_io_request(vp, offset)) {
1372 1372                          VN_RELE(vp);
1373 1373                          return (CKP_NOT_FREED);
1374 1374                  }
1375 1375                  return (CKP_FREED);
1376 1376          }
1377 1377  
1378 1378          /*
1379 1379           * Now we unload all the translations and put the page back on to the
1380 1380           * free list.  If the page was used (referenced or modified) after the
1381 1381           * pagesync but before it was unloaded we catch it and handle the page
1382 1382           * properly.
1383 1383           */
1384 1384          DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1385 1385          (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1386 1386          ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1387 1387          if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1388 1388                  goto recheck;
1389 1389          }
1390 1390  
1391 1391          VN_DISPOSE(pp, B_FREE, 0, kcred);
1392 1392  
1393 1393          CPU_STATS_ADD_K(vm, dfree, 1);
1394 1394  
1395 1395          if (isfs) {
1396 1396                  if (isexec) {
1397 1397                          CPU_STATS_ADD_K(vm, execfree, 1);
1398 1398                  } else {
1399 1399                          CPU_STATS_ADD_K(vm, fsfree, 1);
1400 1400                  }
1401 1401          } else {
1402 1402                  CPU_STATS_ADD_K(vm, anonfree, 1);
1403 1403          }
1404 1404  
1405 1405          return (CKP_FREED);
1406 1406  }
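To summarize the hand-specific behaviour implemented above: the front hand clears the reference bit of a recently used page so the back hand can later tell whether it was used again; either hand can reclaim a page that is already unreferenced, queueing it for pageout first if it is dirty. A deliberately simplified sketch of that policy (all names invented, not kernel code):

typedef enum { HAND_FRONT, HAND_BACK } hand_t;
typedef enum { KEEP, QUEUE_FOR_IO, FREE } verdict_t;

static verdict_t
clock_hand(hand_t hand, int *refbit, int modbit)
{
	if (*refbit != 0) {
		if (hand == HAND_FRONT)
			*refbit = 0;	/* clear REF; the back hand re-checks later */
		return (KEEP);		/* referenced recently: not a candidate */
	}
	if (modbit != 0)
		return (QUEUE_FOR_IO);	/* dirty: must be cleaned before freeing */
	return (FREE);			/* unreferenced and clean: reclaim it */
}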
1407 1407  
1408 1408  /*
1409 1409   * Queue async i/o request from pageout_scanner and segment swapout
1410 1410   * routines on one common list.  This ensures that pageout devices (swap)
1411 1411   * are not saturated by pageout_scanner or swapout requests.
1412 1412   * The pageout thread empties this list by initiating i/o operations.
1413 1413   */
1414 1414  int
1415 1415  queue_io_request(vnode_t *vp, u_offset_t off)
1416 1416  {
1417 1417          struct async_reqs *arg;
1418 1418  
1419 1419          /*
1420 1420           * If we cannot allocate an async request struct,
1421 1421           * skip this page.
1422 1422           */
1423 1423          mutex_enter(&push_lock);
1424 1424          if ((arg = req_freelist) == NULL) {
1425 1425                  mutex_exit(&push_lock);
1426 1426                  return (0);
1427 1427          }
1428 1428          req_freelist = arg->a_next;             /* adjust freelist */
1429 1429          push_list_size++;
1430 1430  
1431 1431          arg->a_vp = vp;
1432 1432          arg->a_off = off;
1433 1433          arg->a_len = PAGESIZE;
1434 1434          arg->a_flags = B_ASYNC | B_FREE;
1435 1435          arg->a_cred = kcred;            /* always held */
1436 1436  
1437 1437          /*
1438 1438           * Add to list of pending write requests.
1439 1439           */
1440 1440          arg->a_next = push_list;
1441 1441          push_list = arg;
1442 1442  
1443 1443          if (req_freelist == NULL) {
1444 1444                  /*
1445 1445                   * No free async requests left. The lock is held so we
1446 1446                   * might as well signal the pusher thread now.
1447 1447                   */
1448 1448                  cv_signal(&push_cv);
1449 1449          }
1450 1450          mutex_exit(&push_lock);
1451 1451          return (1);
1452 1452  }
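The bookkeeping above is a fixed-pool producer pattern: take a slot from the free list, chain it onto the pending list, and wake the consumer when the pool runs dry, all under one lock. A compact user-space sketch of the same shape (all names invented; an illustration, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct req {
	struct req	*next;
};

static pthread_mutex_t	pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	pool_cv = PTHREAD_COND_INITIALIZER;
static struct req	*freelist;	/* pre-populated with a fixed number of slots */
static struct req	*pending;	/* requests awaiting the consumer thread */

static int
enqueue_request(void)
{
	struct req *r;

	(void) pthread_mutex_lock(&pool_lock);
	if ((r = freelist) == NULL) {
		(void) pthread_mutex_unlock(&pool_lock);
		return (0);		/* no slot free: caller must skip this item */
	}
	freelist = r->next;

	r->next = pending;
	pending = r;

	if (freelist == NULL) {
		/* Last slot consumed: signal the consumer while the lock is held. */
		(void) pthread_cond_signal(&pool_cv);
	}
	(void) pthread_mutex_unlock(&pool_lock);
	return (1);
}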
1453 1453  
1454 1454  /*
1455 1455   * Wakeup pageout to initiate i/o if push_list is not empty.
1456 1456   */
1457 1457  void
1458 1458  cv_signal_pageout()
1459 1459  {
1460 1460          if (push_list != NULL) {
1461 1461                  mutex_enter(&push_lock);
1462 1462                  cv_signal(&push_cv);
1463 1463                  mutex_exit(&push_lock);
1464 1464          }
1465 1465  }
  