/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
pgcnt_t         slowscan = 0;
pgcnt_t         fastscan = 0;

static pgcnt_t  handspreadpages = 0;
static int      loopfraction = 2;
static pgcnt_t  looppages;
static int      min_percent_cpu = 4;
static int      max_percent_cpu = 80;
static pgcnt_t  maxfastscan = 0;
static pgcnt_t  maxslowscan = 100;

pgcnt_t maxpgio = 0;
pgcnt_t minfree = 0;
pgcnt_t desfree = 0;
pgcnt_t lotsfree = 0;
pgcnt_t needfree = 0;
pgcnt_t throttlefree = 0;
pgcnt_t pageout_reserve = 0;

pgcnt_t deficit;
pgcnt_t nscan;
pgcnt_t desscan;

/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4, and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 25) / 100.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *     ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *     ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *     Number of clock ticks budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_ticks .. max_pageout_ticks,
 *     depending on memory pressure.
 *
 * pageout_lbolt:
 *     Timestamp of the last time pageout_scanner woke up and started
 *     (or resumed) scanning for not recently referenced pages.
 */

static clock_t  min_pageout_ticks;
static clock_t  max_pageout_ticks;
static clock_t  pageout_ticks;
static clock_t  pageout_lbolt;

static uint_t   reset_hands;

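/*
 * The scanner re-checks its %CPU budget only once per (PAGES_POLL_MASK + 1)
 * pages visited -- i.e. every 1024 pages -- rather than on every page;
 * see the (pcount & PAGES_POLL_MASK) test in pageout_scanner().
 */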
#define PAGES_POLL_MASK 1023

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value
 *     for new pageout parameters: fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough,
 *     set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds of elapsed scan time for the sample.
 *
 * pageout_rate:
 *     Rate in pages/second, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     The new value to use for fastscan and handspreadpages.
 *     Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t  pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t  pageout_new_spread = 0;

static clock_t  pageout_cycle_ticks;
static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
        ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t        memavail_lock;
kcondvar_t      memavail_cv;

/*
 * The size of the clock loop.
 */
#define LOOPPAGES       total_pages

/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *      lotsfree = btop(512K)
 *      desfree = btop(200K)
 *      minfree = btop(100K)
 *      throttlefree = INT_MIN
 *      max_percent_cpu = 4
 */
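/*
 * For example (illustrative numbers, not measurements): on a machine with
 * 1 GB of 4 KB pages (looppages = 262144), the defaults above work out to:
 *
 *      lotsfree        = MAX(262144 / 64, btop(512K)) = 4096 pages (16 MB)
 *      desfree         = lotsfree / 2                 = 2048 pages
 *      minfree         = desfree / 2                  = 1024 pages
 *      throttlefree    = minfree                      = 1024 pages
 *      pageout_reserve = throttlefree / 2             =  512 pages
 */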
void
setupclock(int recalc)
{

        static spgcnt_t init_lfree, init_dfree, init_mfree;
        static spgcnt_t init_tfree, init_preserve, init_mpgio;
        static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

        looppages = LOOPPAGES;

        /*
         * setupclock can now be called to recalculate the paging
         * parameters in the case of dynamic addition of memory.
         * So to make sure we make the proper calculations, if such a
         * situation should arise, we save away the initial values
         * of each parameter so we can recall them when needed. This
         * way we don't lose the settings an admin might have made
         * through the /etc/system file.
         */

        if (!recalc) {
                init_lfree = lotsfree;
                init_dfree = desfree;
                init_mfree = minfree;
                init_tfree = throttlefree;
                init_preserve = pageout_reserve;
                init_mpgio = maxpgio;
                init_mfscan = maxfastscan;
                init_fscan = fastscan;
                init_sscan = slowscan;
                init_hspages = handspreadpages;
        }

        /*
         * Set up thresholds for paging:
         */

        /*
         * Lotsfree is the threshold at which the paging daemon turns on.
         */
        if (init_lfree == 0 || init_lfree >= looppages)
                lotsfree = MAX(looppages / 64, btop(512 * 1024));
        else
                lotsfree = init_lfree;

        /*
         * Desfree is the amount of memory desired free.
         * If less than this for an extended period, start swapping.
         */
        if (init_dfree == 0 || init_dfree >= lotsfree)
                desfree = lotsfree / 2;
        else
                desfree = init_dfree;

        /*
         * Minfree is the minimal amount of free memory which is tolerable.
         */
        if (init_mfree == 0 || init_mfree >= desfree)
                minfree = desfree / 2;
        else
                minfree = init_mfree;

        /*
         * Throttlefree is the point at which we start throttling
         * PG_WAIT requests until enough memory becomes available.
         */
        if (init_tfree == 0 || init_tfree >= desfree)
                throttlefree = minfree;
        else
                throttlefree = init_tfree;

        /*
         * Pageout_reserve is the number of pages that we keep in
         * stock for pageout's own use.  Having a few such pages
         * provides insurance against system deadlock due to
         * pageout needing pages.  When freemem < pageout_reserve,
         * non-blocking allocations are denied to any threads
         * other than pageout and sched.  (At some point we might
         * want to consider a per-thread flag like T_PUSHING_PAGES
         * to indicate that a thread is part of the page-pushing
         * dance (e.g. an interrupt thread) and thus is entitled
         * to the same special dispensation we accord pageout.)
         */
        if (init_preserve == 0 || init_preserve >= throttlefree)
                pageout_reserve = throttlefree / 2;
        else
                pageout_reserve = init_preserve;

        /*
         * Maxpgio bounds how much paging I/O is acceptable.
         * The figure assumes that 2/3 busy on a disk arm is all that is
         * tolerable for paging, with one operation per disk revolution.
         *
         * XXX - Does not account for multiple swap devices.
         */
        if (init_mpgio == 0)
                maxpgio = (DISKRPM * 2) / 3;
        else
                maxpgio = init_mpgio;

        /*
         * The clock scan rate varies between fastscan and slowscan
         * based on the amount of free memory available.  The fastscan
         * rate should be set based on the number of pages that can be
         * scanned per sec using ~10% of processor time.  Since this
         * value depends on the processor, MMU, MHz etc., it is
         * difficult to determine it in a generic manner for all
         * architectures.
         *
         * Instead of trying to determine the number of pages scanned
         * per sec for every processor, fastscan is set to be the smaller
         * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
         * time is limited to ~4% of processor time.
         *
         * Setting fastscan to be 1/2 of memory allows pageout to scan
         * all of memory in ~2 secs.  This implies that user pages not
         * accessed within 1 sec (assuming handspreadpages == fastscan)
         * can be reclaimed when free memory is very low.  Stealing pages
         * not accessed within 1 sec seems reasonable and ensures that
         * active user processes don't thrash.
         *
         * Smaller values of fastscan result in scanning fewer pages
         * every second and consequently pageout may not be able to free
         * sufficient memory to maintain the minimum threshold.  Larger
         * values of fastscan result in scanning a lot more pages which
         * could lead to thrashing and higher CPU usage.
         *
         * Fastscan needs to be limited to a maximum value and should not
         * scale with memory to prevent pageout from consuming too much
         * time for scanning on slow CPUs and avoid thrashing, as a
         * result of scanning too many pages, on faster CPUs.
         * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
         * (the upper bound for fastscan) based on the average number
         * of pages that can potentially be scanned in ~1 sec (using ~4%
         * of the CPU) on some of the following machines that currently
         * run Solaris 2.x:
         *
         *                      average memory scanned in ~1 sec
         *
         *      25 MHz SS1+:            23 Meg
         *      LX:                     37 Meg
         *      50 MHz SC2000:          68 Meg
         *
         *      40 MHz 486:             26 Meg
         *      66 MHz 486:             42 Meg
         *
         * When free memory falls just below lotsfree, the scan rate
         * goes from 0 to slowscan (i.e., pageout starts running).  This
         * transition needs to be smooth and is achieved by ensuring that
         * pageout scans a small number of pages to satisfy the transient
         * memory demand.  This is set to not exceed 100 pages/sec (25 per
         * wakeup) since scanning that many pages has no noticeable impact
         * on system performance.
         *
         * In addition to setting fastscan and slowscan, pageout is
         * limited to using ~4% of the CPU.  This results in increasing
         * the time taken to scan all of memory, which in turn means that
         * user processes have a better opportunity of preventing their
         * pages from being stolen.  This has a positive effect on
         * interactive and overall system performance when memory demand
         * is high.
         *
         * Thus, the rate at which pages are scanned for replacement will
         * vary linearly between slowscan and the number of pages that
         * can be scanned using ~4% of processor time instead of varying
         * linearly between slowscan and fastscan.
         *
         * Also, the processor time used by pageout will vary from ~1%
         * at slowscan to ~4% at fastscan instead of varying between
         * ~1% at slowscan and ~10% at fastscan.
         *
         * The values chosen for the various VM parameters (fastscan,
         * handspreadpages, etc) are not universally true for all machines,
         * but appear to be a good rule of thumb for the machines we've
         * tested.  They have the following ranges:
         *
         *      cpu speed:      20 to 70 MHz
         *      page size:      4K to 8K
         *      memory size:    16M to 5G
         *      page scan rate: 4000 - 17400 4K pages per sec
         *
         * The values need to be re-examined for machines which don't
         * fall into the various ranges (e.g., slower or faster CPUs,
         * smaller or larger pagesizes, etc.) shown above.
         *
         * On an MP machine, pageout is often unable to maintain the
         * minimum paging thresholds under heavy load.  This is due to
         * the fact that user processes running on other CPUs can be
         * dirtying memory at a much faster pace than pageout can find
         * pages to free.  The memory demands could be met by enabling
         * more than one CPU to run the clock algorithm in such a manner
         * that the various clock hands don't overlap.  This also makes
         * it more difficult to determine the values for fastscan, slowscan
         * and handspreadpages.
         *
         * The swapper is currently used to free up memory when pageout
         * is unable to meet memory demands by swapping out processes.
         * In addition to freeing up memory, swapping also reduces the
         * demand for memory by preventing user processes from running
         * and thereby consuming memory.
         */
        if (init_mfscan == 0) {
                if (pageout_new_spread != 0)
                        maxfastscan = pageout_new_spread;
                else
                        maxfastscan = MAXHANDSPREADPAGES;
        } else {
                maxfastscan = init_mfscan;
        }
        if (init_fscan == 0)
                fastscan = MIN(looppages / loopfraction, maxfastscan);
        else
                fastscan = init_fscan;
        if (fastscan > looppages / loopfraction)
                fastscan = looppages / loopfraction;

        /*
         * Set slow scan time to 1/10 the fast scan time, but
         * not to exceed maxslowscan.
         */
        if (init_sscan == 0)
                slowscan = MIN(fastscan / 10, maxslowscan);
        else
                slowscan = init_sscan;
        if (slowscan > fastscan / 2)
                slowscan = fastscan / 2;
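        /*
         * For example (illustrative numbers only): with fastscan = 8192,
         * the defaults give slowscan = MIN(8192 / 10, 100) = 100, which
         * also satisfies the fastscan / 2 cap above.
         */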

        /*
         * Handspreadpages is the distance (in pages) between the front and
         * back pageout daemon hands.  The amount of time to reclaim a page
         * once pageout examines it increases with this distance and
         * decreases as the scan rate rises.  It must be < the amount
         * of pageable memory.
         *
         * Since pageout is limited to ~4% of the CPU, setting handspreadpages
         * to be "fastscan" results in the front hand being a few secs
         * (varies based on the processor speed) ahead of the back hand
         * at fastscan rates.  This distance can be further reduced, if
         * necessary, by increasing the processor time used by pageout
         * to be more than ~4% and preferably not more than ~10%.
         *
         * As a result, user processes have a much better chance of
         * referencing their pages before the back hand examines them.
         * This also significantly lowers the number of reclaims from
         * the freelist since pageout does not end up freeing pages which
         * may be referenced a sec later.
         */
        if (init_hspages == 0)
                handspreadpages = fastscan;
        else
                handspreadpages = init_hspages;

        /*
         * Make sure that back hand follows front hand by at least
         * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
         * for the back hand to look at a page during the same wakeup of
         * the pageout daemon in which the front hand cleared its ref bit.
         */
        if (handspreadpages >= looppages)
                handspreadpages = looppages - 1;

        /*
         * If we have been called to recalculate the parameters,
         * set a flag to re-evaluate the clock hand pointers.
         */
        if (recalc)
                reset_hands = 1;
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define RATETOSCHEDPAGING       4               /* times/second, i.e. 4 Hz */

static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist; /* available req structs */
static struct async_reqs *push_list;    /* pending reqs */
static kmutex_t push_lock;              /* protects req pool */
static kcondvar_t push_cv;

static int async_list_size = 256;       /* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone; don't page it out.
 */
#define MIN_PO_SHARE    (8)
#define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
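/*
 * po_share adapts at run time: pageout_scanner() doubles it (up to
 * MAX_PO_SHARE) when a full revolution of the hands fails to free enough
 * memory, and schedpaging() halves it (down to MIN_PO_SHARE) once free
 * memory is plentiful again.
 */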

/*
 * Schedule rate for paging.
 * Rate is a linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
        spgcnt_t vavail;

        if (freemem < lotsfree + needfree + kmem_reapahead)
                kmem_reap();

        if (freemem < lotsfree + needfree)
                seg_preap();

        if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                kcage_cageout_wakeup();

        if (mutex_tryenter(&pageout_mutex)) {
                /* pageout() not running */
                nscan = 0;
                vavail = freemem - deficit;
                if (pageout_new_spread != 0)
                        vavail -= needfree;
                if (vavail < 0)
                        vavail = 0;
                if (vavail > lotsfree)
                        vavail = lotsfree;

                /*
                 * Fix for 1161438 (CRS SPR# 73922).  All variables
                 * in the original calculation for desscan were 32-bit signed
                 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
                 * more of memory, the calculation can overflow.  When this
                 * happens, desscan becomes negative and pageout_scanner()
                 * stops paging out.
                 */
                if ((needfree) && (pageout_new_spread == 0)) {
                        /*
                         * If we've not yet collected enough samples to
                         * calculate a spread, use the old logic of kicking
                         * into high gear anytime needfree is non-zero.
                         */
                        desscan = fastscan / RATETOSCHEDPAGING;
                } else {
                        /*
                         * Once we've calculated a spread based on system
                         * memory and usage, just treat needfree as another
                         * form of deficit.
                         */
                        spgcnt_t faststmp, slowstmp, result;

                        slowstmp = slowscan * vavail;
                        faststmp = fastscan * (lotsfree - vavail);
                        result = (slowstmp + faststmp) /
                            nz(lotsfree) / RATETOSCHEDPAGING;
                        desscan = (pgcnt_t)result;
                }
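                /*
                 * For example (illustrative numbers only): with
                 * lotsfree = 4096, vavail = 2048 (halfway to lotsfree),
                 * slowscan = 100 and fastscan = 10000, the interpolation
                 * above gives
                 *
                 *   desscan = (100 * 2048 + 10000 * 2048) / 4096 / 4
                 *           = 1262 pages per wakeup,
                 *
                 * roughly midway between slowscan / 4 and fastscan / 4,
                 * as expected of a linear interpolation.
                 */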

                pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
                    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

                if (freemem < lotsfree + needfree ||
                    pageout_sample_cnt < pageout_sample_lim) {
                        TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
                            "pageout_cv_signal:freemem %ld", freemem);
                        cv_signal(&proc_pageout->p_cv);
                } else {
                        /*
                         * There are enough free pages, no need to
                         * kick the scanner thread.  And next time
                         * around, keep more of the `highly shared'
                         * pages.
                         */
                        cv_signal_pageout();
                        if (po_share > MIN_PO_SHARE) {
                                po_share >>= 1;
                        }
                }
                mutex_exit(&pageout_mutex);
        }

        /*
         * Signal threads waiting for available memory.
         * NOTE: usually we need to grab memavail_lock before cv_broadcast,
         * but in this case it is not needed - the waiters will be woken up
         * during the next invocation of this function.
         */
        if (kmem_avail() > 0)
                cv_broadcast(&memavail_cv);

        (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}

pgcnt_t         pushes;
ulong_t         push_list_size;         /* # of requests on pageout queue */

#define FRONT   1
#define BACK    2

int dopageout = 1;      /* must be non-zero to turn page stealing on */

/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are two threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation.  If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled.  The pageout thread
 * grabs VOP_PUTPAGE requests from the list and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate.  There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 */
void
pageout()
{
        struct async_reqs *arg;
        pri_t pageout_pri;
        int i;
        pgcnt_t max_pushes;
        callb_cpr_t cprinfo;

        proc_pageout = ttoproc(curthread);
        proc_pageout->p_cstime = 0;
        proc_pageout->p_stime = 0;
        proc_pageout->p_cutime = 0;
        proc_pageout->p_utime = 0;
        bcopy("pageout", PTOU(curproc)->u_psargs, 8);
        bcopy("pageout", PTOU(curproc)->u_comm, 7);

        /*
         * Create pageout scanner thread
         */
        mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Allocate and initialize the async request structures
         * for pageout.
         */
        push_req = (struct async_reqs *)
            kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

        req_freelist = push_req;
        for (i = 0; i < async_list_size - 1; i++)
                push_req[i].a_next = &push_req[i + 1];

        pageout_pri = curthread->t_pri;

        /* Create the pageout scanner thread. */
        (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
            pageout_pri - 1);

        /*
         * Kick off the pageout scheduler.
         */
        schedpaging(NULL);

        /*
         * Create kernel cage thread.
         * The kernel cage thread is started under the pageout process
         * to take advantage of the less restricted page allocation
         * in page_create_throttle().
         */
        kcage_cageout_init();

        /*
         * Limit pushes to avoid saturating pageout devices.
         */
        max_pushes = maxpgio / RATETOSCHEDPAGING;
        CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

        for (;;) {
                mutex_enter(&push_lock);

                while ((arg = push_list) == NULL || pushes > max_pushes) {
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        cv_wait(&push_cv, &push_lock);
                        pushes = 0;
                        CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
                }
                push_list = arg->a_next;
                arg->a_next = NULL;
                mutex_exit(&push_lock);

                if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
                    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
                        pushes++;
                }

                /* vp held by checkpage() */
                VN_RELE(arg->a_vp);

                mutex_enter(&push_lock);
                arg->a_next = req_freelist;  /* back on freelist */
                req_freelist = arg;
                push_list_size--;
                mutex_exit(&push_lock);
        }
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void)
{
        struct page *fronthand, *backhand;
        uint_t count;
        callb_cpr_t cprinfo;
        pgcnt_t nscan_limit;
        pgcnt_t pcount;

        CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
        mutex_enter(&pageout_mutex);

        /*
         * The restart case does not attempt to point the hands at roughly
         * the right point on the assumption that after one circuit things
         * will have settled down - and restarts shouldn't be that often.
         */

        /*
         * Set the two clock hands to be separated by a reasonable amount,
         * but no more than 360 degrees apart.
         */
        backhand = page_first();
        if (handspreadpages >= total_pages)
                fronthand = page_nextn(backhand, total_pages - 1);
        else
                fronthand = page_nextn(backhand, handspreadpages);

        min_pageout_ticks = MAX(1,
            ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
        max_pageout_ticks = MAX(min_pageout_ticks,
            ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
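        /*
         * For example (illustrative numbers only): with hz = 100 and
         * RATETOSCHEDPAGING = 4, the budgets above come to
         * min_pageout_ticks = MAX(1, (100 * 4 / 100) / 4) = 1 tick and
         * max_pageout_ticks = MAX(1, (100 * 80 / 100) / 4) = 20 ticks
         * per wakeup, matching the duty cycles described earlier.
         */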

loop:
        cv_signal_pageout();

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        cv_wait(&proc_pageout->p_cv, &pageout_mutex);
        CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);

        if (!dopageout)
                goto loop;

        if (reset_hands) {
                reset_hands = 0;

                backhand = page_first();
                if (handspreadpages >= total_pages)
                        fronthand = page_nextn(backhand, total_pages - 1);
                else
                        fronthand = page_nextn(backhand, handspreadpages);
        }

        CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
        count = 0;

        TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
            "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
            freemem, lotsfree, nscan, desscan);

        /* Kernel probe */
        TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
            tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);

        pcount = 0;
        if (pageout_sample_cnt < pageout_sample_lim) {
                nscan_limit = total_pages;
        } else {
                nscan_limit = desscan;
        }
        pageout_lbolt = ddi_get_lbolt();
        sample_start = gethrtime();

        /*
         * Scan the appropriate number of pages for a single duty cycle.
         * However, stop scanning as soon as there is enough free memory.
         * For a short while, we will be sampling the performance of the
         * scanner and need to keep running just to get sample data, in
         * which case we keep going and don't pay attention to whether
         * or not there is enough free memory.
         */

        while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
            pageout_sample_cnt < pageout_sample_lim)) {
                int rvfront, rvback;

                /*
                 * Check to see if we have exceeded our %CPU budget
                 * for this wakeup, but not on every single page visited,
                 * just every once in a while.
                 */
                if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
                        pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
                        if (pageout_cycle_ticks >= pageout_ticks) {
                                ++pageout_timeouts;
                                break;
                        }
                }

                /*
                 * If checkpage manages to add a page to the free list,
                 * we give ourselves another couple of trips around the loop.
                 */
                if ((rvfront = checkpage(fronthand, FRONT)) == 1)
                        count = 0;
                if ((rvback = checkpage(backhand, BACK)) == 1)
                        count = 0;

                ++pcount;

                /*
                 * protected by pageout_mutex instead of cpu_stat_lock
                 */
                CPU_STATS_ADDQ(CPU, vm, scan, 1);

                /*
                 * Don't include ineligible pages in the number scanned.
                 */
                if (rvfront != -1 || rvback != -1)
                        nscan++;

                backhand = page_next(backhand);

                /*
                 * backhand update and wraparound check are done separately
                 * because lint barks when it finds an empty "if" body
                 */

                if ((fronthand = page_next(fronthand)) == page_first()) {
                        TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
                            "pageout_hand_wrap:freemem %ld whichhand %d",
                            freemem, FRONT);

                        /*
                         * protected by pageout_mutex instead of cpu_stat_lock
                         */
                        CPU_STATS_ADDQ(CPU, vm, rev, 1);
                        if (++count > 1) {
                                /*
                                 * Extremely unlikely, but it happens.
                                 * We went around the loop at least once
                                 * and didn't get far enough.
                                 * If we are still skipping `highly shared'
                                 * pages, skip fewer of them.  Otherwise,
                                 * give up till the next clock tick.
                                 */
                                if (po_share < MAX_PO_SHARE) {
                                        po_share <<= 1;
                                } else {
                                        /*
                                         * Really a "goto loop", but
                                         * if someone is TRACing or
                                         * TNF_PROBE_ing, at least
                                         * make records to show
                                         * where we are.
                                         */
                                        break;
                                }
                        }
                }
        }

        sample_end = gethrtime();

        TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
            "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
            freemem, lotsfree, nscan, desscan, count);

        /* Kernel probe */
        TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
            tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);

        if (pageout_sample_cnt < pageout_sample_lim) {
                pageout_sample_pages += pcount;
                pageout_sample_etime += sample_end - sample_start;
                ++pageout_sample_cnt;
        }
        if (pageout_sample_cnt >= pageout_sample_lim &&
            pageout_new_spread == 0) {
                pageout_rate = (hrrate_t)pageout_sample_pages *
                    (hrrate_t)(NANOSEC) / pageout_sample_etime;
                pageout_new_spread = pageout_rate / 10;
                setupclock(1);
        }
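        /*
         * For example (illustrative numbers only): if sampling visited
         * 400000 pages over two seconds of scan time (2 * 10^9 ns), the
         * computation above gives pageout_rate = 200000 pages/sec and
         * pageout_new_spread = 20000 pages.
         */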

        goto loop;
}
 929 
 930 /*
 931  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 932  * system (u., page table) or free, then leave it alone.  Otherwise,
 933  * if we are running the front hand, turn off the page's reference bit.
 934  * If the proc is over maxrss, we take it.  If running the back hand,
 935  * check whether the page has been reclaimed.  If not, free the page,
 936  * pushing it to disk first if necessary.
 937  *
 938  * Return values:
 939  *      -1 if the page is not a candidate at all,
 940  *       0 if not freed, or
 941  *       1 if we freed it.
 942  */
 943 static int
 944 checkpage(struct page *pp, int whichhand)
 945 {
 946         int ppattr;
 947         int isfs = 0;
 948         int isexec = 0;
 949         int pagesync_flag;
 950 
 951         /*
 952          * Skip pages:
 953          *      - associated with the kernel vnode since
 954          *          they are always "exclusively" locked.
 955          *      - that are free
 956          *      - that are shared more than po_share'd times
 957          *      - its already locked
 958          *
 959          * NOTE:  These optimizations assume that reads are atomic.
 960          */

        if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
            pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
            hat_page_checkshare(pp, po_share)) {
                return (-1);
        }

        if (!page_trylock(pp, SE_EXCL)) {
                /*
                 * Skip the page if we can't acquire the "exclusive" lock.
                 */
                return (-1);
        } else if (PP_ISFREE(pp)) {
                /*
                 * It became free between the above check and our actually
                 * locking the page.  Oh well, there will be other pages.
                 */
                page_unlock(pp);
                return (-1);
        }

        /*
         * Reject pages that cannot be freed.  The page_struct_lock
         * need not be acquired to examine these fields since the page
         * has an "exclusive" lock.
         */
        if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                page_unlock(pp);
                return (-1);
        }

        /*
         * Maintain statistics for what we are freeing
         */

        if (pp->p_vnode != NULL) {
                if (pp->p_vnode->v_flag & VVMEXEC)
                        isexec = 1;

                if (!IS_SWAPFSVP(pp->p_vnode))
                        isfs = 1;
        }

        /*
         * Turn off REF and MOD bits with the front hand.
         * The back hand examines the REF bit and always considers
         * SHARED pages as referenced.
         */
        if (whichhand == FRONT)
                pagesync_flag = HAT_SYNC_ZERORM;
        else
                pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
                    HAT_SYNC_STOPON_SHARED;

        ppattr = hat_pagesync(pp, pagesync_flag);

recheck:
        /*
         * If the page is referenced, make it unreferenced but reclaimable.
         * If this page is not referenced, then it must be reclaimable
         * and we can add it to the free list.
         */
        if (ppattr & P_REF) {
                TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
                    "pageout_isref:pp %p whichhand %d", pp, whichhand);
                if (whichhand == FRONT) {
                        /*
                         * Checking of rss or madvise flags needed here...
                         *
                         * If not "well-behaved", fall through into the code
                         * for not referenced.
                         */
                        hat_clrref(pp);
                }
                /*
                 * Somebody referenced the page since the front
                 * hand went by, so it's not a candidate for
                 * freeing up.
                 */
                page_unlock(pp);
                return (0);
        }

        VM_STAT_ADD(pageoutvmstats.checkpage[0]);

        /*
         * If this is a large page, attempt to demote it.  If successfully
         * demoted, retry the checkpage.
         */
        if (pp->p_szc != 0) {
                if (!page_try_demote_pages(pp)) {
                        VM_STAT_ADD(pageoutvmstats.checkpage[1]);
                        page_unlock(pp);
                        return (-1);
                }
                ASSERT(pp->p_szc == 0);
                VM_STAT_ADD(pageoutvmstats.checkpage[2]);
                /*
                 * Since page_try_demote_pages() could have unloaded some
                 * mappings, reload ppattr.
                 */
                ppattr = hat_page_getattr(pp, P_MOD | P_REF);
        }

        /*
         * If the page is currently dirty, we have to arrange
         * to have it cleaned before it can be freed.
         *
         * XXX - ASSERT(pp->p_vnode != NULL);
         */
        if ((ppattr & P_MOD) && pp->p_vnode) {
                struct vnode *vp = pp->p_vnode;
                u_offset_t offset = pp->p_offset;

                /*
                 * XXX - Test for process being swapped out or about to exit?
                 * [Can't get back to process(es) using the page.]
                 */

                /*
                 * Hold the vnode before releasing the page lock to
                 * prevent it from being freed and re-used by some
                 * other thread.
                 */
                VN_HOLD(vp);
                page_unlock(pp);

                /*
                 * Queue i/o request for the pageout thread.
                 */
                if (!queue_io_request(vp, offset)) {
                        VN_RELE(vp);
                        return (0);
                }
                return (1);
        }

        /*
         * Now we unload all the translations,
         * and put the page back on to the free list.
         * If the page was used (referenced or modified) after
         * the pagesync but before it was unloaded we catch it
         * and handle the page properly.
         */
        TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
            "pageout_free:pp %p whichhand %d", pp, whichhand);
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
        ppattr = hat_page_getattr(pp, P_MOD | P_REF);
        if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
                goto recheck;

        /*LINTED: constant in conditional context*/
        VN_DISPOSE(pp, B_FREE, 0, kcred);

        CPU_STATS_ADD_K(vm, dfree, 1);

        if (isfs) {
                if (isexec) {
                        CPU_STATS_ADD_K(vm, execfree, 1);
                } else {
                        CPU_STATS_ADD_K(vm, fsfree, 1);
                }
        } else {
                CPU_STATS_ADD_K(vm, anonfree, 1);
        }

        return (1);             /* freed a page! */
}

/*
 * Queue async i/o request from pageout_scanner and segment swapout
 * routines on one common list.  This ensures that pageout devices (swap)
 * are not saturated by pageout_scanner or swapout requests.
 * The pageout thread empties this list by initiating i/o operations.
 */
int
queue_io_request(vnode_t *vp, u_offset_t off)
{
        struct async_reqs *arg;

        /*
         * If we cannot allocate an async request struct,
         * skip this page.
         */
        mutex_enter(&push_lock);
        if ((arg = req_freelist) == NULL) {
                mutex_exit(&push_lock);
                return (0);
        }
        req_freelist = arg->a_next;          /* adjust freelist */
        push_list_size++;

        arg->a_vp = vp;
        arg->a_off = off;
        arg->a_len = PAGESIZE;
        arg->a_flags = B_ASYNC | B_FREE;
        arg->a_cred = kcred;         /* always held */

        /*
         * Add to list of pending write requests.
         */
        arg->a_next = push_list;
        push_list = arg;

        if (req_freelist == NULL) {
                /*
                 * No free async requests left. The lock is held so we
                 * might as well signal the pusher thread now.
                 */
                cv_signal(&push_cv);
        }
        mutex_exit(&push_lock);
        return (1);
}

/*
 * Wake up pageout to initiate i/o if push_list is not empty.
 */
void
cv_signal_pageout()
{
        if (push_list != NULL) {
                mutex_enter(&push_lock);
                cv_signal(&push_cv);
                mutex_exit(&push_lock);
        }
}