1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  30 /*        All Rights Reserved   */
  31 
  32 /*
  33  * University Copyright- Copyright (c) 1982, 1986, 1988
  34  * The Regents of the University of California
  35  * All Rights Reserved
  36  *
  37  * University Acknowledgment- Portions of this document are derived from
  38  * software developed by the University of California, Berkeley, and its
  39  * contributors.
  40  */
  41 
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/buf.h>
  46 #include <sys/uio.h>
  47 #include <sys/proc.h>
  48 #include <sys/systm.h>
  49 #include <sys/mman.h>
  50 #include <sys/cred.h>
  51 #include <sys/vnode.h>
  52 #include <sys/vm.h>
  53 #include <sys/vmparam.h>
  54 #include <sys/vtrace.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/cpuvar.h>
  57 #include <sys/user.h>
  58 #include <sys/kmem.h>
  59 #include <sys/debug.h>
  60 #include <sys/callb.h>
  61 #include <sys/tnf_probe.h>
  62 #include <sys/mem_cage.h>
  63 #include <sys/time.h>
  64 
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/page.h>
  69 #include <vm/pvn.h>
  70 #include <vm/seg_kmem.h>
  71 
/*
 * checkpage() examines a single page during a scan pass; it takes a hold on
 * the page's vnode when it queues an async push request (defined below).
 */
static int checkpage(page_t *, int);

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time
 * based on the size of the system.  If they are patched non-zero in
 * a loaded vmunix they are left alone and may thus be changed per system
 * using adb on the loaded system.
 */
volatile pgcnt_t        slowscan = 0;   /* min scan rate, in effect at lotsfree */
volatile pgcnt_t        fastscan = 0;   /* max scan rate, when memory is short */

volatile pgcnt_t        handspreadpages = 0;    /* front-to-back hand gap (pages) */
static int      loopfraction = 2;       /* fastscan <= looppages / loopfraction */
static pgcnt_t  looppages;              /* size of the clock loop (total_pages) */
volatile int    min_percent_cpu = 4;    /* scanner CPU budget floor, in %CPU */
static int      max_percent_cpu = 80;   /* scanner CPU budget ceiling, in %CPU */
static pgcnt_t  maxfastscan = 0;        /* upper bound applied to fastscan */
static pgcnt_t  maxslowscan = 100;      /* upper bound applied to slowscan */

/*
 * Paging thresholds, in pages; see setupclock() for how each is derived
 * when not explicitly set via /etc/system.
 */
volatile pgcnt_t        maxpgio = 0;    /* max acceptable pageout I/O ops/sec */
volatile pgcnt_t        minfree = 0;    /* minimal tolerable free memory */
volatile pgcnt_t        desfree = 0;    /* desired amount of free memory */
volatile pgcnt_t        lotsfree = 0;   /* scanning starts below this */
pgcnt_t needfree = 0;                   /* pages needed by waiting allocators */
volatile pgcnt_t        throttlefree = 0;       /* throttle PG_WAIT below this */
volatile pgcnt_t        pageout_reserve = 0;    /* kept for pageout/sched only */

pgcnt_t deficit;        /* subtracted from freemem when sizing a scan pass */
pgcnt_t nscan;          /* pages examined so far in the current pass */
pgcnt_t desscan;        /* pages the scanner should examine next pass */
 103 
/*
 * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
 * are the number of ticks in each wakeup cycle that gives the
 * equivalent of some underlying %CPU duty cycle.
 * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
 * awakened every 25 clock ticks.  So, converting from %CPU to ticks
 * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
 * So, for example, 4% == 1 tick and 80% == 20 ticks.
 *
 * min_pageout_ticks:
 *     ticks/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_ticks:
 *     ticks/wakeup equivalent of max_percent_cpu.
 *
 * pageout_ticks:
 *     Number of clock ticks budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_ticks .. max_pageout_ticks,
 *     depending on memory pressure.
 *
 * pageout_lbolt:
 *     Timestamp of the last time pageout_scanner woke up and started
 *     (or resumed) scanning for not recently referenced pages.
 */

static clock_t  min_pageout_ticks;
static clock_t  max_pageout_ticks;
static clock_t  pageout_ticks;
static clock_t  pageout_lbolt;

/*
 * Set by setupclock() on a recalculation (e.g. dynamic memory add) to tell
 * pageout_scanner to re-position its clock hands on its next wakeup.
 */
static uint_t   reset_hands;

/*
 * The scanner checks its tick budget only once per (PAGES_POLL_MASK + 1)
 * pages visited — presumably to amortize the cost of reading the clock;
 * confirm against the scan loop in pageout_scanner().
 */
#define PAGES_POLL_MASK 1023
 138 
/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value
 *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough,
 *     set new values for handspreadpages, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated high-resolution elapsed time of the sample.
 *
 * pageout_rate:
 *     Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     The new value to use for fastscan and handspreadpages.
 *     Calculated after enough samples have been taken.
 */

typedef hrtime_t hrrate_t;      /* a rate in pages/nanosecond */

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t  pageout_sample_pages = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t  pageout_new_spread = 0;

/* Per-cycle timing state, maintained by the scanner (not shown here). */
static clock_t  pageout_cycle_ticks;
static hrtime_t sample_start, sample_end;
static hrtime_t pageout_sample_etime = 0;

/*
 * Record number of times a pageout_scanner wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t pageout_timeouts = 0;

#ifdef VM_STATS
/* Debug counters — presumably one per checkpage() outcome; confirm at use. */
static struct pageoutvmstats_str {
        ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t        memavail_lock;
kcondvar_t      memavail_cv;

/*
 * The size of the clock loop.
 */
#define LOOPPAGES       total_pages
 198 
/*
 * Set up the paging constants for the clock algorithm.
 * Called after the system is initialized and the amount of memory
 * and number of paging devices is known.
 *
 * recalc is zero on the initial boot-time call, and non-zero when called
 * again to recompute the parameters (e.g. after dynamic memory addition).
 *
 * lotsfree is 1/64 of memory, but at least 512K.
 * desfree is 1/2 of lotsfree.
 * minfree is 1/2 of desfree.
 *
 * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
 *
 *      lotsfree = btop(512K)
 *      desfree = btop(200K)
 *      minfree = btop(100K)
 *      throttlefree = INT_MIN
 *      max_percent_cpu = 4
 */
void
setupclock(int recalc)
{

        static spgcnt_t init_lfree, init_dfree, init_mfree;
        static spgcnt_t init_tfree, init_preserve, init_mpgio;
        static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;

        looppages = LOOPPAGES;

        /*
         * setupclock can now be called to recalculate the paging
         * parameters in the case of dynamic addition of memory.
         * So to make sure we make the proper calculations, if such a
         * situation should arise, we save away the initial values
         * of each parameter so we can recall them when needed. This
         * way we don't lose the settings an admin might have made
         * through the /etc/system file.
         */

        if (!recalc) {
                init_lfree = lotsfree;
                init_dfree = desfree;
                init_mfree = minfree;
                init_tfree = throttlefree;
                init_preserve = pageout_reserve;
                init_mpgio = maxpgio;
                init_mfscan = maxfastscan;
                init_fscan = fastscan;
                init_sscan = slowscan;
                init_hspages = handspreadpages;
        }

        /*
         * Set up thresholds for paging:
         */

        /*
         * Lotsfree is threshold where paging daemon turns on.
         * An admin-set value is honored only if it is sane (non-zero
         * and smaller than all of memory); otherwise use the default.
         */
        if (init_lfree == 0 || init_lfree >= looppages)
                lotsfree = MAX(looppages / 64, btop(512 * 1024));
        else
                lotsfree = init_lfree;

        /*
         * Desfree is amount of memory desired free.
         * If less than this for extended period, start swapping.
         */
        if (init_dfree == 0 || init_dfree >= lotsfree)
                desfree = lotsfree / 2;
        else
                desfree = init_dfree;

        /*
         * Minfree is minimal amount of free memory which is tolerable.
         */
        if (init_mfree == 0 || init_mfree >= desfree)
                minfree = desfree / 2;
        else
                minfree = init_mfree;

        /*
         * Throttlefree is the point at which we start throttling
         * PG_WAIT requests until enough memory becomes available.
         */
        if (init_tfree == 0 || init_tfree >= desfree)
                throttlefree = minfree;
        else
                throttlefree = init_tfree;

        /*
         * Pageout_reserve is the number of pages that we keep in
         * stock for pageout's own use.  Having a few such pages
         * provides insurance against system deadlock due to
         * pageout needing pages.  When freemem < pageout_reserve,
         * non-blocking allocations are denied to any threads
         * other than pageout and sched.  (At some point we might
         * want to consider a per-thread flag like T_PUSHING_PAGES
         * to indicate that a thread is part of the page-pushing
         * dance (e.g. an interrupt thread) and thus is entitled
         * to the same special dispensation we accord pageout.)
         */
        if (init_preserve == 0 || init_preserve >= throttlefree)
                pageout_reserve = throttlefree / 2;
        else
                pageout_reserve = init_preserve;

        /*
         * Maxpgio thresholds how much paging is acceptable.
         * This figures that 2/3 busy on an arm is all that is
         * tolerable for paging.  We assume one operation per disk rev.
         *
         * XXX - Does not account for multiple swap devices.
         */
        if (init_mpgio == 0)
                maxpgio = (DISKRPM * 2) / 3;
        else
                maxpgio = init_mpgio;

        /*
         * The clock scan rate varies between fastscan and slowscan
         * based on the amount of free memory available.  Fastscan
         * rate should be set based on the number pages that can be
         * scanned per sec using ~10% of processor time.  Since this
         * value depends on the processor, MMU, Mhz etc., it is
         * difficult to determine it in a generic manner for all
         * architectures.
         *
         * Instead of trying to determine the number of pages scanned
         * per sec for every processor, fastscan is set to be the smaller
         * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
         * time is limited to ~4% of processor time.
         *
         * Setting fastscan to be 1/2 of memory allows pageout to scan
         * all of memory in ~2 secs.  This implies that user pages not
         * accessed within 1 sec (assuming handspreadpages == fastscan)
         * can be reclaimed when free memory is very low.  Stealing pages
         * not accessed within 1 sec seems reasonable and ensures that
         * active user processes don't thrash.
         *
         * Smaller values of fastscan result in scanning fewer pages
         * every second and consequently pageout may not be able to free
         * sufficient memory to maintain the minimum threshold.  Larger
         * values of fastscan result in scanning a lot more pages which
         * could lead to thrashing and higher CPU usage.
         *
         * Fastscan needs to be limited to a maximum value and should not
         * scale with memory to prevent pageout from consuming too much
         * time for scanning on slow CPU's and avoid thrashing, as a
         * result of scanning too many pages, on faster CPU's.
         * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
         * (the upper bound for fastscan) based on the average number
         * of pages that can potentially be scanned in ~1 sec (using ~4%
         * of the CPU) on some of the following machines that currently
         * run Solaris 2.x:
         *
         *                      average memory scanned in ~1 sec
         *
         *      25 Mhz SS1+:            23 Meg
         *      LX:                     37 Meg
         *      50 Mhz SC2000:          68 Meg
         *
         *      40 Mhz 486:             26 Meg
         *      66 Mhz 486:             42 Meg
         *
         * When free memory falls just below lotsfree, the scan rate
         * goes from 0 to slowscan (i.e., pageout starts running).  This
         * transition needs to be smooth and is achieved by ensuring that
         * pageout scans a small number of pages to satisfy the transient
         * memory demand.  This is set to not exceed 100 pages/sec (25 per
         * wakeup) since scanning that many pages has no noticeable impact
         * on system performance.
         *
         * In addition to setting fastscan and slowscan, pageout is
         * limited to using ~4% of the CPU.  This results in increasing
         * the time taken to scan all of memory, which in turn means that
         * user processes have a better opportunity of preventing their
         * pages from being stolen.  This has a positive effect on
         * interactive and overall system performance when memory demand
         * is high.
         *
         * Thus, the rate at which pages are scanned for replacement will
         * vary linearly between slowscan and the number of pages that
         * can be scanned using ~4% of processor time instead of varying
         * linearly between slowscan and fastscan.
         *
         * Also, the processor time used by pageout will vary from ~1%
         * at slowscan to ~4% at fastscan instead of varying between
         * ~1% at slowscan and ~10% at fastscan.
         *
         * The values chosen for the various VM parameters (fastscan,
         * handspreadpages, etc) are not universally true for all machines,
         * but appear to be a good rule of thumb for the machines we've
         * tested.  They have the following ranges:
         *
         *      cpu speed:      20 to 70 Mhz
         *      page size:      4K to 8K
         *      memory size:    16M to 5G
         *      page scan rate: 4000 - 17400 4K pages per sec
         *
         * The values need to be re-examined for machines which don't
         * fall into the various ranges (e.g., slower or faster CPUs,
         * smaller or larger pagesizes etc) shown above.
         *
         * On an MP machine, pageout is often unable to maintain the
         * minimum paging thresholds under heavy load.  This is due to
         * the fact that user processes running on other CPU's can be
         * dirtying memory at a much faster pace than pageout can find
         * pages to free.  The memory demands could be met by enabling
         * more than one CPU to run the clock algorithm in such a manner
         * that the various clock hands don't overlap.  This also makes
         * it more difficult to determine the values for fastscan, slowscan
         * and handspreadpages.
         *
         * The swapper is currently used to free up memory when pageout
         * is unable to meet memory demands by swapping out processes.
         * In addition to freeing up memory, swapping also reduces the
         * demand for memory by preventing user processes from running
         * and thereby consuming memory.
         */
        if (init_mfscan == 0) {
                /* Prefer the measured spread (if sampling has finished). */
                if (pageout_new_spread != 0)
                        maxfastscan = pageout_new_spread;
                else
                        maxfastscan = MAXHANDSPREADPAGES;
        } else {
                maxfastscan = init_mfscan;
        }
        if (init_fscan == 0)
                fastscan = MIN(looppages / loopfraction, maxfastscan);
        else
                fastscan = init_fscan;
        /* Never scan faster than 1/loopfraction of memory per second. */
        if (fastscan > looppages / loopfraction)
                fastscan = looppages / loopfraction;

        /*
         * Set slow scan time to 1/10 the fast scan time, but
         * not to exceed maxslowscan.
         */
        if (init_sscan == 0)
                slowscan = MIN(fastscan / 10, maxslowscan);
        else
                slowscan = init_sscan;
        /* Keep slowscan at no more than half of fastscan, even if admin-set. */
        if (slowscan > fastscan / 2)
                slowscan = fastscan / 2;

        /*
         * Handspreadpages is distance (in pages) between front and back
         * pageout daemon hands.  The amount of time to reclaim a page
         * once pageout examines it increases with this distance and
         * decreases as the scan rate rises. It must be < the amount
         * of pageable memory.
         *
         * Since pageout is limited to ~4% of the CPU, setting handspreadpages
         * to be "fastscan" results in the front hand being a few secs
         * (varies based on the processor speed) ahead of the back hand
         * at fastscan rates.  This distance can be further reduced, if
         * necessary, by increasing the processor time used by pageout
         * to be more than ~4% and preferably not more than ~10%.
         *
         * As a result, user processes have a much better chance of
         * referencing their pages before the back hand examines them.
         * This also significantly lowers the number of reclaims from
         * the freelist since pageout does not end up freeing pages which
         * may be referenced a sec later.
         */
        if (init_hspages == 0)
                handspreadpages = fastscan;
        else
                handspreadpages = init_hspages;

        /*
         * Make sure that back hand follows front hand by at least
         * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
         * for the back hand to look at a page during the same wakeup of
         * the pageout daemon in which the front hand cleared its ref bit.
         */
        if (handspreadpages >= looppages)
                handspreadpages = looppages - 1;

        /*
         * If we have been called to recalculate the parameters,
         * set a flag to re-evaluate the clock hand pointers.
         */
        if (recalc)
                reset_hands = 1;
}
 484 
/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan RATETOSCHEDPAGING
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging sets this value based on the amount of
 * currently available memory.
 */

#define RATETOSCHEDPAGING       4               /* schedpaging runs per second */

static kmutex_t pageout_mutex;  /* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;     /* the allocated array of requests */
static struct async_reqs *req_freelist; /* available req structs */
static struct async_reqs *push_list;    /* pending reqs */
static kmutex_t push_lock;              /* protects req pool */
static kcondvar_t push_cv;              /* pageout() sleeps here awaiting work */

static int async_list_size = 256;       /* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define MIN_PO_SHARE    (8)
#define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;
 521 
/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 *
 * Re-arms itself via timeout() so that it runs RATETOSCHEDPAGING times
 * per second.  `arg` is unused except to be passed along to the next
 * timeout invocation.
 */
static void
schedpaging(void *arg)
{
        spgcnt_t vavail;

        /* Under pressure, first try reclaiming kmem caches ... */
        if (freemem < lotsfree + needfree + kmem_reapahead)
                kmem_reap();

        /* ... and segment-driver caches. */
        if (freemem < lotsfree + needfree)
                seg_preap();

        /* Wake the kernel-cage cageout thread if the cage is low, too. */
        if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                kcage_cageout_wakeup();

        /*
         * If the scanner is currently running, skip this cycle's
         * adjustments entirely rather than block behind it.
         */
        if (mutex_tryenter(&pageout_mutex)) {
                /* pageout() not running */
                nscan = 0;
                vavail = freemem - deficit;
                if (pageout_new_spread != 0)
                        vavail -= needfree;
                /* Clamp vavail to the range [0, lotsfree]. */
                if (vavail < 0)
                        vavail = 0;
                if (vavail > lotsfree)
                        vavail = lotsfree;

                /*
                 * Fix for 1161438 (CRS SPR# 73922).  All variables
                 * in the original calculation for desscan were 32 bit signed
                 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
                 * more of memory, the calculation can overflow.  When this
                 * happens, desscan becomes negative and pageout_scanner()
                 * stops paging out.
                 */
                if ((needfree) && (pageout_new_spread == 0)) {
                        /*
                         * If we've not yet collected enough samples to
                         * calculate a spread, use the old logic of kicking
                         * into high gear anytime needfree is non-zero.
                         */
                        desscan = fastscan / RATETOSCHEDPAGING;
                } else {
                        /*
                         * Once we've calculated a spread based on system
                         * memory and usage, just treat needfree as another
                         * form of deficit.
                         */
                        spgcnt_t faststmp, slowstmp, result;

                        /* Linear interpolation between slowscan and fastscan. */
                        slowstmp = slowscan * vavail;
                        faststmp = fastscan * (lotsfree - vavail);
                        result = (slowstmp + faststmp) /
                            nz(lotsfree) / RATETOSCHEDPAGING;
                        desscan = (pgcnt_t)result;
                }

                /* CPU-time budget scales the same way with memory pressure. */
                pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
                    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);

                if (freemem < lotsfree + needfree ||
                    pageout_sample_cnt < pageout_sample_lim) {
                        /*
                         * Either memory is short or we are still sampling
                         * scan rates: wake the scanner thread.
                         */
                        TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
                            "pageout_cv_signal:freemem %ld", freemem);
                        cv_signal(&proc_pageout->p_cv);
                } else {
                        /*
                         * There are enough free pages, no need to
                         * kick the scanner thread.  And next time
                         * around, keep more of the `highly shared'
                         * pages.
                         */
                        cv_signal_pageout();
                        if (po_share > MIN_PO_SHARE) {
                                po_share >>= 1;
                        }
                }
                mutex_exit(&pageout_mutex);
        }

        /*
         * Signal threads waiting for available memory.
         * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
         * in this case it is not needed - the waiters will be woken up during
         * the next invocation of this function.
         */
        if (kmem_avail() > 0)
                cv_broadcast(&memavail_cv);

        /* Re-arm ourselves to run again in 1/RATETOSCHEDPAGING second. */
        (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
}
 616 
pgcnt_t         pushes;         /* successful VOP_PUTPAGE pushes this cycle */
ulong_t         push_list_size;         /* # of requests on pageout queue */

/* Clock-hand identifiers — presumably checkpage()'s int argument; confirm. */
#define FRONT   1
#define BACK    2

int dopageout = 1;      /* must be non-zero to turn page stealing on */
 624 
/*
 * The page out daemon, which runs as process 2.
 *
 * As long as there are at least lotsfree pages,
 * this process is not run.  When the number of free
 * pages stays in the range desfree to lotsfree,
 * this daemon runs through the pages in the loop
 * at a rate determined in schedpaging().  Pageout manages
 * two hands on the clock.  The front hand moves through
 * memory, clearing the reference bit,
 * and stealing pages from procs that are over maxrss.
 * The back hand travels a distance behind the front hand,
 * freeing the pages that have not been referenced in the time
 * since the front hand passed.  If modified, they are pushed to
 * swap before being freed.
 *
 * There are 2 threads that act on behalf of the pageout process.
 * One thread scans pages (pageout_scanner) and frees them up if
 * they don't require any VOP_PUTPAGE operation. If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signaled. The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the scanner thread can still operate. There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * For now, this thing is in very rough form.
 *
 * This function never returns; it becomes the body of the pageout
 * push-servicing loop.
 */
void
pageout()
{
        struct async_reqs *arg;
        pri_t pageout_pri;
        int i;
        pgcnt_t max_pushes;
        callb_cpr_t cprinfo;

        /* Identify ourselves: zero accounting times, set name strings. */
        proc_pageout = ttoproc(curthread);
        proc_pageout->p_cstime = 0;
        proc_pageout->p_stime =  0;
        proc_pageout->p_cutime =  0;
        proc_pageout->p_utime = 0;
        bcopy("pageout", PTOU(curproc)->u_psargs, 8);   /* incl. the NUL */
        bcopy("pageout", PTOU(curproc)->u_comm, 7);

        /*
         * Create pageout scanner thread
         */
        mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Allocate and initialize the async request structures
         * for pageout.
         */
        push_req = (struct async_reqs *)
            kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

        /*
         * Chain all requests onto the freelist; the final element's
         * a_next is already NULL courtesy of kmem_zalloc().
         */
        req_freelist = push_req;
        for (i = 0; i < async_list_size - 1; i++)
                push_req[i].a_next = &push_req[i + 1];

        pageout_pri = curthread->t_pri;

        /* Create the pageout scanner thread (one priority level below us). */
        (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
            pageout_pri - 1);

        /*
         * kick off pageout scheduler.
         */
        schedpaging(NULL);

        /*
         * Create kernel cage thread.
         * The kernel cage thread is started under the pageout process
         * to take advantage of the less restricted page allocation
         * in page_create_throttle().
         */
        kcage_cageout_init();

        /*
         * Limit pushes to avoid saturating pageout devices.
         */
        max_pushes = maxpgio / RATETOSCHEDPAGING;
        CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

        for (;;) {
                mutex_enter(&push_lock);

                /*
                 * Sleep while there is no work, or while we have already
                 * pushed our per-cycle quota; each wakeup resets the quota.
                 */
                while ((arg = push_list) == NULL || pushes > max_pushes) {
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        cv_wait(&push_cv, &push_lock);
                        pushes = 0;
                        CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
                }
                /* Dequeue the request before dropping the lock. */
                push_list = arg->a_next;
                arg->a_next = NULL;
                mutex_exit(&push_lock);

                if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
                    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
                        pushes++;
                }

                /* vp held by checkpage() */
                VN_RELE(arg->a_vp);

                mutex_enter(&push_lock);
                arg->a_next = req_freelist;  /* back on freelist */
                req_freelist = arg;
                push_list_size--;
                mutex_exit(&push_lock);
        }
}
 741 
 742 /*
 743  * Kernel thread that scans pages looking for ones to free
 744  */
 745 static void
 746 pageout_scanner(void)
 747 {
 748         struct page *fronthand, *backhand;
 749         uint_t count;
 750         callb_cpr_t cprinfo;
 751         pgcnt_t nscan_limit;
 752         pgcnt_t pcount;
 753 
 754         CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
 755         mutex_enter(&pageout_mutex);
 756 
 757         /*
 758          * The restart case does not attempt to point the hands at roughly
 759          * the right point on the assumption that after one circuit things
 760          * will have settled down - and restarts shouldn't be that often.
 761          */
 762 
 763         /*
 764          * Set the two clock hands to be separated by a reasonable amount,
 765          * but no more than 360 degrees apart.
 766          */
 767         backhand = page_first();
 768         if (handspreadpages >= total_pages)
 769                 fronthand = page_nextn(backhand, total_pages - 1);
 770         else
 771                 fronthand = page_nextn(backhand, handspreadpages);
 772 
 773         min_pageout_ticks = MAX(1,
 774             ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
 775         max_pageout_ticks = MAX(min_pageout_ticks,
 776             ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
 777 
 778 loop:
 779         cv_signal_pageout();
 780 
 781         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 782         cv_wait(&proc_pageout->p_cv, &pageout_mutex);
 783         CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
 784 
 785         if (!dopageout)
 786                 goto loop;
 787 
 788         if (reset_hands) {
 789                 reset_hands = 0;
 790 
 791                 backhand = page_first();
 792                 if (handspreadpages >= total_pages)
 793                         fronthand = page_nextn(backhand, total_pages - 1);
 794                 else
 795                         fronthand = page_nextn(backhand, handspreadpages);
 796         }
 797 
 798         CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
 799         count = 0;
 800 
 801         TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
 802             "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
 803             freemem, lotsfree, nscan, desscan);
 804 
 805         /* Kernel probe */
 806         TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
 807             tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
 808 
 809         pcount = 0;
 810         if (pageout_sample_cnt < pageout_sample_lim) {
 811                 nscan_limit = total_pages;
 812         } else {
 813                 nscan_limit = desscan;
 814         }
 815         pageout_lbolt = ddi_get_lbolt();
 816         sample_start = gethrtime();
 817 
 818         /*
 819          * Scan the appropriate number of pages for a single duty cycle.
 820          * However, stop scanning as soon as there is enough free memory.
 821          * For a short while, we will be sampling the performance of the
 822          * scanner and need to keep running just to get sample data, in
 823          * which case we keep going and don't pay attention to whether
 824          * or not there is enough free memory.
 825          */
 826 
 827         while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
 828             pageout_sample_cnt < pageout_sample_lim)) {
 829                 int rvfront, rvback;
 830 
 831                 /*
 832                  * Check to see if we have exceeded our %CPU budget
 833                  * for this wakeup, but not on every single page visited,
 834                  * just every once in a while.
 835                  */
 836                 if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
 837                         pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
 838                         if (pageout_cycle_ticks >= pageout_ticks) {
 839                                 ++pageout_timeouts;
 840                                 break;
 841                         }
 842                 }
 843 
 844                 /*
 845                  * If checkpage manages to add a page to the free list,
 846                  * we give ourselves another couple of trips around the loop.
 847                  */
 848                 if ((rvfront = checkpage(fronthand, FRONT)) == 1)
 849                         count = 0;
 850                 if ((rvback = checkpage(backhand, BACK)) == 1)
 851                         count = 0;
 852 
 853                 ++pcount;
 854 
 855                 /*
 856                  * protected by pageout_mutex instead of cpu_stat_lock
 857                  */
 858                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
 859 
 860                 /*
 861                  * Don't include ineligible pages in the number scanned.
 862                  */
 863                 if (rvfront != -1 || rvback != -1)
 864                         nscan++;
 865 
 866                 backhand = page_next(backhand);
 867 
 868                 /*
 869                  * backhand update and wraparound check are done separately
 870                  * because lint barks when it finds an empty "if" body
 871                  */
 872 
 873                 if ((fronthand = page_next(fronthand)) == page_first()) {
 874                         TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
 875                             "pageout_hand_wrap:freemem %ld whichhand %d",
 876                             freemem, FRONT);
 877 
 878                         /*
 879                          * protected by pageout_mutex instead of cpu_stat_lock
 880                          */
 881                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
 882                         if (++count > 1) {
 883                                 /*
 884                                  * Extremely unlikely, but it happens.
 885                                  * We went around the loop at least once
 886                                  * and didn't get far enough.
 887                                  * If we are still skipping `highly shared'
 888                                  * pages, skip fewer of them.  Otherwise,
 889                                  * give up till the next clock tick.
 890                                  */
 891                                 if (po_share < MAX_PO_SHARE) {
 892                                         po_share <<= 1;
 893                                 } else {
 894                                         /*
 895                                          * Really a "goto loop", but
 896                                          * if someone is TRACing or
 897                                          * TNF_PROBE_ing, at least
 898                                          * make records to show
 899                                          * where we are.
 900                                          */
 901                                         break;
 902                                 }
 903                         }
 904                 }
 905         }
 906 
 907         sample_end = gethrtime();
 908 
 909         TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
 910             "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
 911             freemem, lotsfree, nscan, desscan, count);
 912 
 913         /* Kernel probe */
 914         TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
 915             tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
 916 
 917         if (pageout_sample_cnt < pageout_sample_lim) {
 918                 pageout_sample_pages += pcount;
 919                 pageout_sample_etime += sample_end - sample_start;
 920                 ++pageout_sample_cnt;
 921         }
 922         if (pageout_sample_cnt >= pageout_sample_lim &&
 923             pageout_new_spread == 0) {
 924                 pageout_rate = (hrrate_t)pageout_sample_pages *
 925                     (hrrate_t)(NANOSEC) / pageout_sample_etime;
 926                 pageout_new_spread = pageout_rate / 10;
 927                 setupclock(1);
 928         }
 929 
 930         goto loop;
 931 }
 932 
 933 /*
 934  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 935  * system (u., page table) or free, then leave it alone.  Otherwise,
 936  * if we are running the front hand, turn off the page's reference bit.
 937  * If the proc is over maxrss, we take it.  If running the back hand,
 938  * check whether the page has been reclaimed.  If not, free the page,
 939  * pushing it to disk first if necessary.
 940  *
 941  * Return values:
 942  *      -1 if the page is not a candidate at all,
 943  *       0 if not freed, or
 944  *       1 if we freed it.
 945  */
 946 static int
 947 checkpage(struct page *pp, int whichhand)
 948 {
 949         int ppattr;
 950         int isfs = 0;
 951         int isexec = 0;
 952         int pagesync_flag;
 953 
 954         /*
 955          * Skip pages:
 956          *      - associated with the kernel vnode since
 957          *          they are always "exclusively" locked.
 958          *      - that are free
 959          *      - that are shared more than po_share'd times
 960          *      - its already locked
 961          *
 962          * NOTE:  These optimizations assume that reads are atomic.
 963          */
 964 
 965         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
 966             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
 967             hat_page_checkshare(pp, po_share)) {
 968                 return (-1);
 969         }
 970 
 971         if (!page_trylock(pp, SE_EXCL)) {
 972                 /*
 973                  * Skip the page if we can't acquire the "exclusive" lock.
 974                  */
 975                 return (-1);
 976         } else if (PP_ISFREE(pp)) {
 977                 /*
 978                  * It became free between the above check and our actually
 979                  * locking the page.  Oh, well there will be other pages.
 980                  */
 981                 page_unlock(pp);
 982                 return (-1);
 983         }
 984 
 985         /*
 986          * Reject pages that cannot be freed. The page_struct_lock
 987          * need not be acquired to examine these
 988          * fields since the page has an "exclusive" lock.
 989          */
 990         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 991                 page_unlock(pp);
 992                 return (-1);
 993         }
 994 
 995         /*
 996          * Maintain statistics for what we are freeing
 997          */
 998 
 999         if (pp->p_vnode != NULL) {
1000                 if (pp->p_vnode->v_flag & VVMEXEC)
1001                         isexec = 1;
1002 
1003                 if (!IS_SWAPFSVP(pp->p_vnode))
1004                         isfs = 1;
1005         }
1006 
1007         /*
1008          * Turn off REF and MOD bits with the front hand.
1009          * The back hand examines the REF bit and always considers
1010          * SHARED pages as referenced.
1011          */
1012         if (whichhand == FRONT)
1013                 pagesync_flag = HAT_SYNC_ZERORM;
1014         else
1015                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1016                     HAT_SYNC_STOPON_SHARED;
1017 
1018         ppattr = hat_pagesync(pp, pagesync_flag);
1019 
1020 recheck:
1021         /*
1022          * If page is referenced; make unreferenced but reclaimable.
1023          * If this page is not referenced, then it must be reclaimable
1024          * and we can add it to the free list.
1025          */
1026         if (ppattr & P_REF) {
1027                 TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
1028                     "pageout_isref:pp %p whichhand %d", pp, whichhand);
1029                 if (whichhand == FRONT) {
1030                         /*
1031                          * Checking of rss or madvise flags needed here...
1032                          *
1033                          * If not "well-behaved", fall through into the code
1034                          * for not referenced.
1035                          */
1036                         hat_clrref(pp);
1037                 }
1038                 /*
1039                  * Somebody referenced the page since the front
1040                  * hand went by, so it's not a candidate for
1041                  * freeing up.
1042                  */
1043                 page_unlock(pp);
1044                 return (0);
1045         }
1046 
1047         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1048 
1049         /*
1050          * If large page, attempt to demote it. If successfully demoted,
1051          * retry the checkpage.
1052          */
1053         if (pp->p_szc != 0) {
1054                 if (!page_try_demote_pages(pp)) {
1055                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1056                         page_unlock(pp);
1057                         return (-1);
1058                 }
1059                 ASSERT(pp->p_szc == 0);
1060                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1061                 /*
1062                  * since page_try_demote_pages() could have unloaded some
1063                  * mappings it makes sense to reload ppattr.
1064                  */
1065                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1066         }
1067 
1068         /*
1069          * If the page is currently dirty, we have to arrange
1070          * to have it cleaned before it can be freed.
1071          *
1072          * XXX - ASSERT(pp->p_vnode != NULL);
1073          */
1074         if ((ppattr & P_MOD) && pp->p_vnode) {
1075                 struct vnode *vp = pp->p_vnode;
1076                 u_offset_t offset = pp->p_offset;
1077 
1078                 /*
1079                  * XXX - Test for process being swapped out or about to exit?
1080                  * [Can't get back to process(es) using the page.]
1081                  */
1082 
1083                 /*
1084                  * Hold the vnode before releasing the page lock to
1085                  * prevent it from being freed and re-used by some
1086                  * other thread.
1087                  */
1088                 VN_HOLD(vp);
1089                 page_unlock(pp);
1090 
1091                 /*
1092                  * Queue i/o request for the pageout thread.
1093                  */
1094                 if (!queue_io_request(vp, offset)) {
1095                         VN_RELE(vp);
1096                         return (0);
1097                 }
1098                 return (1);
1099         }
1100 
1101         /*
1102          * Now we unload all the translations,
1103          * and put the page back on to the free list.
1104          * If the page was used (referenced or modified) after
1105          * the pagesync but before it was unloaded we catch it
1106          * and handle the page properly.
1107          */
1108         TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
1109             "pageout_free:pp %p whichhand %d", pp, whichhand);
1110         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1111         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1112         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
1113                 goto recheck;
1114 
1115         /*LINTED: constant in conditional context*/
1116         VN_DISPOSE(pp, B_FREE, 0, kcred);
1117 
1118         CPU_STATS_ADD_K(vm, dfree, 1);
1119 
1120         if (isfs) {
1121                 if (isexec) {
1122                         CPU_STATS_ADD_K(vm, execfree, 1);
1123                 } else {
1124                         CPU_STATS_ADD_K(vm, fsfree, 1);
1125                 }
1126         } else {
1127                 CPU_STATS_ADD_K(vm, anonfree, 1);
1128         }
1129 
1130         return (1);             /* freed a page! */
1131 }
1132 
1133 /*
1134  * Queue async i/o request from pageout_scanner and segment swapout
1135  * routines on one common list.  This ensures that pageout devices (swap)
1136  * are not saturated by pageout_scanner or swapout requests.
1137  * The pageout thread empties this list by initiating i/o operations.
1138  */
1139 int
1140 queue_io_request(vnode_t *vp, u_offset_t off)
1141 {
1142         struct async_reqs *arg;
1143 
1144         /*
1145          * If we cannot allocate an async request struct,
1146          * skip this page.
1147          */
1148         mutex_enter(&push_lock);
1149         if ((arg = req_freelist) == NULL) {
1150                 mutex_exit(&push_lock);
1151                 return (0);
1152         }
1153         req_freelist = arg->a_next;          /* adjust freelist */
1154         push_list_size++;
1155 
1156         arg->a_vp = vp;
1157         arg->a_off = off;
1158         arg->a_len = PAGESIZE;
1159         arg->a_flags = B_ASYNC | B_FREE;
1160         arg->a_cred = kcred;         /* always held */
1161 
1162         /*
1163          * Add to list of pending write requests.
1164          */
1165         arg->a_next = push_list;
1166         push_list = arg;
1167 
1168         if (req_freelist == NULL) {
1169                 /*
1170                  * No free async requests left. The lock is held so we
1171                  * might as well signal the pusher thread now.
1172                  */
1173                 cv_signal(&push_cv);
1174         }
1175         mutex_exit(&push_lock);
1176         return (1);
1177 }
1178 
1179 /*
1180  * Wakeup pageout to initiate i/o if push_list is not empty.
1181  */
1182 void
1183 cv_signal_pageout()
1184 {
1185         if (push_list != NULL) {
1186                 mutex_enter(&push_lock);
1187                 cv_signal(&push_cv);
1188                 mutex_exit(&push_lock);
1189         }
1190 }