/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/zone.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation.  The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree.  At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory.  This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory.  See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory.  Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory.  Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI.
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire.  The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve.  For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed.  It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked.  The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */
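
/*
 * To make the ladder above concrete, here is a sketch of the default
 * sizing, assuming no operator overrides in /etc/system.  This is a
 * simplified restatement of what setupclock() below computes, not a
 * separate implementation:
 *
 *      lotsfree        = clamp(physmem / 64, btop(16MB), btop(2GB));
 *      desfree         = lotsfree / 2;
 *      minfree         = 3 * desfree / 4;
 *      throttlefree    = minfree;
 *      pageout_reserve = 3 * throttlefree / 4;
 */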

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in the loaded kernel image they are left alone, and may thus be changed per
 * system using "mdb -kw" on the running system.
 */
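
/*
 * For example, to lower the scan floor on a running system (an
 * illustrative mdb session; slowscan is a 64-bit pgcnt_t here, hence
 * the 8-byte /Z write format, and 0t denotes a decimal value):
 *
 *      # mdb -kw
 *      > slowscan/Z 0t50
 */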
pgcnt_t         slowscan = 0;
pgcnt_t         fastscan = 0;

static pgcnt_t  handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t   loopfraction = 2;
static pgcnt_t  looppages;

static uint_t   min_percent_cpu = 4;
static uint_t   max_percent_cpu = 80;
static pgcnt_t  maxfastscan = 0;
static pgcnt_t  maxslowscan = 100;

#define         MEGABYTES               (1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *     set to 1 to use the previous default threshold size calculation;
 *     i.e., each threshold is half of the next largest value.
 */
uint_t          pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t          lotsfree_fraction = 64;
pgcnt_t         lotsfree_min = 0;
pgcnt_t         lotsfree_max = 0;

#define         LOTSFREE_MIN_DEFAULT    (16 * MEGABYTES)
#define         LOTSFREE_MAX_DEFAULT    (2048 * MEGABYTES)

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t         maxpgio = 0;
pgcnt_t         minfree = 0;
pgcnt_t         desfree = 0;
pgcnt_t         lotsfree = 0;
pgcnt_t         needfree = 0;
pgcnt_t         throttlefree = 0;
pgcnt_t         pageout_reserve = 0;
pri_t           pageout_pri;

pgcnt_t         deficit;
pgcnt_t         nscan;
pgcnt_t         desscan;

/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;

#define MAX_PSCAN_THREADS       16

/*
 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 * that gives the equivalent of some underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *     nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *     nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *     Number of nanoseconds budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_nsec and max_pageout_nsec,
 *     depending on memory pressure or zones over their cap.
 *
 * zone_pageout_nsec:
 *     Number of nanoseconds budgeted for each cycle when a zone
 *     is over its memory cap.  If this is zero, then the value
 *     of max_pageout_nsec is used instead.
 */
static hrtime_t min_pageout_nsec;
static hrtime_t max_pageout_nsec;
static hrtime_t pageout_nsec;
static hrtime_t zone_pageout_nsec;
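
/*
 * With the defaults above (min_percent_cpu = 4, max_percent_cpu = 80,
 * SCHEDPAGING_HZ = 4 wakeups/second), setupclock() below works out to:
 *
 *      min_pageout_nsec = NANOSEC * 4 / 100 / 4   =  10ms per wakeup
 *      max_pageout_nsec = NANOSEC * 80 / 100 / 4  = 200ms per wakeup
 */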

static boolean_t        reset_hands[MAX_PSCAN_THREADS];

#define PAGES_POLL_MASK 1023
#define SCHEDPAGING_HZ  4

/*
 * despagescanners:
 *      The desired number of page scanner threads. The value can be set in
 *      /etc/system or tuned directly with 'mdb -kw'.  The system will bring
 *      the actual number of threads into line with the desired number. If set
 *      to an invalid value, the system will correct the setting.
 */
uint_t despagescanners = 0;

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value for new
 *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *     handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough, set new
 *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds for the sample.
 *
 * pageout_rate:
 *     Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     Initially zero while the system scan rate is measured by
 *     pageout_scanner(), which then sets this value once per system boot after
 *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 *     new value is used for fastscan and handspreadpages.
 */
typedef hrtime_t hrrate_t;

static uint64_t pageout_sample_lim = 4;
static uint64_t pageout_sample_cnt = 0;
static pgcnt_t  pageout_sample_pages = 0;
static hrtime_t pageout_sample_etime = 0;
static hrrate_t pageout_rate = 0;
static pgcnt_t  pageout_new_spread = 0;

/* True if the page scanner is first starting up */
#define PAGE_SCAN_STARTUP       (pageout_sample_cnt < pageout_sample_lim)
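
/*
 * A sketch of the shape of the post-sampling calculation, which happens
 * in pageout_scanner() beyond this excerpt (illustrative, derived from
 * the definitions above rather than quoted from that code):
 *
 *      pageout_rate = (hrrate_t)pageout_sample_pages * NANOSEC /
 *          pageout_sample_etime;
 *
 * The NANOSEC scaling avoids truncating the rate to zero in integer
 * division; pageout_new_spread is then derived from this rate and
 * setupclock() is re-run to re-seed fastscan and handspreadpages.
 */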

/* The current number of page scanner threads */
static uint_t n_page_scanners = 1;
/* The number of page scanner threads that are actively scanning. */
static uint_t pageouts_running;

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages. This is only done when scanning under low
 * free memory conditions, not when scanning for zones over their cap.
 */
uint64_t        pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
        ulong_t checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t        memavail_lock;
kcondvar_t      memavail_cv;
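
/*
 * A hypothetical waiter, for illustration only ("needed_pages" is not a
 * real variable in this file): a thread that is short on memory parks
 * on memavail_cv until schedpaging() broadcasts that pages are
 * available.
 *
 *      mutex_enter(&memavail_lock);
 *      while (freemem < needed_pages)
 *              cv_wait(&memavail_cv, &memavail_lock);
 *      mutex_exit(&memavail_lock);
 */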

typedef enum pageout_hand {
        POH_FRONT = 1,
        POH_BACK,
} pageout_hand_t;

typedef enum {
        CKP_INELIGIBLE,
        CKP_NOT_FREED,
        CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
        bool ci_init;
        pgcnt_t ci_lotsfree_min;
        pgcnt_t ci_lotsfree_max;
        pgcnt_t ci_lotsfree;
        pgcnt_t ci_desfree;
        pgcnt_t ci_minfree;
        pgcnt_t ci_throttlefree;
        pgcnt_t ci_pageout_reserve;
        pgcnt_t ci_maxpgio;
        pgcnt_t ci_maxfastscan;
        pgcnt_t ci_fastscan;
        pgcnt_t ci_slowscan;
        pgcnt_t ci_handspreadpages;
        uint_t  ci_despagescanners;
} clockinit = { .ci_init = false };

static inline pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
        if (value < minimum) {
                return (minimum);
        } else if (value > maximum) {
                return (maximum);
        } else {
                return (value);
        }
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
        if (initval == 0 || initval >= initval_ceiling) {
                return (defval);
        } else {
                return (initval);
        }
}

/*
 * Local boolean to control scanning when zones are over their cap.  This
 * avoids accessing the zone_num_over_cap variable except within
 * schedpaging(), which only runs periodically; zone_num_over_cap is already
 * accessed heavily during paging, and the page scanner checks zones_over for
 * each page during a scan.  No lock is needed for zone_num_over_cap since
 * schedpaging() doesn't modify the variable; it only cares whether it is 0
 * or non-0.
 */
static boolean_t zones_over = B_FALSE;

/*
 * On large memory systems, multiple instances of the page scanner are run,
 * each responsible for a separate region of memory. This speeds up page
 * invalidation under low memory conditions.
 *
 * despagescanners can be set in /etc/system or via mdb and it will
 * be used as a guide for how many page scanners to create; the value
 * will be adjusted if it is not sensible. Otherwise, the number of
 * page scanners is determined dynamically based on handspreadpages.
 */
static void
recalc_pagescanners(void)
{
        pgcnt_t sz;
        uint_t des;

        /* If the initial calibration has not been done, take no action. */
        if (pageout_new_spread == 0)
                return;

        /*
         * If the desired number of scanners is set in /etc/system
         * then try to use it.
         */
        if (despagescanners == 0 && clockinit.ci_despagescanners != 0)
                despagescanners = clockinit.ci_despagescanners;

        if (despagescanners != 0) {
                /*
                 * We have a desired number of page scanners, either from
                 * /etc/system or set via mdb.  Try to use it (it will be
                 * clamped below).
                 */
                des = despagescanners;
        } else {
                /*
                 * Calculate the number of desired scanners based on the
                 * system's memory size.
                 *
                 * A 64GiB region size is used as the basis for calculating how
                 * many scanner threads should be created. For systems with up
                 * to 64GiB of RAM, a single thread is used; for very large
                 * memory systems the threads are limited to MAX_PSCAN_THREADS.
                 */
                sz = btop(64ULL << 30);

                if (sz > looppages) {
                        des = 1;
                } else {
                        pgcnt_t tmp = sz;

                        for (des = 1; tmp < looppages; des++)
                                tmp += sz;
                }
        }

        /*
         * Clamp the number of scanners so that we have no more than
         * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
         * than handspreadpages.
         */
        des = clamp(des, 1,
            looppages / (handspreadpages + handspreadpages / 10));
        despagescanners = clamp(des, 1, MAX_PSCAN_THREADS);
}
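
/*
 * Worked example: on a machine with 256GiB of memory and no override,
 * sz = btop(64GiB) and the loop above steps tmp through 64, 128, 192
 * and 256GiB, yielding des = 4 scanner threads before clamping.
 */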

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
        bool half = (pageout_threshold_style == 1);
        bool recalc = true;

        looppages = total_pages;

        /*
         * The operator may have provided specific values for some of the
         * tunables via /etc/system.  On our first call, we preserve those
         * values so that they can be used for subsequent recalculations.
         *
         * A value of zero for any tunable means we will use the default
         * sizing.
         */
        if (!clockinit.ci_init) {
                clockinit.ci_init = true;

                clockinit.ci_lotsfree_min = lotsfree_min;
                clockinit.ci_lotsfree_max = lotsfree_max;
                clockinit.ci_lotsfree = lotsfree;
                clockinit.ci_desfree = desfree;
                clockinit.ci_minfree = minfree;
                clockinit.ci_throttlefree = throttlefree;
                clockinit.ci_pageout_reserve = pageout_reserve;
                clockinit.ci_maxpgio = maxpgio;
                clockinit.ci_maxfastscan = maxfastscan;
                clockinit.ci_fastscan = fastscan;
                clockinit.ci_slowscan = slowscan;
                clockinit.ci_handspreadpages = handspreadpages;
                clockinit.ci_despagescanners = despagescanners;

                /*
                 * The first call does not trigger a recalculation, only
                 * subsequent calls.
                 */
                recalc = false;
        }

        /*
         * Configure paging threshold values.  For more details on what each
         * threshold signifies, see the comments at the top of this file.
         */
        lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
            btop(LOTSFREE_MAX_DEFAULT));
        lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
            btop(LOTSFREE_MIN_DEFAULT));

        lotsfree = tune(clockinit.ci_lotsfree, looppages,
            clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

        desfree = tune(clockinit.ci_desfree, lotsfree,
            lotsfree / 2);

        minfree = tune(clockinit.ci_minfree, desfree,
            half ? desfree / 2 : 3 * desfree / 4);

        throttlefree = tune(clockinit.ci_throttlefree, desfree,
            minfree);

        pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
            half ? throttlefree / 2 : 3 * throttlefree / 4);

        /*
         * maxpgio bounds how much paging I/O is considered acceptable.
         * The historical calculation assumes one page operation per disk
         * revolution and that keeping the arm 2/3 busy is all that is
         * tolerable for paging.
         *
         * XXX - Does not account for multiple swap devices.
         */
        if (clockinit.ci_maxpgio == 0) {
                maxpgio = (DISKRPM * 2) / 3;
        } else {
                maxpgio = clockinit.ci_maxpgio;
        }
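
        /*
         * Assuming the historical DISKRPM value of 60 revolutions per
         * second, this default works out to maxpgio = (60 * 2) / 3 = 40
         * page I/Os per second.
         */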

        /*
         * The clock scan rate varies between fastscan and slowscan
         * based on the amount of free memory available.  Fastscan
         * rate should be set based on the number of pages that can be
         * scanned per sec using ~10% of processor time.  Since this
         * value depends on the processor, MMU, MHz etc., it is
         * difficult to determine it in a generic manner for all
         * architectures.
         *
         * Instead of trying to determine the number of pages scanned
         * per sec for every processor, fastscan is set to be the smaller
         * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
         * time is limited to ~4% of processor time.
         *
         * Setting fastscan to be 1/2 of memory allows pageout to scan
         * all of memory in ~2 secs.  This implies that user pages not
         * accessed within 1 sec (assuming handspreadpages == fastscan)
         * can be reclaimed when free memory is very low.  Stealing pages
         * not accessed within 1 sec seems reasonable and ensures that
         * active user processes don't thrash.
         *
         * Smaller values of fastscan result in scanning fewer pages
         * every second and consequently pageout may not be able to free
         * sufficient memory to maintain the minimum threshold.  Larger
         * values of fastscan result in scanning a lot more pages which
         * could lead to thrashing and higher CPU usage.
         *
         * Fastscan needs to be limited to a maximum value and should not
         * scale with memory to prevent pageout from consuming too much
         * time for scanning on slow CPUs and avoid thrashing, as a
         * result of scanning too many pages, on faster CPUs.
         * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
         * (the upper bound for fastscan) based on the average number
         * of pages that can potentially be scanned in ~1 sec (using ~4%
         * of the CPU) on some of the following machines that currently
         * run Solaris 2.x:
         *
         *                      average memory scanned in ~1 sec
         *
         *      25 MHz SS1+:            23 Meg
         *      LX:                     37 Meg
         *      50 MHz SC2000:          68 Meg
         *
         *      40 MHz 486:             26 Meg
         *      66 MHz 486:             42 Meg
         *
         * When free memory falls just below lotsfree, the scan rate
         * goes from 0 to slowscan (i.e., pageout starts running).  This
         * transition needs to be smooth and is achieved by ensuring that
         * pageout scans a small number of pages to satisfy the transient
         * memory demand.  This is set to not exceed 100 pages/sec (25 per
         * wakeup) since scanning that many pages has no noticeable impact
         * on system performance.
         *
         * In addition to setting fastscan and slowscan, pageout is
         * limited to using ~4% of the CPU.  This results in increasing
         * the time taken to scan all of memory, which in turn means that
         * user processes have a better opportunity of preventing their
         * pages from being stolen.  This has a positive effect on
         * interactive and overall system performance when memory demand
         * is high.
         *
         * Thus, the rate at which pages are scanned for replacement will
         * vary linearly between slowscan and the number of pages that
         * can be scanned using ~4% of processor time instead of varying
         * linearly between slowscan and fastscan.
         *
         * Also, the processor time used by pageout will vary from ~1%
         * at slowscan to ~4% at fastscan instead of varying between
         * ~1% at slowscan and ~10% at fastscan.
         *
         * The values chosen for the various VM parameters (fastscan,
         * handspreadpages, etc) are not universally true for all machines,
         * but appear to be a good rule of thumb for the machines we've
         * tested.  They have the following ranges:
         *
         *      cpu speed:      20 to 70 MHz
         *      page size:      4K to 8K
         *      memory size:    16M to 5G
         *      page scan rate: 4000 - 17400 4K pages per sec
         *
         * The values need to be re-examined for machines which don't
         * fall into the various ranges (e.g., slower or faster CPUs,
         * smaller or larger pagesizes etc) shown above.
         *
         * On an MP machine, pageout is often unable to maintain the
         * minimum paging thresholds under heavy load.  This is due to
         * the fact that user processes running on other CPUs can be
         * dirtying memory at a much faster pace than pageout can find
         * pages to free.  The memory demands could be met by enabling
         * more than one CPU to run the clock algorithm in such a manner
         * that the various clock hands don't overlap.  This also makes
         * it more difficult to determine the values for fastscan, slowscan
         * and handspreadpages.
         *
         * The swapper is currently used to free up memory when pageout
         * is unable to meet memory demands by swapping out processes.
         * In addition to freeing up memory, swapping also reduces the
         * demand for memory by preventing user processes from running
         * and thereby consuming memory.
         */
        if (clockinit.ci_maxfastscan == 0) {
                if (pageout_new_spread != 0) {
                        maxfastscan = pageout_new_spread;
                } else {
                        maxfastscan = MAXHANDSPREADPAGES;
                }
        } else {
                maxfastscan = clockinit.ci_maxfastscan;
        }

        if (clockinit.ci_fastscan == 0) {
                fastscan = MIN(looppages / loopfraction, maxfastscan);
        } else {
                fastscan = clockinit.ci_fastscan;
        }

        if (fastscan > looppages / loopfraction) {
                fastscan = looppages / loopfraction;
        }

        /*
         * Set the slow scan rate to 1/10 of the fast scan rate, but
         * do not exceed maxslowscan.
         */
        if (clockinit.ci_slowscan == 0) {
                slowscan = MIN(fastscan / 10, maxslowscan);
        } else {
                slowscan = clockinit.ci_slowscan;
        }

        if (slowscan > fastscan / 2) {
                slowscan = fastscan / 2;
        }

        /*
         * Handspreadpages is the distance (in pages) between front and back
         * pageout daemon hands.  The amount of time to reclaim a page
         * once pageout examines it increases with this distance and
         * decreases as the scan rate rises.  It must be less than the amount
         * of pageable memory.
         *
         * Since pageout is limited to ~4% of the CPU, setting handspreadpages
         * to be "fastscan" results in the front hand being a few secs
         * (varies based on the processor speed) ahead of the back hand
         * at fastscan rates.  This distance can be further reduced, if
         * necessary, by increasing the processor time used by pageout
         * to be more than ~4% and preferably not more than ~10%.
         *
         * As a result, user processes have a much better chance of
         * referencing their pages before the back hand examines them.
         * This also significantly lowers the number of reclaims from
         * the freelist since pageout does not end up freeing pages which
         * may be referenced a sec later.
         */
        if (clockinit.ci_handspreadpages == 0) {
                handspreadpages = fastscan;
        } else {
                handspreadpages = clockinit.ci_handspreadpages;
        }

        /*
         * Make sure that back hand follows front hand by at least
         * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
         * back hand to look at a page during the same wakeup of the pageout
         * daemon in which the front hand cleared its ref bit.
         */
        if (handspreadpages >= looppages) {
                handspreadpages = looppages - 1;
        }

        /*
         * Establish the minimum and maximum length of time to be spent
         * scanning pages per wakeup, limiting the scanner duty cycle.  The
         * input percentage values (0-100) must be converted to a fraction of
         * the number of nanoseconds in a second of wall time, then further
         * scaled down by the number of scanner wakeups in a second.
         */
        min_pageout_nsec = MAX(1,
            NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
        max_pageout_nsec = MAX(min_pageout_nsec,
            NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);

        /*
         * If not called for recalculation, return and skip the remaining
         * steps.
         */
        if (!recalc)
                return;

        /*
         * Set a flag to re-evaluate the clock hand positions.
         */
        for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
                reset_hands[i] = B_TRUE;

        recalc_pagescanners();
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
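
/*
 * As a worked example of the interpolation performed below: with
 * lotsfree = 4000 pages, vavail = 1000 (three quarters of the way to
 * empty), slowscan = 100 and fastscan = 8000, schedpaging() computes
 *
 *      desscan = (100 * 1000 + 8000 * 3000) / 4000 / SCHEDPAGING_HZ
 *              = 6025 / 4 ~= 1506 pages per wakeup.
 */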

static kmutex_t pageout_mutex;

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist; /* available req structs */
static struct async_reqs *push_list;    /* pending reqs */
static kmutex_t push_lock;              /* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;
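
/*
 * A sketch of how these counters drive the deadman, restated in
 * simplified form (pageout_deadman() itself lives outside this
 * excerpt): once per second, if a push is in progress and
 * pageout_pushcount has not advanced past pageout_pushcount_seen,
 * pageout_stucktime is incremented, and the system panics when it
 * reaches pageout_deadman_seconds; any completed push resets the
 * stuck time to zero.
 */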

static int async_list_size = 8192;      /* number of async request structs */

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone - don't page it out.
 */
#define MIN_PO_SHARE    (8)
#define MAX_PO_SHARE    ((MIN_PO_SHARE) << 24)
ulong_t po_share = MIN_PO_SHARE;

/*
 * Schedule rate for paging.
 * Rate is a linear interpolation between slowscan (when we have lotsfree
 * pages) and fastscan (when we are out of memory).
 */
static void
schedpaging(void *arg)
{
        spgcnt_t vavail;

        if (freemem < lotsfree + needfree + kmem_reapahead)
                kmem_reap();

        if (freemem < lotsfree + needfree)
                seg_preap();

        if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
                kcage_cageout_wakeup();

        if (mutex_tryenter(&pageout_mutex)) {

                if (pageouts_running != 0)
                        goto out;

                /* No pageout scanner threads running. */
                nscan = 0;
                vavail = freemem - deficit;
                if (pageout_new_spread != 0)
                        vavail -= needfree;
                vavail = clamp(vavail, 0, lotsfree);

                if (needfree > 0 && pageout_new_spread == 0) {
                        /*
                         * If we've not yet collected enough samples to
                         * calculate a spread, use the old logic of kicking
                         * into high gear anytime needfree is non-zero.
                         */
                        desscan = fastscan / SCHEDPAGING_HZ;
                } else {
                        /*
                         * Once we've calculated a spread based on system
                         * memory and usage, just treat needfree as another
                         * form of deficit.
                         */
                        spgcnt_t faststmp, slowstmp, result;

                        slowstmp = slowscan * vavail;
                        faststmp = fastscan * (lotsfree - vavail);
                        result = (slowstmp + faststmp) /
                            nz(lotsfree) / SCHEDPAGING_HZ;
                        desscan = (pgcnt_t)result;
                }

                pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
                    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);

                DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
                    pageout_nsec);

                if (pageout_new_spread != 0 && despagescanners != 0 &&
                    despagescanners != n_page_scanners) {
                        /*
                         * We have finished the pagescan initialisation and the
                         * desired number of page scanners has changed, either
                         * because initialisation just finished, because of a
                         * memory DR, or because despagescanners has been
                         * modified on the fly (i.e. by mdb).
                         */
                        uint_t i, curr_nscan = n_page_scanners;

                        /* Re-validate despagescanners */
                        recalc_pagescanners();

                        n_page_scanners = despagescanners;

                        for (i = 0; i < MAX_PSCAN_THREADS; i++)
                                reset_hands[i] = B_TRUE;

                        /* If we need more scanners, start them now. */
                        if (n_page_scanners > curr_nscan) {
                                for (i = curr_nscan; i < n_page_scanners; i++) {
                                        (void) lwp_kernel_create(proc_pageout,
                                            pageout_scanner,
                                            (void *)(uintptr_t)i, TS_RUN,
                                            pageout_pri);
                                }
                        }

                        /*
                         * If the number of scanners has decreased, trigger a
                         * wakeup so that the excess threads will terminate.
                         */
                        if (n_page_scanners < curr_nscan) {
                                WAKE_PAGEOUT_SCANNER();
                        }
                }

                zones_over = B_FALSE;

                if (PAGE_SCAN_STARTUP) {
                        /*
                         * We still need to measure the rate at which the
                         * system is able to scan pages of memory. Each of
                         * these initial samples is a scan of as much system
                         * memory as practical, regardless of whether or not we
                         * are experiencing memory pressure.
                         */
                        desscan = total_pages;
                        pageout_nsec = max_pageout_nsec;

                        DTRACE_PROBE(schedpage__wake__sample);
                        WAKE_PAGEOUT_SCANNER();
                } else if (freemem < lotsfree + needfree) {
                        /*
                         * We need more memory.
                         */
                        low_mem_scan++;

                        DTRACE_PROBE(schedpage__wake__low);
                        WAKE_PAGEOUT_SCANNER();
                } else if (zone_num_over_cap > 0) {
                        /*
                         * One or more zones are over their cap.
                         */

                        /* No page limit */
                        desscan = total_pages;

                        /*
                         * Increase the scanning CPU% to the max.  This
                         * implies 80% of one CPU/sec if the scanner can run
                         * each opportunity.  Can also be tuned via setting
                         * zone_pageout_nsec in /etc/system or with mdb.
                         */
                        pageout_nsec = (zone_pageout_nsec != 0) ?
                            zone_pageout_nsec : max_pageout_nsec;

                        zones_over = B_TRUE;
                        zone_cap_scan++;

                        DTRACE_PROBE(schedpage__wake__zone);
                        WAKE_PAGEOUT_SCANNER();
                } else {
                        /*
                         * There are enough free pages, no need to
                         * kick the scanner thread.  And next time
                         * around, keep more of the `highly shared'
                         * pages.
                         */
                        cv_signal_pageout();
                        if (po_share > MIN_PO_SHARE) {
                                po_share >>= 1;
                        }
                }
out:
                mutex_exit(&pageout_mutex);
        }

        /*
         * Signal threads waiting for available memory.
         * NOTE: usually we need to grab memavail_lock before cv_broadcast,
         * but in this case it is not needed - the waiters will be woken up
         * during the next invocation of this function.
         */
        if (kmem_avail() > 0)
                cv_broadcast(&memavail_cv);

        (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t         pushes;
ulong_t         push_list_size;         /* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;
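
/*
 * For example, to park the scanner temporarily on a live system (an
 * illustrative mdb session; dopageout is a 32-bit uint_t, hence /W):
 *
 *      # mdb -kw
 *      > dopageout/W 0
 */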

/*
 * The page out daemon, which runs as process 2.
 *
 * The daemon treats physical memory as a circular array of pages and scans
 * the pages using a 'two-handed clock' algorithm. The front hand moves
 * through the pages, clearing the reference bit. The back hand travels a
 * distance (handspreadpages) behind the front hand, freeing the pages that
 * have not been referenced in the time since the front hand passed. If
 * modified, they are first written to their backing store before being
 * freed.
 *
 * In order to make page invalidation more responsive on machines with
 * larger memory, multiple pageout_scanner threads may be created. In this
 * case, each thread is given a segment of the memory "clock face" so that
 * memory can be reclaimed more quickly.
 *
 * As long as there are at least lotsfree pages and no zones are over their
 * cap, the pageout_scanner threads are not run.  They scan for one of two
 * reasons: (a) the system is short of free memory, or (b) one or more zones
 * is over its physical memory cap.  When scanning for case (a), all pages
 * are considered for pageout.  For case (b), only pages belonging to a zone
 * over its cap will be considered for pageout.
 *
 * There are multiple threads that act on behalf of the pageout process. A
 * set of threads (pageout_scanner) scans pages and frees them up if they
 * don't require any VOP_PUTPAGE operation. If a page must be written back
 * to its backing store, the request is put on a list and the other
 * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
 * requests from the list, and processes them. Some filesystems may require
 * resources for the VOP_PUTPAGE operations (like memory) and hence can
 * block the pageout thread, but the scanner thread can still operate.
 * There is still no guarantee that memory deadlocks cannot occur.
 */
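
/*
 * A sketch of how a request travels from the scanner side to the
 * pageout thread below (simplified; the enqueue happens alongside
 * checkpage(), which is not part of this excerpt): a request is taken
 * from req_freelist, filled in, chained onto push_list, and push_cv is
 * signalled.
 *
 *      mutex_enter(&push_lock);
 *      if ((arg = req_freelist) != NULL) {
 *              req_freelist = arg->a_next;
 *              arg->a_vp = vp;         (vnode held for pageout())
 *              arg->a_next = push_list;
 *              push_list = arg;
 *              push_list_size++;
 *              cv_signal(&push_cv);
 *      }
 *      mutex_exit(&push_lock);
 */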
void
pageout()
{
        struct async_reqs *arg;
        int i;
        pgcnt_t max_pushes;
        callb_cpr_t cprinfo;

        proc_pageout = ttoproc(curthread);
        proc_pageout->p_cstime = 0;
        proc_pageout->p_stime =  0;
        proc_pageout->p_cutime =  0;
        proc_pageout->p_utime = 0;
        bcopy("pageout", PTOU(curproc)->u_psargs, 8);
        bcopy("pageout", PTOU(curproc)->u_comm, 7);

        /*
         * Create pageout scanner thread
         */
        mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Allocate and initialize the async request structures
         * for pageout.
         */
        push_req = (struct async_reqs *)
            kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

        req_freelist = push_req;
        for (i = 0; i < async_list_size - 1; i++) {
                push_req[i].a_next = &push_req[i + 1];
        }

        pageout_pri = curthread->t_pri - 1;

        /* Create the first pageout scanner thread. */
        (void) lwp_kernel_create(proc_pageout, pageout_scanner,
            (void *)0,  /* this is instance 0, not NULL */
            TS_RUN, pageout_pri);

        /*
         * kick off pageout scheduler.
         */
        schedpaging(NULL);

        /*
         * Create kernel cage thread.
         * The kernel cage thread is started under the pageout process
         * to take advantage of the less restricted page allocation
         * in page_create_throttle().
         */
        kcage_cageout_init();

        /*
         * Limit pushes to avoid saturating pageout devices.
         */
        max_pushes = maxpgio / SCHEDPAGING_HZ;
        CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

        for (;;) {
                mutex_enter(&push_lock);

                while ((arg = push_list) == NULL || pushes > max_pushes) {
                        CALLB_CPR_SAFE_BEGIN(&cprinfo);
                        cv_wait(&push_cv, &push_lock);
                        pushes = 0;
                        CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
                }
                push_list = arg->a_next;
                arg->a_next = NULL;
                pageout_pushing = true;
                mutex_exit(&push_lock);

                DTRACE_PROBE(pageout__push);

                if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
                    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
                        pushes++;
                }

                /* vp held by checkpage() */
                VN_RELE(arg->a_vp);

                mutex_enter(&push_lock);
                pageout_pushing = false;
                pageout_pushcount++;
                arg->a_next = req_freelist;  /* back on freelist */
                req_freelist = arg;
                push_list_size--;
                mutex_exit(&push_lock);
        }
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void *a)
{
        struct page *fronthand, *backhand, *fronthandstart;
        struct page *regionstart, *regionend;
        uint_t laps;
        callb_cpr_t cprinfo;
        pgcnt_t nscan_cnt, tick;
        pgcnt_t pcount;
        bool bhwrapping, fhwrapping;
        hrtime_t sample_start, sample_end;
        uint_t inst = (uint_t)(uintptr_t)a;

        VERIFY3U(inst, <, MAX_PSCAN_THREADS);

        CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
        mutex_enter(&pageout_mutex);

        /*
         * The restart case does not attempt to point the hands at roughly
         * the right point, on the assumption that things will have settled
         * down after one circuit and that restarts are infrequent.
         */
        reset_hands[inst] = B_TRUE;

        pageouts_running++;
        mutex_exit(&pageout_mutex);

loop:
        cv_signal_pageout();

        mutex_enter(&pageout_mutex);
        pageouts_running--;
        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        cv_wait(&proc_pageout->p_cv, &pageout_mutex);
        CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
        pageouts_running++;
        mutex_exit(&pageout_mutex);

        /*
         * Check if pageout has been disabled for debugging purposes.
         */
        if (!dopageout) {
                goto loop;
        }

        /*
         * One may reset the clock hands and scanned region for debugging
         * purposes. Hands will also be reset on first thread startup, if
         * the number of scanning threads (n_page_scanners) changes, or if
         * memory is added to, or removed from, the system.
         */
        if (reset_hands[inst]) {
                struct page *first;

                reset_hands[inst] = B_FALSE;

                if (inst >= n_page_scanners) {
                        /*
                         * The desired number of page scanners has been
                         * reduced and this instance is no longer wanted.
                         * Exit the lwp.
                         */
                        VERIFY3U(inst, !=, 0);
                        DTRACE_PROBE1(pageout__exit, uint_t, inst);
                        mutex_enter(&pageout_mutex);
                        pageouts_running--;
                        mutex_exit(&pageout_mutex);
                        mutex_enter(&curproc->p_lock);
                        lwp_exit();
                        /* NOTREACHED */
                }

                first = page_first();

                /*
                 * Each scanner thread gets its own sector of the memory
                 * clock face.
                 */
                pgcnt_t span, offset;

                span = looppages / n_page_scanners;
                VERIFY3U(span, >, handspreadpages);

                offset = inst * span;
                regionstart = page_nextn(first, offset);
                if (inst == n_page_scanners - 1) {
                        /* The last instance goes up to the last page */
                        regionend = page_nextn(first, looppages - 1);
                } else {
                        regionend = page_nextn(regionstart, span - 1);
                }

                backhand = regionstart;
                fronthand = page_nextn(backhand, handspreadpages);
                tick = 1;

                bhwrapping = fhwrapping = false;

                DTRACE_PROBE4(pageout__reset, uint_t, inst,
                    pgcnt_t, regionstart, pgcnt_t, regionend,
                    pgcnt_t, fronthand);
        }

        /*
         * This CPU kstat is only incremented here and we're obviously
         * on this CPU, so no lock.
         */
        CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

        /*
         * Keep track of the number of times we have scanned all the way around
         * the loop on this wakeup.
         */
        laps = 0;

        /*
         * Track the number of pages visited during this scan so that we can
         * periodically measure our duty cycle.
         */
        nscan_cnt = 0;
        pcount = 0;

        DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
            hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand);

        /*
         * Record the initial position of the front hand for this cycle so
         * that we can detect when the hand wraps around.
         */
        fronthandstart = fronthand;

        sample_start = gethrtime();

        /*
         * Scan the appropriate number of pages for a single duty cycle.
         */
        while (nscan_cnt < desscan) {
                checkpage_result_t rvfront, rvback;

                /*
                 * Only scan while at least one of these is true:
                 *  1) one or more zones is over its cap
                 *  2) there is not enough free memory
                 *  3) during page scan startup when determining sample data
                 */
                if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree &&
                    !zones_over) {
                        /*
                         * We are not sampling and enough memory has become
                         * available that scanning is no longer required.
                         */
                        DTRACE_PROBE1(pageout__memfree, uint_t, inst);
                        break;
                }

                DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);

                /*
                 * Periodically check to see if we have exceeded the CPU duty
                 * cycle for a single wakeup.
                 */
                if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
                        hrtime_t pageout_cycle_nsec;

                        pageout_cycle_nsec = gethrtime() - sample_start;
                        if (pageout_cycle_nsec >= pageout_nsec) {
                                if (!zones_over)
                                        atomic_inc_64(&pageout_timeouts);
                                DTRACE_PROBE1(pageout__timeout, uint_t, inst);
                                break;
                        }
                }

                /*
                 * If checkpage manages to add a page to the free list,
                 * we give ourselves another couple of trips around the loop.
                 */
                if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
                        laps = 0;
                }
                if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
                        laps = 0;
                }

                ++pcount;

                /*
                 * This CPU kstat is only incremented here and we're obviously
                 * on this CPU, so no lock.
                 */
                CPU_STATS_ADDQ(CPU, vm, scan, 1);

                /*
                 * Don't include ineligible pages in the number scanned.
                 */
                if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
                        nscan_cnt++;
                }
1337 
1338                 if (bhwrapping) {
1339                         backhand = regionstart;
1340                         bhwrapping = B_FALSE;
1341                 } else {
1342                         backhand = page_nextn(backhand, tick);
1343                         if (backhand == regionend)
1344                                 bhwrapping = B_TRUE;
1345                 }
1346 
1347                 if (fhwrapping) {
1348                         fronthand = regionstart;
1349                         fhwrapping = B_FALSE;
1350                 } else {
1351                         fronthand = page_nextn(fronthand, tick);
1352                         if (fronthand == regionend)
1353                                 fhwrapping = B_TRUE;
1354                 }
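                /*
                 * Deferring the jump back to regionstart by one iteration
                 * ensures the page at regionend is itself scanned before a
                 * hand restarts, and keeps each hand confined to this
                 * scanner instance's region.
                 */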
1355 
1356                 /*
1357                  * The front hand has wrapped around during this wakeup.
1358                  */
1359                 if (fronthand == fronthandstart) {
1360                         laps++;
1361                         DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
1362                             uint_t, laps);
1363 
1364                         /*
1365                          * This CPU kstat is only incremented here and we're
1366                          * obviously on this CPU, so no lock.
1367                          */
1368                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
1369 
1370                         /*
1371                          * If scanning because the system is low on memory,
1372                          * we want to reclaim more pages on each wraparound.
1373                          * If scanning only because zones are over their
1374                          * cap, wrapping is common and we simply keep going.
1375                          */
1376                         if (laps > 1 && freemem < lotsfree + needfree) {
1377                                 /*
1378                                  * Extremely unlikely, but it happens.
1379                                  * We went around the loop at least once
1380                                  * and didn't get far enough.
1381                                  * If we are still skipping `highly shared'
1382                                  * pages, skip fewer of them.  Otherwise,
1383                                  * give up till the next clock tick.
1384                                  */
1385                                 if (po_share < MAX_PO_SHARE) {
1386                                         po_share <<= 1;
1387                                 } else {
1388                                         break;
1389                                 }
1390                         }
1391                 }
1392         }
1393 
1394         sample_end = gethrtime();
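        /*
         * Fold this instance's tally into the global scan count; nscan is
         * shared by all scanner instances, hence the atomic.
         */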
1395         atomic_add_long(&nscan, nscan_cnt);
1396 
1397         DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
1398             pgcnt_t, nscan_cnt, pgcnt_t, pcount);
1399 
1400         /*
1401          * The global variables used below are only modified by this thread and
1402          * only during initial scanning when there is a single page scanner
1403          * thread running.
1404          */
1405         if (pageout_new_spread == 0) {
1406                 VERIFY3U(inst, ==, 0);
1407 
1408                 if (PAGE_SCAN_STARTUP) {
1409                         /*
1410                          * Continue accumulating samples until we have enough
1411                          * to get a reasonable value for average scan rate.
1412                          */
1413                         pageout_sample_pages += pcount;
1414                         pageout_sample_etime += sample_end - sample_start;
1415                         ++pageout_sample_cnt;
1416                 }
1417 
1418                 if (!PAGE_SCAN_STARTUP) {
1419                         /*
1420                          * We have enough samples, set the spread.
1421                          */
1422                         pageout_rate = (hrrate_t)pageout_sample_pages *
1423                             (hrrate_t)(NANOSEC) / pageout_sample_etime;
1424                         pageout_new_spread = pageout_rate / 10;
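                        /*
                         * Illustrative numbers only: a sample that visited
                         * 1000000 pages in 2 seconds of scan time yields
                         * pageout_rate = 500000 pages/sec and a spread of
                         * 50000 pages.
                         */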
1425                         setupclock();
1426                 }
1427         }
1428 
1429         goto loop;
1430 }
1431 
1432 /*
1433  * The pageout deadman is run once per second by clock().
1434  */
1435 void
1436 pageout_deadman(void)
1437 {
1438         if (panicstr != NULL) {
1439                 /*
1440                  * There is no pageout after panic.
1441                  */
1442                 return;
1443         }
1444 
1445         if (pageout_deadman_seconds == 0) {
1446                 /*
1447                  * The deadman is not enabled.
1448                  */
1449                 return;
1450         }
1451 
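        /*
         * If the pusher is not in the middle of a push, there is nothing
         * to monitor on this pass.
         */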
1452         if (!pageout_pushing) {
1453                 goto reset;
1454         }
1455 
1456         /*
1457          * We are pushing a page.  Check to see if it is the same call we saw
1458          * last time we looked:
1459          */
1460         if (pageout_pushcount != pageout_pushcount_seen) {
1461                 /*
1462                  * It is a different call from the last check, so we are not
1463                  * stuck.
1464                  */
1465                 goto reset;
1466         }
1467 
1468         if (++pageout_stucktime >= pageout_deadman_seconds) {
1469                 panic("pageout_deadman: stuck pushing the same page for %d "
1470                     "seconds (freemem is %lu)", pageout_deadman_seconds,
1471                     freemem);
1472         }
1473 
1474         return;
1475 
1476 reset:
1477         /*
1478          * Reset our tracking state to reflect that we are not stuck:
1479          */
1480         pageout_stucktime = 0;
1481         pageout_pushcount_seen = pageout_pushcount;
1482 }
1483 
1484 /*
1485  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
1486  * a system page (u-area, page table) or free, then leave it alone.
1487  * Otherwise, if we are running the front hand, turn off the page's
1488  * reference bit.  If running the back hand, check whether the page has
1489  * been reclaimed since the front hand passed; if not, free the page,
1490  * pushing it to disk first if necessary.
1491  *
1492  * Return values:
1493  *      CKP_INELIGIBLE if the page is not a candidate at all,
1494  *      CKP_NOT_FREED  if the page was not freed, or
1495  *      CKP_FREED      if we freed it.
1496  */
1497 static checkpage_result_t
1498 checkpage(struct page *pp, pageout_hand_t whichhand)
1499 {
1500         int ppattr;
1501         int isfs = 0;
1502         int isexec = 0;
1503         int pagesync_flag;
1504         zoneid_t zid = ALL_ZONES;
1505 
1506         /*
1507          * Skip pages:
1508          *      - associated with the kernel vnode since
1509          *          they are always "exclusively" locked.
1510          *      - that are free
1511          *      - that are shared more than po_share times
1512          *      - that are already locked
1513          *
1514          * NOTE:  These optimizations assume that reads are atomic.
1515          */
1516 
1517         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1518             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1519             hat_page_checkshare(pp, po_share)) {
1520                 return (CKP_INELIGIBLE);
1521         }
1522 
1523         if (!page_trylock(pp, SE_EXCL)) {
1524                 /*
1525                  * Skip the page if we can't acquire the "exclusive" lock.
1526                  */
1527                 return (CKP_INELIGIBLE);
1528         } else if (PP_ISFREE(pp)) {
1529                 /*
1530                  * It became free between the above check and our actually
1531                  * locking the page.  Oh well, there will be other pages.
1532                  */
1533                 page_unlock(pp);
1534                 return (CKP_INELIGIBLE);
1535         }
1536 
1537         /*
1538          * Reject pages that cannot be freed.  The page_struct_lock need
1539          * not be acquired to examine these fields since the page has an
1540          * "exclusive" lock.
1541          */
1542         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1543                 page_unlock(pp);
1544                 return (CKP_INELIGIBLE);
1545         }
1546 
1547         if (zones_over) {
1548                 ASSERT(pp->p_zoneid == ALL_ZONES ||
1549                     (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
1550                 if (pp->p_zoneid == ALL_ZONES ||
1551                     zone_pdata[pp->p_zoneid].zpers_over == 0) {
1552                         /*
1553                          * Cross-zone shared page, or zone not over its
1554                          * cap.  Leave the page alone.
1555                          */
1556                         page_unlock(pp);
1557                         return (CKP_INELIGIBLE);
1558                 }
1559                 zid = pp->p_zoneid;
1560         }
1561 
1562         /*
1563          * Maintain statistics for what we are freeing
1564          */
1565         if (pp->p_vnode != NULL) {
1566                 if (pp->p_vnode->v_flag & VVMEXEC)
1567                         isexec = 1;
1568 
1569                 if (!IS_SWAPFSVP(pp->p_vnode))
1570                         isfs = 1;
1571         }
1572 
1573         /*
1574          * Turn off REF and MOD bits with the front hand.
1575          * The back hand examines the REF bit and always considers
1576          * SHARED pages as referenced.
1577          */
1578         if (whichhand == POH_FRONT) {
1579                 pagesync_flag = HAT_SYNC_ZERORM;
1580         } else {
1581                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1582                     HAT_SYNC_STOPON_SHARED;
1583         }
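        /*
         * That is, the back hand leaves REF and MOD intact
         * (HAT_SYNC_DONTZERO) and allows hat_pagesync() to return early
         * once the page is seen to be referenced or highly shared, since
         * either finding settles the question.
         */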
1584 
1585         ppattr = hat_pagesync(pp, pagesync_flag);
1586 
1587 recheck:
1588         /*
1589          * If the page is referenced, make it unreferenced but reclaimable.
1590          * If this page is not referenced, then it must be reclaimable
1591          * and we can add it to the free list.
1592          */
1593         if (ppattr & P_REF) {
1594                 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1595                     pageout_hand_t, whichhand);
1596 
1597                 if (whichhand == POH_FRONT) {
1598                         /*
1599                          * Checking of rss or madvise flags needed here...
1600                          *
1601                          * If not "well-behaved", fall through into the code
1602                          * for not referenced.
1603                          */
1604                         hat_clrref(pp);
1605                 }
1606 
1607                 /*
1608                  * Somebody referenced the page since the front
1609                  * hand went by, so it's not a candidate for
1610                  * freeing up.
1611                  */
1612                 page_unlock(pp);
1613                 return (CKP_NOT_FREED);
1614         }
1615 
1616         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1617 
1618         /*
1619          * If large page, attempt to demote it. If successfully demoted,
1620          * retry the checkpage.
1621          */
1622         if (pp->p_szc != 0) {
1623                 if (!page_try_demote_pages(pp)) {
1624                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1625                         page_unlock(pp);
1626                         return (CKP_INELIGIBLE);
1627                 }
1628 
1629                 ASSERT(pp->p_szc == 0);
1630                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1631 
1632                 /*
1633                  * Since page_try_demote_pages() could have unloaded some
1634                  * mappings it makes sense to reload ppattr.
1635                  */
1636                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1637         }
1638 
1639         /*
1640          * If the page is currently dirty, we have to arrange to have it
1641          * cleaned before it can be freed.
1642          *
1643          * XXX - ASSERT(pp->p_vnode != NULL);
1644          */
1645         if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1646                 struct vnode *vp = pp->p_vnode;
1647                 u_offset_t offset = pp->p_offset;
1648 
1649                 /*
1650                  * XXX - Test for process being swapped out or about to exit?
1651                  * [Can't get back to process(es) using the page.]
1652                  */
1653 
1654                 /*
1655                  * Hold the vnode before releasing the page lock to
1656                  * prevent it from being freed and re-used by some
1657                  * other thread.
1658                  */
1659                 VN_HOLD(vp);
1660                 page_unlock(pp);
1661 
1662                 /*
1663                  * Queue I/O request for the pageout thread.
1664                  */
1665                 if (!queue_io_request(vp, offset)) {
1666                         VN_RELE(vp);
1667                         return (CKP_NOT_FREED);
1668                 }
1669                 if (isfs) {
1670                         zone_pageout_stat(zid, ZPO_DIRTY);
1671                 } else {
1672                         zone_pageout_stat(zid, ZPO_ANONDIRTY);
1673                 }
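                /*
                 * The page has only been queued for cleaning, but it is
                 * accounted as freed: the pageout thread will push it, and
                 * the B_FREE i/o flag releases it when the write completes.
                 */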
1674                 return (CKP_FREED);
1675         }
1676 
1677         /*
1678          * Now we unload all the translations and put the page back on to the
1679          * free list.  If the page was used (referenced or modified) after the
1680          * pagesync but before it was unloaded we catch it and handle the page
1681          * properly.
1682          */
1683         DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1684         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1685         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1686         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1687                 goto recheck;
1688         }
1689 
1690         VN_DISPOSE(pp, B_FREE, 0, kcred);
1691 
1692         CPU_STATS_ADD_K(vm, dfree, 1);
1693 
1694         if (isfs) {
1695                 if (isexec) {
1696                         CPU_STATS_ADD_K(vm, execfree, 1);
1697                 } else {
1698                         CPU_STATS_ADD_K(vm, fsfree, 1);
1699                 }
1700                 zone_pageout_stat(zid, ZPO_FS);
1701         } else {
1702                 CPU_STATS_ADD_K(vm, anonfree, 1);
1703                 zone_pageout_stat(zid, ZPO_ANON);
1704         }
1705 
1706         return (CKP_FREED);
1707 }
1708 
1709 /*
1710  * Queue async i/o request from pageout_scanner and segment swapout
1711  * routines on one common list.  This ensures that pageout devices (swap)
1712  * are not saturated by pageout_scanner or swapout requests.
1713  * The pageout thread empties this list by initiating i/o operations.
1714  */
1715 int
1716 queue_io_request(vnode_t *vp, u_offset_t off)
1717 {
1718         struct async_reqs *arg;
1719 
1720         /*
1721          * If we cannot allocate an async request struct,
1722          * skip this page.
1723          */
1724         mutex_enter(&push_lock);
1725         if ((arg = req_freelist) == NULL) {
1726                 mutex_exit(&push_lock);
1727                 return (0);
1728         }
1729         req_freelist = arg->a_next;          /* adjust freelist */
1730         push_list_size++;
1731 
1732         arg->a_vp = vp;
1733         arg->a_off = off;
1734         arg->a_len = PAGESIZE;
1735         arg->a_flags = B_ASYNC | B_FREE;
1736         arg->a_cred = kcred;         /* always held */
1737 
1738         /*
1739          * Add to list of pending write requests.
1740          */
1741         arg->a_next = push_list;
1742         push_list = arg;
1743 
1744         if (req_freelist == NULL) {
1745                 /*
1746                  * No free async requests left. The lock is held so we
1747                  * might as well signal the pusher thread now.
1748                  */
1749                 cv_signal(&push_cv);
1750         }
1751         mutex_exit(&push_lock);
1752         return (1);
1753 }
1754 
1755 /*
1756  * Wake up the pageout thread to initiate i/o if push_list is not empty.
1757  */
1758 void
1759 cv_signal_pageout(void)
1760 {
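        /*
         * The unlocked read of push_list is a benign race: a stale NULL
         * merely delays the wakeup until the next call, and a stale
         * non-NULL only costs an extra lock acquisition and signal.
         */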
1761         if (push_list != NULL) {
1762                 mutex_enter(&push_lock);
1763                 cv_signal(&push_cv);
1764                 mutex_exit(&push_lock);
1765         }
1766 }