/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2021 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2018 Joyent, Inc.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vmparam.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>
#include <sys/mem_cage.h>
#include <sys/time.h>
#include <sys/zone.h>
#include <sys/stdbool.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

/*
 * FREE MEMORY MANAGEMENT
 *
 * Management of the pool of free pages is a tricky business.  There are
 * several critical threshold values which constrain our allocation of new
 * pages and inform the rate of paging out of memory to swap.  These threshold
 * values, and the behaviour they induce, are described below in descending
 * order of size -- and thus increasing order of severity!
 *
 *   +---------------------------------------------------- physmem (all memory)
 *   |
 *   | Ordinarily there are no particular constraints placed on page
 *   v allocation.  The page scanner is not running and page_create_va()
 *   | will effectively grant all page requests (whether from the kernel
 *   | or from user processes) without artificial delay.
 *   |
 *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
 *   |
 *   | When we have less than "lotsfree" pages, pageout_scanner() is
 *   v signalled by schedpaging() to begin looking for pages that can
 *   | be evicted to disk to bring us back above lotsfree.  At this
 *   | stage there is still no constraint on allocation of free pages.
 *   |
 *   | For small systems, we set a lower bound of 16MB for lotsfree;
 *   v this is the natural value for a system with 1GB memory.  This is
 *   | to ensure that the pageout reserve pool contains at least 4MB
 *   | for use by ZFS.
 *   |
 *   | For systems with a large amount of memory, we constrain lotsfree
 *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
 *   v at some point the required slack relates more closely to the
 *   | rate at which paging can occur than to the total amount of memory.
 *   |
 *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
 *   |
 *   | When we drop below desfree, a number of kernel facilities will
 *   v wait before allocating more memory, under the assumption that
 *   | pageout or reaping will make progress and free up some memory.
 *   | This behaviour is not especially coordinated; look for comparisons
 *   | of desfree and freemem.
 *   |
 *   | In addition to various attempts at advisory caution, clock()
 *   | will wake up the thread that is ordinarily parked in sched().
 *   | This routine is responsible for the heavy-handed swapping out
 *   v of entire processes in an attempt to arrest the slide of free
 *   | memory.  See comments in sched.c for more details.
 *   |
 *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
 *   |
 *   | These two separate tunables have, by default, the same value.
 *   v Various parts of the kernel use minfree to signal the need for
 *   | more aggressive reclamation of memory, and sched() is more
 *   | aggressive at swapping processes out.
 *   |
 *   | If free memory falls below throttlefree, page_create_va() will
 *   | use page_create_throttle() to begin holding most requests for
 *   | new pages while pageout and reaping free up memory.  Sleeping
 *   v allocations (e.g., KM_SLEEP) are held here while we wait for
 *   | more memory.  Non-sleeping allocations are generally allowed to
 *   | proceed, unless their priority is explicitly lowered with
 *   | KM_NORMALPRI.
 *   |
 *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
 *   |
 *   | When we hit throttlefree, the situation is already dire.  The
 *   v system is generally paging out memory and swapping out entire
 *   | processes in order to free up memory for continued operation.
 *   |
 *   | Unfortunately, evicting memory to disk generally requires short
 *   | term use of additional memory; e.g., allocation of buffers for
 *   | storage drivers, updating maps of free and used blocks, etc.
 *   | As such, pageout_reserve is the number of pages that we keep in
 *   | special reserve for use by pageout() and sched() and by any
 *   v other parts of the kernel that need to be working for those to
 *   | make forward progress such as the ZFS I/O pipeline.
 *   |
 *   | When we are below pageout_reserve, we fail or hold any allocation
 *   | that has not explicitly requested access to the reserve pool.
 *   | Access to the reserve is generally granted via the KM_PUSHPAGE
 *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
 *   | can implicitly tap the reserve.  For more details, see the
 *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
 *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
 *   |
 *   +---------------------------------------------------------- no free memory
 *   |
 *   | If we have arrived here, things are very bad indeed.  It is
 *   v surprisingly difficult to tell if this condition is even fatal,
 *   | as enough memory may have been granted to pageout() and to the
 *   | ZFS I/O pipeline that requests for eviction that have already been
 *   | made will complete and free up memory some time soon.
 *   |
 *   | If free memory does not materialise, the system generally remains
 *   | deadlocked.  The pageout_deadman() below is run once per second
 *   | from clock(), seeking to limit the amount of time a single request
 *   v to page out can be blocked before the system panics to get a crash
 *   | dump and return to service.
 *   |
 *   +-------------------------------------------------------------------------
 */
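
/*
 * As a rough illustration of the default sizing above (hypothetical figures;
 * actual values depend on the page size and on any /etc/system overrides),
 * consider a machine with 8GB of physical memory and 4KB pages:
 *
 *	lotsfree	= physmem / 64			-> 128MB (32768 pages)
 *	desfree		= lotsfree / 2			-> 64MB
 *	minfree		= 3 * desfree / 4		-> 48MB
 *	throttlefree	= minfree			-> 48MB
 *	pageout_reserve	= 3 * throttlefree / 4		-> 36MB
 *
 * The 16MB minimum and 2GB maximum clamps on lotsfree do not come into play
 * here, as 128MB falls comfortably between them.
 */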

/*
 * The following parameters control operation of the page replacement
 * algorithm.  They are initialized to 0, and then computed at boot time based
 * on the size of the system; see setupclock().  If they are patched non-zero
 * in a loaded vmunix they are left alone and may thus be changed per system
 * using "mdb -kw" on the loaded system.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

static pgcnt_t	handspreadpages = 0;

/*
 * looppages:
 *     Cached copy of the total number of pages in the system (total_pages).
 *
 * loopfraction:
 *     Divisor used to relate fastscan to looppages in setupclock().
 */
static uint_t	loopfraction = 2;
static pgcnt_t	looppages;

static uint_t	min_percent_cpu = 4;
static uint_t	max_percent_cpu = 80;
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

#define		MEGABYTES		(1024ULL * 1024ULL)

/*
 * pageout_threshold_style:
 *     set to 1 to use the previous default threshold size calculation;
 *     i.e., each threshold is half of the next largest value.
 */
uint_t		pageout_threshold_style = 0;

/*
 * The operator may override these tunables to request a different minimum or
 * maximum lotsfree value, or to change the divisor we use for automatic
 * sizing.
 *
 * By default, we make lotsfree 1/64th of the total memory in the machine.  The
 * minimum and maximum are specified in bytes, rather than pages; a zero value
 * means the default values (below) are used.
 */
uint_t		lotsfree_fraction = 64;
pgcnt_t		lotsfree_min = 0;
pgcnt_t		lotsfree_max = 0;

#define		LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
#define		LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)
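
/*
 * Note that with a 4KB page size, btop(LOTSFREE_MIN_DEFAULT) works out to
 * 4096 pages and btop(LOTSFREE_MAX_DEFAULT) to 524288 pages; setupclock()
 * below performs this bytes-to-pages conversion when applying the defaults.
 */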

/*
 * If these tunables are set to non-zero values in /etc/system, and provided
 * the value is not larger than the threshold above, the specified value will
 * be used directly without any additional calculation or adjustment.  The boot
 * time value of these overrides is preserved in the "clockinit" struct.  More
 * detail is available in the comment at the top of the file.
 */
pgcnt_t		maxpgio = 0;
pgcnt_t		minfree = 0;
pgcnt_t		desfree = 0;
pgcnt_t		lotsfree = 0;
pgcnt_t		needfree = 0;
pgcnt_t		throttlefree = 0;
pgcnt_t		pageout_reserve = 0;

pgcnt_t		deficit;
pgcnt_t		nscan;
pgcnt_t		desscan;

/* kstats */
uint64_t low_mem_scan;
uint64_t zone_cap_scan;
uint64_t n_throttle;

/*
 * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and
 * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle
 * that gives the equivalent of some underlying %CPU duty cycle.
 *
 * min_pageout_nsec:
 *     nanoseconds/wakeup equivalent of min_percent_cpu.
 *
 * max_pageout_nsec:
 *     nanoseconds/wakeup equivalent of max_percent_cpu.
 *
 * pageout_nsec:
 *     Number of nanoseconds budgeted for each wakeup cycle.
 *     Computed each time around by schedpaging().
 *     Varies between min_pageout_nsec and max_pageout_nsec,
 *     depending on memory pressure or zones over their cap.
 *
 * zone_pageout_nsec:
 *     Number of nanoseconds budgeted for each cycle when a zone
 *     is over its memory cap. If this is zero, then the value
 *     of max_pageout_nsec is used instead.
 */

static hrtime_t	min_pageout_nsec;
static hrtime_t	max_pageout_nsec;
static hrtime_t	pageout_nsec;
static hrtime_t	zone_pageout_nsec;

#define	MAX_PSCAN_THREADS	16
static boolean_t reset_hands[MAX_PSCAN_THREADS];

/*
 * These can be tuned in /etc/system or set with mdb.
 * 'des_page_scanners' is the desired number of page scanner threads. The
 * system will bring the actual number of threads into line with the desired
 * number. If des_page_scanners is set to an invalid value, the system will
 * correct the setting.
 */
uint_t des_page_scanners;
uint_t pageout_reset_cnt = 64;	/* num. cycles for pageout_scanner hand reset */

uint_t n_page_scanners;
static pgcnt_t	pscan_region_sz; /* informational only */

#define	PAGES_POLL_MASK	1023
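
/*
 * The mask above is used to amortise the cost of the scanner's duty cycle
 * bookkeeping: pageout_scanner() calls gethrtime() to check its CPU budget
 * only once every 1024 pages, i.e. when
 * (pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK.
 */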

/*
 * pageout_sample_lim:
 *     The limit on the number of samples needed to establish a value for new
 *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
 *     handspreadpages.
 *
 * pageout_sample_cnt:
 *     Current sample number.  Once the sample gets large enough, set new
 *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
 *
 * pageout_sample_pages:
 *     The accumulated number of pages scanned during sampling.
 *
 * pageout_sample_etime:
 *     The accumulated nanoseconds for the sample.
 *
 * pageout_rate:
 *     Rate in pages/nanosecond, computed at the end of sampling.
 *
 * pageout_new_spread:
 *     Initially zero while the system scan rate is measured by
 *     pageout_scanner(), which then sets this value once per system boot after
 *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
 *     new value is used for fastscan and handspreadpages.
 */

typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

static hrtime_t	pageout_sample_etime = 0;

/* True if page scanner is first starting up */
#define	PAGE_SCAN_STARTUP	(pageout_sample_cnt < pageout_sample_lim)

/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages. This is only done when scanning under low
 * free memory conditions, not when scanning for zones over their cap.
 */
uint64_t	pageout_timeouts = 0;

#ifdef VM_STATS
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */

/*
 * Threads waiting for free memory use this condition variable and lock until
 * memory becomes available.
 */
kmutex_t	memavail_lock;
kcondvar_t	memavail_cv;

typedef enum pageout_hand {
	POH_FRONT = 1,
	POH_BACK,
} pageout_hand_t;

typedef enum {
	CKP_INELIGIBLE,
	CKP_NOT_FREED,
	CKP_FREED,
} checkpage_result_t;

static checkpage_result_t checkpage(page_t *, pageout_hand_t);

static struct clockinit {
	bool ci_init;
	pgcnt_t ci_lotsfree_min;
	pgcnt_t ci_lotsfree_max;
	pgcnt_t ci_lotsfree;
	pgcnt_t ci_desfree;
	pgcnt_t ci_minfree;
	pgcnt_t ci_throttlefree;
	pgcnt_t ci_pageout_reserve;
	pgcnt_t ci_maxpgio;
	pgcnt_t ci_maxfastscan;
	pgcnt_t ci_fastscan;
	pgcnt_t ci_slowscan;
	pgcnt_t ci_handspreadpages;
} clockinit = { .ci_init = false };

static pgcnt_t
clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
{
	if (value < minimum) {
		return (minimum);
	} else if (value > maximum) {
		return (maximum);
	} else {
		return (value);
	}
}

static pgcnt_t
tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
{
	if (initval == 0 || initval >= initval_ceiling) {
		return (defval);
	} else {
		return (initval);
	}
}
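
/*
 * For example, the call tune(clockinit.ci_desfree, lotsfree, lotsfree / 2)
 * in setupclock() honours an operator-supplied desfree only when it is
 * non-zero and below lotsfree; otherwise the default of half of lotsfree
 * is used.
 */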

/*
 * Local boolean to control scanning when zones are over their cap.  It
 * exists to limit reads of the global zone_num_over_cap, which is already
 * accessed heavily during paging, to schedpaging(), which only runs
 * periodically; the page scanner instead consults zones_over on each page
 * it visits during a scan.  No lock is needed for zone_num_over_cap since
 * schedpaging() does not modify the variable; it only cares whether the
 * value is zero or non-zero.
 */
static boolean_t zones_over = B_FALSE;

/*
 * Set up the paging constants for the clock algorithm used by
 * pageout_scanner(), and by the virtual memory system overall.  See the
 * comments at the top of this file for more information about the threshold
 * values and system responses to memory pressure.
 *
 * This routine is called once by main() at startup, after the initial size of
 * physical memory is determined.  It may be called again later if memory is
 * added to or removed from the system, or if new measurements of the page scan
 * rate become available.
 */
void
setupclock(void)
{
	uint_t i;
	pgcnt_t sz, tmp;
	pgcnt_t defval;
	bool half = (pageout_threshold_style == 1);
	bool recalc = true;

	looppages = total_pages;

	/*
	 * The operator may have provided specific values for some of the
	 * tunables via /etc/system.  On our first call, we preserve those
	 * values so that they can be used for subsequent recalculations.
	 *
	 * A value of zero for any tunable means we will use the default
	 * sizing.
	 */

	if (!clockinit.ci_init) {
		clockinit.ci_init = true;

		clockinit.ci_lotsfree_min = lotsfree_min;
		clockinit.ci_lotsfree_max = lotsfree_max;
		clockinit.ci_lotsfree = lotsfree;
		clockinit.ci_desfree = desfree;
		clockinit.ci_minfree = minfree;
		clockinit.ci_throttlefree = throttlefree;
		clockinit.ci_pageout_reserve = pageout_reserve;
		clockinit.ci_maxpgio = maxpgio;
		clockinit.ci_maxfastscan = maxfastscan;
		clockinit.ci_fastscan = fastscan;
		clockinit.ci_slowscan = slowscan;
		clockinit.ci_handspreadpages = handspreadpages;

		/*
		 * The first call does not trigger a recalculation, only
		 * subsequent calls.
		 */
		recalc = false;
	}

	/*
	 * Configure paging threshold values.  For more details on what each
	 * threshold signifies, see the comments at the top of this file.
	 */
	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
	    btop(LOTSFREE_MAX_DEFAULT));
	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
	    btop(LOTSFREE_MIN_DEFAULT));

	lotsfree = tune(clockinit.ci_lotsfree, looppages,
	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));

	desfree = tune(clockinit.ci_desfree, lotsfree,
	    lotsfree / 2);

	minfree = tune(clockinit.ci_minfree, desfree,
	    half ? desfree / 2 : 3 * desfree / 4);

	throttlefree = tune(clockinit.ci_throttlefree, desfree,
	    minfree);

	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
	    half ? throttlefree / 2 : 3 * throttlefree / 4);

	/*
	 * maxpgio caps how much paging activity is acceptable.  The
	 * historical reasoning is that 2/3 busy on a disk arm is all that
	 * is tolerable for paging, assuming one operation per disk
	 * revolution.
	 *
	 * XXX - Does not account for multiple swap devices.
	 */
	if (clockinit.ci_maxpgio == 0) {
		maxpgio = (DISKRPM * 2) / 3;
	} else {
		maxpgio = clockinit.ci_maxpgio;
	}

	/*
	 * The clock scan rate varies between fastscan and slowscan
	 * based on the amount of free memory available.  Fastscan
	 * rate should be set based on the number of pages that can be
	 * scanned per sec using ~10% of processor time.  Since this
	 * value depends on the processor, MMU, MHz etc., it is
	 * difficult to determine it in a generic manner for all
	 * architectures.
	 *
	 * Instead of trying to determine the number of pages scanned
	 * per sec for every processor, fastscan is set to be the smaller
	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
	 * time is limited to ~4% of processor time.
	 *
	 * Setting fastscan to be 1/2 of memory allows pageout to scan
	 * all of memory in ~2 secs.  This implies that user pages not
	 * accessed within 1 sec (assuming handspreadpages == fastscan)
	 * can be reclaimed when free memory is very low.  Stealing pages
	 * not accessed within 1 sec seems reasonable and ensures that
	 * active user processes don't thrash.
	 *
	 * Smaller values of fastscan result in scanning fewer pages
	 * every second and consequently pageout may not be able to free
	 * sufficient memory to maintain the minimum threshold.  Larger
	 * values of fastscan result in scanning a lot more pages which
	 * could lead to thrashing and higher CPU usage.
	 *
	 * Fastscan needs to be limited to a maximum value and should not
	 * scale with memory to prevent pageout from consuming too much
	 * time for scanning on slow CPUs and avoid thrashing, as a
	 * result of scanning too many pages, on faster CPUs.
	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
	 * (the upper bound for fastscan) based on the average number
	 * of pages that can potentially be scanned in ~1 sec (using ~4%
	 * of the CPU) on some of the following machines that currently
	 * run Solaris 2.x:
	 *
	 *			average memory scanned in ~1 sec
	 *
	 *	25 MHz SS1+:		23 Meg
	 *	LX:			37 Meg
	 *	50 MHz SC2000:		68 Meg
	 *
	 *	40 MHz 486:		26 Meg
	 *	66 MHz 486:		42 Meg
	 *
	 * When free memory falls just below lotsfree, the scan rate
	 * goes from 0 to slowscan (i.e., pageout starts running).  This
	 * transition needs to be smooth and is achieved by ensuring that
	 * pageout scans a small number of pages to satisfy the transient
	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
	 * wakeup) since scanning that many pages has no noticeable impact
	 * on system performance.
	 *
	 * In addition to setting fastscan and slowscan, pageout is
	 * limited to using ~4% of the CPU.  This results in increasing
	 * the time taken to scan all of memory, which in turn means that
	 * user processes have a better opportunity of preventing their
	 * pages from being stolen.  This has a positive effect on
	 * interactive and overall system performance when memory demand
	 * is high.
	 *
	 * Thus, the rate at which pages are scanned for replacement will
	 * vary linearly between slowscan and the number of pages that
	 * can be scanned using ~4% of processor time instead of varying
	 * linearly between slowscan and fastscan.
	 *
	 * Also, the processor time used by pageout will vary from ~1%
	 * at slowscan to ~4% at fastscan instead of varying between
	 * ~1% at slowscan and ~10% at fastscan.
	 *
	 * The values chosen for the various VM parameters (fastscan,
	 * handspreadpages, etc) are not universally true for all machines,
	 * but appear to be a good rule of thumb for the machines we've
	 * tested.  They have the following ranges:
	 *
	 *	cpu speed:	20 to 70 MHz
	 *	page size:	4K to 8K
	 *	memory size:	16M to 5G
	 *	page scan rate:	4000 - 17400 4K pages per sec
	 *
	 * The values need to be re-examined for machines which don't
	 * fall into the various ranges (e.g., slower or faster CPUs,
	 * smaller or larger pagesizes etc) shown above.
	 *
	 * On an MP machine, pageout is often unable to maintain the
	 * minimum paging thresholds under heavy load.  This is due to
	 * the fact that user processes running on other CPUs can be
	 * dirtying memory at a much faster pace than pageout can find
	 * pages to free.  The memory demands could be met by enabling
	 * more than one CPU to run the clock algorithm in such a manner
	 * that the various clock hands don't overlap.  This also makes
	 * it more difficult to determine the values for fastscan, slowscan
	 * and handspreadpages.
	 *
	 * The swapper is currently used to free up memory when pageout
	 * is unable to meet memory demands by swapping out processes.
	 * In addition to freeing up memory, swapping also reduces the
	 * demand for memory by preventing user processes from running
	 * and thereby consuming memory.
	 */
	if (clockinit.ci_maxfastscan == 0) {
		if (pageout_new_spread != 0) {
			maxfastscan = pageout_new_spread;
		} else {
			maxfastscan = MAXHANDSPREADPAGES;
		}
	} else {
		maxfastscan = clockinit.ci_maxfastscan;
	}

	if (clockinit.ci_fastscan == 0) {
		fastscan = MIN(looppages / loopfraction, maxfastscan);
	} else {
		fastscan = clockinit.ci_fastscan;
	}

	if (fastscan > looppages / loopfraction) {
		fastscan = looppages / loopfraction;
	}

	/*
	 * Set slow scan time to 1/10 the fast scan time, but
	 * not to exceed maxslowscan.
	 */
	if (clockinit.ci_slowscan == 0) {
		slowscan = MIN(fastscan / 10, maxslowscan);
	} else {
		slowscan = clockinit.ci_slowscan;
	}

	if (slowscan > fastscan / 2) {
		slowscan = fastscan / 2;
	}

	/*
	 * Handspreadpages is the distance (in pages) between the front and
	 * back pageout daemon hands.  The amount of time to reclaim a page
	 * once pageout examines it increases with this distance and
	 * decreases as the scan rate rises.  It must be < the amount
	 * of pageable memory.
	 *
	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
	 * to be "fastscan" results in the front hand being a few secs
	 * (varies based on the processor speed) ahead of the back hand
	 * at fastscan rates.  This distance can be further reduced, if
	 * necessary, by increasing the processor time used by pageout
	 * to be more than ~4% and preferably not more than ~10%.
	 *
	 * As a result, user processes have a much better chance of
	 * referencing their pages before the back hand examines them.
	 * This also significantly lowers the number of reclaims from
	 * the freelist since pageout does not end up freeing pages which
	 * may be referenced a sec later.
	 */
	if (clockinit.ci_handspreadpages == 0) {
		handspreadpages = fastscan;
	} else {
		handspreadpages = clockinit.ci_handspreadpages;
	}

	/*
	 * Make sure that back hand follows front hand by at least
	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
	 * back hand to look at a page during the same wakeup of the pageout
	 * daemon in which the front hand cleared its ref bit.
	 */
	if (handspreadpages >= looppages) {
		handspreadpages = looppages - 1;
	}

	if (!recalc) {
		/*
		 * Setup basic values at initialization.
		 */
		pscan_region_sz = total_pages;
		des_page_scanners = n_page_scanners = 1;
		reset_hands[0] = B_TRUE;
		return;
	}

	/*
	 * Recalculating
	 *
	 * We originally set the number of page scanners to 1. Now that we
	 * know what the handspreadpages is for a scanner, figure out how many
	 * scanners we should run. We want to ensure that the regions don't
	 * overlap and that they are not touching.
	 *
	 * A default 64GB region size is used as the initial value to calculate
	 * how many scanner threads we should create on lower memory systems.
	 * The idea is to limit the number of threads to a practical value
	 * (e.g. a 64GB machine really only needs one scanner thread). For very
	 * large memory systems, we limit ourselves to MAX_PSCAN_THREADS
	 * threads.
	 *
	 * The scanner threads themselves are evenly spread out around the
	 * memory "clock" in pageout_scanner when we reset the hands, and each
	 * thread will scan all of memory.
	 */
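	/*
	 * For example (hypothetical figures, assuming 4KB pages): on a 256GB
	 * machine, sz below is btop(64GB), and the region count loop runs up
	 * to i = 4, so four scanner threads are requested.  On a 32GB
	 * machine, sz is clamped to total_pages and a single scanner
	 * suffices.
	 */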
	sz = (btop(64ULL * 0x40000000ULL));
	if (sz < handspreadpages) {
		/*
		 * 64GB is smaller than the separation between the front
		 * and back hands; use double handspreadpages.
		 */
		sz = handspreadpages << 1;
	}
	if (sz > total_pages) {
		sz = total_pages;
	}
	/* Record region size for inspection with mdb, otherwise unused */
	pscan_region_sz = sz;

	tmp = sz;
	for (i = 1; tmp < total_pages; i++) {
		tmp += sz;
	}

	if (i > MAX_PSCAN_THREADS)
		i = MAX_PSCAN_THREADS;

	des_page_scanners = i;
}

/*
 * Pageout scheduling.
 *
 * Schedpaging controls the rate at which the page out daemon runs by
 * setting the global variables nscan and desscan SCHEDPAGING_HZ
 * times a second.  Nscan records the number of pages pageout has examined
 * in its current pass; schedpaging() resets this value to zero each time
 * it runs.  Desscan records the number of pages pageout should examine
 * in its next pass; schedpaging() sets this value based on the amount of
 * currently available memory.
 */
#define	SCHEDPAGING_HZ	4
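
/*
 * With SCHEDPAGING_HZ at 4, schedpaging() runs every 250ms; it re-arms
 * itself at the end of each invocation via
 * timeout(schedpaging, arg, hz / SCHEDPAGING_HZ).
 */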

static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
static kcondvar_t push_cv;

/*
 * If pageout() is stuck on a single push for this many seconds,
 * pageout_deadman() will assume the system has hit a memory deadlock.  If set
 * to 0, the deadman will have no effect.
 *
 * Note that we are only looking for stalls in the calls that pageout() makes
 * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
 * I/O, which should not take long unless the underlying strategy call blocks
 * indefinitely for memory.  The actual I/O request happens (or fails) later.
 */
uint_t pageout_deadman_seconds = 90;

static uint_t pageout_stucktime = 0;
static bool pageout_pushing = false;
static uint64_t pageout_pushcount = 0;
static uint64_t pageout_pushcount_seen = 0;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void *);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;
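
/*
 * po_share adapts to memory pressure: schedpaging() halves it (down to
 * MIN_PO_SHARE) when free memory is plentiful, while pageout_scanner()
 * doubles it (up to MAX_PO_SHARE) when a full lap of the clock has not
 * reclaimed enough pages, so progressively more widely shared pages become
 * eligible for pageout.
 */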

/*
 * Schedule rate for paging.
 * Rate is linear interpolation between
 * slowscan with lotsfree and fastscan when out of memory.
 */
static void
schedpaging(void *arg)
{
	spgcnt_t vavail;

	if (freemem < lotsfree + needfree + kmem_reapahead)
		kmem_reap();

	if (freemem < lotsfree + needfree)
		seg_preap();

	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
		kcage_cageout_wakeup();

	(void) atomic_swap_ulong(&nscan, 0);
	vavail = freemem - deficit;
	if (pageout_new_spread != 0)
		vavail -= needfree;
	if (vavail < 0)
		vavail = 0;
	if (vavail > lotsfree)
		vavail = lotsfree;

	/*
	 * Fix for 1161438 (CRS SPR# 73922).  All variables
	 * in the original calculation for desscan were 32 bit signed
	 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
	 * more of memory, the calculation can overflow.  When this
	 * happens, desscan becomes negative and pageout_scanner()
	 * stops paging out.
	 */
	if (needfree > 0 && pageout_new_spread == 0) {
		/*
		 * If we've not yet collected enough samples to
		 * calculate a spread, kick into high gear anytime
		 * needfree is non-zero. Note that desscan will not be
		 * the limiting factor for systems with larger memory;
		 * the %CPU will limit the scan. That will also be
		 * maxed out below.
		 */
		desscan = fastscan / SCHEDPAGING_HZ;
	} else {
		/*
		 * Once we've calculated a spread based on system
		 * memory and usage, just treat needfree as another
		 * form of deficit.
		 */
		spgcnt_t faststmp, slowstmp, result;

		slowstmp = slowscan * vavail;
		faststmp = fastscan * (lotsfree - vavail);
		result = (slowstmp + faststmp) /
		    nz(lotsfree) / SCHEDPAGING_HZ;
		desscan = (pgcnt_t)result;
	}
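
	/*
	 * To illustrate the interpolation with hypothetical figures: with
	 * lotsfree = 32768 pages, slowscan = 100, fastscan = 65536 and
	 * vavail = 16384 (i.e. halfway to lotsfree), desscan works out to
	 * (100 * 16384 + 65536 * 16384) / 32768 / 4, or roughly 8200 pages
	 * per wakeup.
	 */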

	/*
	 * If we've not yet collected enough samples to calculate a
	 * spread, also kick %CPU to the max.
	 */
	if (pageout_new_spread == 0) {
		pageout_nsec = max_pageout_nsec;
	} else {
		pageout_nsec = min_pageout_nsec +
		    (lotsfree - vavail) *
		    (max_pageout_nsec - min_pageout_nsec) /
		    nz(lotsfree);
	}
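
	/*
	 * For example, when vavail is lotsfree / 2, the budget above lands
	 * midway between min_pageout_nsec and max_pageout_nsec; with the
	 * default tunables (10ms and 200ms, see pageout_scanner()), that is
	 * 105ms per cycle.
	 */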

	if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) {
		/*
		 * We have finished the pagescan initialization and the desired
		 * number of page scanners has changed, either because
		 * initialization just finished, because of a memory DR, or
		 * because des_page_scanners has been modified on the fly (i.e.
		 * by mdb). If we need more scanners, start them now, otherwise
		 * the excess scanners will terminate on their own when they
		 * reset their hands.
		 */
		uint_t i;
		uint_t curr_nscan = n_page_scanners;
		pgcnt_t max = total_pages / handspreadpages;

		if (des_page_scanners > max)
			des_page_scanners = max;

		if (des_page_scanners > MAX_PSCAN_THREADS) {
			des_page_scanners = MAX_PSCAN_THREADS;
		} else if (des_page_scanners == 0) {
			des_page_scanners = 1;
		}

		/*
		 * Each thread has its own entry in the reset_hands array, so
		 * we don't need any locking in pageout_scanner to check the
		 * thread's reset_hands entry. Thus, we use a pre-allocated
		 * fixed size reset_hands array and upper limit on the number
		 * of pagescan threads.
		 *
		 * The reset_hands entries need to be true before we start new
		 * scanners, but if we're reducing, we don't want a race on the
		 * recalculation for the existing threads, so we set
		 * n_page_scanners first.
		 */
		n_page_scanners = des_page_scanners;
		for (i = 0; i < MAX_PSCAN_THREADS; i++) {
			reset_hands[i] = B_TRUE;
		}

		if (des_page_scanners > curr_nscan) {
			/* Create additional pageout scanner threads. */
			for (i = curr_nscan; i < des_page_scanners; i++) {
				(void) lwp_kernel_create(proc_pageout,
				    pageout_scanner, (void *)(uintptr_t)i,
				    TS_RUN, curthread->t_pri);
			}
		}
	}

	zones_over = B_FALSE;

	if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) {
		if (!PAGE_SCAN_STARTUP)
			low_mem_scan++;
		/*
		 * Either we need more memory, or we still need to
		 * measure the average scan rate.  Wake the scanner.
		 */
		DTRACE_PROBE(schedpage__wake__low);
		WAKE_PAGEOUT_SCANNER();

	} else if (zone_num_over_cap > 0) {
		/* One or more zones are over their cap. */

		/* No page limit */
		desscan = total_pages;

		/*
		 * Increase the scanning CPU% to the max. This implies
		 * 80% of one CPU/sec if the scanner can run each
		 * opportunity. Can also be tuned via setting
		 * zone_pageout_nsec in /etc/system or with mdb.
		 */
		pageout_nsec = (zone_pageout_nsec != 0) ?
		    zone_pageout_nsec : max_pageout_nsec;

		zones_over = B_TRUE;
		zone_cap_scan++;

		DTRACE_PROBE(schedpage__wake__zone);
		WAKE_PAGEOUT_SCANNER();

	} else {
		/*
		 * There are enough free pages, no need to
		 * kick the scanner thread.  And next time
		 * around, keep more of the `highly shared'
		 * pages.
		 */
		cv_signal_pageout();

		mutex_enter(&pageout_mutex);
		if (po_share > MIN_PO_SHARE) {
			po_share >>= 1;
		}
		mutex_exit(&pageout_mutex);
	}

	/*
	 * Signal threads waiting for available memory.
	 * NOTE: usually we would need to grab memavail_lock before
	 * cv_broadcast, but in this case it is not needed - any remaining
	 * waiters will be woken up during the next invocation of this
	 * function.
	 */
	if (kmem_avail() > 0)
		cv_broadcast(&memavail_cv);

	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
}

pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

/*
 * Paging out should always be enabled.  This tunable exists to hold pageout
 * for debugging purposes.  If set to 0, pageout_scanner() will go back to
 * sleep each time it is woken by schedpaging().
 */
uint_t dopageout = 1;

/*
 * The page out daemon, which runs as process 2.
 *
 * Pageout occurs when either:
 * a) there are fewer than lotsfree pages, or
 * b) one or more zones are over their physical memory cap.
 *
 * The daemon treats physical memory as a circular array of pages and scans the
 * pages using a 'two-handed clock' algorithm. The front hand moves through
 * the pages, clearing the reference bit. The back hand travels a distance
 * (handspreadpages) behind the front hand, freeing the pages that have not
 * been referenced in the time since the front hand passed. If modified, they
 * are first written to their backing store before being freed.
 *
 * In order to make page invalidation more responsive on machines with larger
 * memory, multiple pageout_scanner threads may be created. In this case, the
 * threads are evenly distributed around the memory "clock face" so that
 * memory can be reclaimed more quickly (with a single thread there can be
 * large regions that go unscanned for a long time, leading to lag which
 * causes undesirable behaviour such as htable stealing).
 *
 * As long as there are at least lotsfree pages and no zones are over their
 * cap, the pageout_scanner threads are not run. When the threads are running
 * for case (a), all pages are considered for pageout. For case (b), only
 * pages belonging to a zone over its cap will be considered for pageout.
 *
 * There are multiple threads that act on behalf of the pageout process.
 * A set of threads (pageout_scanner) scans pages and frees them up if
 * they don't require any VOP_PUTPAGE operation. If a page must be
 * written back to its backing store, the request is put on a list
 * and the other (pageout) thread is signalled. The pageout thread
 * grabs VOP_PUTPAGE requests from the list, and processes them.
 * Some filesystems may require resources for the VOP_PUTPAGE
 * operations (like memory) and hence can block the pageout
 * thread, but the pageout_scanner threads can still operate. There is still
 * no guarantee that memory deadlocks cannot occur.
 *
 * The pageout_scanner parameters are determined in schedpaging().
 */
void
pageout()
{
	struct async_reqs *arg;
	pri_t pageout_pri;
	int i;
	pgcnt_t max_pushes;
	callb_cpr_t cprinfo;

	proc_pageout = ttoproc(curthread);
	proc_pageout->p_cstime = 0;
	proc_pageout->p_stime = 0;
	proc_pageout->p_cutime = 0;
	proc_pageout->p_utime = 0;
	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
	bcopy("pageout", PTOU(curproc)->u_comm, 7);

	/*
	 * Create pageout scanner thread
	 */
	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate and initialize the async request structures
	 * for pageout.
	 */
	push_req = (struct async_reqs *)
	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);

	req_freelist = push_req;
	for (i = 0; i < async_list_size - 1; i++) {
		push_req[i].a_next = &push_req[i + 1];
	}

	pageout_pri = curthread->t_pri;

	/* Create the (first) pageout scanner thread. */
	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
	    pageout_pri - 1);

	/*
	 * kick off pageout scheduler.
	 */
	schedpaging(NULL);

	/*
	 * Create kernel cage thread.
	 * The kernel cage thread is started under the pageout process
	 * to take advantage of the less restricted page allocation
	 * in page_create_throttle().
	 */
	kcage_cageout_init();

	/*
	 * Limit pushes to avoid saturating pageout devices.
	 */
	max_pushes = maxpgio / SCHEDPAGING_HZ;
	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");

	for (;;) {
		mutex_enter(&push_lock);

		while ((arg = push_list) == NULL || pushes > max_pushes) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&push_cv, &push_lock);
			pushes = 0;
			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
		}
		push_list = arg->a_next;
		arg->a_next = NULL;
		pageout_pushing = true;
		mutex_exit(&push_lock);

		DTRACE_PROBE(pageout__push);
		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
			pushes++;
		}

		/* vp held by checkpage() */
		VN_RELE(arg->a_vp);

		mutex_enter(&push_lock);
		pageout_pushing = false;
		pageout_pushcount++;
		arg->a_next = req_freelist;	/* back on freelist */
		req_freelist = arg;
		push_list_size--;
		mutex_exit(&push_lock);
	}
}

/*
 * Kernel thread that scans pages looking for ones to free
 */
static void
pageout_scanner(void *a)
{
	struct page *fronthand, *backhand;
	uint_t laps, iter = 0;
	callb_cpr_t cprinfo;
	pgcnt_t	nscan_cnt, nscan_limit;
	pgcnt_t	pcount;
	uint_t inst = (uint_t)(uintptr_t)a;
	hrtime_t sample_start, sample_end;
	kmutex_t pscan_mutex;
	bool sampling;

	VERIFY3U(inst, <, MAX_PSCAN_THREADS);

	mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan");
	mutex_enter(&pscan_mutex);

	/*
	 * Establish the minimum and maximum length of time to be spent
	 * scanning pages per wakeup, limiting the scanner duty cycle.  The
	 * input percentage values (0-100) must be converted to a fraction of
	 * the number of nanoseconds in a second of wall time, then further
	 * scaled down by the number of scanner wakeups in a second:
	 */
	min_pageout_nsec = MAX(1,
	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
	max_pageout_nsec = MAX(min_pageout_nsec,
	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
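
	/*
	 * With the defaults (min_percent_cpu = 4, max_percent_cpu = 80 and
	 * SCHEDPAGING_HZ = 4), these evaluate to 10ms and 200ms of each
	 * 250ms wakeup cycle, respectively.
	 */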

loop:
	cv_signal_pageout();

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&proc_pageout->p_cv, &pscan_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex);

	/*
	 * Check if pageout has been disabled for debugging purposes:
	 */
	if (!dopageout) {
		goto loop;
	}

	/*
	 * One may reset the clock hands for debugging purposes.  Hands will
	 * also be reset if memory is added to or removed from the system.
	 */
	if (reset_hands[inst]) {
		struct page *first;
		pgcnt_t offset = total_pages / n_page_scanners;

		reset_hands[inst] = B_FALSE;
		if (inst >= n_page_scanners) {
			/*
			 * The desired number of page scanners has been
			 * reduced and this instance is no longer wanted.
			 * Exit the lwp.
			 */
			VERIFY3U(inst, !=, 0);
			mutex_exit(&pscan_mutex);
			mutex_enter(&curproc->p_lock);
			lwp_exit();
		}

		/*
		 * The reset case repositions the hands at the proper place
		 * on the memory clock face to prevent creep into another
		 * thread's active region or when the number of threads has
		 * changed.
		 *
		 * Set the two clock hands to be separated by a reasonable
		 * amount, but no more than 360 degrees apart.
		 *
		 * If inst == 0, backhand starts at first page, otherwise
		 * it is (inst * offset) around the memory "clock face" so that
		 * we spread out each scanner instance evenly.
		 */
		first = page_first();
		backhand = page_nextn(first, offset * inst);
		if (handspreadpages >= total_pages) {
			fronthand = page_nextn(backhand, total_pages - 1);
		} else {
			fronthand = page_nextn(backhand, handspreadpages);
		}
	}
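
	/*
	 * To make the hand placement concrete with hypothetical figures:
	 * with four scanner threads, instance 2 starts its back hand halfway
	 * around the clock face (2 * total_pages / 4 pages past the first
	 * page), with its front hand a further handspreadpages ahead.
	 */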

	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);

	/*
	 * Keep track of the number of times we have scanned all the way around
	 * the loop:
	 */
	laps = 0;

	/*
	 * Track the number of pages visited during this scan so that we can
	 * periodically measure our duty cycle.
	 */
	pcount = 0;
	nscan_cnt = 0;

	if (PAGE_SCAN_STARTUP) {
		/*
		 * We need to measure the rate at which the system is able to
		 * scan pages of memory.  Each of these initial samples is a
		 * scan of all system memory, regardless of whether or not we
		 * are experiencing memory pressure.
		 */
		nscan_limit = total_pages;
		sampling = true;
	} else {
		nscan_limit = desscan;
		sampling = false;
	}

	DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst,
	    page_t *, backhand, page_t *, fronthand);

	sample_start = gethrtime();

	/*
	 * Scan the appropriate number of pages for a single duty cycle.
	 * Only scan while at least one of these is true:
	 * 1) one or more zones is over its cap
	 * 2) there is not enough free memory
	 * 3) during page scan startup when determining sample data
	 */
	while (nscan_cnt < nscan_limit) {
		checkpage_result_t rvfront, rvback;

		if (!sampling && !zones_over &&
		    freemem >= lotsfree + needfree) {
			/*
			 * We are not sampling and enough memory has become
			 * available that scanning is no longer required.
			 */
			break;
		}

		DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst);

		/*
		 * Periodically check to see if we have exceeded the CPU duty
		 * cycle for a single wakeup.
		 */
		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
			hrtime_t pageout_cycle_nsec;

			pageout_cycle_nsec = gethrtime() - sample_start;
			if (pageout_cycle_nsec >= pageout_nsec) {
				/*
				 * This is where we normally break out of the
				 * loop when scanning zones or sampling.
				 */
				if (!zones_over) {
					atomic_inc_64(&pageout_timeouts);
				}
				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
				break;
			}
		}
1286 
1287                 /*
1288                  * If checkpage manages to add a page to the free list,
1289                  * we give ourselves another couple of trips around the loop.
1290                  */
1291                 if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) {
1292                         laps = 0;
1293                 }
1294                 if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) {
1295                         laps = 0;
1296                 }
1297 
1298                 ++pcount;
1299 
1300                 /*
1301                  * This CPU kstat is only incremented here and we're obviously
1302                  * on this CPU, so no lock.
1303                  */
1304                 CPU_STATS_ADDQ(CPU, vm, scan, 1);
1305 
1306                 /*
1307                  * Don't include ineligible pages in the number scanned.
1308                  */
1309                 if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) {
1310                         nscan_cnt++;
1311                 }
1312 
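		/*
		 * Advance both hands one page along the circular list of all
		 * pages, preserving the gap between them that was established
		 * at startup.
		 */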
1313                 backhand = page_next(backhand);
1314                 fronthand = page_next(fronthand);
1315 
		/*
		 * The front hand has wrapped around, returning to the first
		 * page in the system.
		 */
		if (fronthand == page_first()) {
1321                         DTRACE_PROBE1(pageout__wrap__front, uint_t, inst);
1322 
			/*
			 * Every pageout_reset_cnt wraps (64 by default) we
			 * reposition our hands within our region to prevent
			 * creep into an adjacent scanner thread's region.
			 */
1327                         if ((++iter % pageout_reset_cnt) == 0)
1328                                 reset_hands[inst] = B_TRUE;
1329 
1330                         /*
1331                          * This CPU kstat is only incremented here and we're
1332                          * obviously on this CPU, so no lock.
1333                          */
1334                         CPU_STATS_ADDQ(CPU, vm, rev, 1);
1335 
			/*
			 * If we are scanning because the system is low on
			 * memory, then when we wrap around we want to try to
			 * reclaim more pages.  If we are scanning only
			 * because zones are over their cap, wrapping is
			 * common and we simply keep going.
			 */
1343                         if (freemem < lotsfree + needfree && ++laps > 1) {
				/*
				 * The system is low on memory, and we went
				 * around the loop at least once without
				 * getting far enough -- extremely unlikely,
				 * but it happens.  If we are still skipping
				 * `highly shared' pages, skip fewer of them.
				 * Otherwise, give up until the next clock
				 * tick.
				 */
1353                                 mutex_enter(&pageout_mutex);
1354                                 if (po_share < MAX_PO_SHARE) {
1355                                         po_share <<= 1;
1356                                         mutex_exit(&pageout_mutex);
1357                                 } else {
1358                                         mutex_exit(&pageout_mutex);
1359                                         break;
1360                                 }
1361                         }
1362                 }
1363         }
1364 
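	/*
	 * Fold this thread's count into the global nscan statistic, which
	 * the paging machinery uses to track overall scanner progress.
	 */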
1365         atomic_add_long(&nscan, nscan_cnt);
1366 
1367         sample_end = gethrtime();
1368 
1369         DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount,
1370             uint_t, inst);
1371 
1372         /*
1373          * The following two blocks are only relevant when the scanner is
1374          * first started up. After the scanner runs for a while, neither of
1375          * the conditions will ever be true again.
1376          *
1377          * The global variables used below are only modified by this thread and
1378          * only during initial scanning when there is a single page scanner
1379          * thread running. Thus, we don't use any locking.
1380          */
1381         if (pageout_new_spread == 0) {
1382                 VERIFY3U(inst, ==, 0);
1383                 if (PAGE_SCAN_STARTUP) {
1384                         /*
1385                          * Continue accumulating samples until we have enough
1386                          * to get a reasonable value for average scan rate:
1387                          */
1388                         pageout_sample_pages += pcount;
1389                         pageout_sample_etime += sample_end - sample_start;
1390                         ++pageout_sample_cnt;
1391                 }
1392 
1393                 if (!PAGE_SCAN_STARTUP) {
			/*
			 * We now have enough samples: compute the average
			 * scan rate and set the spread.  Note that this is
			 * deliberately not an "else" of the block above; the
			 * final sample both accumulates and ends the startup
			 * phase.
			 */
1397                         pageout_rate = (hrrate_t)pageout_sample_pages *
1398                             (hrrate_t)(NANOSEC) / pageout_sample_etime;
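			/*
			 * Illustrative numbers: if sampling covered 4M pages
			 * in 8 seconds of scan time, pageout_rate is 500,000
			 * pages/sec and the spread below becomes 50,000
			 * pages.
			 */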
1399                         pageout_new_spread = pageout_rate / 10;
1400                         setupclock();
1401                 }
1402         }
1403 
1404         goto loop;
1405 }
1406 
/*
 * The pageout deadman is run once per second by clock().  It panics the
 * system if the pageout thread appears to have been stuck pushing the same
 * page for pageout_deadman_seconds.
 */
1410 void
1411 pageout_deadman(void)
1412 {
1413         if (panicstr != NULL) {
1414                 /*
1415                  * There is no pageout after panic.
1416                  */
1417                 return;
1418         }
1419 
1420         if (pageout_deadman_seconds == 0) {
1421                 /*
1422                  * The deadman is not enabled.
1423                  */
1424                 return;
1425         }
1426 
1427         if (!pageout_pushing) {
1428                 goto reset;
1429         }
1430 
	/*
	 * We are pushing a page.  Check to see if it is the same push we saw
	 * the last time we looked:
	 */
1435         if (pageout_pushcount != pageout_pushcount_seen) {
1436                 /*
1437                  * It is a different call from the last check, so we are not
1438                  * stuck.
1439                  */
1440                 goto reset;
1441         }
1442 
1443         if (++pageout_stucktime >= pageout_deadman_seconds) {
1444                 panic("pageout_deadman: stuck pushing the same page for %d "
1445                     "seconds (freemem is %lu)", pageout_deadman_seconds,
1446                     freemem);
1447         }
1448 
1449         return;
1450 
1451 reset:
1452         /*
1453          * Reset our tracking state to reflect that we are not stuck:
1454          */
1455         pageout_stucktime = 0;
1456         pageout_pushcount_seen = pageout_pushcount;
1457 }
1458 
/*
 * Look at the page at hand.  If it is locked (e.g., for physical i/o),
 * a kernel page (e.g., the u-area or a page table) or free, then leave
 * it alone.  Otherwise, if we are running the front hand, turn off the
 * page's reference bit.  If running the back hand, check whether the
 * page has been referenced since the front hand went by; if not, free
 * the page, pushing it to disk first if necessary.
 *
 * Return values:
 *	CKP_INELIGIBLE if the page is not a candidate at all,
 *	CKP_NOT_FREED  if the page was not freed, or
 *	CKP_FREED      if we freed it.
 */
1472 static checkpage_result_t
1473 checkpage(struct page *pp, pageout_hand_t whichhand)
1474 {
1475         int ppattr;
1476         int isfs = 0;
1477         int isexec = 0;
1478         int pagesync_flag;
1479         zoneid_t zid = ALL_ZONES;
1480 
	/*
	 * Skip pages that are:
	 *	- associated with the kernel vnode, since they are always
	 *	  "exclusively" locked
	 *	- already locked
	 *	- free
	 *	- locked in memory or copy-on-write (non-zero p_lckcnt or
	 *	  p_cowcnt)
	 *	- shared more than po_share times
	 *
	 * NOTE:  These optimizations assume that reads are atomic.
	 */
1491 
1492         if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1493             pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1494             hat_page_checkshare(pp, po_share)) {
1495                 return (CKP_INELIGIBLE);
1496         }
1497 
1498         if (!page_trylock(pp, SE_EXCL)) {
1499                 /*
1500                  * Skip the page if we can't acquire the "exclusive" lock.
1501                  */
1502                 return (CKP_INELIGIBLE);
1503         } else if (PP_ISFREE(pp)) {
1504                 /*
1505                  * It became free between the above check and our actually
1506                  * locking the page.  Oh well, there will be other pages.
1507                  */
1508                 page_unlock(pp);
1509                 return (CKP_INELIGIBLE);
1510         }
1511 
	/*
	 * Reject pages that cannot be freed.  The page_struct_lock need not
	 * be acquired to examine p_lckcnt and p_cowcnt since the page has an
	 * "exclusive" lock.
	 */
1517         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1518                 page_unlock(pp);
1519                 return (CKP_INELIGIBLE);
1520         }
1521 
1522         if (zones_over) {
		ASSERT(pp->p_zoneid == ALL_ZONES ||
		    (pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID));
1525                 if (pp->p_zoneid == ALL_ZONES ||
1526                     zone_pdata[pp->p_zoneid].zpers_over == 0) {
			/*
			 * This is a cross-zone shared page, or the zone is
			 * not over its cap.  Leave the page alone.
			 */
1531                         page_unlock(pp);
1532                         return (CKP_INELIGIBLE);
1533                 }
1534                 zid = pp->p_zoneid;
1535         }
1536 
	/*
	 * Maintain statistics for what we are freeing; where applicable they
	 * are charged to the zone identified by zid.
	 */
1540 
1541         if (pp->p_vnode != NULL) {
1542                 if (pp->p_vnode->v_flag & VVMEXEC)
1543                         isexec = 1;
1544 
1545                 if (!IS_SWAPFSVP(pp->p_vnode))
1546                         isfs = 1;
1547         }
1548 
1549         /*
1550          * Turn off REF and MOD bits with the front hand.
1551          * The back hand examines the REF bit and always considers
1552          * SHARED pages as referenced.
1553          */
1554         if (whichhand == POH_FRONT) {
1555                 pagesync_flag = HAT_SYNC_ZERORM;
1556         } else {
1557                 pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
1558                     HAT_SYNC_STOPON_SHARED;
1559         }
1560 
1561         ppattr = hat_pagesync(pp, pagesync_flag);
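	/*
	 * ppattr now holds the REF/MOD state that the HAT observed (and,
	 * for the front hand, cleared) for this page.
	 */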
1562 
1563 recheck:
	/*
	 * If the page is referenced, make it unreferenced but reclaimable.
	 * If the page is not referenced, then it must be reclaimable and we
	 * can add it to the free list.
	 */
1569         if (ppattr & P_REF) {
1570                 DTRACE_PROBE2(pageout__isref, page_t *, pp,
1571                     pageout_hand_t, whichhand);
1572 
1573                 if (whichhand == POH_FRONT) {
			/*
			 * Checking of rss or madvise flags would be needed
			 * here...
			 *
			 * If the page were not "well-behaved", we would fall
			 * through into the code for the not-referenced case.
			 */
1580                         hat_clrref(pp);
1581                 }
1582 
1583                 /*
1584                  * Somebody referenced the page since the front
1585                  * hand went by, so it's not a candidate for
1586                  * freeing up.
1587                  */
1588                 page_unlock(pp);
1589                 return (CKP_NOT_FREED);
1590         }
1591 
1592         VM_STAT_ADD(pageoutvmstats.checkpage[0]);
1593 
	/*
	 * If this is a large page, attempt to demote it.  If it is
	 * successfully demoted, continue with the (now small) page.
	 */
1598         if (pp->p_szc != 0) {
1599                 if (!page_try_demote_pages(pp)) {
1600                         VM_STAT_ADD(pageoutvmstats.checkpage[1]);
1601                         page_unlock(pp);
1602                         return (CKP_INELIGIBLE);
1603                 }
1604 
1605                 ASSERT(pp->p_szc == 0);
1606                 VM_STAT_ADD(pageoutvmstats.checkpage[2]);
1607 
		/*
		 * Since page_try_demote_pages() could have unloaded some
		 * mappings, it makes sense to reload ppattr.
		 */
1612                 ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1613         }
1614 
1615         /*
1616          * If the page is currently dirty, we have to arrange to have it
1617          * cleaned before it can be freed.
1618          *
1619          * XXX - ASSERT(pp->p_vnode != NULL);
1620          */
1621         if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
1622                 struct vnode *vp = pp->p_vnode;
1623                 u_offset_t offset = pp->p_offset;
1624 
1625                 /*
1626                  * XXX - Test for process being swapped out or about to exit?
1627                  * [Can't get back to process(es) using the page.]
1628                  */
1629 
1630                 /*
1631                  * Hold the vnode before releasing the page lock to
1632                  * prevent it from being freed and re-used by some
1633                  * other thread.
1634                  */
1635                 VN_HOLD(vp);
1636                 page_unlock(pp);
1637 
1638                 /*
1639                  * Queue I/O request for the pageout thread.
1640                  */
1641                 if (!queue_io_request(vp, offset)) {
1642                         VN_RELE(vp);
1643                         return (CKP_NOT_FREED);
1644                 }
1645                 if (isfs) {
1646                         zone_pageout_stat(zid, ZPO_DIRTY);
1647                 } else {
1648                         zone_pageout_stat(zid, ZPO_ANONDIRTY);
1649                 }
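		/*
		 * The page has been queued to be cleaned.  Because the
		 * request carries B_FREE, the pageout thread will free the
		 * page once the write completes, so report it as freed.
		 */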
1650                 return (CKP_FREED);
1651         }
1652 
	/*
	 * Now we unload all the translations and put the page back on the
	 * free list.  If the page was used (referenced or modified) after
	 * the pagesync but before it was unloaded, we catch it here and
	 * handle the page properly.
	 */
1658          */
1659         DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
1660         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1661         ppattr = hat_page_getattr(pp, P_MOD | P_REF);
1662         if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
1663                 goto recheck;
1664         }
1665 
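	/*
	 * No new references or modifications appeared, so hand the clean
	 * page to its vnode's dispose routine, which places it on the free
	 * list.
	 */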
1666         VN_DISPOSE(pp, B_FREE, 0, kcred);
1667 
1668         CPU_STATS_ADD_K(vm, dfree, 1);
1669 
1670         if (isfs) {
1671                 if (isexec) {
1672                         CPU_STATS_ADD_K(vm, execfree, 1);
1673                 } else {
1674                         CPU_STATS_ADD_K(vm, fsfree, 1);
1675                 }
1676                 zone_pageout_stat(zid, ZPO_FS);
1677         } else {
1678                 CPU_STATS_ADD_K(vm, anonfree, 1);
1679                 zone_pageout_stat(zid, ZPO_ANON);
1680         }
1681 
1682         return (CKP_FREED);
1683 }
1684 
/*
 * Queue async i/o requests from pageout_scanner and the segment swapout
 * routines on one common list.  This ensures that the pageout devices
 * (swap) are not saturated by pageout_scanner or swapout requests.  The
 * pageout thread empties this list by initiating i/o operations.
 */
1691 int
1692 queue_io_request(vnode_t *vp, u_offset_t off)
1693 {
1694         struct async_reqs *arg;
1695 
	/*
	 * If no async request structure is available on the free list, skip
	 * this page.
	 */
1700         mutex_enter(&push_lock);
1701         if ((arg = req_freelist) == NULL) {
1702                 mutex_exit(&push_lock);
1703                 return (0);
1704         }
1705         req_freelist = arg->a_next;          /* adjust freelist */
1706         push_list_size++;
1707 
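	/*
	 * Fill in the request: one page, to be written asynchronously
	 * (B_ASYNC) and freed once the write completes (B_FREE).
	 */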
1708         arg->a_vp = vp;
1709         arg->a_off = off;
1710         arg->a_len = PAGESIZE;
1711         arg->a_flags = B_ASYNC | B_FREE;
1712         arg->a_cred = kcred;         /* always held */
1713 
1714         /*
1715          * Add to list of pending write requests.
1716          */
1717         arg->a_next = push_list;
1718         push_list = arg;
1719 
1720         if (req_freelist == NULL) {
1721                 /*
1722                  * No free async requests left. The lock is held so we
1723                  * might as well signal the pusher thread now.
1724                  */
1725                 cv_signal(&push_cv);
1726         }
1727         mutex_exit(&push_lock);
1728         return (1);
1729 }
1730 
/*
 * Wake up the pageout thread to initiate i/o if push_list is not empty.
 * The unlocked check of push_list merely avoids taking push_lock when
 * there is no queued work.
 */
void
cv_signal_pageout(void)
1736 {
1737         if (push_list != NULL) {
1738                 mutex_enter(&push_lock);
1739                 cv_signal(&push_cv);
1740                 mutex_exit(&push_lock);
1741         }
1742 }